File size: 5,176 Bytes
"""
Prepare İvme's pretraining data: stream the dense mix, tokenize, pack to disk.
Output is a flat uint16 memmap (vocab 16384 < 65536, so uint16 is exact). We
write documents in ASCENDING quality order so a sequential read during training
acts as a curriculum — the model sees noisier web text first and the densest
material (textbooks, then Wikipedia) last. Research shows this ordering plus a
moderate LR decay beats random shuffling for free.
The mix mirrors what made Archaea-74M punch so far above its weight, pushed a
little denser (more math, stricter web filter):
FineWeb-HQ (score-gated web) 45% ~710M tokens [first / lowest density]
Python stack (filtered) 10% ~160M tokens
FineMath-4+ 15% ~235M tokens
Cosmopedia (stanford+wikihow) 25% ~395M tokens
Wikipedia EN 5% ~80M tokens [last / highest density]
---- -----------
100% ~1.57B tokens (Chinchilla-optimal)
Usage:
python prepare_data.py # full ~1.57B token build
python prepare_data.py --smoke # tiny build to test the pipeline
"""
from __future__ import annotations
import argparse
import os
import numpy as np
from huggingface_hub import login
# Authenticate to the Hugging Face Hub with a token taken from the environment.
# A credential must never be hard-coded in source: the token that used to be
# committed on this line is compromised and should be revoked in the account
# settings. When HF_TOKEN is unset the explicit login is skipped, leaving
# anonymous access or any credentials already cached on this machine.
_HF_TOKEN = os.environ.get("HF_TOKEN")
if _HF_TOKEN:
    login(token=_HF_TOKEN)
# Tokenizer file loaded by build().
TOKENIZER_PATH = "ivme_tokenizer.json"
# Directory that receives train.bin and val.bin.
OUT_DIR = "data"
# On-disk dtype of every token id.
DTYPE = np.uint16

# Per-source token targets as (source_key, target_tokens) pairs, listed in
# ASCENDING quality order. build() writes the sources in exactly this order.
TOKEN_BUDGET = [
    ("fineweb_hq", 710_000_000),
    ("python", 160_000_000),
    ("finemath", 235_000_000),
    ("cosmopedia", 395_000_000),
    ("wikipedia", 80_000_000),
]

# Smoke-test budget: the same sources in the same order, 200k tokens each.
SMOKE_BUDGET = [(source, 200_000) for source, _full_target in TOKEN_BUDGET]

# Total validation tokens, divided among the sources in proportion to their
# budget; build() takes each source's share from the first documents it streams.
VAL_TOKENS = 2_000_000
def make_stream(source_key):
    """Open the streaming dataset(s) behind *source_key*.

    Returns a ``(rows, text_field)`` pair: ``rows`` is an iterable of dataset
    rows and ``text_field`` is the key under which each row stores its text.
    Raises ``ValueError(source_key)`` when the key is not a known source.
    """
    from datasets import load_dataset

    def _open(*dataset_args):
        # Every source is read from its "train" split in streaming mode.
        return load_dataset(*dataset_args, split="train", streaming=True)

    if source_key == "cosmopedia":
        # Two dense subsets, read one after the other.
        from itertools import chain
        subsets = [
            _open("HuggingFaceTB/cosmopedia", subset)
            for subset in ("stanford", "wikihow")
        ]
        return chain(*subsets), "text"

    # (source key, positional load_dataset arguments, text field)
    single_sources = (
        ("fineweb_hq", ("epfml/FineWeb-HQ",), "text"),
        ("finemath", ("HuggingFaceTB/finemath", "finemath-4plus"), "text"),
        ("python", ("bigcode/python-stack-v1-functions-filtered",), "content"),
        ("wikipedia", ("wikimedia/wikipedia", "20231101.en"), "text"),
    )
    for known_key, dataset_args, text_field in single_sources:
        if source_key == known_key:
            return _open(*dataset_args), text_field

    raise ValueError(source_key)
def build(budget):
    """Tokenize every source in *budget* and write ``train.bin`` / ``val.bin``.

    Args:
        budget: list of ``(source_key, target_tokens)`` pairs. Sources are
            processed, and their tokens written, in list order.

    Each document is encoded, terminated with the ``<|eos|>`` id and stored as
    ``DTYPE`` values. For every source the first ``val_target`` tokens (whole
    documents) are collected for ``val.bin``; everything after that is appended
    to ``train.bin`` until the source's target is reached. Documents are never
    split, so a source may overshoot its target by part of one document.

    Raises:
        ValueError: if the tokenizer has no ``<|eos|>`` token, or (from
            ``make_stream``) if a source key is unknown.
    """
    from tokenizers import Tokenizer

    os.makedirs(OUT_DIR, exist_ok=True)
    tok = Tokenizer.from_file(TOKENIZER_PATH)
    eos_id = tok.token_to_id("<|eos|>")
    if eos_id is None:
        # Fail up front instead of crashing on the first document when the
        # missing id is converted to an integer array.
        raise ValueError(f"tokenizer {TOKENIZER_PATH!r} has no '<|eos|>' token")

    train_path = os.path.join(OUT_DIR, "train.bin")
    val_path = os.path.join(OUT_DIR, "val.bin")
    total_target = sum(n for _, n in budget)
    print(f"[data] target ~{total_target/1e6:.0f}M tokens across {len(budget)} sources")

    val_buf = []  # small, held in memory
    written_train = 0
    # The context manager closes train.bin even if a stream or the tokenizer
    # raises part-way through the build.
    with open(train_path, "wb") as train_f:
        for source_key, target in budget:
            stream, field = make_stream(source_key)
            src_written = 0
            # Validation share of this source, proportional to its budget.
            # It is capped at 10% of the source's own budget: with a small
            # budget (e.g. --smoke) the proportional share of VAL_TOKENS is
            # larger than the whole source, which would leave train.bin empty.
            val_share = int(VAL_TOKENS * (target / total_target)) if total_target else 0
            val_target = min(val_share, target // 10)
            print(f"[data] {source_key}: target {target/1e6:.0f}M (val {val_target/1e6:.2f}M)")
            for row in stream:
                # Stop before tokenizing a document that would only be dropped.
                if src_written >= target:
                    break
                text = row.get(field)
                if not text:
                    continue
                ids = tok.encode(text).ids
                ids.append(eos_id)  # document boundary
                arr = np.array(ids, dtype=DTYPE)
                # Send the first val_target tokens of this source to val, rest to train.
                if src_written < val_target:
                    val_buf.append(arr)
                else:
                    arr.tofile(train_f)
                    written_train += len(arr)
                src_written += len(arr)
                # True only for the document that crosses a 5M-token boundary,
                # so progress is reported roughly every 5M tokens.
                if src_written % 5_000_000 < len(arr):
                    print(f" [{source_key}] {src_written/1e6:.1f}M / {target/1e6:.0f}M")

    val_arr = np.concatenate(val_buf) if val_buf else np.array([], dtype=DTYPE)
    val_arr.tofile(val_path)
    print(f"[data] train.bin : {written_train:,} tokens -> {train_path}")
    print(f"[data] val.bin : {len(val_arr):,} tokens -> {val_path}")
    print("[data] curriculum order preserved (sequential read = ascending quality)")
if __name__ == "__main__":
    # CLI entry point: the lone --smoke flag switches to the tiny test budget.
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke", action="store_true", help="tiny build to test the pipeline")
    cli = parser.parse_args()
    if cli.smoke:
        build(SMOKE_BUDGET)
    else:
        build(TOKEN_BUDGET)
|