""" Prepare İvme's pretraining data: stream the dense mix, tokenize, pack to disk. Output is a flat uint16 memmap (vocab 16384 < 65536, so uint16 is exact). We write documents in ASCENDING quality order so a sequential read during training acts as a curriculum — the model sees noisier web text first and the densest material (textbooks, then Wikipedia) last. Research shows this ordering plus a moderate LR decay beats random shuffling for free. The mix mirrors what made Archaea-74M punch so far above its weight, pushed a little denser (more math, stricter web filter): FineWeb-HQ (score-gated web) 45% ~710M tokens [first / lowest density] Python stack (filtered) 10% ~160M tokens FineMath-4+ 15% ~235M tokens Cosmopedia (stanford+wikihow) 25% ~395M tokens Wikipedia EN 5% ~80M tokens [last / highest density] ---- ----------- 100% ~1.57B tokens (Chinchilla-optimal) Usage: python prepare_data.py # full ~1.57B token build python prepare_data.py --smoke # tiny build to test the pipeline """ from __future__ import annotations import argparse import os import numpy as np from huggingface_hub import login login(token="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") TOKENIZER_PATH = "ivme_tokenizer.json" OUT_DIR = "data" DTYPE = np.uint16 # (source_key, target_tokens) in ASCENDING quality order — written in this order. TOKEN_BUDGET = [ ("fineweb_hq", 710_000_000), ("python", 160_000_000), ("finemath", 235_000_000), ("cosmopedia", 395_000_000), ("wikipedia", 80_000_000), ] SMOKE_BUDGET = [(k, 200_000) for k, _ in TOKEN_BUDGET] VAL_TOKENS = 2_000_000 # held out from the tail of each source proportionally def make_stream(source_key): """Return (iterable_of_text, text_field) for a source.""" from datasets import load_dataset if source_key == "fineweb_hq": ds = load_dataset("epfml/FineWeb-HQ", split="train", streaming=True) return ds, "text" if source_key == "cosmopedia": # Two dense subsets concatenated. a = load_dataset("HuggingFaceTB/cosmopedia", "stanford", split="train", streaming=True) b = load_dataset("HuggingFaceTB/cosmopedia", "wikihow", split="train", streaming=True) from itertools import chain return chain(a, b), "text" if source_key == "finemath": ds = load_dataset("HuggingFaceTB/finemath", "finemath-4plus", split="train", streaming=True) return ds, "text" if source_key == "python": ds = load_dataset("bigcode/python-stack-v1-functions-filtered", split="train", streaming=True) return ds, "content" if source_key == "wikipedia": ds = load_dataset("wikimedia/wikipedia", "20231101.en", split="train", streaming=True) return ds, "text" raise ValueError(source_key) def build(budget): from tokenizers import Tokenizer os.makedirs(OUT_DIR, exist_ok=True) tok = Tokenizer.from_file(TOKENIZER_PATH) eos_id = tok.token_to_id("<|eos|>") train_path = os.path.join(OUT_DIR, "train.bin") val_path = os.path.join(OUT_DIR, "val.bin") total_target = sum(n for _, n in budget) print(f"[data] target ~{total_target/1e6:.0f}M tokens across {len(budget)} sources") train_f = open(train_path, "wb") val_buf = [] # small, held in memory written_train = 0 for source_key, target in budget: stream, field = make_stream(source_key) src_written = 0 # Reserve a slice of each source's tail for validation. val_target = int(VAL_TOKENS * (target / total_target)) print(f"[data] {source_key}: target {target/1e6:.0f}M (val {val_target/1e6:.2f}M)") for row in stream: text = row.get(field) if not text: continue ids = tok.encode(text).ids ids.append(eos_id) # document boundary arr = np.array(ids, dtype=DTYPE) if len(val_buf) * 0 + src_written >= target: break # Send the first val_target tokens of this source to val, rest to train. if src_written < val_target: val_buf.append(arr) else: arr.tofile(train_f) written_train += len(arr) src_written += len(arr) if src_written % 5_000_000 < len(arr): print(f" [{source_key}] {src_written/1e6:.1f}M / {target/1e6:.0f}M") train_f.close() val_arr = np.concatenate(val_buf) if val_buf else np.array([], dtype=DTYPE) val_arr.tofile(val_path) print(f"[data] train.bin : {written_train:,} tokens -> {train_path}") print(f"[data] val.bin : {len(val_arr):,} tokens -> {val_path}") print(f"[data] curriculum order preserved (sequential read = ascending quality)") if __name__ == "__main__": ap = argparse.ArgumentParser() ap.add_argument("--smoke", action="store_true", help="tiny build to test the pipeline") args = ap.parse_args() build(SMOKE_BUDGET if args.smoke else TOKEN_BUDGET)