"""
Prepare İvme's pretraining data: stream the dense mix, tokenize, pack to disk.

Output is a pair of flat uint16 files (train.bin and val.bin) that can be
opened as a memmap (vocab 16384 < 65536, so uint16 is exact). We write
documents in ASCENDING quality order so a sequential read during training
acts as a curriculum — the model sees noisier web text first and the densest
material (textbooks, then Wikipedia) last. Research shows this ordering plus a
moderate LR decay beats random shuffling for free.

The mix mirrors what made Archaea-74M punch so far above its weight, pushed a
little denser (more math, stricter web filter):

    FineWeb-HQ (score-gated web)     45%   ~710M tokens   [first / lowest density]
    Python stack (filtered)          10%   ~160M tokens
    FineMath-4+                      15%   ~235M tokens
    Cosmopedia (stanford+wikihow)    25%   ~395M tokens
    Wikipedia EN                      5%    ~80M tokens   [last / highest density]
                                    ----   ------------
                                    100%   ~1.57B tokens  (Chinchilla-optimal)

Usage:
    python prepare_data.py            # full ~1.57B token build
    python prepare_data.py --smoke    # tiny build to test the pipeline
"""
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import os |
|
|
| import numpy as np |
|
|
| from huggingface_hub import login |
|
|
# SECURITY: credentials must never live in source control. The access token is
# read from the HF_TOKEN environment variable instead; any token that was ever
# committed to this file should be treated as leaked and revoked.
# When HF_TOKEN is unset we skip the explicit login rather than calling
# login() without a token, so a non-interactive run does not block on a prompt.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
|
|
# Tokenizer file loaded by build(), and the directory that receives the
# train.bin / val.bin outputs.
TOKENIZER_PATH = "ivme_tokenizer.json"
OUT_DIR = "data"

# On-disk dtype for token ids; uint16 can represent ids 0..65535.
DTYPE = np.uint16

# Overall validation-token target; build() apportions it across the sources
# according to each source's share of the total budget.
VAL_TOKENS = 2_000_000

# (source key, token target) pairs, listed in the order build() consumes them.
TOKEN_BUDGET = [
    ("fineweb_hq", 710_000_000),
    ("python", 160_000_000),
    ("finemath", 235_000_000),
    ("cosmopedia", 395_000_000),
    ("wikipedia", 80_000_000),
]

# Same sources in the same order, each limited to 200k tokens (used by --smoke).
SMOKE_BUDGET = [(source, 200_000) for source, _full_target in TOKEN_BUDGET]
|
|
|
|
def make_stream(source_key):
    """Open the streaming dataset(s) behind ``source_key``.

    Args:
        source_key: One of ``"fineweb_hq"``, ``"python"``, ``"finemath"``,
            ``"cosmopedia"`` or ``"wikipedia"``.

    Returns:
        A ``(rows, field)`` tuple: ``rows`` is an iterable of dataset rows and
        ``field`` is the name of the row key that holds the document text.

    Raises:
        ValueError: If ``source_key`` is not one of the known sources. The key
            is validated before ``datasets`` is imported, so a typo fails fast.
    """
    # source key -> ((repo id, config name or None), ...), text field
    sources = {
        "fineweb_hq": ((("epfml/FineWeb-HQ", None),), "text"),
        "cosmopedia": (
            (
                ("HuggingFaceTB/cosmopedia", "stanford"),
                ("HuggingFaceTB/cosmopedia", "wikihow"),
            ),
            "text",
        ),
        "finemath": ((("HuggingFaceTB/finemath", "finemath-4plus"),), "text"),
        "python": ((("bigcode/python-stack-v1-functions-filtered", None),), "content"),
        "wikipedia": ((("wikimedia/wikipedia", "20231101.en"),), "text"),
    }

    spec = sources.get(source_key) if isinstance(source_key, str) else None
    if spec is None:
        raise ValueError(
            f"unknown source {source_key!r}; expected one of {sorted(sources)}"
        )
    parts, field = spec

    # Imported lazily so the module can be imported without `datasets`.
    from itertools import chain

    from datasets import load_dataset

    streams = [
        load_dataset(repo, name=config, split="train", streaming=True)
        for repo, config in parts
    ]
    if len(streams) == 1:
        return streams[0], field

    # chain() drains the first subset completely before the next one starts,
    # so a caller that stops early may never reach the later subsets.
    # NOTE(review): confirm that sequential (not interleaved) order is intended.
    return chain(*streams), field
|
|
|
|
def build(budget, *, max_val_fraction=0.1):
    """Tokenize each source in ``budget`` and write ``train.bin`` / ``val.bin``.

    Sources are processed in the order given. For each source, the leading
    documents fill that source's validation quota (buffered in memory) and the
    remaining ones are appended to ``train.bin`` until the source's token
    target is reached. Every document is encoded, terminated with the
    ``<|eos|>`` id and stored as ``DTYPE``. Documents are never split, so a
    source may overshoot its target by at most one document.

    Args:
        budget: Sequence of ``(source_key, token_target)`` pairs.
        max_val_fraction: Upper bound on the share of a source's target that
            may be diverted to validation. Without this cap a small budget
            (e.g. the smoke build) gets a validation quota larger than the
            source target, which leaves ``train.bin`` empty.

    Raises:
        ValueError: If the tokenizer has no ``<|eos|>`` token, or if its
            vocabulary does not fit in ``DTYPE``.
    """
    from tokenizers import Tokenizer

    os.makedirs(OUT_DIR, exist_ok=True)
    tok = Tokenizer.from_file(TOKENIZER_PATH)

    eos_id = tok.token_to_id("<|eos|>")
    if eos_id is None:
        # Fail here with a clear message instead of deep inside np.array().
        raise ValueError(f"tokenizer {TOKENIZER_PATH!r} has no '<|eos|>' token")

    # Ids at or above this limit would not be representable in DTYPE.
    id_limit = int(np.iinfo(DTYPE).max) + 1
    vocab_size = tok.get_vocab_size()
    if vocab_size > id_limit:
        raise ValueError(
            f"tokenizer vocab size {vocab_size} does not fit in "
            f"{np.dtype(DTYPE).name} (max {id_limit} ids)"
        )

    train_path = os.path.join(OUT_DIR, "train.bin")
    val_path = os.path.join(OUT_DIR, "val.bin")

    total_target = sum(n for _, n in budget)
    print(f"[data] target ~{total_target/1e6:.0f}M tokens across {len(budget)} sources")

    val_buf = []
    written_train = 0

    # The context manager closes train.bin even if a stream or the tokenizer
    # raises part-way through the build.
    with open(train_path, "wb") as train_f:
        for source_key, target in budget:
            stream, field = make_stream(source_key)
            src_written = 0

            # Validation quota: this source's proportional share of VAL_TOKENS,
            # capped so validation can never swallow the whole source.
            val_share = int(VAL_TOKENS * (target / total_target)) if total_target else 0
            val_target = min(val_share, int(target * max_val_fraction))
            print(f"[data] {source_key}: target {target/1e6:.0f}M (val {val_target/1e6:.2f}M)")

            for row in stream:
                # Check the budget before tokenizing so no work is thrown away.
                if src_written >= target:
                    break

                text = row.get(field)
                if not text:
                    continue

                ids = tok.encode(text).ids
                ids.append(eos_id)
                arr = np.array(ids, dtype=DTYPE)

                if src_written < val_target:
                    val_buf.append(arr)
                else:
                    arr.tofile(train_f)
                    written_train += len(arr)
                src_written += len(arr)

                # True only for the document that crosses a 5M-token boundary.
                if src_written % 5_000_000 < len(arr):
                    print(f"  [{source_key}] {src_written/1e6:.1f}M / {target/1e6:.0f}M")

            if src_written < target:
                print(
                    f"[data] WARNING: {source_key} exhausted at "
                    f"{src_written:,} of {target:,} tokens"
                )

    val_arr = np.concatenate(val_buf) if val_buf else np.array([], dtype=DTYPE)
    val_arr.tofile(val_path)

    print(f"[data] train.bin : {written_train:,} tokens -> {train_path}")
    print(f"[data] val.bin   : {len(val_arr):,} tokens -> {val_path}")
    print("[data] curriculum order preserved (sequential read = ascending quality)")
|
|
|
|
# Script entry point: run the full build, or the tiny one when --smoke is given.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke",
        action="store_true",
        help="tiny build to test the pipeline",
    )
    cli = parser.parse_args()

    if cli.smoke:
        build(SMOKE_BUDGET)
    else:
        build(TOKEN_BUDGET)
|
|