File size: 5,176 Bytes

e82a88e

"""
Prepare İvme's pretraining data: stream the dense mix, tokenize, pack to disk.

Output is a flat uint16 memmap (vocab 16384 < 65536, so uint16 is exact). We
write documents in ASCENDING quality order so a sequential read during training
acts as a curriculum — the model sees noisier web text first and the densest
material (textbooks, then Wikipedia) last. Research shows this ordering plus a
moderate LR decay beats random shuffling for free.

The mix mirrors what made Archaea-74M punch so far above its weight, pushed a
little denser (more math, stricter web filter):

    FineWeb-HQ (score-gated web)   45%   ~710M tokens   [first / lowest density]
    Python stack (filtered)        10%   ~160M tokens
    FineMath-4+                    15%   ~235M tokens
    Cosmopedia (stanford+wikihow)  25%   ~395M tokens
    Wikipedia EN                    5%    ~80M tokens   [last / highest density]
                                   ----  -----------
                                   100%  ~1.57B tokens  (Chinchilla-optimal)

Usage:
    python prepare_data.py                 # full ~1.57B token build
    python prepare_data.py --smoke         # tiny build to test the pipeline
"""

from __future__ import annotations

import argparse
import os

import numpy as np

from huggingface_hub import login

# Authenticate with the Hugging Face Hub using a token supplied through the
# HF_TOKEN environment variable. A credential must never be hardcoded in
# source: anything committed to a repository has to be treated as leaked and
# revoked. When HF_TOKEN is unset the explicit login is skipped instead of
# falling back to an interactive prompt.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)

# Tokenizer definition to load, and the directory the packed .bin files go to.
TOKENIZER_PATH = "ivme_tokenizer.json"
OUT_DIR = "data"
# Element type of the packed token arrays written to disk.
DTYPE = np.uint16

# Per-source token targets as (source_key, target_tokens) pairs. The list is
# kept in ascending quality order, and build() writes sources in list order.
TOKEN_BUDGET = [
    ("fineweb_hq", 710_000_000),
    ("python", 160_000_000),
    ("finemath", 235_000_000),
    ("cosmopedia", 395_000_000),
    ("wikipedia", 80_000_000),
]
# Same sources in the same order, each cut down to 200k tokens for --smoke.
SMOKE_BUDGET = [(entry[0], 200_000) for entry in TOKEN_BUDGET]

# Validation hold-out size; build() splits it across the sources in
# proportion to each source's share of the total budget.
VAL_TOKENS = 2_000_000


def make_stream(source_key):
    """Open the streaming train split(s) backing one data source.

    Returns a ``(rows, text_field)`` pair: ``rows`` is an iterable of dataset
    rows and ``text_field`` names the row key that holds the document text.
    Raises ``ValueError`` carrying ``source_key`` when the key is not one of
    the known sources.
    """
    from datasets import load_dataset

    def open_train(repo, *config):
        # Every source is read lazily from its "train" split.
        return load_dataset(repo, *config, split="train", streaming=True)

    if source_key == "cosmopedia":
        from itertools import chain

        # Two dense subsets read back to back: stanford first, then wikihow.
        stanford = open_train("HuggingFaceTB/cosmopedia", "stanford")
        wikihow = open_train("HuggingFaceTB/cosmopedia", "wikihow")
        return chain(stanford, wikihow), "text"

    # (source_key, load_dataset positional args, text field) for the sources
    # backed by a single dataset.
    single_dataset_sources = (
        ("fineweb_hq", ("epfml/FineWeb-HQ",), "text"),
        ("finemath", ("HuggingFaceTB/finemath", "finemath-4plus"), "text"),
        ("python", ("bigcode/python-stack-v1-functions-filtered",), "content"),
        ("wikipedia", ("wikimedia/wikipedia", "20231101.en"), "text"),
    )
    for known_key, dataset_args, text_field in single_dataset_sources:
        if source_key == known_key:
            return open_train(*dataset_args), text_field

    raise ValueError(source_key)


def build(budget):
    """Tokenize every source in ``budget`` and pack the tokens to disk.

    Sources are processed in list order. Each document is encoded, terminated
    with the ``<|eos|>`` id and converted to a ``DTYPE`` array. Whole
    documents from the START of each source go to the validation set until
    that source's validation share is filled; the rest are appended to
    ``train.bin``. A source stops once it has contributed at least its target
    number of tokens, so the final document may overshoot the target slightly.

    Args:
        budget: list of ``(source_key, target_tokens)`` pairs, e.g.
            ``TOKEN_BUDGET`` or ``SMOKE_BUDGET``.

    Raises:
        ValueError: if the tokenizer has no ``<|eos|>`` token, if its
            vocabulary does not fit in ``DTYPE``, or if ``make_stream`` does
            not recognise a source key.
    """
    from tokenizers import Tokenizer

    os.makedirs(OUT_DIR, exist_ok=True)
    tok = Tokenizer.from_file(TOKENIZER_PATH)

    # token_to_id returns None for an unknown token; fail here with a clear
    # message instead of deep inside np.array on the first document.
    eos_id = tok.token_to_id("<|eos|>")
    if eos_id is None:
        raise ValueError(f"tokenizer {TOKENIZER_PATH!r} has no '<|eos|>' token")

    # The packed format is only exact when every token id fits in DTYPE.
    id_limit = int(np.iinfo(DTYPE).max)
    vocab_size = tok.get_vocab_size()
    if vocab_size > id_limit + 1:
        raise ValueError(
            f"tokenizer vocab ({vocab_size}) does not fit in {np.dtype(DTYPE).name}"
        )

    train_path = os.path.join(OUT_DIR, "train.bin")
    val_path = os.path.join(OUT_DIR, "val.bin")

    total_target = sum(n for _, n in budget)
    print(f"[data] target ~{total_target/1e6:.0f}M tokens across {len(budget)} sources")

    # Never hold out more than a tenth of the run for validation. The full
    # build is unaffected (VAL_TOKENS is far below that), but without the cap
    # a --smoke build sends every token to val and leaves train.bin empty.
    val_total = min(VAL_TOKENS, total_target // 10)

    val_buf = []  # small, held in memory
    written_train = 0

    # The context manager closes train.bin even if streaming or tokenizing
    # raises part-way through.
    with open(train_path, "wb") as train_f:
        for source_key, target in budget:
            stream, field = make_stream(source_key)
            src_written = 0
            # This source's proportional share of the validation hold-out.
            val_target = int(val_total * (target / total_target)) if total_target else 0
            print(f"[data] {source_key}: target {target/1e6:.0f}M (val {val_target/1e6:.2f}M)")

            for row in stream:
                # Stop before tokenizing a document that would be discarded.
                if src_written >= target:
                    break
                text = row.get(field)
                if not text:
                    continue
                ids = tok.encode(text).ids
                ids.append(eos_id)  # document boundary
                arr = np.array(ids, dtype=DTYPE)

                # The first val_target tokens of this source go to val, the
                # rest to train.
                if src_written < val_target:
                    val_buf.append(arr)
                else:
                    arr.tofile(train_f)
                    written_train += len(arr)
                src_written += len(arr)

                # Report roughly every 5M tokens: true only for the document
                # that carries the running total across a 5M boundary.
                if src_written % 5_000_000 < len(arr):
                    print(f"  [{source_key}] {src_written/1e6:.1f}M / {target/1e6:.0f}M")

            if src_written < target:
                # The stream ran dry early; report it rather than silently
                # producing a smaller mix than requested.
                print(
                    f"[data] WARNING: {source_key} exhausted at "
                    f"{src_written:,} of {target:,} tokens"
                )

    val_arr = np.concatenate(val_buf) if val_buf else np.array([], dtype=DTYPE)
    val_arr.tofile(val_path)

    print(f"[data] train.bin : {written_train:,} tokens -> {train_path}")
    print(f"[data] val.bin   : {len(val_arr):,} tokens -> {val_path}")
    print("[data] curriculum order preserved (sequential read = ascending quality)")


if __name__ == "__main__":
    # Command-line entry point: --smoke swaps in the tiny per-source budget.
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke", action="store_true", help="tiny build to test the pipeline")
    cli_args = parser.parse_args()
    selected_budget = SMOKE_BUDGET if cli_args.smoke else TOKEN_BUDGET
    build(selected_budget)