Ivme-Conversate-22M-Base / prepare_data.py
ereniko's picture
Upload prepare_data.py with huggingface_hub
e82a88e verified
"""
Prepare İvme's pretraining data: stream the dense mix, tokenize, pack to disk.
Output is a flat uint16 memmap (vocab 16384 < 65536, so uint16 is exact). We
write documents in ASCENDING quality order so a sequential read during training
acts as a curriculum — the model sees noisier web text first and the densest
material (textbooks, then Wikipedia) last. Research shows this ordering plus a
moderate LR decay beats random shuffling for free.
The mix mirrors what made Archaea-74M punch so far above its weight, pushed a
little denser (more math, stricter web filter):
    FineWeb-HQ (score-gated web)     45%   ~710M tokens   [first / lowest density]
    Python stack (filtered)          10%   ~160M tokens
    FineMath-4+                      15%   ~235M tokens
    Cosmopedia (stanford+wikihow)    25%   ~395M tokens
    Wikipedia EN                      5%    ~80M tokens   [last / highest density]
                                    ----   -----------
                                    100%   ~1.57B tokens  (Chinchilla-optimal)
Usage:
python prepare_data.py # full ~1.57B token build
python prepare_data.py --smoke # tiny build to test the pipeline
"""
from __future__ import annotations
import argparse
import os
import numpy as np
from huggingface_hub import login
# Authenticate with the Hugging Face Hub using a token taken from the
# environment instead of the source file.
# SECURITY: an access token used to be hard-coded on this line. Any token that
# has been committed or uploaded must be treated as leaked and revoked in the
# Hub account settings.
# When HF_TOKEN is unset we skip the explicit login and rely on whatever
# credentials are already available to huggingface_hub, or on anonymous access.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
# Tokenizer file read by build(), and the directory the packed .bin files go to.
TOKENIZER_PATH = "ivme_tokenizer.json"
OUT_DIR = "data"

# Element type used for token ids, both in memory and on disk.
DTYPE = np.uint16

# Per-source token targets as (source_key, target_tokens) pairs, listed in
# ASCENDING quality order. build() writes the sources in exactly this order.
TOKEN_BUDGET = [
    ("fineweb_hq", 710_000_000),
    ("python", 160_000_000),
    ("finemath", 235_000_000),
    ("cosmopedia", 395_000_000),
    ("wikipedia", 80_000_000),
]

# Smoke-test variant: the same sources in the same order, with a tiny target each.
_SMOKE_TOKENS_PER_SOURCE = 200_000
SMOKE_BUDGET = [
    (source, _SMOKE_TOKENS_PER_SOURCE) for source, _full_target in TOKEN_BUDGET
]

# Total validation tokens, split across sources in proportion to their budget.
# build() takes each source's share from the documents it reads first.
VAL_TOKENS = 2_000_000
def make_stream(source_key):
    """Open the streaming dataset(s) behind ``source_key``.

    Returns a ``(rows, text_field)`` pair: ``rows`` is an iterable of dataset
    rows and ``text_field`` is the name of the column that holds the raw text
    of each row.

    Raises ``ValueError`` (carrying the key) when ``source_key`` is not one of
    the known sources.
    """
    from datasets import load_dataset
    from itertools import chain

    # (source key, text column, positional load_dataset arguments per subset),
    # checked in order. Every subset is opened as a streamed "train" split.
    recipes = (
        ("fineweb_hq", "text", (("epfml/FineWeb-HQ",),)),
        # Two dense subsets concatenated.
        (
            "cosmopedia",
            "text",
            (
                ("HuggingFaceTB/cosmopedia", "stanford"),
                ("HuggingFaceTB/cosmopedia", "wikihow"),
            ),
        ),
        ("finemath", "text", (("HuggingFaceTB/finemath", "finemath-4plus"),)),
        ("python", "content", (("bigcode/python-stack-v1-functions-filtered",),)),
        ("wikipedia", "text", (("wikimedia/wikipedia", "20231101.en"),)),
    )

    for known_key, text_field, subsets in recipes:
        if source_key == known_key:
            opened = [
                load_dataset(*positional, split="train", streaming=True)
                for positional in subsets
            ]
            # A single subset is handed back as-is; several are read back to back.
            rows = opened[0] if len(opened) == 1 else chain(*opened)
            return rows, text_field

    raise ValueError(source_key)
def build(budget, *, max_val_fraction=0.1):
    """Tokenize every source in ``budget`` and pack the token ids to disk.

    Sources are processed in the order given, so ``train.bin`` keeps that order.
    Each document is encoded, terminated with the ``<|eos|>`` id and written as
    raw ``DTYPE`` values. The first documents of each source go to ``val.bin``
    until that source's validation quota is filled; the rest go to
    ``train.bin``. Validation tokens count towards the source's target.

    Args:
        budget: sequence of ``(source_key, target_tokens)`` pairs.
        max_val_fraction: upper bound on the share of a single source's target
            that may be diverted to validation. Without this cap a small budget
            (e.g. the smoke build) has a validation quota larger than the whole
            source budget, which left ``train.bin`` empty.

    Raises:
        ValueError: if ``max_val_fraction`` is outside ``[0, 1]``, the tokenizer
            has no ``<|eos|>`` token, or its vocabulary does not fit ``DTYPE``.
    """
    from tokenizers import Tokenizer

    if not 0.0 <= max_val_fraction <= 1.0:
        raise ValueError(f"max_val_fraction must be in [0, 1], got {max_val_fraction!r}")

    os.makedirs(OUT_DIR, exist_ok=True)
    tok = Tokenizer.from_file(TOKENIZER_PATH)

    # Fail fast: a missing EOS token would otherwise surface as an opaque
    # conversion error on the first document.
    eos_id = tok.token_to_id("<|eos|>")
    if eos_id is None:
        raise ValueError(f"tokenizer {TOKENIZER_PATH!r} has no '<|eos|>' token")

    # Every token id must be representable in the on-disk dtype.
    max_id = int(np.iinfo(DTYPE).max)
    vocab_size = tok.get_vocab_size()
    if vocab_size > max_id + 1:
        raise ValueError(
            f"vocab size {vocab_size} does not fit in {np.dtype(DTYPE).name} (max id {max_id})"
        )

    train_path = os.path.join(OUT_DIR, "train.bin")
    val_path = os.path.join(OUT_DIR, "val.bin")
    total_target = sum(n for _, n in budget)
    print(f"[data] target ~{total_target/1e6:.0f}M tokens across {len(budget)} sources")

    val_buf = []  # small, held in memory
    written_train = 0

    # The context manager closes train.bin even if a stream or the tokenizer fails.
    with open(train_path, "wb") as train_f:
        for source_key, target in budget:
            stream, field = make_stream(source_key)
            src_written = 0

            # Validation quota: this source's proportional share of VAL_TOKENS,
            # capped so it can never swallow the whole source budget.
            share = target / total_target if total_target else 0.0
            val_target = min(int(VAL_TOKENS * share), int(target * max_val_fraction))
            print(f"[data] {source_key}: target {target/1e6:.0f}M (val {val_target/1e6:.2f}M)")

            for row in stream:
                # Stop before tokenizing once this source's budget is met.
                if src_written >= target:
                    break
                text = row.get(field)
                if not text:
                    continue
                ids = tok.encode(text).ids
                ids.append(eos_id)  # document boundary
                arr = np.array(ids, dtype=DTYPE)
                # The first val_target tokens of this source go to val, the rest to train.
                if src_written < val_target:
                    val_buf.append(arr)
                else:
                    arr.tofile(train_f)
                    written_train += len(arr)
                src_written += len(arr)
                # Report roughly every 5M tokens (whenever a 5M boundary is crossed).
                if src_written % 5_000_000 < len(arr):
                    print(f" [{source_key}] {src_written/1e6:.1f}M / {target/1e6:.0f}M")

            if src_written < target:
                # The stream ran dry before the budget was met; make that visible.
                print(f"[data] WARNING {source_key}: stream exhausted at {src_written:,} / {target:,} tokens")

    val_arr = np.concatenate(val_buf) if val_buf else np.array([], dtype=DTYPE)
    val_arr.tofile(val_path)
    print(f"[data] train.bin : {written_train:,} tokens -> {train_path}")
    print(f"[data] val.bin : {len(val_arr):,} tokens -> {val_path}")
    print("[data] curriculum order preserved (sequential read = ascending quality)")
if __name__ == "__main__":
    # Command-line entry point: --smoke swaps the full budget for the tiny one.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke",
        action="store_true",
        help="tiny build to test the pipeline",
    )
    cli = parser.parse_args()
    if cli.smoke:
        build(SMOKE_BUDGET)
    else:
        build(TOKEN_BUDGET)