Upload prepare_data.py with huggingface_hub

e82a88e verified 1 day ago

5.18 kB

	"""
	Prepare İvme's pretraining data: stream the dense mix, tokenize, pack to disk.

	Output is a flat uint16 memmap (vocab 16384 < 65536, so uint16 is exact). We
	write documents in ASCENDING quality order so a sequential read during training
	acts as a curriculum — the model sees noisier web text first and the densest
	material (textbooks, then Wikipedia) last. Research shows this ordering plus a
	moderate LR decay beats random shuffling for free.

	The mix mirrors what made Archaea-74M punch so far above its weight, pushed a
	little denser (more math, stricter web filter):

	    FineWeb-HQ (score-gated web)    45%   ~710M tokens   [first / lowest density]
	    Python stack (filtered)         10%   ~160M tokens
	    FineMath-4+                     15%   ~235M tokens
	    Cosmopedia (stanford+wikihow)   25%   ~395M tokens
	    Wikipedia EN                     5%    ~80M tokens   [last / highest density]
	                                   ----   -------------
	                                   100%   ~1.57B tokens  (Chinchilla-optimal)

	Usage:
	python prepare_data.py # full ~1.57B token build
	python prepare_data.py --smoke # tiny build to test the pipeline
	"""

	from __future__ import annotations

	import argparse
	import os

	import numpy as np

	from huggingface_hub import login

	# Authenticate with the Hugging Face Hub using a token supplied through the
	# HF_TOKEN environment variable. Credentials must never be committed to
	# source control; when the variable is unset the login step is skipped.
	_hf_token = os.environ.get("HF_TOKEN")
	if _hf_token:
	    login(token=_hf_token)

	# Tokenizer file to load, directory for the output .bin files, and the
	# integer dtype used for every token written to disk.
	TOKENIZER_PATH = "ivme_tokenizer.json"
	OUT_DIR = "data"
	DTYPE = np.uint16

	# Per-source token targets as (source_key, target_tokens) pairs. The list
	# order is the write order: lowest-quality source first, highest last.
	TOKEN_BUDGET = [
	    ("fineweb_hq", 710_000_000),
	    ("python", 160_000_000),
	    ("finemath", 235_000_000),
	    ("cosmopedia", 395_000_000),
	    ("wikipedia", 80_000_000),
	]

	# Same sources in the same order, each with a tiny fixed target, for
	# pipeline smoke tests.
	_SMOKE_TOKENS_PER_SOURCE = 200_000
	SMOKE_BUDGET = [
	    (source_name, _SMOKE_TOKENS_PER_SOURCE) for source_name, _target in TOKEN_BUDGET
	]

	# Total validation tokens, split across sources in proportion to their
	# targets and taken from the first documents read from each source.
	VAL_TOKENS = 2_000_000


	def make_stream(source_key):
	    """Open the streaming dataset(s) behind ``source_key``.

	    Args:
	        source_key: One of "fineweb_hq", "cosmopedia", "finemath",
	            "python" or "wikipedia".

	    Returns:
	        A ``(rows, field)`` tuple: ``rows`` is an iterable of dataset rows
	        and ``field`` is the name of the row field that holds the text.

	    Raises:
	        ValueError: If ``source_key`` is not one of the known sources.
	    """
	    from datasets import load_dataset

	    def _train_stream(*dataset_args):
	        # Every source is read from its "train" split in streaming mode.
	        return load_dataset(*dataset_args, split="train", streaming=True)

	    if source_key == "fineweb_hq":
	        return _train_stream("epfml/FineWeb-HQ"), "text"

	    if source_key == "cosmopedia":
	        from itertools import chain

	        # Two subsets read back to back: "stanford" first, then "wikihow".
	        subsets = [
	            _train_stream("HuggingFaceTB/cosmopedia", subset_name)
	            for subset_name in ("stanford", "wikihow")
	        ]
	        return chain(*subsets), "text"

	    if source_key == "finemath":
	        return _train_stream("HuggingFaceTB/finemath", "finemath-4plus"), "text"

	    if source_key == "python":
	        # This dataset keeps its text under "content" rather than "text".
	        return _train_stream("bigcode/python-stack-v1-functions-filtered"), "content"

	    if source_key == "wikipedia":
	        return _train_stream("wikimedia/wikipedia", "20231101.en"), "text"

	    raise ValueError(source_key)


	def build(budget, max_val_fraction=0.1):
	    """Tokenize every source in ``budget`` and write train.bin / val.bin.

	    Sources are processed in the order given, and each document is encoded,
	    terminated with the EOS token id and stored as ``DTYPE``. For each source
	    the first documents read fill that source's validation quota; everything
	    after that is appended to ``train.bin`` until the source's target is
	    reached. Validation tokens count towards the source's target.

	    Args:
	        budget: Sequence of ``(source_key, target_tokens)`` pairs, in write
	            order.
	        max_val_fraction: Upper bound on the validation set as a fraction of
	            the total token target. The validation budget is
	            ``min(VAL_TOKENS, total_target * max_val_fraction)``, so a small
	            build (e.g. the smoke budget) still produces training tokens
	            instead of routing every document to validation.

	    Raises:
	        ValueError: If the tokenizer has no EOS token, or its vocabulary does
	            not fit in ``DTYPE``.
	    """
	    from tokenizers import Tokenizer

	    os.makedirs(OUT_DIR, exist_ok=True)
	    tok = Tokenizer.from_file(TOKENIZER_PATH)

	    eos_token = "<|eos|>"
	    eos_id = tok.token_to_id(eos_token)
	    if eos_id is None:
	        # Fail here with a clear message rather than later inside np.array.
	        raise ValueError(f"tokenizer {TOKENIZER_PATH!r} has no {eos_token!r} token")

	    # Ids at or above this limit cannot be represented in DTYPE.
	    id_limit = int(np.iinfo(DTYPE).max) + 1
	    vocab_size = tok.get_vocab_size()
	    if vocab_size > id_limit:
	        raise ValueError(
	            f"vocab size {vocab_size} does not fit in {np.dtype(DTYPE).name} "
	            f"(max {id_limit} ids)"
	        )

	    train_path = os.path.join(OUT_DIR, "train.bin")
	    val_path = os.path.join(OUT_DIR, "val.bin")

	    total_target = sum(n for _, n in budget)
	    print(f"[data] target ~{total_target/1e6:.0f}M tokens across {len(budget)} sources")

	    # Cap validation so it can never consume a whole source's target.
	    val_total = min(VAL_TOKENS, int(total_target * max_val_fraction))

	    val_buf = []  # small, held in memory
	    written_train = 0

	    # The context manager closes train.bin even if a stream or the tokenizer fails.
	    with open(train_path, "wb") as train_f:
	        for source_key, target in budget:
	            stream, field = make_stream(source_key)
	            src_written = 0
	            # This source's share of the validation budget; guard against an
	            # all-zero budget to avoid dividing by zero.
	            if total_target > 0:
	                val_target = int(val_total * (target / total_target))
	            else:
	                val_target = 0
	            print(f"[data] {source_key}: target {target/1e6:.0f}M (val {val_target/1e6:.2f}M)")

	            for row in stream:
	                # Stop before tokenizing a document that would be discarded.
	                if src_written >= target:
	                    break
	                text = row.get(field)
	                if not text:
	                    continue
	                ids = tok.encode(text).ids
	                ids.append(eos_id)  # document boundary
	                arr = np.array(ids, dtype=DTYPE)

	                # The first val_target tokens of this source go to val, the rest to train.
	                if src_written < val_target:
	                    val_buf.append(arr)
	                else:
	                    arr.tofile(train_f)
	                    written_train += len(arr)
	                src_written += len(arr)

	                # Report progress each time a 5M-token boundary is crossed.
	                if src_written % 5_000_000 < len(arr):
	                    print(f" [{source_key}] {src_written/1e6:.1f}M / {target/1e6:.0f}M")

	            if src_written < target:
	                # The stream ran out before the target was met; say so.
	                print(
	                    f"[data] WARNING {source_key}: stream exhausted at "
	                    f"{src_written:,} of {target:,} tokens"
	                )

	    val_arr = np.concatenate(val_buf) if val_buf else np.array([], dtype=DTYPE)
	    val_arr.tofile(val_path)

	    print(f"[data] train.bin : {written_train:,} tokens -> {train_path}")
	    print(f"[data] val.bin : {len(val_arr):,} tokens -> {val_path}")
	    print("[data] curriculum order preserved (sequential read = ascending quality)")


	if __name__ == "__main__":
	    # Command-line entry point: --smoke selects the tiny budget, otherwise
	    # the full token budget is built.
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--smoke", action="store_true", help="tiny build to test the pipeline")
	    cli_args = parser.parse_args()
	    if cli_args.smoke:
	        build(SMOKE_BUDGET)
	    else:
	        build(TOKEN_BUDGET)