Upload tokenizer.py with huggingface_hub

f0169be verified 1 day ago

5.02 kB

	"""
	Train İvme's BPE tokenizer from scratch (16,384 vocab, English v1).

	We train on a balanced sample drawn from the same dense mix used for
	pretraining, so the tokenizer's merges reflect the actual data distribution
	(web text + textbooks + math + code). A from-scratch tokenizer matters at this
	scale: every wasted token in the vocab is embedding-table budget burned.

	Usage:
	python tokenizer.py --train # train and save ivme_tokenizer.json
	python tokenizer.py --test # quick round-trip check on saved tokenizer
	"""

	from __future__ import annotations

	import argparse
	import os

	# Avoid fork/threading crashes when the Rust tokenizer consumes Python data.
	os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

	VOCAB_SIZE = 16_384
	TOKENIZER_PATH = "ivme_tokenizer.json"

	# Truncate each document — subword merges are learned fine from prefixes, and
	# this keeps the trainer's in-memory word counts bounded (no segfaults).
	MAX_CHARS = 8_000

	# Special tokens. We keep a small, purposeful set: pad, bos, eos, and a small
	# bank of chat-control tokens reused later for instruct fine-tuning.
	SPECIAL_TOKENS = [
	"<\|pad\|>", "<\|bos\|>", "<\|eos\|>", "<\|unk\|>",
	"<\|user\|>", "<\|assistant\|>", "<\|system\|>",
	]

	# How many documents to sample per source for tokenizer training. A 16K vocab
	# needs very little data — ~130K docs total is plenty and keeps RAM bounded.
	SAMPLE_PER_SOURCE = {
	"fineweb_hq": 60_000,
	"cosmopedia": 30_000,
	"finemath": 20_000,
	"python": 20_000,
	"wikipedia": 20_000,
	}


	def text_iterator():
	"""Yield raw text strings sampled from each source in the dense mix."""
	from datasets import load_dataset

	def take(ds, n, field="text"):
	count = 0
	for row in ds:
	txt = row.get(field)
	if txt:
	yield txt[:MAX_CHARS]
	count += 1
	if count >= n:
	return

	print("[tok] streaming FineWeb-HQ ...")
	ds = load_dataset("epfml/FineWeb-HQ", split="train", streaming=True)
	yield from take(ds, SAMPLE_PER_SOURCE["fineweb_hq"])

	print("[tok] streaming Cosmopedia ...")
	ds = load_dataset("HuggingFaceTB/cosmopedia", "stanford", split="train", streaming=True)
	yield from take(ds, SAMPLE_PER_SOURCE["cosmopedia"])

	print("[tok] streaming FineMath ...")
	ds = load_dataset("HuggingFaceTB/finemath", "finemath-4plus", split="train", streaming=True)
	yield from take(ds, SAMPLE_PER_SOURCE["finemath"])

	print("[tok] streaming Python stack ...")
	ds = load_dataset("bigcode/python-stack-v1-functions-filtered", split="train", streaming=True)
	yield from take(ds, SAMPLE_PER_SOURCE["python"], field="content")

	print("[tok] streaming Wikipedia ...")
	ds = load_dataset("wikimedia/wikipedia", "20231101.en", split="train", streaming=True)
	yield from take(ds, SAMPLE_PER_SOURCE["wikipedia"])


	def train():
	from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

	tokenizer = Tokenizer(models.BPE(unk_token="<\|unk\|>"))
	# ByteLevel pre-tokenizer: no out-of-vocab characters ever, GPT-2 style.
	tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
	tokenizer.decoder = decoders.ByteLevel()

	trainer = trainers.BpeTrainer(
	vocab_size=VOCAB_SIZE,
	special_tokens=SPECIAL_TOKENS,
	min_frequency=2,
	show_progress=True,
	)

	# Collect ALL text into memory first, in the main thread. This is the fix
	# for the segfault: train_from_iterator consumes its input from Rust threads,
	# so doing network I/O / dataset loading lazily mid-iteration crashes. By
	# fully materializing first, every download happens here, safely.
	print("[tok] collecting corpus into memory (this is where downloads happen)...")
	texts = list(text_iterator())
	print(f"[tok] collected {len(texts):,} documents")

	print(f"[tok] training BPE to {VOCAB_SIZE:,} tokens ...")
	tokenizer.train_from_iterator(texts, trainer=trainer, length=len(texts))
	tokenizer.save(TOKENIZER_PATH)
	print(f"[tok] saved -> {TOKENIZER_PATH} (vocab {tokenizer.get_vocab_size():,})")


	def test():
	from tokenizers import Tokenizer

	tok = Tokenizer.from_file(TOKENIZER_PATH)
	samples = [
	"İvme is a stupidly small language model.",
	"def fibonacci(n): return n if n < 2 else fibonacci(n-1) + fibonacci(n-2)",
	"The derivative of x^2 is 2x.",
	]
	for s in samples:
	ids = tok.encode(s).ids
	back = tok.decode(ids)
	print(f"\n text : {s}")
	print(f" tokens : {len(ids)} ({len(s)/max(1,len(ids)):.2f} chars/token)")
	print(f" decoded: {back}")


	if __name__ == "__main__":
	ap = argparse.ArgumentParser()
	ap.add_argument("--train", action="store_true")
	ap.add_argument("--test", action="store_true")
	args = ap.parse_args()
	if args.train:
	train()
	elif args.test:
	test()
	else:
	print("pass --train or --test")