""" Train İvme's BPE tokenizer from scratch (16,384 vocab, English v1). We train on a balanced sample drawn from the same dense mix used for pretraining, so the tokenizer's merges reflect the actual data distribution (web text + textbooks + math + code). A from-scratch tokenizer matters at this scale: every wasted token in the vocab is embedding-table budget burned. Usage: python tokenizer.py --train # train and save ivme_tokenizer.json python tokenizer.py --test # quick round-trip check on saved tokenizer """ from __future__ import annotations import argparse import os # Avoid fork/threading crashes when the Rust tokenizer consumes Python data. os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") VOCAB_SIZE = 16_384 TOKENIZER_PATH = "ivme_tokenizer.json" # Truncate each document — subword merges are learned fine from prefixes, and # this keeps the trainer's in-memory word counts bounded (no segfaults). MAX_CHARS = 8_000 # Special tokens. We keep a small, purposeful set: pad, bos, eos, and a small # bank of chat-control tokens reused later for instruct fine-tuning. SPECIAL_TOKENS = [ "<|pad|>", "<|bos|>", "<|eos|>", "<|unk|>", "<|user|>", "<|assistant|>", "<|system|>", ] # How many documents to sample per source for tokenizer training. A 16K vocab # needs very little data — ~130K docs total is plenty and keeps RAM bounded. SAMPLE_PER_SOURCE = { "fineweb_hq": 60_000, "cosmopedia": 30_000, "finemath": 20_000, "python": 20_000, "wikipedia": 20_000, } def text_iterator(): """Yield raw text strings sampled from each source in the dense mix.""" from datasets import load_dataset def take(ds, n, field="text"): count = 0 for row in ds: txt = row.get(field) if txt: yield txt[:MAX_CHARS] count += 1 if count >= n: return print("[tok] streaming FineWeb-HQ ...") ds = load_dataset("epfml/FineWeb-HQ", split="train", streaming=True) yield from take(ds, SAMPLE_PER_SOURCE["fineweb_hq"]) print("[tok] streaming Cosmopedia ...") ds = load_dataset("HuggingFaceTB/cosmopedia", "stanford", split="train", streaming=True) yield from take(ds, SAMPLE_PER_SOURCE["cosmopedia"]) print("[tok] streaming FineMath ...") ds = load_dataset("HuggingFaceTB/finemath", "finemath-4plus", split="train", streaming=True) yield from take(ds, SAMPLE_PER_SOURCE["finemath"]) print("[tok] streaming Python stack ...") ds = load_dataset("bigcode/python-stack-v1-functions-filtered", split="train", streaming=True) yield from take(ds, SAMPLE_PER_SOURCE["python"], field="content") print("[tok] streaming Wikipedia ...") ds = load_dataset("wikimedia/wikipedia", "20231101.en", split="train", streaming=True) yield from take(ds, SAMPLE_PER_SOURCE["wikipedia"]) def train(): from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders tokenizer = Tokenizer(models.BPE(unk_token="<|unk|>")) # ByteLevel pre-tokenizer: no out-of-vocab characters ever, GPT-2 style. tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False) tokenizer.decoder = decoders.ByteLevel() trainer = trainers.BpeTrainer( vocab_size=VOCAB_SIZE, special_tokens=SPECIAL_TOKENS, min_frequency=2, show_progress=True, ) # Collect ALL text into memory first, in the main thread. This is the fix # for the segfault: train_from_iterator consumes its input from Rust threads, # so doing network I/O / dataset loading lazily mid-iteration crashes. By # fully materializing first, every download happens here, safely. print("[tok] collecting corpus into memory (this is where downloads happen)...") texts = list(text_iterator()) print(f"[tok] collected {len(texts):,} documents") print(f"[tok] training BPE to {VOCAB_SIZE:,} tokens ...") tokenizer.train_from_iterator(texts, trainer=trainer, length=len(texts)) tokenizer.save(TOKENIZER_PATH) print(f"[tok] saved -> {TOKENIZER_PATH} (vocab {tokenizer.get_vocab_size():,})") def test(): from tokenizers import Tokenizer tok = Tokenizer.from_file(TOKENIZER_PATH) samples = [ "İvme is a stupidly small language model.", "def fibonacci(n): return n if n < 2 else fibonacci(n-1) + fibonacci(n-2)", "The derivative of x^2 is 2x.", ] for s in samples: ids = tok.encode(s).ids back = tok.decode(ids) print(f"\n text : {s}") print(f" tokens : {len(ids)} ({len(s)/max(1,len(ids)):.2f} chars/token)") print(f" decoded: {back}") if __name__ == "__main__": ap = argparse.ArgumentParser() ap.add_argument("--train", action="store_true") ap.add_argument("--test", action="store_true") args = ap.parse_args() if args.train: train() elif args.test: test() else: print("pass --train or --test")