| """ |
| Train İvme's BPE tokenizer from scratch (16,384 vocab, English v1). |
| |
| We train on a balanced sample drawn from the same dense mix used for |
| pretraining, so the tokenizer's merges reflect the actual data distribution |
| (web text + textbooks + math + code). A from-scratch tokenizer matters at this |
| scale: every wasted token in the vocab is embedding-table budget burned. |
| |
| Usage: |
| python tokenizer.py --train # train and save ivme_tokenizer.json |
| python tokenizer.py --test # quick round-trip check on saved tokenizer |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import os |
|
|
| |
# Default the tokenizers library's parallelism switch to "false". setdefault
# leaves any value the caller has already exported untouched.
# NOTE(review): presumably here to avoid the fork/parallelism warning the
# tokenizers library prints -- confirm.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Target vocabulary size handed to the BPE trainer (2**14 = 16,384).
VOCAB_SIZE = 2**14

# File the trained tokenizer is saved to by train() and loaded from by test().
TOKENIZER_PATH = "ivme_tokenizer.json"

# Per-document cap: text_iterator() keeps only the first MAX_CHARS characters
# of every sampled document.
MAX_CHARS = 8_000

# Special tokens registered with the trainer, in this order. "<|unk|>" is also
# the unk_token of the BPE model built in train().
# NOTE(review): token IDs presumably follow list order, so do not reorder
# without retraining -- confirm against the tokenizers docs.
SPECIAL_TOKENS = [
    "<|pad|>",
    "<|bos|>",
    "<|eos|>",
    "<|unk|>",
    "<|user|>",
    "<|assistant|>",
    "<|system|>",
]

# Number of documents text_iterator() draws from each source (150,000 total).
SAMPLE_PER_SOURCE = dict(
    fineweb_hq=60_000,
    cosmopedia=30_000,
    finemath=20_000,
    python=20_000,
    wikipedia=20_000,
)
|
|
|
|
def text_iterator():
    """Yield raw text strings sampled from each source in the dense mix.

    Sources are streamed one after another in a fixed order (FineWeb-HQ,
    Cosmopedia, FineMath, Python stack, Wikipedia). From each source, at most
    ``SAMPLE_PER_SOURCE[key]`` documents with a non-empty text field are
    yielded, each cut to its first ``MAX_CHARS`` characters. Rows whose field
    is missing or empty are skipped and do not count toward the quota.

    This is a generator: a dataset is only opened once iteration reaches it,
    so nothing is fetched until the caller starts consuming.

    Yields:
        str: one (possibly truncated) document per iteration.
    """
    from itertools import islice

    from datasets import load_dataset

    def take(ds, n, field="text"):
        """Return an iterator over at most ``n`` non-empty, truncated ``field`` values."""
        values = (row.get(field) for row in ds)
        clipped = (value[:MAX_CHARS] for value in values if value)
        # islice enforces the quota *before* producing anything, so a quota of
        # 0 (a disabled source) yields no documents, and no row is pulled from
        # the stream beyond the one that fills the quota.
        return islice(clipped, max(n, 0))

    # (progress label, SAMPLE_PER_SOURCE key, positional load_dataset args, text field)
    sources = (
        ("FineWeb-HQ", "fineweb_hq", ("epfml/FineWeb-HQ",), "text"),
        ("Cosmopedia", "cosmopedia", ("HuggingFaceTB/cosmopedia", "stanford"), "text"),
        ("FineMath", "finemath", ("HuggingFaceTB/finemath", "finemath-4plus"), "text"),
        (
            "Python stack",
            "python",
            ("bigcode/python-stack-v1-functions-filtered",),
            "content",
        ),
        ("Wikipedia", "wikipedia", ("wikimedia/wikipedia", "20231101.en"), "text"),
    )

    for label, key, dataset_args, field in sources:
        print(f"[tok] streaming {label} ...")
        ds = load_dataset(*dataset_args, split="train", streaming=True)
        yield from take(ds, SAMPLE_PER_SOURCE[key], field=field)
|
|
|
|
def train() -> None:
    """Train the byte-level BPE tokenizer and save it to ``TOKENIZER_PATH``.

    Builds a BPE model with a ByteLevel pre-tokenizer/decoder pair, collects
    the sampled corpus from ``text_iterator()`` into a list, trains to
    ``VOCAB_SIZE`` entries (including ``SPECIAL_TOKENS``) and writes the
    result as JSON. Progress is reported on stdout.
    """
    from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers

    tokenizer = Tokenizer(models.BPE(unk_token="<|unk|>"))
    # Byte-level pre-tokenization without an injected leading space, paired
    # with the matching decoder so decode() maps byte symbols back to text.
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    trainer = trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=SPECIAL_TOKENS,
        min_frequency=2,
        show_progress=True,
        # Seed the vocabulary with all 256 byte-level symbols. Without this,
        # a byte that never occurs in the training sample has no vocabulary
        # entry and falls back to <|unk|> at encode time, which makes the
        # encode/decode round trip lossy.
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )

    # The sample is fully materialised before training starts; this is also
    # what lets the exact document count be passed as ``length`` below.
    print("[tok] collecting corpus into memory (this is where downloads happen)...")
    texts = list(text_iterator())
    print(f"[tok] collected {len(texts):,} documents")

    print(f"[tok] training BPE to {VOCAB_SIZE:,} tokens ...")
    tokenizer.train_from_iterator(texts, trainer=trainer, length=len(texts))
    tokenizer.save(TOKENIZER_PATH)
    print(f"[tok] saved -> {TOKENIZER_PATH} (vocab {tokenizer.get_vocab_size():,})")
|
|
|
|
def test() -> None:
    """Load the saved tokenizer and print an encode/decode round trip.

    For each built-in sample (prose with a non-ASCII letter, a line of Python,
    a short math sentence) this prints the original text, the token count with
    the resulting characters-per-token ratio, and the decoded text so the two
    can be compared by eye. Nothing is asserted.
    """
    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
    probes = (
        "İvme is a stupidly small language model.",
        "def fibonacci(n): return n if n < 2 else fibonacci(n-1) + fibonacci(n-2)",
        "The derivative of x^2 is 2x.",
    )
    for text in probes:
        token_ids = tokenizer.encode(text).ids
        decoded = tokenizer.decode(token_ids)
        n_tokens = len(token_ids)
        # max(1, ...) guards the ratio against an empty encoding.
        chars_per_token = len(text) / max(1, n_tokens)
        print(f"\n text : {text}")
        print(f" tokens : {n_tokens} ({chars_per_token:.2f} chars/token)")
        print(f" decoded: {decoded}")
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: --train builds and saves the tokenizer,
    # --test runs the round-trip printout on the saved file.
    parser = argparse.ArgumentParser()
    for flag in ("--train", "--test"):
        parser.add_argument(flag, action="store_true")
    cli = parser.parse_args()

    # --train takes precedence when both flags are given.
    if cli.train:
        action = train
    elif cli.test:
        action = test
    else:
        action = None

    if action is None:
        print("pass --train or --test")
    else:
        action()