from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors, Regex
|
|
# --- Configuration ---
DATASET_NAME = "sedthh/gutenberg_english"
VOCAB_SIZE = 32000    # target vocabulary size
SAMPLE_SIZE = 3000    # number of documents to stream for training
BATCH_SIZE = 100      # documents per batch fed to the trainer
|
|
# --- Stream the dataset (avoids downloading the full corpus up front) ---
print(f"1. Connecting to {DATASET_NAME}...")
dataset = load_dataset(DATASET_NAME, split="train", streaming=True)
|
|
# --- Batch generator: yields lists of raw texts to the trainer ---
def batch_iterator():
    batch = []
    print("2. Collecting data...")
    for i, item in enumerate(dataset):
        if i >= SAMPLE_SIZE:
            break
        batch.append(item['TEXT'])  # this dataset stores its text under the uppercase 'TEXT' column
        if len(batch) == BATCH_SIZE:
            print(f" > Processing batch {(i+1)//BATCH_SIZE}...", end='\r')
            yield batch
            batch = []
    if batch:  # flush the final partial batch
        yield batch
|
|
# --- Initialize an empty byte-level BPE model ---
print("\n3. Initializing Tokenizer...")
tokenizer = Tokenizer(models.BPE())
|
|
|
|
# Qwen-style split regex (same family as the GPT-4/cl100k pattern): contractions,
# runs of letters, single digits, punctuation blocks, and careful whitespace/newline
# handling. Note that \p{N} splits numbers digit by digit.
qwen_pattern = Regex(r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""")
|
|
# Split with the regex first, then remap each piece to byte-level symbols.
# use_regex=False keeps ByteLevel from re-splitting with its own GPT-2 pattern.
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Split(pattern=qwen_pattern, behavior="isolated"),
    pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)
])
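
# Optional illustration (the sample string here is made up): pre-tokenization can
# be inspected before any training, since the split does not depend on the learned
# vocabulary. Digits should come out one piece at a time under \p{N}.
print(tokenizer.pre_tokenizer.pre_tokenize_str("Hello world, it's 2024!"))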
|
|
# ByteLevel decoder reverses the byte-to-unicode mapping on the way out.
tokenizer.decoder = decoders.ByteLevel()
|
|
trainer = trainers.BpeTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=["<|endoftext|>", "<|padding|>"],
    show_progress=True,
    # Seed the vocabulary with all 256 byte symbols so any input stays encodable.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
)
|
|
print("4. Training Qwen-style tokenizer...")
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
|
|
# Keep full offsets in post-processing, then save everything to a single JSON file.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.save("qwen_style_tokenizer.json")
print("\nSUCCESS! Saved 'qwen_style_tokenizer.json'")