# Microexpert_NG / tokenizer.py — text-generation project
# Trains a Qwen-style byte-level BPE tokenizer on streamed Gutenberg text.
# (Provenance: uploaded by gustavlangstroem, "Upload 4 files", commit 9987dd2 verified)
from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors, Regex
# --- CONFIGURATION ---
DATASET_NAME = "sedthh/gutenberg_english"  # HF dataset of English Project Gutenberg books
VOCAB_SIZE = 32000   # target BPE vocabulary size (includes special tokens)
SAMPLE_SIZE = 3000   # number of documents drawn from the stream for training
BATCH_SIZE = 100     # documents per batch fed to the trainer
# 1. Connect
# streaming=True: records are fetched lazily, so the full corpus is never downloaded
print(f"1. Connecting to {DATASET_NAME}...")
dataset = load_dataset(DATASET_NAME, split="train", streaming=True)
# 2. The Generator
def batch_iterator(source=None, sample_size=None, batch_size=None):
    """Yield lists of raw document texts for tokenizer training.

    Backward compatible with the original zero-argument form: any argument
    left as None falls back to the module-level globals.

    Args:
        source: iterable of records exposing a 'TEXT' field; defaults to the
            module-level streaming ``dataset``.
        sample_size: maximum number of records to consume; defaults to
            ``SAMPLE_SIZE``.
        batch_size: number of texts per yielded batch; defaults to
            ``BATCH_SIZE``.

    Yields:
        list[str]: batches of document texts; the final batch may be shorter
        than ``batch_size``.
    """
    if source is None:
        source = dataset
    if sample_size is None:
        sample_size = SAMPLE_SIZE
    if batch_size is None:
        batch_size = BATCH_SIZE
    batch = []
    print("2. Collecting data...")
    for i, item in enumerate(source):
        if i >= sample_size:
            break
        batch.append(item['TEXT'])
        if len(batch) == batch_size:
            # \r keeps the progress line in place on the terminal
            print(f" > Processing batch {(i+1)//batch_size}...", end='\r')
            yield batch
            batch = []
    # flush the final partial batch so no sampled text is dropped
    if batch:
        yield batch
# 3. TOKENIZER
print("\n3. Initializing Tokenizer...")
# Byte-level BPE model, as used by the GPT-2/Qwen family of tokenizers.
tokenizer = Tokenizer(models.BPE())
# Qwen/GPT-style pre-tokenization pattern: English contractions, letter runs
# (optionally preceded by one non-letter/digit byte), single digits,
# punctuation runs, and dedicated newline/whitespace handling.
qwen_pattern = Regex(r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""")
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
# first split on the custom regex, keeping each match as its own piece
pre_tokenizers.Split(pattern=qwen_pattern, behavior="isolated"),
# then map pieces into the byte-level alphabet; use_regex=False so only
# the Split rule above decides the boundaries
pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)
])
# decoder reverses the byte-level mapping back to readable text
tokenizer.decoder = decoders.ByteLevel()
# BPE trainer configuration
trainer = trainers.BpeTrainer(
vocab_size=VOCAB_SIZE,
# reserve the first token ids for end-of-text and padding markers
special_tokens=["<|endoftext|>", "<|padding|>"],
show_progress=True,
# seed the vocabulary with all 256 byte symbols so any input stays representable
initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
)
# 4. Train
print("4. Training Qwen-style tokenizer...")
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
# 5. Post-process and save
# trim_offsets=False keeps character offsets aligned with the raw byte-level pieces
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.save("qwen_style_tokenizer.json")
# plain string: the original used an f-string with no placeholders
print("\nSUCCESS! Saved 'qwen_style_tokenizer.json'")