from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors, Regex

# --- CONFIGURATION ---
DATASET_NAME = "sedthh/gutenberg_english"
VOCAB_SIZE = 32000
SAMPLE_SIZE = 3000   # number of documents to draw from the stream
BATCH_SIZE = 100     # documents per batch fed to the trainer

# 1. Connect
# Stream the dataset so the full corpus never has to be downloaded or held in memory.
print(f"1. Connecting to {DATASET_NAME}...")
dataset = load_dataset(DATASET_NAME, split="train", streaming=True)

# 2. The Generator
# Yields lists of raw document strings; the trainer consumes them batch by batch.
def batch_iterator():
    batch = []
    print("2. Collecting data...")
    for i, item in enumerate(dataset):
        if i >= SAMPLE_SIZE:
            break
        batch.append(item['TEXT'])
        if len(batch) == BATCH_SIZE:
            print(f" > Processing batch {(i + 1) // BATCH_SIZE}...", end='\r')
            yield batch
            batch = []
    if batch:
        yield batch

# 3. TOKENIZER
print("\n3. Initializing Tokenizer...")
tokenizer = Tokenizer(models.BPE())

# Qwen-style pre-tokenization regex: splits off English contractions, letter runs,
# single digits, punctuation, newlines, and trailing whitespace before BPE merges are learned.
qwen_pattern = Regex(
    r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
)

# Apply the custom split first, then ByteLevel purely for its byte-to-unicode mapping
# (use_regex=False disables ByteLevel's built-in GPT-2 split so it doesn't override ours).
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Split(pattern=qwen_pattern, behavior="isolated"),
    pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)
])
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.BpeTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=["<|endoftext|>", "<|padding|>"],
    show_progress=True,
    # Seed the vocabulary with all 256 byte symbols so any input remains encodable.
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
)

# 4. Train
print("4. Training Qwen-style tokenizer...")
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)

# 5. Save
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.save("qwen_style_tokenizer.json")
print("\nSUCCESS! Saved 'qwen_style_tokenizer.json'")
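
# (Optional) Sanity check: a minimal sketch added here, not part of the pipeline above.
# It reloads the saved file with Tokenizer.from_file and round-trips an arbitrary sample
# sentence; the byte-level decoder should reproduce the input exactly.
reloaded = Tokenizer.from_file("qwen_style_tokenizer.json")
encoding = reloaded.encode("The quick brown fox jumps over the lazy dog.")
print(encoding.tokens)                # learned byte-level BPE tokens
print(reloaded.decode(encoding.ids))  # should print the original sentence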