| from pathlib import Path |
| from datasets import Dataset |
| from tokenizers import ( |
| Tokenizer, |
| models, |
| normalizers, |
| pre_tokenizers, |
| decoders, |
| trainers, |
| ) |
| from tqdm.auto import tqdm |
| import wandb |
| from utils import get_raw_data |
|
|
|
|
# --- Paths -----------------------------------------------------------------
# Both paths are relative to the current working directory. They are built
# from individual components (instead of a backslash-separated raw string) so
# that they resolve to the same location on Windows and on POSIX systems.
DATA_PATH = Path("..") / "data" / "IWSLT-15-en-vi"

TOKENIZER_NAME = "iwslt_en-vi_tokenizer_32k.json"
TOKENIZER_SAVE_PATH = Path("..") / "artifacts" / "tokenizers" / TOKENIZER_NAME

# --- Tokenizer hyper-parameters ---------------------------------------------
VOCAB_SIZE: int = 32_000
SPECIAL_TOKENS: list[str] = ["[PAD]", "[UNK]", "[SOS]", "[EOS]"]

# --- Corpus streaming ---------------------------------------------------------
# Number of translation pairs pulled from the dataset per iterator step.
BATCH_SIZE_FOR_TOKENIZER: int = 10000
# NOTE(review): not referenced anywhere in the visible code; kept because other
# modules may import it.
NUM_WORKERS: int = 8
|
|
|
|
def get_training_corpus(dataset: Dataset, batch_size: int = 1000):
    """Stream the corpus as alternating lists of English and Vietnamese text.

    The dataset is walked with ``dataset.iter(batch_size=batch_size)``. Each
    batch is indexed with ``"translation"``, which is expected to hold a
    sequence of mappings with an ``"en"`` and a ``"vi"`` entry.

    Args:
        dataset: Object exposing ``iter(batch_size=...)``.
        batch_size: Number of rows requested per batch.

    Yields:
        Two lists per batch, in this order: all ``"en"`` strings of the
        batch, then all ``"vi"`` strings of the same batch.
    """
    for chunk in dataset.iter(batch_size=batch_size):
        pairs = chunk["translation"]

        # Build both sides before yielding anything so a malformed pair
        # fails the whole batch rather than after half of it was emitted.
        english_side: list[str] = [pair["en"] for pair in pairs]
        vietnamese_side: list[str] = [pair["vi"] for pair in pairs]

        yield from (english_side, vietnamese_side)
|
|
|
|
def instantiate_tokenizer() -> Tokenizer:
    """Build an untrained BPE tokenizer with its full processing pipeline.

    Pipeline configured here:
        * model: ``models.BPE`` using ``"[UNK]"`` as the unknown token
        * normalizer: ``NFKC`` followed by ``Lowercase``
        * pre-tokenizer: ``Whitespace``
        * decoder: ``BPEDecoder``

    Returns:
        The configured tokenizer; its vocabulary still has to be trained.
    """
    bpe_model = models.BPE(unk_token="[UNK]")
    text_normalizer = normalizers.Sequence(
        [normalizers.NFKC(), normalizers.Lowercase()]
    )

    tok = Tokenizer(bpe_model)
    tok.normalizer = text_normalizer
    tok.pre_tokenizer = pre_tokenizers.Whitespace()
    # NOTE(review): BPEDecoder restores spaces from an end-of-word suffix
    # ("</w>" by default) - confirm the trainer emits a matching suffix.
    tok.decoder = decoders.BPEDecoder()

    print("Tokenizer (empty) initialized.")
    return tok
|
|
|
|
def train_tokenizer():
    """Train the BPE tokenizer on the IWSLT en-vi corpus and save it as JSON.

    Steps:
        1. Load the raw training pairs with ``get_raw_data`` (converted to a
           ``Dataset`` when a plain list is returned).
        2. Stream both language sides through ``get_training_corpus``.
        3. Train the tokenizer from ``instantiate_tokenizer`` and write it to
           ``TOKENIZER_SAVE_PATH``.

    Raises:
        KeyboardInterrupt: Re-raised when training is interrupted, so that an
            untrained / partially trained tokenizer is never saved (or
            uploaded by the caller) as if it were a finished one.
    """
    # instantiate_tokenizer() installs decoders.BPEDecoder, which turns the
    # "</w>" end-of-word suffix back into spaces. Without the suffix in the
    # learned vocabulary, decoding glues every token together. The trainer's
    # value is the one that ends up on the trained model, so it is set here.
    trainer = trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=SPECIAL_TOKENS,
        end_of_word_suffix="</w>",
    )
    print("Tokenizer Trainer initialized.")

    train_dataset = get_raw_data(DATA_PATH, for_tokenizer=True)
    if not isinstance(train_dataset, Dataset):
        train_dataset = Dataset.from_list(train_dataset)
    num_pairs = len(train_dataset)
    print(f"Starting tokenizer training on {num_pairs} pairs...")

    # Fail fast: make sure the output directory exists before spending time
    # on training, instead of crashing in tokenizer.save() afterwards.
    TOKENIZER_SAVE_PATH.parent.mkdir(parents=True, exist_ok=True)

    text_iterator = get_training_corpus(
        train_dataset,
        batch_size=BATCH_SIZE_FOR_TOKENIZER,
    )

    # Ceiling division: the final, partially filled batch is yielded too.
    # Every batch produces two iterator items (the "en" list and the "vi" list).
    num_batches = -(-num_pairs // BATCH_SIZE_FOR_TOKENIZER)
    total_steps = max(1, num_batches * 2)
    # `length` is the number of individual sequences, not the number of batches.
    total_sequences = max(1, num_pairs * 2)

    tokenizer: Tokenizer = instantiate_tokenizer()

    try:
        tokenizer.train_from_iterator(
            tqdm(
                text_iterator,
                total=total_steps,
                desc="Training Tokenizer (IWSLT-Local)",
            ),
            trainer=trainer,
            length=total_sequences,
        )
    except KeyboardInterrupt:
        print("\nTokenizer training interrupted by user.")
        # Do not fall through to the save step with an unfinished tokenizer.
        raise

    print("Tokenizer training complete.")

    tokenizer.save(str(TOKENIZER_SAVE_PATH))

    print(f"Tokenizer saved to: {TOKENIZER_SAVE_PATH}")
    print(f"Total vocabulary size: {tokenizer.get_vocab_size()}")
|
|
|
|
if __name__ == "__main__":
    # 1) Train the tokenizer and write it to TOKENIZER_SAVE_PATH.
    train_tokenizer()

    # 2) Upload the saved file to Weights & Biases as a versioned artifact.
    #    Using the run as a context manager guarantees it is finished even if
    #    building or logging the artifact raises.
    with wandb.init(
        entity="alaindelong-hcmut",
        project="Attention Is All You Build",
        job_type="tokenizer-train",
    ) as run:
        tokenizer_artifact = wandb.Artifact(
            name="iwslt_en-vi_tokenizer",
            type="tokenizer",
            description="BPE Tokenizer trained on IWSLT 15 (133k+ pairs en-vi)",
            metadata={
                # Taken from the training constant so the two cannot drift apart.
                "vocab_size": VOCAB_SIZE,
                "algorithm": "BPE",
                "framework": "huggingface",
                "training_data": "iwslt-15-en-vi-133k",
                # The normalizer built in instantiate_tokenizer() includes
                # normalizers.Lowercase(), so the tokenizer is lower-cased.
                "lower_case": True,
            },
        )
        tokenizer_artifact.add_file(local_path=str(TOKENIZER_SAVE_PATH))
        run.log_artifact(tokenizer_artifact, aliases=["baseline"])
|
|