"""
Fine-tune google/byt5-small on Singlish → Sinhala word-level transliteration.
Input: wsd_pairs.csv (romanized, sinhala)
Output: byt5-singlish-sinhala/ (HuggingFace model directory)
Training approach:
- Input : romanized word (e.g. "wadi")
- Target : sinhala word (e.g. "වැඩි")
- Model : ByT5-small (byte-level T5, no vocab issues with any script)
- Beam=5 at inference → top-5 candidates for MLM reranking
Tokenized dataset is saved to disk after the first run → restarts skip
straight to training without re-tokenizing.
"""
from pathlib import Path
import torch
from datasets import Dataset, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
)
# ── Config ──────────────────────────────────────────────────────────────────
BASE_MODEL = "google/byt5-small"
DATA_PATH = Path(__file__).parent / "wsd_pairs.csv"
CACHE_DIR = Path(__file__).parent / "tokenized_cache"
OUTPUT_DIR = Path(__file__).parent / "byt5-singlish-sinhala"
MAX_SAMPLES = 1_000_000  # 1M pairs is more than enough for word transliteration
TRAIN_SPLIT = 0.97
MAX_INPUT_LEN = 64
MAX_TARGET_LEN = 64
BATCH_SIZE = 64  # 16 GB VRAM → ByT5-small with seq_len=64
EPOCHS = 2
LR = 5e-4
SEED = 42
# ── Tokenize ────────────────────────────────────────────────────────────────
def tokenize_fn(batch, tokenizer):
    # Pad to fixed max_length so all tensors have the same shape.
    # This lets set_format("torch") work and default_data_collator just stacks.
    model_inputs = tokenizer(
        batch["romanized"],
        max_length=MAX_INPUT_LEN,
        truncation=True,
        padding="max_length",
    )
    labels = tokenizer(
        batch["sinhala"],
        max_length=MAX_TARGET_LEN,
        truncation=True,
        padding="max_length",
    )
    # Replace pad token with -100 so it's ignored in cross-entropy loss
    model_inputs["labels"] = [
        [(t if t != tokenizer.pad_token_id else -100) for t in ids]
        for ids in labels["input_ids"]
    ]
    return model_inputs
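# Rough illustration of why ByT5 sidesteps vocabulary issues: its tokenizer maps
# raw UTF-8 bytes straight to ids (offset by the three special tokens), so any
# script, including Sinhala, is representable. The ids below are indicative only:
#
#   tokenizer("wadi").input_ids   # roughly [122, 100, 103, 108, 1] (4 bytes + </s>)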
# ── Main ────────────────────────────────────────────────────────────────────
def main():
    import os
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f" Device : {device}")
    if device != "cuda":
        raise RuntimeError(
            "CUDA GPU is required for training. "
            "No GPU was detected, so training was stopped to avoid CPU slowdown."
        )
    print(f" GPU : {torch.cuda.get_device_name(0)}")
    print(f" VRAM : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)
    train_cache = CACHE_DIR / "train"
    eval_cache = CACHE_DIR / "eval"
    if train_cache.exists() and eval_cache.exists():
        print("Loading pre-tokenized dataset from disk cache …")
        train_ds = load_from_disk(str(train_cache))
        eval_ds = load_from_disk(str(eval_cache))
        print(f" train={len(train_ds):,} eval={len(eval_ds):,}")
    else:
        print(f"Loading data from {DATA_PATH} …")
        ds = Dataset.from_csv(str(DATA_PATH))
        ds = ds.filter(lambda x: bool(x["romanized"]) and bool(x["sinhala"]))
        print(f" {len(ds):,} pairs → sampling {MAX_SAMPLES:,} …")
        # Shuffle and take MAX_SAMPLES
        ds = ds.shuffle(seed=SEED).select(range(min(MAX_SAMPLES, len(ds))))
        split = ds.train_test_split(test_size=1 - TRAIN_SPLIT, seed=SEED)
        train_raw = split["train"]
        eval_raw = split["test"]
        print(f" train={len(train_raw):,} eval={len(eval_raw):,}")
        print("Tokenizing and saving to disk (one-time, ~5 min) …")
        train_ds = train_raw.map(
            lambda b: tokenize_fn(b, tokenizer),
            batched=True,
            batch_size=10_000,
            num_proc=8,
            keep_in_memory=True,
            remove_columns=["romanized", "sinhala"],
            desc="Tokenizing train",
        )
        eval_ds = eval_raw.map(
            lambda b: tokenize_fn(b, tokenizer),
            batched=True,
            batch_size=10_000,
            num_proc=8,
            keep_in_memory=True,
            remove_columns=["romanized", "sinhala"],
            desc="Tokenizing eval",
        )
        CACHE_DIR.mkdir(parents=True, exist_ok=True)
        train_ds.save_to_disk(str(train_cache))
        eval_ds.save_to_disk(str(eval_cache))
        print(" Saved to disk. Future runs will load instantly.")
    train_ds.set_format("torch")
    eval_ds.set_format("torch")
    # All sequences are pre-padded to fixed length, so just stack them
    collator = default_data_collator
    warmup_steps = int(0.05 * (len(train_ds) // BATCH_SIZE))
    args = Seq2SeqTrainingArguments(
        output_dir=str(OUTPUT_DIR),
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        learning_rate=LR,
        warmup_steps=warmup_steps,
        weight_decay=0.01,
        predict_with_generate=True,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        logging_steps=200,
        dataloader_num_workers=0,  # 0 = main process only (most stable on Windows)
        dataloader_pin_memory=True,
        bf16=torch.cuda.is_bf16_supported(),
        fp16=not torch.cuda.is_bf16_supported() and torch.cuda.is_available(),
        seed=SEED,
        report_to="none",
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        processing_class=tokenizer,
        data_collator=collator,
    )
    print("Starting training …")
    trainer.train()
    print(f"Saving model to {OUTPUT_DIR}/final …")
    model.save_pretrained(OUTPUT_DIR / "final")
    tokenizer.save_pretrained(OUTPUT_DIR / "final")
    print("Done.")
if __name__ == "__main__":
    main()