SinCode / seq2seq /prepare_experiment_model.py
KalanaPabasara
SinCode v3 — seq2seq pipeline, evaluation scripts, IndoNLP benchmark data
1fed70a
"""
Prepare local clean model snapshots and experiment copies.
Workflow:
1) Download/save a clean Hugging Face model to a stable local path once.
2) Create a copy of that clean local model for each experiment run.
This prevents accidental overwrites of your base model and keeps
fine-tuning runs isolated.
Examples:
python seq2seq/prepare_experiment_model.py --model-id Kalana001/mbart50-large-singlish-sinhala
python seq2seq/prepare_experiment_model.py --model-id Kalana001/mbart50-large-singlish-sinhala --run-name exp-lr5e5
"""
from __future__ import annotations
import argparse
import shutil
import sys
from datetime import datetime
from pathlib import Path
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Repository root: the parent of the directory that contains this file.
# Resolved to an absolute path so that the directories derived from it do not
# silently depend on the current working directory when ``__file__`` happens
# to be a relative path.
ROOT = Path(__file__).resolve().parent.parent
# Put the repository root on the import path so the ``core`` import below
# resolves when this file is executed directly as a script.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
from core.constants import DEFAULT_MBART_MODEL

# Default parent directory for clean model snapshots (one sub-folder per model).
CLEAN_ROOT = ROOT / "seq2seq" / "clean_models"
# Parent directory for per-run experiment folders.
EXPERIMENT_ROOT = ROOT / "seq2seq" / "experiments"
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for this script.

    Returns:
        The parsed namespace with the attributes ``model_id``, ``clean_dir``,
        ``run_name``, ``force_refresh_clean`` and ``allow_cpu``.
    """
    cli = argparse.ArgumentParser(
        description="Download a clean model once and create an isolated experiment copy (GPU required)."
    )

    # (flag, add_argument keyword arguments), registered in display order.
    option_specs = (
        (
            "--model-id",
            {
                "default": DEFAULT_MBART_MODEL,
                "help": "Hugging Face model ID to prepare.",
            },
        ),
        (
            "--clean-dir",
            {
                "type": Path,
                "default": None,
                "help": "Optional custom clean-model directory.",
            },
        ),
        (
            "--run-name",
            {
                "default": None,
                "help": "Optional experiment run folder name. Defaults to timestamp.",
            },
        ),
        (
            "--force-refresh-clean",
            {
                "action": "store_true",
                "help": "Re-download and overwrite the local clean model snapshot.",
            },
        ),
        (
            "--allow-cpu",
            {
                "action": "store_true",
                "help": "Allow running without CUDA. Default is GPU-only to avoid workstation slowdown.",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    return cli.parse_args()
def safe_name(model_id: str) -> str:
    """Turn a model ID into a single folder name.

    Every ``/`` in *model_id* is replaced with ``--`` so that the result
    contains no forward slashes.
    """
    segments = model_id.split("/")
    return "--".join(segments)
def _download_clean_snapshot(model_id: str, clean_dir: Path) -> None:
    """Download *model_id* and publish it at *clean_dir* via a final rename.

    The tokenizer and model are first written to a sibling ``<name>.partial``
    staging directory. Only after both saves have finished is any previous
    snapshot removed and the staging directory renamed into place. A failed or
    interrupted download therefore neither leaves a half-written ``clean_dir``
    nor removes an existing snapshot before its replacement is complete.
    """
    staging_dir = clean_dir.parent / f"{clean_dir.name}.partial"
    # Leftovers from an earlier interrupted run cannot be trusted.
    if staging_dir.exists():
        shutil.rmtree(staging_dir)

    print(f"Downloading clean model: {model_id}")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

    staging_dir.mkdir(parents=True)
    try:
        tokenizer.save_pretrained(staging_dir)
        model.save_pretrained(staging_dir)
    except BaseException:
        # Includes KeyboardInterrupt: never keep a partially written snapshot.
        shutil.rmtree(staging_dir, ignore_errors=True)
        raise

    if clean_dir.exists():
        print(f"Removing existing clean model at: {clean_dir}")
        shutil.rmtree(clean_dir)
    # Staging is a sibling of the target, so this is a same-directory rename.
    staging_dir.rename(clean_dir)
    print(f"Saved clean model to: {clean_dir}")


def main() -> None:
    """Ensure a clean local model snapshot exists and copy it for one run.

    Steps:
        1. Refuse to run without CUDA unless ``--allow-cpu`` was given.
        2. Download the model into the clean directory when it is missing or
           when ``--force-refresh-clean`` was given; otherwise reuse it.
        3. Copy the clean directory to
           ``EXPERIMENT_ROOT/<model slug>/<run name>/model``.

    Raises:
        RuntimeError: No CUDA device is available and ``--allow-cpu`` is unset.
        FileExistsError: The experiment model directory already exists.
    """
    args = parse_args()
    if not torch.cuda.is_available() and not args.allow_cpu:
        raise RuntimeError(
            "CUDA GPU is required by default. "
            "No GPU detected. Use --allow-cpu only if you intentionally want CPU mode."
        )

    model_slug = safe_name(args.model_id)
    clean_dir = args.clean_dir or (CLEAN_ROOT / model_slug)

    if args.force_refresh_clean or not clean_dir.exists():
        _download_clean_snapshot(args.model_id, clean_dir)
    else:
        print(f"Using existing clean model: {clean_dir}")

    run_name = args.run_name or datetime.now().strftime("run-%Y%m%d-%H%M%S")
    exp_dir = EXPERIMENT_ROOT / model_slug / run_name
    exp_model_dir = exp_dir / "model"
    if exp_model_dir.exists():
        raise FileExistsError(
            f"Experiment model directory already exists: {exp_model_dir}. "
            "Use a different --run-name."
        )

    exp_dir.mkdir(parents=True, exist_ok=True)
    try:
        shutil.copytree(clean_dir, exp_model_dir)
    except BaseException:
        # exp_model_dir did not exist before this call, so anything there now
        # is a partial copy; remove it so the same --run-name can be retried.
        shutil.rmtree(exp_model_dir, ignore_errors=True)
        raise

    print("\nExperiment ready")
    print(f" clean_model : {clean_dir}")
    print(f" experiment : {exp_dir}")
    print(f" model_copy : {exp_model_dir}")


if __name__ == "__main__":
    main()