""" Prepare local clean model snapshots and experiment copies. Workflow: 1) Download/save a clean Hugging Face model to a stable local path once. 2) Create a copy of that clean local model for each experiment run. This prevents accidental overwrites of your base model and keeps fine-tuning runs isolated. Examples: python seq2seq/prepare_experiment_model.py --model-id Kalana001/mbart50-large-singlish-sinhala python seq2seq/prepare_experiment_model.py --model-id Kalana001/mbart50-large-singlish-sinhala --run-name exp-lr5e5 """ from __future__ import annotations import argparse import shutil import sys from datetime import datetime from pathlib import Path import torch from transformers import AutoModelForSeq2SeqLM, AutoTokenizer ROOT = Path(__file__).parent.parent if str(ROOT) not in sys.path: sys.path.insert(0, str(ROOT)) from core.constants import DEFAULT_MBART_MODEL CLEAN_ROOT = ROOT / "seq2seq" / "clean_models" EXPERIMENT_ROOT = ROOT / "seq2seq" / "experiments" def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Download a clean model once and create an isolated experiment copy (GPU required)." ) parser.add_argument( "--model-id", default=DEFAULT_MBART_MODEL, help="Hugging Face model ID to prepare.", ) parser.add_argument( "--clean-dir", type=Path, default=None, help="Optional custom clean-model directory.", ) parser.add_argument( "--run-name", default=None, help="Optional experiment run folder name. Defaults to timestamp.", ) parser.add_argument( "--force-refresh-clean", action="store_true", help="Re-download and overwrite the local clean model snapshot.", ) parser.add_argument( "--allow-cpu", action="store_true", help="Allow running without CUDA. Default is GPU-only to avoid workstation slowdown.", ) return parser.parse_args() def safe_name(model_id: str) -> str: return model_id.replace("/", "--") def main() -> None: args = parse_args() if not torch.cuda.is_available() and not args.allow_cpu: raise RuntimeError( "CUDA GPU is required by default. " "No GPU detected. Use --allow-cpu only if you intentionally want CPU mode." ) model_slug = safe_name(args.model_id) clean_dir = args.clean_dir or (CLEAN_ROOT / model_slug) if clean_dir.exists() and args.force_refresh_clean: print(f"Removing existing clean model at: {clean_dir}") shutil.rmtree(clean_dir) if not clean_dir.exists(): print(f"Downloading clean model: {args.model_id}") tokenizer = AutoTokenizer.from_pretrained(args.model_id) model = AutoModelForSeq2SeqLM.from_pretrained(args.model_id) clean_dir.mkdir(parents=True, exist_ok=True) tokenizer.save_pretrained(clean_dir) model.save_pretrained(clean_dir) print(f"Saved clean model to: {clean_dir}") else: print(f"Using existing clean model: {clean_dir}") run_name = args.run_name or datetime.now().strftime("run-%Y%m%d-%H%M%S") exp_dir = EXPERIMENT_ROOT / model_slug / run_name exp_model_dir = exp_dir / "model" if exp_model_dir.exists(): raise FileExistsError( f"Experiment model directory already exists: {exp_model_dir}. " "Use a different --run-name." ) exp_dir.mkdir(parents=True, exist_ok=True) shutil.copytree(clean_dir, exp_model_dir) print("\nExperiment ready") print(f" clean_model : {clean_dir}") print(f" experiment : {exp_dir}") print(f" model_copy : {exp_model_dir}") if __name__ == "__main__": main()