Spaces:

Kalana001
/

SinCode

Running

SinCode / seq2seq /prepare_experiment_model.py

KalanaPabasara

SinCode v3 — seq2seq pipeline, evaluation scripts, IndoNLP benchmark data

1fed70a 19 days ago

3.77 kB

	"""
	Prepare local clean model snapshots and experiment copies.

	Workflow:
	1) Download/save a clean Hugging Face model to a stable local path once.
	2) Create a copy of that clean local model for each experiment run.

	This prevents accidental overwrites of your base model and keeps
	fine-tuning runs isolated.

	Examples:
	python seq2seq/prepare_experiment_model.py --model-id Kalana001/mbart50-large-singlish-sinhala
	python seq2seq/prepare_experiment_model.py --model-id Kalana001/mbart50-large-singlish-sinhala --run-name exp-lr5e5
	"""

	from __future__ import annotations

	import argparse
	import shutil
	import sys
	from datetime import datetime
	from pathlib import Path

	import torch
	from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

	# Repository root: two directories above this file (<root>/seq2seq/<this file>).
	# resolve() makes the path absolute first; with a relative __file__ (e.g. the
	# script started from inside seq2seq/ on an interpreter that keeps __file__
	# relative) ".parent.parent" would otherwise collapse to "." and point at the
	# wrong directory.
	ROOT = Path(__file__).resolve().parent.parent
	if str(ROOT) not in sys.path:
	    # Make the repository root importable so the `core` package import below
	    # works when this file is executed directly as a script.
	    sys.path.insert(0, str(ROOT))

	from core.constants import DEFAULT_MBART_MODEL

	# Directory holding the untouched local snapshot of each downloaded model.
	CLEAN_ROOT = ROOT.joinpath("seq2seq", "clean_models")
	# Directory holding the per-run experiment copies made from a clean snapshot.
	EXPERIMENT_ROOT = ROOT.joinpath("seq2seq", "experiments")


	def parse_args() -> argparse.Namespace:
	    """Build the command-line parser and parse the process arguments.

	    Returns:
	        Namespace with the attributes ``model_id``, ``clean_dir``,
	        ``run_name``, ``force_refresh_clean`` and ``allow_cpu``.
	    """
	    # One (flag, add_argument keyword arguments) entry per option, registered
	    # in this order so the generated --help output keeps the same layout.
	    option_table = (
	        (
	            "--model-id",
	            {
	                "default": DEFAULT_MBART_MODEL,
	                "help": "Hugging Face model ID to prepare.",
	            },
	        ),
	        (
	            "--clean-dir",
	            {
	                "type": Path,
	                "default": None,
	                "help": "Optional custom clean-model directory.",
	            },
	        ),
	        (
	            "--run-name",
	            {
	                "default": None,
	                "help": "Optional experiment run folder name. Defaults to timestamp.",
	            },
	        ),
	        (
	            "--force-refresh-clean",
	            {
	                "action": "store_true",
	                "help": "Re-download and overwrite the local clean model snapshot.",
	            },
	        ),
	        (
	            "--allow-cpu",
	            {
	                "action": "store_true",
	                "help": "Allow running without CUDA. Default is GPU-only to avoid workstation slowdown.",
	            },
	        ),
	    )

	    cli = argparse.ArgumentParser(
	        description="Download a clean model once and create an isolated experiment copy (GPU required)."
	    )
	    for flag, settings in option_table:
	        cli.add_argument(flag, **settings)
	    return cli.parse_args()


	def safe_name(model_id: str) -> str:
	    """Return *model_id* with every "/" replaced by "--".

	    The result is used as a single directory name for the model.
	    """
	    segments = model_id.split("/")
	    return "--".join(segments)


	def main() -> None:
	    """Prepare the clean model snapshot and create one isolated experiment copy.

	    Steps:
	    1) Refuse to run without CUDA unless ``--allow-cpu`` was passed.
	    2) Make sure a clean snapshot of ``--model-id`` exists in the clean
	       directory, downloading it when it is missing or when
	       ``--force-refresh-clean`` was passed.
	    3) Copy the clean snapshot to ``EXPERIMENT_ROOT/<slug>/<run-name>/model``.

	    The snapshot is written to a sibling ``<name>.partial`` staging directory
	    and only renamed into place once both the tokenizer and the model were
	    saved, so an interrupted download or save never leaves a half-written
	    clean model behind and never destroys a previously working one.

	    Raises:
	        RuntimeError: No CUDA device is available and ``--allow-cpu`` is unset.
	        FileExistsError: The experiment model directory already exists.
	    """
	    args = parse_args()

	    if not torch.cuda.is_available() and not args.allow_cpu:
	        raise RuntimeError(
	            "CUDA GPU is required by default. "
	            "No GPU detected. Use --allow-cpu only if you intentionally want CPU mode."
	        )

	    model_slug = safe_name(args.model_id)
	    clean_dir = args.clean_dir or (CLEAN_ROOT / model_slug)

	    if args.force_refresh_clean or not clean_dir.exists():
	        print(f"Downloading clean model: {args.model_id}")
	        # Download before touching anything on disk: if this fails, an
	        # existing snapshot (in the --force-refresh-clean case) stays usable.
	        tokenizer = AutoTokenizer.from_pretrained(args.model_id)
	        model = AutoModelForSeq2SeqLM.from_pretrained(args.model_id)

	        staging_dir = clean_dir.parent / f"{clean_dir.name}.partial"
	        if staging_dir.exists():
	            # Leftover from an earlier interrupted run.
	            shutil.rmtree(staging_dir)
	        staging_dir.mkdir(parents=True)
	        try:
	            tokenizer.save_pretrained(staging_dir)
	            model.save_pretrained(staging_dir)
	        except BaseException:
	            # Also covers Ctrl-C during a long save; clean up and re-raise.
	            shutil.rmtree(staging_dir, ignore_errors=True)
	            raise

	        if clean_dir.exists():
	            print(f"Removing existing clean model at: {clean_dir}")
	            shutil.rmtree(clean_dir)
	        # Same parent directory, so this is a plain rename, not a copy.
	        staging_dir.rename(clean_dir)
	        print(f"Saved clean model to: {clean_dir}")
	    else:
	        print(f"Using existing clean model: {clean_dir}")

	    run_name = args.run_name or datetime.now().strftime("run-%Y%m%d-%H%M%S")
	    exp_dir = EXPERIMENT_ROOT / model_slug / run_name
	    exp_model_dir = exp_dir / "model"

	    if exp_model_dir.exists():
	        raise FileExistsError(
	            f"Experiment model directory already exists: {exp_model_dir}. "
	            "Use a different --run-name."
	        )

	    exp_dir.mkdir(parents=True, exist_ok=True)
	    try:
	        shutil.copytree(clean_dir, exp_model_dir)
	    except FileExistsError:
	        # The destination appeared after the check above; it is not ours to
	        # delete, so propagate without cleanup.
	        raise
	    except BaseException:
	        # Remove the partial copy so the same --run-name can be retried.
	        shutil.rmtree(exp_model_dir, ignore_errors=True)
	        raise

	    print("\nExperiment ready")
	    print(f" clean_model : {clean_dir}")
	    print(f" experiment : {exp_dir}")
	    print(f" model_copy : {exp_model_dir}")


	# Run the preparation workflow only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	main()