Buckets:

linvest21
/

shft-artifacts

Files

xet

linvest21/shft-artifacts/code/self_healing_finetuning/training/launch.py

linvest21

about 8 hours ago

download

raw

5.07 kB

	from __future__ import annotations

	from pathlib import Path

	from eval.certification import certify
	from eval.evidence import write_iteration_evidence
	from n21.config import load_structured, write_json
	from n21.settings import CONFIG_ROOT
	from observability.audit_log import AuditLogger, utc_now
	from observability.heartbeat import heartbeat
	from observability.progress import ProgressState
	from orchestrator.provider_routing import get_provider, validate_route
	from model_policy.roadmap import candidate_evidence
	from training.start_policy import resolve_training_start


	def run_training(
	    run_dir: Path,
	    *,
	    run_id: str,
	    model_candidate: str,
	    train_provider: str,
	    infer_provider: str,
	    release_id: str | None = None,
	    finetune_start_policy: str = "bootstrap",
	    live: bool = False,
	) -> dict[str, object]:
	    """Plan (or submit) a training run and record its first iteration evidence.

	    The function validates the provider route, writes a run manifest, asks the
	    training provider to start training, stores the returned handle, runs the
	    "dev" certification, and finally writes the evidence for iteration
	    ``iter_001``.

	    Args:
	        run_dir: Root directory of the run. Files are written beneath its
	            ``logs``, ``manifests``, ``trainer_state`` and ``iterations``
	            sub-directories.
	        run_id: Identifier recorded in the manifest, progress heartbeats and
	            iteration evidence.
	        model_candidate: Candidate model name; recorded as ``base_model`` in
	            the evidence and passed to certification and start resolution.
	        train_provider: Provider name used for routing, provider lookup and to
	            locate ``providers/<train_provider>.yaml`` under ``CONFIG_ROOT``.
	        infer_provider: Inference provider name used for route validation and
	            recorded in the manifest and evidence.
	        release_id: Optional release identifier forwarded to
	            ``resolve_training_start`` and stored in the manifest.
	        finetune_start_policy: Start policy forwarded to
	            ``resolve_training_start``.
	        live: When ``False`` the manifest is flagged ``dry_run`` and the
	            certification gate result drives the recommendation. When ``True``
	            the gate result, recommendation, rationale and scoring metadata
	            are replaced by "pending paired eval" placeholders.

	    Returns:
	        A dict with the keys ``run_manifest``, ``train_handle`` and
	        ``iteration_evidence``.

	    Raises:
	        ValueError: If ``validate_route`` reports any route errors; the
	            message joins them with ``"; "``.
	    """
	    route_errors = validate_route(train_provider, infer_provider)
	    if route_errors:
	        raise ValueError("; ".join(route_errors))

	    # Single-iteration launcher: the identifier is used for both the evidence
	    # record and its output directory, so keep the two in sync.
	    iteration_id = "iter_001"

	    logger = AuditLogger(run_dir / "logs" / "audit.jsonl")
	    heartbeat(
	        logger,
	        ProgressState(
	            run_id,
	            1,
	            "train",
	            "prepare",
	            10,
	            10,
	            25,
	            train_provider,
	            infer_provider,
	            model_candidate,
	            next_action="write run manifest",
	        ),
	    )

	    provider = get_provider(train_provider)
	    training_start = resolve_training_start(
	        release_id=release_id,
	        model_candidate=model_candidate,
	        start_policy=finetune_start_policy,
	    )
	    manifest = {
	        "run_id": run_id,
	        "release_id": release_id,
	        "model_candidate": model_candidate,
	        "training_start": training_start,
	        "model_roadmap_evidence": candidate_evidence(model_candidate),
	        "execution": {
	            "train_provider": train_provider,
	            "infer_provider": infer_provider,
	            "dry_run": not live,
	        },
	        "provider_config": load_structured(
	            CONFIG_ROOT / "providers" / f"{train_provider}.yaml"
	        ),
	        "created_at": utc_now(),
	    }
	    # Persist the manifest before contacting the provider so a record exists
	    # even if start_train raises.
	    write_json(run_dir / "manifests" / "run_manifest.json", manifest)

	    handle = provider.start_train(manifest)
	    write_json(run_dir / "trainer_state" / "train_handle.json", handle)
	    heartbeat(
	        logger,
	        ProgressState(
	            run_id,
	            1,
	            "train",
	            "provider_plan",
	            100,
	            100,
	            35,
	            train_provider,
	            infer_provider,
	            model_candidate,
	            status="planned",
	            next_action="write iteration evidence",
	        ),
	    )

	    # NOTE(review): certification is run against "dev" in both dry-run and
	    # live mode, and its scores/improvements are recorded either way; only the
	    # gate/recommendation/rationale/scoring fields are overridden when live.
	    # Confirm that recording these scores for live runs is intended.
	    cert = certify("dev", model_candidate=model_candidate)
	    scores = cert["scores"]
	    improvement = cert["improvement_report"]["improvements"]
	    gate_result = cert["gate_result"]
	    rationale = cert["improvement_report"]["rationale"]

	    # Dry-run defaults.
	    training_delta = "dry-run bootstrap training plan generated"
	    promotion_recommendation = "promote_to_stage" if gate_result == "pass" else "hold"
	    scoring = {
	        "mode": "dry_run_fixture_scores",
	        "quality_signal": "orchestration_only",
	        "promotion_eligible": False,
	        "notes": [
	            "These fixture scores validate the SHFT state machine only.",
	            "Production promotion requires paired live model evaluation on frozen prompts.",
	        ],
	    }

	    if live:
	        # The handle is read as a mapping here; a missing status falls through
	        # to the "not submitted" wording.
	        handle_status = str(handle.get("status", ""))
	        live_deltas = {
	            "submitted": "live Hugging Face job submitted; await remote trainer artifacts before certification",
	            "submit_failed": "live Hugging Face job submission attempted but failed; see train_handle provider_result",
	        }
	        training_delta = live_deltas.get(
	            handle_status,
	            "live Hugging Face job plan generated but not submitted",
	        )
	        gate_result = "pending_remote_paired_eval"
	        promotion_recommendation = "await_paired_eval"
	        rationale = [
	            "Live training submission does not certify model quality.",
	            "The paired model-vs-model proof must complete before any promotion decision.",
	        ]
	        scoring = {
	            "mode": "pending_live_paired_eval",
	            "quality_signal": "none_yet",
	            "promotion_eligible": False,
	            "notes": [
	                "HF training job submission is infrastructure evidence only.",
	                "Use paired_eval_report.json critical_pass_rate, pairwise losses, and aggregate score for model-quality gating.",
	            ],
	        }

	    evidence = {
	        "run_id": run_id,
	        "iteration_id": iteration_id,
	        "parent_iteration_id": None,
	        "base_model": model_candidate,
	        "training_start": training_start,
	        "model_roadmap_evidence": manifest["model_roadmap_evidence"],
	        "train_provider": train_provider,
	        "infer_provider": infer_provider,
	        "repair_reason": None,
	        "changes": {"training_delta": training_delta},
	        "scores": scores,
	        "improvement": improvement,
	        "scoring": scoring,
	        "gate_result": gate_result,
	        "promotion_recommendation": promotion_recommendation,
	        "rationale": rationale,
	    }
	    write_iteration_evidence(run_dir / "iterations" / iteration_id, evidence)
	    return {
	        "run_manifest": manifest,
	        "train_handle": handle,
	        "iteration_evidence": evidence,
	    }

Xet Storage Details

Size: 5.07 kB
Xet hash: d4a64f78b99f57f8b1377c4f1dc3bebf17fc6a340da6abe6c590dcb2425c6346

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.