Buckets:
| from __future__ import annotations | |
| from pathlib import Path | |
| from eval.certification import certify | |
| from eval.evidence import write_iteration_evidence | |
| from n21.config import load_structured, write_json | |
| from n21.settings import CONFIG_ROOT | |
| from observability.audit_log import AuditLogger, utc_now | |
| from observability.heartbeat import heartbeat | |
| from observability.progress import ProgressState | |
| from orchestrator.provider_routing import get_provider, validate_route | |
| from model_policy.roadmap import candidate_evidence | |
| from training.start_policy import resolve_training_start | |
def run_training(
    run_dir: Path,
    *,
    run_id: str,
    model_candidate: str,
    train_provider: str,
    infer_provider: str,
    release_id: str | None = None,
    finetune_start_policy: str = "bootstrap",
    live: bool = False,
) -> dict[str, object]:
    """Plan the first training iteration and record its manifest and evidence.

    The steps run in a fixed order: the provider route is validated, a run
    manifest is written to ``manifests/run_manifest.json``, the training
    provider's ``start_train`` is called with that manifest, the returned
    handle is written to ``trainer_state/train_handle.json``, ``certify`` is
    run against ``"dev"``, and the evidence for ``iter_001`` is handed to
    ``write_iteration_evidence``. Two heartbeats are sent through an
    ``AuditLogger`` bound to ``logs/audit.jsonl``.

    Args:
        run_dir: Root directory of the run; all output paths are relative
            to it.
        run_id: Identifier stored in the manifest, the evidence and the
            progress states.
        model_candidate: Candidate model name; recorded as ``base_model`` in
            the evidence and passed to ``certify``.
        train_provider: Name used to look up the training provider and its
            ``providers/<name>.yaml`` config under ``CONFIG_ROOT``.
        infer_provider: Inference provider name; validated together with
            ``train_provider`` and recorded in the outputs.
        release_id: Optional release identifier forwarded to
            ``resolve_training_start`` and stored in the manifest.
        finetune_start_policy: Forwarded to ``resolve_training_start`` as
            ``start_policy``.
        live: When false the manifest is flagged ``dry_run`` and the gate
            result from ``certify`` drives the recommendation. When true the
            evidence is marked as pending a paired evaluation instead.

    Returns:
        A dict with the keys ``run_manifest``, ``train_handle`` and
        ``iteration_evidence``.

    Raises:
        ValueError: If ``validate_route`` reports any errors; the message is
            those errors joined with ``"; "``.
    """
    problems = validate_route(train_provider, infer_provider)
    if problems:
        raise ValueError("; ".join(problems))

    audit = AuditLogger(run_dir / "logs" / "audit.jsonl")
    # ProgressState fields are passed positionally exactly as the original
    # call did; NOTE(review): meaning of the numeric fields is defined in
    # observability.progress -- confirm there before changing them.
    prepare_state = ProgressState(
        run_id, 1, "train", "prepare", 10, 10, 25,
        train_provider, infer_provider, model_candidate,
        next_action="write run manifest",
    )
    heartbeat(audit, prepare_state)

    trainer = get_provider(train_provider)
    start_info = resolve_training_start(
        release_id=release_id,
        model_candidate=model_candidate,
        start_policy=finetune_start_policy,
    )

    # Gathered in the same order the manifest lists them, so the external
    # calls happen in an unchanged sequence.
    roadmap_evidence = candidate_evidence(model_candidate)
    execution = {
        "train_provider": train_provider,
        "infer_provider": infer_provider,
        "dry_run": not live,
    }
    provider_config = load_structured(
        CONFIG_ROOT / "providers" / f"{train_provider}.yaml"
    )
    manifest = {
        "run_id": run_id,
        "release_id": release_id,
        "model_candidate": model_candidate,
        "training_start": start_info,
        "model_roadmap_evidence": roadmap_evidence,
        "execution": execution,
        "provider_config": provider_config,
        "created_at": utc_now(),
    }
    write_json(run_dir / "manifests" / "run_manifest.json", manifest)

    # The manifest is persisted before the provider is invoked, and the
    # handle is persisted before the next heartbeat.
    handle = trainer.start_train(manifest)
    write_json(run_dir / "trainer_state" / "train_handle.json", handle)
    planned_state = ProgressState(
        run_id, 1, "train", "provider_plan", 100, 100, 35,
        train_provider, infer_provider, model_candidate,
        status="planned",
        next_action="write iteration evidence",
    )
    heartbeat(audit, planned_state)

    # Dry-run defaults; certification runs and is read in both modes.
    cert = certify("dev", model_candidate=model_candidate)
    scores = cert["scores"]
    report = cert["improvement_report"]
    improvement = report["improvements"]
    gate_result = cert["gate_result"]
    if gate_result == "pass":
        promotion_recommendation = "promote_to_stage"
    else:
        promotion_recommendation = "hold"
    rationale = report["rationale"]
    training_delta = "dry-run bootstrap training plan generated"
    scoring = {
        "mode": "dry_run_fixture_scores",
        "quality_signal": "orchestration_only",
        "promotion_eligible": False,
        "notes": [
            "These fixture scores validate the SHFT state machine only.",
            "Production promotion requires paired live model evaluation on frozen prompts.",
        ],
    }

    if live:
        # A live run replaces the gate outcome: the handle status only
        # selects the wording of the training delta, and the decision is
        # deferred to the paired evaluation.
        delta_by_status = {
            "submitted": "live Hugging Face job submitted; await remote trainer artifacts before certification",
            "submit_failed": "live Hugging Face job submission attempted but failed; see train_handle provider_result",
        }
        handle_status = str(handle.get("status", ""))
        training_delta = delta_by_status.get(
            handle_status,
            "live Hugging Face job plan generated but not submitted",
        )
        gate_result = "pending_remote_paired_eval"
        promotion_recommendation = "await_paired_eval"
        rationale = [
            "Live training submission does not certify model quality.",
            "The paired model-vs-model proof must complete before any promotion decision.",
        ]
        scoring = {
            "mode": "pending_live_paired_eval",
            "quality_signal": "none_yet",
            "promotion_eligible": False,
            "notes": [
                "HF training job submission is infrastructure evidence only.",
                "Use paired_eval_report.json critical_pass_rate, pairwise losses, and aggregate score for model-quality gating.",
            ],
        }

    evidence = {
        "run_id": run_id,
        "iteration_id": "iter_001",
        "parent_iteration_id": None,
        "base_model": model_candidate,
        "training_start": start_info,
        "model_roadmap_evidence": roadmap_evidence,
        "train_provider": train_provider,
        "infer_provider": infer_provider,
        "repair_reason": None,
        "changes": {"training_delta": training_delta},
        "scores": scores,
        "improvement": improvement,
        "scoring": scoring,
        "gate_result": gate_result,
        "promotion_recommendation": promotion_recommendation,
        "rationale": rationale,
    }
    write_iteration_evidence(run_dir / "iterations" / "iter_001", evidence)

    return {
        "run_manifest": manifest,
        "train_handle": handle,
        "iteration_evidence": evidence,
    }
Xet Storage Details
- Size:
- 5.07 kB
- Xet hash:
- d4a64f78b99f57f8b1377c4f1dc3bebf17fc6a340da6abe6c590dcb2425c6346
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.