linvest21's picture
download
raw
5.07 kB
from __future__ import annotations
from pathlib import Path
from eval.certification import certify
from eval.evidence import write_iteration_evidence
from n21.config import load_structured, write_json
from n21.settings import CONFIG_ROOT
from observability.audit_log import AuditLogger, utc_now
from observability.heartbeat import heartbeat
from observability.progress import ProgressState
from orchestrator.provider_routing import get_provider, validate_route
from model_policy.roadmap import candidate_evidence
from training.start_policy import resolve_training_start
def run_training(
    run_dir: Path,
    *,
    run_id: str,
    model_candidate: str,
    train_provider: str,
    infer_provider: str,
    release_id: str | None = None,
    finetune_start_policy: str = "bootstrap",
    live: bool = False,
) -> dict[str, object]:
    """Drive iteration ``iter_001`` of a training run and persist its artifacts.

    The function validates the train/infer provider route, emits two audit
    heartbeats, writes the run manifest, asks the train provider to start
    training with that manifest, stores the returned handle, and finally
    writes the iteration evidence record.

    Args:
        run_dir: Root directory of the run. Paths used below it are
            ``logs/audit.jsonl``, ``manifests/run_manifest.json``,
            ``trainer_state/train_handle.json`` and ``iterations/iter_001``.
        run_id: Identifier recorded in the manifest, heartbeats and evidence.
        model_candidate: Candidate model name; also recorded as ``base_model``.
        train_provider: Provider name used for routing and for locating
            ``providers/<train_provider>.yaml`` under ``CONFIG_ROOT``.
        infer_provider: Inference provider name used for routing.
        release_id: Optional release identifier forwarded to the start policy.
        finetune_start_policy: Start policy name forwarded to
            ``resolve_training_start``.
        live: When false, the manifest is flagged ``dry_run`` and the evidence
            carries the gate result returned by ``certify``. When true, the
            gate result is reported as pending and the training delta text is
            derived from the provider handle's ``status``.

    Returns:
        A dict with the keys ``run_manifest``, ``train_handle`` and
        ``iteration_evidence``.

    Raises:
        ValueError: If ``validate_route`` reports any errors; the message is
            the errors joined with ``"; "``.
    """
    problems = validate_route(train_provider, infer_provider)
    if problems:
        raise ValueError("; ".join(problems))

    audit = AuditLogger(run_dir / "logs" / "audit.jsonl")
    heartbeat(
        audit,
        ProgressState(
            run_id,
            1,
            "train",
            "prepare",
            10,
            10,
            25,
            train_provider,
            infer_provider,
            model_candidate,
            next_action="write run manifest",
        ),
    )

    backend = get_provider(train_provider)
    training_start = resolve_training_start(
        release_id=release_id,
        model_candidate=model_candidate,
        start_policy=finetune_start_policy,
    )

    # Gather the manifest inputs in the same order the manifest lists them.
    roadmap_evidence = candidate_evidence(model_candidate)
    provider_config = load_structured(
        CONFIG_ROOT / "providers" / f"{train_provider}.yaml"
    )
    created_at = utc_now()
    manifest = {
        "run_id": run_id,
        "release_id": release_id,
        "model_candidate": model_candidate,
        "training_start": training_start,
        "model_roadmap_evidence": roadmap_evidence,
        "execution": {
            "train_provider": train_provider,
            "infer_provider": infer_provider,
            "dry_run": not live,
        },
        "provider_config": provider_config,
        "created_at": created_at,
    }
    write_json(run_dir / "manifests" / "run_manifest.json", manifest)

    handle = backend.start_train(manifest)
    write_json(run_dir / "trainer_state" / "train_handle.json", handle)
    heartbeat(
        audit,
        ProgressState(
            run_id,
            1,
            "train",
            "provider_plan",
            100,
            100,
            35,
            train_provider,
            infer_provider,
            model_candidate,
            status="planned",
            next_action="write iteration evidence",
        ),
    )

    # certify runs in both modes; its scores and improvements are always
    # recorded, while gate/rationale are replaced below for live runs.
    cert = certify("dev", model_candidate=model_candidate)
    scores = cert["scores"]
    report = cert["improvement_report"]
    improvement = report["improvements"]
    gate_result = cert["gate_result"]
    rationale = report["rationale"]

    if live:
        status_text = str(handle.get("status", ""))
        delta_by_status = {
            "submitted": "live Hugging Face job submitted; await remote trainer artifacts before certification",
            "submit_failed": "live Hugging Face job submission attempted but failed; see train_handle provider_result",
        }
        training_delta = delta_by_status.get(
            status_text,
            "live Hugging Face job plan generated but not submitted",
        )
        gate_result = "pending_remote_paired_eval"
        promotion_recommendation = "await_paired_eval"
        rationale = [
            "Live training submission does not certify model quality.",
            "The paired model-vs-model proof must complete before any promotion decision.",
        ]
        scoring = {
            "mode": "pending_live_paired_eval",
            "quality_signal": "none_yet",
            "promotion_eligible": False,
            "notes": [
                "HF training job submission is infrastructure evidence only.",
                "Use paired_eval_report.json critical_pass_rate, pairwise losses, and aggregate score for model-quality gating.",
            ],
        }
    else:
        training_delta = "dry-run bootstrap training plan generated"
        if gate_result == "pass":
            promotion_recommendation = "promote_to_stage"
        else:
            promotion_recommendation = "hold"
        scoring = {
            "mode": "dry_run_fixture_scores",
            "quality_signal": "orchestration_only",
            "promotion_eligible": False,
            "notes": [
                "These fixture scores validate the SHFT state machine only.",
                "Production promotion requires paired live model evaluation on frozen prompts.",
            ],
        }

    evidence = {
        "run_id": run_id,
        "iteration_id": "iter_001",
        "parent_iteration_id": None,
        "base_model": model_candidate,
        "training_start": training_start,
        # Read back from the manifest so the evidence mirrors what was handed
        # to the provider.
        "model_roadmap_evidence": manifest["model_roadmap_evidence"],
        "train_provider": train_provider,
        "infer_provider": infer_provider,
        "repair_reason": None,
        "changes": {"training_delta": training_delta},
        "scores": scores,
        "improvement": improvement,
        "scoring": scoring,
        "gate_result": gate_result,
        "promotion_recommendation": promotion_recommendation,
        "rationale": rationale,
    }
    write_iteration_evidence(run_dir / "iterations" / "iter_001", evidence)

    return {
        "run_manifest": manifest,
        "train_handle": handle,
        "iteration_evidence": evidence,
    }

Xet Storage Details

Size:
5.07 kB
·
Xet hash:
d4a64f78b99f57f8b1377c4f1dc3bebf17fc6a340da6abe6c590dcb2425c6346

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.