Buckets:
| from __future__ import annotations | |
| from config.thresholds import resolve_thresholds | |
| from eval.improvement_report import build_improvement_report | |
| from model_policy.roadmap import candidate_evidence | |
# Hard-coded baseline scores; certify() reports them under "baseline_prod"
# and compares DEFAULT_CURRENT against them via build_improvement_report().
# NOTE(review): keys look like benchmark/suite names with scores in [0, 1] —
# confirm against whatever produces these numbers.
DEFAULT_BASELINE: dict[str, float] = {
    "aggregate": 0.884,
    "financebench": 0.891,
    "convfinqa": 0.862,
    "phrasebank_macro_f1": 0.921,
    "private_prompt_replay": 0.940,
}
# Hard-coded scores for the iteration under review; certify() reports them
# under "current_iteration" and checks "aggregate" and
# "private_prompt_replay" against the resolved quality gates.
# NOTE(review): keys look like benchmark/suite names with scores in [0, 1] —
# confirm against whatever produces these numbers.
DEFAULT_CURRENT: dict[str, float] = {
    "aggregate": 0.903,
    "financebench": 0.900,
    "convfinqa": 0.871,
    "phrasebank_macro_f1": 0.928,
    "private_prompt_replay": 0.968,
}
def certify(
    env: str,
    task: str = "finance_qa",
    model_candidate: str | None = None,
) -> dict[str, object]:
    """Check the module's default scores against the gates for ``env``/``task``.

    The scores evaluated are the hard-coded ``DEFAULT_BASELINE`` and
    ``DEFAULT_CURRENT`` tables, not values supplied by the caller.

    Args:
        env: Passed to ``resolve_thresholds`` to select the thresholds.
        task: Passed to ``resolve_thresholds``; defaults to ``"finance_qa"``.
        model_candidate: When truthy, the result additionally carries
            ``"model_roadmap_evidence"`` from ``candidate_evidence``.
            ``None`` and ``""`` both skip it.

    Returns:
        A dict with these keys:

        * ``"gate_result"``: ``"pass"`` when no gate failed, else ``"fail"``.
        * ``"errors"``: one message per failed gate, in check order.
        * ``"scores"``: copies of the baseline and current score tables.
        * ``"improvement_report"``: output of ``build_improvement_report``.
        * ``"thresholds"``: the resolved thresholds mapping.
        * ``"model_roadmap_evidence"``: only when ``model_candidate`` is truthy.

    Raises:
        KeyError: If the resolved thresholds or the improvement report lack
            one of the keys read below.
    """
    thresholds = resolve_thresholds(env, task)
    report = build_improvement_report(DEFAULT_BASELINE, DEFAULT_CURRENT)
    quality = thresholds["quality_gates"]
    improvement = thresholds["improvement_gates"]

    # Every gate is evaluated (no early exit) so the caller sees all failures.
    errors: list[str] = []
    if DEFAULT_CURRENT["aggregate"] < quality["aggregate_fineval_min"]:
        errors.append("aggregate score below threshold")
    if report["improvements"]["aggregate"]["abs"] < improvement["aggregate_score_delta_min_abs"]:
        errors.append("aggregate improvement below threshold")
    if DEFAULT_CURRENT["private_prompt_replay"] < quality["private_prompt_replay_pass_rate_min"]:
        errors.append("private prompt replay below threshold")

    result: dict[str, object] = {
        "gate_result": "pass" if not errors else "fail",
        "errors": errors,
        # Shallow copies: handing out the module-level dicts themselves would
        # let a caller that edits the result corrupt the defaults for every
        # later certify() call.
        "scores": {
            "baseline_prod": dict(DEFAULT_BASELINE),
            "current_iteration": dict(DEFAULT_CURRENT),
        },
        "improvement_report": report,
        "thresholds": thresholds,
    }
    if model_candidate:
        result["model_roadmap_evidence"] = candidate_evidence(model_candidate)
    return result
Xet Storage Details
- Size:
- 1.72 kB
- Xet hash:
- c6724271685aefae52a690ccee646a04a2d1a6ed90e5dfb13eebbba1deba4096
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.