linvest21's picture
download
raw
1.72 kB
from __future__ import annotations
from config.thresholds import resolve_thresholds
from eval.improvement_report import build_improvement_report
from model_policy.roadmap import candidate_evidence
# Hard-coded baseline score set. certify() passes it to
# build_improvement_report() as the "before" side and echoes it in its result
# under scores["baseline_prod"].
# NOTE(review): values look like 0-1 fractions per benchmark — confirm scale.
DEFAULT_BASELINE: dict[str, float] = {
    "aggregate": 0.884,
    "financebench": 0.891,
    "convfinqa": 0.862,
    "phrasebank_macro_f1": 0.921,
    "private_prompt_replay": 0.940,
}
# Hard-coded score set for the iteration under review. certify() gates on its
# "aggregate" and "private_prompt_replay" entries, passes it to
# build_improvement_report() as the "after" side, and echoes it in its result
# under scores["current_iteration"].
# NOTE(review): values look like 0-1 fractions per benchmark — confirm scale.
DEFAULT_CURRENT: dict[str, float] = {
    "aggregate": 0.903,
    "financebench": 0.900,
    "convfinqa": 0.871,
    "phrasebank_macro_f1": 0.928,
    "private_prompt_replay": 0.968,
}
def certify(
    env: str,
    task: str = "finance_qa",
    model_candidate: str | None = None,
    *,
    baseline: dict[str, float] | None = None,
    current: dict[str, float] | None = None,
) -> dict[str, object]:
    """Run the certification gates and return a pass/fail report.

    Thresholds are looked up with ``resolve_thresholds(env, task)`` and the
    improvement report is built from the baseline and current score sets.
    Three gates are checked; each failure appends a message to ``errors``:

    * current ``aggregate`` score vs. ``quality_gates.aggregate_fineval_min``
    * absolute aggregate improvement vs.
      ``improvement_gates.aggregate_score_delta_min_abs``
    * current ``private_prompt_replay`` vs.
      ``quality_gates.private_prompt_replay_pass_rate_min``

    Args:
        env: Environment name forwarded to ``resolve_thresholds``.
        task: Task name forwarded to ``resolve_thresholds``.
        model_candidate: When truthy, ``candidate_evidence(model_candidate)``
            is attached to the result as ``model_roadmap_evidence``.
        baseline: Optional baseline scores; defaults to ``DEFAULT_BASELINE``.
        current: Optional current-iteration scores; defaults to
            ``DEFAULT_CURRENT``. Must contain ``"aggregate"`` and
            ``"private_prompt_replay"``.

    Returns:
        A dict with ``gate_result`` (``"pass"`` or ``"fail"``), ``errors``,
        ``scores``, ``improvement_report`` and ``thresholds``, plus
        ``model_roadmap_evidence`` when a candidate was given.

    Raises:
        KeyError: If a required score or threshold key is missing.
    """
    # Shallow-copy the score sets so the returned payload never aliases the
    # module-level defaults (or the caller's dicts); otherwise a consumer
    # mutating result["scores"] would silently change later certifications.
    baseline_scores = dict(DEFAULT_BASELINE if baseline is None else baseline)
    current_scores = dict(DEFAULT_CURRENT if current is None else current)

    thresholds = resolve_thresholds(env, task)
    report = build_improvement_report(baseline_scores, current_scores)
    quality = thresholds["quality_gates"]
    improvement = thresholds["improvement_gates"]

    errors: list[str] = []
    if current_scores["aggregate"] < quality["aggregate_fineval_min"]:
        errors.append("aggregate score below threshold")
    aggregate_delta = report["improvements"]["aggregate"]["abs"]
    if aggregate_delta < improvement["aggregate_score_delta_min_abs"]:
        errors.append("aggregate improvement below threshold")
    if current_scores["private_prompt_replay"] < quality["private_prompt_replay_pass_rate_min"]:
        errors.append("private prompt replay below threshold")

    result: dict[str, object] = {
        "gate_result": "pass" if not errors else "fail",
        "errors": errors,
        "scores": {
            "baseline_prod": baseline_scores,
            "current_iteration": current_scores,
        },
        "improvement_report": report,
        "thresholds": thresholds,
    }
    if model_candidate:
        result["model_roadmap_evidence"] = candidate_evidence(model_candidate)
    return result

Xet Storage Details

Size:
1.72 kB
·
Xet hash:
c6724271685aefae52a690ccee646a04a2d1a6ed90e5dfb13eebbba1deba4096

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.