Buckets:

linvest21
/

shft-artifacts

Files

xet

linvest21/shft-artifacts/code/self_healing_finetuning/eval/certification.py

linvest21

6 days ago

download

raw

1.72 kB

	from __future__ import annotations

	from config.thresholds import resolve_thresholds
	from eval.improvement_report import build_improvement_report
	from model_policy.roadmap import candidate_evidence


	# Hard-coded baseline scores, keyed by metric name.  ``certify`` passes this
	# mapping to ``build_improvement_report`` as the first argument and reports it
	# under ``scores["baseline_prod"]``.
	DEFAULT_BASELINE = {
	    "aggregate": 0.884,
	    "financebench": 0.891,
	    "convfinqa": 0.862,
	    "phrasebank_macro_f1": 0.921,
	    "private_prompt_replay": 0.940,
	}

	# Hard-coded scores for the iteration under certification, keyed by the same
	# metric names as the baseline.  ``certify`` checks the "aggregate" and
	# "private_prompt_replay" entries against the quality gates and reports the
	# whole mapping under ``scores["current_iteration"]``.
	DEFAULT_CURRENT = {
	    "aggregate": 0.903,
	    "financebench": 0.900,
	    "convfinqa": 0.871,
	    "phrasebank_macro_f1": 0.928,
	    "private_prompt_replay": 0.968,
	}


	def certify(
	    env: str,
	    task: str = "finance_qa",
	    model_candidate: str | None = None,
	) -> dict[str, object]:
	    """Evaluate the certification gates for ``env``/``task`` and return the verdict.

	    The module-level ``DEFAULT_CURRENT`` scores are checked against the
	    thresholds returned by ``resolve_thresholds(env, task)``.  The improvement
	    figures come from
	    ``build_improvement_report(DEFAULT_BASELINE, DEFAULT_CURRENT)``.

	    Args:
	        env: Environment identifier, forwarded to ``resolve_thresholds``.
	        task: Task identifier, forwarded to ``resolve_thresholds``.
	        model_candidate: Optional candidate identifier.  When truthy, the
	            result additionally carries ``candidate_evidence(model_candidate)``
	            under the ``"model_roadmap_evidence"`` key.

	    Returns:
	        A dict with the keys ``"gate_result"`` (``"pass"`` when no gate failed,
	        otherwise ``"fail"``), ``"errors"`` (one message per failed gate),
	        ``"scores"``, ``"improvement_report"`` and ``"thresholds"``.

	    NOTE(review): the thresholds and the report are indexed like nested
	    mappings; a missing key presumably surfaces as ``KeyError`` -- confirm
	    against ``resolve_thresholds`` / ``build_improvement_report``.
	    """
	    thresholds = resolve_thresholds(env, task)
	    report = build_improvement_report(DEFAULT_BASELINE, DEFAULT_CURRENT)
	    quality = thresholds["quality_gates"]
	    improvement = thresholds["improvement_gates"]

	    errors: list[str] = []
	    if DEFAULT_CURRENT["aggregate"] < quality["aggregate_fineval_min"]:
	        errors.append("aggregate score below threshold")
	    if report["improvements"]["aggregate"]["abs"] < improvement["aggregate_score_delta_min_abs"]:
	        errors.append("aggregate improvement below threshold")
	    if DEFAULT_CURRENT["private_prompt_replay"] < quality["private_prompt_replay_pass_rate_min"]:
	        errors.append("private prompt replay below threshold")

	    result: dict[str, object] = {
	        "gate_result": "fail" if errors else "pass",
	        "errors": errors,
	        # Hand out copies so a caller that edits the returned scores cannot
	        # alter the module-level defaults used by later certifications.
	        "scores": {
	            "baseline_prod": dict(DEFAULT_BASELINE),
	            "current_iteration": dict(DEFAULT_CURRENT),
	        },
	        "improvement_report": report,
	        "thresholds": thresholds,
	    }
	    if model_candidate:
	        result["model_roadmap_evidence"] = candidate_evidence(model_candidate)
	    return result

Xet Storage Details

Size: 1.72 kB
Xet hash: c6724271685aefae52a690ccee646a04a2d1a6ed90e5dfb13eebbba1deba4096

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.