linvest21's picture
download
raw
12.4 kB
from __future__ import annotations

import math
from pathlib import Path
from typing import Any

from n21.config import load_structured
from n21.settings import CONFIG_ROOT
# Default thresholds file, located under CONFIG_ROOT; used when no explicit path is given.
DEFAULT_CONFIG = CONFIG_ROOT / "thresholds" / "model_quality.yaml"
def load_model_quality_thresholds(path: Path | None = None) -> dict[str, Any]:
    """Load the model-quality thresholds document.

    Args:
        path: Location of the thresholds file. When it is omitted (or
            otherwise falsy) the module-level ``DEFAULT_CONFIG`` is used.

    Returns:
        Whatever ``load_structured`` produces for the chosen location.
    """
    source = path if path else DEFAULT_CONFIG
    return load_structured(source)
def _number(value: Any, default: float = 0.0) -> float:
    """Coerce ``value`` to a float, returning ``default`` when ``float()`` rejects it.

    Args:
        value: Anything ``float()`` might accept (numbers, numeric strings, ...).
        default: Result to use when the conversion fails.

    Returns:
        ``float(value)``, or ``default`` if the conversion raises. Values that
        convert to NaN or infinity are returned unchanged.
    """
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: float() raises it for ints too large for a double,
        # which would otherwise escape the "fall back to default" contract.
        return default
def _bool(value: Any) -> bool:
    """Return the truthiness of ``value`` as a real ``bool``."""
    # bool() already yields True/False; an extra ``is True`` comparison adds nothing.
    return bool(value)
def _add(checks: dict[str, Any], errors: list[str], name: str, ok: bool, detail: str) -> None:
    """Record the outcome of one named check.

    The result is stored in ``checks`` under ``name``; when the check did not
    pass, a ``"name: detail"`` line is also appended to ``errors``.
    """
    outcome = {"ok": ok, "detail": detail}
    checks[name] = outcome
    if ok:
        return
    errors.append(f"{name}: {detail}")
def _mapping(container: dict[str, Any], key: str) -> dict[str, Any]:
    """Return ``container[key]``, or an empty dict when the key is absent or null."""
    value = container.get(key)
    return {} if value is None else value


def _count(value: Any, default: int = 0) -> int:
    """Coerce ``value`` to an int, using ``default`` for missing or non-finite input."""
    number = _number(value, float(default))
    # int() raises on NaN and infinity, so those fall back to the default as well.
    return int(number) if math.isfinite(number) else default


def _observed(value: Any) -> float | None:
    """Return ``value`` as a finite float, or None when nothing usable was reported."""
    number = _number(value, math.nan)
    return number if math.isfinite(number) else None


def _shown(section: dict[str, Any], key: str, default: Any) -> Any:
    """Return the threshold to print in a check detail: configured value, else default."""
    configured = section.get(key)
    return default if configured is None else configured


def _require_at_least(
    checks: dict[str, Any],
    errors: list[str],
    name: str,
    observed: float,
    section: dict[str, Any],
    key: str,
    default: float,
) -> None:
    """Record a check that a float metric meets the configured lower bound."""
    _add(
        checks,
        errors,
        name,
        observed >= _number(section.get(key), default),
        f"{observed:.4f} >= {_shown(section, key, default)}",
    )


def _require_count(
    checks: dict[str, Any],
    errors: list[str],
    name: str,
    observed: int,
    section: dict[str, Any],
    key: str,
    default: int,
) -> None:
    """Record a check that an integer count meets the configured minimum."""
    _add(
        checks,
        errors,
        name,
        observed >= _count(section.get(key), default),
        f"{observed} >= {_shown(section, key, default)}",
    )


def _check_paired_eval(
    cfg: dict[str, Any],
    report: dict[str, Any] | None,
    checks: dict[str, Any],
    errors: list[str],
) -> None:
    """Gate the paired baseline-vs-candidate evaluation report."""
    if report is None:
        if cfg.get("required", True):
            _add(checks, errors, "paired_eval_present", False, "missing eval/paired_eval_report.json")
        return

    baseline = _mapping(report, "baseline")
    candidate = _mapping(report, "candidate")
    improvement = _mapping(report, "improvement")

    sample_count = _count(report.get("sample_count") or candidate.get("sample_count"))
    candidate_aggregate = _number(candidate.get("aggregate"))
    baseline_aggregate = _number(baseline.get("aggregate"))

    _require_count(checks, errors, "paired_eval_sample_count", sample_count, cfg, "min_samples", 120)
    _require_at_least(
        checks, errors, "candidate_aggregate_absolute", candidate_aggregate, cfg, "min_candidate_aggregate", 0.60
    )
    _add(
        checks,
        errors,
        "candidate_beats_baseline",
        candidate_aggregate > baseline_aggregate,
        f"{candidate_aggregate:.4f} > {baseline_aggregate:.4f}",
    )
    _require_at_least(
        checks,
        errors,
        "aggregate_delta_absolute",
        _number(improvement.get("aggregate_abs")),
        cfg,
        "min_aggregate_delta_abs",
        0.05,
    )
    _require_at_least(
        checks,
        errors,
        "critical_pass_absolute",
        _number(candidate.get("critical_pass_rate")),
        cfg,
        "min_candidate_critical_pass_rate",
        0.70,
    )
    _require_at_least(
        checks,
        errors,
        "critical_pass_not_regressed",
        _number(improvement.get("critical_pass_rate_abs")),
        cfg,
        "min_critical_pass_delta_abs",
        0.0,
    )

    # Upper-bound check: a missing value must not be read as "zero losses",
    # otherwise the gate passes without any measurement behind it.
    loss_rate = _observed(improvement.get("pairwise_loss_rate"))
    max_loss_shown = _shown(cfg, "max_pairwise_loss_rate", 0.02)
    if loss_rate is None:
        _add(
            checks,
            errors,
            "pairwise_loss_rate",
            False,
            f"pairwise_loss_rate not reported (required <= {max_loss_shown})",
        )
    else:
        _add(
            checks,
            errors,
            "pairwise_loss_rate",
            loss_rate <= _number(cfg.get("max_pairwise_loss_rate"), 0.02),
            f"{loss_rate:.4f} <= {max_loss_shown}",
        )

    _require_at_least(
        checks,
        errors,
        "pairwise_win_rate",
        _number(improvement.get("pairwise_win_rate")),
        cfg,
        "min_pairwise_win_rate",
        0.55,
    )

    zero_baseline_allowed = cfg.get("allow_zero_baseline_percent", False)
    if baseline_aggregate == 0 and improvement.get("aggregate_pct") is None and not zero_baseline_allowed:
        _add(
            checks,
            errors,
            "nonzero_baseline_for_relative_proof",
            False,
            "baseline aggregate is zero; relative improvement is undefined",
        )


def _check_training_budget(
    cfg: dict[str, Any],
    plan: dict[str, Any] | None,
    checks: dict[str, Any],
    errors: list[str],
    warnings: list[str],
) -> None:
    """Gate the training plan: record counts, step budget and trainer readiness."""
    if plan is None:
        if cfg.get("required", True):
            _add(checks, errors, "training_plan_present", False, "missing remote_artifacts/training_plan.json")
        return

    hyperparameters = _mapping(plan, "hyperparameters")
    readiness = _mapping(plan, "readiness")

    _require_count(
        checks, errors, "training_records_minimum", _count(plan.get("train_records")), cfg, "min_train_records", 100
    )
    _require_count(
        checks, errors, "validation_records_minimum", _count(plan.get("valid_records")), cfg, "min_valid_records", 10
    )
    _require_count(
        checks, errors, "training_steps_minimum", _count(hyperparameters.get("max_steps")), cfg, "min_max_steps", 300
    )

    if cfg.get("require_production_candidate", True):
        production_candidate = readiness.get("production_candidate")
        _add(
            checks,
            errors,
            "trainer_readiness_candidate",
            production_candidate is True,
            f"production_candidate={production_candidate}",
        )

    # ``or []`` keeps a null ``warnings`` entry from breaking the iteration.
    warnings.extend(f"trainer readiness warning: {warning}" for warning in readiness.get("warnings") or [])


def _check_corpus_coverage(
    cfg: dict[str, Any],
    manifest: dict[str, Any] | None,
    checks: dict[str, Any],
    errors: list[str],
) -> None:
    """Gate the dataset manifest: record retention and train/valid/test split ratios."""
    if manifest is None:
        if cfg.get("required", True):
            _add(checks, errors, "dataset_manifest_present", False, "missing dataset_snapshot/dataset_manifest.json")
        return

    quality = _mapping(manifest, "quality")
    split_counts = _mapping(manifest, "split_counts")

    source_records = _count(quality.get("record_count"))
    train_count = _count(split_counts.get("train"))
    valid_count = _count(split_counts.get("valid"))
    test_count = _count(split_counts.get("test"))
    split_total = train_count + valid_count + test_count

    # Ratios are reported as 0.0 whenever their denominator is zero.
    retention = (split_total / source_records) if source_records else 0.0
    train_ratio = (train_count / split_total) if split_total else 0.0
    valid_ratio = (valid_count / split_total) if split_total else 0.0
    test_ratio = (test_count / split_total) if split_total else 0.0

    _require_at_least(checks, errors, "corpus_record_retention", retention, cfg, "min_total_record_retention", 0.95)
    if cfg.get("require_train_valid_test", True):
        _add(
            checks,
            errors,
            "train_valid_test_present",
            train_count > 0 and valid_count > 0 and test_count > 0,
            f"{split_counts}",
        )
    _require_at_least(checks, errors, "train_split_ratio", train_ratio, cfg, "min_train_split_ratio", 0.70)
    _require_at_least(checks, errors, "valid_split_ratio", valid_ratio, cfg, "min_valid_split_ratio", 0.05)
    _require_at_least(checks, errors, "test_split_ratio", test_ratio, cfg, "min_test_split_ratio", 0.05)


def _check_model_judge(
    cfg: dict[str, Any],
    report: dict[str, Any] | None,
    checks: dict[str, Any],
    errors: list[str],
) -> None:
    """Gate the model-as-judge report: rubric version, sample count and scores."""
    if report is None:
        if cfg.get("require_model_as_judge", True):
            _add(checks, errors, "model_as_judge_present", False, "missing eval/model_judge_report.json")
        return

    rubric = report.get("rubric_version")
    required_rubric = cfg.get("required_rubric_version")
    _add(
        checks,
        errors,
        "model_judge_rubric_version",
        rubric == required_rubric,
        f"{rubric} == {required_rubric}",
    )
    _require_count(
        checks, errors, "model_judge_sample_count", _count(report.get("sample_count")), cfg, "min_judged_samples", 30
    )
    _require_at_least(
        checks, errors, "model_judge_mean_score", _number(report.get("mean_score")), cfg, "min_mean_score", 0.80
    )
    _require_at_least(
        checks,
        errors,
        "model_judge_critical_pass_rate",
        _number(report.get("critical_pass_rate")),
        cfg,
        "min_critical_pass_rate",
        0.90,
    )


def _check_human_review(
    cfg: dict[str, Any],
    report: dict[str, Any] | None,
    checks: dict[str, Any],
    errors: list[str],
) -> None:
    """Gate the human spot-check report: sample count, critical failures and approval."""
    if report is None:
        if cfg.get("required", True):
            _add(checks, errors, "human_spot_check_present", False, "missing eval/human_spot_check_report.json")
        return

    reviewed = _count(report.get("sample_count") or report.get("reviewed_samples"))
    approved = report.get("approved") is True or report.get("status") == "approved"

    _require_count(checks, errors, "human_review_sample_count", reviewed, cfg, "min_reviewed_samples", 10)

    # Upper-bound check: an absent failure count must not be read as "zero failures".
    failures = _observed(report.get("critical_failures"))
    max_failures_shown = _shown(cfg, "max_critical_failures", 0)
    if failures is None:
        _add(
            checks,
            errors,
            "human_review_critical_failures",
            False,
            f"critical_failures not reported (required <= {max_failures_shown})",
        )
    else:
        critical_failures = int(failures)
        _add(
            checks,
            errors,
            "human_review_critical_failures",
            critical_failures <= _count(cfg.get("max_critical_failures"), 0),
            f"{critical_failures} <= {max_failures_shown}",
        )

    if cfg.get("require_approval", True):
        _add(checks, errors, "human_review_approved", approved, f"approved={approved}")


def evaluate_model_quality_gate(
    *,
    paired_eval: dict[str, Any] | None = None,
    training_plan: dict[str, Any] | None = None,
    dataset_manifest: dict[str, Any] | None = None,
    model_judge_report: dict[str, Any] | None = None,
    human_review_report: dict[str, Any] | None = None,
    thresholds: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Evaluate production model-quality gates from measured evidence.

    This deliberately treats fixture/orchestration cycle evidence as irrelevant.
    A model can pass only with paired model-vs-model measurement, adequate
    training budget, corpus-retention evidence, model-as-judge rubric evidence,
    and human spot-check evidence.

    Args:
        paired_eval: Paired evaluation report (``baseline``, ``candidate`` and
            ``improvement`` sections plus ``sample_count``).
        training_plan: Training plan with record counts, ``hyperparameters``
            and ``readiness``.
        dataset_manifest: Dataset manifest with ``quality`` and ``split_counts``.
        model_judge_report: Model-as-judge report.
        human_review_report: Human spot-check report.
        thresholds: Threshold configuration keyed by section. When omitted or
            empty, it is loaded with ``load_model_quality_thresholds()``.

    Returns:
        A dict with ``ok`` / ``eligible_for_promotion`` (both true only when no
        check failed), ``quality_signal``, the ``errors`` and ``warnings``
        lists, the per-check ``checks`` mapping and the ``thresholds`` used.

    A section that is missing or null in the configuration falls back to the
    built-in defaults, so its evidence is treated as required. Upper-bound
    checks (pairwise loss rate, human critical failures) fail when the
    measured value is absent instead of assuming zero.
    """
    cfg = thresholds or load_model_quality_thresholds() or {}
    errors: list[str] = []
    warnings: list[str] = []
    checks: dict[str, Any] = {}

    _check_paired_eval(_mapping(cfg, "paired_eval"), paired_eval, checks, errors)
    _check_training_budget(_mapping(cfg, "training_budget"), training_plan, checks, errors, warnings)
    _check_corpus_coverage(_mapping(cfg, "corpus_coverage"), dataset_manifest, checks, errors)
    _check_model_judge(_mapping(cfg, "strong_scoring"), model_judge_report, checks, errors)
    _check_human_review(_mapping(cfg, "human_spot_check"), human_review_report, checks, errors)

    passed = not errors
    return {
        "ok": passed,
        "eligible_for_promotion": passed,
        "quality_signal": "measured_model_quality" if passed else "blocked_model_quality",
        "errors": errors,
        "warnings": warnings,
        "checks": checks,
        "thresholds": cfg,
    }

Xet Storage Details

Size:
12.4 kB
·
Xet hash:
0b86945bda0a1ae72bfd205c2bf3c51d6031adc8acf2a4679f47d0b972942efc

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.