Buckets:
| from __future__ import annotations | |
| from pathlib import Path | |
| from typing import Any | |
| from n21.config import load_structured | |
| from n21.settings import CONFIG_ROOT | |
| DEFAULT_CONFIG = CONFIG_ROOT / "thresholds" / "model_quality.yaml" | |
def load_model_quality_thresholds(path: Path | None = None) -> dict[str, Any]:
    """Load the model-quality threshold configuration.

    Args:
        path: Location of the threshold file. When falsy (``None`` by
            default) ``DEFAULT_CONFIG`` is read instead.

    Returns:
        Whatever ``load_structured`` produces for the chosen file.
    """
    source = path if path else DEFAULT_CONFIG
    return load_structured(source)
| def _number(value: Any, default: float = 0.0) -> float: | |
| try: | |
| return float(value) | |
| except (TypeError, ValueError): | |
| return default | |
| def _bool(value: Any) -> bool: | |
| return bool(value) is True | |
| def _add(checks: dict[str, Any], errors: list[str], name: str, ok: bool, detail: str) -> None: | |
| checks[name] = {"ok": ok, "detail": detail} | |
| if not ok: | |
| errors.append(f"{name}: {detail}") | |
def _mapping_at(container: dict[str, Any] | None, key: str) -> dict[str, Any]:
    """Return ``container[key]`` when it is a dict, otherwise an empty dict.

    A section that is absent, ``null`` or not a mapping is treated as empty so
    the gate records failed checks instead of raising ``AttributeError``.
    """
    value = (container or {}).get(key)
    return value if isinstance(value, dict) else {}


def _as_count(value: Any, default: int = 0) -> int:
    """Coerce *value* to ``int``; return *default* if it is missing, unparsable or non-finite."""
    try:
        return int(_number(value, default))
    except (TypeError, ValueError, OverflowError):
        # int(nan) raises ValueError and int(inf) raises OverflowError.
        return default


def _add_min_rate(
    checks: dict[str, Any],
    errors: list[str],
    name: str,
    observed: float,
    section: dict[str, Any],
    key: str,
    default: float,
) -> None:
    """Record a check that float *observed* is at least the ``section[key]`` threshold."""
    _add(
        checks,
        errors,
        name,
        observed >= _number(section.get(key), default),
        f"{observed:.4f} >= {section.get(key, default)}",
    )


def _add_min_count(
    checks: dict[str, Any],
    errors: list[str],
    name: str,
    observed: int,
    section: dict[str, Any],
    key: str,
    default: int,
) -> None:
    """Record a check that integer *observed* is at least the ``section[key]`` threshold."""
    _add(
        checks,
        errors,
        name,
        observed >= _as_count(section.get(key), default),
        f"{observed} >= {section.get(key, default)}",
    )


def _gate_paired_eval(
    checks: dict[str, Any],
    errors: list[str],
    report: dict[str, Any] | None,
    section: dict[str, Any],
) -> None:
    """Record the paired baseline-vs-candidate evaluation checks."""
    if report is None:
        if section.get("required", True):
            _add(checks, errors, "paired_eval_present", False, "missing eval/paired_eval_report.json")
        return

    baseline = _mapping_at(report, "baseline")
    candidate = _mapping_at(report, "candidate")
    improvement = _mapping_at(report, "improvement")

    sample_count = _as_count(report.get("sample_count") or candidate.get("sample_count"))
    candidate_aggregate = _number(candidate.get("aggregate"))
    baseline_aggregate = _number(baseline.get("aggregate"))
    candidate_critical = _number(candidate.get("critical_pass_rate"))
    aggregate_delta = _number(improvement.get("aggregate_abs"))
    # Fail closed: a missing regression delta or loss rate must not be read as
    # "no regression" / "no losses", so the fallbacks sit on the failing side.
    critical_delta = _number(improvement.get("critical_pass_rate_abs"), float("-inf"))
    loss_rate = _number(improvement.get("pairwise_loss_rate"), float("inf"))
    win_rate = _number(improvement.get("pairwise_win_rate"))
    aggregate_pct = improvement.get("aggregate_pct")

    _add_min_count(checks, errors, "paired_eval_sample_count", sample_count, section, "min_samples", 120)
    _add_min_rate(
        checks, errors, "candidate_aggregate_absolute", candidate_aggregate, section, "min_candidate_aggregate", 0.60
    )
    _add(
        checks,
        errors,
        "candidate_beats_baseline",
        candidate_aggregate > baseline_aggregate,
        f"{candidate_aggregate:.4f} > {baseline_aggregate:.4f}",
    )
    _add_min_rate(checks, errors, "aggregate_delta_absolute", aggregate_delta, section, "min_aggregate_delta_abs", 0.05)
    _add_min_rate(
        checks, errors, "critical_pass_absolute", candidate_critical, section, "min_candidate_critical_pass_rate", 0.70
    )
    _add_min_rate(
        checks, errors, "critical_pass_not_regressed", critical_delta, section, "min_critical_pass_delta_abs", 0.0
    )
    _add(
        checks,
        errors,
        "pairwise_loss_rate",
        loss_rate <= _number(section.get("max_pairwise_loss_rate"), 0.02),
        f"{loss_rate:.4f} <= {section.get('max_pairwise_loss_rate', 0.02)}",
    )
    _add_min_rate(checks, errors, "pairwise_win_rate", win_rate, section, "min_pairwise_win_rate", 0.55)

    # Only recorded when it fails: a zero baseline with no reported percentage
    # makes the relative improvement undefined.
    if baseline_aggregate == 0 and aggregate_pct is None and not section.get("allow_zero_baseline_percent", False):
        _add(
            checks,
            errors,
            "nonzero_baseline_for_relative_proof",
            False,
            "baseline aggregate is zero; relative improvement is undefined",
        )


def _gate_training_budget(
    checks: dict[str, Any],
    errors: list[str],
    warnings: list[str],
    plan: dict[str, Any] | None,
    section: dict[str, Any],
) -> None:
    """Record the training-budget checks and forward trainer readiness warnings."""
    if plan is None:
        if section.get("required", True):
            _add(checks, errors, "training_plan_present", False, "missing remote_artifacts/training_plan.json")
        return

    hyperparameters = _mapping_at(plan, "hyperparameters")
    readiness = _mapping_at(plan, "readiness")
    train_records = _as_count(plan.get("train_records"))
    valid_records = _as_count(plan.get("valid_records"))
    max_steps = _as_count(hyperparameters.get("max_steps"))

    _add_min_count(checks, errors, "training_records_minimum", train_records, section, "min_train_records", 100)
    _add_min_count(checks, errors, "validation_records_minimum", valid_records, section, "min_valid_records", 10)
    _add_min_count(checks, errors, "training_steps_minimum", max_steps, section, "min_max_steps", 300)
    if section.get("require_production_candidate", True):
        _add(
            checks,
            errors,
            "trainer_readiness_candidate",
            readiness.get("production_candidate") is True,
            f"production_candidate={readiness.get('production_candidate')}",
        )
    # ``or []`` tolerates an explicit null warnings entry.
    warnings.extend(f"trainer readiness warning: {warning}" for warning in readiness.get("warnings") or [])


def _gate_corpus_coverage(
    checks: dict[str, Any],
    errors: list[str],
    manifest: dict[str, Any] | None,
    section: dict[str, Any],
) -> None:
    """Record the corpus retention and train/valid/test split checks."""
    if manifest is None:
        if section.get("required", True):
            _add(checks, errors, "dataset_manifest_present", False, "missing dataset_snapshot/dataset_manifest.json")
        return

    quality = _mapping_at(manifest, "quality")
    split_counts = _mapping_at(manifest, "split_counts")
    source_records = _as_count(quality.get("record_count"))
    train_count = _as_count(split_counts.get("train"))
    valid_count = _as_count(split_counts.get("valid"))
    test_count = _as_count(split_counts.get("test"))
    split_total = train_count + valid_count + test_count

    # Ratios fall back to 0.0 when the denominator is zero, which fails the check.
    retention = (split_total / source_records) if source_records else 0.0
    train_ratio = (train_count / split_total) if split_total else 0.0
    valid_ratio = (valid_count / split_total) if split_total else 0.0
    test_ratio = (test_count / split_total) if split_total else 0.0

    _add_min_rate(checks, errors, "corpus_record_retention", retention, section, "min_total_record_retention", 0.95)
    if section.get("require_train_valid_test", True):
        _add(
            checks,
            errors,
            "train_valid_test_present",
            train_count > 0 and valid_count > 0 and test_count > 0,
            f"{split_counts}",
        )
    _add_min_rate(checks, errors, "train_split_ratio", train_ratio, section, "min_train_split_ratio", 0.70)
    _add_min_rate(checks, errors, "valid_split_ratio", valid_ratio, section, "min_valid_split_ratio", 0.05)
    _add_min_rate(checks, errors, "test_split_ratio", test_ratio, section, "min_test_split_ratio", 0.05)


def _gate_model_judge(
    checks: dict[str, Any],
    errors: list[str],
    report: dict[str, Any] | None,
    section: dict[str, Any],
) -> None:
    """Record the model-as-judge rubric checks."""
    if report is None:
        if section.get("require_model_as_judge", True):
            _add(checks, errors, "model_as_judge_present", False, "missing eval/model_judge_report.json")
        return

    sample_count = _as_count(report.get("sample_count"))
    mean_score = _number(report.get("mean_score"))
    critical_rate = _number(report.get("critical_pass_rate"))
    rubric = report.get("rubric_version")

    _add(
        checks,
        errors,
        "model_judge_rubric_version",
        rubric == section.get("required_rubric_version"),
        f"{rubric} == {section.get('required_rubric_version')}",
    )
    _add_min_count(checks, errors, "model_judge_sample_count", sample_count, section, "min_judged_samples", 30)
    _add_min_rate(checks, errors, "model_judge_mean_score", mean_score, section, "min_mean_score", 0.80)
    _add_min_rate(
        checks, errors, "model_judge_critical_pass_rate", critical_rate, section, "min_critical_pass_rate", 0.90
    )


def _gate_human_review(
    checks: dict[str, Any],
    errors: list[str],
    report: dict[str, Any] | None,
    section: dict[str, Any],
) -> None:
    """Record the human spot-check checks."""
    if report is None:
        if section.get("required", True):
            _add(checks, errors, "human_spot_check_present", False, "missing eval/human_spot_check_report.json")
        return

    reviewed = _as_count(report.get("sample_count") or report.get("reviewed_samples"))
    # -1 marks "not reported": a missing or invalid failure count must not be
    # read as zero failures.
    critical_failures = _as_count(report.get("critical_failures"), -1)
    failures_reported = critical_failures >= 0
    approved = report.get("approved") is True or report.get("status") == "approved"

    _add_min_count(checks, errors, "human_review_sample_count", reviewed, section, "min_reviewed_samples", 10)
    _add(
        checks,
        errors,
        "human_review_critical_failures",
        failures_reported and critical_failures <= _as_count(section.get("max_critical_failures"), 0),
        (
            f"{critical_failures} <= {section.get('max_critical_failures', 0)}"
            if failures_reported
            else "critical_failures not reported"
        ),
    )
    if section.get("require_approval", True):
        _add(checks, errors, "human_review_approved", approved, f"approved={approved}")


def evaluate_model_quality_gate(
    *,
    paired_eval: dict[str, Any] | None = None,
    training_plan: dict[str, Any] | None = None,
    dataset_manifest: dict[str, Any] | None = None,
    model_judge_report: dict[str, Any] | None = None,
    human_review_report: dict[str, Any] | None = None,
    thresholds: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Evaluate production model-quality gates from measured evidence.

    This deliberately treats fixture/orchestration cycle evidence as irrelevant.
    A model can pass only with paired model-vs-model measurement, adequate
    training budget, corpus-retention evidence, model-as-judge rubric evidence,
    and human spot-check evidence.

    Args:
        paired_eval: Paired evaluation report (``baseline``, ``candidate`` and
            ``improvement`` sections are read).
        training_plan: Training plan (``hyperparameters``, ``readiness`` and
            record counts are read).
        dataset_manifest: Dataset manifest (``quality`` and ``split_counts``
            are read).
        model_judge_report: Model-as-judge report.
        human_review_report: Human spot-check report.
        thresholds: Threshold configuration; when falsy the configuration from
            ``load_model_quality_thresholds()`` is used.

    Returns:
        A dict with ``ok``, ``eligible_for_promotion``, ``quality_signal``,
        ``errors``, ``warnings``, ``checks`` (one ``{"ok", "detail"}`` entry per
        executed check) and the ``thresholds`` that were applied.

    A piece of evidence that is ``None`` fails its "present" check unless the
    matching threshold section marks it as not required. Upper-bounded metrics
    (pairwise loss rate, human critical failures) and the critical-pass delta
    fail when the report does not carry them.
    """
    cfg = thresholds or load_model_quality_thresholds()
    errors: list[str] = []
    warnings: list[str] = []
    checks: dict[str, Any] = {}

    # Order matters: it determines the order of entries in ``checks``/``errors``.
    _gate_paired_eval(checks, errors, paired_eval, _mapping_at(cfg, "paired_eval"))
    _gate_training_budget(checks, errors, warnings, training_plan, _mapping_at(cfg, "training_budget"))
    _gate_corpus_coverage(checks, errors, dataset_manifest, _mapping_at(cfg, "corpus_coverage"))
    _gate_model_judge(checks, errors, model_judge_report, _mapping_at(cfg, "strong_scoring"))
    _gate_human_review(checks, errors, human_review_report, _mapping_at(cfg, "human_spot_check"))

    passed = not errors
    return {
        "ok": passed,
        "eligible_for_promotion": passed,
        "quality_signal": "measured_model_quality" if passed else "blocked_model_quality",
        "errors": errors,
        "warnings": warnings,
        "checks": checks,
        "thresholds": cfg,
    }
Xet Storage Details
- Size:
- 12.4 kB
- Xet hash:
- 0b86945bda0a1ae72bfd205c2bf3c51d6031adc8acf2a4679f47d0b972942efc
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.