bbkdevops's picture
download
raw
7.28 kB
"""Raw model and external official gate for TinyMind frontier claims.
Protocol wrappers can make the deployed system safer, but frontier claims need
two stronger forms of evidence:
1. raw/local model measurements that meet the target without imported protocol
scores;
2. dated external/official provider or leaderboard results from independent
sources.
"""
from __future__ import annotations
from datetime import datetime, timezone
import json
from pathlib import Path
from evaluation.frontier_parity import GPT55_PRO_TARGET
# Source names (lower-case) that must each be represented by at least one valid
# external result before the external gate can pass.
REQUIRED_EXTERNAL_SOURCES = {"huggingface", "lmarena", "artificial_analysis"}
def _load(path: str | Path | None) -> dict:
    """Read a JSON object from *path*, or return ``{}`` when nothing usable exists.

    An empty or ``None`` path, a path that is not an existing regular file, and
    a JSON document whose top level is not an object all yield ``{}``, so
    callers can always call ``.get`` on the result.

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON. This is
            deliberately not swallowed, so a corrupt report is not mistaken
            for a missing one.
    """
    if not path:
        return {}
    p = Path(path)
    # is_file() rather than exists(): a directory path is treated as "missing"
    # instead of raising from read_text().
    if not p.is_file():
        return {}
    data = json.loads(p.read_text(encoding="utf-8"))
    # Callers treat the payload as a mapping; a top-level list or scalar is unusable.
    return data if isinstance(data, dict) else {}
def _raw_scores(world_report: dict) -> dict[str, float]:
    """Collect per-axis raw scores from a world-class eval report.

    Rows whose ``scope`` mentions "protocol", "import" or "wrapper" are
    excluded. The match is case-insensitive, so a scope such as
    ``"Protocol_Wrapper"`` cannot slip through as a raw measurement.

    Rows that are not mappings, that lack ``axis`` or ``score``, or whose score
    cannot be converted to ``float`` are skipped. If an axis appears more than
    once, the last usable row wins.

    Args:
        world_report: Parsed report; its ``metrics`` entry is expected to be a
            list of row mappings. Any other shape yields an empty result.

    Returns:
        Mapping of axis name to raw score.
    """
    metrics = world_report.get("metrics")
    if not isinstance(metrics, list):
        return {}
    excluded_markers = ("protocol", "import", "wrapper")
    scores: dict[str, float] = {}
    for row in metrics:
        if not isinstance(row, dict):
            continue
        scope = str(row.get("scope", "")).lower()
        if any(marker in scope for marker in excluded_markers):
            continue
        try:
            scores[str(row["axis"])] = float(row["score"])
        except (KeyError, TypeError, ValueError):
            # Incomplete or non-numeric rows carry no usable measurement.
            continue
    return scores
def _external_rows(payload: dict) -> list[dict]:
    """Return the list of external result rows stored in *payload*.

    The ``external_results`` key takes precedence whenever it is present;
    ``results`` is consulted only when ``external_results`` is absent. A
    selected value that is not a list yields an empty list.
    """
    if "external_results" in payload:
        candidate = payload["external_results"]
    else:
        candidate = payload.get("results", [])
    if isinstance(candidate, list):
        return candidate
    return []
def _valid_external(row: dict) -> bool:
    """Return True when *row* is a usable, dated, official external result.

    A row is accepted only if all of the following hold:

    * ``source``, ``model``, ``source_url``, ``as_of``, ``scores`` and
      ``official`` are present and neither ``None`` nor ``""``;
    * ``official`` is exactly ``True``;
    * ``source_url`` starts with ``https://`` or ``http://``;
    * ``as_of`` begins with a real ``YYYY-MM-DD`` calendar date;
    * ``scores`` is a mapping holding a numeric value for at least one axis in
      ``GPT55_PRO_TARGET["target_axes"]``.

    The last two checks keep an unfilled template entry (placeholder date,
    ``None`` scores) from being counted as a valid result or as coverage for
    its source.
    """
    required = ("source", "model", "source_url", "as_of", "scores", "official")
    if any(row.get(key) in (None, "") for key in required):
        return False
    if row.get("official") is not True:
        return False
    if not str(row["source_url"]).startswith(("https://", "http://")):
        return False
    # Only the leading date part is checked, so full ISO timestamps are accepted too.
    try:
        datetime.strptime(str(row["as_of"])[:10], "%Y-%m-%d")
    except ValueError:
        return False
    scores = row["scores"]
    if not isinstance(scores, dict):
        return False
    # Key presence is not enough: the value must be an actual number.
    for axis in GPT55_PRO_TARGET["target_axes"]:
        value = scores.get(axis)
        if value is None:
            continue
        try:
            float(value)
        except (TypeError, ValueError):
            continue
        return True
    return False
def build_raw_external_gate(
    out_dir: str | Path,
    raw_world_report: str | Path = "reports/world_class_eval/world_class_eval_report.json",
    external_results: str | Path | None = None,
) -> dict:
    """Evaluate the raw-model and external-official gates and write the reports.

    The raw gate passes when every axis in ``GPT55_PRO_TARGET["target_axes"]``
    has a raw score at or above its target. The external gate passes when every
    name in ``REQUIRED_EXTERNAL_SOURCES`` has at least one valid external row
    and, for every target axis, at least one valid row reports that axis and
    every reported value meets the target.

    Args:
        out_dir: Directory (created if needed) that receives
            ``raw_external_gate_report.json``, ``raw_external_gate_report.md``
            and ``external_results_template.json``.
        raw_world_report: Path to the world-class eval report JSON.
        external_results: Optional path to the external results JSON.

    Returns:
        The report dictionary, including the paths of the files written.
    """
    world = _load(raw_world_report)
    external_payload = _load(external_results)
    raw = _raw_scores(world)
    target_axes = GPT55_PRO_TARGET["target_axes"]

    # A missing raw measurement counts as 0.0, so the axis fails instead of
    # being silently skipped.
    raw_rows = []
    for axis, target in target_axes.items():
        score = raw.get(axis, 0.0)
        raw_rows.append(
            {
                "axis": axis,
                "raw_score": score,
                "target": target,
                "passed": score >= target,
                "gap": target - score,
            }
        )

    ext_rows = _external_rows(external_payload)
    valid_external = [row for row in ext_rows if isinstance(row, dict) and _valid_external(row)]
    source_names = {str(row.get("source", "")).lower() for row in valid_external}
    source_coverage = {name: name in source_names for name in REQUIRED_EXTERNAL_SOURCES}

    external_axis_pass = {}
    for axis, target in target_axes.items():
        axis_scores = []
        for row in valid_external:
            value = row.get("scores", {}).get(axis)
            if value is None:
                continue
            try:
                axis_scores.append(float(value))
            except (TypeError, ValueError):
                continue
        # all() instead of min(): min() depends on element order when a score
        # is NaN (json accepts NaN), which could let a NaN row be ignored.
        # With all(), any NaN or below-target score fails the axis.
        external_axis_pass[axis] = bool(axis_scores) and all(
            score >= target for score in axis_scores
        )

    raw_complete = all(row["passed"] for row in raw_rows)
    external_complete = all(source_coverage.values()) and all(external_axis_pass.values())
    report = {
        "schema_version": "tinymind-raw-external-gate-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "raw_world_report": str(raw_world_report),
        "external_results": str(external_results) if external_results else None,
        "raw_rows": raw_rows,
        "external": {
            "required_sources": sorted(REQUIRED_EXTERNAL_SOURCES),
            "source_coverage": source_coverage,
            "valid_result_count": len(valid_external),
            "axis_pass": external_axis_pass,
            "template_path": None,
        },
        "claim_gate": {
            "raw_gate_passed": raw_complete,
            "external_gate_passed": external_complete,
            "raw_external_gate_complete": raw_complete and external_complete,
            "can_claim_frontier_or_beyond": raw_complete and external_complete,
            "reason": "Requires raw model target scores plus independent official external results from HF, LMArena, and Artificial Analysis.",
        },
    }

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    json_path = out / "raw_external_gate_report.json"
    md_path = out / "raw_external_gate_report.md"
    template_path = out / "external_results_template.json"
    # Record the output locations before serialising so the JSON report is
    # self-describing.
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    report["external"]["template_path"] = str(template_path)
    template_path.write_text(
        json.dumps(_template(), ensure_ascii=False, indent=2, sort_keys=True),
        encoding="utf-8",
    )
    json_path.write_text(
        json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True),
        encoding="utf-8",
    )
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
def _template() -> dict:
    """Build the fill-in template for external official results.

    The template has one entry per external source. Every entry carries a
    placeholder date, ``official`` set to ``True``, and its own ``scores``
    mapping with each axis of ``GPT55_PRO_TARGET["target_axes"]`` set to
    ``None``.
    """
    # (source, placeholder model name, placeholder URL) for each listing.
    listings = (
        ("huggingface", "tinymind-model-id", "https://huggingface.co/..."),
        ("lmarena", "tinymind-public-endpoint-name", "https://lmarena.ai/..."),
        ("artificial_analysis", "tinymind-provider-listing", "https://artificialanalysis.ai/..."),
    )
    entries = [
        {
            "source": source_name,
            "model": model_name,
            "source_url": url,
            "as_of": "YYYY-MM-DD",
            "official": True,
            # A fresh mapping per entry so entries never share a scores dict.
            "scores": dict.fromkeys(GPT55_PRO_TARGET["target_axes"]),
        }
        for source_name, model_name, url in listings
    ]
    return {
        "schema_version": "tinymind-external-official-results-v1",
        "external_results": entries,
    }
def _markdown(report: dict) -> str:
    """Render the gate report as a Markdown summary.

    The output has three parts: the four claim-gate flags, a table of raw rows
    (axis, raw score, target, gap, passed) with numbers shown to two decimals,
    and a bullet list of external source coverage. It ends with a newline.
    """
    gate = report["claim_gate"]
    heading = [
        "# TinyMind Raw/External Gate",
        "",
        f"- Raw gate passed: {gate['raw_gate_passed']}",
        f"- External gate passed: {gate['external_gate_passed']}",
        f"- Raw/external complete: {gate['raw_external_gate_complete']}",
        f"- Can claim frontier or beyond: {gate['can_claim_frontier_or_beyond']}",
        "",
        "## Raw Rows",
        "",
        "| Axis | Raw score | Target | Gap | Passed |",
        "|---|---:|---:|---:|---|",
    ]
    table_rows = [
        f"| {entry['axis']} | {entry['raw_score']:.2f} | {entry['target']:.2f} | {entry['gap']:.2f} | {entry['passed']} |"
        for entry in report["raw_rows"]
    ]
    coverage_heading = ["", "## External Source Coverage", ""]
    coverage_items = [
        f"- {name}: {covered}"
        for name, covered in report["external"]["source_coverage"].items()
    ]
    document = [*heading, *table_rows, *coverage_heading, *coverage_items]
    return "\n".join(document) + "\n"

Xet Storage Details

Size:
7.28 kB
·
Xet hash:
e815618b54cb1e519acd9c855cac51a764bc0eb1e25ec2e6cf64247ec7315110

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.