Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /evaluation /raw_external_gate.py
| """Raw model and external official gate for TinyMind frontier claims. | |
| Protocol wrappers can make the deployed system safer, but frontier claims need | |
| two stronger forms of evidence: | |
| 1. raw/local model measurements that meet the target without imported protocol | |
| scores; | |
| 2. dated external/official provider or leaderboard results from independent | |
| sources. | |
| """ | |
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import json | |
| from pathlib import Path | |
| from evaluation.frontier_parity import GPT55_PRO_TARGET | |
# Source names that must each contribute at least one valid external row.
# They are compared against the lower-cased ``source`` field of each row.
REQUIRED_EXTERNAL_SOURCES = {
    "huggingface",
    "lmarena",
    "artificial_analysis",
}
def _load(path: str | Path | None) -> dict:
    """Read a JSON report from *path* and return it as a dict.

    An empty dict is returned when *path* is falsy, when it does not point
    to a regular file (missing, or a directory), or when the decoded
    top-level JSON value is not an object. Callers can therefore always use
    ``.get`` on the result.

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON.
    """
    if not path:
        return {}
    p = Path(path)
    # is_file() rather than exists(): a directory would pass exists() and
    # then fail inside read_text().
    if not p.is_file():
        return {}
    payload = json.loads(p.read_text(encoding="utf-8"))
    # A top-level list/scalar is not a usable report; treat it as absent.
    return payload if isinstance(payload, dict) else {}
def _raw_scores(world_report: dict) -> dict[str, float]:
    """Collect raw per-axis scores from the ``metrics`` list of a report.

    Rows whose ``scope`` mentions "protocol", "import" or "wrapper" (in any
    letter case) are skipped. Rows that are not dicts, lack ``axis`` or
    ``score``, or whose score is not a finite number are skipped as well.
    When several rows share an axis, the last usable one wins.

    Args:
        world_report: decoded report; ``metrics`` is expected to be a list
            of row dicts.

    Returns:
        Mapping of axis name to finite float score. Empty when ``metrics``
        is missing or is not a list.
    """
    import math  # function-scope import keeps this block self-contained

    metrics = world_report.get("metrics")
    if not isinstance(metrics, list):
        return {}

    excluded_markers = ("protocol", "import", "wrapper")
    scores: dict[str, float] = {}
    for row in metrics:
        if not isinstance(row, dict):
            continue
        # Lower-case so "Protocol" / "WRAPPER" scopes cannot slip through.
        scope = str(row.get("scope", "")).lower()
        if any(marker in scope for marker in excluded_markers):
            continue
        try:
            axis = str(row["axis"])
            score = float(row["score"])
        except (KeyError, TypeError, ValueError, OverflowError):
            continue
        # float("inf") would satisfy any target and NaN is not a
        # measurement; neither may count as a raw score.
        if math.isfinite(score):
            scores[axis] = score
    return scores
def _external_rows(payload: dict) -> list[dict]:
    """Return the list of external result rows stored in *payload*.

    The ``external_results`` key takes precedence; ``results`` is consulted
    only when ``external_results`` is absent. Anything that is not a list
    yields an empty list.
    """
    if "external_results" in payload:
        candidate = payload["external_results"]
    else:
        candidate = payload.get("results", [])
    if isinstance(candidate, list):
        return candidate
    return []
def _valid_external(row: dict) -> bool:
    """Return True when *row* is a usable official external result.

    A row is accepted only if all of the following hold:

    * ``source``, ``model``, ``source_url``, ``as_of``, ``scores`` and
      ``official`` are present and neither ``None`` nor the empty string;
    * ``official`` is exactly ``True``;
    * ``source_url`` starts with ``https://`` or ``http://``;
    * ``scores`` is a dict holding a finite numeric value for at least one
      axis of ``GPT55_PRO_TARGET["target_axes"]``.

    The last condition checks values, not just keys. A row copied from the
    unfilled results template (every score ``None``) therefore does not
    count as a valid result or as source coverage.
    """
    import math  # function-scope import keeps this block self-contained

    required = ("source", "model", "source_url", "as_of", "scores", "official")
    if any(row.get(key) in (None, "") for key in required):
        return False
    if row["official"] is not True:
        return False
    if not str(row["source_url"]).startswith(("https://", "http://")):
        return False
    scores = row["scores"]
    if not isinstance(scores, dict):
        return False
    for axis in GPT55_PRO_TARGET["target_axes"]:
        try:
            value = float(scores[axis])
        except (KeyError, TypeError, ValueError, OverflowError):
            # Axis missing, or its value is a placeholder / non-numeric.
            continue
        if math.isfinite(value):
            return True
    return False
def build_raw_external_gate(
    out_dir: str | Path,
    raw_world_report: str | Path = "reports/world_class_eval/world_class_eval_report.json",
    external_results: str | Path | None = None,
) -> dict:
    """Evaluate the raw and external gates and write the report files.

    Raw gate: every axis in ``GPT55_PRO_TARGET["target_axes"]`` must have a
    raw score at or above its target. A missing or non-finite score counts
    as ``0.0``.

    External gate: every name in ``REQUIRED_EXTERNAL_SOURCES`` must appear
    as the ``source`` of at least one valid external row. In addition, for
    every target axis, the lowest finite score reported by the valid rows
    must be at or above the target.

    Args:
        out_dir: directory for the outputs; created if it does not exist.
        raw_world_report: path of the JSON report holding raw metrics.
        external_results: optional path of the JSON file holding external
            result rows.

    Returns:
        The report dict. It is also written to
        ``raw_external_gate_report.json`` and summarised in
        ``raw_external_gate_report.md``. An
        ``external_results_template.json`` is written next to them.
    """
    import math  # function-scope import so no file-level import change is needed

    world = _load(raw_world_report)
    external_payload = _load(external_results)
    raw = _raw_scores(world)
    target_axes = GPT55_PRO_TARGET["target_axes"]

    raw_rows = []
    for axis, target in target_axes.items():
        score = raw.get(axis, 0.0)
        if not math.isfinite(score):
            # inf would pass any target and NaN would serialise as invalid
            # JSON; treat both like a missing measurement.
            score = 0.0
        raw_rows.append(
            {"axis": axis, "raw_score": score, "target": target, "passed": score >= target, "gap": target - score}
        )

    ext_rows = _external_rows(external_payload)
    valid_external = [row for row in ext_rows if isinstance(row, dict) and _valid_external(row)]
    source_names = {str(row.get("source", "")).lower() for row in valid_external}
    # Iterate in sorted order: set iteration order varies between runs, and
    # the Markdown report lists sources in this dict's order.
    source_coverage = {name: name in source_names for name in sorted(REQUIRED_EXTERNAL_SOURCES)}

    external_axis_pass = {}
    for axis, target in target_axes.items():
        axis_scores = []
        for row in valid_external:
            value = row.get("scores", {}).get(axis)
            if value is None:
                continue
            try:
                number = float(value)
            except (TypeError, ValueError, OverflowError):
                continue
            # Skip NaN/inf: min() with NaN depends on element order, and
            # inf would trivially satisfy the target.
            if math.isfinite(number):
                axis_scores.append(number)
        # The weakest reported score decides; an axis nobody reports fails.
        external_axis_pass[axis] = bool(axis_scores) and min(axis_scores) >= target

    raw_complete = all(row["passed"] for row in raw_rows)
    external_complete = all(source_coverage.values()) and all(external_axis_pass.values())

    report = {
        "schema_version": "tinymind-raw-external-gate-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "raw_world_report": str(raw_world_report),
        "external_results": str(external_results) if external_results else None,
        "raw_rows": raw_rows,
        "external": {
            "required_sources": sorted(REQUIRED_EXTERNAL_SOURCES),
            "source_coverage": source_coverage,
            "valid_result_count": len(valid_external),
            "axis_pass": external_axis_pass,
            "template_path": None,
        },
        "claim_gate": {
            "raw_gate_passed": raw_complete,
            "external_gate_passed": external_complete,
            "raw_external_gate_complete": raw_complete and external_complete,
            "can_claim_frontier_or_beyond": raw_complete and external_complete,
            "reason": "Requires raw model target scores plus independent official external results from HF, LMArena, and Artificial Analysis.",
        },
    }

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    json_path = out / "raw_external_gate_report.json"
    md_path = out / "raw_external_gate_report.md"
    template_path = out / "external_results_template.json"

    # Record the output locations in the report before serialising it.
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    report["external"]["template_path"] = str(template_path)

    template_path.write_text(json.dumps(_template(), ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
def _template() -> dict:
    """Build the fill-in template for external official results.

    One placeholder entry is produced per external source, in the order
    huggingface, lmarena, artificial_analysis. Each entry carries its own
    ``scores`` dict with every target axis set to ``None``.
    """
    placeholders = (
        ("huggingface", "tinymind-model-id", "https://huggingface.co/..."),
        ("lmarena", "tinymind-public-endpoint-name", "https://lmarena.ai/..."),
        ("artificial_analysis", "tinymind-provider-listing", "https://artificialanalysis.ai/..."),
    )
    entries = [
        {
            "source": source,
            "model": model,
            "source_url": url,
            "as_of": "YYYY-MM-DD",
            "official": True,
            # A fresh dict per entry so the entries never share state.
            "scores": dict.fromkeys(GPT55_PRO_TARGET["target_axes"]),
        }
        for source, model, url in placeholders
    ]
    return {
        "schema_version": "tinymind-external-official-results-v1",
        "external_results": entries,
    }
| def _markdown(report: dict) -> str: | |
| lines = [ | |
| "# TinyMind Raw/External Gate", | |
| "", | |
| f"- Raw gate passed: {report['claim_gate']['raw_gate_passed']}", | |
| f"- External gate passed: {report['claim_gate']['external_gate_passed']}", | |
| f"- Raw/external complete: {report['claim_gate']['raw_external_gate_complete']}", | |
| f"- Can claim frontier or beyond: {report['claim_gate']['can_claim_frontier_or_beyond']}", | |
| "", | |
| "## Raw Rows", | |
| "", | |
| "| Axis | Raw score | Target | Gap | Passed |", | |
| "|---|---:|---:|---:|---|", | |
| ] | |
| for row in report["raw_rows"]: | |
| lines.append(f"| {row['axis']} | {row['raw_score']:.2f} | {row['target']:.2f} | {row['gap']:.2f} | {row['passed']} |") | |
| lines.extend(["", "## External Source Coverage", ""]) | |
| for source, passed in report["external"]["source_coverage"].items(): | |
| lines.append(f"- {source}: {passed}") | |
| return "\n".join(lines) + "\n" | |
Xet Storage Details
- Size:
- 7.28 kB
- Xet hash:
- e815618b54cb1e519acd9c855cac51a764bc0eb1e25ec2e6cf64247ec7315110
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.