Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /evaluation /raw_external_gate.py

bbkdevops

about 1 month ago

download

raw

7.28 kB

	"""Raw model and external official gate for TinyMind frontier claims.

	Protocol wrappers can make the deployed system safer, but frontier claims need
	two stronger forms of evidence:

	1. raw/local model measurements that meet the target without imported protocol
	scores;
	2. dated external/official provider or leaderboard results from independent
	sources.
	"""

	from __future__ import annotations

	from datetime import datetime, timezone
	import json
	from pathlib import Path

	from evaluation.frontier_parity import GPT55_PRO_TARGET


	# External sources that must each contribute at least one valid official
	# result before the external half of the gate can pass. Names are compared
	# against the lowercased ``source`` field of each external result row.
	REQUIRED_EXTERNAL_SOURCES = {"huggingface", "lmarena", "artificial_analysis"}


	def _load(path: str \| Path \| None) -> dict:
	if not path:
	return {}
	p = Path(path)
	return json.loads(p.read_text(encoding="utf-8")) if p.exists() else {}


	def _raw_scores(world_report: dict) -> dict[str, float]:
	    """Collect per-axis raw scores from a world-class eval report.

	    Rows whose ``scope`` mentions ``protocol``, ``import`` or ``wrapper``
	    (in any letter case) are skipped so that only raw measurements remain.
	    Rows that are not dicts, that lack ``axis``/``score``, or whose score
	    cannot be converted to ``float`` are ignored. When the same axis appears
	    more than once, the last usable row wins.

	    Args:
	        world_report: Parsed report; its ``metrics`` entry is expected to be
	            a list of row dicts.

	    Returns:
	        Mapping of axis name to raw score. Empty when ``metrics`` is missing
	        or is not a list/tuple.
	    """
	    metrics = world_report.get("metrics")
	    if not isinstance(metrics, (list, tuple)):
	        return {}
	    excluded_markers = ("protocol", "import", "wrapper")
	    scores: dict[str, float] = {}
	    for row in metrics:
	        if not isinstance(row, dict):
	            continue
	        # Lowercase so "Protocol" / "WRAPPER" scopes cannot slip through.
	        scope = str(row.get("scope", "")).lower()
	        if any(marker in scope for marker in excluded_markers):
	            continue
	        try:
	            scores[str(row["axis"])] = float(row["score"])
	        except (KeyError, TypeError, ValueError):
	            continue
	    return scores


	def _external_rows(payload: dict) -> list[dict]:
	    """Return the list of external result rows stored in ``payload``.

	    The ``external_results`` key takes precedence; ``results`` is consulted
	    only when ``external_results`` is absent. Anything that is not a list
	    yields an empty list.
	    """
	    if "external_results" in payload:
	        candidate = payload["external_results"]
	    else:
	        candidate = payload.get("results", [])
	    if isinstance(candidate, list):
	        return candidate
	    return []


	def _valid_external(row: dict) -> bool:
	required = ["source", "model", "source_url", "as_of", "scores", "official"]
	if not all(row.get(key) not in (None, "") for key in required):
	return False
	if row.get("official") is not True:
	return False
	if not str(row["source_url"]).startswith(("https://", "http://")):
	return False
	scores = row.get("scores", {})
	if not isinstance(scores, dict):
	return False
	return any(axis in scores for axis in GPT55_PRO_TARGET["target_axes"])


	def build_raw_external_gate(
	    out_dir: str | Path,
	    raw_world_report: str | Path = "reports/world_class_eval/world_class_eval_report.json",
	    external_results: str | Path | None = None,
	) -> dict:
	    """Evaluate the raw and external gates and write the gate reports.

	    The raw gate passes when every target axis has a raw score at or above
	    its target. The external gate passes when every required source has at
	    least one valid official row and, for every target axis, the lowest
	    score reported by any valid row meets the target.

	    Args:
	        out_dir: Directory that receives the outputs; created if missing.
	        raw_world_report: Path of the world-class eval report JSON.
	        external_results: Optional path of the external results JSON.

	    Returns:
	        The report dict, including the paths of the files written.

	    Side effects:
	        Writes ``raw_external_gate_report.json``,
	        ``raw_external_gate_report.md`` and
	        ``external_results_template.json`` into ``out_dir``.
	    """
	    world = _load(raw_world_report)
	    external_payload = _load(external_results)
	    raw = _raw_scores(world)
	    target_axes = GPT55_PRO_TARGET["target_axes"]

	    raw_rows = []
	    for axis, target in target_axes.items():
	        # An axis without a raw measurement is scored 0.0 and so fails.
	        score = raw.get(axis, 0.0)
	        raw_rows.append(
	            {
	                "axis": axis,
	                "raw_score": score,
	                "target": target,
	                "passed": score >= target,
	                "gap": target - score,
	            }
	        )

	    ext_rows = _external_rows(external_payload)
	    valid_external = [row for row in ext_rows if isinstance(row, dict) and _valid_external(row)]
	    source_names = {str(row.get("source", "")).lower() for row in valid_external}
	    # Iterate in sorted order: set iteration order of strings varies between
	    # interpreter runs, which would reorder the Markdown coverage list.
	    source_coverage = {name: name in source_names for name in sorted(REQUIRED_EXTERNAL_SOURCES)}

	    external_axis_pass = {}
	    for axis, target in target_axes.items():
	        axis_scores = []
	        for row in valid_external:
	            value = row.get("scores", {}).get(axis)
	            if value is None:
	                continue
	            try:
	                axis_scores.append(float(value))
	            except (TypeError, ValueError):
	                # Non-numeric entries are ignored rather than failing the run.
	                pass
	        # Every reporting source must meet the target, hence ``min``.
	        external_axis_pass[axis] = bool(axis_scores) and min(axis_scores) >= target

	    raw_complete = all(row["passed"] for row in raw_rows)
	    external_complete = all(source_coverage.values()) and all(external_axis_pass.values())
	    report = {
	        "schema_version": "tinymind-raw-external-gate-v1",
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "raw_world_report": str(raw_world_report),
	        "external_results": str(external_results) if external_results else None,
	        "raw_rows": raw_rows,
	        "external": {
	            "required_sources": sorted(REQUIRED_EXTERNAL_SOURCES),
	            "source_coverage": source_coverage,
	            "valid_result_count": len(valid_external),
	            "axis_pass": external_axis_pass,
	            "template_path": None,
	        },
	        "claim_gate": {
	            "raw_gate_passed": raw_complete,
	            "external_gate_passed": external_complete,
	            "raw_external_gate_complete": raw_complete and external_complete,
	            "can_claim_frontier_or_beyond": raw_complete and external_complete,
	            "reason": "Requires raw model target scores plus independent official external results from HF, LMArena, and Artificial Analysis.",
	        },
	    }

	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    json_path = out / "raw_external_gate_report.json"
	    md_path = out / "raw_external_gate_report.md"
	    template_path = out / "external_results_template.json"
	    # Record the output locations before serialising so they land in the JSON.
	    report["json_path"] = str(json_path)
	    report["markdown_path"] = str(md_path)
	    report["external"]["template_path"] = str(template_path)
	    template_path.write_text(json.dumps(_template(), ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	    md_path.write_text(_markdown(report), encoding="utf-8")
	    return report


	def _template() -> dict:
	    """Return a fill-in skeleton for the external official results file.

	    The skeleton holds one placeholder entry per external source, each with
	    every target axis score set to ``None``.
	    """
	    placeholders = (
	        ("huggingface", "tinymind-model-id", "https://huggingface.co/..."),
	        ("lmarena", "tinymind-public-endpoint-name", "https://lmarena.ai/..."),
	        ("artificial_analysis", "tinymind-provider-listing", "https://artificialanalysis.ai/..."),
	    )
	    entries = []
	    for source, model, url in placeholders:
	        entries.append(
	            {
	                "source": source,
	                "model": model,
	                "source_url": url,
	                "as_of": "YYYY-MM-DD",
	                "official": True,
	                # Fresh dict per entry; every axis starts out unset.
	                "scores": dict.fromkeys(GPT55_PRO_TARGET["target_axes"]),
	            }
	        )
	    return {
	        "schema_version": "tinymind-external-official-results-v1",
	        "external_results": entries,
	    }


	def _markdown(report: dict) -> str:
	    """Render the gate report as a Markdown document.

	    Args:
	        report: Gate report carrying ``claim_gate``, ``raw_rows`` and
	            ``external["source_coverage"]``.

	    Returns:
	        Markdown text ending in a newline: the four gate verdicts, a table of
	        raw rows, and the external source coverage listed in sorted order so
	        the output does not depend on how the mapping was built.
	    """
	    gate = report["claim_gate"]
	    lines = [
	        "# TinyMind Raw/External Gate",
	        "",
	        f"- Raw gate passed: {gate['raw_gate_passed']}",
	        f"- External gate passed: {gate['external_gate_passed']}",
	        f"- Raw/external complete: {gate['raw_external_gate_complete']}",
	        f"- Can claim frontier or beyond: {gate['can_claim_frontier_or_beyond']}",
	        "",
	        "## Raw Rows",
	        "",
	        # Plain pipes: backslash-escaped pipes are not table delimiters.
	        "| Axis | Raw score | Target | Gap | Passed |",
	        "|---|---:|---:|---:|---|",
	    ]
	    for row in report["raw_rows"]:
	        lines.append(
	            f"| {row['axis']} | {row['raw_score']:.2f} | {row['target']:.2f} "
	            f"| {row['gap']:.2f} | {row['passed']} |"
	        )
	    lines.extend(["", "## External Source Coverage", ""])
	    for source, passed in sorted(report["external"]["source_coverage"].items()):
	        lines.append(f"- {source}: {passed}")
	    return "\n".join(lines) + "\n"

Xet Storage Details

Size:: 7.28 kB
Xet hash:: e815618b54cb1e519acd9c855cac51a764bc0eb1e25ec2e6cf64247ec7315110

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.