Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /evaluation /world_class_eval.py

bbkdevops

about 1 month ago

download

raw

9.72 kB

	"""World-class multi-axis evaluation harness for TinyMind.

	This module does not invent leaderboard results. It creates a reproducible
	packet that can run local/public benchmarks now, import external/provider
	results when available, and compare TinyMind against world-model slots with a
	strict claim gate.
	"""

	from __future__ import annotations

	from datetime import datetime, timezone
	import csv
	import json
	from pathlib import Path
	from typing import Iterable


	# External evaluation venues referenced by the report. Every entry carries the
	# same five keys: a display name, a URL, a venue kind, a status string naming
	# what is still required, and the list of axes associated with that venue.
	WORLD_EVAL_TARGETS = [
	    {
	        "name": "Hugging Face Leaderboards/Evals",
	        "url": "https://huggingface.co/docs/leaderboards/index",
	        "kind": "public_hub_or_space_submission",
	        "status": "requires_hub_compatible_model_or_eval_space",
	        "axes": [
	            "MMLU-Pro",
	            "GPQA",
	            "IFEval",
	            "MATH",
	            "BBH",
	            "coding",
	        ],
	    },
	    {
	        "name": "LMArena",
	        "url": "https://lmarena.ai/",
	        "kind": "human_preference_arena",
	        "status": "requires_public_chat_endpoint_and_arena_acceptance",
	        "axes": [
	            "human preference",
	            "chat quality",
	            "style",
	            "instruction following",
	        ],
	    },
	    {
	        "name": "Artificial Analysis",
	        "url": "https://artificialanalysis.ai/evaluations",
	        "kind": "provider_benchmark_index",
	        "status": "requires_provider_endpoint_or_external_listing",
	        "axes": [
	            "intelligence index",
	            "speed",
	            "price",
	            "coding",
	            "math",
	            "reasoning",
	        ],
	    },
	]

	# Comparison slots embedded verbatim in the report. "params" is None for the
	# two slots whose source is marked as required from an external provider/hub;
	# the remaining slots carry an explicit parameter count.
	WORLD_MODEL_SLOTS = [
	    {
	        "model": "frontier_closed_top",
	        "class": "frontier",
	        "params": None,
	        "source": "external_provider_required",
	    },
	    {
	        "model": "top_open_large",
	        "class": "open_large",
	        "params": None,
	        "source": "hf_or_provider_required",
	    },
	    {
	        "model": "70B_class",
	        "class": "70B",
	        "params": 70_000_000_000,
	        "source": "size_reference",
	    },
	    {
	        "model": "7B_class",
	        "class": "7B",
	        "params": 7_000_000_000,
	        "source": "size_reference",
	    },
	    {
	        "model": "gpt2",
	        "class": "124M",
	        "params": 124_439_808,
	        "source": "local_or_hf_smoke",
	    },
	]


	def _load(path: str | Path | None) -> dict:
	    """Read a JSON report from disk, treating an absent report as empty.

	    Args:
	        path: Location of a UTF-8 JSON file, or a falsy value (``None``/``""``)
	            when no report applies.

	    Returns:
	        The decoded top-level JSON object, or ``{}`` when ``path`` is falsy or
	        does not refer to an existing regular file.

	    Raises:
	        json.JSONDecodeError: If the file exists but does not contain valid JSON.
	        ValueError: If the document's top level is not a JSON object.
	    """
	    if not path:
	        return {}
	    report_file = Path(path)
	    # is_file() rather than exists(): a directory at this path is "no report",
	    # not something read_text() should be asked to open.
	    if not report_file.is_file():
	        return {}
	    payload = json.loads(report_file.read_text(encoding="utf-8"))
	    # Callers use .get() on the result, so reject lists/scalars here with a
	    # message that names the offending file.
	    if not isinstance(payload, dict):
	        raise ValueError(
	            f"expected a JSON object in {report_file}, got {type(payload).__name__}"
	        )
	    return payload


	def _score(value: float | int | None) -> float:
	    """Normalise a raw metric value to a float within ``[0.0, 100.0]``.

	    Args:
	        value: Raw score; ``None`` means no value is available.

	    Returns:
	        ``0.0`` for ``None``, for NaN and for anything at or below zero;
	        ``100.0`` for anything at or above one hundred; otherwise
	        ``float(value)``.

	    Raises:
	        TypeError, ValueError: If ``value`` cannot be converted by ``float()``.
	    """
	    if value is None:
	        return 0.0
	    number = float(value)
	    if number >= 100.0:
	        return 100.0
	    if number > 0.0:
	        return number
	    # Reached for negatives and for NaN. NaN fails both comparisons above, so
	    # it is reported as 0.0; a max(min(...)) clamp would turn it into 100.0
	    # because min(100.0, nan) evaluates to 100.0.
	    return 0.0


	def _metric_rows(
	    knowledge_report: dict,
	    compact_report: dict,
	    coherence_report: dict,
	    memory_report: dict,
	    external_results: Iterable[dict],
	) -> list[dict]:
	    """Flatten the loaded reports into one list of metric rows.

	    Every row is a dict with the keys ``axis``, ``score``, ``source`` and
	    ``scope`` (in that order). Eight fixed axes are always emitted, followed by
	    one ``external_<name>`` row per entry of ``external_results``.

	    Args:
	        knowledge_report: Report read for ``full_cycle_gate`` and ``json_path``.
	        compact_report: Report read for ``dimensions`` and ``json_path``.
	        coherence_report: Report read for ``harmony_score`` and ``json_path``.
	        memory_report: Report read for ``measured_tokens``, ``passkey_recall``
	            and ``report_path``.
	        external_results: Imported result dicts; ``name`` (or ``model``),
	            ``score``, ``source`` and ``scope`` are read from each.

	    Returns:
	        The metric rows; numeric scores are passed through ``_score``.
	    """
	    # "or {}" (not a .get default) so an explicit JSON null is handled like a
	    # missing key instead of failing on the following .get() calls.
	    dimensions = compact_report.get("dimensions") or {}
	    compact_source = compact_report.get("json_path", "compact_intelligence")

	    # Prefer the official figure whenever it is present. An `a or b` fallback
	    # would discard a genuine official score of 0 in favour of the other key.
	    mmlu_pro = dimensions.get("official_mmlu_pro_smoke")
	    if mmlu_pro is None:
	        mmlu_pro = dimensions.get("knowledge_mmlu_pro_smoke")

	    # (axis, raw value, scope) for every axis sourced from the compact report.
	    compact_axes = [
	        ("knowledge_mmlu_pro", mmlu_pro, "local_public_harness"),
	        ("instruction_following", dimensions.get("instruction_following_smoke"), "local_smoke"),
	        ("translation_th_en", dimensions.get("translation_smoke"), "local_smoke"),
	        ("natural_answer_style", dimensions.get("natural_answer_style"), "local_quality_gate"),
	        ("bit_exactness", dimensions.get("bit_exactness"), "local_training_metric"),
	    ]
	    rows = [
	        {"axis": axis, "score": _score(raw), "source": compact_source, "scope": scope}
	        for axis, raw, scope in compact_axes
	    ]

	    rows.append(
	        {
	            "axis": "layer_coherence",
	            "score": _score(coherence_report.get("harmony_score")),
	            "source": coherence_report.get("json_path", "layer_coherence"),
	            "scope": "architecture_smoke",
	        }
	    )

	    # Pass/fail axis: full marks only when at least ten million tokens were
	    # measured and the passkey recall entry reports a truthy "passed".
	    passkey_recall = memory_report.get("passkey_recall") or {}
	    measured_tokens = int(memory_report.get("measured_tokens") or 0)
	    long_context_passed = measured_tokens >= 10_000_000 and bool(passkey_recall.get("passed"))
	    rows.append(
	        {
	            "axis": "long_context_exact_10m",
	            "score": 100.0 if long_context_passed else 0.0,
	            "source": memory_report.get("report_path", "extreme_memory"),
	            "scope": "exact_archive_recall",
	        }
	    )

	    # Pass/fail axis driven by the knowledge report's full-cycle gate.
	    full_gate = knowledge_report.get("full_cycle_gate") or {}
	    rows.append(
	        {
	            "axis": "pure_data_full_cycle",
	            "score": 100.0 if full_gate.get("passed") else 0.0,
	            "source": knowledge_report.get("json_path", "knowledge_full_cycle"),
	            "scope": "data_train_eval_gate",
	        }
	    )

	    # Imported results keep their own source/scope when they supply them.
	    rows.extend(
	        {
	            "axis": f"external_{entry.get('name', entry.get('model', 'unknown'))}",
	            "score": _score(entry.get("score")),
	            "source": entry.get("source", "imported_external"),
	            "scope": entry.get("scope", "external_import"),
	        }
	        for entry in (external_results or ())
	    )
	    return rows


	def build_world_class_eval(
	    out_dir: str | Path,
	    knowledge_report: str | Path | None = "reports/knowledge_full_cycle_pursuit_512/knowledge_full_cycle_report.json",
	    compact_report: str | Path | None = "reports/compact_intelligence/compact_intelligence_dossier.json",
	    coherence_report: str | Path | None = "reports/axiomweave_coherence/layer_coherence_report.json",
	    memory_report: str | Path | None = "reports/extreme_memory_10m/extreme_memory_report.json",
	    external_results: str | Path | None = None,
	) -> dict:
	    """Build the world-class evaluation report and write it to ``out_dir``.

	    The input reports are read with ``_load``, converted into metric rows by
	    ``_metric_rows``, summarised, and written as three files inside
	    ``out_dir`` (created if needed): ``world_class_eval_report.json``,
	    ``world_class_eval_report.md`` and ``world_class_eval_metrics.csv``.

	    Args:
	        out_dir: Directory that receives the three output files.
	        knowledge_report: Path of the knowledge full-cycle report JSON.
	        compact_report: Path of the compact-intelligence dossier JSON.
	        coherence_report: Path of the layer-coherence report JSON.
	        memory_report: Path of the extreme-memory report JSON.
	        external_results: Optional path of a JSON file whose ``results`` key
	            holds a list of imported external result dicts.

	    Returns:
	        The report dict, including the ``json_path``, ``markdown_path`` and
	        ``csv_path`` of the files that were written.
	    """
	    knowledge = _load(knowledge_report)
	    compact = _load(compact_report)
	    coherence = _load(coherence_report)
	    memory = _load(memory_report)
	    # "or []" so a file carrying `"results": null` counts as no imported
	    # results rather than failing when the rows are iterated.
	    external = _load(external_results).get("results") or []

	    rows = _metric_rows(knowledge, compact, coherence, memory, external)
	    measured_axes = [row for row in rows if row["score"] > 0]
	    weak_axes = [row["axis"] for row in rows if row["score"] < 50.0]
	    balanced_score = sum(row["score"] for row in rows) / max(len(rows), 1)

	    # The compact report must carry a claim gate that is explicitly True;
	    # a missing, empty or non-dict gate counts as blocked.
	    compact_gate = compact.get("claim_gate")
	    compact_claim_allowed = (
	        isinstance(compact_gate, dict)
	        and compact_gate.get("can_claim_smarter_than_larger_models") is True
	    )

	    hard_blockers = []
	    if weak_axes:
	        hard_blockers.append("some_axes_below_50")
	    if not external:
	        hard_blockers.append("no_imported_external_top_model_results")
	    if not compact_claim_allowed:
	        hard_blockers.append("compact_intelligence_claim_gate_blocked")

	    report = {
	        "schema_version": "tinymind-world-class-eval-v1",
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "purpose": "Measure broad intelligence readiness against world-class model standards without fabricating ranks.",
	        "targets": WORLD_EVAL_TARGETS,
	        "world_model_slots": WORLD_MODEL_SLOTS,
	        "metrics": rows,
	        "summary": {
	            "balanced_score": balanced_score,
	            "measured_axis_count": len(measured_axes),
	            "axis_count": len(rows),
	            "weak_axes": weak_axes,
	            "hard_blockers": hard_blockers,
	        },
	        "claim_gate": {
	            # Requires at least seven axes with a non-zero score.
	            "ready_for_world_top_comparison": len(measured_axes) >= 7,
	            # Hard-coded: this harness never grants a superiority claim.
	            "can_claim_better_than_top_world_models": False,
	            "can_claim_production_ready_eval_packet": bool(rows),
	            "reason": "Official/provider ranks and strong scores across every axis are required before superiority claims.",
	        },
	        "submission_next_steps": [
	            "publish HF-compatible model/repo or eval Space",
	            "expose OpenAI-compatible public endpoint for LMArena-style testing",
	            "prepare Artificial Analysis provider endpoint metadata",
	            "import official result JSON/CSV into this harness and rerun claim gate",
	        ],
	    }

	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    json_path = out / "world_class_eval_report.json"
	    md_path = out / "world_class_eval_report.md"
	    csv_path = out / "world_class_eval_metrics.csv"
	    # Record the output locations before serialising so the JSON file itself
	    # contains them.
	    report["json_path"] = str(json_path)
	    report["markdown_path"] = str(md_path)
	    report["csv_path"] = str(csv_path)
	    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	    md_path.write_text(_markdown(report), encoding="utf-8")
	    _write_csv(csv_path, rows)
	    return report


	def _write_csv(path: Path, rows: list[dict]) -> None:
	    """Write the metric rows to ``path`` as UTF-8 CSV with a header line.

	    Columns are written in the fixed order axis, score, source, scope. An
	    existing file at ``path`` is overwritten.
	    """
	    columns = ["axis", "score", "source", "scope"]
	    # newline="" leaves line endings to the csv writer.
	    with path.open("w", encoding="utf-8", newline="") as handle:
	        table = csv.DictWriter(handle, fieldnames=columns)
	        table.writeheader()
	        for record in rows:
	            table.writerow(record)


	def _markdown(report: dict) -> str:
	    """Render the report dict as a Markdown document.

	    The output holds a summary list, a metrics table (axis, score, scope,
	    source), a list of external targets and a list of hard blockers.

	    Args:
	        report: Report dict providing ``summary``, ``claim_gate``, ``metrics``
	            and ``targets``.

	    Returns:
	        The Markdown text, terminated by a single newline.
	    """
	    summary = report["summary"]
	    claim_gate = report["claim_gate"]

	    def cell(value: object) -> str:
	        # A raw "|" or a newline inside a value would split or end the table
	        # row, so escape the former and flatten the latter.
	        return str(value).replace("|", "\\|").replace("\n", " ")

	    lines = [
	        "# TinyMind World-Class Eval Report",
	        "",
	        f"- Balanced score: {summary['balanced_score']:.2f}",
	        f"- Measured axes: {summary['measured_axis_count']}/{summary['axis_count']}",
	        f"- Can claim better than top world models: {claim_gate['can_claim_better_than_top_world_models']}",
	        "",
	        "## Metrics",
	        "",
	        # Plain pipes delimit the columns; backslash-escaped pipes would be
	        # rendered as literal characters and no table would be formed.
	        "| Axis | Score | Scope | Source |",
	        "|---|---:|---|---|",
	    ]
	    lines.extend(
	        f"| {cell(row['axis'])} | {row['score']:.2f} | {cell(row['scope'])} | {cell(row['source'])} |"
	        for row in report["metrics"]
	    )
	    lines.extend(["", "## External Targets", ""])
	    lines.extend(
	        f"- [{target['name']}]({target['url']}): {target['status']}"
	        for target in report["targets"]
	    )
	    lines.extend(["", "## Blockers", ""])
	    lines.extend(f"- {blocker}" for blocker in summary["hard_blockers"])
	    return "\n".join(lines) + "\n"

Xet Storage Details

Size: 9.72 kB
Xet hash: ebcdf3215277cdf1e7133fa55dbb21dbabc5ba5cecedb67ef4315a9767b45abe

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.