bbkdevops's picture
download
raw
9.72 kB
"""World-class multi-axis evaluation harness for TinyMind.
This module does not invent leaderboard results. It creates a reproducible
packet that can run local/public benchmarks now, import external/provider
results when available, and compare TinyMind against world-model slots with a
strict claim gate.
"""
from __future__ import annotations
from datetime import datetime, timezone
import csv
import json
from pathlib import Path
from typing import Iterable
# External evaluation venues embedded verbatim in every generated report
# (under "targets") and rendered as links in the Markdown output.
# "status" records what is still required before a submission can happen;
# "axes" lists the capability areas each venue is recorded as covering.
WORLD_EVAL_TARGETS = [
    {
        "name": "Hugging Face Leaderboards/Evals",
        "url": "https://huggingface.co/docs/leaderboards/index",
        "kind": "public_hub_or_space_submission",
        "status": "requires_hub_compatible_model_or_eval_space",
        "axes": [
            "MMLU-Pro",
            "GPQA",
            "IFEval",
            "MATH",
            "BBH",
            "coding",
        ],
    },
    {
        "name": "LMArena",
        "url": "https://lmarena.ai/",
        "kind": "human_preference_arena",
        "status": "requires_public_chat_endpoint_and_arena_acceptance",
        "axes": [
            "human preference",
            "chat quality",
            "style",
            "instruction following",
        ],
    },
    {
        "name": "Artificial Analysis",
        "url": "https://artificialanalysis.ai/evaluations",
        "kind": "provider_benchmark_index",
        "status": "requires_provider_endpoint_or_external_listing",
        "axes": [
            "intelligence index",
            "speed",
            "price",
            "coding",
            "math",
            "reasoning",
        ],
    },
]
# Reference model slots embedded verbatim in every generated report
# (under "world_model_slots"). "params" is a parameter count where one is
# recorded and None otherwise; "source" names where results for that slot
# are expected to come from.
WORLD_MODEL_SLOTS = [
    {
        "model": "frontier_closed_top",
        "class": "frontier",
        "params": None,
        "source": "external_provider_required",
    },
    {
        "model": "top_open_large",
        "class": "open_large",
        "params": None,
        "source": "hf_or_provider_required",
    },
    {
        "model": "70B_class",
        "class": "70B",
        "params": 70_000_000_000,
        "source": "size_reference",
    },
    {
        "model": "7B_class",
        "class": "7B",
        "params": 7_000_000_000,
        "source": "size_reference",
    },
    {
        "model": "gpt2",
        "class": "124M",
        "params": 124_439_808,
        "source": "local_or_hf_smoke",
    },
]
def _load(path: str | Path | None) -> dict:
if not path:
return {}
p = Path(path)
if not p.exists():
return {}
return json.loads(p.read_text(encoding="utf-8"))
def _score(value: float | int | None) -> float:
if value is None:
return 0.0
return max(0.0, min(100.0, float(value)))
def _metric_rows(
    knowledge_report: dict,
    compact_report: dict,
    coherence_report: dict,
    memory_report: dict,
    external_results: Iterable[dict],
) -> list[dict]:
    """Assemble one metric row per evaluation axis.

    Each row is a dict with the keys ``axis``, ``score``, ``source`` and
    ``scope``. Scores are normalised through ``_score``; gate-style axes are
    reported as either 100.0 or 0.0.

    Args:
        knowledge_report: Report dict; its ``full_cycle_gate.passed`` flag
            drives the ``pure_data_full_cycle`` axis.
        compact_report: Report dict; its ``dimensions`` mapping supplies the
            knowledge, instruction, translation, style and bit-exactness axes.
        coherence_report: Report dict; ``harmony_score`` supplies the
            ``layer_coherence`` axis.
        memory_report: Report dict; ``measured_tokens`` and
            ``passkey_recall.passed`` drive the ``long_context_exact_10m`` axis.
        external_results: Imported result rows; each becomes an
            ``external_<name>`` axis.

    Returns:
        Rows in a fixed order: the seven local axes, ``pure_data_full_cycle``,
        then one row per external result in input order.
    """
    # ``or {}`` also covers keys that are present but null in the JSON.
    dimensions = compact_report.get("dimensions") or {}
    compact_source = compact_report.get("json_path", "compact_intelligence")

    # Fall back to the smoke figure only when the official one is absent.
    # A truthiness test here would throw away a genuine official score of 0.
    mmlu_pro = dimensions.get("official_mmlu_pro_smoke")
    if mmlu_pro is None:
        mmlu_pro = dimensions.get("knowledge_mmlu_pro_smoke")

    rows = [
        {
            "axis": "knowledge_mmlu_pro",
            "score": _score(mmlu_pro),
            "source": compact_source,
            "scope": "local_public_harness",
        },
    ]

    # Axes read straight out of the compact report: (axis, dimension key, scope).
    compact_axes = (
        ("instruction_following", "instruction_following_smoke", "local_smoke"),
        ("translation_th_en", "translation_smoke", "local_smoke"),
        ("natural_answer_style", "natural_answer_style", "local_quality_gate"),
        ("bit_exactness", "bit_exactness", "local_training_metric"),
    )
    for axis, key, scope in compact_axes:
        rows.append(
            {
                "axis": axis,
                "score": _score(dimensions.get(key)),
                "source": compact_source,
                "scope": scope,
            }
        )

    rows.append(
        {
            "axis": "layer_coherence",
            "score": _score(coherence_report.get("harmony_score")),
            "source": coherence_report.get("json_path", "layer_coherence"),
            "scope": "architecture_smoke",
        }
    )

    # The long-context axis is all-or-nothing: at least 10M measured tokens
    # AND a passed passkey recall. Null or missing fields count as not met.
    measured_tokens = int(memory_report.get("measured_tokens") or 0)
    passkey = memory_report.get("passkey_recall")
    passkey_passed = isinstance(passkey, dict) and bool(passkey.get("passed"))
    rows.append(
        {
            "axis": "long_context_exact_10m",
            "score": 100.0 if measured_tokens >= 10_000_000 and passkey_passed else 0.0,
            "source": memory_report.get("report_path", "extreme_memory"),
            "scope": "exact_archive_recall",
        }
    )

    full_gate = knowledge_report.get("full_cycle_gate") or {}
    rows.append(
        {
            "axis": "pure_data_full_cycle",
            "score": 100.0 if full_gate.get("passed") else 0.0,
            "source": knowledge_report.get("json_path", "knowledge_full_cycle"),
            "scope": "data_train_eval_gate",
        }
    )

    # Imported rows are labelled by "name", then "model", then "unknown".
    for row in external_results:
        rows.append(
            {
                "axis": f"external_{row.get('name', row.get('model', 'unknown'))}",
                "score": _score(row.get("score")),
                "source": row.get("source", "imported_external"),
                "scope": row.get("scope", "external_import"),
            }
        )
    return rows
def build_world_class_eval(
    out_dir: str | Path,
    knowledge_report: str | Path | None = "reports/knowledge_full_cycle_pursuit_512/knowledge_full_cycle_report.json",
    compact_report: str | Path | None = "reports/compact_intelligence/compact_intelligence_dossier.json",
    coherence_report: str | Path | None = "reports/axiomweave_coherence/layer_coherence_report.json",
    memory_report: str | Path | None = "reports/extreme_memory_10m/extreme_memory_report.json",
    external_results: str | Path | None = None,
) -> dict:
    """Build the world-class eval packet and write it as JSON, Markdown and CSV.

    Args:
        out_dir: Directory that receives the three output files; created
            (with parents) if it does not exist.
        knowledge_report: Path to the knowledge full-cycle report JSON.
        compact_report: Path to the compact-intelligence dossier JSON.
        coherence_report: Path to the layer-coherence report JSON.
        memory_report: Path to the extreme-memory report JSON.
        external_results: Optional path to a JSON file whose ``results`` key
            holds imported external result rows.

    Returns:
        The report dict that was serialised, including the ``json_path``,
        ``markdown_path`` and ``csv_path`` of the files written.
    """
    knowledge = _load(knowledge_report)
    compact = _load(compact_report)
    coherence = _load(coherence_report)
    memory = _load(memory_report)
    # ``or []`` keeps a file containing ``"results": null`` from handing
    # ``None`` to the row builder, which iterates over this value.
    external = (_load(external_results).get("results") or []) if external_results else []

    rows = _metric_rows(knowledge, compact, coherence, memory, external)
    # An axis counts as "measured" only when it produced a positive score.
    measured_axes = [row for row in rows if row["score"] > 0]
    weak_axes = [row["axis"] for row in rows if row["score"] < 50.0]
    balanced_score = sum(row["score"] for row in rows) / max(len(rows), 1)

    hard_blockers = []
    if weak_axes:
        hard_blockers.append("some_axes_below_50")
    if not external:
        hard_blockers.append("no_imported_external_top_model_results")
    # The upstream gate is open only when it is an object whose flag is
    # literally ``True``; a missing, empty or malformed gate stays blocked.
    compact_gate = compact.get("claim_gate")
    compact_gate_open = (
        isinstance(compact_gate, dict)
        and compact_gate.get("can_claim_smarter_than_larger_models") is True
    )
    if not compact_gate_open:
        hard_blockers.append("compact_intelligence_claim_gate_blocked")

    report = {
        "schema_version": "tinymind-world-class-eval-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "purpose": "Measure broad intelligence readiness against world-class model standards without fabricating ranks.",
        "targets": WORLD_EVAL_TARGETS,
        "world_model_slots": WORLD_MODEL_SLOTS,
        "metrics": rows,
        "summary": {
            "balanced_score": balanced_score,
            "measured_axis_count": len(measured_axes),
            "axis_count": len(rows),
            "weak_axes": weak_axes,
            "hard_blockers": hard_blockers,
        },
        "claim_gate": {
            "ready_for_world_top_comparison": len(measured_axes) >= 7,
            # Superiority claims are never granted by this harness.
            "can_claim_better_than_top_world_models": False,
            "can_claim_production_ready_eval_packet": bool(rows),
            "reason": "Official/provider ranks and strong scores across every axis are required before superiority claims.",
        },
        "submission_next_steps": [
            "publish HF-compatible model/repo or eval Space",
            "expose OpenAI-compatible public endpoint for LMArena-style testing",
            "prepare Artificial Analysis provider endpoint metadata",
            "import official result JSON/CSV into this harness and rerun claim gate",
        ],
    }

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    json_path = out / "world_class_eval_report.json"
    md_path = out / "world_class_eval_report.md"
    csv_path = out / "world_class_eval_metrics.csv"
    # Record the output locations before serialising so the written JSON
    # also carries them.
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    report["csv_path"] = str(csv_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    _write_csv(csv_path, rows)
    return report
def _write_csv(path: Path, rows: list[dict]) -> None:
with path.open("w", encoding="utf-8", newline="") as f:
writer = csv.DictWriter(f, fieldnames=["axis", "score", "source", "scope"])
writer.writeheader()
writer.writerows(rows)
def _markdown(report: dict) -> str:
lines = [
"# TinyMind World-Class Eval Report",
"",
f"- Balanced score: {report['summary']['balanced_score']:.2f}",
f"- Measured axes: {report['summary']['measured_axis_count']}/{report['summary']['axis_count']}",
f"- Can claim better than top world models: {report['claim_gate']['can_claim_better_than_top_world_models']}",
"",
"## Metrics",
"",
"| Axis | Score | Scope | Source |",
"|---|---:|---|---|",
]
for row in report["metrics"]:
lines.append(f"| {row['axis']} | {row['score']:.2f} | {row['scope']} | {row['source']} |")
lines.extend(["", "## External Targets", ""])
for target in report["targets"]:
lines.append(f"- [{target['name']}]({target['url']}): {target['status']}")
lines.extend(["", "## Blockers", ""])
for blocker in report["summary"]["hard_blockers"]:
lines.append(f"- {blocker}")
return "\n".join(lines) + "\n"

Xet Storage Details

Size:
9.72 kB
·
Xet hash:
ebcdf3215277cdf1e7133fa55dbb21dbabc5ba5cecedb67ef4315a9767b45abe

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.