Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /evaluation /world_class_eval.py
| """World-class multi-axis evaluation harness for TinyMind. | |
| This module does not invent leaderboard results. It creates a reproducible | |
| packet that can run local/public benchmarks now, import external/provider | |
| results when available, and compare TinyMind against world-model slots with a | |
| strict claim gate. | |
| """ | |
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import csv | |
| import json | |
| from pathlib import Path | |
| from typing import Iterable | |
# External evaluation venues that the report lists under "targets" and that the
# Markdown renderer prints in its "External Targets" section.
# Each entry carries:
#   name   - display name of the venue
#   url    - link rendered next to the name
#   kind   - short tag describing the type of venue
#   status - tag printed after the link; the values read as prerequisites for
#            submitting to the venue (NOTE(review): inferred from the strings)
#   axes   - benchmark/quality axes associated with the venue
WORLD_EVAL_TARGETS = [
    {
        "name": "Hugging Face Leaderboards/Evals",
        "url": "https://huggingface.co/docs/leaderboards/index",
        "kind": "public_hub_or_space_submission",
        "status": "requires_hub_compatible_model_or_eval_space",
        "axes": ["MMLU-Pro", "GPQA", "IFEval", "MATH", "BBH", "coding"],
    },
    {
        "name": "LMArena",
        "url": "https://lmarena.ai/",
        "kind": "human_preference_arena",
        "status": "requires_public_chat_endpoint_and_arena_acceptance",
        "axes": ["human preference", "chat quality", "style", "instruction following"],
    },
    {
        "name": "Artificial Analysis",
        "url": "https://artificialanalysis.ai/evaluations",
        "kind": "provider_benchmark_index",
        "status": "requires_provider_endpoint_or_external_listing",
        "axes": ["intelligence index", "speed", "price", "coding", "math", "reasoning"],
    },
]
# Comparison slots copied verbatim into the report under "world_model_slots".
# These are placeholders, not measured results: "params" is None where no
# parameter count is recorded for the slot, and "source" is a tag naming where
# the slot's data would come from (NOTE(review): inferred from the tag values).
WORLD_MODEL_SLOTS = [
    {"model": "frontier_closed_top", "class": "frontier", "params": None, "source": "external_provider_required"},
    {"model": "top_open_large", "class": "open_large", "params": None, "source": "hf_or_provider_required"},
    {"model": "70B_class", "class": "70B", "params": 70_000_000_000, "source": "size_reference"},
    {"model": "7B_class", "class": "7B", "params": 7_000_000_000, "source": "size_reference"},
    {"model": "gpt2", "class": "124M", "params": 124_439_808, "source": "local_or_hf_smoke"},
]
def _load(path: str | Path | None) -> dict:
    """Read a JSON report from ``path`` and return it as a dict.

    Missing input is treated as "no report": an empty/``None`` path, a path
    that does not exist, or a path that is not a regular file (for example a
    directory) all yield ``{}``.

    Args:
        path: Location of a UTF-8 encoded JSON file, or a falsy value.

    Returns:
        The decoded top-level JSON object, or ``{}`` when there is no file.

    Raises:
        ValueError: If the file is not valid JSON (``json.JSONDecodeError`` is
            a ``ValueError`` subclass) or its top-level value is not an object.
    """
    if not path:
        return {}
    p = Path(path)
    # is_file() rather than exists(): a directory "exists" but read_text()
    # on it raises, which is not the intended best-effort behaviour.
    if not p.is_file():
        return {}
    payload = json.loads(p.read_text(encoding="utf-8"))
    # Callers use .get() on the result, so fail here with a clear message
    # instead of letting a list/str/number surface later as AttributeError.
    if not isinstance(payload, dict):
        raise ValueError(
            f"expected a JSON object in {p}, got {type(payload).__name__}"
        )
    return payload
def _score(value: float | int | None) -> float:
    """Normalize a raw score onto the closed interval [0.0, 100.0].

    Args:
        value: Raw score, or ``None`` when no score is available. Anything
            accepted by ``float()`` works.

    Returns:
        ``0.0`` for ``None`` or NaN; otherwise ``float(value)`` clamped to
        the range 0.0-100.0.

    Raises:
        TypeError, ValueError: Propagated from ``float(value)`` when the value
            cannot be converted.
    """
    import math  # function-scope import: the module does not import math

    if value is None:
        return 0.0
    number = float(value)
    # NaN compares False against everything, so min(100.0, nan) returns 100.0
    # and an undefined score would be reported as a perfect one.
    if math.isnan(number):
        return 0.0
    return max(0.0, min(100.0, number))
def _metric_rows(
    knowledge_report: dict,
    compact_report: dict,
    coherence_report: dict,
    memory_report: dict,
    external_results: Iterable[dict],
) -> list[dict]:
    """Flatten the loaded reports into one list of per-axis metric rows.

    Every row is a dict with the keys ``axis``, ``score``, ``source`` and
    ``scope``. Eight built-in axes are always emitted, in a fixed order,
    followed by one ``external_<name>`` row per imported external result.

    Args:
        knowledge_report: Report whose ``full_cycle_gate.passed`` flag drives
            the ``pure_data_full_cycle`` axis.
        compact_report: Report whose ``dimensions`` mapping supplies the five
            compact-intelligence axes.
        coherence_report: Report supplying ``harmony_score``.
        memory_report: Report supplying ``measured_tokens`` and
            ``passkey_recall.passed``.
        external_results: Imported result dicts; ``name`` (or ``model``),
            ``score``, ``source`` and ``scope`` are read from each.

    Returns:
        The list of metric rows.
    """
    # "or {}" also covers keys that are present but null in the JSON.
    dimensions = compact_report.get("dimensions") or {}
    compact_source = compact_report.get("json_path", "compact_intelligence")

    # Prefer the official MMLU-Pro smoke score. Fall back only when it is
    # absent: an "or" fallback would throw away a genuine official score of 0
    # and substitute the (possibly higher) knowledge score.
    mmlu_pro = dimensions.get("official_mmlu_pro_smoke")
    if mmlu_pro is None:
        mmlu_pro = dimensions.get("knowledge_mmlu_pro_smoke")

    rows = [
        {
            "axis": "knowledge_mmlu_pro",
            "score": _score(mmlu_pro),
            "source": compact_source,
            "scope": "local_public_harness",
        }
    ]

    # (axis name, key inside "dimensions", scope tag)
    compact_axes = (
        ("instruction_following", "instruction_following_smoke", "local_smoke"),
        ("translation_th_en", "translation_smoke", "local_smoke"),
        ("natural_answer_style", "natural_answer_style", "local_quality_gate"),
        ("bit_exactness", "bit_exactness", "local_training_metric"),
    )
    rows.extend(
        {
            "axis": axis,
            "score": _score(dimensions.get(key)),
            "source": compact_source,
            "scope": scope,
        }
        for axis, key, scope in compact_axes
    )

    rows.append(
        {
            "axis": "layer_coherence",
            "score": _score(coherence_report.get("harmony_score")),
            "source": coherence_report.get("json_path", "layer_coherence"),
            "scope": "architecture_smoke",
        }
    )

    # Pass/fail axis: needs at least 10M measured tokens AND a passed passkey
    # recall check.
    measured_tokens = int(memory_report.get("measured_tokens") or 0)
    passkey_recall = memory_report.get("passkey_recall") or {}
    long_context_ok = measured_tokens >= 10_000_000 and bool(passkey_recall.get("passed"))
    rows.append(
        {
            "axis": "long_context_exact_10m",
            "score": 100.0 if long_context_ok else 0.0,
            "source": memory_report.get("report_path", "extreme_memory"),
            "scope": "exact_archive_recall",
        }
    )

    full_gate = knowledge_report.get("full_cycle_gate") or {}
    rows.append(
        {
            "axis": "pure_data_full_cycle",
            "score": 100.0 if full_gate.get("passed") else 0.0,
            "source": knowledge_report.get("json_path", "knowledge_full_cycle"),
            "scope": "data_train_eval_gate",
        }
    )

    for row in external_results or ():
        rows.append(
            {
                "axis": f"external_{row.get('name', row.get('model', 'unknown'))}",
                "score": _score(row.get("score")),
                "source": row.get("source", "imported_external"),
                "scope": row.get("scope", "external_import"),
            }
        )
    return rows
def build_world_class_eval(
    out_dir: str | Path,
    knowledge_report: str | Path | None = "reports/knowledge_full_cycle_pursuit_512/knowledge_full_cycle_report.json",
    compact_report: str | Path | None = "reports/compact_intelligence/compact_intelligence_dossier.json",
    coherence_report: str | Path | None = "reports/axiomweave_coherence/layer_coherence_report.json",
    memory_report: str | Path | None = "reports/extreme_memory_10m/extreme_memory_report.json",
    external_results: str | Path | None = None,
) -> dict:
    """Build the world-class evaluation packet and write it to ``out_dir``.

    Loads the input reports, derives the per-axis metric rows, computes the
    summary and claim gate, then writes three artifacts into ``out_dir``
    (created if needed): ``world_class_eval_report.json``,
    ``world_class_eval_report.md`` and ``world_class_eval_metrics.csv``.

    Args:
        out_dir: Directory that receives the three output files.
        knowledge_report: Path to the knowledge full-cycle report JSON.
        compact_report: Path to the compact-intelligence dossier JSON.
        coherence_report: Path to the layer-coherence report JSON.
        memory_report: Path to the extreme-memory report JSON.
        external_results: Optional path to a JSON file whose ``"results"``
            entry lists imported external results.

    Returns:
        The report dict, including the ``json_path``, ``markdown_path`` and
        ``csv_path`` of the files that were written.
    """
    knowledge = _load(knowledge_report)
    compact = _load(compact_report)
    coherence = _load(coherence_report)
    memory = _load(memory_report)
    # "or []" so a present-but-null "results" entry behaves like a missing one
    # instead of reaching the row builder as None.
    external = (_load(external_results).get("results") or []) if external_results else []

    rows = _metric_rows(knowledge, compact, coherence, memory, external)
    # An axis counts as "measured" only when its score is strictly positive.
    measured_axes = [row for row in rows if row["score"] > 0]
    weak_axes = [row["axis"] for row in rows if row["score"] < 50.0]
    balanced_score = sum(row["score"] for row in rows) / max(len(rows), 1)

    hard_blockers = []
    if weak_axes:
        hard_blockers.append("some_axes_below_50")
    if not external:
        hard_blockers.append("no_imported_external_top_model_results")
    # The compact gate is open only when it is an object whose flag is exactly
    # True; a missing, empty, or malformed gate counts as blocked.
    compact_gate = compact.get("claim_gate")
    compact_gate_open = (
        isinstance(compact_gate, dict)
        and compact_gate.get("can_claim_smarter_than_larger_models") is True
    )
    if not compact_gate_open:
        hard_blockers.append("compact_intelligence_claim_gate_blocked")

    report = {
        "schema_version": "tinymind-world-class-eval-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "purpose": "Measure broad intelligence readiness against world-class model standards without fabricating ranks.",
        "targets": WORLD_EVAL_TARGETS,
        "world_model_slots": WORLD_MODEL_SLOTS,
        "metrics": rows,
        "summary": {
            "balanced_score": balanced_score,
            "measured_axis_count": len(measured_axes),
            "axis_count": len(rows),
            "weak_axes": weak_axes,
            "hard_blockers": hard_blockers,
        },
        "claim_gate": {
            "ready_for_world_top_comparison": len(measured_axes) >= 7,
            # Superiority is never claimed by this harness; see "reason".
            "can_claim_better_than_top_world_models": False,
            "can_claim_production_ready_eval_packet": bool(rows),
            "reason": "Official/provider ranks and strong scores across every axis are required before superiority claims.",
        },
        "submission_next_steps": [
            "publish HF-compatible model/repo or eval Space",
            "expose OpenAI-compatible public endpoint for LMArena-style testing",
            "prepare Artificial Analysis provider endpoint metadata",
            "import official result JSON/CSV into this harness and rerun claim gate",
        ],
    }

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    json_path = out / "world_class_eval_report.json"
    md_path = out / "world_class_eval_report.md"
    csv_path = out / "world_class_eval_metrics.csv"
    # Record the output locations before serializing so they appear in the
    # JSON and in the returned dict.
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    report["csv_path"] = str(csv_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    _write_csv(csv_path, rows)
    return report
def _write_csv(path: Path, rows: list[dict]) -> None:
    """Write the metric rows to ``path`` as a UTF-8 CSV file with a header.

    Columns are written in the order axis, score, source, scope.
    """
    columns = ["axis", "score", "source", "scope"]
    # newline="" leaves line-ending handling to the csv module.
    with path.open("w", encoding="utf-8", newline="") as handle:
        sheet = csv.DictWriter(handle, fieldnames=columns)
        sheet.writeheader()
        for record in rows:
            sheet.writerow(record)
def _markdown(report: dict) -> str:
    """Render ``report`` as a Markdown document.

    The document has a headline summary, a metrics table, the list of
    external targets and the list of hard blockers, and ends with a single
    trailing newline.
    """
    summary = report["summary"]
    gate = report["claim_gate"]

    heading = [
        "# TinyMind World-Class Eval Report",
        "",
        f"- Balanced score: {summary['balanced_score']:.2f}",
        f"- Measured axes: {summary['measured_axis_count']}/{summary['axis_count']}",
        f"- Can claim better than top world models: {gate['can_claim_better_than_top_world_models']}",
        "",
        "## Metrics",
        "",
        "| Axis | Score | Scope | Source |",
        "|---|---:|---|---|",
    ]
    metric_lines = [
        f"| {metric['axis']} | {metric['score']:.2f} | {metric['scope']} | {metric['source']} |"
        for metric in report["metrics"]
    ]
    target_lines = [
        f"- [{venue['name']}]({venue['url']}): {venue['status']}"
        for venue in report["targets"]
    ]
    blocker_lines = [f"- {item}" for item in summary["hard_blockers"]]

    document = (
        heading
        + metric_lines
        + ["", "## External Targets", ""]
        + target_lines
        + ["", "## Blockers", ""]
        + blocker_lines
    )
    return "\n".join(document) + "\n"
Xet Storage Details
- Size:
- 9.72 kB
- Xet hash:
- ebcdf3215277cdf1e7133fa55dbb21dbabc5ba5cecedb67ef4315a9767b45abe
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.