Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-colab-handoff / bundle / evaluation / logic_eval.py
| """Logic-focused evaluation for TinyMind checkpoints.""" | |
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import json | |
| from pathlib import Path | |
| import torch | |
| from evaluation.local_evidence import _collate, _encode | |
| from model.architecture import OmegaModel | |
# Letters used to label the options of an item, in display order.
CHOICES = "ABCD"

# Built-in multiple-choice items. Each entry carries an identifier, the
# question text, the option texts (labelled in order with the letters of
# CHOICES), the letter of the expected answer, and a skill tag.
LOGIC_ITEMS = [
    {
        "id": "modus_ponens",
        "question": "If P implies Q, and P is true, what follows?",
        "options": [
            "Q is true",
            "Q is false",
            "P is false",
            "No conclusion",
        ],
        "answer": "A",
        "skill": "implication",
    },
    {
        "id": "contrapositive",
        "question": "Which statement is equivalent to: if P then Q?",
        "options": [
            "if Q then P",
            "if not Q then not P",
            "if not P then not Q",
            "P and not Q",
        ],
        "answer": "B",
        "skill": "equivalence",
    },
    {
        "id": "evidence_policy",
        "question": "A factual-answer policy says answer only when evidence exists. Evidence is missing. What should happen?",
        "options": [
            "Guess",
            "Refuse or ask for evidence",
            "Claim it is false",
            "Ignore the policy",
        ],
        "answer": "B",
        "skill": "policy_logic",
    },
    {
        "id": "contradiction",
        "question": "Two premises cannot both be true. What is the first repair step?",
        "options": [
            "Add more unsupported claims",
            "Find the minimal conflicting pair",
            "Keep both as true",
            "Delete all evidence",
        ],
        "answer": "B",
        "skill": "consistency",
    },
]
| def _prompt(item: dict) -> str: | |
| opts = "\n".join(f"{CHOICES[i]}. {opt}" for i, opt in enumerate(item["options"])) | |
| return f"Answer the logic question. Return only the option letter.\nQuestion: {item['question']}\n{opts}\nAnswer:" | |
| def _choose(model: OmegaModel, prompt: str) -> str: | |
| losses = {} | |
| lines = prompt.splitlines() | |
| option_lines = [line for line in lines if len(line) > 3 and line[0] in CHOICES and line[1:3] == ". "] | |
| for line in option_lines: | |
| letter = line[0] | |
| option_text = line[3:] | |
| text = f"{prompt} {option_text}" | |
| seq = _encode(text, model.cfg.max_seq_len, model.cfg.vocab_size) | |
| input_ids, labels = _collate([seq]) | |
| losses[letter] = float(model(input_ids, labels=labels)["loss"].item()) | |
| return min(losses, key=losses.get) | |
def run_logic_eval(checkpoint_path: str | Path, out_dir: str | Path) -> dict:
    """Score a checkpoint on ``LOGIC_ITEMS`` and write a JSON report.

    Args:
        checkpoint_path: Checkpoint file to load; it must provide the
            ``"model_cfg"`` and ``"model_state"`` entries.
        out_dir: Directory that receives ``logic_eval_report.json``; it is
            created (with parents) when missing.

    Returns:
        The report dict that was written, including one row per item and the
        ``"report_path"`` of the JSON file.
    """
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects, so this must only be pointed at trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    model = OmegaModel(checkpoint["model_cfg"])
    model.load_state_dict(checkpoint["model_state"])
    model.eval()

    rows = []
    for item in LOGIC_ITEMS:
        prediction = _choose(model, _prompt(item))
        is_correct = prediction == item["answer"]
        # Each row is the original item plus the model's verdict.
        rows.append(dict(item, prediction=prediction, correct=is_correct))
    hits = sum(1 for row in rows if row["correct"])
    total = len(rows)

    report = {
        "schema_version": "tinymind-logic-eval-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "checkpoint": str(checkpoint_path),
        "samples": total,
        "correct": hits,
        # max(..., 1) keeps the division defined when there are no items.
        "accuracy": hits / max(total, 1),
        "rows": rows,
        "world_best_claim_allowed": False,
    }

    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    report_file = target_dir / "logic_eval_report.json"
    report["report_path"] = str(report_file)
    serialized = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    report_file.write_text(serialized, encoding="utf-8")
    return report
Xet Storage Details
- Size: 3.43 kB
- Xet hash: 2e2f8403e1d3387daac72f5a0a6ac7e26a6ed6e7971e1dfda7471f44e0de4aea
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.