bbkdevops's picture
download
raw
3.43 kB
"""Logic-focused evaluation for TinyMind checkpoints."""
from __future__ import annotations
from datetime import datetime, timezone
import json
from pathlib import Path
import torch
from evaluation.local_evidence import _collate, _encode
from model.architecture import OmegaModel
# Letters used to label answer options: the option at position i is shown
# under CHOICES[i], and each item's "answer" is one of these letters.
CHOICES = "ABCD"

# Built-in multiple-choice logic probes. Every item has a stable "id", the
# "question" text, four "options" (in display order), the letter of the
# correct option in "answer", and a coarse "skill" tag.
LOGIC_ITEMS = [
    {
        "id": "modus_ponens",
        "question": "If P implies Q, and P is true, what follows?",
        "options": [
            "Q is true",
            "Q is false",
            "P is false",
            "No conclusion",
        ],
        "answer": "A",
        "skill": "implication",
    },
    {
        "id": "contrapositive",
        "question": "Which statement is equivalent to: if P then Q?",
        "options": [
            "if Q then P",
            "if not Q then not P",
            "if not P then not Q",
            "P and not Q",
        ],
        "answer": "B",
        "skill": "equivalence",
    },
    {
        "id": "evidence_policy",
        "question": "A factual-answer policy says answer only when evidence exists. Evidence is missing. What should happen?",
        "options": [
            "Guess",
            "Refuse or ask for evidence",
            "Claim it is false",
            "Ignore the policy",
        ],
        "answer": "B",
        "skill": "policy_logic",
    },
    {
        "id": "contradiction",
        "question": "Two premises cannot both be true. What is the first repair step?",
        "options": [
            "Add more unsupported claims",
            "Find the minimal conflicting pair",
            "Keep both as true",
            "Delete all evidence",
        ],
        "answer": "B",
        "skill": "consistency",
    },
]
def _prompt(item: dict) -> str:
    """Render one logic item as a lettered multiple-choice prompt.

    Args:
        item: Mapping with a "question" string and an "options" sequence.

    Returns:
        The instruction line, the question, one "<letter>. <option>" line per
        option, and a trailing "Answer:" cue, joined by newlines.

    Raises:
        IndexError: If the item has more options than there are letters in
            ``CHOICES``.
    """
    labelled = []
    for i, opt in enumerate(item["options"]):
        # The option's position selects its letter.
        labelled.append(f"{CHOICES[i]}. {opt}")
    opts = "\n".join(labelled)
    return f"Answer the logic question. Return only the option letter.\nQuestion: {item['question']}\n{opts}\nAnswer:"
@torch.no_grad()
def _choose(model: OmegaModel, prompt: str) -> str:
losses = {}
lines = prompt.splitlines()
option_lines = [line for line in lines if len(line) > 3 and line[0] in CHOICES and line[1:3] == ". "]
for line in option_lines:
letter = line[0]
option_text = line[3:]
text = f"{prompt} {option_text}"
seq = _encode(text, model.cfg.max_seq_len, model.cfg.vocab_size)
input_ids, labels = _collate([seq])
losses[letter] = float(model(input_ids, labels=labels)["loss"].item())
return min(losses, key=losses.get)
def run_logic_eval(checkpoint_path: str | Path, out_dir: str | Path) -> dict:
    """Evaluate a checkpoint on the built-in logic items and write a report.

    The checkpoint is loaded on CPU, an ``OmegaModel`` is rebuilt from its
    "model_cfg" entry and populated from "model_state", and each item in
    ``LOGIC_ITEMS`` is answered with ``_choose``.

    Args:
        checkpoint_path: Path of the checkpoint file to load.
        out_dir: Directory for the report; created if it does not exist.

    Returns:
        The report dictionary, which is also written as
        "logic_eval_report.json" inside *out_dir*. Its "report_path" entry
        holds the location of that file.
    """
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects, so this should only be given trusted checkpoint files.
    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
    model = OmegaModel(checkpoint["model_cfg"])
    model.load_state_dict(checkpoint["model_state"])
    model.eval()

    rows = []
    for item in LOGIC_ITEMS:
        prediction = _choose(model, _prompt(item))
        is_correct = prediction == item["answer"]
        # Each row is the original item plus the model's pick and its verdict.
        rows.append(dict(item, prediction=prediction, correct=is_correct))

    total = len(rows)
    correct = sum(1 for row in rows if row["correct"])

    report = {
        "schema_version": "tinymind-logic-eval-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "checkpoint": str(checkpoint_path),
        "samples": total,
        "correct": correct,
        # max(..., 1) avoids dividing by zero when there are no items.
        "accuracy": correct / max(total, 1),
        "rows": rows,
        "world_best_claim_allowed": False,
    }

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    path = out / "logic_eval_report.json"
    # Record the destination before serialising so the file contains it too.
    report["report_path"] = str(path)
    serialized = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    path.write_text(serialized, encoding="utf-8")
    return report

Xet Storage Details

Size:
3.43 kB
·
Xet hash:
2e2f8403e1d3387daac72f5a0a6ac7e26a6ed6e7971e1dfda7471f44e0de4aea

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.