Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /evaluation /logic_eval.py

bbkdevops

about 1 month ago

download

raw

3.43 kB

	"""Logic-focused evaluation for TinyMind checkpoints."""

	from __future__ import annotations

	from datetime import datetime, timezone
	import json
	from pathlib import Path

	import torch

	from evaluation.local_evidence import _collate, _encode
	from model.architecture import OmegaModel


	# Option letters in display order; the i-th option of an item is labelled
	# with CHOICES[i] when the prompt is rendered.
	CHOICES = "ABCD"

	# Built-in multiple-choice probes. Each entry carries a stable id, the
	# question text, its candidate options, the letter of the correct option
	# and the reasoning skill it targets.
	LOGIC_ITEMS = [
	    {
	        "id": "modus_ponens",
	        "question": "If P implies Q, and P is true, what follows?",
	        "options": [
	            "Q is true",
	            "Q is false",
	            "P is false",
	            "No conclusion",
	        ],
	        "answer": "A",
	        "skill": "implication",
	    },
	    {
	        "id": "contrapositive",
	        "question": "Which statement is equivalent to: if P then Q?",
	        "options": [
	            "if Q then P",
	            "if not Q then not P",
	            "if not P then not Q",
	            "P and not Q",
	        ],
	        "answer": "B",
	        "skill": "equivalence",
	    },
	    {
	        "id": "evidence_policy",
	        "question": (
	            "A factual-answer policy says answer only when evidence exists. "
	            "Evidence is missing. What should happen?"
	        ),
	        "options": [
	            "Guess",
	            "Refuse or ask for evidence",
	            "Claim it is false",
	            "Ignore the policy",
	        ],
	        "answer": "B",
	        "skill": "policy_logic",
	    },
	    {
	        "id": "contradiction",
	        "question": "Two premises cannot both be true. What is the first repair step?",
	        "options": [
	            "Add more unsupported claims",
	            "Find the minimal conflicting pair",
	            "Keep both as true",
	            "Delete all evidence",
	        ],
	        "answer": "B",
	        "skill": "consistency",
	    },
	]


	def _prompt(item: dict) -> str:
	opts = "\n".join(f"{CHOICES[i]}. {opt}" for i, opt in enumerate(item["options"]))
	return f"Answer the logic question. Return only the option letter.\nQuestion: {item['question']}\n{opts}\nAnswer:"


	@torch.no_grad()
	def _choose(model: OmegaModel, prompt: str) -> str:
	losses = {}
	lines = prompt.splitlines()
	option_lines = [line for line in lines if len(line) > 3 and line[0] in CHOICES and line[1:3] == ". "]
	for line in option_lines:
	letter = line[0]
	option_text = line[3:]
	text = f"{prompt} {option_text}"
	seq = _encode(text, model.cfg.max_seq_len, model.cfg.vocab_size)
	input_ids, labels = _collate([seq])
	losses[letter] = float(model(input_ids, labels=labels)["loss"].item())
	return min(losses, key=losses.get)


	def run_logic_eval(checkpoint_path: str | Path, out_dir: str | Path) -> dict:
	    """Score a checkpoint on ``LOGIC_ITEMS`` and write a JSON report.

	    Args:
	        checkpoint_path: Checkpoint file readable by ``torch.load`` that
	            holds ``"model_cfg"`` and ``"model_state"`` entries.
	        out_dir: Directory for the report; created (with parents) if missing.

	    Returns:
	        The report dict, also written to ``<out_dir>/logic_eval_report.json``.
	        It holds the per-item rows, the sample and correct counts, the
	        accuracy and the ``report_path``.
	    """
	    # SECURITY: weights_only=False lets torch.load unpickle arbitrary
	    # objects, so only pass checkpoints from a trusted source.
	    # NOTE(review): presumably kept because "model_cfg" is a project object
	    # the restricted loader would reject -- confirm before tightening.
	    ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
	    model = OmegaModel(ckpt["model_cfg"])
	    model.load_state_dict(ckpt["model_state"])
	    model.eval()

	    rows = []
	    correct = 0
	    for item in LOGIC_ITEMS:
	        pred = _choose(model, _prompt(item))
	        ok = pred == item["answer"]
	        correct += int(ok)
	        rows.append({**item, "prediction": pred, "correct": ok})

	    report = {
	        "schema_version": "tinymind-logic-eval-v1",
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "checkpoint": str(checkpoint_path),
	        "samples": len(rows),
	        "correct": correct,
	        # max(..., 1) keeps the division defined when there are no items.
	        "accuracy": correct / max(len(rows), 1),
	        "rows": rows,
	        "world_best_claim_allowed": False,
	    }

	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    path = out / "logic_eval_report.json"
	    # Recorded before serialising so the file on disk names its own location.
	    report["report_path"] = str(path)
	    path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	    return report

Xet Storage Details

Size: 3.43 kB
Xet hash: 2e2f8403e1d3387daac72f5a0a6ac7e26a6ed6e7971e1dfda7471f44e0de4aea

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.