ereniko's picture
Upload eval.py with huggingface_hub
44217ec verified
"""
Eval harness for İvme-Conversate.
Wraps the custom model + tokenizer in an lm-eval compatible interface and runs
HellaSwag and ARC-Easy — the two benchmarks scored on the Tiny-ML leaderboard.
Usage:
python eval.py --checkpoint checkpoints/ivme_base_ema.pt
python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy
python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy,piqa
Requirements:
pip install lm-eval tokenizers torch
"""
from __future__ import annotations
import argparse
import json
import sys
import torch
import numpy as np
from tokenizers import Tokenizer
# lm-eval imports
from lm_eval.api.model import LM
from lm_eval.api.instance import Instance
import lm_eval
# Local
sys.path.insert(0, ".")
from model import IvmeConfig, IvmeConversate
TOKENIZER_PATH = "ivme_tokenizer.json"
DEFAULT_TASKS = "hellaswag,arc_easy"
# --------------------------------------------------------------------------- #
# lm-eval wrapper
# --------------------------------------------------------------------------- #
class IvmeLM(LM):
    """Adapter that exposes an İvme-Conversate checkpoint through lm-eval's ``LM`` API.

    The three request types lm-eval issues are implemented here:
    ``loglikelihood`` (multiple-choice tasks), ``loglikelihood_rolling``
    (perplexity tasks) and ``generate_until`` (free-form generation).
    Requests are scored one sequence at a time; ``batch_size`` only controls
    how many requests are handed to ``_loglikelihood_batch`` per call.
    """

    def __init__(self, checkpoint_path: str, device: str = "cuda", batch_size: int = 32):
        """Load the tokenizer from ``TOKENIZER_PATH`` and the model from a checkpoint.

        Args:
            checkpoint_path: Path to a ``torch.save`` checkpoint that holds a
                ``"cfg"`` entry (model config object) and a ``"model"`` state dict.
            device: Requested torch device. CPU is used instead whenever CUDA
                is not available.
            batch_size: Number of requests passed to ``_loglikelihood_batch``
                at a time.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        super().__init__()
        if batch_size < 1:
            # A zero step would make range() raise later; a negative one would
            # silently produce no results at all.
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        self._device = torch.device(device if torch.cuda.is_available() else "cpu")
        self._batch_size = batch_size

        # Load tokenizer
        print(f"[eval] loading tokenizer from {TOKENIZER_PATH}")
        self._tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
        self._tokenizer.no_truncation()
        self._tokenizer.no_padding()
        self.vocab_size = self._tokenizer.get_vocab_size()
        # May be None when the vocabulary has no "<|eos|>" entry; every user of
        # this attribute checks for that.
        self.eos_token_id = self._tokenizer.token_to_id("<|eos|>")

        # Load model
        print(f"[eval] loading model from {checkpoint_path}")
        # SECURITY NOTE: weights_only=False unpickles arbitrary objects (needed
        # here because the checkpoint stores the config object). Only load
        # checkpoints from a trusted source.
        ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
        cfg = ckpt["cfg"]
        # Force SDPA for eval — no training kernels needed, wider compatibility
        cfg.attn_backend = "sdpa"
        self._model = IvmeConversate(cfg)
        self._model.load_state_dict(ckpt["model"])
        self._model.to(self._device)
        self._model.eval()
        n = self._model.num_params()
        print(f"[eval] model loaded: {n/1e6:.1f}M params on {self._device}")

    @property
    def max_length(self):
        """Maximum sequence length, read from the model config (``max_seq_len``)."""
        return self._model.cfg.max_seq_len

    @property
    def max_gen_toks(self):
        """Default number of new tokens for ``generate_until``."""
        return 256

    def tok_encode(self, text: str) -> list[int]:
        """Return the token ids the tokenizer produces for ``text``."""
        return self._tokenizer.encode(text).ids

    def tok_decode(self, tokens: list[int]) -> str:
        """Return the text the tokenizer decodes from ``tokens``."""
        return self._tokenizer.decode(tokens)

    def _forward_logits(self, token_ids: list[int]) -> torch.Tensor:
        """Run one sequence through the model and return its logits without the batch axis.

        The forward pass runs without gradients; bfloat16 autocast is enabled
        only on CUDA.
        """
        batch = torch.tensor([token_ids], dtype=torch.long, device=self._device)
        with torch.no_grad(), torch.autocast(
            device_type=self._device.type,
            dtype=torch.bfloat16,
            enabled=self._device.type == "cuda",
        ):
            logits, _ = self._model(batch)
        return logits[0]

    # ---- Required lm-eval interface methods -------------------------------- #
    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        """Compute log-likelihood of each (context, continuation) pair.

        Returns:
            One ``(summed log-prob, is_greedy)`` tuple per request, in order.
        """
        results = []
        for offset in range(0, len(requests), self._batch_size):
            results.extend(
                self._loglikelihood_batch(requests[offset : offset + self._batch_size])
            )
        return results

    def _loglikelihood_batch(self, batch: list[Instance]) -> list[tuple[float, bool]]:
        """Score every request in ``batch``; see ``loglikelihood`` for the result format.

        Requests that leave nothing to condition on (fewer than two tokens in
        total) are reported as ``(-inf, False)``.
        """
        results = []
        for req in batch:
            context, continuation = req.args
            # CRITICAL: tokenize context+continuation JOINTLY. With ByteLevel BPE,
            # tokenizing the continuation alone mishandles the leading space and
            # word-boundary merges, so the scored tokens wouldn't match what the
            # model actually predicts in context. The context is encoded alone
            # only to measure where the continuation starts.
            ctx_ids = self.tok_encode(context)
            full_ids = self.tok_encode(context + continuation)

            if not ctx_ids and full_ids and self.eos_token_id is not None:
                # Empty context: condition on EOS (the lm-eval prefix-token
                # convention) so the whole continuation can be scored.
                full_ids = [self.eos_token_id] + full_ids
                cont_len = len(full_ids) - 1
            else:
                cont_len = len(full_ids) - len(ctx_ids)

            # Guard: joint tokenization can merge across the boundary leaving
            # cont_len=0 or even negative. Fall back to scoring the last token.
            if cont_len <= 0:
                cont_len = 1
            if len(full_ids) < cont_len + 1:
                # Sequence too short to score anything meaningful — skip.
                results.append((-float("inf"), False))
                continue

            # Truncate from the left if too long, keeping the tail.
            all_ids = full_ids
            if len(all_ids) > self.max_length:
                all_ids = all_ids[-self.max_length:]
            # After truncation at least one token must remain as conditioning;
            # otherwise logits and targets would be misaligned.
            cont_len = min(cont_len, len(all_ids) - 1)
            if cont_len < 1:
                results.append((-float("inf"), False))
                continue

            logits = self._forward_logits(all_ids)

            # logits[i] predicts the token at position i+1, so the last cont_len
            # tokens are scored with logits[len-cont_len-1 : len-1].
            first = len(all_ids) - cont_len - 1
            cont_logits = logits[first : first + cont_len]  # (cont_len, vocab)
            cont_targets = torch.tensor(
                all_ids[-cont_len:], dtype=torch.long, device=self._device
            )
            log_probs = torch.nn.functional.log_softmax(cont_logits.float(), dim=-1)
            total_log_prob = log_probs.gather(-1, cont_targets.unsqueeze(-1)).sum().item()
            greedy = (cont_logits.argmax(dim=-1) == cont_targets).all().item()
            results.append((total_log_prob, bool(greedy)))
        return results

    def loglikelihood_rolling(self, requests: list[Instance]) -> list[float]:
        """Compute rolling log-likelihood for perplexity tasks.

        The text is cut into consecutive, non-overlapping windows of
        ``max_length`` input tokens. No prefix token is prepended, so the very
        first token of each text is not scored.

        Returns:
            The summed log-likelihood of each request's text, in order.
        """
        results = []
        for req in requests:
            ids = self.tok_encode(req.args[0])
            total_ll = 0.0
            # Slide a window of max_length over the tokens.
            for start in range(0, max(1, len(ids) - 1), self.max_length):
                # One extra token so the last input position has a target.
                chunk = ids[start : start + self.max_length + 1]
                if len(chunk) < 2:
                    break
                logits = self._forward_logits(chunk[:-1])
                targets = torch.tensor(chunk[1:], dtype=torch.long, device=self._device)
                log_probs = torch.nn.functional.log_softmax(logits.float(), dim=-1)
                total_ll += log_probs.gather(-1, targets.unsqueeze(-1)).sum().item()
            results.append(total_ll)
        return results

    def generate_until(self, requests: list[Instance]) -> list[str]:
        """Greedy generation until stop string (used by some tasks).

        Each request carries ``(context, gen_kwargs)``. Recognised keys in
        ``gen_kwargs`` are ``"until"`` (a stop string or a list of them,
        default ``["<|eos|>"]``) and ``"max_gen_toks"`` (default
        ``self.max_gen_toks``).

        Returns:
            The generated text for each request, cut at the earliest stop string.
        """
        results = []
        for req in requests:
            context, gen_kwargs = req.args
            until = gen_kwargs.get("until", ["<|eos|>"])
            if until is None:
                until = []
            elif isinstance(until, str):
                # A bare string would otherwise be iterated character by character.
                until = [until]
            max_new = gen_kwargs.get("max_gen_toks", self.max_gen_toks)

            prompt_ids = self.tok_encode(context)
            if not prompt_ids and self.eos_token_id is not None:
                # Give the model something to condition on for an empty prompt.
                prompt_ids = [self.eos_token_id]
            # Left-truncate so prompt + generated tokens fit in the model window.
            room = self.max_length - max_new
            if room > 0 and len(prompt_ids) > room:
                prompt_ids = prompt_ids[-room:]

            ids = torch.tensor([prompt_ids], dtype=torch.long, device=self._device)
            with torch.no_grad():
                out = self._model.generate(ids, max_new_tokens=max_new,
                                           temperature=1.0, top_k=1)  # greedy
            new_ids = out[0, ids.shape[1]:].tolist()

            # Drop everything from the first EOS on: decoding may strip special
            # tokens, in which case the "<|eos|>" stop string would never match.
            if self.eos_token_id is not None and self.eos_token_id in new_ids:
                new_ids = new_ids[: new_ids.index(self.eos_token_id)]

            text = self.tok_decode(new_ids)
            for stop in until:
                # An empty stop string matches at index 0 and would erase the output.
                if not stop:
                    continue
                cut = text.find(stop)
                if cut != -1:
                    text = text[:cut]
            results.append(text)
        return results
# --------------------------------------------------------------------------- #
# Main
# --------------------------------------------------------------------------- #
def main():
    """Parse CLI arguments, run the requested lm-eval tasks zero-shot, print and save results.

    The per-task summary shows ``acc,none`` when present, otherwise
    ``acc_norm,none``, otherwise 0.0. The full ``results["results"]`` mapping
    is written as JSON to ``--output``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--checkpoint", required=True)
    ap.add_argument("--tasks", default=DEFAULT_TASKS)
    ap.add_argument("--batch_size", type=int, default=32)
    ap.add_argument("--device", default="cuda")
    ap.add_argument("--output", default="eval_results.json")
    args = ap.parse_args()

    # Drop blank entries produced by stray commas (e.g. "hellaswag,").
    task_list = [name.strip() for name in args.tasks.split(",") if name.strip()]
    if not task_list:
        ap.error("--tasks must name at least one task")

    model = IvmeLM(args.checkpoint, device=args.device, batch_size=args.batch_size)
    print(f"\n[eval] running tasks: {task_list}")
    results = lm_eval.simple_evaluate(
        model=model,
        tasks=task_list,
        num_fewshot=0,  # zero-shot, matching the leaderboard
        batch_size=args.batch_size,
        log_samples=False,
    )

    # Print a clean summary
    rule = "=" * 52
    print("\n" + rule)
    print(" İvme-Conversate Eval Results")
    print(rule)
    for task, metrics in results["results"].items():
        # Explicit None checks: a genuine accuracy of 0.0 is falsy and must not
        # be mistaken for a missing metric.
        acc = metrics.get("acc,none")
        if acc is None:
            acc = metrics.get("acc_norm,none")
        if acc is None:
            acc = 0.0
        print(f" {task:<20} {acc*100:.2f}%")
    print(rule)
    print(f" Model params : {model._model.num_params()/1e6:.1f}M")
    print(f" Checkpoint : {args.checkpoint}")
    print(" Eval mode : zero-shot")
    print(rule)

    # Save full results for the model card / leaderboard PR
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results["results"], f, indent=2)
    print(f"\n[eval] full results saved -> {args.output}")


if __name__ == "__main__":
    main()