"""
Eval harness for İvme-Conversate.

Wraps the custom model + tokenizer in an lm-eval compatible interface and runs
HellaSwag and ARC-Easy — the two benchmarks scored on the Tiny-ML leaderboard.

Usage:
    python eval.py --checkpoint checkpoints/ivme_base_ema.pt
    python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy
    python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy,piqa

Requirements:
    pip install lm-eval tokenizers torch
"""

from __future__ import annotations

import argparse
import json
import sys

import torch
import numpy as np
from tokenizers import Tokenizer

# lm-eval imports
from lm_eval.api.model import LM
from lm_eval.api.instance import Instance
import lm_eval

# Local
sys.path.insert(0, ".")
from model import IvmeConfig, IvmeConversate

TOKENIZER_PATH = "ivme_tokenizer.json"
DEFAULT_TASKS = "hellaswag,arc_easy"


# --------------------------------------------------------------------------- #
# lm-eval wrapper
# --------------------------------------------------------------------------- #
class IvmeLM(LM):
    """lm-eval ``LM`` adapter around an İvme-Conversate checkpoint.

    Loads the tokenizer from ``TOKENIZER_PATH`` and the model from a checkpoint
    dict holding ``"cfg"`` and ``"model"`` entries, then exposes the three
    request types lm-eval issues: ``loglikelihood``, ``loglikelihood_rolling``
    and ``generate_until``.
    """

    def __init__(self, checkpoint_path: str, device: str = "cuda", batch_size: int = 32):
        """Load tokenizer and model.

        Args:
            checkpoint_path: Path to a ``torch.save``d checkpoint dict.
            device: Requested torch device string. A CUDA request falls back
                to CPU when CUDA is unavailable; other devices are honoured.
            batch_size: Number of requests handed to each scoring batch.
        """
        super().__init__()
        # Only downgrade to CPU when CUDA was asked for but is not present;
        # an explicit non-CUDA device must not be silently overridden.
        if str(device).startswith("cuda") and not torch.cuda.is_available():
            device = "cpu"
        self._device = torch.device(device)
        self._batch_size = batch_size

        # Load tokenizer
        print(f"[eval] loading tokenizer from {TOKENIZER_PATH}")
        self._tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
        self._tokenizer.no_truncation()
        self._tokenizer.no_padding()
        self.vocab_size = self._tokenizer.get_vocab_size()
        self.eos_token_id = self._tokenizer.token_to_id("<|eos|>")

        # Load model
        print(f"[eval] loading model from {checkpoint_path}")
        # SECURITY NOTE: weights_only=False unpickles arbitrary objects (needed
        # here because the checkpoint stores the config object). Only load
        # checkpoints from a trusted source.
        ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
        cfg = ckpt["cfg"]
        # Force SDPA for eval — no training kernels needed, wider compatibility
        cfg.attn_backend = "sdpa"
        self._model = IvmeConversate(cfg)
        self._model.load_state_dict(ckpt["model"])
        self._model.to(self._device)
        self._model.eval()
        n = self._model.num_params()
        print(f"[eval] model loaded: {n/1e6:.1f}M params on {self._device}")

    @property
    def max_length(self):
        """Maximum number of input tokens, taken from the model config."""
        return self._model.cfg.max_seq_len

    @property
    def max_gen_toks(self):
        """Default cap on newly generated tokens for ``generate_until``."""
        return 256

    def tok_encode(self, text: str) -> list[int]:
        """Return the token ids of *text*."""
        return self._tokenizer.encode(text).ids

    def tok_decode(self, tokens: list[int]) -> str:
        """Return the text decoded from *tokens*."""
        return self._tokenizer.decode(tokens)

    def _forward_logits(self, token_ids: list[int]) -> torch.Tensor:
        """Run one un-batched forward pass and return the first batch row of logits.

        Gradients are disabled; bfloat16 autocast is used on CUDA only.
        Assumes the model returns per-position logits as the first element of
        a 2-tuple, as the scoring code below relies on.
        """
        input_ids = torch.tensor([token_ids], dtype=torch.long, device=self._device)
        with torch.no_grad():
            if self._device.type == "cuda":
                with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                    logits, _ = self._model(input_ids)
            else:
                logits, _ = self._model(input_ids)
        return logits[0]

    # ---- Required lm-eval interface methods -------------------------------- #
    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        """Compute log-likelihood of each (context, continuation) pair.

        Returns one ``(sum_log_prob, is_greedy)`` tuple per request, in order.
        """
        results = []
        for i in range(0, len(requests), self._batch_size):
            batch = requests[i : i + self._batch_size]
            results.extend(self._loglikelihood_batch(batch))
        return results

    def _loglikelihood_batch(self, batch: list[Instance]) -> list[tuple[float, bool]]:
        """Score each request in *batch* (one forward pass per request)."""
        results = []
        for req in batch:
            context, continuation = req.args

            # CRITICAL: tokenize context+continuation JOINTLY. With ByteLevel BPE,
            # tokenizing the continuation alone mishandles the leading space and
            # word-boundary merges, so the scored tokens wouldn't match what the
            # model actually predicts in context. We find the continuation's token
            # span by encoding the context alone only to measure its length.
            ctx_ids = self.tok_encode(context)
            full_ids = self.tok_encode(context + continuation)
            cont_len = len(full_ids) - len(ctx_ids)

            # Guard: joint tokenization can merge across the boundary leaving
            # cont_len=0 or even negative. Fall back to scoring the last token.
            if cont_len <= 0:
                cont_len = 1

            if len(full_ids) < cont_len + 1:
                # No preceding token to condition on (e.g. empty context) —
                # nothing meaningful can be scored.
                results.append((-float("inf"), False))
                continue

            # Truncate from the left if too long, always keeping the continuation.
            # The final token is only ever a target, never an input, so the
            # window may hold max_length + 1 tokens while the model still sees
            # at most max_length of them.
            window = full_ids[-(self.max_length + 1):]
            inputs = window[:-1]
            targets = window[1:]  # targets[j] is the token predicted by logits[j]

            # If the continuation alone overflows the window, only its tail
            # that still has a preceding input token can be scored.
            scored = min(cont_len, len(inputs))

            logits = self._forward_logits(inputs)  # (len(inputs), vocab)

            # Log-probs for the continuation tokens only.
            cont_logits = logits[-scored:]  # (scored, vocab)
            cont_targets = torch.tensor(
                targets[-scored:], dtype=torch.long, device=self._device
            )
            log_probs = torch.nn.functional.log_softmax(cont_logits.float(), dim=-1)
            token_log_probs = log_probs.gather(-1, cont_targets.unsqueeze(-1)).squeeze(-1)
            total_log_prob = token_log_probs.sum().item()
            greedy = (cont_logits.argmax(dim=-1) == cont_targets).all().item()
            results.append((total_log_prob, bool(greedy)))
        return results

    def loglikelihood_rolling(self, requests: list[Instance]) -> list[float]:
        """Compute rolling log-likelihood for perplexity tasks.

        Each text is split into consecutive windows of ``max_length`` predicted
        tokens. Windows do not carry context over from the previous window, and
        the very first token of the text is never scored (it has no prefix).
        """
        results = []
        for req in requests:
            text = req.args[0]
            ids = self.tok_encode(text)
            total_ll = 0.0
            # Slide a window of max_length over the tokens.
            for start in range(0, max(1, len(ids) - 1), self.max_length):
                chunk = ids[start : start + self.max_length + 1]
                if len(chunk) < 2:
                    break
                tgt = torch.tensor(chunk[1:], dtype=torch.long, device=self._device)
                logits = self._forward_logits(chunk[:-1])
                log_probs = torch.nn.functional.log_softmax(logits.float(), dim=-1)
                total_ll += log_probs.gather(-1, tgt.unsqueeze(-1)).sum().item()
            results.append(total_ll)
        return results

    def generate_until(self, requests: list[Instance]) -> list[str]:
        """Greedy generation until stop string (used by some tasks).

        ``until`` in the request's generation kwargs may be a single string or
        a list of strings; the output is cut at the earliest stop sequence.
        """
        results = []
        for req in requests:
            context, gen_kwargs = req.args
            until = gen_kwargs.get("until")
            if until is None:
                until = ["<|eos|>"]
            elif isinstance(until, str):
                # A bare string would otherwise be iterated character by character.
                until = [until]
            max_new = gen_kwargs.get("max_gen_toks", self.max_gen_toks)

            ids = torch.tensor(
                [self.tok_encode(context)], dtype=torch.long, device=self._device
            )
            with torch.no_grad():
                out = self._model.generate(
                    ids, max_new_tokens=max_new, temperature=1.0, top_k=1
                )  # greedy
            new_ids = out[0, ids.shape[1]:].tolist()
            text = self.tok_decode(new_ids)

            for stop in until:
                # Skip empty stop strings: they match everywhere and would
                # wipe the whole output.
                if stop:
                    text = text.partition(stop)[0]
            results.append(text)
        return results


# --------------------------------------------------------------------------- #
# Main
# --------------------------------------------------------------------------- #
def main():
    """Parse CLI arguments, run the requested tasks, print and save results."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--checkpoint", required=True)
    ap.add_argument("--tasks", default=DEFAULT_TASKS)
    ap.add_argument("--batch_size", type=int, default=32)
    ap.add_argument("--device", default="cuda")
    ap.add_argument("--output", default="eval_results.json")
    args = ap.parse_args()

    model = IvmeLM(args.checkpoint, device=args.device, batch_size=args.batch_size)

    task_list = [t.strip() for t in args.tasks.split(",")]
    print(f"\n[eval] running tasks: {task_list}")

    results = lm_eval.simple_evaluate(
        model=model,
        tasks=task_list,
        num_fewshot=0,  # zero-shot, matching the leaderboard
        batch_size=args.batch_size,
        log_samples=False,
    )

    # Print a clean summary
    print("\n" + "=" * 52)
    print(" İvme-Conversate Eval Results")
    print("=" * 52)
    for task, metrics in results["results"].items():
        # Explicit None checks: a genuine accuracy of 0.0 is falsy and must
        # not be replaced by acc_norm.
        acc = metrics.get("acc,none")
        if acc is None:
            acc = metrics.get("acc_norm,none")
        if acc is None:
            acc = 0.0
        print(f" {task:<20} {acc*100:.2f}%")
    print("=" * 52)
    print(f" Model params : {model._model.num_params()/1e6:.1f}M")
    print(f" Checkpoint : {args.checkpoint}")
    print(" Eval mode : zero-shot")
    print("=" * 52)

    # Save full results for the model card / leaderboard PR
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results["results"], f, indent=2)
    print(f"\n[eval] full results saved -> {args.output}")


if __name__ == "__main__":
    main()