| """ |
| Eval harness for İvme-Conversate. |
| |
| Wraps the custom model + tokenizer in an lm-eval compatible interface and runs |
| HellaSwag and ARC-Easy — the two benchmarks scored on the Tiny-ML leaderboard. |
| |
| Usage: |
| python eval.py --checkpoint checkpoints/ivme_base_ema.pt |
| python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy |
| python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy,piqa |
| |
| Requirements: |
| pip install lm-eval tokenizers torch |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import sys |
| import torch |
| import numpy as np |
| from tokenizers import Tokenizer |
|
|
| |
| from lm_eval.api.model import LM |
| from lm_eval.api.instance import Instance |
| import lm_eval |
|
|
| |
| sys.path.insert(0, ".") |
| from model import IvmeConfig, IvmeConversate |
|
|
# Tokenizer definition file; a relative path, so it is resolved against the
# current working directory when the tokenizer is loaded.
TOKENIZER_PATH = "ivme_tokenizer.json"

# Comma-separated task names used when --tasks is not given on the command line.
DEFAULT_TASKS = "hellaswag,arc_easy"
|
|
|
|
| |
| |
| |
class IvmeLM(LM):
    """lm-eval adapter around an İvme-Conversate checkpoint and its tokenizer.

    Implements the three request types lm-eval issues (``loglikelihood``,
    ``loglikelihood_rolling`` and ``generate_until``). Sequences are scored one
    at a time; ``batch_size`` only controls how requests are grouped.
    """

    def __init__(self, checkpoint_path: str, device: str = "cuda", batch_size: int = 32):
        """Load the tokenizer from ``TOKENIZER_PATH`` and the model from a checkpoint.

        Args:
            checkpoint_path: Path to a ``torch.save``'d dict holding ``"cfg"``
                (the model config object) and ``"model"`` (the state dict).
            device: Requested device; falls back to CPU when CUDA is unavailable.
            batch_size: Number of requests grouped per ``_loglikelihood_batch`` call.
        """
        super().__init__()
        self._device = torch.device(device if torch.cuda.is_available() else "cpu")
        self._batch_size = batch_size

        print(f"[eval] loading tokenizer from {TOKENIZER_PATH}")
        self._tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
        # Scoring needs the exact, unpadded token sequence; windowing is done here.
        self._tokenizer.no_truncation()
        self._tokenizer.no_padding()
        self.vocab_size = self._tokenizer.get_vocab_size()
        # None when the tokenizer has no "<|eos|>" entry.
        self.eos_token_id = self._tokenizer.token_to_id("<|eos|>")

        print(f"[eval] loading model from {checkpoint_path}")
        # NOTE(security): weights_only=False unpickles arbitrary Python objects
        # (needed because the checkpoint stores the config object). Only load
        # checkpoints from a trusted source.
        ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
        cfg = ckpt["cfg"]
        # Override whatever attention backend the checkpoint was trained with.
        cfg.attn_backend = "sdpa"
        self._model = IvmeConversate(cfg)
        self._model.load_state_dict(ckpt["model"])
        self._model.to(self._device)
        self._model.eval()
        n = self._model.num_params()
        print(f"[eval] model loaded: {n/1e6:.1f}M params on {self._device}")

    @property
    def max_length(self) -> int:
        """Maximum number of tokens fed to the model in one forward pass."""
        return self._model.cfg.max_seq_len

    @property
    def max_gen_toks(self) -> int:
        """Default generation budget when a request does not specify one."""
        return 256

    def tok_encode(self, text: str) -> list[int]:
        """Return the token ids for ``text``."""
        return self._tokenizer.encode(text).ids

    def tok_decode(self, tokens: list[int]) -> str:
        """Return the text for ``tokens`` using the tokenizer's default decode settings."""
        return self._tokenizer.decode(tokens)

    def _forward_logits(self, token_ids: list[int]) -> torch.Tensor:
        """Run one sequence through the model and return its per-position logits.

        The model call is expected to return ``(logits, _)`` with a leading
        batch dimension; row ``i`` of the result scores the token at ``i + 1``.
        bfloat16 autocast is enabled on CUDA only.
        """
        input_ids = torch.tensor([token_ids], dtype=torch.long, device=self._device)
        with torch.no_grad(), torch.autocast(
            device_type=self._device.type,
            dtype=torch.bfloat16,
            enabled=self._device.type == "cuda",
        ):
            logits, _ = self._model(input_ids)
        return logits[0]

    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        """Compute log-likelihood of each (context, continuation) pair.

        Returns one ``(sum_logprob, is_greedy)`` tuple per request, in order.
        """
        results = []
        for offset in range(0, len(requests), self._batch_size):
            results.extend(
                self._loglikelihood_batch(requests[offset : offset + self._batch_size])
            )
        return results

    def _loglikelihood_batch(self, batch: list[Instance]) -> list[tuple[float, bool]]:
        """Score each request in ``batch`` with its own forward pass.

        The continuation length is taken as the difference between the token
        counts of ``context + continuation`` and ``context`` alone. Requests
        with no context token to condition on get ``(-inf, False)``.
        """
        results = []
        for req in batch:
            context, continuation = req.args

            ctx_ids = self.tok_encode(context)
            full_ids = self.tok_encode(context + continuation)
            # Tokens may merge across the boundary; always score at least one.
            cont_len = max(1, len(full_ids) - len(ctx_ids))

            if len(full_ids) < cont_len + 1:
                # Nothing precedes the continuation, so its first token
                # cannot be predicted.
                results.append((-float("inf"), False))
                continue

            # Keep the most recent tokens when the sequence exceeds the window.
            all_ids = full_ids[-self.max_length:]
            # The first token in the window has no logit predicting it, so at
            # most len - 1 tokens are scoreable. Without this clamp a
            # continuation filling the whole window was paired with the wrong
            # logits (or failed on mismatched index shapes).
            cont_len = min(cont_len, len(all_ids) - 1)
            if cont_len < 1:
                results.append((-float("inf"), False))
                continue

            logits = self._forward_logits(all_ids)

            # Logit row i predicts token i + 1, hence the extra -1.
            start = len(all_ids) - cont_len - 1
            cont_logits = logits[start : start + cont_len, :]
            cont_targets = torch.tensor(
                all_ids[-cont_len:], dtype=torch.long, device=self._device
            )

            log_probs = torch.nn.functional.log_softmax(cont_logits.float(), dim=-1)
            token_log_probs = log_probs.gather(-1, cont_targets.unsqueeze(-1)).squeeze(-1)
            total_log_prob = token_log_probs.sum().item()

            # True when every continuation token is also the argmax prediction.
            greedy = (cont_logits.argmax(dim=-1) == cont_targets).all().item()
            results.append((total_log_prob, bool(greedy)))

        return results

    def loglikelihood_rolling(self, requests: list[Instance]) -> list[float]:
        """Compute rolling log-likelihood for perplexity tasks.

        The text is split into consecutive windows of ``max_length`` inputs;
        each window is scored independently (no context carried over) and the
        very first token is not scored.
        """
        window = self.max_length
        results = []
        for req in requests:
            ids = self.tok_encode(req.args[0])
            total_ll = 0.0
            for start in range(0, max(1, len(ids) - 1), window):
                # One extra token so the last input has a target.
                chunk = ids[start : start + window + 1]
                if len(chunk) < 2:
                    break
                logits = self._forward_logits(chunk[:-1])
                targets = torch.tensor(chunk[1:], dtype=torch.long, device=self._device)
                log_probs = torch.nn.functional.log_softmax(logits.float(), dim=-1)
                total_ll += log_probs.gather(-1, targets.unsqueeze(-1)).sum().item()
            results.append(total_ll)
        return results

    def generate_until(self, requests: list[Instance]) -> list[str]:
        """Greedy generation until stop string (used by some tasks).

        Each request carries ``(context, gen_kwargs)``; ``gen_kwargs`` may set
        ``until`` (a stop string or list of them) and ``max_gen_toks``. The
        returned text is cut at the first EOS token and at the earliest
        occurrence of any stop string.
        """
        results = []
        for req in requests:
            context, gen_kwargs = req.args
            until = gen_kwargs.get("until", ["<|eos|>"])
            # A bare string would otherwise be iterated character by character.
            if until is None:
                until = []
            elif isinstance(until, str):
                until = [until]
            max_new = gen_kwargs.get("max_gen_toks", self.max_gen_toks)

            ids = torch.tensor(
                [self.tok_encode(context)], dtype=torch.long, device=self._device
            )
            # No gradients are needed for generation.
            with torch.no_grad():
                out = self._model.generate(
                    ids, max_new_tokens=max_new, temperature=1.0, top_k=1
                )
            new_ids = out[0, ids.shape[1]:].tolist()

            # Cut at EOS on the id level: decode() drops special tokens by
            # default, so if "<|eos|>" is registered as special the matching
            # stop string would never appear in the decoded text.
            if self.eos_token_id is not None and self.eos_token_id in new_ids:
                new_ids = new_ids[: new_ids.index(self.eos_token_id)]

            text = self.tok_decode(new_ids)
            for stop in until:
                # Skip empty stop strings; they match at index 0 and would
                # erase the whole output.
                if stop and stop in text:
                    text = text[: text.index(stop)]
            results.append(text)
        return results
|
|
|
|
| |
| |
| |
def main():
    """Parse CLI arguments, run the zero-shot evaluation and report the results.

    Prints a per-task accuracy table to stdout and writes the ``"results"``
    section of the lm-eval output to the ``--output`` JSON file.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--checkpoint", required=True)
    ap.add_argument("--tasks", default=DEFAULT_TASKS)
    ap.add_argument("--batch_size", type=int, default=32)
    ap.add_argument("--device", default="cuda")
    ap.add_argument("--output", default="eval_results.json")
    args = ap.parse_args()

    # Drop empty entries (e.g. from a trailing comma) and fail before the
    # checkpoint is loaded if nothing is left.
    task_list = [name for name in (t.strip() for t in args.tasks.split(",")) if name]
    if not task_list:
        ap.error("--tasks must name at least one task")

    model = IvmeLM(args.checkpoint, device=args.device, batch_size=args.batch_size)

    print(f"\n[eval] running tasks: {task_list}")
    results = lm_eval.simple_evaluate(
        model=model,
        tasks=task_list,
        num_fewshot=0,
        batch_size=args.batch_size,
        log_samples=False,
    )
    if results is None:
        # NOTE(review): lm-eval appears to return None on non-zero ranks of a
        # distributed run; there is nothing to report in that case — confirm.
        return

    rule = "=" * 52
    print("\n" + rule)
    print(" İvme-Conversate Eval Results")
    print(rule)
    for task, metrics in results["results"].items():
        # Prefer plain accuracy, then normalized accuracy. Compare against None
        # so that a genuine 0.0 accuracy is reported instead of skipped.
        acc = next(
            (
                metrics[key]
                for key in ("acc,none", "acc_norm,none")
                if metrics.get(key) is not None
            ),
            0.0,
        )
        print(f" {task:<20} {acc*100:.2f}%")
    print(rule)
    print(f" Model params : {model._model.num_params()/1e6:.1f}M")
    print(f" Checkpoint : {args.checkpoint}")
    print(" Eval mode : zero-shot")
    print(rule)

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results["results"], f, indent=2)
    print(f"\n[eval] full results saved -> {args.output}")
|
|
|
|
# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()