Upload eval.py with huggingface_hub

44217ec verified 1 day ago

9.55 kB

	"""
	Eval harness for İvme-Conversate.

	Wraps the custom model + tokenizer in an lm-eval compatible interface and runs
	HellaSwag and ARC-Easy — the two benchmarks scored on the Tiny-ML leaderboard.

	Usage:
	    python eval.py --checkpoint checkpoints/ivme_base_ema.pt
	    python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy
	    python eval.py --checkpoint checkpoints/ivme_base_ema.pt --tasks hellaswag,arc_easy,piqa

	Requirements:
	    pip install lm-eval tokenizers torch
	"""

	from __future__ import annotations

	import argparse
	import json
	import sys
	import torch
	import numpy as np
	from tokenizers import Tokenizer

	# lm-eval imports
	from lm_eval.api.model import LM
	from lm_eval.api.instance import Instance
	import lm_eval

	# Local
	sys.path.insert(0, ".")
	from model import IvmeConfig, IvmeConversate

	# Tokenizer file passed to Tokenizer.from_file(); a relative path, so it is
	# resolved against the current working directory.
	TOKENIZER_PATH = "ivme_tokenizer.json"
	# Comma-separated task names used when --tasks is not given on the CLI.
	DEFAULT_TASKS = "hellaswag,arc_easy"


	# --------------------------------------------------------------------------- #
	# lm-eval wrapper
	# --------------------------------------------------------------------------- #
	class IvmeLM(LM):
	    """lm-eval ``LM`` adapter around an İvme-Conversate checkpoint.

	    Loads the tokenizer from ``TOKENIZER_PATH`` and the model from a
	    checkpoint file, then implements the three request types lm-eval issues:
	    ``loglikelihood``, ``loglikelihood_rolling`` and ``generate_until``.
	    """

	    # Text form of the end-of-sequence token: used to look up its id and as
	    # the default stop string for generation.
	    EOS_TOKEN = "<|eos|>"

	    def __init__(self, checkpoint_path: str, device: str = "cuda", batch_size: int = 32):
	        """Load the tokenizer and the model checkpoint.

	        Args:
	            checkpoint_path: Path to a ``torch.save``-d dict holding the keys
	                ``"cfg"`` (model config) and ``"model"`` (state dict).
	            device: Requested torch device; CPU is used instead whenever CUDA
	                is not available.
	            batch_size: Number of requests handed to each
	                ``_loglikelihood_batch`` call.

	        Raises:
	            ValueError: If ``batch_size`` is smaller than 1.
	        """
	        super().__init__()
	        if batch_size < 1:
	            # range(0, n, 0) in loglikelihood() would otherwise fail with an
	            # opaque "arg 3 must not be zero".
	            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
	        self._device = torch.device(device if torch.cuda.is_available() else "cpu")
	        self._batch_size = batch_size

	        # Load tokenizer
	        print(f"[eval] loading tokenizer from {TOKENIZER_PATH}")
	        self._tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
	        self._tokenizer.no_truncation()
	        self._tokenizer.no_padding()
	        self.vocab_size = self._tokenizer.get_vocab_size()
	        # token_to_id() yields None when the token is not in the vocabulary;
	        # generate_until() checks for that before using the id.
	        self.eos_token_id = self._tokenizer.token_to_id(self.EOS_TOKEN)

	        # Load model
	        print(f"[eval] loading model from {checkpoint_path}")
	        # NOTE(security): weights_only=False unpickles arbitrary Python objects
	        # (the checkpoint carries a config object under "cfg", presumably an
	        # IvmeConfig). Only load checkpoints from a trusted source.
	        ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
	        cfg = ckpt["cfg"]
	        # Force SDPA for eval — no training kernels needed, wider compatibility
	        cfg.attn_backend = "sdpa"
	        self._model = IvmeConversate(cfg)
	        self._model.load_state_dict(ckpt["model"])
	        self._model.to(self._device)
	        self._model.eval()
	        n = self._model.num_params()
	        print(f"[eval] model loaded: {n/1e6:.1f}M params on {self._device}")

	    @property
	    def max_length(self):
	        """Context window size, read from the model config's ``max_seq_len``."""
	        return self._model.cfg.max_seq_len

	    @property
	    def max_gen_toks(self):
	        """Default cap on newly generated tokens for ``generate_until``."""
	        return 256

	    def tok_encode(self, text: str) -> list[int]:
	        """Return the token ids of ``text``."""
	        return self._tokenizer.encode(text).ids

	    def tok_decode(self, tokens: list[int]) -> str:
	        """Return the text decoded from ``tokens``."""
	        return self._tokenizer.decode(tokens)

	    def _forward_logits(self, input_ids: torch.Tensor) -> torch.Tensor:
	        """Run the model on ``input_ids`` without gradients and return logits.

	        bfloat16 autocast is enabled only on CUDA devices. The model call is
	        expected to return a ``(logits, extra)`` pair; only logits are used.
	        """
	        on_cuda = self._device.type == "cuda"
	        with torch.no_grad(), torch.autocast(
	            device_type=self._device.type,
	            dtype=torch.bfloat16,
	            enabled=on_cuda,
	        ):
	            logits, _ = self._model(input_ids)
	        return logits

	    # ---- Required lm-eval interface methods -------------------------------- #

	    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
	        """Compute log-likelihood of each (context, continuation) pair.

	        Returns one ``(sum_log_prob, is_greedy)`` tuple per request, in request
	        order.
	        """
	        results = []
	        for i in range(0, len(requests), self._batch_size):
	            batch = requests[i : i + self._batch_size]
	            results.extend(self._loglikelihood_batch(batch))
	        return results

	    def _loglikelihood_batch(self, batch: list[Instance]) -> list[tuple[float, bool]]:
	        """Score every request in ``batch``.

	        Requests are run through the model one at a time (no padding or
	        batched forward pass); the grouping only bounds the slice size.
	        """
	        return [self._score_continuation(*req.args) for req in batch]

	    def _score_continuation(self, context: str, continuation: str) -> tuple[float, bool]:
	        """Return ``(log P(continuation | context), is_greedy)`` for one pair.

	        ``is_greedy`` is True when every scored continuation token is also the
	        argmax prediction at its position. When there is no context token to
	        condition on, ``(-inf, False)`` is returned.
	        """
	        # CRITICAL: tokenize context+continuation JOINTLY. With ByteLevel BPE,
	        # tokenizing the continuation alone mishandles the leading space and
	        # word-boundary merges, so the scored tokens wouldn't match what the
	        # model actually predicts in context. The context is encoded alone only
	        # to measure where the continuation's token span starts.
	        ctx_ids = self.tok_encode(context)
	        full_ids = self.tok_encode(context + continuation)
	        cont_len = len(full_ids) - len(ctx_ids)

	        # Guard: joint tokenization can merge across the boundary leaving
	        # cont_len=0 or even negative. Fall back to scoring the last token.
	        if cont_len <= 0:
	            cont_len = 1
	        if len(full_ids) < cont_len + 1:
	            # Sequence too short to score anything meaningful — skip.
	            return (-float("inf"), False)

	        # Truncate from the left if too long, keeping the tail (continuation).
	        all_ids = full_ids[-self.max_length:]
	        # The logit at position i predicts token i+1, so the first token left
	        # in the window can never be scored. If truncation consumed the whole
	        # context, drop the earliest continuation tokens instead of reading
	        # logits that are misaligned with (or fewer than) the targets.
	        cont_len = min(cont_len, len(all_ids) - 1)
	        if cont_len < 1:
	            return (-float("inf"), False)

	        input_ids = torch.tensor([all_ids], dtype=torch.long, device=self._device)
	        # Assumes the model returns logits for every input position, i.e.
	        # shape (1, seq_len, vocab) — TODO confirm against model.py.
	        logits = self._forward_logits(input_ids)

	        # Log-probs for the continuation tokens only: to score the last
	        # cont_len tokens we read logits at [len-cont_len-1 : len-1].
	        start = len(all_ids) - cont_len - 1
	        cont_logits = logits[0, start : start + cont_len, :]  # (cont_len, vocab)
	        cont_targets = torch.tensor(
	            all_ids[-cont_len:], dtype=torch.long, device=self._device
	        )

	        log_probs = torch.nn.functional.log_softmax(cont_logits.float(), dim=-1)
	        token_log_probs = log_probs.gather(1, cont_targets.unsqueeze(1)).squeeze(1)
	        total_log_prob = token_log_probs.sum().item()

	        greedy = (cont_logits.argmax(dim=-1) == cont_targets).all().item()
	        return (total_log_prob, bool(greedy))

	    def loglikelihood_rolling(self, requests: list[Instance]) -> list[float]:
	        """Compute rolling log-likelihood for perplexity tasks.

	        Each text is split into consecutive windows of at most ``max_length``
	        input tokens. Neighbouring windows share exactly one token (the last
	        target of one is the first input of the next), so every token except
	        the very first of the text is scored exactly once. Texts with fewer
	        than two tokens score 0.0.
	        """
	        window = self.max_length
	        results = []
	        for req in requests:
	            ids = self.tok_encode(req.args[0])
	            total_ll = 0.0
	            for start in range(0, max(1, len(ids) - 1), window):
	                # window inputs plus one extra token for the final target.
	                chunk = ids[start : start + window + 1]
	                if len(chunk) < 2:
	                    break
	                inp = torch.tensor([chunk[:-1]], dtype=torch.long, device=self._device)
	                tgt = torch.tensor(chunk[1:], dtype=torch.long, device=self._device)
	                logits = self._forward_logits(inp)
	                log_probs = torch.nn.functional.log_softmax(logits[0].float(), dim=-1)
	                total_ll += log_probs.gather(1, tgt.unsqueeze(1)).sum().item()
	            results.append(total_ll)
	        return results

	    def generate_until(self, requests: list[Instance]) -> list[str]:
	        """Greedy generation until stop string (used by some tasks).

	        For each request the generated text is cut at the EOS token (when the
	        tokenizer defines one) and at the earliest occurrence of any stop
	        string in ``gen_kwargs["until"]``; the stop string itself is removed.
	        """
	        results = []
	        for req in requests:
	            context, gen_kwargs = req.args
	            until = gen_kwargs.get("until", [self.EOS_TOKEN])
	            # "until" may be a single string; iterating it directly would
	            # treat every character as its own stop sequence.
	            if isinstance(until, str):
	                until = [until]
	            elif until is None:
	                until = []
	            max_new = gen_kwargs.get("max_gen_toks", self.max_gen_toks)

	            # Keep only the most recent tokens so the prompt itself fits in
	            # the model's context window.
	            prompt_ids = self.tok_encode(context)[-self.max_length:]
	            ids = torch.tensor([prompt_ids], dtype=torch.long, device=self._device)
	            with torch.no_grad():
	                out = self._model.generate(ids, max_new_tokens=max_new,
	                                           temperature=1.0, top_k=1)  # greedy
	            new_ids = out[0, ids.shape[1]:].tolist()

	            # Tokenizer.decode() skips special tokens by default, so an EOS
	            # stop string may never show up in the decoded text; cut on the
	            # token id first.
	            eos_id = self.eos_token_id
	            if eos_id is not None and eos_id in new_ids:
	                new_ids = new_ids[: new_ids.index(eos_id)]

	            text = self.tok_decode(new_ids)
	            for stop in until:
	                # An empty stop string matches at index 0 and would erase
	                # the whole output.
	                if stop and stop in text:
	                    text = text[: text.index(stop)]
	            results.append(text)
	        return results


	# --------------------------------------------------------------------------- #
	# Main
	# --------------------------------------------------------------------------- #
	def main():
	    """CLI entry point: evaluate a checkpoint and report per-task accuracy.

	    Parses the command line, wraps the checkpoint in ``IvmeLM``, runs the
	    requested tasks zero-shot through ``lm_eval.simple_evaluate``, prints a
	    summary table, and writes the per-task results to ``--output`` as JSON.
	    """
	    ap = argparse.ArgumentParser()
	    ap.add_argument("--checkpoint", required=True)
	    ap.add_argument("--tasks", default=DEFAULT_TASKS)
	    ap.add_argument("--batch_size", type=int, default=32)
	    ap.add_argument("--device", default="cuda")
	    ap.add_argument("--output", default="eval_results.json")
	    args = ap.parse_args()

	    # Drop empty entries left by stray or trailing commas ("a,,b", "a,") and
	    # reject an empty list before paying for the model load.
	    task_list = [name for name in (t.strip() for t in args.tasks.split(",")) if name]
	    if not task_list:
	        ap.error("--tasks must name at least one task")

	    model = IvmeLM(args.checkpoint, device=args.device, batch_size=args.batch_size)

	    print(f"\n[eval] running tasks: {task_list}")
	    results = lm_eval.simple_evaluate(
	        model=model,
	        tasks=task_list,
	        num_fewshot=0,  # zero-shot, matching the leaderboard
	        batch_size=args.batch_size,
	        log_samples=False,
	    )
	    if results is None:
	        # simple_evaluate returns None on non-primary ranks of a distributed
	        # run; there is nothing to print or save in that case.
	        return

	    # Print a clean summary
	    print("\n" + "=" * 52)
	    print(" İvme-Conversate Eval Results")
	    print("=" * 52)
	    for task, metrics in results["results"].items():
	        # Compare against None explicitly: a genuine accuracy of 0.0 is falsy
	        # and must not fall through to acc_norm.
	        acc = metrics.get("acc,none")
	        if acc is None:
	            acc = metrics.get("acc_norm,none")
	        if acc is None:
	            print(f" {task:<20} n/a")
	        else:
	            print(f" {task:<20} {acc*100:.2f}%")
	    print("=" * 52)
	    print(f" Model params : {model._model.num_params()/1e6:.1f}M")
	    print(f" Checkpoint : {args.checkpoint}")
	    print(" Eval mode : zero-shot")
	    print("=" * 52)

	    # Save full results for the model card / leaderboard PR
	    with open(args.output, "w", encoding="utf-8") as f:
	        json.dump(results["results"], f, indent=2)
	    print(f"\n[eval] full results saved -> {args.output}")


	# Run the evaluation only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()