theapemachine
/

cortex

Model card Files Files and versions

xet

Community

theapemachine committed 22 days ago

Commit

4c1ba64

verified ·

1 Parent(s): c7bee4d

Add benchmark harness: scoring.py

Browse files

Files changed (1) hide show

benchmark/scoring.py +168 -0

benchmark/scoring.py ADDED Viewed

	@@ -0,0 +1,168 @@

+"""
+Scoring utilities for the Cortex benchmark harness.
+Two evaluation modes:
+  1. Log-likelihood scoring: For multiple-choice tasks (HellaSwag, ARC, PIQA, etc.)
+     Computes the average log-probability the model assigns to each continuation.
+  2. Generation scoring: For free-form generation tasks (passkey retrieval, etc.)
+     Generates text and checks against expected patterns.
+"""
+import torch
+import torch.nn.functional as F
+from typing import List, Optional, Tuple, Dict
+import re
@torch.no_grad()
def log_likelihood_score(
    model,
    tokenizer,
    context: str,
    continuations: List[str],
    device: str = "cuda",
) -> List[float]:
    """
    Compute the length-normalized log-likelihood of each continuation.

    For each continuation the function tokenizes ``context + continuation``,
    runs one forward pass, and averages the log-probabilities of the
    continuation tokens only.

    Edge cases:
      * Empty context: the first continuation token has nothing to be
        conditioned on. The tokenizer's BOS id (falling back to the EOS id)
        is prepended when available; otherwise the first continuation token
        is used purely as conditioning and is not scored.
      * Over-long input: the sequence is truncated from the LEFT so that the
        continuation (the part being scored) is preserved and only the oldest
        context tokens are dropped.
      * A continuation that contributes no scorable tokens gets ``-inf``.

    Args:
        model: Callable language model; ``model(input_ids)`` must return an
            object with a ``logits`` attribute of shape
            ``[1, seq_len, vocab_size]`` and expose
            ``config.max_position_embeddings`` (2048 is assumed if absent).
        tokenizer: Tokenizer exposing ``encode(text, add_special_tokens=False)``.
        context: The prompt/context string.
        continuations: Candidate continuations to score.
        device: Device the input tensor is created on.

    Returns:
        One score per continuation, in input order (higher = more likely).
    """
    # Both of these are identical for every continuation: compute them once.
    ctx_ids = tokenizer.encode(context, add_special_tokens=False)
    max_len = getattr(model.config, "max_position_embeddings", 2048)

    # With an empty context we need some token to condition the first
    # continuation token on.
    prefix_id = None
    if not ctx_ids:
        prefix_id = getattr(tokenizer, "bos_token_id", None)
        if prefix_id is None:
            prefix_id = getattr(tokenizer, "eos_token_id", None)

    scores = []
    for cont in continuations:
        # Tokenize the joined text so tokens merging across the boundary are
        # handled by the tokenizer; the continuation is whatever follows the
        # first len(ctx_ids) tokens.
        full_ids = list(tokenizer.encode(context + cont, add_special_tokens=False))
        cont_start = len(ctx_ids)
        if len(full_ids) <= cont_start:
            scores.append(float("-inf"))
            continue

        if cont_start == 0:
            if prefix_id is not None:
                full_ids = [prefix_id] + full_ids
            # Either the prefix token or (lacking one) the first continuation
            # token now occupies position 0 as pure conditioning.
            cont_start = 1

        # Left-truncate: keep the most recent tokens so the continuation
        # survives; always leave at least one conditioning position.
        overflow = len(full_ids) - max_len
        if overflow > 0:
            full_ids = full_ids[overflow:]
            cont_start = max(cont_start - overflow, 1)

        cont_length = len(full_ids) - cont_start
        if cont_length <= 0:
            scores.append(float("-inf"))
            continue

        input_ids = torch.tensor([full_ids], device=device)
        logits = model(input_ids).logits  # [1, seq_len, vocab_size]

        # logits[i] predicts token[i + 1], so the continuation tokens at
        # [cont_start, cont_start + cont_length) are predicted by the logits
        # at [cont_start - 1, cont_start + cont_length - 1).
        shift_logits = logits[0, cont_start - 1 : cont_start + cont_length - 1, :]
        shift_labels = input_ids[0, cont_start : cont_start + cont_length]

        # float32 keeps log-softmax stable when the model runs in half precision.
        log_probs = F.log_softmax(shift_logits.float(), dim=-1)
        token_log_probs = log_probs.gather(1, shift_labels.unsqueeze(1)).squeeze(1)

        # Average per token so longer continuations are not penalized.
        scores.append(token_log_probs.mean().item())
    return scores
@torch.no_grad()
def generate_and_check(
    model,
    tokenizer,
    prompt: str,
    expected: str,
    max_new_tokens: int = 64,
    device: str = "cuda",
    exact_match: bool = False,
) -> Tuple[bool, str]:
    """
    Generate a completion for ``prompt`` and check it against ``expected``.

    Generation runs with sampling disabled. Only the newly generated tokens
    (everything after the prompt) are decoded and compared. Comparison is
    case-insensitive and ignores surrounding whitespace.

    Args:
        model: Model exposing ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer that also exposes ``decode``,
            ``pad_token_id`` and ``eos_token_id``.
        prompt: The input prompt.
        expected: The expected answer string.
        max_new_tokens: Maximum number of tokens to generate.
        device: Device the tokenized inputs are moved to.
        exact_match: If True, the whole generation must equal ``expected``;
            otherwise ``expected`` only has to occur somewhere in it.

    Returns:
        ``(is_correct, generated_text)`` where ``generated_text`` is the
        stripped decoded completion. In substring mode an empty (or
        whitespace-only) ``expected`` is never counted as correct, because
        the empty string occurs in every output.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)

    # generate() needs a pad id; fall back to EOS when the tokenizer has none.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        temperature=1.0,
        pad_token_id=pad_token_id,
    )

    # The output echoes the prompt first; keep only what comes after it.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = output_ids[0, prompt_len:]
    generated = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    target = expected.strip().lower()
    candidate = generated.lower()
    if exact_match:
        is_correct = candidate == target
    else:
        # Guard against the vacuous match: "" is a substring of everything.
        is_correct = bool(target) and target in candidate
    return is_correct, generated
def accuracy_from_loglikelihoods(
    scores_per_example: List[Tuple[List[float], int]],
) -> Dict[str, float]:
    """
    Compute multiple-choice accuracy from per-choice log-likelihood scores.

    The predicted choice for an example is the index with the highest score;
    ties resolve to the lowest index. NaN scores are ranked as ``-inf`` so a
    NaN can never win merely because comparisons against it are always False.
    An example with no scores at all is counted as incorrect rather than
    raising.

    Args:
        scores_per_example: List of ``(scores_for_each_choice, correct_index)``.

    Returns:
        Dict with ``"accuracy"`` (0.0 when there are no examples),
        ``"correct"`` and ``"total"``.
    """
    import math

    def rank(score: float) -> float:
        # NaN is unordered; demote it so it loses to every real score.
        return float("-inf") if math.isnan(score) else score

    correct = 0
    total = len(scores_per_example)
    for scores, gold_idx in scores_per_example:
        if not scores:
            # Nothing to predict from; still counted in the total.
            continue
        predicted = max(range(len(scores)), key=lambda i: rank(scores[i]))
        if predicted == gold_idx:
            correct += 1
    return {
        "accuracy": correct / total if total > 0 else 0.0,
        "correct": correct,
        "total": total,
    }