"""
HumanEval benchmark evaluation script.
"""
import re
from typing import Any, Dict, List, Optional, Tuple
from datasets import load_dataset
from .base import Benchmarker
from .registry import BENCHMARKS
from .utils import create_simple_sgl_function


def extract_code_from_output(output: str) -> Optional[str]:
    """Extract Python code from model output.

    Tries to extract code blocks or function definitions.
    """
    # Try to find code in markdown code blocks
    code_block_pattern = r"```(?:python)?\n(.*?)```"
    match = re.search(code_block_pattern, output, re.DOTALL)
    if match:
        return match.group(1).strip()

    # Try to find a function definition (common in HumanEval):
    # look for "def " followed by code until the next def or end of string.
    def_pattern = r"(def\s+\w+\([^)]*\):.*?)(?=\n\ndef\s+|\Z)"
    match = re.search(def_pattern, output, re.DOTALL)
    if match:
        return match.group(1).strip()

    # Fallback: return the output as-is (might already be code)
    return output.strip() if output.strip() else None
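

# Illustrative behaviour of extract_code_from_output (example added here, not
# part of the original file); the strings below are hypothetical model outputs:
#
#   extract_code_from_output("```python\ndef add(a, b):\n    return a + b\n```")
#       -> "def add(a, b):\n    return a + b"    (markdown block wins)
#   extract_code_from_output("Sure!\ndef add(a, b):\n    return a + b")
#       -> "def add(a, b):\n    return a + b"    (bare "def" fallback)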


def check_code_passes_tests(code: str, test_code: str, entry_point: str) -> bool:
    """Check if generated code passes the test cases.

    This is a simplified version; for full evaluation, use the official
    HumanEval evaluation framework.

    A HumanEval "test" field defines a check(candidate) function whose body is
    a series of assertions. The generated code passes if check() runs on the
    generated entry-point function without raising an exception.
    """
    try:
        # Fresh namespace for execution. Note: exec on model output is not
        # sandboxed; run untrusted code in an isolated process or container.
        namespace = {}

        # Execute the generated code (function definition)
        exec(code, namespace)

        # Execute the test code, which defines check(candidate)
        exec(test_code, namespace)

        # Call check() on the generated entry-point function; its assertions
        # raise AssertionError on failure.
        check_fn = namespace.get("check")
        candidate = namespace.get(entry_point)
        if check_fn is None or candidate is None:
            return False
        check_fn(candidate)
        return True
    except AssertionError:
        # An assertion failed - the tests didn't pass
        return False
    except Exception:
        # Any other exception (syntax error, runtime error, etc.) counts as a failure
        return False
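

# Illustrative shape of a HumanEval "test" field (hypothetical example, not a
# real dataset entry). check_code_passes_tests execs it to define check(), then
# calls check(<generated entry_point function>) so the assertions actually run:
#
#     def check(candidate):
#         assert candidate(2, 3) == 5
#         assert candidate(-1, 1) == 0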


@BENCHMARKS.register("humaneval")
class HumanEvalBenchmarker(Benchmarker):
    """HumanEval benchmark implementation."""

    def __init__(self, num_samples: Optional[int] = None):
        """Initialize benchmark and store test cases."""
        super().__init__(num_samples, None)
        self.test_cases = []
        self.entry_points = []

    def load_data(self) -> Tuple[List[Dict[str, Any]], List[Optional[Dict[str, str]]]]:
        """Load and preprocess HumanEval dataset."""
        dataset = load_dataset("openai/openai_humaneval")["test"]
        questions = []
        labels = []
        self.test_cases = []
        self.entry_points = []

        for idx, q in enumerate(dataset):
            if self.num_samples is not None and idx >= self.num_samples:
                break
            questions.append({"question": q["prompt"]})

            # Store test case and entry point for evaluation
            test_code = q.get("test", "")
            entry_point = q.get("entry_point", "")
            self.test_cases.append(test_code)
            self.entry_points.append(entry_point)

            # Store canonical solution as reference (optional, for comparison)
            canonical_solution = q.get("canonical_solution", "")
            labels.append(
                {
                    "test": test_code,
                    "entry_point": entry_point,
                    "canonical_solution": canonical_solution,
                }
            )

        return questions, labels
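
    # Illustrative shape of one openai_humaneval row (hypothetical values, not
    # taken from the dataset):
    #     {"task_id": "HumanEval/0",
    #      "prompt": 'def add(a, b):\n    """Return a + b."""\n',
    #      "entry_point": "add",
    #      "canonical_solution": "    return a + b\n",
    #      "test": "def check(candidate):\n    assert candidate(2, 3) == 5\n"}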

    def extract_answer(self, output: str, label: Optional[Any] = None) -> Optional[str]:
        """Extract code from model output."""
        return extract_code_from_output(output)

    def compute_accuracy(
        self, predictions: List[Any], labels: List[Any]
    ) -> Optional[float]:
        """Compute accuracy for HumanEval by checking if code passes tests.

        Note: This is a simplified evaluation. For official pass@k metrics,
        use the HumanEval evaluation framework.
        """
        if not labels:
            return None
        if all(label is None for label in labels):
            return None

        correct = 0
        valid_count = 0
        for i, (pred, label) in enumerate(zip(predictions, labels)):
            if label is not None and isinstance(label, dict):
                valid_count += 1
                if pred is not None:
                    try:
                        # Get the prompt (function signature and docstring)
                        prompt = self.questions[i]["question"]
                        entry_point = label.get("entry_point", "")

                        # The prompt contains the function signature (e.g., "def function_name(...):").
                        # The generated code might be:
                        #   1. Just the function body - needs to be combined with the prompt
                        #   2. The complete function including signature - use as-is
                        #   3. Code in markdown blocks - already extracted by extract_code_from_output
                        pred_str = str(pred).strip()

                        # Check if pred already contains a complete function definition
                        # (starts with "def " and defines the entry_point function).
                        if pred_str.startswith("def ") and entry_point:
                            # Check if this is the same function (by name)
                            func_name_match = re.match(r"def\s+(\w+)\s*\(", pred_str)
                            if (
                                func_name_match
                                and func_name_match.group(1) == entry_point
                            ):
                                # Generated code is the complete function, use it as-is
                                full_code = pred_str
                            else:
                                # Different function or no match, combine with prompt
                                full_code = prompt + "\n" + pred_str
                        elif pred_str.startswith("def "):
                            # Has a function definition but we can't verify entry_point, use as-is
                            full_code = pred_str
                        else:
                            # Generated code is just the body, combine with prompt
                            full_code = prompt + "\n" + pred_str

                        # Check if the code passes the tests
                        test_code = label.get("test", "")
                        if test_code and check_code_passes_tests(
                            full_code, test_code, entry_point
                        ):
                            correct += 1
                    except Exception as e:
                        # If evaluation fails, consider it incorrect.
                        # Uncomment for debugging: print(f"Error evaluating code {i}: {e}")
                        pass

        return correct / valid_count if valid_count > 0 else 0.0
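
    # Illustrative example of the prompt/prediction stitching in compute_accuracy
    # (hypothetical strings, not from the original file): given the prompt
    #     def add(a, b):
    #         """Return the sum of a and b."""
    # a body-only prediction such as "    return a + b" is appended to the
    # prompt, while a prediction that already starts with "def add(" is run
    # as-is against the test code.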

    def create_sgl_function(self):
        """Create SGL function for HumanEval."""
        return create_simple_sgl_function(
            function_name="get_humaneval_answer",
            answer_key="answer",
            max_tokens=self.get_max_new_tokens(),
        )

    def get_max_new_tokens(self) -> int:
        """HumanEval code generation requires more tokens."""
        return 1024
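

# Minimal smoke-test sketch (added example, not part of the original module).
# It assumes the Benchmarker base class stores num_samples and allows assigning
# self.questions directly; adapt to the actual base-class interface, and run it
# as a module so the relative imports above resolve.
if __name__ == "__main__":
    bench = HumanEvalBenchmarker(num_samples=5)
    questions, labels = bench.load_data()
    bench.questions = questions  # compute_accuracy reads self.questions[i]

    # Feed prompt + canonical_solution through the same extraction and
    # accuracy path that real model outputs would take.
    predictions = [
        bench.extract_answer(q["question"] + label["canonical_solution"], label)
        for q, label in zip(questions, labels)
    ]
    print("Accuracy on canonical solutions:", bench.compute_accuracy(predictions, labels))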