#!/usr/bin/env python3
"""
HumanEval benchmark evaluation for Stack 2.9.
Can run with local model (transformers/vLLM) or later via API.
"""
import argparse
import json
import sys
from datetime import datetime
from pathlib import Path

def check_dependencies():
    """Check whether the human_eval package is available."""
    try:
        import human_eval  # noqa: F401
        return True
    except ImportError:
        print("❌ human_eval package not found")
        print("   Install from https://github.com/openai/human-eval (pip install human-eval)")
        return False

def evaluate_with_transformers(model_name: str, gpu: bool = True):
    """Evaluate using HuggingFace transformers and the human_eval harness."""
    try:
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import torch
    except ImportError:
        print("❌ transformers not installed")
        return None
    try:
        from human_eval.data import read_problems, write_jsonl
        from human_eval.evaluation import evaluate_functional_correctness
    except ImportError:
        print("❌ human_eval package missing")
        return None

    print(f"🤖 Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto" if gpu else None,
        torch_dtype=torch.float16 if gpu else torch.float32,
    )

    # Generate one completion per problem (enough for pass@1; pass@k for
    # k > 1 requires multiple samples per problem).
    problems = read_problems()
    print(f"🧪 Generating completions for {len(problems)} HumanEval problems...")
    samples = []
    for task_id, problem in problems.items():
        inputs = tokenizer(problem["prompt"], return_tensors="pt").to(model.device)
        output_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.2,
            pad_token_id=tokenizer.eos_token_id,
        )
        # HumanEval scores the continuation of the prompt, so drop the
        # prompt tokens from the decoded output.
        completion = tokenizer.decode(
            output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        )
        samples.append({"task_id": task_id, "completion": completion})

    # Run the generated code against the reference tests. NOTE: the
    # human-eval package ships with code execution disabled by default;
    # see its README before enabling the exec call.
    sample_file = "humaneval_samples.jsonl"
    write_jsonl(sample_file, samples)
    results = evaluate_functional_correctness(sample_file)

    return {
        "model": model_name,
        "benchmark": "HumanEval",
        "pass@1": results["pass@1"],
        "pass@10": results.get("pass@10", 0),
        "pass@100": results.get("pass@100", 0),
        "num_problems": len(problems),
        "evaluated_at": datetime.now().isoformat(),
    }

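# Helper (added sketch): chat-tuned models often wrap code in markdown
# fences, while human_eval expects a bare completion of the prompt.
# This strips a ```python ... ``` wrapper if present; adjust to match
# your model's actual output format.
def _strip_code_fences(text: str) -> str:
    """Return the code inside the first ``` fence, or the text unchanged."""
    if "```" not in text:
        return text
    body = text.split("```")[1]
    # Drop a leading language tag such as "python".
    first_newline = body.find("\n")
    if first_newline != -1 and body[:first_newline].strip().isalpha():
        body = body[first_newline + 1:]
    return body
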
def evaluate_with_vllm(api_url: str = "http://localhost:8000/v1"):
    """Evaluate against a running vLLM OpenAI-compatible server."""
    import openai
    from human_eval.data import read_problems, write_jsonl
    from human_eval.evaluation import evaluate_functional_correctness

    # vLLM serves the OpenAI-compatible API under /v1 and ignores the key
    # unless one was configured at startup.
    client = openai.OpenAI(base_url=api_url, api_key="dummy")

    problems = read_problems()
    print(f"🧪 Evaluating {len(problems)} HumanEval problems via vLLM...")

    samples = []
    for task_id, problem in problems.items():
        # "stack-2.9" must match the model name the vLLM server was
        # started with (--served-model-name).
        response = client.chat.completions.create(
            model="stack-2.9",
            messages=[{"role": "user", "content": problem["prompt"]}],
            max_tokens=512,
            temperature=0.2,
        )
        completion = _strip_code_fences(response.choices[0].message.content)
        samples.append({"task_id": task_id, "completion": completion})

    # Execute the generated code against the HumanEval reference tests
    # (one sample per task, so only pass@1 is reported).
    sample_file = "humaneval_vllm_samples.jsonl"
    write_jsonl(sample_file, samples)
    results = evaluate_functional_correctness(sample_file)

    return {
        "model": "stack-2.9 (via vLLM)",
        "benchmark": "HumanEval",
        "pass@1": results["pass@1"],
        "num_problems": len(problems),
        "evaluated_at": datetime.now().isoformat(),
    }

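# For reference, the unbiased pass@k estimator from the HumanEval paper
# (Chen et al., 2021): with n samples per task and c of them correct,
# pass@k = 1 - C(n-c, k) / C(n, k). human_eval ships this as
# human_eval.evaluation.estimate_pass_at_k; the sketch below is equivalent.
def pass_at_k(n: int, c: int, k: int) -> float:
    """Probability that at least one of k drawn samples is correct."""
    from math import comb
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)
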
def generate_estimate():
    """Generate a baseline estimate based on Qwen2.5-Coder numbers."""
    # Qwen2.5-Coder-32B reported ~82% on HumanEval.
    # Our fine-tune should be similar or slightly better/worse.
    return {
        "model": "Stack 2.9 (estimate)",
        "benchmark": "HumanEval",
        "pass@1": 0.82,
        "pass@10": 0.89,
        "pass@100": 0.92,
        "note": "Estimate based on Qwen2.5-Coder-32B baseline. Actual numbers after training.",
        "source": "https://qwenlm.github.io/blog/qwen2.5-coder/",
    }

def main():
    parser = argparse.ArgumentParser(description="HumanEval evaluation for Stack 2.9")
    parser.add_argument("--model", type=str, help="HuggingFace model name or path")
    parser.add_argument("--vllm-api", type=str, default="http://localhost:8000/v1",
                        help="vLLM OpenAI-compatible API URL")
    parser.add_argument("--output", type=str, default="stack-2.9-eval/results/humaneval.json")
    parser.add_argument("--estimate-only", action="store_true",
                        help="Generate estimate without running the benchmark")
    args = parser.parse_args()

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print("🔬 HumanEval Benchmark Evaluation")

    if args.estimate_only:
        print("📊 Generating estimate based on Qwen2.5-Coder baseline...")
        result = generate_estimate()
    elif args.model:
        if not check_dependencies():
            sys.exit(1)
        result = evaluate_with_transformers(args.model)
    else:
        print(f"🌐 Connecting to vLLM at {args.vllm_api}")
        result = evaluate_with_vllm(args.vllm_api)

    if result:
        with open(output_path, "w") as f:
            json.dump(result, f, indent=2)
        print(f"\n✅ Results saved to {output_path}")
        if result.get("pass@1") is not None:
            print(f"   Pass@1 (estimated/actual): {result['pass@1'] * 100:.1f}%")
    else:
        print("❌ Evaluation failed")

if __name__ == "__main__":
    main()