#!/usr/bin/env python3 """ HumanEval benchmark evaluation for Stack 2.9. Can run with local model (transformers/vLLM) or later via API. """ import json import subprocess import sys from pathlib import Path import argparse def check_dependencies(): """Check if human_eval package is available.""" try: import human_eval return True except ImportError: print("โŒ human_eval package not found") print(" Install with: pip install humaneval") return False def evaluate_with_transformers(model_name: str, gpu: bool = True): """Evaluate using HuggingFace transformers.""" try: from transformers import AutoModelForCausalLM, AutoTokenizer import torch except ImportError: print("โŒ transformers not installed") return None print(f"๐Ÿค– Loading model: {model_name}") tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained( model_name, device_map="auto" if gpu else None, torch_dtype=torch.float16 if gpu else torch.float32 ) # Load HumanEval data try: from human_eval.data import write_problems, read_problems from human_eval.evaluation import evaluate except ImportError: print("โŒ human_eval package missing") return None # Run evaluation print("๐Ÿงช Running HumanEval evaluation...") results = evaluate( model=model, tokenizer=tokenizer, problems=read_problems(), temperature=0.2, max_length=2000 ) # Save results output = { "model": model_name, "benchmark": "HumanEval", "pass@1": results["pass@1"], "pass@10": results.get("pass@10", 0), "pass@100": results.get("pass@100", 0), "num_problems": len(read_problems()), "evaluated_at": datetime.now().isoformat() } return output def evaluate_with_vllm(api_url: str = "http://localhost:8000"): """Evaluate using running vLLM server.""" import openai from human_eval.data import read_problems client = openai.OpenAI( base_url=api_url, api_key="dummy" ) problems = read_problems() print(f"๐Ÿงช Evaluating {len(problems)} HumanEval problems via vLLM...") # Implement evaluation loop pass_at_k = {"pass@1": 0, "pass@10": 0, "pass@100": 0} num_problems = len(problems) # Simplified - in practice need proper sampling for problem_id, problem in problems.items(): prompt = problem["prompt"] response = client.chat.completions.create( model="stack-2.9", messages=[{"role": "user", "content": prompt}], max_tokens=500, temperature=0.2 ) completion = response.choices[0].message.content # Check if completion contains solution and tests pass # (Need actual test execution) # For now, placeholder # This is a placeholder - full implementation requires test execution output = { "model": "stack-2.9 (via vLLM)", "benchmark": "HumanEval", "note": "Evaluation script structure - requires full implementation with test execution", "num_problems": num_problems } return output def generate_estimate(): """Generate baseline estimate based on Qwen2.5-Coder numbers.""" # Qwen2.5-Coder-32B reported ~82% on HumanEval # Our fine-tune should be similar or slightly better/worse estimate = { "model": "Stack 2.9 (estimate)", "benchmark": "HumanEval", "pass@1": 0.82, # 82% "pass@10": 0.89, "pass@100": 0.92, "note": "Estimate based on Qwen2.5-Coder-32B baseline. Actual numbers after training.", "source": "https://qwenlm.github.io/blog/qwen2.5-coder/" } return estimate def main(): parser = argparse.ArgumentParser() parser.add_argument("--model", type=str, help="HuggingFace model name or path") parser.add_argument("--vllm-api", type=str, default="http://localhost:8000", help="vLLM API URL") parser.add_argument("--output", type=str, default="stack-2.9-eval/results/humaneval.json") parser.add_argument("--estimate-only", action="store_true", help="Generate estimate without running") args = parser.parse_args() output_path = Path(args.output) output_path.parent.mkdir(parents=True, exist_ok=True) print("๐Ÿ”ฌ HumanEval Benchmark Evaluation") if args.estimate_only: print("๐Ÿ“Š Generating estimate based on Qwen2.5-Coder baseline...") result = generate_estimate() elif args.model: if not check_dependencies(): sys.exit(1) result = evaluate_with_transformers(args.model) else: # Try vLLM print(f"๐ŸŒ Connecting to vLLM at {args.vllm_api}") result = evaluate_with_vllm(args.vllm_api) if result: with open(output_path, 'w') as f: json.dump(result, f, indent=2) print(f"\nโœ… Results saved to {output_path}") print(f" Pass@1 (estimated/actual): {result.get('pass@1', 'N/A')*100:.1f}%" if result.get('pass@1') else "Result saved") else: print("โŒ Evaluation failed") if __name__ == "__main__": import datetime main()