| |
| """ |
| HumanEval benchmark evaluation for Stack 2.9. |
| Can run with local model (transformers/vLLM) or later via API. |
| """ |
|
|
| import json |
| import subprocess |
| import sys |
| from pathlib import Path |
| import argparse |
|
|
| def check_dependencies(): |
| """Check if human_eval package is available.""" |
| try: |
| import human_eval |
| return True |
| except ImportError: |
| print("β human_eval package not found") |
| print(" Install with: pip install humaneval") |
| return False |
|
|
| def evaluate_with_transformers(model_name: str, gpu: bool = True): |
| """Evaluate using HuggingFace transformers.""" |
| try: |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
| import torch |
| except ImportError: |
| print("β transformers not installed") |
| return None |
|
|
| print(f"π€ Loading model: {model_name}") |
| tokenizer = AutoTokenizer.from_pretrained(model_name) |
| model = AutoModelForCausalLM.from_pretrained( |
| model_name, |
| device_map="auto" if gpu else None, |
| torch_dtype=torch.float16 if gpu else torch.float32 |
| ) |
|
|
| |
| try: |
| from human_eval.data import write_problems, read_problems |
| from human_eval.evaluation import evaluate |
| except ImportError: |
| print("β human_eval package missing") |
| return None |
|
|
| |
| print("π§ͺ Running HumanEval evaluation...") |
| results = evaluate( |
| model=model, |
| tokenizer=tokenizer, |
| problems=read_problems(), |
| temperature=0.2, |
| max_length=2000 |
| ) |
|
|
| |
| output = { |
| "model": model_name, |
| "benchmark": "HumanEval", |
| "pass@1": results["pass@1"], |
| "pass@10": results.get("pass@10", 0), |
| "pass@100": results.get("pass@100", 0), |
| "num_problems": len(read_problems()), |
| "evaluated_at": datetime.now().isoformat() |
| } |
|
|
| return output |
|
|
| def evaluate_with_vllm(api_url: str = "http://localhost:8000"): |
| """Evaluate using running vLLM server.""" |
| import openai |
| from human_eval.data import read_problems |
|
|
| client = openai.OpenAI( |
| base_url=api_url, |
| api_key="dummy" |
| ) |
|
|
| problems = read_problems() |
| print(f"π§ͺ Evaluating {len(problems)} HumanEval problems via vLLM...") |
|
|
| |
| pass_at_k = {"pass@1": 0, "pass@10": 0, "pass@100": 0} |
| num_problems = len(problems) |
|
|
| |
| for problem_id, problem in problems.items(): |
| prompt = problem["prompt"] |
| response = client.chat.completions.create( |
| model="stack-2.9", |
| messages=[{"role": "user", "content": prompt}], |
| max_tokens=500, |
| temperature=0.2 |
| ) |
| completion = response.choices[0].message.content |
| |
| |
| |
|
|
| |
|
|
| output = { |
| "model": "stack-2.9 (via vLLM)", |
| "benchmark": "HumanEval", |
| "note": "Evaluation script structure - requires full implementation with test execution", |
| "num_problems": num_problems |
| } |
|
|
| return output |
|
|
| def generate_estimate(): |
| """Generate baseline estimate based on Qwen2.5-Coder numbers.""" |
| |
| |
| estimate = { |
| "model": "Stack 2.9 (estimate)", |
| "benchmark": "HumanEval", |
| "pass@1": 0.82, |
| "pass@10": 0.89, |
| "pass@100": 0.92, |
| "note": "Estimate based on Qwen2.5-Coder-32B baseline. Actual numbers after training.", |
| "source": "https://qwenlm.github.io/blog/qwen2.5-coder/" |
| } |
| return estimate |
|
|
| def main(): |
| parser = argparse.ArgumentParser() |
| parser.add_argument("--model", type=str, help="HuggingFace model name or path") |
| parser.add_argument("--vllm-api", type=str, default="http://localhost:8000", help="vLLM API URL") |
| parser.add_argument("--output", type=str, default="stack-2.9-eval/results/humaneval.json") |
| parser.add_argument("--estimate-only", action="store_true", help="Generate estimate without running") |
| args = parser.parse_args() |
|
|
| output_path = Path(args.output) |
| output_path.parent.mkdir(parents=True, exist_ok=True) |
|
|
| print("π¬ HumanEval Benchmark Evaluation") |
|
|
| if args.estimate_only: |
| print("π Generating estimate based on Qwen2.5-Coder baseline...") |
| result = generate_estimate() |
| elif args.model: |
| if not check_dependencies(): |
| sys.exit(1) |
| result = evaluate_with_transformers(args.model) |
| else: |
| |
| print(f"π Connecting to vLLM at {args.vllm_api}") |
| result = evaluate_with_vllm(args.vllm_api) |
|
|
| if result: |
| with open(output_path, 'w') as f: |
| json.dump(result, f, indent=2) |
| print(f"\nβ
Results saved to {output_path}") |
| print(f" Pass@1 (estimated/actual): {result.get('pass@1', 'N/A')*100:.1f}%" if result.get('pass@1') else "Result saved") |
| else: |
| print("β Evaluation failed") |
|
|
| if __name__ == "__main__": |
| import datetime |
| main() |