File size: 5,290 Bytes
6379283
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python3
"""
HumanEval benchmark evaluation for Stack 2.9.
Can run with local model (transformers/vLLM) or later via API.
"""

import json
import subprocess
import sys
from pathlib import Path
import argparse

def check_dependencies():
    """Return True when the `human_eval` package is importable.

    On failure, print an installation hint and return False so the
    caller can abort before attempting an evaluation run.
    """
    try:
        import human_eval  # noqa: F401 -- availability probe only
    except ImportError:
        print("❌ human_eval package not found")
        print("   Install with: pip install humaneval")
        return False
    return True

def evaluate_with_transformers(model_name: str, gpu: bool = True):
    """Evaluate *model_name* on HumanEval using HuggingFace transformers.

    Args:
        model_name: HuggingFace hub name or local path of the model.
        gpu: When True, load in float16 with automatic device placement;
            otherwise load in float32 on the CPU.

    Returns:
        A JSON-serializable result dict with pass@k metrics, or None when a
        required dependency (transformers / human_eval) is missing.
    """
    # Imported lazily so the module stays importable without transformers.
    try:
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import torch
    except ImportError:
        print("❌ transformers not installed")
        return None

    # BUG FIX: `datetime` was previously bound only as a *module* (and only
    # inside the __main__ guard), so `datetime.now()` below raised
    # AttributeError. Import the class locally so this block is self-contained.
    from datetime import datetime

    print(f"πŸ€– Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto" if gpu else None,
        torch_dtype=torch.float16 if gpu else torch.float32
    )

    # Load HumanEval data; fail gracefully if the package is absent.
    # (Removed the unused `write_problems` import.)
    try:
        from human_eval.data import read_problems
        from human_eval.evaluation import evaluate
    except ImportError:
        print("❌ human_eval package missing")
        return None

    # Read the problem set once instead of twice as before.
    problems = read_problems()

    print("πŸ§ͺ Running HumanEval evaluation...")
    results = evaluate(
        model=model,
        tokenizer=tokenizer,
        problems=problems,
        temperature=0.2,
        max_length=2000
    )

    # Package the metrics for JSON serialization by the caller.
    return {
        "model": model_name,
        "benchmark": "HumanEval",
        "pass@1": results["pass@1"],
        "pass@10": results.get("pass@10", 0),
        "pass@100": results.get("pass@100", 0),
        "num_problems": len(problems),
        "evaluated_at": datetime.now().isoformat()
    }

def evaluate_with_vllm(api_url: str = "http://localhost:8000"):
    """Evaluate HumanEval problems against a running vLLM server.

    NOTE(review): this is a scaffold — completions are generated but never
    executed against the HumanEval unit tests, so no pass@k metrics are
    produced yet. (Removed a dead `pass_at_k` dict that was never updated
    or returned.)

    Args:
        api_url: Base URL of the OpenAI-compatible vLLM endpoint.

    Returns:
        A summary dict describing the run (no metrics; see note above).
    """
    import openai
    from human_eval.data import read_problems

    client = openai.OpenAI(
        base_url=api_url,
        api_key="dummy"  # vLLM ignores the key; any non-empty value works
    )

    problems = read_problems()
    print(f"πŸ§ͺ Evaluating {len(problems)} HumanEval problems via vLLM...")

    # Simplified - in practice proper multi-sample generation plus sandboxed
    # test execution is required to compute pass@k.
    for problem_id, problem in problems.items():
        response = client.chat.completions.create(
            model="stack-2.9",
            messages=[{"role": "user", "content": problem["prompt"]}],
            max_tokens=500,
            temperature=0.2
        )
        completion = response.choices[0].message.content  # noqa: F841
        # TODO: run the HumanEval unit tests against `completion` and
        # accumulate pass@k statistics here.

    return {
        "model": "stack-2.9 (via vLLM)",
        "benchmark": "HumanEval",
        "note": "Evaluation script structure - requires full implementation with test execution",
        "num_problems": len(problems)
    }

def generate_estimate():
    """Return a placeholder HumanEval result dict.

    Numbers mirror the published Qwen2.5-Coder-32B baseline (~82% pass@1);
    the fine-tune is expected to land close to these until real evaluation
    results are available.
    """
    return {
        "model": "Stack 2.9 (estimate)",
        "benchmark": "HumanEval",
        "pass@1": 0.82,  # 82%
        "pass@10": 0.89,
        "pass@100": 0.92,
        "note": "Estimate based on Qwen2.5-Coder-32B baseline. Actual numbers after training.",
        "source": "https://qwenlm.github.io/blog/qwen2.5-coder/",
    }

def main():
    """CLI entry point: choose an evaluation mode, run it, save JSON results.

    Modes (in priority order):
        --estimate-only  -> write a baseline estimate without running anything
        --model NAME     -> local evaluation via transformers
        (neither)        -> evaluate through a running vLLM server
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="HuggingFace model name or path")
    parser.add_argument("--vllm-api", type=str, default="http://localhost:8000", help="vLLM API URL")
    parser.add_argument("--output", type=str, default="stack-2.9-eval/results/humaneval.json")
    parser.add_argument("--estimate-only", action="store_true", help="Generate estimate without running")
    args = parser.parse_args()

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print("πŸ”¬ HumanEval Benchmark Evaluation")

    if args.estimate_only:
        print("πŸ“Š Generating estimate based on Qwen2.5-Coder baseline...")
        result = generate_estimate()
    elif args.model:
        if not check_dependencies():
            sys.exit(1)
        result = evaluate_with_transformers(args.model)
    else:
        # No local model given: fall back to a running vLLM server.
        print(f"🌐 Connecting to vLLM at {args.vllm_api}")
        result = evaluate_with_vllm(args.vllm_api)

    if not result:
        print("❌ Evaluation failed")
        return

    with open(output_path, 'w') as f:
        json.dump(result, f, indent=2)
    print(f"\nβœ… Results saved to {output_path}")

    # BUG FIX: the old version buried a ternary inside print() and used a
    # truthiness check, which mislabelled a legitimate pass@1 of 0.0 as
    # missing. Test explicitly against None instead.
    pass_at_1 = result.get("pass@1")
    if pass_at_1 is not None:
        print(f"   Pass@1 (estimated/actual): {pass_at_1*100:.1f}%")
    else:
        print("Result saved")

if __name__ == "__main__":
    # BUG FIX: this previously did `import datetime`, binding the *module*,
    # so `datetime.now()` in evaluate_with_transformers raised
    # AttributeError. Bind the datetime class instead.
    from datetime import datetime  # noqa: F401 -- used by evaluate_with_transformers
    main()