File size: 5,441 Bytes
fcb2b04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""
Main evaluation pipeline for Stack 2.9
Runs standard benchmarks and compares with base Qwen2.5-Coder-32B
"""

import os
import json
import argparse
import numpy as np
from datetime import datetime
from pathlib import Path

# Add benchmarks directory to path
import sys
sys.path.append(str(Path(__file__).parent.parent / "benchmarks"))

# Standard benchmarks
from human_eval import HumanEval
from mbpp import MBPP
from gsm8k import GSM8K
from bigbench import BIGBenchHard

class Stack29Evaluator:
    """Benchmark runner for Stack 2.9 models.

    Evaluates a model on a fixed suite of standard benchmarks, compares the
    scores against a base model, and persists results as timestamped JSON
    files under ``output_dir``.
    """

    def __init__(self, model_name, base_model_name="qwen2.5-coder-32b", output_dir="results"):
        """Set up the evaluator and instantiate the benchmark suite.

        Args:
            model_name: Name of the model under evaluation.
            base_model_name: Reference model used by compare_with_base().
            output_dir: Directory for result JSON files; created if missing.
        """
        self.model_name = model_name
        self.base_model_name = base_model_name
        self.output_dir = Path(output_dir)
        # parents=True: a nested path such as "runs/2024/results" previously
        # raised FileNotFoundError with bare mkdir(exist_ok=True).
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Benchmark suite; each value must expose evaluate(model_name) -> dict.
        self.benchmarks = {
            "HumanEval": HumanEval(),
            "MBPP": MBPP(),
            "GSM8K": GSM8K(),
            "BIG-Bench Hard": BIGBenchHard()
        }

        # Filled by run_all_benchmarks(): benchmark name -> metrics dict.
        self.results = {}

    def run_all_benchmarks(self):
        """Run every benchmark in the suite and cache results on self.

        Returns:
            dict: benchmark name -> normalized metrics dict
            (see _run_benchmark).
        """
        print(f"Running benchmarks for {self.model_name}...")

        for name, benchmark in self.benchmarks.items():
            print(f"\nRunning {name}...")
            self.results[name] = self._run_benchmark(benchmark)

        return self.results

    def _run_benchmark(self, benchmark):
        """Run a single benchmark and normalize its result dict.

        Missing metrics default to 0 so downstream aggregation never
        raises KeyError.
        """
        results = benchmark.evaluate(self.model_name)
        return {
            "pass_at_1": results.get("pass_at_1", 0),
            "pass_at_3": results.get("pass_at_3", 0),
            "pass_at_5": results.get("pass_at_5", 0),
            "total_cases": results.get("total_cases", 0),
            "accuracy": results.get("accuracy", 0)
        }

    def compare_with_base(self):
        """Evaluate the base model and compute per-metric improvements.

        NOTE: this runs the full benchmark suite for the base model on
        every call; callers needing the result twice should cache it.

        Returns:
            dict: benchmark name -> {"current", "base", "improvement"}.
        """
        # Run base model benchmarks with a sibling evaluator writing to the
        # same output directory.
        base_evaluator = Stack29Evaluator(self.base_model_name, output_dir=self.output_dir)
        base_results = base_evaluator.run_all_benchmarks()

        comparison = {}

        for benchmark_name in self.results:
            current = self.results[benchmark_name]
            base = base_results[benchmark_name]

            comparison[benchmark_name] = {
                "current": current,
                "base": base,
                "improvement": {
                    "pass_at_1": self._calculate_improvement(current["pass_at_1"], base["pass_at_1"]),
                    "pass_at_3": self._calculate_improvement(current["pass_at_3"], base["pass_at_3"]),
                    "pass_at_5": self._calculate_improvement(current["pass_at_5"], base["pass_at_5"]),
                    "accuracy": self._calculate_improvement(current["accuracy"], base["accuracy"])
                }
            }

        return comparison

    def _calculate_improvement(self, current, base):
        """Return the percentage improvement of ``current`` over ``base``.

        A zero base with a positive current yields float('inf').
        NOTE(review): json.dump serializes inf as the non-standard token
        ``Infinity``; strict JSON parsers will reject such files — confirm
        whether downstream consumers tolerate it.
        """
        if base == 0:
            return float('inf') if current > 0 else 0
        return ((current - base) / base) * 100

    def save_results(self):
        """Write raw results and the base-model comparison to JSON files.

        Returns:
            tuple: (results_path, comparison_path) as Path objects.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save raw results
        results_path = self.output_dir / f"results_{timestamp}.json"
        with open(results_path, 'w') as f:
            json.dump({
                "model": self.model_name,
                "timestamp": timestamp,
                "results": self.results
            }, f, indent=2)

        # Save comparison (re-runs the base model; see compare_with_base).
        comparison_path = self.output_dir / f"comparison_{timestamp}.json"
        with open(comparison_path, 'w') as f:
            json.dump({
                "model": self.model_name,
                "base_model": self.base_model_name,
                "timestamp": timestamp,
                "comparison": self.compare_with_base()
            }, f, indent=2)

        print(f"Results saved to {results_path}")
        print(f"Comparison saved to {comparison_path}")

        return results_path, comparison_path

    def generate_summary(self):
        """Render self.results as a markdown report string.

        NOTE(review): the accuracy percentage is printed on the Pass@1
        line; confirm "accuracy" is the pass@1 rate for every benchmark.
        """
        summary = f"""# Stack 2.9 Evaluation Results - {self.model_name}

## Summary
Evaluation results for Stack 2.9 compared with base {self.base_model_name}.

## Benchmarks

"""

        for name, result in self.results.items():
            summary += f"""### {name}

- Pass@1: {result['pass_at_1']}/{result['total_cases']} ({result['accuracy']*100:.2f}%)
- Pass@3: {result.get('pass_at_3', 0)}/{result['total_cases']}
- Pass@5: {result.get('pass_at_5', 0)}/{result['total_cases']}

"""

        return summary


def main():
    """CLI entry point: parse arguments, run the evaluation, print a summary."""
    parser = argparse.ArgumentParser(description='Evaluate Stack 2.9')
    parser.add_argument('--model', required=True, help='Model name to evaluate')
    parser.add_argument('--base-model', default='qwen2.5-coder-32b', help='Base model name for comparison')
    parser.add_argument('--output', default='results', help='Output directory')
    opts = parser.parse_args()

    # Run the full suite, persist the JSON artifacts, then show the report.
    evaluator = Stack29Evaluator(opts.model, opts.base_model, opts.output)
    evaluator.run_all_benchmarks()
    evaluator.save_results()
    print(evaluator.generate_summary())


if __name__ == "__main__":
    main()