| """ |
| Main evaluation pipeline for Stack 2.9 |
| Runs standard benchmarks and compares with base Qwen2.5-Coder-32B |
| """ |
|
|
| import os |
| import json |
| import argparse |
| import numpy as np |
| from datetime import datetime |
| from pathlib import Path |
|
|
| |
| import sys |
| sys.path.append(str(Path(__file__).parent.parent / "benchmarks")) |
|
|
| |
| from human_eval import HumanEval |
| from mbpp import MBPP |
| from gsm8k import GSM8K |
| from bigbench import BIGBenchHard |
|
|
class Stack29Evaluator:
    """Run standard benchmarks for a model and compare against a base model.

    Benchmark results are cached on the instance (``self.results``) and can be
    persisted as timestamped JSON files under ``output_dir``.
    """

    def __init__(self, model_name, base_model_name="qwen2.5-coder-32b", output_dir="results"):
        """
        Args:
            model_name: Name of the model under evaluation.
            base_model_name: Name of the baseline model used for comparison.
            output_dir: Directory where JSON result files are written
                (created if missing, including parent directories).
        """
        self.model_name = model_name
        self.base_model_name = base_model_name
        self.output_dir = Path(output_dir)
        # parents=True so nested paths like "runs/2024/results" work on a fresh checkout.
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Benchmark display name -> benchmark runner (imported from benchmarks/).
        self.benchmarks = {
            "HumanEval": HumanEval(),
            "MBPP": MBPP(),
            "GSM8K": GSM8K(),
            "BIG-Bench Hard": BIGBenchHard()
        }

        # Populated by run_all_benchmarks(): benchmark name -> metrics dict.
        self.results = {}

    def run_all_benchmarks(self):
        """Run every configured benchmark; store and return the results dict."""
        print(f"Running benchmarks for {self.model_name}...")

        for name, benchmark in self.benchmarks.items():
            print(f"\nRunning {name}...")
            self.results[name] = self._run_benchmark(benchmark)

        return self.results

    def _run_benchmark(self, benchmark):
        """Run a single benchmark and normalize its result dict.

        Missing metrics default to 0 so downstream arithmetic never KeyErrors,
        regardless of which fields a particular benchmark reports.
        """
        results = benchmark.evaluate(self.model_name)
        return {
            "pass_at_1": results.get("pass_at_1", 0),
            "pass_at_3": results.get("pass_at_3", 0),
            "pass_at_5": results.get("pass_at_5", 0),
            "total_cases": results.get("total_cases", 0),
            "accuracy": results.get("accuracy", 0)
        }

    def compare_with_base(self):
        """Evaluate the base model on all benchmarks and compute per-metric deltas.

        NOTE: this re-runs the full benchmark suite on the base model every
        time it is called, which can be expensive.

        Returns:
            Dict mapping benchmark name to ``{"current", "base", "improvement"}``,
            where ``improvement`` holds percentage deltas per metric.
        """
        base_evaluator = Stack29Evaluator(self.base_model_name, output_dir=self.output_dir)
        base_results = base_evaluator.run_all_benchmarks()

        comparison = {}
        for benchmark_name in self.results:
            current = self.results[benchmark_name]
            base = base_results[benchmark_name]

            comparison[benchmark_name] = {
                "current": current,
                "base": base,
                "improvement": {
                    metric: self._calculate_improvement(current[metric], base[metric])
                    for metric in ("pass_at_1", "pass_at_3", "pass_at_5", "accuracy")
                }
            }

        return comparison

    def _calculate_improvement(self, current, base):
        """Return the percentage improvement of ``current`` over ``base``.

        When the base score is 0 and the current score is positive the relative
        improvement is undefined; return None (serialized as JSON null) rather
        than float('inf'), which json.dump would emit as the non-standard token
        ``Infinity`` and produce invalid JSON.
        """
        if base == 0:
            return None if current > 0 else 0
        return ((current - base) / base) * 100

    def save_results(self):
        """Write results and comparison JSON files; return both paths.

        Returns:
            Tuple ``(results_path, comparison_path)`` of the written files.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        results_path = self.output_dir / f"results_{timestamp}.json"
        with open(results_path, 'w') as f:
            json.dump({
                "model": self.model_name,
                "timestamp": timestamp,
                "results": self.results
            }, f, indent=2)

        # compare_with_base() re-runs the base model's benchmarks here.
        comparison_path = self.output_dir / f"comparison_{timestamp}.json"
        with open(comparison_path, 'w') as f:
            json.dump({
                "model": self.model_name,
                "base_model": self.base_model_name,
                "timestamp": timestamp,
                "comparison": self.compare_with_base()
            }, f, indent=2)

        print(f"Results saved to {results_path}")
        print(f"Comparison saved to {comparison_path}")

        return results_path, comparison_path

    def generate_summary(self):
        """Generate a markdown summary of the results stored on this instance."""
        summary = f"""# Stack 2.9 Evaluation Results - {self.model_name}

## Summary
Evaluation results for Stack 2.9 compared with base {self.base_model_name}.

## Benchmarks

"""

        # NOTE(review): pass_at_1 is rendered as a count over total_cases here,
        # while accuracy is rendered as a percentage — presumably the benchmarks
        # report pass_at_* as raw counts; confirm against the benchmark classes.
        for name, result in self.results.items():
            summary += f"""### {name}

- Pass@1: {result['pass_at_1']}/{result['total_cases']} ({result['accuracy']*100:.2f}%)
- Pass@3: {result.get('pass_at_3', 0)}/{result['total_cases']}
- Pass@5: {result.get('pass_at_5', 0)}/{result['total_cases']}

"""

        return summary
|
|
|
|
def main():
    """Parse CLI arguments, run the evaluation suite, and print a summary."""
    arg_parser = argparse.ArgumentParser(description='Evaluate Stack 2.9')
    arg_parser.add_argument('--model', required=True, help='Model name to evaluate')
    arg_parser.add_argument('--base-model', default='qwen2.5-coder-32b', help='Base model name for comparison')
    arg_parser.add_argument('--output', default='results', help='Output directory')
    opts = arg_parser.parse_args()

    # Run everything, persist JSON artifacts, then echo the markdown summary.
    runner = Stack29Evaluator(opts.model, opts.base_model, opts.output)
    runner.run_all_benchmarks()
    runner.save_results()
    print(runner.generate_summary())
|
|
|
|
# Script entry point: only run the pipeline when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()