#!/usr/bin/env python3
"""
Run HumanEval benchmark - fixed path version

Puts the eval directory on ``sys.path``, changes into it, builds the
project's ``HumanEval`` benchmark with ``max_problems=5``, calls
``evaluate()`` and prints a short summary of the returned results mapping.

The eval directory defaults to the original fixed location; set the
``STACK_EVAL_DIR`` environment variable to run against a different checkout.
"""
import os
import sys
from typing import Any, List, Mapping, Optional

# Original fixed location of the eval package. Kept as the default so that
# existing invocations behave exactly as before.
DEFAULT_EVAL_DIR = "/Users/walidsobhi/.openclaw/workspace/stack-2.9/stack-2.9-eval"

# Name of the environment variable that overrides DEFAULT_EVAL_DIR.
EVAL_DIR_ENV_VAR = "STACK_EVAL_DIR"

# Horizontal rule used around the header and the results section.
BANNER = "=" * 50


def resolve_eval_dir(environ: Optional[Mapping[str, str]] = None) -> str:
    """Return the directory that contains the ``benchmarks`` package.

    Args:
        environ: Mapping to read the override from; defaults to ``os.environ``.

    Returns:
        The value of ``STACK_EVAL_DIR`` when it is set and non-empty,
        otherwise ``DEFAULT_EVAL_DIR``.
    """
    env = os.environ if environ is None else environ
    # An empty value is treated as "not set" so `STACK_EVAL_DIR=` falls back.
    return env.get(EVAL_DIR_ENV_VAR) or DEFAULT_EVAL_DIR


def format_results(results: Mapping[str, Any]) -> List[str]:
    """Build the summary lines that are printed after the benchmark run.

    Args:
        results: Mapping with the keys ``total_cases``, ``pass_at_1``,
            ``accuracy`` (multiplied by 100 for display) and ``model``.

    Returns:
        The lines to print, in order. The first one starts with a newline so
        that a blank line separates the summary from earlier output.

    Raises:
        KeyError: If one of the expected keys is missing.
    """
    # NOTE(review): the label says "Passed" but the value comes from the
    # `pass_at_1` key -- confirm whether that is a count or a rate.
    return [
        "\n" + BANNER,
        "RESULTS:",
        BANNER,
        f"Total: {results['total_cases']}",
        f"Passed: {results['pass_at_1']}",
        f"Accuracy: {results['accuracy']*100:.1f}%",
        f"Model: {results['model']}",
        BANNER,
    ]


def main() -> None:
    """Run the benchmark from the eval directory and print the summary."""
    eval_dir = resolve_eval_dir()
    if not os.path.isdir(eval_dir):
        # Fail with an actionable message instead of a bare traceback from
        # os.chdir() when the checkout lives somewhere else.
        sys.exit(
            f"Eval directory not found: {eval_dir} "
            f"(set {EVAL_DIR_ENV_VAR} to override)"
        )

    # The benchmark package is imported by path, and the working directory is
    # switched to the eval directory before the benchmark is loaded.
    sys.path.insert(0, eval_dir)
    os.chdir(eval_dir)

    # Deferred import: only resolvable once eval_dir is on sys.path.
    from benchmarks.human_eval import HumanEval

    print(BANNER)
    print("Running HumanEval Benchmark (Stub Mode)")
    print(BANNER)

    # Limit the run to 5 problems.
    benchmark = HumanEval(max_problems=5)
    results = benchmark.evaluate()

    for line in format_results(results):
        print(line)


if __name__ == "__main__":
    main()