| #!/usr/bin/env python3 | |
| """ | |
| Test HumanEval benchmark in stub mode (no model required) | |
| Tests that the evaluation pipeline works correctly. | |
| """ | |
| import sys | |
| sys.path.insert(0, '/Users/walidsobhi/.openclaw/workspace/stack-2.9/stack-2.9-eval') | |
| from benchmarks.human_eval import HumanEval | |
def test_humaneval_stub():
    """Run a small HumanEval evaluation and print a summary of the outcome.

    A ``HumanEval`` benchmark is built with ``max_problems=5`` and its
    ``evaluate()`` method is called with no arguments. According to the
    original author's notes, the benchmark falls back to stub mode (canonical
    solutions) when no model is configured; that behaviour lives in
    ``benchmarks.human_eval`` and is not verified here.

    Returns:
        The mapping produced by ``evaluate()``. This function reads the keys
        ``total_cases``, ``pass_at_1``, ``accuracy`` and ``model`` from it.
    """
    rule = "=" * 50

    # Opening banner.
    print(rule)
    print("Testing HumanEval Pipeline (Stub Mode)")
    print(rule)

    # No model is passed in; the author expects stub mode to kick in here.
    runner = HumanEval(max_problems=5)
    summary = runner.evaluate()

    # Report header, preceded by a blank line.
    print(f"\n{rule}")
    print("RESULTS:")
    print(rule)

    # Each field is looked up right before it is printed so that a missing
    # key fails at the same point in the output as it always has.
    total = summary['total_cases']
    print(f"Total problems: {total}")
    passed = summary['pass_at_1']
    print(f"Passed: {passed}")
    percent = summary['accuracy'] * 100
    print(f"Pass rate: {percent:.1f}%")
    model_name = summary['model']
    print(f"Model: {model_name}")
    print(rule)

    return summary
if __name__ == "__main__":
    # Script entry point: run the pipeline check once and turn any failure
    # into a printed message, a traceback and exit status 1.
    try:
        results = test_humaneval_stub()
        print("\n✅ Pipeline test completed successfully!")
    except Exception as exc:
        # Imported lazily; only needed on the failure path.
        import traceback

        failure_message = f"\n❌ Pipeline test failed: {exc}"
        print(failure_message)
        traceback.print_exc()
        sys.exit(1)