#!/usr/bin/env python3
"""
Test HumanEval benchmark in stub mode (no model required)
Tests that the evaluation pipeline works correctly.
"""
import sys
sys.path.insert(0, '/Users/walidsobhi/.openclaw/workspace/stack-2.9/stack-2.9-eval')
from benchmarks.human_eval import HumanEval
def test_humaneval_stub():
    """Run the HumanEval pipeline in stub mode and return its results dict.

    Stub mode is triggered automatically because no model is configured,
    so the benchmark falls back to canonical solutions.
    """
    banner = "=" * 50
    print(banner)
    print("Testing HumanEval Pipeline (Stub Mode)")
    print(banner)

    # No model configured -> HumanEval runs in stub mode on 5 problems.
    benchmark = HumanEval(max_problems=5)
    results = benchmark.evaluate()

    # Summarize the evaluation results.
    print("\n" + banner)
    print("RESULTS:")
    print(banner)
    print(f"Total problems: {results['total_cases']}")
    print(f"Passed: {results['pass_at_1']}")
    print(f"Pass rate: {results['accuracy']*100:.1f}%")
    print(f"Model: {results['model']}")
    print(banner)
    return results
if __name__ == "__main__":
    # Run the stub-mode pipeline test; exit non-zero on any failure so CI
    # can detect it. The return value is unused here, so it is not bound.
    try:
        test_humaneval_stub()
        print("\n✅ Pipeline test completed successfully!")
    except Exception as e:
        print(f"\n❌ Pipeline test failed: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)