#!/usr/bin/env python3
# File size: 752 Bytes | f80360c
"""
Run HumanEval benchmark - fixed path version
"""
import sys
import os
# Directory containing the `benchmarks` package. The original fixed location
# remains the default; set STACK_EVAL_DIR to run the script from another
# checkout. An empty value is treated the same as an unset variable.
DEFAULT_EVAL_DIR = "/Users/walidsobhi/.openclaw/workspace/stack-2.9/stack-2.9-eval"
eval_dir = os.environ.get("STACK_EVAL_DIR") or DEFAULT_EVAL_DIR

# Put the eval directory first on the import path and make it the working
# directory. os.chdir raises FileNotFoundError if the directory is missing.
sys.path.insert(0, eval_dir)
os.chdir(eval_dir)

# This import must stay below the sys.path change above: `benchmarks` is
# resolved through the entry that was just inserted.
from benchmarks.human_eval import HumanEval  # noqa: E402

RULE = "=" * 50

print(RULE)
print("Running HumanEval Benchmark (Stub Mode)")
print(RULE)

# Limit the run to 5 problems (the original comment describes this as stub mode).
benchmark = HumanEval(max_problems=5)
results = benchmark.evaluate()

# `results` is indexed by the keys 'total_cases', 'pass_at_1', 'accuracy' and
# 'model'; a missing key raises KeyError.
print("\n" + RULE)
print("RESULTS:")
print(RULE)
print(f"Total: {results['total_cases']}")
# NOTE(review): the label says "Passed" but the key is 'pass_at_1' - confirm
# whether this value is a count of passed cases or a pass@1 rate.
print(f"Passed: {results['pass_at_1']}")
print(f"Accuracy: {results['accuracy']*100:.1f}%")
print(f"Model: {results['model']}")
print(RULE)