#!/usr/bin/env python3
"""
Test HumanEval benchmark in stub mode (no model required)
Tests that the evaluation pipeline works correctly.
"""
import os
import sys

# Location of the eval repo. Overridable via STACK_EVAL_ROOT so the script
# is not tied to one machine; the default preserves the original behavior.
EVAL_ROOT = os.environ.get(
    "STACK_EVAL_ROOT",
    "/Users/walidsobhi/.openclaw/workspace/stack-2.9/stack-2.9-eval",
)
sys.path.insert(0, EVAL_ROOT)

from benchmarks.human_eval import HumanEval

# Single separator constant instead of repeating the "=" * 50 literal.
BANNER = "=" * 50


def test_humaneval_stub():
    """Test HumanEval with stub mode (canonical solutions).

    Runs the evaluation pipeline end-to-end on a small problem subset and
    prints a summary.

    Returns:
        dict: the summary produced by ``HumanEval.evaluate()``; the code
        below reads the keys ``total_cases``, ``pass_at_1``, ``accuracy``
        and ``model`` from it.
    """
    print(BANNER)
    print("Testing HumanEval Pipeline (Stub Mode)")
    print(BANNER)

    # Create benchmark - will use stub mode since no model configured
    benchmark = HumanEval(max_problems=5)

    # Run evaluation (will use canonical solutions in stub mode)
    results = benchmark.evaluate()

    print("\n" + BANNER)
    print("RESULTS:")
    print(BANNER)
    print(f"Total problems: {results['total_cases']}")
    print(f"Passed: {results['pass_at_1']}")
    print(f"Pass rate: {results['accuracy']*100:.1f}%")
    print(f"Model: {results['model']}")
    print(BANNER)

    return results


if __name__ == "__main__":
    try:
        # Return value intentionally discarded here (the original bound it
        # to an unused variable); the summary is already printed.
        test_humaneval_stub()
        print("\n✅ Pipeline test completed successfully!")
    except Exception as e:  # top-level boundary: report, trace, exit non-zero
        print(f"\n❌ Pipeline test failed: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)