walidsobhie-code

refactor: Squeeze folders further - cleaner structure

65888d5 22 days ago

1.29 kB

	#!/usr/bin/env python3
	"""
	Test HumanEval benchmark in stub mode (no model required)
	Tests that the evaluation pipeline works correctly.
	"""

	import sys
	# Put a specific directory first on sys.path before the project import below;
	# presumably this is what makes the `benchmarks` package resolvable.
	# NOTE(review): the path is absolute and tied to one user's machine, so the
	# import will likely fail elsewhere unless `benchmarks` is importable by other
	# means — consider deriving it from __file__ once the repo layout is confirmed.
	sys.path.insert(0, '/Users/walidsobhi/.openclaw/workspace/stack-2.9/stack-2.9-eval')

	from benchmarks.human_eval import HumanEval

	def test_humaneval_stub():
	    """Run a five-problem HumanEval evaluation and print a short report.

	    Builds ``HumanEval(max_problems=5)``, calls its ``evaluate()`` method and
	    prints the ``total_cases``, ``pass_at_1``, ``accuracy`` and ``model``
	    entries of the returned mapping between separator lines.

	    Returns:
	        Whatever ``HumanEval.evaluate()`` returned, unmodified.
	    """
	    rule = "=" * 50  # separator line reused for every banner

	    for banner_line in (rule, "Testing HumanEval Pipeline (Stub Mode)", rule):
	        print(banner_line)

	    # No model is configured here, so the benchmark is expected to fall back
	    # to stub mode and score the canonical solutions.
	    evaluator = HumanEval(max_problems=5)
	    summary = evaluator.evaluate()

	    print("\n" + rule)
	    print("RESULTS:")
	    print(rule)
	    # Each field is read and printed in turn, so a missing key surfaces only
	    # after the preceding lines have been emitted.
	    print(f"Total problems: {summary['total_cases']}")
	    print(f"Passed: {summary['pass_at_1']}")
	    print(f"Pass rate: {summary['accuracy'] * 100:.1f}%")
	    print(f"Model: {summary['model']}")
	    print(rule)

	    return summary

	if __name__ == "__main__":
	    # Script entry point: run the pipeline check once and turn any failure
	    # into a printed traceback plus a non-zero exit status.
	    try:
	        results = test_humaneval_stub()
	        print("\n✅ Pipeline test completed successfully!")
	    except Exception as exc:
	        # Imported here so the module is only loaded on the failure path.
	        import traceback

	        print(f"\n❌ Pipeline test failed: {exc}")
	        traceback.print_exc()
	        sys.exit(1)