#!/usr/bin/env python3
"""
Cortex Benchmark Harness — CLI Entry Point

Usage:
    # Quick test (10 examples, fast tasks only)
    python -m benchmark.run_benchmark --n 10 --tasks hellaswag piqa

    # Standard suite (50 examples, all tasks)
    python -m benchmark.run_benchmark --n 50

    # Full evaluation (all examples)
    python -m benchmark.run_benchmark --n 0 --tasks hellaswag piqa arc-easy arc-challenge winogrande mmlu

    # Custom model
    python -m benchmark.run_benchmark --model meta-llama/Llama-3.2-1B --n 50

    # Save results
    python -m benchmark.run_benchmark --n 50 --output results.json
"""

import argparse
import json
import os
import sys

# Ensure the parent directory is on sys.path so `benchmark.*` imports resolve
# when this file is executed directly rather than as a module.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def main():
    parser = argparse.ArgumentParser(description="Cortex Benchmark Harness")
    parser.add_argument(
        "--model",
        type=str,
        default="HuggingFaceTB/SmolLM2-135M",
        help="HuggingFace model ID to evaluate",
    )
    parser.add_argument(
        "--tasks",
        nargs="+",
        default=["hellaswag", "piqa", "arc-easy", "winogrande"],
        help=(
            "Tasks to run (choices: hellaswag, piqa, arc-easy, arc-challenge, "
            "winogrande, mmlu, halueval)"
        ),
    )
    parser.add_argument(
        "--n",
        type=int,
        default=50,
        help="Number of examples per task (0 = all available)",
    )
    parser.add_argument(
        "--no-memory",
        action="store_true",
        help="Skip memory benchmarks (passkey, multi-hop)",
    )
    parser.add_argument(
        "--passkey-lengths",
        nargs="+",
        type=int,
        default=[128, 256, 512],
        help="Context lengths for the passkey retrieval test",
    )
    parser.add_argument(
        "--n-passkey",
        type=int,
        default=5,
        help="Number of passkey examples per context length",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="auto",
        help="Device: cuda, mps, cpu, or auto (auto prefers cuda > mps > cpu)",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        default="float32",
        choices=["float32", "float16", "bfloat16"],
        help="Model dtype",
    )
    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help="Path to save JSON results",
    )
    parser.add_argument(
        "--cortex-weights",
        type=str,
        default=None,
        help="Optional Cortex weights file to load before the Cortex phase",
    )
    args = parser.parse_args()

    # Deferred import: keeps `--help` fast and ensures the sys.path fix above
    # has already taken effect.
    from benchmark.runner import BenchmarkRunner

    runner = BenchmarkRunner(
        model_name=args.model,
        device=args.device,
        dtype=args.dtype,
        cortex_weights=args.cortex_weights,
    )

    # --n 0 means "use all available examples", which the runner expects as None.
    n = args.n if args.n > 0 else None
    results = runner.run_comparison(
        tasks=args.tasks,
        n=n,
        include_memory=not args.no_memory,
        n_passkey=args.n_passkey,
        passkey_lengths=args.passkey_lengths,
    )

    BenchmarkRunner.print_summary(results)

    if args.output:
        # Recursively convert non-JSON-serializable values (e.g. tensors,
        # numpy scalars) to strings so json.dump never raises.
        def make_serializable(obj):
            if isinstance(obj, dict):
                return {k: make_serializable(v) for k, v in obj.items()}
            elif isinstance(obj, list):
                return [make_serializable(v) for v in obj]
            elif isinstance(obj, (bool, int, float, str, type(None))):
                return obj
            else:
                return str(obj)

        with open(args.output, "w") as f:
            json.dump(make_serializable(results), f, indent=2)
        print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()