#!/usr/bin/env python3
"""Batch runner for the tool-routing/confusion benchmark.

Runs score_tool_routing_confusion.py once per model, then aggregates the
per-model summaries into JSON, CSV, and Markdown reports.
"""
from __future__ import annotations

import argparse
import csv
import json
import subprocess
import sys
from pathlib import Path
from typing import Any

ROOT = Path(__file__).resolve().parents[1]
SCRIPT = ROOT / 'scripts' / 'score_tool_routing_confusion.py'
OUT_DIR = ROOT / 'docs' / 'tool_routing_eval'


def model_stem(model: str) -> str:
    """Make a model ID safe for use in file names."""
    return model.replace('/', '_')


def run_one(
    model: str,
    agent: str,
    agent_cards: Path,
    prompts: Path,
    expected: Path,
    start: int,
    end: int,
    timeout: int,
    out_dir: Path,
    raw_results_dir: Path | None,
) -> None:
    """Run the scoring script for a single model as a subprocess."""
    # Use sys.executable so the child process runs under the same
    # interpreter as this script (a bare 'python' may not be on PATH).
    cmd = [
        sys.executable, str(SCRIPT),
        '--model', model,
        '--agent', agent,
        '--agent-cards', str(agent_cards),
        '--prompts', str(prompts),
        '--expected', str(expected),
        '--start', str(start),
        '--end', str(end),
        '--timeout', str(timeout),
        '--out-dir', str(out_dir),
    ]
    if raw_results_dir is not None:
        cmd.extend(['--raw-results-dir', str(raw_results_dir)])
    print('\n[run]', ' '.join(cmd))
    subprocess.run(cmd, check=True)


def load_model_summary(model: str, out_dir: Path) -> dict[str, Any]:
    """Load the per-model summary JSON written by the scoring script."""
    p = out_dir / f"tool_routing_{model_stem(model)}.json"
    data = json.loads(p.read_text(encoding='utf-8'))
    s = data['summary']
    s['model'] = model
    return s


def aggregate(summaries: list[dict[str, Any]]) -> dict[str, Any]:
    """Compute unweighted means of the headline metrics across models."""
    n = len(summaries)
    if n == 0:
        return {'n_models': 0}

    def avg(key: str) -> float:
        vals = [float(s[key]) for s in summaries]
        return round(sum(vals) / len(vals), 4)

    return {
        'n_models': n,
        'avg_first_accuracy': avg('first_accuracy'),
        'avg_primary_accuracy': avg('primary_accuracy'),
        'avg_chain_accuracy': avg('chain_accuracy'),
        'avg_success_rate': avg('success_rate'),
        'avg_tool_calls': avg('avg_tool_calls'),
        'avg_score_total': avg('avg_score_total'),
    }


def write_outputs(summaries: list[dict[str, Any]], agg: dict[str, Any], out_dir: Path) -> None:
    """Write the batch summary as JSON, CSV, and a Markdown report."""
    out_dir.mkdir(parents=True, exist_ok=True)
    json_path = out_dir / 'tool_routing_batch_summary.json'
    csv_path = out_dir / 'tool_routing_batch_summary.csv'
    md_path = out_dir / 'tool_routing_batch_summary.md'

    payload = {
        'aggregate': agg,
        'models': summaries,
    }
    json_path.write_text(json.dumps(payload, indent=2), encoding='utf-8')

    with csv_path.open('w', newline='', encoding='utf-8') as f:
        w = csv.DictWriter(
            f,
            fieldnames=[
                'model',
                'n_cases',
                'first_accuracy',
                'primary_accuracy',
                'chain_accuracy',
                'success_rate',
                'avg_tool_calls',
                'avg_score_total',
            ],
        )
        w.writeheader()
        for s in summaries:
            w.writerow({
                'model': s['model'],
                'n_cases': s['n_cases'],
                'first_accuracy': s['first_accuracy'],
                'primary_accuracy': s['primary_accuracy'],
                'chain_accuracy': s['chain_accuracy'],
                'success_rate': s['success_rate'],
                'avg_tool_calls': s['avg_tool_calls'],
                'avg_score_total': s['avg_score_total'],
            })

    lines = [
        '# Tool Routing Batch Summary',
        '',
        f"- Models: **{agg.get('n_models', 0)}**",
        '',
        '## Aggregate means',
        '',
        f"- Avg first-tool accuracy: **{agg.get('avg_first_accuracy')}**",
        f"- Avg primary-tool accuracy: **{agg.get('avg_primary_accuracy')}**",
        f"- Avg chain accuracy: **{agg.get('avg_chain_accuracy')}**",
        f"- Avg success rate: **{agg.get('avg_success_rate')}**",
        f"- Avg tool calls: **{agg.get('avg_tool_calls')}**",
        f"- Avg score (/10): **{agg.get('avg_score_total')}**",
        '',
        '## Per-model',
        '',
        '| Model | Cases | First acc | Primary acc | Chain acc | Success | Avg calls | Avg score |',
        '|---|---:|---:|---:|---:|---:|---:|---:|',
    ]
    for s in summaries:
        lines.append(
            f"| {s['model']} | {s['n_cases']} | {s['first_accuracy']} | {s['primary_accuracy']} | "
            f"{s['chain_accuracy']} | {s['success_rate']} | {s['avg_tool_calls']} | {s['avg_score_total']} |"
        )
    md_path.write_text('\n'.join(lines) + '\n', encoding='utf-8')

    print('\nWrote:')
    print(f'- {json_path}')
    print(f'- {csv_path}')
    print(f'- {md_path}')


def main() -> None:
    ap = argparse.ArgumentParser(description='Batch runner for tool-routing/confusion benchmark')
    ap.add_argument('--models', required=True, help='Comma-separated model IDs')
    ap.add_argument('--agent', required=True, help='Router agent name')
    ap.add_argument('--agent-cards', type=Path, required=True,
                    help='Path containing router agent card and tools')
    ap.add_argument('--prompts', type=Path, default=ROOT / 'scripts' / 'tool_routing_challenges.txt')
    ap.add_argument('--expected', type=Path, default=ROOT / 'scripts' / 'tool_routing_expected.json')
    ap.add_argument('--start', type=int, default=1)
    ap.add_argument('--end', type=int, default=20)
    ap.add_argument('--timeout', type=int, default=240)
    ap.add_argument('--out-dir', type=Path, default=OUT_DIR)
    ap.add_argument('--raw-results-dir', type=Path, default=None,
                    help='Root directory for fast-agent --results JSON files')
    args = ap.parse_args()

    models = [m.strip() for m in args.models.split(',') if m.strip()]
    for m in models:
        run_one(
            model=m,
            agent=args.agent,
            agent_cards=args.agent_cards,
            prompts=args.prompts,
            expected=args.expected,
            start=args.start,
            end=args.end,
            timeout=args.timeout,
            out_dir=args.out_dir,
            raw_results_dir=args.raw_results_dir,
        )

    summaries = [load_model_summary(m, args.out_dir) for m in models]
    # Sort best-first: by first-tool accuracy, then primary-tool accuracy,
    # then total score (all descending); ties broken by fewer tool calls,
    # then model name.
    summaries = sorted(
        summaries,
        key=lambda s: (
            -s['first_accuracy'],
            -s['primary_accuracy'],
            -s['avg_score_total'],
            s['avg_tool_calls'],
            s['model'],
        ),
    )
    agg = aggregate(summaries)
    write_outputs(summaries, agg, args.out_dir)


if __name__ == '__main__':
    main()
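
# Example invocation (illustrative only: the script path, agent name, and
# model IDs below are assumptions, not values defined by this repository):
#
#   python scripts/run_tool_routing_batch.py \
#       --models provider/model-a,provider/model-b \
#       --agent router \
#       --agent-cards agent-cards/ \
#       --start 1 --end 20 \
#       --out-dir docs/tool_routing_eval
#
# This runs the scoring script once per model, then writes
# tool_routing_batch_summary.{json,csv,md} under --out-dir.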