| | |
| | from __future__ import annotations |
| |
|
| | import argparse |
| | import csv |
| | import json |
| | import subprocess |
| | from pathlib import Path |
| | from typing import Any |
| |
|
# Directory one level above the folder that contains this file.
ROOT = Path(__file__).resolve().parents[1]
# Per-model scorer script that run_one() launches as a child process.
SCRIPT = ROOT.joinpath('scripts', 'score_tool_routing_confusion.py')
# Default value of the --out-dir option.
OUT_DIR = ROOT.joinpath('docs', 'tool_routing_eval')
| |
|
| |
|
def model_stem(model: str) -> str:
    """Return *model* with every ``/`` replaced by ``_``.

    The result is used as the model-specific part of the per-model result
    file name read by ``load_model_summary``.
    """
    return '_'.join(model.split('/'))
| |
|
| |
|
def run_one(
    model: str,
    agent: str,
    agent_cards: Path,
    prompts: Path,
    expected: Path,
    start: int,
    end: int,
    timeout: int,
    out_dir: Path,
    raw_results_dir: Path | None,
) -> None:
    """Run the scorer script (``SCRIPT``) once for a single model.

    Every argument is forwarded verbatim to the scorer as the command-line
    option of the same name (``--model``, ``--agent``, ``--agent-cards``,
    ``--prompts``, ``--expected``, ``--start``, ``--end``, ``--timeout``,
    ``--out-dir``). ``--raw-results-dir`` is only passed when
    *raw_results_dir* is not ``None``. How the scorer interprets these
    values is defined by that script, not here.

    The command line is echoed to stdout before it is executed.

    Raises:
        subprocess.CalledProcessError: if the scorer exits with a non-zero
            status (``check=True``).
    """
    # Local import: keeps this function self-contained with respect to the
    # module's import list.
    import sys

    # Launch the scorer with the interpreter that is running this script so
    # it sees the same virtualenv and installed packages; a bare 'python' is
    # resolved through PATH and may be a different interpreter or missing.
    # sys.executable can be empty/None when the interpreter path cannot be
    # determined, in which case the previous behaviour is kept.
    interpreter = sys.executable or 'python'

    cmd = [
        interpreter, str(SCRIPT),
        '--model', model,
        '--agent', agent,
        '--agent-cards', str(agent_cards),
        '--prompts', str(prompts),
        '--expected', str(expected),
        '--start', str(start),
        '--end', str(end),
        '--timeout', str(timeout),
        '--out-dir', str(out_dir),
    ]
    if raw_results_dir is not None:
        cmd.extend(['--raw-results-dir', str(raw_results_dir)])
    print('\n[run]', ' '.join(cmd))
    # check=True: a failing scorer run aborts the batch with an exception.
    subprocess.run(cmd, check=True)
| |
|
| |
|
def load_model_summary(model: str, out_dir: Path) -> dict[str, Any]:
    """Load the ``summary`` object from one model's result file.

    The file read is ``<out_dir>/tool_routing_<model_stem(model)>.json``
    (presumably written by the scorer script; verify against that script).
    The returned object is the file's ``summary`` entry with a ``model`` key
    set to the original, unsanitised model ID.
    """
    result_file = out_dir / f"tool_routing_{model_stem(model)}.json"
    with result_file.open(encoding='utf-8') as handle:
        report = json.load(handle)
    summary = report['summary']
    summary['model'] = model
    return summary
| |
|
| |
|
def aggregate(summaries: list[dict[str, Any]]) -> dict[str, Any]:
    """Return the unweighted mean of each headline metric across models.

    Each model counts once regardless of its number of cases. Means are
    rounded to four decimal places. For an empty list only
    ``{'n_models': 0}`` is returned.
    """
    model_count = len(summaries)
    if not model_count:
        return {'n_models': 0}

    # (key in the result, key read from each per-model summary)
    metric_map = (
        ('avg_first_accuracy', 'first_accuracy'),
        ('avg_primary_accuracy', 'primary_accuracy'),
        ('avg_chain_accuracy', 'chain_accuracy'),
        ('avg_success_rate', 'success_rate'),
        ('avg_tool_calls', 'avg_tool_calls'),
        ('avg_score_total', 'avg_score_total'),
    )

    result: dict[str, Any] = {'n_models': model_count}
    for result_key, source_key in metric_map:
        total = sum([float(entry[source_key]) for entry in summaries])
        result[result_key] = round(total / model_count, 4)
    return result
| |
|
| |
|
def write_outputs(summaries: list[dict[str, Any]], agg: dict[str, Any], out_dir: Path) -> None:
    """Write the batch summary to *out_dir* as JSON, CSV and Markdown.

    *out_dir* is created if needed. Three files are produced, in this order:

    * ``tool_routing_batch_summary.json`` - ``{'aggregate': agg, 'models': summaries}``
    * ``tool_routing_batch_summary.csv`` - one row per model, fixed columns
    * ``tool_routing_batch_summary.md`` - aggregate bullet list plus a per-model table

    The three paths are printed to stdout afterwards. Each entry in
    *summaries* must contain every column key (a missing one raises
    ``KeyError``); aggregate values missing from *agg* are rendered as ``None``.
    """
    # Per-model columns, shared by the CSV rows and the Markdown table.
    columns = (
        'model', 'n_cases', 'first_accuracy', 'primary_accuracy', 'chain_accuracy',
        'success_rate', 'avg_tool_calls', 'avg_score_total',
    )
    # (bullet label, key looked up in agg) for the aggregate section.
    aggregate_bullets = (
        ('Avg first-tool accuracy', 'avg_first_accuracy'),
        ('Avg primary-tool accuracy', 'avg_primary_accuracy'),
        ('Avg chain accuracy', 'avg_chain_accuracy'),
        ('Avg success rate', 'avg_success_rate'),
        ('Avg tool calls', 'avg_tool_calls'),
        ('Avg score (/10)', 'avg_score_total'),
    )

    out_dir.mkdir(parents=True, exist_ok=True)
    json_path = out_dir / 'tool_routing_batch_summary.json'
    csv_path = out_dir / 'tool_routing_batch_summary.csv'
    md_path = out_dir / 'tool_routing_batch_summary.md'

    # JSON: the full payload, including any extra keys in the summaries.
    json_path.write_text(
        json.dumps({'aggregate': agg, 'models': summaries}, indent=2),
        encoding='utf-8',
    )

    # CSV: only the fixed columns are written.
    with csv_path.open('w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=list(columns))
        writer.writeheader()
        writer.writerows({col: entry[col] for col in columns} for entry in summaries)

    # Markdown: header, aggregate bullets, then one table row per model.
    report = [
        '# Tool Routing Batch Summary',
        '',
        f"- Models: **{agg.get('n_models', 0)}**",
        '',
        '## Aggregate means',
        '',
    ]
    report.extend(f"- {label}: **{agg.get(key)}**" for label, key in aggregate_bullets)
    report += [
        '',
        '## Per-model',
        '',
        '| Model | Cases | First acc | Primary acc | Chain acc | Success | Avg calls | Avg score |',
        '|---|---:|---:|---:|---:|---:|---:|---:|',
    ]
    report.extend(
        '| ' + ' | '.join(f"{entry[col]}" for col in columns) + ' |'
        for entry in summaries
    )
    md_path.write_text('\n'.join(report) + '\n', encoding='utf-8')

    print('\nWrote:')
    for written in (json_path, csv_path, md_path):
        print(f'- {written}')
| |
|
| |
|
def main() -> None:
    """Command-line entry point for the batch benchmark run.

    Parses the options, runs the scorer once per model listed in
    ``--models`` (in the order given), then loads every per-model summary,
    ranks them and writes the combined JSON/CSV/Markdown summary to
    ``--out-dir``. All scorer runs complete before any summary is loaded.
    """
    parser = argparse.ArgumentParser(description='Batch runner for tool-routing/confusion benchmark')
    parser.add_argument('--models', required=True, help='Comma-separated model IDs')
    parser.add_argument('--agent', required=True, help='Router agent name')
    parser.add_argument('--agent-cards', type=Path, required=True, help='Path containing router agent card and tools')
    parser.add_argument('--prompts', type=Path, default=ROOT / 'scripts' / 'tool_routing_challenges.txt')
    parser.add_argument('--expected', type=Path, default=ROOT / 'scripts' / 'tool_routing_expected.json')
    parser.add_argument('--start', type=int, default=1)
    parser.add_argument('--end', type=int, default=20)
    parser.add_argument('--timeout', type=int, default=240)
    parser.add_argument('--out-dir', type=Path, default=OUT_DIR)
    parser.add_argument('--raw-results-dir', type=Path, default=None, help='Root directory for fast-agent --results JSON files')
    opts = parser.parse_args()

    # Split the comma-separated list, dropping surrounding blanks and empty items.
    model_ids = [token for token in map(str.strip, opts.models.split(',')) if token]

    # Options that are identical for every scorer invocation.
    shared_options = {
        'agent': opts.agent,
        'agent_cards': opts.agent_cards,
        'prompts': opts.prompts,
        'expected': opts.expected,
        'start': opts.start,
        'end': opts.end,
        'timeout': opts.timeout,
        'out_dir': opts.out_dir,
        'raw_results_dir': opts.raw_results_dir,
    }
    for model_id in model_ids:
        run_one(model=model_id, **shared_options)

    def ranking_key(entry: dict[str, Any]) -> tuple[Any, ...]:
        # Higher accuracies and score first, then fewer tool calls, then name.
        return (
            -entry['first_accuracy'],
            -entry['primary_accuracy'],
            -entry['avg_score_total'],
            entry['avg_tool_calls'],
            entry['model'],
        )

    summaries = [load_model_summary(model_id, opts.out_dir) for model_id in model_ids]
    summaries.sort(key=ranking_key)
    write_outputs(summaries, aggregate(summaries), opts.out_dir)
| |
|
| |
|
# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
| |
|