# hf-papers / scripts / run_tool_routing_batch.py
# Last sync: "sync: promote hf_hub_community prompt v3 + add prompt/coverage harness"
# (evalstate, HF Staff — commit bba4fab, verified)
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import csv
import json
import subprocess
import sys
from pathlib import Path
from typing import Any
# Repository root: this file lives in <root>/scripts/, so go up one level.
ROOT = Path(__file__).resolve().parents[1]
# Single-model scoring script that run_one() invokes as a subprocess.
SCRIPT = ROOT / 'scripts' / 'score_tool_routing_confusion.py'
# Default output directory for per-model and batch summary artifacts.
OUT_DIR = ROOT / 'docs' / 'tool_routing_eval'
def model_stem(model: str) -> str:
    """Return *model* with '/' separators flattened so it is safe in a filename."""
    return '_'.join(model.split('/'))
def run_one(
    model: str,
    agent: str,
    agent_cards: Path,
    prompts: Path,
    expected: Path,
    start: int,
    end: int,
    timeout: int,
    out_dir: Path,
    raw_results_dir: Path | None,
) -> None:
    """Score one model by running the single-model scorer as a subprocess.

    Builds the argument vector for ``score_tool_routing_confusion.py`` and
    executes it, echoing the command first. With ``check=True`` a non-zero
    exit from the scorer raises ``subprocess.CalledProcessError``, aborting
    the batch.
    """
    cmd = [
        # Use the interpreter running this script, not whatever 'python'
        # happens to resolve to on PATH (may be absent or a different venv).
        sys.executable, str(SCRIPT),
        '--model', model,
        '--agent', agent,
        '--agent-cards', str(agent_cards),
        '--prompts', str(prompts),
        '--expected', str(expected),
        '--start', str(start),
        '--end', str(end),
        '--timeout', str(timeout),
        '--out-dir', str(out_dir),
    ]
    if raw_results_dir is not None:
        cmd.extend(['--raw-results-dir', str(raw_results_dir)])
    print('\n[run]', ' '.join(cmd))
    subprocess.run(cmd, check=True)
def load_model_summary(model: str, out_dir: Path) -> dict[str, Any]:
    """Load the per-model results JSON and return its summary tagged with the model id."""
    # Filenames flatten '/' in the model id to '_' (same scheme as model_stem).
    result_path = out_dir / f"tool_routing_{model.replace('/', '_')}.json"
    payload = json.loads(result_path.read_text(encoding='utf-8'))
    summary = payload['summary']
    summary['model'] = model
    return summary
def aggregate(summaries: list[dict[str, Any]]) -> dict[str, Any]:
    """Compute mean metrics across per-model summaries, rounded to 4 places.

    Returns ``{'n_models': 0}`` when *summaries* is empty.
    """
    if not summaries:
        return {'n_models': 0}

    def mean_of(metric: str) -> float:
        values = [float(item[metric]) for item in summaries]
        return round(sum(values) / len(values), 4)

    # Output key -> source metric key in each per-model summary.
    metric_map = {
        'avg_first_accuracy': 'first_accuracy',
        'avg_primary_accuracy': 'primary_accuracy',
        'avg_chain_accuracy': 'chain_accuracy',
        'avg_success_rate': 'success_rate',
        'avg_tool_calls': 'avg_tool_calls',
        'avg_score_total': 'avg_score_total',
    }
    result: dict[str, Any] = {'n_models': len(summaries)}
    for out_key, src_key in metric_map.items():
        result[out_key] = mean_of(src_key)
    return result
def write_outputs(summaries: list[dict[str, Any]], agg: dict[str, Any], out_dir: Path) -> None:
    """Write the batch summary as JSON, CSV, and Markdown under *out_dir*.

    Creates *out_dir* if needed, then prints the three output paths.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    json_path = out_dir / 'tool_routing_batch_summary.json'
    csv_path = out_dir / 'tool_routing_batch_summary.csv'
    md_path = out_dir / 'tool_routing_batch_summary.md'

    # JSON: aggregate means plus the raw per-model summaries.
    json_path.write_text(
        json.dumps({'aggregate': agg, 'models': summaries}, indent=2),
        encoding='utf-8',
    )

    # CSV: one row per model, restricted to these columns.
    columns = [
        'model', 'n_cases', 'first_accuracy', 'primary_accuracy', 'chain_accuracy',
        'success_rate', 'avg_tool_calls', 'avg_score_total',
    ]
    with csv_path.open('w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for summary in summaries:
            writer.writerow({col: summary[col] for col in columns})

    # Markdown: aggregate bullets followed by a per-model table.
    md_lines = [
        '# Tool Routing Batch Summary',
        '',
        f"- Models: **{agg.get('n_models', 0)}**",
        '',
        '## Aggregate means',
        '',
        f"- Avg first-tool accuracy: **{agg.get('avg_first_accuracy')}**",
        f"- Avg primary-tool accuracy: **{agg.get('avg_primary_accuracy')}**",
        f"- Avg chain accuracy: **{agg.get('avg_chain_accuracy')}**",
        f"- Avg success rate: **{agg.get('avg_success_rate')}**",
        f"- Avg tool calls: **{agg.get('avg_tool_calls')}**",
        f"- Avg score (/10): **{agg.get('avg_score_total')}**",
        '',
        '## Per-model',
        '',
        '| Model | Cases | First acc | Primary acc | Chain acc | Success | Avg calls | Avg score |',
        '|---|---:|---:|---:|---:|---:|---:|---:|',
    ]
    md_lines.extend(
        f"| {s['model']} | {s['n_cases']} | {s['first_accuracy']} | {s['primary_accuracy']} | {s['chain_accuracy']} | {s['success_rate']} | {s['avg_tool_calls']} | {s['avg_score_total']} |"
        for s in summaries
    )
    md_path.write_text('\n'.join(md_lines) + '\n', encoding='utf-8')

    print('\nWrote:')
    for path in (json_path, csv_path, md_path):
        print(f'- {path}')
def main() -> None:
    """CLI entry point: run the benchmark per model, then aggregate and report."""
    parser = argparse.ArgumentParser(description='Batch runner for tool-routing/confusion benchmark')
    parser.add_argument('--models', required=True, help='Comma-separated model IDs')
    parser.add_argument('--agent', required=True, help='Router agent name')
    parser.add_argument('--agent-cards', type=Path, required=True, help='Path containing router agent card and tools')
    parser.add_argument('--prompts', type=Path, default=ROOT / 'scripts' / 'tool_routing_challenges.txt')
    parser.add_argument('--expected', type=Path, default=ROOT / 'scripts' / 'tool_routing_expected.json')
    parser.add_argument('--start', type=int, default=1)
    parser.add_argument('--end', type=int, default=20)
    parser.add_argument('--timeout', type=int, default=240)
    parser.add_argument('--out-dir', type=Path, default=OUT_DIR)
    parser.add_argument('--raw-results-dir', type=Path, default=None, help='Root directory for fast-agent --results JSON files')
    args = parser.parse_args()

    # Drop empty entries so trailing/duplicated commas are harmless.
    models = [part.strip() for part in args.models.split(',') if part.strip()]

    for model in models:
        run_one(
            model=model,
            agent=args.agent,
            agent_cards=args.agent_cards,
            prompts=args.prompts,
            expected=args.expected,
            start=args.start,
            end=args.end,
            timeout=args.timeout,
            out_dir=args.out_dir,
            raw_results_dir=args.raw_results_dir,
        )

    summaries = [load_model_summary(model, args.out_dir) for model in models]
    # Rank: first-tool accuracy desc, then primary accuracy desc, then total
    # score desc, then fewer tool calls, then model name as a tiebreaker.
    summaries.sort(
        key=lambda item: (
            -item['first_accuracy'],
            -item['primary_accuracy'],
            -item['avg_score_total'],
            item['avg_tool_calls'],
            item['model'],
        )
    )
    write_outputs(summaries, aggregate(summaries), args.out_dir)
# Entry guard: allow importing this module without side effects.
if __name__ == '__main__':
    main()