#!/usr/bin/env python3
"""
Tool Use Evaluation for Stack 2.9 [DEPRECATED]
==============================================

⚠️  WARNING: This evaluation script is DEPRECATED and the methodology is INVALID.

This evaluator uses a naive keyword-matching simulation, not actual model inference.
There is no proper benchmark implementation for tool calling. The claimed 94.1%
score is unverifiable and misleading.

A proper tool use benchmark needs to be built with 500+ realistic test cases and
actual model calls. This script remains only as a placeholder.

See EVALUATION.md for the full audit report.
"""

import argparse
import json
import os
import random
import subprocess
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Optional, List, Dict, Any

# Tool categories and their hand-written test cases.
#
# Each category maps to:
#   "description": short human-readable summary,
#   "tools":       every tool name that appears as an "expected_tool" in the
#                  category's test cases,
#   "test_cases":  dicts with "task" (prompt text), "expected_tool", and
#                  "expected_params".
TOOL_CATEGORIES = {
    "file_operations": {
        "description": "File read, write, edit, and glob operations",
        "tools": ["FileReadTool", "FileWriteTool", "FileEditTool", "GlobTool"],
        "test_cases": [
            # FileReadTool tests
            {"task": "Read the contents of /etc/hostname", "expected_tool": "FileReadTool", "expected_params": {"path": "/etc/hostname"}},
            {"task": "Show me what's in README.md", "expected_tool": "FileReadTool", "expected_params": {"path": "README.md"}},
            {"task": "Display the contents of config.json", "expected_tool": "FileReadTool", "expected_params": {"path": "config.json"}},
            {"task": "Cat the file /tmp/test.txt", "expected_tool": "FileReadTool", "expected_params": {"path": "/tmp/test.txt"}},
            {"task": "View the python file main.py", "expected_tool": "FileReadTool", "expected_params": {"path": "main.py"}},
            {"task": "Show me the contents of the src directory", "expected_tool": "GlobTool", "expected_params": {"pattern": "src/**/*"}},
            {"task": "Find all Python files in the project", "expected_tool": "GlobTool", "expected_params": {"pattern": "**/*.py"}},
            {"task": "List all JSON files", "expected_tool": "GlobTool", "expected_params": {"pattern": "**/*.json"}},
            {"task": "Find all markdown files", "expected_tool": "GlobTool", "expected_params": {"pattern": "**/*.md"}},
            {"task": "Show all files in the current directory", "expected_tool": "GlobTool", "expected_params": {"pattern": "*"}},
            # FileWriteTool tests
            {"task": "Create a file called hello.txt with content 'Hello World'", "expected_tool": "FileWriteTool", "expected_params": {"path": "hello.txt", "content": "Hello World"}},
            {"task": "Write 'export PATH=/usr/bin' to .bashrc", "expected_tool": "FileWriteTool", "expected_params": {"path": ".bashrc"}},
            {"task": "Save the data to output.json", "expected_tool": "FileWriteTool", "expected_params": {"path": "output.json"}},
            {"task": "Create a new file test.py with shebang", "expected_tool": "FileWriteTool", "expected_params": {"path": "test.py"}},
            {"task": "Write the configuration to config.yaml", "expected_tool": "FileWriteTool", "expected_params": {"path": "config.yaml"}},
            # FileEditTool tests
            {"task": "Replace 'foo' with 'bar' in file.txt", "expected_tool": "FileEditTool", "expected_params": {"path": "file.txt"}},
            {"task": "Add a new line to the end of notes.txt", "expected_tool": "FileEditTool", "expected_params": {"path": "notes.txt"}},
            {"task": "Update the version number in package.json", "expected_tool": "FileEditTool", "expected_params": {"path": "package.json"}},
            {"task": "Remove the debug statement from main.py", "expected_tool": "FileEditTool", "expected_params": {"path": "main.py"}},
            {"task": "Edit the config file to enable debug mode", "expected_tool": "FileEditTool", "expected_params": {"path": "config.json"}},
        ]
    },
    "git_operations": {
        "description": "Git commands for version control",
        "tools": ["BashTool"],
        "test_cases": [
            {"task": "Check the git status", "expected_tool": "BashTool", "expected_params": {"command": "git status"}},
            {"task": "Show me the git log", "expected_tool": "BashTool", "expected_params": {"command": "git log --oneline -10"}},
            {"task": "Create a new branch called feature-x", "expected_tool": "BashTool", "expected_params": {"command": "git checkout -b feature-x"}},
            {"task": "Commit all changes with message 'fix bug'", "expected_tool": "BashTool", "expected_params": {"command": "git add -A && git commit -m 'fix bug'"}},
            {"task": "Show the differences in main.py", "expected_tool": "BashTool", "expected_params": {"command": "git diff main.py"}},
            {"task": "Push to origin main", "expected_tool": "BashTool", "expected_params": {"command": "git push origin main"}},
            {"task": "Pull latest changes from remote", "expected_tool": "BashTool", "expected_params": {"command": "git pull"}},
            {"task": "Show which files changed in last commit", "expected_tool": "BashTool", "expected_params": {"command": "git diff --name-only HEAD~1..HEAD"}},
            {"task": "List all git branches", "expected_tool": "BashTool", "expected_params": {"command": "git branch -a"}},
            {"task": "Show the current git branch", "expected_tool": "BashTool", "expected_params": {"command": "git branch --show-current"}},
            {"task": "Stash current changes", "expected_tool": "BashTool", "expected_params": {"command": "git stash"}},
            {"task": "Apply stashed changes", "expected_tool": "BashTool", "expected_params": {"command": "git stash pop"}},
            {"task": "Show remotes", "expected_tool": "BashTool", "expected_params": {"command": "git remote -v"}},
            {"task": "Merge feature branch into main", "expected_tool": "BashTool", "expected_params": {"command": "git merge feature"}},
            {"task": "Rebase onto latest main", "expected_tool": "BashTool", "expected_params": {"command": "git rebase main"}},
        ]
    },
    "search_operations": {
        "description": "Search and grep operations",
        "tools": ["GrepTool", "WebSearchTool"],
        "test_cases": [
            {"task": "Search for 'TODO' in all Python files", "expected_tool": "GrepTool", "expected_params": {"pattern": "TODO", "files": "**/*.py"}},
            {"task": "Find all occurrences of 'debug' in src/", "expected_tool": "GrepTool", "expected_params": {"pattern": "debug", "files": "src/**/*"}},
            {"task": "Search for function definitions", "expected_tool": "GrepTool", "expected_params": {"pattern": "^def ", "files": "**/*.py"}},
            {"task": "Find imports in main.py", "expected_tool": "GrepTool", "expected_params": {"pattern": "^import |^from ", "files": "main.py"}},
            {"task": "Search for console.log in JavaScript files", "expected_tool": "GrepTool", "expected_params": {"pattern": "console.log", "files": "**/*.js"}},
            {"task": "Find all TODO comments", "expected_tool": "GrepTool", "expected_params": {"pattern": "TODO|FIXME", "files": "**/*"}},
            {"task": "Search the web for Python tutorials", "expected_tool": "WebSearchTool", "expected_params": {"query": "Python tutorials"}},
            {"task": "Search for how to use git rebase", "expected_tool": "WebSearchTool", "expected_params": {"query": "git rebase tutorial"}},
            {"task": "Look up documentation for async/await", "expected_tool": "WebSearchTool", "expected_params": {"query": "async await JavaScript documentation"}},
            {"task": "Find best practices for REST API design", "expected_tool": "WebSearchTool", "expected_params": {"query": "REST API design best practices"}},
        ]
    },
    "execution_operations": {
        "description": "Shell and command execution",
        "tools": ["BashTool"],
        "test_cases": [
            {"task": "List all files in current directory", "expected_tool": "BashTool", "expected_params": {"command": "ls -la"}},
            {"task": "Show current working directory", "expected_tool": "BashTool", "expected_params": {"command": "pwd"}},
            {"task": "Check Python version", "expected_tool": "BashTool", "expected_params": {"command": "python3 --version"}},
            {"task": "Run pytest on tests/", "expected_tool": "BashTool", "expected_params": {"command": "pytest tests/ -v"}},
            {"task": "Install requirements.txt", "expected_tool": "BashTool", "expected_params": {"command": "pip install -r requirements.txt"}},
            {"task": "Check disk usage", "expected_tool": "BashTool", "expected_params": {"command": "df -h"}},
            {"task": "Show memory usage", "expected_tool": "BashTool", "expected_params": {"command": "free -m"}},
            {"task": "Check running processes", "expected_tool": "BashTool", "expected_params": {"command": "ps aux | head -20"}},
            {"task": "Find large files", "expected_tool": "BashTool", "expected_params": {"command": "find . -type f -size +100M"}},
            {"task": "Count lines in Python files", "expected_tool": "BashTool", "expected_params": {"command": "find . -name '*.py' | xargs wc -l"}},
            {"task": "Kill process on port 3000", "expected_tool": "BashTool", "expected_params": {"command": "lsof -ti:3000 | xargs kill"}},
            {"task": "Start a Python HTTP server", "expected_tool": "BashTool", "expected_params": {"command": "python3 -m http.server 8000"}},
            {"task": "Check if port 5432 is open", "expected_tool": "BashTool", "expected_params": {"command": "nc -zv localhost 5432"}},
            {"task": "Show network connections", "expected_tool": "BashTool", "expected_params": {"command": "netstat -tuln"}},
            {"task": "Check DNS for example.com", "expected_tool": "BashTool", "expected_params": {"command": "dig example.com"}},
        ]
    },
    "task_operations": {
        "description": "Task and todo management",
        # Includes TaskGetTool / TaskStopTool / TaskOutputTool because the
        # test cases below expect them.
        "tools": [
            "TaskCreateTool", "TaskListTool", "TaskUpdateTool", "TodoWriteTool",
            "TaskGetTool", "TaskStopTool", "TaskOutputTool",
        ],
        "test_cases": [
            {"task": "Create a task to fix the login bug", "expected_tool": "TaskCreateTool", "expected_params": {"title": "Fix login bug"}},
            {"task": "List all pending tasks", "expected_tool": "TaskListTool", "expected_params": {}},
            {"task": "Mark task #123 as complete", "expected_tool": "TaskUpdateTool", "expected_params": {"task_id": "123", "status": "completed"}},
            {"task": "Add a todo item for code review", "expected_tool": "TodoWriteTool", "expected_params": {"content": "Code review"}},
            {"task": "Show me the task with ID 42", "expected_tool": "TaskGetTool", "expected_params": {"task_id": "42"}},
            {"task": "Stop the currently running task", "expected_tool": "TaskStopTool", "expected_params": {}},
            {"task": "Update task priority", "expected_tool": "TaskUpdateTool", "expected_params": {"task_id": "123", "priority": "high"}},
            {"task": "Create a subtask under task #5", "expected_tool": "TaskCreateTool", "expected_params": {"title": "Subtask", "parent_id": "5"}},
            {"task": "Get output of task #99", "expected_tool": "TaskOutputTool", "expected_params": {"task_id": "99"}},
            {"task": "Delete completed tasks", "expected_tool": "TodoWriteTool", "expected_params": {"filter": "completed", "action": "delete"}},
        ]
    },
    "web_operations": {
        "description": "Web fetch and API operations",
        "tools": ["WebFetchTool", "WebSearchTool"],
        "test_cases": [
            {"task": "Fetch the README from GitHub", "expected_tool": "WebFetchTool", "expected_params": {"url": "https://github.com/example/repo"}},
            {"task": "Get the weather for New York", "expected_tool": "WebSearchTool", "expected_params": {"query": "weather New York"}},
            {"task": "Look up Python documentation", "expected_tool": "WebFetchTool", "expected_params": {"url": "https://docs.python.org/"}},
            {"task": "Search for OpenAI API docs", "expected_tool": "WebSearchTool", "expected_params": {"query": "OpenAI API documentation"}},
            {"task": "Get the latest news about AI", "expected_tool": "WebSearchTool", "expected_params": {"query": "AI news 2024"}},
            {"task": "Fetch content from a URL", "expected_tool": "WebFetchTool", "expected_params": {"url": "https://example.com/api/data"}},
        ]
    },
    "config_operations": {
        "description": "Configuration and settings",
        "tools": ["ConfigTool", "SkillTool"],
        "test_cases": [
            {"task": "Show the current configuration", "expected_tool": "ConfigTool", "expected_params": {}},
            {"task": "List all available skills", "expected_tool": "SkillTool", "expected_params": {"action": "list"}},
            {"task": "Show config for git integration", "expected_tool": "ConfigTool", "expected_params": {"section": "git"}},
            {"task": "Get skill documentation for coding", "expected_tool": "SkillTool", "expected_params": {"skill": "coding", "action": "info"}},
            {"task": "Update the timeout setting", "expected_tool": "ConfigTool", "expected_params": {"key": "timeout", "value": "30"}},
            {"task": "List configured API keys", "expected_tool": "ConfigTool", "expected_params": {"section": "api_keys"}},
        ]
    },
    "agent_operations": {
        "description": "Multi-agent and team operations",
        # Includes the worktree tools because the test cases below expect them.
        "tools": [
            "TeamCreateTool", "TeamDeleteTool", "EnterPlanModeTool", "ExitPlanModeTool",
            "EnterWorktreeTool", "ExitWorktreeTool",
        ],
        "test_cases": [
            {"task": "Create a team for the project", "expected_tool": "TeamCreateTool", "expected_params": {"name": "project-team"}},
            {"task": "Delete the old team", "expected_tool": "TeamDeleteTool", "expected_params": {"team": "old-team"}},
            {"task": "Enter plan mode to review changes", "expected_tool": "EnterPlanModeTool", "expected_params": {}},
            {"task": "Exit plan mode and continue", "expected_tool": "ExitPlanModeTool", "expected_params": {}},
            {"task": "Enter worktree for feature branch", "expected_tool": "EnterWorktreeTool", "expected_params": {"branch": "feature-x"}},
            {"task": "Exit current worktree", "expected_tool": "ExitWorktreeTool", "expected_params": {}},
        ]
    }
}


@dataclass
class ToolTestCase:
    """Single tool test case: a task prompt plus the tool call it should produce."""
    category: str  # category key the case is filed under (e.g. a TOOL_CATEGORIES key)
    task: str  # natural-language task text passed to the predictor
    expected_tool: str  # name of the tool that should be selected
    expected_params: Dict[str, Any]  # parameters the tool call is expected to carry


@dataclass
class ToolEvalResult:
    """Result for a single tool evaluation."""
    category: str  # category of the evaluated test case
    task: str  # task text that was evaluated
    expected_tool: str  # tool the test case expected
    predicted_tool: Optional[str]  # tool the predictor chose; None if prediction raised
    tool_correct: bool  # predicted_tool == expected_tool
    params_correct: bool  # outcome of the parameter check
    execution_success: bool  # whether executing the predicted call succeeded
    error: Optional[str] = None  # error message from execution or a caught exception
    latency_ms: float = 0.0  # wall-clock time spent on this case, in milliseconds


@dataclass
class ToolEvalSummary:
    """Aggregated tool evaluation summary."""
    model: str  # model label the run was started with
    timestamp: str  # ISO-format timestamp string of the run
    total_cases: int  # number of evaluated cases
    tool_selection_accuracy: float  # fraction of cases with the correct tool
    parameter_accuracy: float  # fraction of cases passing the parameter check
    execution_success_rate: float  # fraction of cases whose execution succeeded
    overall_success_rate: float  # combined score reported as the headline metric
    category_results: Dict[str, Dict[str, float]]  # per-category metric breakdown
    results: List[Dict] = field(default_factory=list)  # per-case result dicts


class ToolUseEvaluator:
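    """
    Tool Use Evaluation System (keyword-matching simulation).
    
    Evaluates tool selection, parameter extraction, and execution success
    over the hand-written cases in TOOL_CATEGORIES plus templated variations.
    NOTE: the built-in suite is far smaller than the 500+ cases originally
    advertised (88 hand-written cases plus 5 variations per template).
    """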
    
    def __init__(self, model: str = "stack-2.9"):
        """Store the model label and build the full list of test cases up front."""
        self.model = model
        self.test_cases = self._generate_test_cases()
    
    def _generate_test_cases(self) -> List[ToolTestCase]:
        """Build the suite: hand-written TOOL_CATEGORIES cases, then templated variations."""
        handwritten = [
            ToolTestCase(
                category=category_name,
                task=spec["task"],
                expected_tool=spec["expected_tool"],
                expected_params=spec.get("expected_params", {}),
            )
            for category_name, category in TOOL_CATEGORIES.items()
            for spec in category["test_cases"]
        ]

        # Templated variations always follow the hand-written cases.
        return handwritten + self._generate_variations()
    
    def _generate_variations(self) -> List[ToolTestCase]:
        """
        Generate additional test cases by filling in task templates.

        Each template is a tuple of (category, task template, expected tool,
        expected-params template). Every template is expanded 5 times; on
        round ``i`` the placeholders take the ``i``-th entry (modulo length)
        of the value lists below.

        Returns:
            One ToolTestCase per (template, round) pair, in template order.

        NOTE(review): several templates carry hard-coded expected params that
        do not follow the substituted task text (e.g. "Read {path}" always
        expects "/etc/passwd"). Left unchanged here; confirm whether intended.
        """
        # File operation variations
        file_variations = [
            ("file_operations", "Read {path}", "FileReadTool", {"path": "/etc/passwd"}),
            ("file_operations", "Show me {path}", "FileReadTool", {"path": ".env"}),
            ("file_operations", "Display {path}", "FileReadTool", {"path": "docker-compose.yml"}),
            ("file_operations", "Open {path}", "FileReadTool", {"path": "script.py"}),
            ("file_operations", "Find all {ext} files", "GlobTool", {"pattern": "**/*.{ext}"}),
            ("file_operations", "Locate all {ext} files", "GlobTool", {"pattern": "**/*.{ext}"}),
            ("file_operations", "Write 'test' to {path}", "FileWriteTool", {"path": "test.txt", "content": "test"}),
            ("file_operations", "Create {path} with data", "FileWriteTool", {"path": "data.csv"}),
            ("file_operations", "Edit {path} to change X", "FileEditTool", {"path": "config.yml"}),
        ]

        # Git variations
        git_variations = [
            ("git_operations", "git {command}", "BashTool", {"command": "git status -sb"}),
            ("git_operations", "Show git {subcommand}", "BashTool", {"command": "git show --stat"}),
            ("git_operations", "Run git {cmd}", "BashTool", {"command": "git log -5 --graph"}),
        ]

        # Search variations
        search_variations = [
            ("search_operations", "grep for {pattern} in {files}", "GrepTool", {"pattern": "{pattern}", "files": "{files}"}),
            ("search_operations", "Find {pattern} in codebase", "GrepTool", {"pattern": "{pattern}", "files": "**/*"}),
            ("search_operations", "Search web for {query}", "WebSearchTool", {"query": "{query}"}),
        ]

        # Execution variations
        exec_variations = [
            ("execution_operations", "Run {command}", "BashTool", {"command": "{command}"}),
            ("execution_operations", "Execute {command}", "BashTool", {"command": "{command}"}),
            ("execution_operations", "Run shell command {cmd}", "BashTool", {"command": "{cmd}"}),
        ]

        all_variations = file_variations + git_variations + search_variations + exec_variations

        # Placeholder values cycled through on each round
        paths = ["src/main.py", "lib/utils.js", "docs/README.md", "tests/test.py", "config/settings.json"]
        extensions = ["py", "js", "ts", "go", "rs", "java", "rb"]
        git_cmds = ["stash list", "tag -l", "reflog", "shortlog -sn", "ls-files"]
        patterns = ["function", "class", "const", "let", "var", "async", "await"]

        variations = []
        for category, task, tool, params in all_variations:
            for i in range(5):  # 5 variations each
                git_cmd = git_cmds[i % len(git_cmds)]
                substitutions = {
                    "path": paths[i % len(paths)],
                    "ext": extensions[i % len(extensions)],
                    "command": git_cmd,
                    # "Show git {subcommand}" needs this key; without it
                    # str.format raises KeyError and the suite cannot be built.
                    "subcommand": git_cmd,
                    "pattern": patterns[i % len(patterns)],
                    "files": "**/*.py",
                    "query": "example query",
                    "cmd": "ls",
                }

                variations.append(ToolTestCase(
                    category=category,
                    task=task.format(**substitutions),
                    expected_tool=tool,
                    # Only string values are templates; pass anything else through.
                    expected_params={
                        key: value.format(**substitutions) if isinstance(value, str) else value
                        for key, value in params.items()
                    },
                ))

        return variations
    
    def predict_tool(self, task: str) -> tuple[str, Dict[str, Any]]:
        """
        Predict which tool to use for a task.

        This is a keyword-based simulation, not model inference: the lowercased
        task text is checked for substrings in a fixed order and the first
        matching rule wins. In production, this would call the actual model.

        Args:
            task: Natural-language task description.

        Returns:
            A ``(tool_name, params)`` tuple; params are fixed placeholder
            values for the chosen tool, not values extracted from the task.
        """
        task_lower = task.lower()

        def mentions(*keywords: str) -> bool:
            """Return True if any keyword occurs as a substring of the task."""
            return any(keyword in task_lower for keyword in keywords)

        # Task management is checked first: the generic 'create' / 'update' /
        # 'show' verbs below would otherwise return before these rules run,
        # leaving the TaskCreateTool and TaskUpdateTool branches unreachable.
        if 'task' in task_lower:
            if 'create' in task_lower:
                return "TaskCreateTool", {"title": "New task"}
            if 'list' in task_lower:
                return "TaskListTool", {}
            if 'update' in task_lower:
                return "TaskUpdateTool", {"task_id": "1"}
            return "TaskGetTool", {"task_id": "1"}

        if mentions('read', 'show', 'display', 'view', 'cat', 'open'):
            if mentions('pattern', 'find'):
                return "GlobTool", {"pattern": "**/*"}
            return "FileReadTool", {"path": "/tmp/file.txt"}

        if mentions('write', 'create', 'save', 'make file'):
            return "FileWriteTool", {"path": "output.txt", "content": ""}

        if mentions('edit', 'replace', 'update', 'modify', 'change'):
            return "FileEditTool", {"path": "file.txt"}

        if mentions('grep', 'search'):
            if mentions('web', 'internet'):
                return "WebSearchTool", {"query": "search"}
            return "GrepTool", {"pattern": "TODO", "files": "**/*.py"}

        if mentions('git', 'commit', 'push', 'pull', 'branch'):
            return "BashTool", {"command": "git status"}

        if mentions('run', 'execute', 'shell', 'bash', 'command'):
            return "BashTool", {"command": "ls -la"}

        if 'todo' in task_lower:
            return "TodoWriteTool", {"content": "New todo"}

        if mentions('fetch', 'url'):
            return "WebFetchTool", {"url": "https://example.com"}

        if 'config' in task_lower:
            return "ConfigTool", {}

        if 'skill' in task_lower:
            return "SkillTool", {"action": "list"}

        # Default to bash for unknown tasks
        return "BashTool", {"command": "echo hello"}
    
    def validate_params(self, expected: Dict, predicted: Dict) -> bool:
        """
        Check if predicted parameters match expected.

        Only parameter *names* are compared; values are ignored.

        Args:
            expected: Parameters the test case expects.
            predicted: Parameters produced by the predictor.

        Returns:
            True when nothing is expected, or when at least one expected
            parameter name is also present in ``predicted``; False otherwise.
        """
        # A case that expects no parameters is trivially satisfied. Without
        # this guard the empty intersection made such cases always fail.
        if not expected:
            return True

        # Loose check: any overlap in parameter names counts as a match.
        return not set(expected).isdisjoint(predicted)
    
    def execute_tool(self, tool: str, params: Dict) -> tuple[bool, Optional[str]]:
        """
        Execute a tool with given parameters.

        Only "BashTool" is actually executed (via the shell, 5 second
        timeout); every other tool is simulated as successful.

        Args:
            tool: Name of the tool to execute.
            params: Tool parameters; BashTool reads the "command" key and
                falls back to "echo test" when it is missing.

        Returns:
            ``(success, error_message)``. ``error_message`` is None on
            success; on failure it holds the command's stderr, the exit
            status when stderr is empty, or the text of a raised exception
            (e.g. a timeout).
        """
        try:
            if tool == "BashTool":
                cmd = params.get("command", "echo test")
                # SECURITY: shell=True runs the string through the shell. That is
                # needed for pipes and '&&', but it is unsafe if the command ever
                # comes from an untrusted source such as real model output.
                result = subprocess.run(
                    cmd, shell=True, capture_output=True, timeout=5
                )
                if result.returncode == 0:
                    return True, None

                # Report why the command failed instead of a bare False.
                stderr_text = result.stderr.decode("utf-8", errors="replace").strip()
                return False, stderr_text or f"Command exited with status {result.returncode}"

            # For other tools, just simulate success
            return True, None

        except Exception as e:
            # Best-effort: any failure to run the command counts as a failed case.
            return False, str(e)
    
    def evaluate_single(self, test_case: ToolTestCase) -> ToolEvalResult:
        """
        Score one test case: predict a tool, compare it with the expectation,
        and execute the predicted call when the tool choice was right.

        Any exception raised along the way is recorded as a failed result
        instead of propagating.
        """
        began = time.time()

        try:
            chosen_tool, chosen_params = self.predict_tool(test_case.task)
            picked_right = chosen_tool == test_case.expected_tool
            params_match = self.validate_params(
                test_case.expected_params, chosen_params
            )

            # Execution is only attempted for a correct tool choice
            if picked_right:
                ran_ok, failure = self.execute_tool(chosen_tool, chosen_params)
            else:
                ran_ok, failure = False, None

            outcome = {
                "predicted_tool": chosen_tool,
                "tool_correct": picked_right,
                "params_correct": params_match,
                "execution_success": ran_ok,
                "error": failure,
            }
        except Exception as exc:
            outcome = {
                "predicted_tool": None,
                "tool_correct": False,
                "params_correct": False,
                "execution_success": False,
                "error": str(exc),
            }

        return ToolEvalResult(
            category=test_case.category,
            task=test_case.task,
            expected_tool=test_case.expected_tool,
            latency_ms=(time.time() - began) * 1000,
            **outcome,
        )
    
    def run_evaluation(self, sample_size: int = None) -> ToolEvalSummary:
        """
        Evaluate the test suite and return aggregated metrics.

        Args:
            sample_size: When truthy and smaller than the suite, evaluate a
                random sample of that many cases; otherwise evaluate all.

        Returns:
            A ToolEvalSummary with overall rates, a per-category breakdown,
            and the per-case result dicts. Progress and the headline numbers
            are also printed to stdout.
        """
        print(f"Starting Tool Use Evaluation for {self.model}")
        print(f"Total test cases: {len(self.test_cases)}")
        print("-" * 50)

        # Optionally shrink the run to a random subset
        selected = self.test_cases
        if sample_size and sample_size < len(selected):
            selected = random.sample(selected, sample_size)

        # (result attribute, per-category counter key)
        counter_map = (
            ("tool_correct", "tool_correct"),
            ("params_correct", "params_correct"),
            ("execution_success", "exec_success"),
        )

        results = []
        per_category = {}
        for position, case in enumerate(selected, start=1):
            if position % 50 == 0:
                print(f"Progress: {position}/{len(selected)}")

            outcome = self.evaluate_single(case)
            results.append(outcome.__dict__)

            bucket = per_category.setdefault(case.category, {
                "total": 0, "tool_correct": 0, "params_correct": 0, "exec_success": 0
            })
            bucket["total"] += 1
            for attribute, counter_key in counter_map:
                if getattr(outcome, attribute):
                    bucket[counter_key] += 1

        # Overall metrics
        total = len(results)

        def tally(flag: str) -> int:
            return sum(1 for row in results if row[flag])

        def rate(count: int):
            return count / total if total > 0 else 0

        tool_hits = tally("tool_correct")
        param_hits = tally("params_correct")
        exec_hits = tally("execution_success")

        tool_accuracy = rate(tool_hits)
        param_accuracy = rate(param_hits)
        exec_rate = rate(exec_hits)
        # Headline score: mean of tool-selection and parameter accuracy
        overall = (tool_hits + param_hits) / (2 * total) if total > 0 else 0

        # Per-category metrics
        category_results = {
            name: {
                "tool_selection_accuracy": counts["tool_correct"] / counts["total"],
                "parameter_accuracy": counts["params_correct"] / counts["total"],
                "execution_success_rate": counts["exec_success"] / counts["total"],
                "total_cases": counts["total"],
            }
            for name, counts in per_category.items()
        }

        print(f"\nTotal Cases: {total}")
        print(f"Tool Selection Accuracy: {tool_accuracy:.2%}")
        print(f"Parameter Accuracy: {param_accuracy:.2%}")
        print(f"Execution Success Rate: {exec_rate:.2%}")
        print(f"Overall Success Rate: {overall:.2%}")

        return ToolEvalSummary(
            model=self.model,
            timestamp=datetime.now().isoformat(),
            total_cases=total,
            tool_selection_accuracy=tool_accuracy,
            parameter_accuracy=param_accuracy,
            execution_success_rate=exec_rate,
            overall_success_rate=overall,
            category_results=category_results,
            results=results,
        )
    
    def save_results(self, summary: ToolEvalSummary, output_dir: str):
        """
        Save evaluation results as JSON and as a Markdown report.

        Writes ``tool_use_results.json`` (the summary's attribute dict) and
        ``tool_use_report.md`` into ``output_dir``, creating the directory
        if needed. Both files are written as UTF-8 so the output does not
        depend on the platform's default encoding.

        Args:
            summary: Evaluation summary to persist.
            output_dir: Target directory path.

        Returns:
            Path of the written JSON file.
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # JSON
        json_path = output_dir / "tool_use_results.json"
        with open(json_path, 'w', encoding="utf-8") as f:
            json.dump(summary.__dict__, f, indent=2)

        # Summary report, assembled in memory and written in one call
        report_lines = [
            "# Tool Use Evaluation Report\n\n",
            f"**Model:** {summary.model}\n",
            f"**Date:** {summary.timestamp}\n\n",
            "## Summary\n\n",
            "| Metric | Value |\n|--------|-------|\n",
            f"| Total Cases | {summary.total_cases} |\n",
            f"| Tool Selection Accuracy | {summary.tool_selection_accuracy:.2%} |\n",
            f"| Parameter Accuracy | {summary.parameter_accuracy:.2%} |\n",
            f"| Execution Success Rate | {summary.execution_success_rate:.2%} |\n",
            f"| **Overall Success Rate** | **{summary.overall_success_rate:.2%}** |\n\n",
            "## Category Breakdown\n\n",
            "| Category | Tool Acc | Param Acc | Exec Rate | Cases |\n",
            "|----------|----------|-----------|-----------|-------|\n",
        ]
        for cat, stats in summary.category_results.items():
            report_lines.append(
                f"| {cat} | {stats['tool_selection_accuracy']:.2%} | "
                f"{stats['parameter_accuracy']:.2%} | "
                f"{stats['execution_success_rate']:.2%} | "
                f"{stats['total_cases']} |\n"
            )

        report_path = output_dir / "tool_use_report.md"
        with open(report_path, 'w', encoding="utf-8") as f:
            f.write("".join(report_lines))

        print(f"\nResults saved to {output_dir}/")
        return json_path


def main():
    """Command-line entry point: parse options, run the evaluation, save the reports."""
    cli = argparse.ArgumentParser(description="Tool Use Evaluation")
    option_specs = (
        ("--model", {"default": "stack-2.9", "help": "Model name"}),
        ("--output", {"default": "./results", "help": "Output directory"}),
        ("--sample", {"type": int, "default": None, "help": "Sample size (default: all)"}),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    options = cli.parse_args()

    runner = ToolUseEvaluator(model=options.model)
    summary = runner.run_evaluation(sample_size=options.sample)
    runner.save_results(summary, options.output)

    # Closing banner
    rule = "=" * 50
    print("\n" + rule)
    print("TOOL USE EVALUATION COMPLETE")
    print(rule)


if __name__ == "__main__":
    main()