"""
Robustness Testing for Dynamic Function-Calling Agent

Tests model stability with:
1. Shuffled JSON key order
2. Distractor text before schema
3. Noisy prompts

Quick test that doesn't require retraining.
"""

import json
import random

from test_constrained_model import load_trained_model, constrained_json_generate, create_json_schema

def shuffle_json_keys(obj, rng=None):
    """Recursively shuffle the order of keys in JSON objects.

    Dicts and lists are rebuilt (the input is never mutated); any other
    value is returned as-is. Only the iteration order of dict keys changes,
    so the result compares equal to the input.

    Args:
        obj: A JSON-like value (dict, list, or scalar).
        rng: Optional object exposing ``shuffle`` (e.g. a seeded
            ``random.Random``) for reproducible orderings. Defaults to the
            module-level ``random`` generator.

    Returns:
        The rebuilt structure with every dict's keys in shuffled order.
    """
    if rng is None:
        rng = random

    if isinstance(obj, dict):
        items = list(obj.items())
        rng.shuffle(items)
        # Dicts preserve insertion order, so inserting in shuffled order is
        # what actually changes the serialized key order.
        return {key: shuffle_json_keys(value, rng) for key, value in items}

    if isinstance(obj, list):
        # List element order is left alone; only nested dicts are shuffled.
        return [shuffle_json_keys(item, rng) for item in obj]

    return obj

def add_distractor_text(schema_str, rng=None):
    """Add distracting text before the schema.

    Args:
        schema_str: The schema text to be prefixed.
        rng: Optional object exposing ``choice`` (e.g. a seeded
            ``random.Random``) for a reproducible pick. Defaults to the
            module-level ``random`` generator.

    Returns:
        One randomly chosen distractor sentence, a blank line, then
        ``schema_str`` unchanged.
    """
    distractors = [
        "Note: This is a complex API with many parameters.",
        "Important: Please review all requirements carefully.",
        "Warning: Some fields may be optional depending on context.",
        "Info: This function supports multiple data formats.",
        "Reminder: Check authentication before making calls.",
    ]
    chooser = random if rng is None else rng
    distractor = chooser.choice(distractors)
    return f"{distractor}\n\n{schema_str}"

def _build_prompt(schema_text, query):
    """Return the chat prompt embedding *schema_text* and the user *query*."""
    return f"""<|im_start|>system
You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<|im_end|>

<schema>
{schema_text}
</schema>

<|im_start|>user
{query}<|im_end|>
<|im_start|>assistant
"""


def _success_rate(outcomes):
    """Return the percentage of truthy entries in *outcomes* (0.0 if empty)."""
    if not outcomes:
        return 0.0
    return (sum(outcomes) / len(outcomes)) * 100


def test_robustness():
    """Run robustness tests on the function calling agent.

    For every test query the model is asked to produce a function call under
    four prompt variants: the plain schema, the schema with shuffled keys,
    the schema preceded by a distractor sentence, and both perturbations
    combined. Per-scenario success rates and their difference from the
    baseline are printed.

    Returns:
        dict mapping each scenario name to the list of per-query success
        flags reported by ``constrained_json_generate``.
    """
    # NOTE(review): the emoji in the status strings were garbled in the
    # source; several had lost bytes and are best-effort restorations.
    print("🧪 Starting Robustness Tests...")

    model, tokenizer = load_trained_model()

    base_schema = {
        "name": "get_weather_forecast",
        "description": "Get weather forecast for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City name"},
                "days": {"type": "integer", "description": "Number of days", "minimum": 1},
                "units": {"type": "string", "enum": ["metric", "imperial"]},
                "include_hourly": {"type": "boolean", "default": False},
            },
            "required": ["location", "days"],
        },
    }

    test_queries = [
        "Get 3-day weather for Paris",
        "Weather forecast for Tokyo, 5 days, metric units",
        "I need the weather for London for the next week",
    ]

    results = {
        "baseline": [],
        "shuffled_keys": [],
        "with_distractors": [],
        "both_shuffled_and_distractors": [],
    }

    def run_scenario(key, label, prompt, schema):
        """Generate once, record the outcome under *key*, and report it."""
        _response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
        results[key].append(success)
        print(f" {label}: {'✓' if success else '✗'}")
        if not success and error:
            # Surface the failure reason instead of silently dropping it.
            print(f"     error: {error}")

    print("\n📊 Running test scenarios...")

    for query in test_queries:
        print(f"\n📝 Query: '{query}'")

        # Scenario 1: unmodified schema.
        schema = create_json_schema(base_schema)
        prompt = _build_prompt(json.dumps(base_schema, indent=2), query)
        run_scenario("baseline", "✅ Baseline", prompt, schema)

        # Scenario 2: same schema, key order shuffled.
        shuffled_schema = shuffle_json_keys(base_schema)
        schema = create_json_schema(shuffled_schema)
        prompt = _build_prompt(json.dumps(shuffled_schema, indent=2), query)
        run_scenario("shuffled_keys", "🔀 Shuffled", prompt, schema)

        # Scenario 3: distractor sentence placed ahead of the schema text.
        schema = create_json_schema(base_schema)
        schema_with_distractor = add_distractor_text(json.dumps(base_schema, indent=2))
        prompt = _build_prompt(schema_with_distractor, query)
        run_scenario("with_distractors", "📢 Distractor", prompt, schema)

        # Scenario 4: fresh shuffle combined with a distractor.
        shuffled_schema = shuffle_json_keys(base_schema)
        schema = create_json_schema(shuffled_schema)
        schema_with_distractor = add_distractor_text(json.dumps(shuffled_schema, indent=2))
        prompt = _build_prompt(schema_with_distractor, query)
        run_scenario("both_shuffled_and_distractors", "🔀📢 Both", prompt, schema)

    print("\n📊 Robustness Test Results:")
    print("=" * 50)

    for test_name, test_results in results.items():
        success_rate = _success_rate(test_results)
        print(f"{test_name.replace('_', ' ').title()}: {success_rate:.1f}% ({sum(test_results)}/{len(test_results)})")

    print("\n🎯 Analysis:")
    baseline_rate = _success_rate(results["baseline"])

    for test_name, test_results in results.items():
        if test_name == "baseline":
            continue
        diff = _success_rate(test_results) - baseline_rate
        # Green: within 10 points of baseline; yellow: within 20; red: worse.
        status = "🟢" if diff >= -10 else "🟡" if diff >= -20 else "🔴"
        print(f"{status} {test_name.replace('_', ' ').title()}: {diff:+.1f}% vs baseline")

    return results

# Run the robustness suite only when executed as a script, not on import.
if __name__ == "__main__":
    test_robustness()