Spaces:

jlov7
/

Dynamic-Function-Calling-Agent

Sleeping

App Files Files Community

Dynamic-Function-Calling-Agent / robustness_test.py

jlov7

feat: Multi-tool selection and robustness testing

6639f75 8 months ago

raw

history blame contribute delete

6.27 kB

	"""
	Robustness Testing for Dynamic Function-Calling Agent

	Tests model stability with:
	1. Shuffled JSON key order
	2. Distractor text before schema
	3. Noisy prompts

	Quick test that doesn't require retraining.
	"""

	import json
	import random
	from test_constrained_model import load_trained_model, constrained_json_generate, create_json_schema

	def shuffle_json_keys(obj):
	"""Recursively shuffle the order of keys in JSON objects"""
	if isinstance(obj, dict):
	items = list(obj.items())
	random.shuffle(items)
	return {k: shuffle_json_keys(v) for k, v in items}
	elif isinstance(obj, list):
	return [shuffle_json_keys(item) for item in obj]
	return obj

	def add_distractor_text(schema_str):
	"""Add distracting text before the schema"""
	distractors = [
	"Note: This is a complex API with many parameters.",
	"Important: Please review all requirements carefully.",
	"Warning: Some fields may be optional depending on context.",
	"Info: This function supports multiple data formats.",
	"Reminder: Check authentication before making calls."
	]
	distractor = random.choice(distractors)
	return f"{distractor}\n\n{schema_str}"

	def test_robustness():
	"""Run robustness tests on the function calling agent"""
	print("🧪 Starting Robustness Tests...")

	# Load model
	model, tokenizer = load_trained_model()

	# Test schema
	base_schema = {
	"name": "get_weather_forecast",
	"description": "Get weather forecast for a location",
	"parameters": {
	"type": "object",
	"properties": {
	"location": {"type": "string", "description": "City name"},
	"days": {"type": "integer", "description": "Number of days", "minimum": 1},
	"units": {"type": "string", "enum": ["metric", "imperial"]},
	"include_hourly": {"type": "boolean", "default": False}
	},
	"required": ["location", "days"]
	}
	}

	test_queries = [
	"Get 3-day weather for Paris",
	"Weather forecast for Tokyo, 5 days, metric units",
	"I need the weather for London for the next week"
	]

	results = {
	"baseline": [],
	"shuffled_keys": [],
	"with_distractors": [],
	"both_shuffled_and_distractors": []
	}

	print("\n🔍 Running test scenarios...")

	for query in test_queries:
	print(f"\n📝 Query: '{query}'")

	# 1. Baseline test
	schema = create_json_schema(base_schema)
	prompt = f"""<\|im_start\|>system
	You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<\|im_end\|>

	<schema>
	{json.dumps(base_schema, indent=2)}
	</schema>

	<\|im_start\|>user
	{query}<\|im_end\|>
	<\|im_start\|>assistant
	"""

	response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
	results["baseline"].append(success)
	print(f" ✅ Baseline: {'✓' if success else '✗'}")

	# 2. Shuffled keys test
	shuffled_schema = shuffle_json_keys(base_schema)
	schema = create_json_schema(shuffled_schema)
	prompt = f"""<\|im_start\|>system
	You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<\|im_end\|>

	<schema>
	{json.dumps(shuffled_schema, indent=2)}
	</schema>

	<\|im_start\|>user
	{query}<\|im_end\|>
	<\|im_start\|>assistant
	"""

	response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
	results["shuffled_keys"].append(success)
	print(f" 🔀 Shuffled: {'✓' if success else '✗'}")

	# 3. Distractor text test
	schema = create_json_schema(base_schema)
	schema_with_distractor = add_distractor_text(json.dumps(base_schema, indent=2))
	prompt = f"""<\|im_start\|>system
	You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<\|im_end\|>

	<schema>
	{schema_with_distractor}
	</schema>

	<\|im_start\|>user
	{query}<\|im_end\|>
	<\|im_start\|>assistant
	"""

	response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
	results["with_distractors"].append(success)
	print(f" 🎭 Distractor: {'✓' if success else '✗'}")

	# 4. Both shuffled and distractors
	shuffled_schema = shuffle_json_keys(base_schema)
	schema = create_json_schema(shuffled_schema)
	schema_with_distractor = add_distractor_text(json.dumps(shuffled_schema, indent=2))
	prompt = f"""<\|im_start\|>system
	You are a helpful assistant that calls functions by responding with valid JSON when given a schema. Always respond with JSON function calls only, never prose.<\|im_end\|>

	<schema>
	{schema_with_distractor}
	</schema>

	<\|im_start\|>user
	{query}<\|im_end\|>
	<\|im_start\|>assistant
	"""

	response, success, error = constrained_json_generate(model, tokenizer, prompt, schema)
	results["both_shuffled_and_distractors"].append(success)
	print(f" 🔀🎭 Both: {'✓' if success else '✗'}")

	# Calculate success rates
	print("\n📊 Robustness Test Results:")
	print("=" * 50)

	for test_name, test_results in results.items():
	success_rate = (sum(test_results) / len(test_results)) * 100
	print(f"{test_name.replace('_', ' ').title()}: {success_rate:.1f}% ({sum(test_results)}/{len(test_results)})")

	print("\n🎯 Analysis:")
	baseline_rate = (sum(results["baseline"]) / len(results["baseline"])) * 100

	for test_name, test_results in results.items():
	if test_name != "baseline":
	test_rate = (sum(test_results) / len(test_results)) * 100
	diff = test_rate - baseline_rate
	status = "🟢" if diff >= -10 else "🟡" if diff >= -20 else "🔴"
	print(f"{status} {test_name.replace('_', ' ').title()}: {diff:+.1f}% vs baseline")

	return results

	if __name__ == "__main__":
	test_robustness()