{ "summary": { "total_samples": 2, "overall_accuracy": 1.0, "cwe_breakdown": { "CWE-89": { "accuracy": 1.0, "count": 2 } } }, "results": [ { "sample_id": "synthetic-00100", "gt_vulnerable": true, "gt_cwe": "CWE-89", "final_verdict": true, "is_correct": true, "total_reward": 0.8999999999999999, "history": [ { "step": 0, "model_output": "request_contextauth.c", "parsed_action": "request_contextauth.c", "reward": -0.05, "observation": { "episode_id": "c98a9f07-18fe-40a3-8c65-161a46fdabbd", "step_idx": 1, "diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n", "available_files": [ "db.py" ], "context_snippets": [], "budget_remaining": 4, "error": "context_unavailable" } }, { "step": 1, "model_output": "verdicttrueCWE-89SQL injection in user_id", "parsed_action": "verdicttrueCWE-89SQL injection in user_id", "reward": 0.95, "observation": { "episode_id": "c98a9f07-18fe-40a3-8c65-161a46fdabbd", "step_idx": 2, "diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n", "available_files": [ "db.py" ], "context_snippets": [], "budget_remaining": 3, "error": null } } ] }, { "sample_id": "synthetic-00101", "gt_vulnerable": true, "gt_cwe": "CWE-89", "final_verdict": true, "is_correct": true, "total_reward": 0.8999999999999999, "history": [ { "step": 0, "model_output": "request_contextauth.c", "parsed_action": "request_contextauth.c", "reward": -0.05, "observation": { "episode_id": "299ca2fd-e3e6-4bac-b8a2-d7404a52e07d", "step_idx": 1, "diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n", "available_files": [ "db.py" ], "context_snippets": [], "budget_remaining": 4, "error": "context_unavailable" } }, { "step": 1, "model_output": "verdicttrueCWE-89SQL injection in user_id", "parsed_action": "verdicttrueCWE-89SQL injection in user_id", "reward": 0.95, "observation": { "episode_id": "299ca2fd-e3e6-4bac-b8a2-d7404a52e07d", "step_idx": 2, "diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n", "available_files": [ "db.py" ], "context_snippets": [], "budget_remaining": 3, "error": null } } ] } ] }