Spaces:
Configuration error
Configuration error
| { | |
| "summary": { | |
| "total_samples": 2, | |
| "overall_accuracy": 1.0, | |
| "cwe_breakdown": { | |
| "CWE-89": { | |
| "accuracy": 1.0, | |
| "count": 2 | |
| } | |
| } | |
| }, | |
| "results": [ | |
| { | |
| "sample_id": "synthetic-00100", | |
| "gt_vulnerable": true, | |
| "gt_cwe": "CWE-89", | |
| "final_verdict": true, | |
| "is_correct": true, | |
| "total_reward": 0.8999999999999999, | |
| "history": [ | |
| { | |
| "step": 0, | |
| "model_output": "<action><action_type>request_context</action_type><file_path>auth.c</file_path></action>", | |
| "parsed_action": "<action><action_type>request_context</action_type><file_path>auth.c</file_path></action>", | |
| "reward": -0.05, | |
| "observation": { | |
| "episode_id": "c98a9f07-18fe-40a3-8c65-161a46fdabbd", | |
| "step_idx": 1, | |
| "diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n", | |
| "available_files": [ | |
| "db.py" | |
| ], | |
| "context_snippets": [], | |
| "budget_remaining": 4, | |
| "error": "context_unavailable" | |
| } | |
| }, | |
| { | |
| "step": 1, | |
| "model_output": "<action><action_type>verdict</action_type><is_vulnerable>true</is_vulnerable><vuln_type>CWE-89</vuln_type><exploit_sketch>SQL injection in user_id</exploit_sketch></action>", | |
| "parsed_action": "<action><action_type>verdict</action_type><is_vulnerable>true</is_vulnerable><vuln_type>CWE-89</vuln_type><exploit_sketch>SQL injection in user_id</exploit_sketch></action>", | |
| "reward": 0.95, | |
| "observation": { | |
| "episode_id": "c98a9f07-18fe-40a3-8c65-161a46fdabbd", | |
| "step_idx": 2, | |
| "diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n", | |
| "available_files": [ | |
| "db.py" | |
| ], | |
| "context_snippets": [], | |
| "budget_remaining": 3, | |
| "error": null | |
| } | |
| } | |
| ] | |
| }, | |
| { | |
| "sample_id": "synthetic-00101", | |
| "gt_vulnerable": true, | |
| "gt_cwe": "CWE-89", | |
| "final_verdict": true, | |
| "is_correct": true, | |
| "total_reward": 0.8999999999999999, | |
| "history": [ | |
| { | |
| "step": 0, | |
| "model_output": "<action><action_type>request_context</action_type><file_path>auth.c</file_path></action>", | |
| "parsed_action": "<action><action_type>request_context</action_type><file_path>auth.c</file_path></action>", | |
| "reward": -0.05, | |
| "observation": { | |
| "episode_id": "299ca2fd-e3e6-4bac-b8a2-d7404a52e07d", | |
| "step_idx": 1, | |
| "diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n", | |
| "available_files": [ | |
| "db.py" | |
| ], | |
| "context_snippets": [], | |
| "budget_remaining": 4, | |
| "error": "context_unavailable" | |
| } | |
| }, | |
| { | |
| "step": 1, | |
| "model_output": "<action><action_type>verdict</action_type><is_vulnerable>true</is_vulnerable><vuln_type>CWE-89</vuln_type><exploit_sketch>SQL injection in user_id</exploit_sketch></action>", | |
| "parsed_action": "<action><action_type>verdict</action_type><is_vulnerable>true</is_vulnerable><vuln_type>CWE-89</vuln_type><exploit_sketch>SQL injection in user_id</exploit_sketch></action>", | |
| "reward": 0.95, | |
| "observation": { | |
| "episode_id": "299ca2fd-e3e6-4bac-b8a2-d7404a52e07d", | |
| "step_idx": 2, | |
| "diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n", | |
| "available_files": [ | |
| "db.py" | |
| ], | |
| "context_snippets": [], | |
| "budget_remaining": 3, | |
| "error": null | |
| } | |
| } | |
| ] | |
| } | |
| ] | |
| } | |