commitguard / eval_results_mock.json
Nitishkumar-ai's picture
Upload folder using huggingface_hub
e4f3d12 verified
{
"summary": {
"total_samples": 2,
"overall_accuracy": 1.0,
"cwe_breakdown": {
"CWE-89": {
"accuracy": 1.0,
"count": 2
}
}
},
"results": [
{
"sample_id": "synthetic-00100",
"gt_vulnerable": true,
"gt_cwe": "CWE-89",
"final_verdict": true,
"is_correct": true,
"total_reward": 0.8999999999999999,
"history": [
{
"step": 0,
"model_output": "<action><action_type>request_context</action_type><file_path>auth.c</file_path></action>",
"parsed_action": "<action><action_type>request_context</action_type><file_path>auth.c</file_path></action>",
"reward": -0.05,
"observation": {
"episode_id": "c98a9f07-18fe-40a3-8c65-161a46fdabbd",
"step_idx": 1,
"diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n",
"available_files": [
"db.py"
],
"context_snippets": [],
"budget_remaining": 4,
"error": "context_unavailable"
}
},
{
"step": 1,
"model_output": "<action><action_type>verdict</action_type><is_vulnerable>true</is_vulnerable><vuln_type>CWE-89</vuln_type><exploit_sketch>SQL injection in user_id</exploit_sketch></action>",
"parsed_action": "<action><action_type>verdict</action_type><is_vulnerable>true</is_vulnerable><vuln_type>CWE-89</vuln_type><exploit_sketch>SQL injection in user_id</exploit_sketch></action>",
"reward": 0.95,
"observation": {
"episode_id": "c98a9f07-18fe-40a3-8c65-161a46fdabbd",
"step_idx": 2,
"diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n",
"available_files": [
"db.py"
],
"context_snippets": [],
"budget_remaining": 3,
"error": null
}
}
]
},
{
"sample_id": "synthetic-00101",
"gt_vulnerable": true,
"gt_cwe": "CWE-89",
"final_verdict": true,
"is_correct": true,
"total_reward": 0.8999999999999999,
"history": [
{
"step": 0,
"model_output": "<action><action_type>request_context</action_type><file_path>auth.c</file_path></action>",
"parsed_action": "<action><action_type>request_context</action_type><file_path>auth.c</file_path></action>",
"reward": -0.05,
"observation": {
"episode_id": "299ca2fd-e3e6-4bac-b8a2-d7404a52e07d",
"step_idx": 1,
"diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n",
"available_files": [
"db.py"
],
"context_snippets": [],
"budget_remaining": 4,
"error": "context_unavailable"
}
},
{
"step": 1,
"model_output": "<action><action_type>verdict</action_type><is_vulnerable>true</is_vulnerable><vuln_type>CWE-89</vuln_type><exploit_sketch>SQL injection in user_id</exploit_sketch></action>",
"parsed_action": "<action><action_type>verdict</action_type><is_vulnerable>true</is_vulnerable><vuln_type>CWE-89</vuln_type><exploit_sketch>SQL injection in user_id</exploit_sketch></action>",
"reward": 0.95,
"observation": {
"episode_id": "299ca2fd-e3e6-4bac-b8a2-d7404a52e07d",
"step_idx": 2,
"diff": "--- a/db.py\n+++ b/db.py\n@@\n- cursor.execute(\"SELECT * FROM users WHERE id = %s\", (user_id,))\n+ cursor.execute(f\"SELECT * FROM users WHERE id = {user_id}\")\n",
"available_files": [
"db.py"
],
"context_snippets": [],
"budget_remaining": 3,
"error": null
}
}
]
}
]
}