commitguard-env / plots /eval_baselines.json
Nitishkumar-ai's picture
Add reward tuning, improved prompt, eval harness, and serving Dockerfile
b32b61a
{
"always_vuln": {
"episodes": 500,
"confusion": {
"tp": 244,
"fp": 256,
"tn": 0,
"fn": 0
},
"precision": 0.488,
"recall": 1.0,
"f1": 0.6559,
"accuracy": 0.488,
"mean_reward": 0.0358,
"per_cwe": {
"None": {
"tp": 0,
"fp": 256,
"tn": 0,
"fn": 0
},
"CWE-20": {
"tp": 38,
"fp": 0,
"tn": 0,
"fn": 0
},
"CWE-189": {
"tp": 42,
"fp": 0,
"tn": 0,
"fn": 0
},
"CWE-OTHER": {
"tp": 46,
"fp": 0,
"tn": 0,
"fn": 0
},
"CWE-119": {
"tp": 46,
"fp": 0,
"tn": 0,
"fn": 0
},
"CWE-476": {
"tp": 55,
"fp": 0,
"tn": 0,
"fn": 0
},
"CWE-22": {
"tp": 10,
"fp": 0,
"tn": 0,
"fn": 0
},
"CWE-78": {
"tp": 5,
"fp": 0,
"tn": 0,
"fn": 0
},
"CWE-89": {
"tp": 2,
"fp": 0,
"tn": 0,
"fn": 0
}
}
},
"always_safe": {
"episodes": 500,
"confusion": {
"tp": 0,
"fp": 0,
"tn": 256,
"fn": 244
},
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"accuracy": 0.512,
"mean_reward": 0.268,
"per_cwe": {
"None": {
"tp": 0,
"fp": 0,
"tn": 256,
"fn": 0
},
"CWE-20": {
"tp": 0,
"fp": 0,
"tn": 0,
"fn": 38
},
"CWE-189": {
"tp": 0,
"fp": 0,
"tn": 0,
"fn": 42
},
"CWE-OTHER": {
"tp": 0,
"fp": 0,
"tn": 0,
"fn": 46
},
"CWE-119": {
"tp": 0,
"fp": 0,
"tn": 0,
"fn": 46
},
"CWE-476": {
"tp": 0,
"fp": 0,
"tn": 0,
"fn": 55
},
"CWE-22": {
"tp": 0,
"fp": 0,
"tn": 0,
"fn": 10
},
"CWE-78": {
"tp": 0,
"fp": 0,
"tn": 0,
"fn": 5
},
"CWE-89": {
"tp": 0,
"fp": 0,
"tn": 0,
"fn": 2
}
}
},
"random": {
"episodes": 500,
"confusion": {
"tp": 128,
"fp": 133,
"tn": 123,
"fn": 116
},
"precision": 0.4904,
"recall": 0.5246,
"f1": 0.5069,
"accuracy": 0.502,
"mean_reward": 0.15,
"per_cwe": {
"None": {
"tp": 0,
"fp": 133,
"tn": 123,
"fn": 0
},
"CWE-20": {
"tp": 19,
"fp": 0,
"tn": 0,
"fn": 19
},
"CWE-189": {
"tp": 18,
"fp": 0,
"tn": 0,
"fn": 24
},
"CWE-OTHER": {
"tp": 28,
"fp": 0,
"tn": 0,
"fn": 18
},
"CWE-119": {
"tp": 25,
"fp": 0,
"tn": 0,
"fn": 21
},
"CWE-476": {
"tp": 31,
"fp": 0,
"tn": 0,
"fn": 24
},
"CWE-22": {
"tp": 4,
"fp": 0,
"tn": 0,
"fn": 6
},
"CWE-78": {
"tp": 1,
"fp": 0,
"tn": 0,
"fn": 4
},
"CWE-89": {
"tp": 2,
"fp": 0,
"tn": 0,
"fn": 0
}
}
}
}