Spaces:
Configuration error
Configuration error
File size: 2,351 Bytes
e4f3d12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | from __future__ import annotations
from .models import CommitGuardAction
def compute_reward(
    *,
    action: CommitGuardAction,
    is_vulnerable: bool | None,
    cwe: str | None,
    target_file: str | None,
    cwe_keywords: dict[str, list[str]] | None,
    context_requests: int,
) -> float:
    """
    Tiered RLVR reward (PRD 5.3, architecture contract).

    Every action starts from a base of ``-0.05`` per context request
    (negative counts are clamped to zero). The action then adjusts that base:

    - parse error: ``-0.5``.
    - ``"analyze"`` step: ``+0.001`` per full 10 characters of reasoning,
      capped at ``+0.05``, granted only when the reasoning is longer than
      50 characters.
    - ``"verdict"`` step, scored against the ground truth:
        - no ground truth available: base only.
        - verdict missing (``action.is_vulnerable is None``): ``-0.5``.
        - true positive: ``+1.0``; plus ``+0.5`` when ``action.vuln_type``
          equals ``cwe`` (case-insensitive, surrounding whitespace ignored);
          plus up to ``+0.5`` proportional to the fraction of the CWE's
          keywords found (case-insensitive substring) in
          ``action.exploit_sketch``.
        - false positive: ``-1.0``.
        - false negative: ``-0.5``.
        - true negative: ``+1.0``.
    - any other action type: base only.

    Args:
        action: The agent action to score. Read attributes: ``parse_error``,
            ``action_type``, ``reasoning``, ``is_vulnerable``, ``vuln_type``
            and ``exploit_sketch``.
        is_vulnerable: Ground-truth label, or ``None`` when unknown.
        cwe: Ground-truth CWE identifier, or ``None``.
        target_file: Accepted for interface compatibility; not used by the
            reward computation.
        cwe_keywords: Mapping from CWE identifier to expected exploit-sketch
            keywords. Looked up with ``cwe`` exactly as given. Blank or
            non-string keywords are ignored.
        context_requests: Number of context requests made so far.

    Returns:
        The scalar reward.

    Notes:
    - Ground truth must remain server-only; caller passes it in.
    - Reward is a scalar only; no label debug info.
    """
    # Per-context-request penalty applies regardless of verdict.
    reward = -0.05 * float(max(0, context_requests))

    if action.parse_error:
        return reward - 0.5

    # Small CoT bonus: reward 'analyze' steps that provide substantial reasoning.
    # This provides a tiny positive float signal to encourage thinking.
    if action.action_type == "analyze":
        reasoning_len = len(action.reasoning or "")
        if reasoning_len > 50:
            reward += min(0.05, 0.001 * (reasoning_len // 10))
        return reward

    if action.action_type != "verdict":
        return reward

    # Without ground truth there is nothing to score the verdict against.
    if is_vulnerable is None:
        return reward

    if action.is_vulnerable is None:
        return reward - 0.5
    pred = bool(action.is_vulnerable)

    if pred and is_vulnerable is True:
        reward += 1.0

        # Correct CWE (Discrete 0.5)
        if cwe and action.vuln_type and action.vuln_type.strip().upper() == cwe.strip().upper():
            reward += 0.5

        # Proportional Keyword Match (Continuous Float up to 0.5)
        raw_keywords = (cwe_keywords or {}).get(cwe) if cwe else None
        # Drop blank / non-string entries: "" is a substring of every string,
        # so an empty keyword would hand out bonus reward for any sketch
        # (including a missing one) and distort the proportion.
        keywords = [
            kw for kw in (raw_keywords or [])
            if isinstance(kw, str) and kw.strip()
        ]
        if keywords:
            sketch = (action.exploit_sketch or "").lower()
            matches = sum(1 for kw in keywords if kw.lower() in sketch)
            # Continuous signal: reward is proportional to percentage of keywords found.
            reward += 0.5 * (matches / len(keywords))
        return reward

    if pred and is_vulnerable is False:
        return reward - 1.0
    if not pred and is_vulnerable is True:
        return reward - 0.5
    if not pred and is_vulnerable is False:
        return reward + 1.0
    return reward
|