from __future__ import annotations

from .models import CommitGuardAction


def compute_reward(
    *,
    action: CommitGuardAction,
    is_vulnerable: bool | None,
    cwe: str | None,
    target_file: str | None,
    cwe_keywords: dict[str, list[str]] | None,
    context_requests: int,
) -> float:
    """
    Tiered RLVR reward (PRD 5.3, architecture contract).

    Notes:
    - Ground truth must remain server-only; caller passes it in.
    - Reward is a scalar only; no label debug info.
    """
    # Flat penalty per context request, charged regardless of the final verdict.
    score = -0.05 * float(max(0, context_requests))

    # Unparseable actions take an extra penalty on top of the context cost.
    if action.parse_error:
        return score - 0.5

    if action.action_type == "analyze":
        # Tiny CoT bonus: encourage 'analyze' steps that carry substantial
        # reasoning, capped so it stays a weak shaping signal.
        n_chars = len(action.reasoning or "")
        if n_chars > 50:
            score += min(0.05, 0.001 * (n_chars // 10))
        return score

    # Only verdict actions with known ground truth are scored further.
    if action.action_type != "verdict" or is_vulnerable is None:
        return score

    # A verdict that states no prediction is penalized like a parse failure.
    if action.is_vulnerable is None:
        return score - 0.5
    predicted = bool(action.is_vulnerable)

    if predicted and is_vulnerable is True:
        # True positive: base reward plus two optional bonuses.
        score += 1.0
        # Discrete 0.5 for naming the exact CWE (case/whitespace-insensitive).
        if cwe and action.vuln_type and action.vuln_type.strip().upper() == cwe.strip().upper():
            score += 0.5
        # Continuous bonus up to 0.5, proportional to the fraction of the
        # CWE's keywords that appear in the exploit sketch.
        keywords = (cwe_keywords or {}).get(cwe or "", []) if cwe else []
        if keywords:
            sketch_text = (action.exploit_sketch or "").lower()
            hits = sum(1 for kw in keywords if kw.lower() in sketch_text)
            score += 0.5 * (hits / len(keywords))
        return score

    if predicted and is_vulnerable is False:
        # False positive.
        return score - 1.0
    if not predicted and is_vulnerable is True:
        # False negative (missed vulnerability) — penalized less than a FP.
        return score - 0.5
    if not predicted and is_vulnerable is False:
        # True negative.
        return score + 1.0

    # Unreachable when is_vulnerable is a strict bool; defensive fallthrough.
    return score