# codearena-rl/server/grader.py
import os
import json

from openai import OpenAI

from .models import ExecutionResult, TaskInfo


def force_valid_reward(value) -> float:
    """Clamp a reward into [0.01, 0.99]; unparseable input falls back to a neutral 0.5."""
    try:
        r = float(value)
    except Exception:
        # Non-numeric input (None, arbitrary strings) gets a neutral mid-range reward.
        return 0.5
    # Clamp into [0.01, 0.99] so that even a ".2f" display can never show 0.00 or 1.00.
if r <= 0.01:
return 0.01
if r >= 0.99:
return 0.99
return r
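

# Usage sketch (illustrative values):
#   force_valid_reward(1.7)   -> 0.99
#   force_valid_reward(-0.2)  -> 0.01
#   force_valid_reward("n/a") -> 0.5   (unparseable input)
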
def safe_reward(reward) -> float:
    """Treat None as a neutral 0.5, then clamp into [0.01, 0.99] via force_valid_reward."""
if reward is None:
reward = 0.5
return force_valid_reward(reward)


def normalize_reward(passed: int, total: int) -> float:
    """Return the fraction of tests passed, clamped; neutral 0.5 when no tests ran."""
    if total == 0:
        return 0.5
raw = passed / total
return force_valid_reward(raw)
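

# Usage sketch (illustrative values):
#   normalize_reward(7, 10)  -> 0.7
#   normalize_reward(10, 10) -> 0.99  (clamped away from a perfect 1.0)
#   normalize_reward(0, 0)   -> 0.5   (no tests ran)
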
# Cache of judge scores keyed by the proposed fix, plus a one-shot warning flag.
_LLM_CACHE: dict[str, dict] = {}
_JUDGE_DISABLED_WARNED = False


def get_llm_quality_score(proposed_fix: str) -> dict:
    """Score a proposed fix on code_quality, security, and correctness via an LLM judge, with caching."""
    global _JUDGE_DISABLED_WARNED
if proposed_fix in _LLM_CACHE:
return _LLM_CACHE[proposed_fix]
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
if not _JUDGE_DISABLED_WARNED:
print("LLM judge disabled: OPENAI_API_KEY not set. Using neutral fallback scores.")
_JUDGE_DISABLED_WARNED = True
fallback = {"code_quality": 0.5, "security": 0.5, "correctness": 0.5}
_LLM_CACHE[proposed_fix] = fallback
return fallback
try:
client = OpenAI(api_key=api_key)
response = client.chat.completions.create(
model=os.environ.get("JUDGE_MODEL", "gpt-4o-mini"),
messages=[
{"role": "system", "content": "You are a code judge. Evaluate the provided Python code on a scale of 0.0 to 1.0 for three metrics: code_quality, security, and correctness. Respond with JSON format strictly matching: {\"code_quality\": 0.0, \"security\": 0.0, \"correctness\": 0.0}"},
{"role": "user", "content": proposed_fix}
],
response_format={"type": "json_object"}
)
result = json.loads(response.choices[0].message.content)
_LLM_CACHE[proposed_fix] = result
return result
    except Exception as e:
        print(f"LLM judge error: {e}")
        # The neutral fallback is cached too, so a failing judge is not re-queried for the same fix.
        fallback = {"code_quality": 0.5, "security": 0.5, "correctness": 0.5}
        _LLM_CACHE[proposed_fix] = fallback
        return fallback
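

# A successful call returns the parsed judge JSON, e.g. (illustrative, not a
# recorded response): {"code_quality": 0.8, "security": 0.6, "correctness": 0.9}.
# Repeated grading of the same fix is served from _LLM_CACHE at zero API cost.
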
def calculate_reward_components(exec_result: ExecutionResult, task_info: TaskInfo, proposed_fix: str) -> dict:
    """Break the reward into its signals: compilation, test pass rate, runtime efficiency, and judge scores."""
    compile_score = 1.0 if not exec_result.runtime_errors else 0.0
test_ratio = 0.0
if exec_result.test_total > 0:
test_ratio = exec_result.test_passed / exec_result.test_total
    # Efficiency is only awarded once every test passes; it decays linearly from
    # 1.0 at the optimal time to 0.0 at three times the optimal time.
    efficiency = 0.0
    if test_ratio == 1.0:
        if exec_result.execution_time_seconds <= task_info.optimal_time_seconds:
            efficiency = 1.0
        else:
            ratio = exec_result.execution_time_seconds / max(0.001, task_info.optimal_time_seconds)
            efficiency = max(0.0, 1.0 - (ratio - 1.0) / 2.0)
llm_scores = get_llm_quality_score(proposed_fix)
return {
"compile_score": compile_score,
"test_ratio": test_ratio,
"efficiency": efficiency,
"llm_correctness": float(llm_scores.get("correctness", 0.5)),
"llm_security": float(llm_scores.get("security", 0.5)),
"llm_quality": float(llm_scores.get("code_quality", 0.5))
}
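

# Example components for a run with no runtime errors that passes 8/10 tests
# (hypothetical values; judge scores fall back to 0.5 when no API key is set):
#   {"compile_score": 1.0, "test_ratio": 0.8, "efficiency": 0.0,
#    "llm_correctness": 0.5, "llm_security": 0.5, "llm_quality": 0.5}
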
def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo, proposed_fix: str) -> tuple[float, dict]:
    """Combine the components into one scalar; the raw sum can exceed 1.0 until safe_reward clamps it."""
    comps = calculate_reward_components(exec_result, task_info, proposed_fix)
base_reward = (
0.15 * comps["compile_score"] +
0.35 * comps["test_ratio"] +
0.30 * comps["efficiency"] + # Increased from 0.15 to push optimization
0.10 * comps["llm_correctness"] +
0.05 * comps["llm_security"] +
0.05 * comps["llm_quality"]
)
# Compile bonus: encourage first milestone
if comps["compile_score"] > 0.0:
base_reward += 0.05
# Harsh complexity penalty: if runtime is > 5x optimal, penalize heavily
if exec_result.test_passed == exec_result.test_total and exec_result.test_total > 0:
if exec_result.execution_time_seconds > task_info.optimal_time_seconds * 5:
base_reward -= 0.30
return base_reward, comps
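

# Worked example with the hypothetical components above: the weighted sum is
#   0.15*1.0 + 0.35*0.8 + 0.30*0.0 + 0.10*0.5 + 0.05*0.5 + 0.05*0.5 = 0.53
# and the 0.05 compile bonus lifts it to 0.58 before safe_reward clamping.
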
def grade(*args, **kwargs) -> float:
    """Harness entry point: always returns a scalar reward clamped into [0.01, 0.99]."""
    try:
        if len(args) == 3:
            # Clamp here so the compile bonus can never push the final reward above 1.0.
            return safe_reward(calculate_reward(args[0], args[1], args[2])[0])
        return 0.5
    except Exception:
        return 0.5
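

# A minimal smoke test, assuming ExecutionResult and TaskInfo expose only the
# attributes read above; SimpleNamespace stands in for both so nothing here
# depends on their real constructors. The relative import of .models means the
# module must be run as `python -m server.grader`, not as a script.
if __name__ == "__main__":
    from types import SimpleNamespace

    exec_result = SimpleNamespace(
        runtime_errors=[],           # hypothetical: clean run
        test_passed=8,
        test_total=10,
        execution_time_seconds=2.0,  # hypothetical timing
    )
    task_info = SimpleNamespace(optimal_time_seconds=1.0)  # hypothetical target
    print(grade(exec_result, task_info, "def add(a, b):\n    return a + b"))  # -> 0.58 without an API key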