import os
import json

from openai import OpenAI

from .models import ExecutionResult, TaskInfo


def force_valid_reward(value) -> float:
    """Hard guarantee: reward is strictly inside (0, 1), never exactly 0 or 1."""
    try:
        r = float(value)
    except Exception:
        return 0.5
    # HARD GUARANTEE: prevent .2f formatting from rounding to 1.00 or 0.00
    if r <= 0.01:
        return 0.01
    if r >= 0.99:
        return 0.99
    return r


def safe_reward(reward) -> float:
    """Clamp reward to the open interval (0, 1) via force_valid_reward."""
    if reward is None:
        reward = 0.5
    return force_valid_reward(reward)


def normalize_reward(passed: int, total: int) -> float:
    """Map a passed/total test count to a reward, treating an empty suite as neutral."""
    if total == 0:
        return 0.5
    raw = passed / total
    return force_valid_reward(raw)


# Cache judge responses per proposed_fix so repeated evaluations do not re-hit the API.
_LLM_CACHE = {}
_JUDGE_DISABLED_WARNED = False


def get_llm_quality_score(proposed_fix: str) -> dict:
    """Score proposed_fix with an LLM judge; fall back to neutral 0.5 scores if unavailable."""
    global _JUDGE_DISABLED_WARNED
    if proposed_fix in _LLM_CACHE:
        return _LLM_CACHE[proposed_fix]

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        if not _JUDGE_DISABLED_WARNED:
            print("LLM judge disabled: OPENAI_API_KEY not set. Using neutral fallback scores.")
            _JUDGE_DISABLED_WARNED = True
        fallback = {"code_quality": 0.5, "security": 0.5, "correctness": 0.5}
        _LLM_CACHE[proposed_fix] = fallback
        return fallback

    try:
        client = OpenAI(api_key=api_key)
        response = client.chat.completions.create(
            model=os.environ.get("JUDGE_MODEL", "gpt-4o-mini"),
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a code judge. Evaluate the provided Python code on a scale "
                        "of 0.0 to 1.0 for three metrics: code_quality, security, and "
                        "correctness. Respond with JSON format strictly matching: "
                        "{\"code_quality\": 0.0, \"security\": 0.0, \"correctness\": 0.0}"
                    ),
                },
                {"role": "user", "content": proposed_fix},
            ],
            response_format={"type": "json_object"},
        )
        result = json.loads(response.choices[0].message.content)
        _LLM_CACHE[proposed_fix] = result
        return result
    except Exception as e:
        print(f"LLM judge error: {e}")
        fallback = {"code_quality": 0.5, "security": 0.5, "correctness": 0.5}
        _LLM_CACHE[proposed_fix] = fallback
        return fallback


def calculate_reward_components(exec_result: ExecutionResult, task_info: TaskInfo, proposed_fix: str) -> dict:
    """Compute the individual reward components for a proposed fix."""
    compile_score = 1.0 if not exec_result.runtime_errors else 0.0

    test_ratio = 0.0
    if exec_result.test_total > 0:
        test_ratio = exec_result.test_passed / exec_result.test_total

    efficiency = 0.0
    if test_ratio == 1.0:
        if exec_result.execution_time_seconds <= task_info.optimal_time_seconds:
            efficiency = 1.0
        else:
            ratio = exec_result.execution_time_seconds / max(0.001, task_info.optimal_time_seconds)
            efficiency = max(0.0, 1.0 - (ratio - 1.0) / 2.0)

    llm_scores = get_llm_quality_score(proposed_fix)

    return {
        "compile_score": compile_score,
        "test_ratio": test_ratio,
        "efficiency": efficiency,
        "llm_correctness": float(llm_scores.get("correctness", 0.5)),
        "llm_security": float(llm_scores.get("security", 0.5)),
        "llm_quality": float(llm_scores.get("code_quality", 0.5)),
    }


def calculate_reward(exec_result: ExecutionResult, task_info: TaskInfo, proposed_fix: str) -> tuple[float, dict]:
    """Combine the components into a weighted reward and return (reward, components)."""
    comps = calculate_reward_components(exec_result, task_info, proposed_fix)

    base_reward = (
        0.15 * comps["compile_score"] +
        0.35 * comps["test_ratio"] +
        0.30 * comps["efficiency"] +  # Increased from 0.15 to push optimization
        0.10 * comps["llm_correctness"] +
        0.05 * comps["llm_security"] +
        0.05 * comps["llm_quality"]
    )

    # Compile bonus: encourage first milestone
    if comps["compile_score"] > 0.0:
        base_reward += 0.05

    # Harsh complexity penalty: if runtime is > 5x optimal, penalize heavily
    if exec_result.test_passed == exec_result.test_total and exec_result.test_total > 0:
        if exec_result.execution_time_seconds > task_info.optimal_time_seconds * 5:
            base_reward -= 0.30

    return base_reward, comps


def grade(*args, **kwargs) -> float:
    """Return only the scalar reward; fall back to 0.5 on unexpected arguments or errors."""
    try:
        if len(args) == 3:
            return calculate_reward(args[0], args[1], args[2])[0]
        return 0.5
    except Exception:
        return 0.5
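

# Illustrative usage: a minimal sketch of how calculate_reward might be driven.
# The keyword constructors for ExecutionResult and TaskInfo are assumptions; the
# .models module is not shown here, so only the attribute names used above
# (runtime_errors, test_passed, test_total, execution_time_seconds,
# optimal_time_seconds) are taken from this file and the exact constructor
# signatures are hypothetical. Run via `python -m <package>.<this_module>`
# (names hypothetical) so the relative import resolves.
if __name__ == "__main__":
    exec_result = ExecutionResult(
        runtime_errors=[],            # no runtime errors -> compile_score = 1.0
        test_passed=8,
        test_total=10,
        execution_time_seconds=1.2,
    )
    task_info = TaskInfo(optimal_time_seconds=1.0)
    proposed_fix = "def add(a, b):\n    return a + b\n"

    reward, components = calculate_reward(exec_result, task_info, proposed_fix)
    print(f"reward={reward:.2f}")
    print(f"components={components}")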