codearena-rl / inference.py
havinashpatil
fix: clamp reward to [0.01,0.99] so .2f never rounds to 0.00 or 1.00
59fd9d3
"""
CodeArena RL Inference
Rewritten for strict OpenEnv parsing.
"""
import argparse
import math
import os
from datetime import datetime

import httpx
from openai import OpenAI
# Base URL of the local environment server that exposes /reset and /step.
_ENV_URL = "http://localhost:7860"
# Upper bound on repair attempts per task.
_MAX_STEPS = 5
# Reward bounds, chosen so that ":.2f" never renders 0.00 or 1.00.
_REWARD_MIN = 0.01
_REWARD_MAX = 0.99


def _one_line(exc):
    """Return the text of *exc* flattened so it fits on a single log line."""
    return str(exc).replace("\n", " ").replace("\r", "")


def _clamp_reward(raw_reward):
    """Convert *raw_reward* to a float clamped to [_REWARD_MIN, _REWARD_MAX].

    Values that cannot be converted (``None``, non-numeric strings, oversized
    ints) and NaN map to the lower bound.  The explicit NaN check matters:
    ``min(0.99, float("nan"))`` evaluates to 0.99, which would otherwise
    turn a NaN reward into the maximum one.
    """
    try:
        value = float(raw_reward)
    except (TypeError, ValueError, OverflowError):
        return _REWARD_MIN
    if math.isnan(value):
        return _REWARD_MIN
    return max(_REWARD_MIN, min(_REWARD_MAX, value))


def _strip_markdown_fences(text):
    """Remove surrounding whitespace and a wrapping Markdown code fence."""
    text = text.strip()
    if text.startswith("```python"):
        text = text[len("```python"):]
    elif text.startswith("```"):
        text = text[len("```"):]
    if text.endswith("```"):
        text = text[:-len("```")]
    return text.strip()


def run_task(task_id: str, backend: str):
    """Run one repair episode for *task_id* and print its log lines.

    The function resets the environment server, then for up to ``_MAX_STEPS``
    steps asks the LLM for a fix, posts it to ``/step`` and prints a
    ``[STEP]`` line.  It finishes by appending one row to ``rewards_log.csv``
    in the current working directory and printing the ``[END]`` line.

    Args:
        task_id: Task identifier sent to ``/reset``.
        backend: ``"hf"`` to generate with a local ``transformers`` pipeline;
            any other value uses the OpenAI-compatible client.

    Environment variables read: ``API_BASE_URL``, ``HF_TOKEN`` / ``API_KEY``
    and ``MODEL_NAME``.

    Returns:
        None.  All results are reported through stdout and the CSV file.
    """
    base_url = os.environ.get("API_BASE_URL")
    api_key = os.environ.get("HF_TOKEN") or os.environ.get("API_KEY")
    model_name = os.environ.get("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")

    hf_pipeline = None
    client = None
    if backend == "hf":
        # Imported lazily so the OpenAI path does not require transformers.
        from transformers import pipeline
        hf_pipeline = pipeline("text-generation", model=model_name)
    else:
        client = OpenAI(
            base_url=base_url,
            api_key=api_key or "NO_KEY_PROVIDED",
        )

    # 1. Print the [START] line
    print(f"[START] task={task_id} env=codearena-rl-benchmark model={model_name}")

    # 2. Reset the environment.  The broad except is deliberate: whatever
    # goes wrong, the [STEP]/[END] lines must still be emitted.
    try:
        response = httpx.post(f"{_ENV_URL}/reset", json={"task_id": task_id}, timeout=30.0)
        response.raise_for_status()
        obs_json = response.json()
        if not isinstance(obs_json, dict):
            raise ValueError(f"unexpected /reset payload type: {type(obs_json).__name__}")
    except Exception as e:
        error_msg = _one_line(e)
        print(f"[STEP] step=1 action=reset_failed reward=0.01 done=true error={error_msg}")
        print(f"[END] success=false steps=1 score=0.01 rewards=0.01")
        return

    rewards = []
    done = False
    step = 0
    # The system prompt does not depend on the observation, so build it once.
    system_prompt = (
        "You are an expert Python code repair agent. Fix the buggy Python code.\n"
        "Return ONLY the fixed raw Python code. No markdown, no explanation."
    )

    # 3. Up to _MAX_STEPS attempts, stopping as soon as the episode is done
    while not done and step < _MAX_STEPS:
        step += 1

        obs = obs_json.get("observation")
        if not isinstance(obs, dict):
            # A missing or null observation is treated as an empty one.
            obs = {}
        buggy_code = obs.get("buggy_code", "")
        error_log = obs.get("error_log", "")
        test_results = obs.get("test_results", "")

        user_prompt = f"Fix this buggy Python code:\n\n{buggy_code}\n\nError log:\n{error_log}\n\nTest results so far:\n{test_results}"

        error_msg = "null"
        proposed_fix = ""

        # 3b/c. Call the LLM
        try:
            if backend == "hf":
                prompt = f"{system_prompt}\n\n{user_prompt}"
                output = hf_pipeline(prompt, max_new_tokens=512, return_full_text=False)
                proposed_fix = output[0]["generated_text"]
            else:
                completion = client.chat.completions.create(
                    model=model_name,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt},
                    ],
                )
                proposed_fix = completion.choices[0].message.content
        except Exception as e:
            error_msg = _one_line(e)
            # If the LLM call fails, resubmit the buggy code as the fix.
            proposed_fix = obs.get("buggy_code", "pass")

        # Strip Markdown fences in case the LLM ignored the instructions.
        # A missing (None) completion becomes "" so /step always gets a string.
        proposed_fix = _strip_markdown_fences(proposed_fix) if proposed_fix else ""

        # 3d. Send proposed_fix to /step
        try:
            step_resp = httpx.post(f"{_ENV_URL}/step", json={"proposed_fix": proposed_fix}, timeout=60.0)
            step_resp.raise_for_status()
            step_data = step_resp.json()
            if not isinstance(step_data, dict):
                raise ValueError(f"unexpected /step payload type: {type(step_data).__name__}")
            raw_reward = step_data.get("reward", 0.0)
            done = step_data.get("done", True)
            obs_json = step_data
        except Exception as e:
            raw_reward = _REWARD_MIN
            done = True
            # Keep the LLM error if there was one; it happened first.
            if error_msg == "null":
                error_msg = _one_line(e)

        # 3e. Clamp the reward; malformed or NaN rewards fall to the floor
        # instead of raising or being rounded up to the ceiling.
        reward = _clamp_reward(raw_reward)
        rewards.append(reward)

        # 3f. Print the [STEP] line immediately.  Any error in this step
        # (LLM or /step) labels the action as fallback_fix.
        done_str = "true" if done else "false"
        action_summary = "llm_fix" if error_msg == "null" else "fallback_fix"
        print(f"[STEP] step={step} action={action_summary} reward={reward:.2f} done={done_str} error={error_msg}")

    # 4. Log the final step to CSV and print [END]
    timestamp = datetime.now().isoformat()

    # Tolerate a missing or null "info" / "reward_components" entry.
    info = obs_json.get("info")
    rc = info.get("reward_components") if isinstance(info, dict) else None
    if not isinstance(rc, dict):
        rc = {}
    compile_score = rc.get("compile_score", 0.0)
    test_ratio = rc.get("test_ratio", 0.0)
    efficiency_score = rc.get("efficiency", 0.0)

    final_reward = rewards[-1] if rewards else 0.0

    csv_path = "rewards_log.csv"
    # Write the header row only when the file is being created.
    write_headers = not os.path.exists(csv_path)
    with open(csv_path, "a", encoding="utf-8") as f:
        if write_headers:
            f.write("timestamp,task_id,step,reward,compile_score,test_ratio,efficiency_score\n")
        f.write(f"{timestamp},{task_id},{step},{final_reward},{compile_score},{test_ratio},{efficiency_score}\n")

    # The episode counts as a success if any single step scored above 0.5.
    success = any(r > 0.5 for r in rewards)
    success_str = "true" if success else "false"
    rewards_str = ",".join([f"{r:.2f}" for r in rewards])
    # Score is the mean step reward, clamped to the same bounds as a reward.
    score = max(_REWARD_MIN, min(_REWARD_MAX, (sum(rewards) / len(rewards)) if rewards else 0.5))
    print(f"[END] success={success_str} steps={step} score={score:.2f} rewards={rewards_str}")
def main():
    """Parse the command line and run the selected task(s).

    When the ``CODEARENA_TASK`` environment variable is set to a non-empty
    value, only that task is run; otherwise the "easy", "medium" and "hard"
    tasks are run in that order.
    """
    arg_parser = argparse.ArgumentParser(description="CodeArena RL Inference")
    arg_parser.add_argument(
        "--backend",
        type=str,
        choices=["openai", "hf"],
        default="openai",
        help="Backend to use for LLM generation.",
    )
    options = arg_parser.parse_args()

    requested_task = os.environ.get("CODEARENA_TASK")
    task_ids = [requested_task] if requested_task else ["easy", "medium", "hard"]
    for task_name in task_ids:
        run_task(task_name, options.backend)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()