codearena-rl / temp_train.py
havinashpatil
Finalizing CodeArena RL Benchmark: frontend improvements, GRPO training scripts, and cleaned environment
03a7eb9
!pip install trl transformers datasets httpx fastapi uvicorn pydantic openai
!git clone https://github.com/havinashpatil/meta.git
!cd meta && pip install -r requirements.txt
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer
import httpx
# Start the backend server in the background (Colab trick)
import subprocess
import time
subprocess.Popen(["uvicorn", "server.app:app", "--port", "7860", "--app-dir", "meta"])
time.sleep(5) # Wait for server to start
def codearena_reward_func(completions, prompts):
"""
Reward function that queries the CodeArena OpenEnv server.
For each proposed fix in `completions`, we step the environment.
"""
rewards = []
for completion in completions:
# Clean the generated code
proposed_fix = completion[0].get('content', '').strip()
if proposed_fix.startswith('```python'):
proposed_fix = proposed_fix[9:].replace('```', '').strip()
try:
# Step the environment
res = httpx.post(
"http://localhost:7860/step",
json={"proposed_fix": proposed_fix},
timeout=10.0
)
res.raise_for_status()
reward = res.json().get('reward', 0.0)
rewards.append(reward)
except Exception as e:
print(f"Env Error: {e}")
rewards.append(0.0)
return rewards
# Load Model
model_name = "Qwen/Qwen2.5-Coder-1.5B"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
# Load dataset for Coding Debugging and Time Complexity Optimization
dataset = load_dataset("m-a-p/Code-Feedback", split="train")
def format_prompt(example):
# m-a-p/Code-Feedback contains 'messages' with user and assistant roles
messages = example.get('messages', [])
user_query = ""
if messages and len(messages) > 0 and messages[0].get('role') == 'user':
user_query = messages[0].get('content', '')
prompt = f"Optimize and debug this code to improve time complexity:\n{user_query}"
return {"prompt": prompt}
dataset = dataset.map(format_prompt)
# Keep only the prompt column for the trainer
dataset = dataset.select_columns(["prompt"])
# Limit for demo purposes
dataset = dataset.select(range(100))
# Initialize GRPO Trainer
training_args = GRPOConfig(
output_dir="./codearena-grpo",
learning_rate=1e-5,
max_steps=50,
per_device_train_batch_size=2,
gradient_accumulation_steps=2,
)
trainer = GRPOTrainer(
model=model,
reward_funcs=codearena_reward_func,
args=training_args,
train_dataset=dataset,
)
trainer.train()