code-review-env / baseline.py
ncncomplete's picture
Upload folder using huggingface_hub
6b42632 verified
#!/usr/bin/env python3
"""
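Standalone baseline inference script.

For each of the three difficulty levels (easy, medium, hard), resets the
environment server at http://localhost:8000, asks OpenAI's gpt-4o-mini to
review the returned Python code snippet, submits that review, and records
the resulting score. Requires the OPENAI_API_KEY environment variable.
Saves results to baseline_scores.json.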
"""
import os
import json
import requests
from openai import OpenAI
# Credentials: the OpenAI key is read from the environment, and the script
# refuses to start without one.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")
client = OpenAI(api_key=api_key)

# Address of the environment server, and the task ids to evaluate in order.
BASE_URL = "http://localhost:8000"
TASKS = ["easy", "medium", "hard"]
def reset_task(task_id: str, timeout: float = 30.0) -> dict:
    """Reset the environment for ``task_id`` and return the server's reply.

    Sends ``POST {BASE_URL}/reset`` with the JSON body
    ``{"task_id": task_id}``.

    Args:
        task_id: Identifier of the task to load.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` waits indefinitely, so a stalled server would hang
            the whole script.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.post(
        f"{BASE_URL}/reset",
        json={"task_id": task_id},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
def step_task(action: dict, timeout: float = 30.0) -> dict:
    """Submit ``action`` to the environment and return the server's reply.

    Sends ``POST {BASE_URL}/step`` with the JSON body ``{"action": action}``.

    Args:
        action: JSON-serializable mapping describing the review to submit.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` waits indefinitely, so a stalled server would hang
            the whole script.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.post(
        f"{BASE_URL}/step",
        json={"action": action},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
def _parse_json_object(content: str) -> dict:
    """Decode the JSON object contained in the model reply ``content``.

    Tries to parse the whole text first. If that fails, falls back to the
    span from the first ``{`` to the last ``}``, which copes with replies
    that wrap the JSON in prose or a Markdown code fence.

    Raises:
        ValueError: If no JSON object can be decoded from ``content``
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        start = content.find("{")
        end = content.rfind("}") + 1
        if start == -1 or end <= start:
            raise ValueError(f"Could not parse JSON from response: {content}")
        parsed = json.loads(content[start:end])

    # Callers use ``.get`` on the result, so a list, string or number that
    # happens to be valid JSON is still an unusable reply.
    if not isinstance(parsed, dict):
        raise ValueError(f"Could not parse JSON from response: {content}")
    return parsed


def review_code(code_snippet: str) -> dict:
    """Ask gpt-4o-mini to review ``code_snippet`` and return its parsed reply.

    The prompt asks the model to answer as JSON with the keys ``review``,
    ``bug_type``, ``line_number`` and ``confidence``. Those keys are
    requested, not enforced: the returned dict is whatever the model sent.

    Args:
        code_snippet: Source code to embed in the prompt.

    Returns:
        The JSON object decoded from the model's reply.

    Raises:
        ValueError: If the reply is empty or contains no JSON object.
    """
    prompt = f"""Review this Python code. Reply as JSON with keys: review (str), bug_type (syntax/logic/security/none), line_number (int), confidence (float)
Code:
{code_snippet}"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0.7
    )
    content = response.choices[0].message.content
    # The message content may be None (e.g. a refusal); fail with a clear
    # ValueError instead of a TypeError from json.loads.
    if not content:
        raise ValueError("Model returned an empty response")
    return _parse_json_object(content)
def _coerce_number(value, cast, default):
    """Return ``cast(value)``, or ``default`` when the value cannot be converted."""
    try:
        return cast(value)
    except (TypeError, ValueError, OverflowError):
        return default


def run_baseline():
    """Run baseline inference on every task in ``TASKS`` and save the scores.

    For each task: reset the environment, send the returned ``code_snippet``
    to the model, submit the model's review as an action, and record the
    ``cumulative_reward`` and ``previous_feedback`` fields of the step
    response. The per-task scores, details and rounded average are printed
    and written to ``baseline_scores.json`` in the working directory.

    Returns:
        A dict with the keys ``scores`` (task id -> score), ``details``
        (task id -> action, feedback and score) and ``average``.
    """
    results = {
        "scores": {},
        "details": {}
    }
    for task_id in TASKS:
        print(f"\n{'='*60}")
        print(f"Running task: {task_id}")
        print('='*60)

        # Reset environment
        obs = reset_task(task_id)
        code_snippet = obs.get("code_snippet", "")
        print(f"Code snippet:\n{code_snippet}\n")

        # Get review from GPT-4o-mini
        print("Calling GPT-4o-mini for review...")
        review_result = review_code(code_snippet)
        print(f"Review result: {review_result}")

        # Prepare action. The model may answer null or a non-numeric value
        # for the numeric fields (typically when it reports no bug); fall
        # back to the defaults instead of aborting the whole run.
        action = {
            "review": review_result.get("review", ""),
            "bug_type": review_result.get("bug_type", "none"),
            "line_number": _coerce_number(
                review_result.get("line_number", -1), int, -1
            ),
            "confidence": _coerce_number(
                review_result.get("confidence", 0.0), float, 0.0
            ),
        }

        # Submit action to environment
        print(f"Submitting action: {action}")
        step_obs = step_task(action)

        # Extract score and feedback from the step response
        score = step_obs.get("cumulative_reward", 0.0)
        feedback = step_obs.get("previous_feedback", "")
        print(f"Score: {score}")
        print(f"Feedback: {feedback}")

        results["scores"][task_id] = score
        results["details"][task_id] = {
            "action": action,
            "feedback": feedback,
            "score": score
        }

    # Calculate average
    scores = list(results["scores"].values())
    average = sum(scores) / len(scores) if scores else 0.0
    results["average"] = round(average, 4)

    # Print summary
    print(f"\n{'='*60}")
    print("BASELINE RESULTS")
    print('='*60)
    for task_id in TASKS:
        print(f"{task_id:10s}: {results['scores'][task_id]:.4f}")
    print(f"{'Average':10s}: {results['average']:.4f}")
    print('='*60 + "\n")

    # Save to file
    with open("baseline_scores.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print("Results saved to baseline_scores.json")
    return results
# Script entry point: run the full baseline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    run_baseline()