CodeReviewEnv / agents /baseline_agent.py
janakb's picture
comit
ced8fd0
"""
Baseline inference script for CodeReviewEnv.
Evaluates a model (via OpenAI-compatible API) across all three tasks and
reports per-task and aggregate scores.
Usage:
HF_TOKEN=<your_token> python agents/baseline_agent.py [--model MODEL] [--server URL]
The script uses the Hugging Face Inference API (OpenAI-compatible endpoint)
with the model specified via --model (default: Qwen/Qwen2.5-72B-Instruct).
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
from typing import Any, Dict, List
import requests
from openai import OpenAI
# ── Config ────────────────────────────────────────────────────────────────────
# Model used when --model is not given on the command line.
DEFAULT_MODEL = "Qwen/Qwen2.5-72B-Instruct"
# Base URL of the environment server used when --server is not given.
DEFAULT_SERVER = "http://localhost:7860"
# Passed as base_url to the OpenAI client that performs the chat completions.
HF_BASE_URL = "https://api-inference.huggingface.co/v1"
# Tasks run (in this order) when --task is not given.
TASK_IDS = [
    "task_1_easy_bug_hunt",
    "task_2_medium_security",
    "task_3_hard_perf_correctness",
]
# ── Prompts ───────────────────────────────────────────────────────────────────
# System message prepended to every chat-completion request. It tells the model
# to emit exactly one JSON action per reply; the "//" annotations inside the
# schema are part of the prompt text sent to the model, not Python comments.
SYSTEM_PROMPT = """\
You are an expert software engineer performing a thorough code review.
Your task is to:
1. Carefully read the provided code.
2. Identify ALL bugs, security vulnerabilities, performance issues, and correctness problems.
3. For each issue, output a JSON action with action_type="review".
4. After all issues are identified, output a patch with action_type="patch".
5. Finally, output action_type="submit" with your verdict.
Each action must be valid JSON matching this schema:
{
"action_type": "review" | "patch" | "comment" | "submit",
"severity": "critical" | "major" | "minor" | "info", // for review
"issue_type": "bug" | "security" | "performance" | "logic" | "style",
"line_number": <int or null>,
"description": "<concise description of the issue>",
"patched_code": "<full corrected code>", // for patch
"comment": "<optional comment>",
"verdict": "approve" | "request_changes" | "reject", // for submit
"confidence": <0.0-1.0>
}
Output ONE action JSON per message. Be precise and thorough.
"""
def build_user_prompt(obs: Dict[str, Any]) -> str:
    """Render the current observation as the next user message for the model.

    Args:
        obs: Observation dict returned by the environment server. Reads
            ``review_context`` (pull-request metadata plus ``files_changed``),
            ``step``, ``max_steps`` and, optionally, ``issues_found_so_far``.

    Returns:
        Prompt text made of the PR header, every changed file, the progress
        counters, the issues reported so far, and a closing instruction that
        depends on how many steps remain.

    Raises:
        KeyError: If a required field is missing from ``obs``.
    """
    ctx = obs["review_context"]
    files_text = "\n\n".join(
        f"=== {f['filename']} ({f['language']}) ===\n{f['content']}"
        for f in ctx["files_changed"]
    )
    # ``or []`` also covers a server that sends an explicit null for this field.
    issues_so_far = obs.get("issues_found_so_far") or []

    parts = [
        f"""Pull Request: {ctx['pull_request_title']}
Author: {ctx['author']}
Description: {ctx['description']}
Linter: {ctx.get('linter_output', 'N/A')}
Tests: {ctx.get('test_results', 'N/A')}
--- CODE ---
{files_text}
--- END CODE ---
Steps taken so far: {obs['step']} / {obs['max_steps']}
Issues identified so far: {len(issues_so_far)}
"""
    ]

    if issues_so_far:
        parts.append("\nIssues already reported:\n")
        for iss in issues_so_far:
            # NOTE(review): the action schema names this field "line_number";
            # fall back to it when "line" is absent — confirm which key the
            # server actually echoes back.
            line = iss.get("line", iss.get("line_number", "?"))
            parts.append(
                f" - [{iss.get('severity', '?')}] line {line}: {iss.get('description', '')}\n"
            )

    if obs["step"] == 0:
        parts.append("\nPlease begin your review. Output your first action as JSON.")
    elif obs["step"] >= obs["max_steps"] - 2:
        parts.append(
            "\nYou are running low on steps. Please submit a patch and final verdict now."
        )
    else:
        parts.append("\nContinue your review or submit if done. Output next action as JSON.")

    return "".join(parts)
# ── Agent loop ────────────────────────────────────────────────────────────────
def extract_json(text: str) -> Dict[str, Any]:
    """Extract the first JSON object from a model response.

    The whole (stripped) response is tried first. If that is not a JSON
    object, every ``{`` in the text is tried in order as the start of an
    object, so surrounding prose, Markdown fences and stray braces are
    tolerated. Parsing is delegated to the ``json`` decoder, which means
    braces inside string values do not confuse the search.

    Args:
        text: Raw text returned by the model.

    Returns:
        The first JSON object found, as a dict.

    Raises:
        ValueError: If the text contains no ``{`` at all, or none of the
            candidate positions starts a valid JSON object.
    """
    text = text.strip()

    # Fast path: the reply is nothing but JSON.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        # A bare list/number/string is valid JSON but not a usable action;
        # fall through and look for an embedded object instead.
        if isinstance(parsed, dict):
            return parsed

    start = text.find("{")
    if start == -1:
        raise ValueError("No JSON found in response")

    decoder = json.JSONDecoder()
    while start != -1:
        try:
            # raw_decode parses one value starting at ``start`` and ignores
            # whatever trails it; starting on "{" it can only yield a dict.
            candidate, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return candidate

    raise ValueError("Unbalanced JSON")
def run_episode(
    client: OpenAI,
    model: str,
    server: str,
    task_id: str,
) -> Dict[str, Any]:
    """Run a single episode against the environment server.

    The episode is started with ``POST {server}/reset``. Each loop iteration
    builds a user prompt from the latest observation, asks the model for one
    action (system prompt plus the full conversation so far), and forwards
    the action with ``POST {server}/step`` until the server reports ``done``
    or the model call fails. The session is deleted afterwards, including
    when a step request raises.

    Args:
        client: OpenAI-compatible client used for chat completions.
        model: Model identifier passed to the completion call.
        server: Base URL of the environment server (no trailing slash).
        task_id: Task to request from ``/reset``.

    Returns:
        Dict with ``task_id``, ``final_score`` (0.0 if the episode never
        reached ``done``) and ``steps_taken`` (``step`` of the last
        observation received).

    Raises:
        requests.RequestException: If the reset or a step request fails or
            returns an HTTP error status.
    """
    # 1. Reset
    resp = requests.post(f"{server}/reset", json={"task_id": task_id}, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    session_id = data["session_id"]
    obs = data["observation"]

    print(f"\n{'='*60}")
    print(f"Task: {task_id}")
    print(f"Session: {session_id}")
    print(f"{'='*60}")

    history: List[Dict[str, str]] = []
    final_score = 0.0
    done = False
    patch_submitted = False

    try:
        while not done:
            user_msg = build_user_prompt(obs)
            history.append({"role": "user", "content": user_msg})

            # Call model
            try:
                completion = client.chat.completions.create(
                    model=model,
                    messages=[{"role": "system", "content": SYSTEM_PROMPT}] + history,
                    max_tokens=1024,
                    temperature=0.2,
                )
                raw = completion.choices[0].message.content or ""
            except Exception as exc:
                # Boundary with the remote model API: report and end the episode.
                print(f" [Model error] {exc}")
                break
            history.append({"role": "assistant", "content": raw})

            # Parse action
            try:
                action_dict = extract_json(raw)
                if not isinstance(action_dict, dict):
                    raise ValueError("Model response is not a JSON object")
            except ValueError as exc:
                print(f" [Parse error] {exc} | raw={raw[:200]!r}")
                # Force a submit to avoid infinite spin
                action_dict = {
                    "action_type": "submit",
                    "verdict": "request_changes",
                    "confidence": 0.3,
                }

            action_type = action_dict.get("action_type", "review")
            # "description" may be absent or an explicit null in the model output.
            description = str(action_dict.get("description") or "")
            print(f" Step {obs['step']+1}: {action_type} | {description[:80]}")

            # Auto-submit near step limit
            if obs["step"] >= obs["max_steps"] - 1 and action_type != "submit":
                files = obs["review_context"]["files_changed"]
                if not patch_submitted and files:
                    # Submit a patch first (the unmodified first file).
                    action_dict = {
                        "action_type": "patch",
                        "patched_code": files[0]["content"],
                    }
                else:
                    action_dict = {
                        "action_type": "submit",
                        "verdict": "request_changes",
                        "confidence": 0.5,
                    }
                # Track what is actually sent, so a forced patch is recorded
                # below and the next forced action becomes a submit.
                action_type = action_dict["action_type"]
                print(f" [Override] step limit reached; sending {action_type}")

            if action_type == "patch":
                patch_submitted = True

            # Step
            step_resp = requests.post(
                f"{server}/step",
                json={"session_id": session_id, "action": action_dict},
                timeout=30,
            )
            step_resp.raise_for_status()
            step_data = step_resp.json()
            obs = step_data["observation"]
            done = step_data["done"]
            info = step_data.get("info", {})

            if done:
                final_score = info.get("final_score", 0.0)
                breakdown = info.get("breakdown", {})
                print(f"\n Final score: {final_score:.4f}")
                print(f" Breakdown: {json.dumps(breakdown, indent=4)}")

            time.sleep(0.3)  # be polite to the API
    finally:
        # Cleanup is best-effort: a failed DELETE must not hide the result
        # or mask an exception raised inside the loop.
        try:
            requests.delete(f"{server}/session/{session_id}", timeout=10)
        except requests.RequestException as exc:
            print(f" [Cleanup error] {exc}")

    return {
        "task_id": task_id,
        "final_score": final_score,
        "steps_taken": obs["step"],
    }
# ── Main ──────────────────────────────────────────────────────────────────────
def main() -> None:
    """Command-line entry point: run the baseline and report scores.

    Parses ``--model``, ``--server`` and ``--task``, requires the ``HF_TOKEN``
    environment variable, runs one episode per selected task, prints a
    summary, and writes the results to ``baseline_results.json``.

    A task whose HTTP requests fail is recorded with a score of 0.0 and an
    ``error`` field instead of aborting the run, so scores already collected
    are still printed and saved. In that case no aggregate is printed and the
    process exits with status 1 after saving.
    """
    parser = argparse.ArgumentParser(description="CodeReviewEnv baseline agent")
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--server", default=DEFAULT_SERVER)
    parser.add_argument("--task", default=None, help="Run a single task (default: all)")
    args = parser.parse_args()

    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        print("ERROR: HF_TOKEN environment variable not set.", file=sys.stderr)
        sys.exit(1)

    client = OpenAI(
        api_key=hf_token,
        base_url=HF_BASE_URL,
    )

    # Endpoints are built as f"{server}/reset"; avoid "//reset" when the URL
    # was given with a trailing slash.
    server = args.server.rstrip("/")
    tasks = [args.task] if args.task else TASK_IDS

    results = []
    any_failed = False
    for task_id in tasks:
        try:
            result = run_episode(client, args.model, server, task_id)
        except requests.RequestException as exc:
            any_failed = True
            print(f"ERROR: task {task_id} failed: {exc}", file=sys.stderr)
            result = {
                "task_id": task_id,
                "final_score": 0.0,
                "steps_taken": 0,
                "error": str(exc),
            }
        results.append(result)

    # Summary
    print("\n" + "=" * 60)
    print("BASELINE SUMMARY")
    print("=" * 60)
    for r in results:
        print(f" {r['task_id']:<40} score={r['final_score']:.4f} steps={r['steps_taken']}")

    # Only a complete, error-free run yields a meaningful aggregate.
    if len(results) == len(TASK_IDS) and not any_failed:
        avg = sum(r["final_score"] for r in results) / len(results)
        print(f"\n Aggregate average score: {avg:.4f}")

    # Save results
    out_path = "baseline_results.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({"model": args.model, "results": results}, f, indent=2)
    print(f"\n Results saved to {out_path}")

    if any_failed:
        sys.exit(1)
# Run the baseline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()