""" Baseline inference script for CodeReviewEnv. Evaluates a model (via OpenAI-compatible API) across all three tasks and reports per-task and aggregate scores. Usage: HF_TOKEN= python agents/baseline_agent.py [--model MODEL] [--server URL] The script uses the Hugging Face Inference API (OpenAI-compatible endpoint) with the model specified via --model (default: Qwen/Qwen2.5-72B-Instruct). """ from __future__ import annotations import argparse import json import os import sys import time from typing import Any, Dict, List import requests from openai import OpenAI # ── Config ──────────────────────────────────────────────────────────────────── DEFAULT_MODEL = "Qwen/Qwen2.5-72B-Instruct" DEFAULT_SERVER = "http://localhost:7860" HF_BASE_URL = "https://api-inference.huggingface.co/v1" TASK_IDS = [ "task_1_easy_bug_hunt", "task_2_medium_security", "task_3_hard_perf_correctness", ] # ── Prompts ─────────────────────────────────────────────────────────────────── SYSTEM_PROMPT = """\ You are an expert software engineer performing a thorough code review. Your task is to: 1. Carefully read the provided code. 2. Identify ALL bugs, security vulnerabilities, performance issues, and correctness problems. 3. For each issue, output a JSON action with action_type="review". 4. After all issues are identified, output a patch with action_type="patch". 5. Finally, output action_type="submit" with your verdict. Each action must be valid JSON matching this schema: { "action_type": "review" | "patch" | "comment" | "submit", "severity": "critical" | "major" | "minor" | "info", // for review "issue_type": "bug" | "security" | "performance" | "logic" | "style", "line_number": , "description": "", "patched_code": "", // for patch "comment": "", "verdict": "approve" | "request_changes" | "reject", // for submit "confidence": <0.0-1.0> } Output ONE action JSON per message. Be precise and thorough. """ def build_user_prompt(obs: Dict[str, Any]) -> str: ctx = obs["review_context"] files_text = "\n\n".join( f"=== {f['filename']} ({f['language']}) ===\n{f['content']}" for f in ctx["files_changed"] ) prev = obs.get("previous_actions", []) issues_so_far = obs.get("issues_found_so_far", []) prompt = f"""Pull Request: {ctx['pull_request_title']} Author: {ctx['author']} Description: {ctx['description']} Linter: {ctx.get('linter_output', 'N/A')} Tests: {ctx.get('test_results', 'N/A')} --- CODE --- {files_text} --- END CODE --- Steps taken so far: {obs['step']} / {obs['max_steps']} Issues identified so far: {len(issues_so_far)} """ if issues_so_far: prompt += "\nIssues already reported:\n" for iss in issues_so_far: prompt += f" - [{iss.get('severity','?')}] line {iss.get('line','?')}: {iss.get('description','')}\n" if obs["step"] == 0: prompt += "\nPlease begin your review. Output your first action as JSON." elif obs["step"] >= obs["max_steps"] - 2: prompt += "\nYou are running low on steps. Please submit a patch and final verdict now." else: prompt += "\nContinue your review or submit if done. Output next action as JSON." return prompt # ── Agent loop ──────────────────────────────────────────────────────────────── def extract_json(text: str) -> Dict[str, Any]: """Extract first JSON object from model response.""" # Try direct parse first text = text.strip() try: return json.loads(text) except json.JSONDecodeError: pass # Find JSON block start = text.find("{") if start == -1: raise ValueError("No JSON found in response") depth = 0 for i, ch in enumerate(text[start:], start): if ch == "{": depth += 1 elif ch == "}": depth -= 1 if depth == 0: return json.loads(text[start : i + 1]) raise ValueError("Unbalanced JSON") def run_episode( client: OpenAI, model: str, server: str, task_id: str, ) -> Dict[str, Any]: """Run a single episode and return the result dict.""" # 1. Reset resp = requests.post(f"{server}/reset", json={"task_id": task_id}, timeout=30) resp.raise_for_status() data = resp.json() session_id = data["session_id"] obs = data["observation"] print(f"\n{'='*60}") print(f"Task: {task_id}") print(f"Session: {session_id}") print(f"{'='*60}") history: List[Dict[str, str]] = [] final_score = 0.0 done = False patch_submitted = False while not done: user_msg = build_user_prompt(obs) history.append({"role": "user", "content": user_msg}) # Call model try: completion = client.chat.completions.create( model=model, messages=[{"role": "system", "content": SYSTEM_PROMPT}] + history, max_tokens=1024, temperature=0.2, ) raw = completion.choices[0].message.content or "" except Exception as exc: print(f" [Model error] {exc}") break history.append({"role": "assistant", "content": raw}) # Parse action try: action_dict = extract_json(raw) except ValueError as exc: print(f" [Parse error] {exc} | raw={raw[:200]!r}") # Force a submit to avoid infinite spin action_dict = {"action_type": "submit", "verdict": "request_changes", "confidence": 0.3} action_type = action_dict.get("action_type", "review") print(f" Step {obs['step']+1}: {action_type} | {action_dict.get('description','')[:80]}") # Auto-submit near step limit if obs["step"] >= obs["max_steps"] - 1 and action_type != "submit": action_dict = {"action_type": "submit", "verdict": "request_changes", "confidence": 0.5} if not patch_submitted: # Submit a patch first action_dict = { "action_type": "patch", "patched_code": obs["review_context"]["files_changed"][0]["content"], } if action_type == "patch": patch_submitted = True # Step step_resp = requests.post( f"{server}/step", json={"session_id": session_id, "action": action_dict}, timeout=30, ) step_resp.raise_for_status() step_data = step_resp.json() obs = step_data["observation"] done = step_data["done"] info = step_data.get("info", {}) if done: final_score = info.get("final_score", 0.0) breakdown = info.get("breakdown", {}) print(f"\n Final score: {final_score:.4f}") print(f" Breakdown: {json.dumps(breakdown, indent=4)}") time.sleep(0.3) # be polite to the API # Cleanup requests.delete(f"{server}/session/{session_id}", timeout=10) return { "task_id": task_id, "final_score": final_score, "steps_taken": obs["step"], } # ── Main ────────────────────────────────────────────────────────────────────── def main(): parser = argparse.ArgumentParser(description="CodeReviewEnv baseline agent") parser.add_argument("--model", default=DEFAULT_MODEL) parser.add_argument("--server", default=DEFAULT_SERVER) parser.add_argument("--task", default=None, help="Run a single task (default: all)") args = parser.parse_args() hf_token = os.environ.get("HF_TOKEN") if not hf_token: print("ERROR: HF_TOKEN environment variable not set.", file=sys.stderr) sys.exit(1) client = OpenAI( api_key=hf_token, base_url=HF_BASE_URL, ) tasks = [args.task] if args.task else TASK_IDS results = [] for task_id in tasks: result = run_episode(client, args.model, args.server, task_id) results.append(result) # Summary print("\n" + "=" * 60) print("BASELINE SUMMARY") print("=" * 60) for r in results: print(f" {r['task_id']:<40} score={r['final_score']:.4f} steps={r['steps_taken']}") if len(results) == len(TASK_IDS): avg = sum(r["final_score"] for r in results) / len(results) print(f"\n Aggregate average score: {avg:.4f}") # Save results out_path = "baseline_results.json" with open(out_path, "w") as f: json.dump({"model": args.model, "results": results}, f, indent=2) print(f"\n Results saved to {out_path}") if __name__ == "__main__": main()