#!/usr/bin/env python3
"""Hackathon baseline inference for coding_env.
MANDATORY environment variables handled here:
- API_BASE_URL (defaulted)
- MODEL_NAME (defaulted)
- HF_TOKEN (no default)
- LOCAL_IMAGE_NAME (optional, for local Docker workflows)
"""
from __future__ import annotations
import json
import os
from typing import Any, Dict, List
import requests
from openai import OpenAI
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
HF_TOKEN = os.getenv("HF_TOKEN")
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000")
BENCHMARK = os.getenv("BENCHMARK", "coding_env")
MAX_STEPS = int(os.getenv("MAX_STEPS", "1"))
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.60"))
MIN_STRICT_SCORE = 0.01
MAX_STRICT_SCORE = 0.99
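
# ENV_BASE_URL is the coding_env HTTP server that the /reset, /step and /grader
# calls below are sent to; SUCCESS_SCORE_THRESHOLD is the cut-off run_task() uses
# to mark an episode as successful.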
def _bool_text(value: bool) -> str:
return "true" if value else "false"
def _strict_score(value: float) -> float:
return max(MIN_STRICT_SCORE, min(MAX_STRICT_SCORE, round(float(value), 4)))
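
# Illustrative clamping behaviour of _strict_score (not executed):
#   _strict_score(0.0)   -> 0.01
#   _strict_score(1.0)   -> 0.99
#   _strict_score(0.756) -> 0.756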
def log_start(task: str, env: str, model: str) -> None:
print(f"[START] task={task} env={env} model={model}", flush=True)
def log_step(
step: int, action: str, reward: float, done: bool, error: str | None
) -> None:
error_value = error if error else "null"
print(
f"[STEP] step={step} action={action} reward={reward:.2f} "
f"done={_bool_text(done)} error={error_value}",
flush=True,
)
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
print(
f"[END] success={_bool_text(success)} steps={steps} "
f"score={score:.2f} rewards={rewards_str}",
flush=True,
)
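
# The three log helpers above produce the structured trace, e.g. (values illustrative):
#   [START] task=task_easy_1 env=coding_env model=Qwen/Qwen2.5-72B-Instruct
#   [STEP] step=1 action=file=...;issue=logic;sev=medium;bug_type=logic;line=1;confidence=0.20 reward=0.45 done=true error=null
#   [END] success=false steps=1 score=0.45 rewards=0.45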
def _safe_json(method: str, url: str, **kwargs: Any) -> Dict[str, Any]:
try:
response = requests.request(method, url, timeout=30, **kwargs)
response.raise_for_status()
data = response.json()
if isinstance(data, dict):
return data
except Exception:
pass
return {}
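
# _safe_json() deliberately swallows network, HTTP and JSON-decoding errors and
# returns {}, so every caller must supply defaults for the keys it reads.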
def _task_list() -> List[str]:
data = _safe_json("GET", f"{ENV_BASE_URL}/tasks")
tasks = data.get("tasks", [])
if isinstance(tasks, list):
values: List[str] = []
for item in tasks:
if isinstance(item, dict) and item.get("task_id"):
values.append(str(item["task_id"]))
if values:
return values
return ["task_easy_1", "task_medium_1", "task_hard_1"]
def _build_action(
    client: OpenAI | None, task_description: str, code_snippet: str
) -> Dict[str, Any]:
fallback = {
"review": "Likely logic issue in this PR change; please review line-level semantics.",
"file_path": "services/metrics/aggregation.py",
"issue_type": "logic",
"severity": "medium",
"bug_type": "logic",
"line_number": 1,
"confidence": 0.20,
}
if client is None:
return fallback
prompt = f"""You are reviewing a production pull request.
Task: {task_description}
PR context:
{code_snippet}
Return ONLY valid JSON with keys:
review (string),
file_path (string from changed files),
issue_type (one of logic|security|performance|maintainability),
severity (one of low|medium|high|critical),
bug_type (one of syntax|logic|security|none),
line_number (integer),
confidence (0.0-1.0 float)
"""
try:
response = client.chat.completions.create(
model=MODEL_NAME,
temperature=0.0,
messages=[{"role": "user", "content": prompt}],
)
raw = (response.choices[0].message.content or "").strip()
raw = raw.replace("```json", "").replace("```", "").strip()
parsed = json.loads(raw)
if not isinstance(parsed, dict):
return fallback
return {
"review": str(parsed.get("review", fallback["review"])),
"file_path": str(parsed.get("file_path", fallback["file_path"])),
"issue_type": str(parsed.get("issue_type", fallback["issue_type"])),
"severity": str(parsed.get("severity", fallback["severity"])),
"bug_type": str(parsed.get("bug_type", fallback["bug_type"])),
"line_number": int(parsed.get("line_number", fallback["line_number"])),
"confidence": float(parsed.get("confidence", fallback["confidence"])),
}
except Exception:
return fallback
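
# Shape of the action dict built above (values illustrative, keys per the prompt schema):
#   {
#       "review": "Off-by-one in the aggregation window",
#       "file_path": "services/metrics/aggregation.py",
#       "issue_type": "logic",       # logic | security | performance | maintainability
#       "severity": "medium",        # low | medium | high | critical
#       "bug_type": "logic",         # syntax | logic | security | none
#       "line_number": 42,
#       "confidence": 0.60,
#   }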
def run_task(task_id: str, client: OpenAI | None) -> float:
episode_id = f"baseline-{task_id}"
rewards: List[float] = []
score = MIN_STRICT_SCORE
success = False
last_error: str | None = None
steps_taken = 0
log_start(task_id, BENCHMARK, MODEL_NAME)
try:
reset_data = _safe_json(
"POST",
f"{ENV_BASE_URL}/reset",
json={"task_id": task_id, "episode_id": episode_id},
)
obs = reset_data.get("observation", {}) if isinstance(reset_data, dict) else {}
task_description = str(obs.get("task_description", "Review code quality and bugs."))
code_snippet = str(obs.get("code_snippet", ""))
for step_num in range(1, MAX_STEPS + 1):
action = _build_action(client, task_description, code_snippet)
action_str = (
f"file={action['file_path']};"
f"issue={action['issue_type']};"
f"sev={action['severity']};"
f"bug_type={action['bug_type']};"
f"line={action['line_number']};"
f"confidence={float(action['confidence']):.2f}"
)
step_data = _safe_json(
"POST",
f"{ENV_BASE_URL}/step",
json={
"action": action,
"task_id": task_id,
"episode_id": episode_id,
},
)
reward = _strict_score(float(step_data.get("reward", MIN_STRICT_SCORE) or MIN_STRICT_SCORE))
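            # If the /step call failed, _safe_json returned {} and "done" defaults to True,
            # which stops the loop after logging the step.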
done = bool(step_data.get("done", not bool(step_data)))
obs_after = step_data.get("observation", {}) if isinstance(step_data, dict) else {}
raw_error = obs_after.get("last_action_error")
last_error = str(raw_error) if raw_error else None
rewards.append(reward)
steps_taken = step_num
log_step(step_num, action_str, reward, done, last_error)
if done:
break
grader_data = _safe_json(
"GET", f"{ENV_BASE_URL}/grader?task_id={task_id}&episode_id={episode_id}"
)
grader_score = _strict_score(float(grader_data.get("score", MIN_STRICT_SCORE) or MIN_STRICT_SCORE))
step_score = _strict_score(rewards[-1] if rewards else MIN_STRICT_SCORE)
score = _strict_score(max(grader_score, step_score))
success = score >= SUCCESS_SCORE_THRESHOLD
except Exception as exc:
last_error = str(exc)
if steps_taken == 0:
log_step(
1,
"bug_type=none;line=-1;confidence=0.00",
MIN_STRICT_SCORE,
True,
last_error,
)
rewards.append(MIN_STRICT_SCORE)
steps_taken = 1
score = MIN_STRICT_SCORE
success = False
finally:
log_end(success, max(1, steps_taken), score, rewards or [0.0])
return score
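
# run_task() assumes the environment exposes these endpoints on ENV_BASE_URL
# (field names as read above; missing fields fall back to MIN_STRICT_SCORE / defaults):
#   POST /reset  -> {"observation": {"task_description": ..., "code_snippet": ...}}
#   POST /step   -> {"reward": float, "done": bool,
#                    "observation": {"last_action_error": ...}}
#   GET  /grader?task_id=...&episode_id=... -> {"score": float}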
def main() -> Dict[str, float]:
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN) if HF_TOKEN else None
tasks = _task_list()
scores: Dict[str, float] = {}
for task_id in tasks:
scores[task_id] = run_task(task_id, client)
avg = sum(scores.values()) / len(scores) if scores else 0.0
scores["average"] = round(avg, 4)
return scores
if __name__ == "__main__":
main()
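
# main() queries GET /tasks for task ids (falling back to the three hard-coded ones)
# and returns per-task scores plus an "average" entry, e.g. (values illustrative):
#   {"task_easy_1": 0.62, "task_medium_1": 0.30, "task_hard_1": 0.01, "average": 0.31}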