#!/usr/bin/env python3
"""Hackathon baseline inference for coding_env.

Environment variables handled here:
- API_BASE_URL (defaulted)
- MODEL_NAME (defaulted)
- HF_TOKEN (no default; without it a static fallback action is used)
- LOCAL_IMAGE_NAME (optional, for local Docker workflows)
"""
from __future__ import annotations

import json
import os
from typing import Any, Dict, List

import requests
from openai import OpenAI
# Model/API configuration (all overridable via environment variables).
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
HF_TOKEN = os.getenv("HF_TOKEN")
LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")

# Environment-server and episode configuration.
ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000")
BENCHMARK = os.getenv("BENCHMARK", "coding_env")
MAX_STEPS = int(os.getenv("MAX_STEPS", "1"))
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.60"))

# Scores are clamped away from exact 0.0 and 1.0.
MIN_STRICT_SCORE = 0.01
MAX_STRICT_SCORE = 0.99
def _bool_text(value: bool) -> str:
    return "true" if value else "false"


def _strict_score(value: float) -> float:
    """Round to 4 decimals, then clamp into [MIN_STRICT_SCORE, MAX_STRICT_SCORE]."""
    return max(MIN_STRICT_SCORE, min(MAX_STRICT_SCORE, round(float(value), 4)))
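# Examples: _strict_score(1.5) -> 0.99; _strict_score(-0.2) -> 0.01;
# _strict_score(0.437261) -> 0.4373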
def log_start(task: str, env: str, model: str) -> None:
    print(f"[START] task={task} env={env} model={model}", flush=True)


def log_step(
    step: int, action: str, reward: float, done: bool, error: str | None
) -> None:
    error_value = error if error else "null"
    print(
        f"[STEP] step={step} action={action} reward={reward:.2f} "
        f"done={_bool_text(done)} error={error_value}",
        flush=True,
    )


def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(
        f"[END] success={_bool_text(success)} steps={steps} "
        f"score={score:.2f} rewards={rewards_str}",
        flush=True,
    )
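# Illustrative log stream for one episode (values made up):
#   [START] task=task_easy_1 env=coding_env model=Qwen/Qwen2.5-72B-Instruct
#   [STEP] step=1 action=file=app.py;issue=logic;sev=medium;bug_type=logic;line=1;confidence=0.20 reward=0.20 done=true error=null
#   [END] success=false steps=1 score=0.20 rewards=0.20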
def _safe_json(method: str, url: str, **kwargs: Any) -> Dict[str, Any]:
    """Issue an HTTP request and return the JSON body, or {} on any failure."""
    try:
        response = requests.request(method, url, timeout=30, **kwargs)
        response.raise_for_status()
        data = response.json()
        if isinstance(data, dict):
            return data
    except Exception:
        pass
    return {}
def _task_list() -> List[str]:
    """Fetch task ids from the environment server, with a static fallback list."""
    data = _safe_json("GET", f"{ENV_BASE_URL}/tasks")
    tasks = data.get("tasks", [])
    if isinstance(tasks, list):
        values: List[str] = []
        for item in tasks:
            if isinstance(item, dict) and item.get("task_id"):
                values.append(str(item["task_id"]))
        if values:
            return values
    return ["task_easy_1", "task_medium_1", "task_hard_1"]
def _build_action(
    client: OpenAI | None, task_description: str, code_snippet: str
) -> Dict[str, Any]:
    """Ask the model for a structured review; fall back to a static action."""
    fallback = {
        "review": "Likely logic issue in this PR change; please review line-level semantics.",
        "file_path": "services/metrics/aggregation.py",
        "issue_type": "logic",
        "severity": "medium",
        "bug_type": "logic",
        "line_number": 1,
        "confidence": 0.20,
    }
    if client is None:
        return fallback
    prompt = f"""You are reviewing a production pull request.
Task: {task_description}
PR context:
{code_snippet}
Return ONLY valid JSON with keys:
review (string),
file_path (string from changed files),
issue_type (one of logic|security|performance|maintainability),
severity (one of low|medium|high|critical),
bug_type (one of syntax|logic|security|none),
line_number (integer),
confidence (0.0-1.0 float)
"""
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            temperature=0.0,
            messages=[{"role": "user", "content": prompt}],
        )
        raw = (response.choices[0].message.content or "").strip()
        # Strip markdown code fences the model may wrap around the JSON.
        raw = raw.replace("```json", "").replace("```", "").strip()
        parsed = json.loads(raw)
        if not isinstance(parsed, dict):
            return fallback
        # Coerce each field, falling back per key on missing values.
        return {
            "review": str(parsed.get("review", fallback["review"])),
            "file_path": str(parsed.get("file_path", fallback["file_path"])),
            "issue_type": str(parsed.get("issue_type", fallback["issue_type"])),
            "severity": str(parsed.get("severity", fallback["severity"])),
            "bug_type": str(parsed.get("bug_type", fallback["bug_type"])),
            "line_number": int(parsed.get("line_number", fallback["line_number"])),
            "confidence": float(parsed.get("confidence", fallback["confidence"])),
        }
    except Exception:
        return fallback
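# Illustrative model reply that parses cleanly (made-up values):
#   {"review": "Aggregation window is off by one",
#    "file_path": "services/metrics/aggregation.py", "issue_type": "logic",
#    "severity": "high", "bug_type": "logic", "line_number": 42, "confidence": 0.8}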
def run_task(task_id: str, client: OpenAI | None) -> float:
    """Run one episode for task_id against the environment server; return its score."""
    episode_id = f"baseline-{task_id}"
    rewards: List[float] = []
    score = MIN_STRICT_SCORE
    success = False
    last_error: str | None = None
    steps_taken = 0
    log_start(task_id, BENCHMARK, MODEL_NAME)
    try:
        reset_data = _safe_json(
            "POST",
            f"{ENV_BASE_URL}/reset",
            json={"task_id": task_id, "episode_id": episode_id},
        )
        # _safe_json always returns a dict, so .get is safe here.
        obs = reset_data.get("observation", {})
        task_description = str(obs.get("task_description", "Review code quality and bugs."))
        code_snippet = str(obs.get("code_snippet", ""))
        for step_num in range(1, MAX_STEPS + 1):
            action = _build_action(client, task_description, code_snippet)
            action_str = (
                f"file={action['file_path']};"
                f"issue={action['issue_type']};"
                f"sev={action['severity']};"
                f"bug_type={action['bug_type']};"
                f"line={action['line_number']};"
                f"confidence={float(action['confidence']):.2f}"
            )
            step_data = _safe_json(
                "POST",
                f"{ENV_BASE_URL}/step",
                json={
                    "action": action,
                    "task_id": task_id,
                    "episode_id": episode_id,
                },
            )
            reward = _strict_score(
                float(step_data.get("reward", MIN_STRICT_SCORE) or MIN_STRICT_SCORE)
            )
            # A failed /step call yields an empty payload, which counts as done.
            done = bool(step_data.get("done", not bool(step_data)))
            obs_after = step_data.get("observation", {})
            raw_error = obs_after.get("last_action_error")
            last_error = str(raw_error) if raw_error else None
            rewards.append(reward)
            steps_taken = step_num
            log_step(step_num, action_str, reward, done, last_error)
            if done:
                break
        grader_data = _safe_json(
            "GET", f"{ENV_BASE_URL}/grader?task_id={task_id}&episode_id={episode_id}"
        )
        grader_score = _strict_score(
            float(grader_data.get("score", MIN_STRICT_SCORE) or MIN_STRICT_SCORE)
        )
        step_score = _strict_score(rewards[-1] if rewards else MIN_STRICT_SCORE)
        score = _strict_score(max(grader_score, step_score))
        success = score >= SUCCESS_SCORE_THRESHOLD
    except Exception as exc:
        last_error = str(exc)
        if steps_taken == 0:
            # Log a synthetic failed step so the output always has a [STEP] line.
            log_step(
                1,
                "bug_type=none;line=-1;confidence=0.00",
                MIN_STRICT_SCORE,
                True,
                last_error,
            )
            rewards.append(MIN_STRICT_SCORE)
            steps_taken = 1
        score = MIN_STRICT_SCORE
        success = False
    finally:
        log_end(success, max(1, steps_taken), score, rewards or [0.0])
    return score
def main() -> Dict[str, float]:
    # Without HF_TOKEN no client is built; _build_action then uses its fallback.
    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN) if HF_TOKEN else None
    tasks = _task_list()
    scores: Dict[str, float] = {}
    for task_id in tasks:
        scores[task_id] = run_task(task_id, client)
    avg = sum(scores.values()) / len(scores) if scores else 0.0
    scores["average"] = round(avg, 4)
    return scores
if __name__ == "__main__":
    # Print the per-task scores so the summary is visible when run as a script.
    print(json.dumps(main(), indent=2))