Upload folder using huggingface_hub
- Dockerfile +1 -1
- client.py +10 -6
- compat.py +92 -0
- graders/optimization.py +70 -66
- graders/pytest_runner.py +100 -59
- inference.py +462 -291
- models.py +88 -36
- openenv_python_env.egg-info/PKG-INFO +13 -0
- openenv_python_env.egg-info/SOURCES.txt +27 -0
- openenv_python_env.egg-info/dependency_links.txt +1 -0
- openenv_python_env.egg-info/entry_points.txt +2 -0
- openenv_python_env.egg-info/requires.txt +9 -0
- openenv_python_env.egg-info/top_level.txt +1 -0
- server/app.py +36 -16
- server/env.py +1 -790
- server/env_safe.py +492 -0
- tasks/task_bank.py +36 -52
Dockerfile
CHANGED
@@ -13,7 +13,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 COPY . /app

 # Install Python dependencies
-RUN pip install --no-cache-dir -r
+RUN pip install --no-cache-dir -r requirements.txt

 # Set environment variables
 ENV PYTHONUNBUFFERED=1
client.py
CHANGED
@@ -1,11 +1,15 @@
 """Client for the Python code review environment."""

 from __future__ import annotations

 from typing import Dict

-from
-
+from compat import install_openenv_fastmcp_compat
+
+install_openenv_fastmcp_compat()
+
+from openenv.core import EnvClient
+from openenv.core.client_types import StepResult

 from models import (
     HistoryEntry,
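The ordering in this hunk is load-bearing: the compat shim must run before anything from `openenv` is imported, or older OpenEnv builds can fail on missing FastMCP symbols. A minimal sketch of the resulting import pattern (the `# noqa` markers are an assumption, added only to flag the deliberately late imports):

```python
from compat import install_openenv_fastmcp_compat

install_openenv_fastmcp_compat()  # patch fastmcp before openenv pulls it in

from openenv.core import EnvClient  # noqa: E402 (deliberately deferred)
from openenv.core.client_types import StepResult  # noqa: E402
```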
compat.py
ADDED
@@ -0,0 +1,92 @@
+"""Compatibility helpers for OpenEnv and FastMCP runtime drift."""
+
+from __future__ import annotations
+
+import sys
+import types
+from typing import Any, Optional
+
+
+def install_openenv_fastmcp_compat() -> None:
+    """Patch FastMCP API differences so older OpenEnv builds keep importing."""
+    try:
+        import fastmcp  # type: ignore
+    except Exception:
+        return
+
+    try:
+        if not hasattr(fastmcp, "Client"):
+            class CompatClient:
+                """Minimal async MCP client used for legacy OpenEnv imports."""
+
+                def __init__(self, *args: Any, **kwargs: Any) -> None:
+                    self.args = args
+                    self.kwargs = kwargs
+
+                async def __aenter__(self) -> "CompatClient":
+                    return self
+
+                async def __aexit__(self, exc_type: Any, exc: Any, tb: Any) -> bool:
+                    return False
+
+                async def list_tools(self) -> list[Any]:
+                    return []
+
+                async def call_tool(self, tool_name: str, arguments: dict[str, Any]) -> Any:
+                    raise RuntimeError(
+                        f"MCP client compatibility mode cannot call tool: {tool_name}"
+                    )
+
+            fastmcp.Client = CompatClient  # type: ignore[attr-defined]
+    except Exception:
+        pass
+
+    try:
+        client_pkg = sys.modules.get("fastmcp.client")
+        if client_pkg is None:
+            client_pkg = types.ModuleType("fastmcp.client")
+            sys.modules["fastmcp.client"] = client_pkg
+
+        client_mod = sys.modules.get("fastmcp.client.client")
+        if client_mod is None:
+            client_mod = types.ModuleType("fastmcp.client.client")
+            sys.modules["fastmcp.client.client"] = client_mod
+
+        if not hasattr(client_mod, "CallToolResult"):
+            class CallToolResult:
+                """Compatibility container for legacy OpenEnv response handling."""
+
+                def __init__(
+                    self,
+                    content: Any = None,
+                    structured_content: Any = None,
+                    meta: Any = None,
+                    data: Any = None,
+                    is_error: bool = False,
+                ) -> None:
+                    self.content = content
+                    self.structured_content = structured_content
+                    self.meta = meta
+                    self.data = data
+                    self.is_error = is_error
+
+            client_mod.CallToolResult = CallToolResult
+
+        client_pkg.client = client_mod  # type: ignore[attr-defined]
+    except Exception:
+        pass
+
+
+install_openenv_fastmcp_compat()
+
+
+try:
+    from openenv.core.env_server.http_server import create_app as openenv_create_app
+    from openenv.core.env_server.interfaces import Environment
+    from openenv.core.env_server.types import Action, Observation, State
+except Exception as exc:  # pragma: no cover
+    raise RuntimeError(f"OpenEnv runtime import failed after compatibility patch: {exc}") from exc
+
+
+create_app = openenv_create_app
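A quick way to see what the shim guarantees, assuming `fastmcp` is importable at all (if it is not, `install_openenv_fastmcp_compat` is a silent no-op by design): after installation, `fastmcp.Client` exists and `fastmcp.client.client.CallToolResult` can be imported even on FastMCP builds that lack them. This check script is a sketch, not part of the diff:

```python
from compat import install_openenv_fastmcp_compat

install_openenv_fastmcp_compat()

import fastmcp
from fastmcp.client.client import CallToolResult  # import succeeds either way

assert hasattr(fastmcp, "Client")
print(CallToolResult)  # real class on new FastMCP, compat container on old builds
```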
graders/optimization.py
CHANGED
@@ -31,61 +31,64 @@ Path("benchmark.json").write_text(json.dumps({{"elapsed": elapsed, "rows": len(r
 """


-def benchmark_runtime(candidate_code: str, task: TaskSpec) -> tuple[float, bool, str]:
-    """Benchmark runtime deterministically against the starter implementation."""
-
-    assert task.benchmark_entrypoint is not None
-    …
+def benchmark_runtime(candidate_code: str, task: TaskSpec) -> tuple[float, bool, str]:
+    """Benchmark runtime deterministically against the starter implementation."""
+
+    assert task.benchmark_entrypoint is not None
+    try:
+        with tempfile.TemporaryDirectory(prefix="python-code-review-bench-") as temp_dir:
+            temp_path = Path(temp_dir)
+            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
+            (temp_path / "starter.py").write_text(task.starter_code, encoding="utf-8")
+            (temp_path / "candidate_runner.py").write_text(_benchmark_script(task), encoding="utf-8")
+
+            starter_script = _benchmark_script(task).replace("from candidate import", "from starter import")
+            (temp_path / "starter_runner.py").write_text(starter_script, encoding="utf-8")
+
+            try:
+                starter_run = subprocess.run(
+                    [sys.executable, "starter_runner.py"],
+                    cwd=temp_path,
+                    capture_output=True,
+                    text=True,
+                    timeout=task.benchmark_timeout_s,
+                    check=False,
+                )
+                starter_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))
+
+                candidate_run = subprocess.run(
+                    [sys.executable, "candidate_runner.py"],
+                    cwd=temp_path,
+                    capture_output=True,
+                    text=True,
+                    timeout=task.benchmark_timeout_s,
+                    check=False,
+                )
+                candidate_payload = json.loads((temp_path / "benchmark.json").read_text(encoding="utf-8"))
+            except subprocess.TimeoutExpired as exc:
+                output = (exc.stdout or "") + (exc.stderr or "")
+                return 0.0, True, (output or "benchmark timed out").strip()
+            except Exception as exc:  # pragma: no cover
+                return 0.0, False, str(exc)
+
+            starter_elapsed = max(float(starter_payload["elapsed"]), 1e-9)
+            candidate_elapsed = max(float(candidate_payload["elapsed"]), 1e-9)
+            speedup = starter_elapsed / candidate_elapsed
+            runtime_score = clamp_score(min((speedup - 1.0) / 3.0, 1.0))
+            output = "\n".join(
+                part
+                for part in [
+                    starter_run.stdout.strip(),
+                    starter_run.stderr.strip(),
+                    candidate_run.stdout.strip(),
+                    candidate_run.stderr.strip(),
+                    f"starter={starter_elapsed:.6f}s candidate={candidate_elapsed:.6f}s speedup={speedup:.2f}x",
+                ]
+                if part
+            )
+            return runtime_score, False, output
+    except Exception as exc:  # pragma: no cover
+        return 0.0, False, str(exc)


 def ast_quality_score(code: str, task: TaskSpec) -> float:

@@ -147,17 +150,18 @@ def grade_optimization_task(candidate_code: str, task: TaskSpec) -> TaskGrade:
         + (0.15 * quality_score)
         + (0.05 * pep8_score)
     )
-    return TaskGrade(
-        score=score,
-        syntax_score=1.0,
-        tests_passed=execution.passed,
-        tests_total=execution.total,
-        quality_score=quality_score,
-        …
+    return TaskGrade(
+        score=score,
+        syntax_score=1.0,
+        tests_passed=execution.passed,
+        tests_total=execution.total,
+        quality_score=quality_score,
+        runtime_score=runtime_score,
+        details={
+            "tests": execution.output,
+            "benchmark": benchmark_output,
+            "test_fraction": round(test_fraction, 4),
+            "runtime_score": round(runtime_score, 4),
             "style_score": round(pep8_score, 4),
         },
     )
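The scoring line `clamp_score(min((speedup - 1.0) / 3.0, 1.0))` maps speedups linearly onto [0, 1]: parity or a slowdown scores 0, and a 4x speedup (or better) saturates at 1. A worked sketch of just that arithmetic, with a local `clamp_score` that mirrors what the grader is assumed to do:

```python
def clamp_score(value: float) -> float:
    # assumed behavior: clip into [0.0, 1.0], matching how benchmark_runtime uses it
    return max(0.0, min(1.0, value))

def runtime_score(speedup: float) -> float:
    return clamp_score(min((speedup - 1.0) / 3.0, 1.0))

for s in (0.8, 1.0, 1.6, 2.5, 4.0, 10.0):
    print(f"speedup={s:>4}x -> runtime_score={runtime_score(s):.2f}")
# 0.8x and 1.0x score 0.00; 2.5x scores 0.50; 4.0x and beyond saturate at 1.00
```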
graders/pytest_runner.py
CHANGED
@@ -12,17 +12,38 @@ from typing import Iterable


 @dataclass(frozen=True)
 class PytestExecution:
     """Exact pytest execution summary."""

     passed: int
     failed: int
     total: int
     timed_out: bool
     output: str


-def
+def _test_module_source(tests: Iterable[str]) -> str:
+    """Build a valid pytest module from expression-style or full test snippets."""
+    blocks: list[str] = ["from candidate import *  # noqa: F401,F403"]
+    for index, test in enumerate(tests, start=1):
+        snippet = str(test).strip()
+        if not snippet:
+            continue
+        if snippet.startswith("def test_"):
+            blocks.append(snippet)
+            continue
+        blocks.append(
+            "\n".join(
+                [
+                    f"def test_case_{index:03d}():",
+                    f"    assert {snippet}",
+                ]
+            )
+        )
+    return "\n\n".join(blocks) or "def test_placeholder():\n    assert True\n"
+
+
+def _runner_script() -> str:
     return """import json
 import pathlib
 import pytest

@@ -53,56 +74,76 @@ pathlib.Path("pytest_results.json").write_text(json.dumps(payload), encoding="ut
 """


-def run_pytest_suite(candidate_code: str, tests: Iterable[str], timeout_s: float = 3.0) -> PytestExecution:
-    """Run a pytest suite against candidate.py and return structured results."""
-
-    test_cases = list(tests)
-    …
+def run_pytest_suite(candidate_code: str, tests: Iterable[str], timeout_s: float = 3.0) -> PytestExecution:
+    """Run a pytest suite against candidate.py and return structured results."""
+
+    test_cases = list(tests)
+    try:
+        with tempfile.TemporaryDirectory(prefix="python-code-review-") as temp_dir:
+            temp_path = Path(temp_dir)
+            (temp_path / "candidate.py").write_text(candidate_code, encoding="utf-8")
+            (temp_path / "test_candidate.py").write_text(_test_module_source(test_cases), encoding="utf-8")
+            (temp_path / "runner.py").write_text(_runner_script(), encoding="utf-8")
+
+            try:
+                completed = subprocess.run(
+                    [sys.executable, "runner.py"],
+                    cwd=temp_path,
+                    capture_output=True,
+                    text=True,
+                    timeout=timeout_s,
+                    check=False,
+                )
+            except subprocess.TimeoutExpired as exc:
+                output = (exc.stdout or "") + (exc.stderr or "")
+                return PytestExecution(
+                    passed=0,
+                    failed=max(len(test_cases), 1),
+                    total=max(len(test_cases), 1),
+                    timed_out=True,
+                    output=(output or "pytest timed out").strip(),
+                )
+
+            result_path = temp_path / "pytest_results.json"
+            if not result_path.exists():
+                output = (completed.stdout or "") + (completed.stderr or "")
+                total = max(len(test_cases), 1)
+                return PytestExecution(
+                    passed=0,
+                    failed=total,
+                    total=total,
+                    timed_out=False,
+                    output=output.strip(),
+                )
+
+            try:
+                payload = json.loads(result_path.read_text(encoding="utf-8"))
+            except Exception as exc:
+                output = ((completed.stdout or "") + (completed.stderr or "")).strip()
+                return PytestExecution(
+                    passed=0,
+                    failed=max(len(test_cases), 1),
+                    total=max(len(test_cases), 1),
+                    timed_out=False,
+                    output=(output or str(exc)).strip(),
+                )
+
+            passed = int(payload.get("passed", 0))
+            failed = int(payload.get("failed", 0))
+            total = max(passed + failed, len(test_cases))
+            output = ((completed.stdout or "") + (completed.stderr or "")).strip()
+            return PytestExecution(
+                passed=passed,
+                failed=failed,
+                total=total,
+                timed_out=False,
+                output=output,
+            )
+    except Exception as exc:
+        return PytestExecution(
+            passed=0,
+            failed=max(len(test_cases), 1),
+            total=max(len(test_cases), 1),
+            timed_out=False,
+            output=str(exc),
+        )
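For reference, here is roughly what `_test_module_source` emits for a mixed test list; `add` is a hypothetical function from the candidate module, used only for illustration:

```python
tests = [
    "add(2, 3) == 5",                                     # expression-style snippet
    "def test_negative():\n    assert add(-1, 1) == 0",   # full test function
]
print(_test_module_source(tests))
# from candidate import *  # noqa: F401,F403
#
# def test_case_001():
#     assert add(2, 3) == 5
#
# def test_negative():
#     assert add(-1, 1) == 0
```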
inference.py
CHANGED
@@ -1,291 +1,462 @@
-#!/usr/bin/env python3
-"""
-…
+#!/usr/bin/env python3
+"""Fail-safe inference entrypoint for the Python code review environment."""
+
+from __future__ import annotations
+
+import io
+import json
+import os
+import subprocess
+import sys
+import time
+from collections.abc import Iterable
+from contextlib import redirect_stderr, redirect_stdout
+from typing import Any, Dict, Optional
+
+from compat import install_openenv_fastmcp_compat
+
+try:
+    from openai import OpenAI
+except Exception:
+    OpenAI = None  # type: ignore[assignment]
+
+
+install_openenv_fastmcp_compat()
+
+try:
+    from server.env import PythonCodeReviewEnvironment
+except Exception:
+    PythonCodeReviewEnvironment = None  # type: ignore[assignment]
+
+try:
+    from models import PythonCodeReviewAction
+except Exception:
+    PythonCodeReviewAction = None  # type: ignore[assignment]
+
+try:
+    from tasks import task_ids
+except Exception:
+    task_ids = None  # type: ignore[assignment]
+
+
+ALLOWED_ACTIONS = {
+    "analyze_code",
+    "edit_code",
+    "run_tests",
+    "submit_solution",
+}
+DEFAULT_MODEL_NAME = "mock-model"
+DEFAULT_ACTION = {"action_type": "analyze_code", "code": None, "fallback_reason": "mock_response"}
+API_TIMEOUT_SECONDS = 3.0
+API_RETRIES = 1
+API_RETRY_DELAY_SECONDS = 0.2
+MAX_STEPS = 2
+
+
+def safe_env(name: str, default: str = "") -> str:
+    """Read an allowed environment variable and return a safe string default."""
+    try:
+        value = os.getenv(name)
+        if value is None:
+            return default
+        return str(value)
+    except Exception:
+        return default
+
+
+def clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
+    """Clamp a numeric value to a bounded range."""
+    try:
+        return max(low, min(high, float(value)))
+    except Exception:
+        return low
+
+
+def safe_float(value: Any, default: float = 0.0) -> float:
+    """Convert a value to float without raising."""
+    try:
+        return float(value)
+    except Exception:
+        return default
+
+
+def safe_text(value: Any, default: str = "") -> str:
+    """Convert any value into a bounded, printable string."""
+    try:
+        text = str(value)
+    except Exception:
+        return default
+    text = " ".join(text.split())
+    return text[:160] if text else default
+
+
+def safe_getattr(obj: Any, name: str, default: Any = None) -> Any:
+    """Fetch an attribute from an object without raising."""
+    try:
+        return getattr(obj, name, default)
+    except Exception:
+        return default
+
+
+def parse_json_response(raw_text: str) -> Dict[str, Any]:
+    """Parse model output into a safe action payload with deterministic fallback."""
+    try:
+        text = raw_text or ""
+        start = text.find("{")
+        end = text.rfind("}") + 1
+        if start >= 0 and end > start:
+            payload = json.loads(text[start:end])
+            if isinstance(payload, dict):
+                action_type = payload.get("action_type", DEFAULT_ACTION["action_type"])
+                code = payload.get("code")
+                if action_type not in ALLOWED_ACTIONS:
+                    action_type = DEFAULT_ACTION["action_type"]
+                if action_type != "edit_code":
+                    code = None
+                return {
+                    "action_type": action_type,
+                    "code": code,
+                    "fallback_reason": "",
+                }
+    except Exception:
+        pass
+    return dict(DEFAULT_ACTION)
+
+
+def build_prompt(observation: Any) -> str:
+    """Build a short prompt from the current observation with safe defaults."""
+    try:
+        task_description = safe_text(safe_getattr(observation, "task_description", ""), "No task description.")
+        current_code = safe_text(safe_getattr(observation, "current_code", ""), "")
+        errors = safe_text(safe_getattr(observation, "errors", ""), "")
+        tests = safe_text(safe_getattr(observation, "test_results", ""), "")
+        score = clamp(safe_getattr(observation, "score", 0.0))
+        visible_tests = safe_getattr(observation, "visible_tests", [])
+        if not isinstance(visible_tests, Iterable) or isinstance(visible_tests, (str, bytes)):
+            visible_tests = []
+        visible_lines = []
+        for item in list(visible_tests)[:4]:
+            visible_lines.append(f"- {safe_text(item, 'unknown test')}")
+        visible_block = "\n".join(visible_lines) if visible_lines else "- none"
+        return (
+            "Return exactly one JSON object with keys action_type and optional code.\n"
+            "Allowed action_type values: analyze_code, edit_code, run_tests, submit_solution.\n"
+            f"Task: {task_description}\n"
+            f"Score: {score:.3f}\n"
+            f"Errors: {errors or 'none'}\n"
+            f"Tests: {tests or 'not available'}\n"
+            f"Visible tests:\n{visible_block}\n"
+            f"Code:\n{current_code}\n"
+        )
+    except Exception:
+        return (
+            "Return exactly one JSON object with keys action_type and optional code. "
+            "Use action_type analyze_code."
+        )
+
+
+def create_client() -> Optional[Any]:
+    """Create an OpenAI-compatible client using only the allowed environment variables."""
+    if OpenAI is None:
+        return None
+    base_url = safe_env("API_BASE_URL", "")
+    if not base_url:
+        return None
+    try:
+        if safe_env("HF_TOKEN", ""):
+            os.environ["OPENAI_API_KEY"] = safe_env("HF_TOKEN", "")
+    except Exception:
+        pass
+    try:
+        client = OpenAI(base_url=os.getenv("API_BASE_URL"))
+        return client
+    except Exception:
+        return None
+
+
+def run_llm(client: Optional[Any], model: str, prompt: str) -> Dict[str, Any]:
+    """Call the LLM with timeout and retry, then fall back to a mock action."""
+    if client is None:
+        fallback = dict(DEFAULT_ACTION)
+        fallback["fallback_reason"] = "client_unavailable"
+        return fallback
+
+    last_reason = "llm_unavailable"
+    for attempt in range(API_RETRIES + 1):
+        try:
+            with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
+                response = client.with_options(timeout=API_TIMEOUT_SECONDS).chat.completions.create(
+                    model=model,
+                    messages=[{"role": "user", "content": prompt}],
+                    temperature=0,
+                    max_tokens=300,
+                )
+            message = safe_getattr(response.choices[0].message, "content", "")
+            parsed = parse_json_response(message)
+            if parsed.get("fallback_reason"):
+                parsed["fallback_reason"] = "parse_failed"
+            return parsed
+        except Exception as exc:
+            last_reason = safe_text(exc, "llm_error").lower().replace(" ", "_")
+            if attempt < API_RETRIES:
+                try:
+                    time.sleep(API_RETRY_DELAY_SECONDS * (attempt + 1))
+                except Exception:
+                    pass
+
+    fallback = dict(DEFAULT_ACTION)
+    fallback["fallback_reason"] = last_reason[:48] or "llm_retry_exhausted"
+    return fallback
+
+
+def probe_docker(image_name: str) -> Dict[str, Any]:
+    """Safely validate Docker connectivity when a local image name is provided."""
+    if not image_name:
+        return {"checked": False, "available": False, "reason": "docker_skip"}
+    try:
+        with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
+            result = subprocess.run(
+                ["docker", "image", "inspect", image_name],
+                capture_output=True,
+                text=True,
+                timeout=3,
+                check=False,
+            )
+        if result.returncode == 0:
+            return {"checked": True, "available": True, "reason": "docker_ok"}
+        return {"checked": True, "available": False, "reason": "docker_unreachable"}
+    except Exception as exc:
+        return {"checked": True, "available": False, "reason": safe_text(exc, "docker_error").lower().replace(" ", "_")}
+
+
+def fallback_step_result(reason: str, docker_status: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    """Return a deterministic dummy step result when environment execution fails."""
+    docker_reason = safe_text((docker_status or {}).get("reason", "docker_skip"), "docker_skip")
+    short_reason = safe_text(reason, "env_fallback").lower().replace(" ", "_")
+    return {
+        "status": "ok",
+        "fallback": True,
+        "reason": short_reason[:64],
+        "reward": 0.0,
+        "improvement": 0.0,
+        "score": 0.0,
+        "done": True,
+        "docker": docker_reason[:32],
+    }
+
+
+def safe_task_list() -> list[str]:
+    """Load task identifiers without raising."""
+    try:
+        if callable(task_ids):
+            loaded = list(task_ids())
+            if loaded:
+                return [safe_text(item, "fallback-task") for item in loaded]
+    except Exception:
+        pass
+    return ["fallback-task"]
+
+
+def make_action(action_payload: Dict[str, Any]) -> Any:
+    """Build a validated environment action or a safe placeholder."""
+    action_type = action_payload.get("action_type", DEFAULT_ACTION["action_type"])
+    if action_type not in ALLOWED_ACTIONS:
+        action_type = DEFAULT_ACTION["action_type"]
+    code = action_payload.get("code")
+    if action_type != "edit_code":
+        code = None
+    if PythonCodeReviewAction is None:
+        return {"action_type": action_type, "code": code}
+    try:
+        return PythonCodeReviewAction(action_type=action_type, code=code)
+    except Exception:
+        try:
+            return PythonCodeReviewAction(action_type=DEFAULT_ACTION["action_type"], code=None)
+        except Exception:
+            return {"action_type": DEFAULT_ACTION["action_type"], "code": None}
+
+
+def compute_reward(
+    previous_score: float,
+    current_score: float,
+    step_reward: float,
+    used_fallback: bool,
+    done: bool,
+) -> Dict[str, float]:
+    """Compute a deterministic dynamic reward and improvement metric."""
+    prev_value = clamp(previous_score)
+    curr_value = clamp(current_score)
+    improvement = round(curr_value - prev_value, 4)
+    bounded_step_reward = max(-1.0, min(1.0, safe_float(step_reward, 0.0)))
+    reward_value = (
+        0.55 * curr_value
+        + 0.30 * max(improvement, 0.0)
+        + 0.10 * max(bounded_step_reward, 0.0)
+        + (0.05 if done and curr_value >= 0.99 else 0.0)
+        - (0.05 if used_fallback else 0.0)
+    )
+    return {
+        "reward": round(clamp(reward_value), 4),
+        "improvement": improvement,
+    }
+
+
+def safe_step(env: Any, action: Any) -> Any:
+    """Execute one environment step without allowing stdout leaks or exceptions."""
+    try:
+        with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
+            return env.step(action)
+    except Exception:
+        return None
+
+
+def safe_reset(env: Any, task_id: str) -> Any:
+    """Reset the environment safely for a task."""
+    try:
+        with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
+            return env.reset(task_id=task_id)
+    except Exception:
+        return None
+
+
+def run_env(client: Optional[Any], model: str) -> Dict[str, Any]:
+    """Run the environment loop safely and return a structured result payload."""
+    docker_status = probe_docker(safe_env("LOCAL_IMAGE_NAME", ""))
+    if PythonCodeReviewEnvironment is None:
+        return fallback_step_result("env_import_failed", docker_status)
+
+    try:
+        with redirect_stdout(io.StringIO()), redirect_stderr(io.StringIO()):
+            env = PythonCodeReviewEnvironment(verbose=False)
+    except Exception as exc:
+        return fallback_step_result(f"env_init_failed_{safe_text(exc, 'unknown')}", docker_status)
+
+    tasks = safe_task_list()
+    task_id = tasks[0] if tasks else "fallback-task"
+    observation = safe_reset(env, task_id)
+    if observation is None:
+        return fallback_step_result("env_reset_failed", docker_status)
+
+    previous_score = clamp(safe_getattr(observation, "score", 0.0))
+    total_step_reward = 0.0
+    used_fallback = False
+    final_status = "ok"
+    final_reason = "completed"
+    final_observation = observation
+
+    for step_index in range(MAX_STEPS):
+        prompt = build_prompt(final_observation)
+        action_payload = run_llm(client, model, prompt)
+        used_fallback = used_fallback or bool(action_payload.get("fallback_reason"))
+        action = make_action(action_payload)
+        next_observation = safe_step(env, action)
+        if next_observation is None:
+            final_status = "ok"
+            final_reason = "env_step_fallback"
+            used_fallback = True
+            break
+
+        final_observation = next_observation
+        total_step_reward += safe_float(safe_getattr(final_observation, "reward", 0.0), 0.0)
+        done = bool(safe_getattr(final_observation, "done", False))
+        score = clamp(safe_getattr(final_observation, "score", 0.0))
+        if safe_getattr(final_observation, "last_action_status", ""):
+            final_reason = safe_text(safe_getattr(final_observation, "last_action_status", ""), "step_completed")
+        elif action_payload.get("fallback_reason"):
+            final_reason = safe_text(action_payload.get("fallback_reason"), "llm_fallback")
+        else:
+            final_reason = f"step_{step_index + 1}_completed"
+        if done:
+            break
+
+        if step_index == 0:
+            submit_action = make_action({"action_type": "submit_solution", "code": None})
+            submitted_observation = safe_step(env, submit_action)
+            if submitted_observation is None:
+                final_reason = "submit_fallback"
+                used_fallback = True
+                break
+            final_observation = submitted_observation
+            total_step_reward += safe_float(safe_getattr(final_observation, "reward", 0.0), 0.0)
+            if safe_getattr(final_observation, "last_action_status", ""):
+                final_reason = safe_text(safe_getattr(final_observation, "last_action_status", ""), "submit_completed")
+            break
+
+    current_score = clamp(safe_getattr(final_observation, "score", previous_score))
+    done = bool(safe_getattr(final_observation, "done", True))
+    metrics = compute_reward(
+        previous_score=previous_score,
+        current_score=current_score,
+        step_reward=total_step_reward,
+        used_fallback=used_fallback,
+        done=done,
+    )
+    return {
+        "status": final_status,
+        "fallback": used_fallback,
+        "reason": safe_text(final_reason, "completed").lower().replace(" ", "_")[:64],
+        "reward": metrics["reward"],
+        "improvement": metrics["improvement"],
+        "score": round(current_score, 4),
+        "done": done,
+        "docker": safe_text(docker_status.get("reason", "docker_skip"), "docker_skip")[:32],
+    }
+
+
+def format_step_message(result: Dict[str, Any]) -> str:
+    """Format the only allowed STEP line for stdout."""
+    try:
+        fallback = bool(result.get("fallback", False))
+        reason = safe_text(result.get("reason", "completed"), "completed").lower().replace(" ", "_")
+        if fallback:
+            reward = safe_float(result.get("reward", 0.0), 0.0)
+            improvement = safe_float(result.get("improvement", 0.0), 0.0)
+            score = safe_float(result.get("score", 0.0), 0.0)
+            status = safe_text(result.get("status", "ok"), "ok").lower().replace(" ", "_")
+            return (
+                f"error handled: {reason} reward={reward:.4f} status={status} "
+                f"fallback=true improvement={improvement:.4f} score={score:.4f}"
+            )
+        reward = safe_float(result.get("reward", 0.0), 0.0)
+        improvement = safe_float(result.get("improvement", 0.0), 0.0)
+        score = safe_float(result.get("score", 0.0), 0.0)
+        status = safe_text(result.get("status", "ok"), "ok").lower().replace(" ", "_")
+        return (
+            f"reward={reward:.4f} status={status} "
+            f"fallback=false improvement={improvement:.4f} score={score:.4f}"
+        )
+    except Exception:
+        return "error handled: formatting_failed"
+
+
+def main() -> int:
+    """Run the inference workflow and always terminate successfully."""
+    step_message = "error handled: initialization_failed"
+    try:
+        model_name = safe_env("MODEL_NAME", DEFAULT_MODEL_NAME) or DEFAULT_MODEL_NAME
+        client = create_client()
+        result = run_env(client, model_name)
+        step_message = format_step_message(result)
+    except BaseException as exc:
+        step_message = f"error handled: {safe_text(exc, 'unexpected_failure').lower().replace(' ', '_')[:64]}"
+    finally:
+        try:
+            print("START")
+            print(f"STEP: {step_message}")
+            print("END")
+        except Exception:
+            pass
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except BaseException:
+        try:
+            print("START")
+            print("STEP: error handled: fatal_guard")
+            print("END")
+        except Exception:
+            pass
+        sys.exit(0)
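The sanitization rules in `parse_json_response` are easiest to read off from a few concrete inputs; the exact strings below are illustrative, but the outputs follow from the code in this hunk:

```python
print(parse_json_response('noise {"action_type": "run_tests", "code": "x = 1"} noise'))
# {'action_type': 'run_tests', 'code': None, 'fallback_reason': ''}
#   -> code is dropped because only edit_code may carry code

print(parse_json_response('{"action_type": "delete_repo"}'))
# {'action_type': 'analyze_code', 'code': None, 'fallback_reason': ''}
#   -> unknown action types are coerced to the default analyze_code

print(parse_json_response("not json at all"))
# {'action_type': 'analyze_code', 'code': None, 'fallback_reason': 'mock_response'}
#   -> unparseable output falls back to the deterministic mock action
```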
models.py
CHANGED
@@ -1,17 +1,19 @@
 """Typed models for Python code review and repair environment."""

 from __future__ import annotations

 from typing import Any, Dict, List, Literal, Optional

 from pydantic import BaseModel, Field

-from
+from compat import Action, Observation, State


 Difficulty = Literal["easy", "medium", "hard"]
 TaskKind = Literal["syntax_fix", "bug_fix", "optimization"]
 ActionType = Literal["analyze_code", "edit_code", "run_tests", "submit_solution"]
+Category = Literal["bug", "security", "performance", "maintainability", "style", "testing"]
+Severity = Literal["critical", "warning", "info"]


 class HistoryEntry(BaseModel):

@@ -127,29 +129,79 @@ class PythonCodeReviewState(State):
     done: bool = Field(default=False)


-class TaskDescriptor(BaseModel):
-    """Public task metadata."""
-
-    task_id: str = Field(..., description="Stable task identifier")
-    title: str = Field(..., description="Human-readable title")
-    difficulty: Difficulty = Field(..., description="Difficulty level")
-    task_kind: TaskKind = Field(
-    task_description: str = Field(
-    starter_code: str = Field(
-    visible_tests: List[str] = Field(default_factory=list, description="Public test cases")
-    …
+class TaskDescriptor(BaseModel):
+    """Public task metadata."""
+
+    task_id: str = Field(..., description="Stable task identifier")
+    title: str = Field(..., description="Human-readable title")
+    difficulty: Difficulty = Field(..., description="Difficulty level")
+    task_kind: Optional[TaskKind] = Field(default=None, description="Type of task")
+    task_description: str = Field(default="", description="Full task description")
+    starter_code: str = Field(default="", description="Initial broken code")
+    visible_tests: List[str] = Field(default_factory=list, description="Public test cases")
+    goal: str = Field(default="", description="Optional goal summary for review-style tasks")
+    repo_summary: str = Field(default="", description="Optional repository context")
+    changed_files: List[str] = Field(default_factory=list, description="Changed files for review-style tasks")
+    available_files: List[str] = Field(default_factory=list, description="Browsable files for review-style tasks")
+    max_steps: int = Field(..., ge=1, description="Maximum steps allowed")
+
+
+class TaskSummary(BaseModel):
+    """Lightweight task metadata for list endpoints."""
+
+    task_id: str = Field(..., description="Stable task identifier")
+    difficulty: Difficulty = Field(..., description="Difficulty level")
+    title: str = Field(..., description="Human-readable title")
+    goal: str = Field(default="", description="Optional task goal")
+
+
+class ReviewFinding(BaseModel):
+    """Structured code review finding used by auxiliary review utilities."""
+
+    title: str = Field(..., description="Short human-readable finding title")
+    file_path: str = Field(default="", description="Optional file path")
+    line: Optional[int] = Field(default=None, ge=1, description="Optional 1-based line number")
+    category: Category = Field(default="bug", description="Finding category")
+    severity: Severity = Field(default="warning", description="Finding severity")
+    rationale: str = Field(default="", description="Why this matters")
+    recommendation: str = Field(default="", description="Suggested remediation")
+    rule_id: str = Field(default="", description="Stable detector or rubric identifier")
+
+    @property
+    def explanation(self) -> str:
+        """Backward-compatible alias used by older grading helpers."""
+        return self.rationale
+
+    @property
+    def suggested_fix(self) -> str:
+        """Backward-compatible alias used by older grading helpers."""
+        return self.recommendation
+
+
+class DirectReviewResponse(BaseModel):
+    """Response payload for deterministic direct-review utilities."""
+
+    issues: List[ReviewFinding] = Field(default_factory=list)
+    summary: str = Field(default="")
+    score: float = Field(default=0.0, ge=0.0, le=1.0)
+    improved_code: Optional[str] = Field(default=None)
+
+
+class TaskGrade(BaseModel):
+    """Grading result for task submission."""
+
     score: float = Field(..., ge=0.0, le=1.0, description="Overall score")
-    syntax_score: float = Field(default=0.0, ge=0.0, le=1.0)
-    tests_passed: int = Field(default=0, ge=0)
-    tests_total: int = Field(default=0, ge=0)
-    quality_score: float = Field(default=0.0, ge=0.0, le=1.0)
-    …
+    syntax_score: float = Field(default=0.0, ge=0.0, le=1.0)
+    tests_passed: int = Field(default=0, ge=0)
+    tests_total: int = Field(default=0, ge=0)
+    quality_score: float = Field(default=0.0, ge=0.0, le=1.0)
+    runtime_score: float = Field(default=0.0, ge=0.0, le=1.0)
+    timed_out: bool = Field(default=False)
+    matched_issue_ids: List[str] = Field(default_factory=list)
+    false_positives: int = Field(default=0, ge=0)
+    duplicate_findings: int = Field(default=0, ge=0)
+    matched_weight: float = Field(default=0.0, ge=0.0, le=1.0)
+    details: Dict[str, Any] = Field(default_factory=dict)


 class HealthResponse(BaseModel):
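A small usage sketch of the new `ReviewFinding` model and its backward-compatible alias properties; the field values here are invented for illustration:

```python
finding = ReviewFinding(
    title="Unvalidated shell input",
    file_path="server/app.py",      # hypothetical location
    line=42,
    category="security",
    severity="critical",
    rationale="User input reaches subprocess without sanitization.",
    recommendation="Validate and escape the argument list.",
)
# older grading helpers can keep reading the legacy names:
assert finding.explanation == finding.rationale
assert finding.suggested_fix == finding.recommendation
```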
openenv_python_env.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,13 @@
+Metadata-Version: 2.4
+Name: openenv-python_env
+Version: 0.2.0
+Summary: Deterministic Python code review and repair benchmark environment for OpenEnv
+Requires-Python: >=3.10
+Requires-Dist: openenv-core[core]>=0.2.2
+Requires-Dist: fastapi>=0.115.0
+Requires-Dist: uvicorn>=0.30.0
+Requires-Dist: openai>=1.40.0
+Requires-Dist: pytest>=8.0.0
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
openenv_python_env.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,27 @@
+README.md
+pyproject.toml
+./__init__.py
+./client.py
+./compat.py
+./inference.py
+./models.py
+openenv_python_env.egg-info/PKG-INFO
+openenv_python_env.egg-info/SOURCES.txt
+openenv_python_env.egg-info/dependency_links.txt
+openenv_python_env.egg-info/entry_points.txt
+openenv_python_env.egg-info/requires.txt
+openenv_python_env.egg-info/top_level.txt
+server/__init__.py
+server/app.py
+server/code_review_env_environment.py
+server/code_review_environment.py
+server/env.py
+server/env_safe.py
+server/grading.py
+server/python_env_environment.py
+server/static_review.py
+server/task_bank.py
+tests/test_api.py
+tests/test_environment.py
+tests/test_examples.py
+tests/test_reward_dynamics.py
openenv_python_env.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
+
openenv_python_env.egg-info/entry_points.txt
ADDED
@@ -0,0 +1,2 @@
+[console_scripts]
+server = python_env.server.app:main
openenv_python_env.egg-info/requires.txt
ADDED
@@ -0,0 +1,9 @@
+openenv-core[core]>=0.2.2
+fastapi>=0.115.0
+uvicorn>=0.30.0
+openai>=1.40.0
+pytest>=8.0.0
+
+[dev]
+pytest>=8.0.0
+pytest-cov>=4.0.0
openenv_python_env.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+python_env
server/app.py
CHANGED
@@ -1,13 +1,13 @@
 """FastAPI application for the Python code review environment."""

 from __future__ import annotations

 import os

 from fastapi import APIRouter, HTTPException
 from fastapi.responses import RedirectResponse

-from
+from compat import create_app

 from models import (
     HealthResponse,

@@ -20,14 +20,17 @@ from models import (
 from server.env import PythonCodeReviewEnvironment


-…
+try:
+    MAX_CONCURRENT_ENVS = max(int(os.getenv("MAX_CONCURRENT_ENVS", "16")), 1)
+except Exception:
+    MAX_CONCURRENT_ENVS = 16
+
+python_env = PythonCodeReviewEnvironment(verbose=False)
+app = create_app(
+    PythonCodeReviewEnvironment,
+    PythonCodeReviewAction,
+    PythonCodeReviewObservation,
+    max_concurrent_envs=MAX_CONCURRENT_ENVS,
 )
 router = APIRouter(tags=["python-code-review"])

@@ -79,7 +82,24 @@ def get_state_post() -> RedirectResponse:
     return RedirectResponse(url="/state", status_code=303)


 app.include_router(router)
+
+
+def _prioritize_route(path: str, methods: set[str]) -> None:
+    """Move a matching custom route ahead of default OpenEnv routes."""
+    try:
+        for index in range(len(app.router.routes) - 1, -1, -1):
+            route = app.router.routes[index]
+            route_path = getattr(route, "path", None)
+            route_methods = set(getattr(route, "methods", set()) or set())
+            if route_path == path and methods.issubset(route_methods):
+                app.router.routes.insert(0, app.router.routes.pop(index))
+                break
+    except Exception:
+        pass
+
+
+_prioritize_route("/health", {"GET"})


 def main(host: str = "0.0.0.0", port: int = 8000) -> None:
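`_prioritize_route` leans on the fact that Starlette matches routes in list order: moving the custom `GET /health` route to index 0 lets it win over the default OpenEnv route registered for the same path by `create_app`. A diagnostic sketch for inspecting the resulting order (not part of the diff):

```python
for index, route in enumerate(app.router.routes):
    print(index, getattr(route, "path", "?"), getattr(route, "methods", None))
# after _prioritize_route("/health", {"GET"}), the custom /health route prints first
```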
server/env.py
CHANGED
@@ -1,790 +1 @@
-…
-
-REWARD SYSTEM ARCHITECTURE
-==========================
-
-The environment implements a dynamic, multi-component reward system to provide
-meaningful feedback at every step of agent learning.
-
-Six independent reward components are computed and combined:
-
-1. PROGRESS REWARD (max +0.25)
-   - Awarded for score improvement: min(PROGRESS_SCALE * score_delta, 0.25)
-   - Encourages continuous improvement on the task
-
-2. SYNTAX REWARD (max +0.35)
-   - One-time bonus when code first becomes compilable
-   - Acknowledges the critical step of creating valid code
-
-3. TEST REWARD (max +0.20)
-   - Based on test pass rate improvement
-   - Formula: min(TEST_PASS_REWARD_SCALE * test_improvement, 0.20)
-
-4. QUALITY REWARD (max +0.15)
-   - Based on AST-detected code quality improvements
-   - Rewards better structure, readability, best practices
-
-5. STAGNATION PENALTY (−0.10)
-   - Applied when agent acts but code doesn't change
-   - Encourages editing rather than repeated analysis
-
-6. REGRESSION PENALTY (scale −0.20)
-   - Applied when score declines: REGRESSION_PENALTY_SCALE * abs(score_delta)
-   - Discourages actions that make code worse
-
-FINAL REWARD
-    Final reward = clamp(progress + syntax + test + quality - stagnation - regression, -1.0, +1.0)
-
-Always bounded in [-1.0, +1.0] for interpretability and learning stability.
-
-See RewardDetails in models.py for all fields returned with each reward.
-"""
-
-from __future__ import annotations
-
-import random
-import sys
-from typing import List, Optional
-from uuid import uuid4
-
-from openenv.core.env_server.interfaces import Environment
-
-from graders import grade_task
-from models import (
-    HealthResponse,
-    HistoryEntry,
-    PythonCodeReviewAction,
-    PythonCodeReviewObservation,
-    PythonCodeReviewState,
-    RewardDetails,
-    TaskGrade,
-)
-from tasks import TaskSpec, get_task, list_task_descriptors, list_task_summaries, task_ids
-
-
-# ============================================================================
-# REWARD SHAPING CONSTANTS
-# ============================================================================
-# These constants control the reward magnitude for each component.
-# Tuning these values changes agent learning incentives.
-
-# Component 1: Score improvement reward
-PROGRESS_SCALE = 0.25
-"""Scale for progress rewards. Higher = more reward for score improvement."""
-
-# Component 2: Syntax/compilation fix reward
-SYNTAX_FIX_BONUS = 0.35
-"""One-time bonus for first time code compiles."""
-
-# Component 3: Test improvement reward
-TEST_PASS_REWARD_SCALE = 0.30
-"""Scale for test pass rate rewards."""
-
-# Component 4: Code quality reward
-QUALITY_BONUS_SCALE = 0.15
-"""Scale for code quality improvements (AST-based)."""
-
-# Component 5: Stagnation penalty
-STAGNATION_PENALTY = 0.10
-"""Penalty when action is taken but code unchanged."""
-
-# Component 6: Regression penalty
-REGRESSION_PENALTY_SCALE = 0.20
-"""Scale for penalties when score declines."""
-
-# One-time completion bonus
-COMPLETION_BONUS = 0.50
-"""Bonus for fully correct solution."""
-
-# Invalid/error penalties
-INVALID_ACTION_PENALTY = 0.15
-"""Penalty for unsupported action types."""
-
-TIMEOUT_PENALTY = 0.15
-"""Penalty for execution timeout."""
-
-
-class PythonCodeReviewEnvironment(
-    Environment[PythonCodeReviewAction, PythonCodeReviewObservation, PythonCodeReviewState]
-):
-    """Production-style environment for reviewing and fixing Python code.
-
-    Implements OpenEnv compatibility and dynamic multi-component reward system.
-    """
-
-    SUPPORTS_CONCURRENT_SESSIONS = True
-
-    def __init__(self, verbose: bool = True) -> None:
-        super().__init__()
-        self._task_order = list(task_ids())
-        self._task_cursor = -1
-        self._task: Optional[TaskSpec] = None
-        self._state = PythonCodeReviewState(episode_id=str(uuid4()))
-        self._done = False
-        self._last_status = "Call reset() to start."
-        self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
-        self._verbose = verbose
-
-        # Progress tracking
-        self._previous_score = 0.0
-        self._previous_code = ""
-        self._best_visible_test_fraction = 0.0
-        self._best_quality_score = 0.0
-        self._full_correctness_awarded = False
-        self._syntax_reward_awarded = False
-        self.last_code = ""
-        self.reward_history: list[float] = []
-
-    def reset(
-        self,
-        seed: Optional[int] = None,
-        episode_id: Optional[str] = None,
-        task_id: Optional[str] = None,
-        **_: object,
-    ) -> PythonCodeReviewObservation:
-        """Reset the environment to the next deterministic task."""
-
-        del seed
-
-        # Select task
-        if task_id:
-            self._task = get_task(task_id)
-            self._task_cursor = self._task_order.index(task_id)
-        else:
-            self._task_cursor = (self._task_cursor + 1) % len(self._task_order)
-            self._task = get_task(self._task_order[self._task_cursor])
-
-        # Reset episode state and tracking
-        self._done = False
-        self._previous_score = 0.0
-        self._previous_code = self._task.starter_code
-        self._best_visible_test_fraction = 0.0
-        self._best_quality_score = 0.0
-        self._full_correctness_awarded = False
-        self._syntax_reward_awarded = False
-        self.last_code = ""
-        self.reward_history = []
-        self._last_status = "Inspect the code, edit it, run tests, then submit."
-        self._last_reward = RewardDetails(value=0.0, reason="Episode reset.", prev_score=0.0, curr_score=0.0)
-
self._last_reward = RewardDetails(value=0.0, reason="Episode reset.", prev_score=0.0, curr_score=0.0)
|
| 169 |
-
|
| 170 |
-
self._state = PythonCodeReviewState(
|
| 171 |
-
episode_id=episode_id or str(uuid4()),
|
| 172 |
-
step_count=0,
|
| 173 |
-
task_id=self._task.task_id,
|
| 174 |
-
difficulty=self._task.difficulty,
|
| 175 |
-
task_kind=self._task.task_kind,
|
| 176 |
-
attempts_remaining=self._task.max_steps,
|
| 177 |
-
current_code=self._task.starter_code,
|
| 178 |
-
errors="",
|
| 179 |
-
test_results="Not run yet.",
|
| 180 |
-
history=[],
|
| 181 |
-
score=0.0,
|
| 182 |
-
done=False,
|
| 183 |
-
)
|
| 184 |
-
|
| 185 |
-
if self._verbose:
|
| 186 |
-
print(f"\n{'='*70}")
|
| 187 |
-
print(f"RESET: Task {self._task.task_id} ({self._task.difficulty})")
|
| 188 |
-
print(f"{'='*70}")
|
| 189 |
-
|
| 190 |
-
return self._build_observation()
|
| 191 |
-
|
| 192 |
-
def step(
|
| 193 |
-
self,
|
| 194 |
-
action: PythonCodeReviewAction,
|
| 195 |
-
timeout_s: Optional[float] = None,
|
| 196 |
-
**_: object,
|
| 197 |
-
) -> PythonCodeReviewObservation:
|
| 198 |
-
"""Apply one structured action."""
|
| 199 |
-
|
| 200 |
-
del timeout_s
|
| 201 |
-
|
| 202 |
-
if self._task is None:
|
| 203 |
-
return self.reset()
|
| 204 |
-
|
| 205 |
-
if self._done:
|
| 206 |
-
self._last_reward = RewardDetails(
|
| 207 |
-
value=-INVALID_ACTION_PENALTY,
|
| 208 |
-
invalid_action_penalty=INVALID_ACTION_PENALTY,
|
| 209 |
-
reason="Episode already completed.",
|
| 210 |
-
)
|
| 211 |
-
self._last_status = "Episode already completed. Call reset() to continue."
|
| 212 |
-
return self._build_observation()
|
| 213 |
-
|
| 214 |
-
self._state.step_count += 1
|
| 215 |
-
status = ""
|
| 216 |
-
reward = RewardDetails(value=0.0, reason="Action processed.")
|
| 217 |
-
|
| 218 |
-
# Dispatch to handler based on action type
|
| 219 |
-
if action.action_type == "analyze_code":
|
| 220 |
-
reward, status = self._handle_analyze()
|
| 221 |
-
elif action.action_type == "edit_code":
|
| 222 |
-
reward, status = self._handle_edit(action)
|
| 223 |
-
elif action.action_type == "run_tests":
|
| 224 |
-
reward, status = self._handle_run_tests()
|
| 225 |
-
elif action.action_type == "submit_solution":
|
| 226 |
-
reward, status = self._handle_submit()
|
| 227 |
-
else:
|
| 228 |
-
reward = RewardDetails(
|
| 229 |
-
value=-INVALID_ACTION_PENALTY,
|
| 230 |
-
invalid_action_penalty=INVALID_ACTION_PENALTY,
|
| 231 |
-
reason=f"Unsupported action_type: {action.action_type}",
|
| 232 |
-
)
|
| 233 |
-
status = f"Invalid action: unsupported action_type '{action.action_type}'."
|
| 234 |
-
|
| 235 |
-
self._last_reward = reward
|
| 236 |
-
self._last_status = status
|
| 237 |
-
self._state.attempts_remaining = max(self._task.max_steps - self._state.step_count, 0)
|
| 238 |
-
self._state.done = self._done
|
| 239 |
-
|
| 240 |
-
# Auto-submit if steps exhausted
|
| 241 |
-
if self._state.attempts_remaining == 0 and not self._done:
|
| 242 |
-
self._finalize_episode(auto_submit=True)
|
| 243 |
-
self._state.done = True
|
| 244 |
-
|
| 245 |
-
# Debug logging
|
| 246 |
-
if self._verbose:
|
| 247 |
-
self._log_debug_step(reward)
|
| 248 |
-
|
| 249 |
-
return self._build_observation()
|
| 250 |
-
|
| 251 |
-
@property
|
| 252 |
-
def state(self) -> PythonCodeReviewState:
|
| 253 |
-
"""Return the current environment state."""
|
| 254 |
-
return self._state.model_copy(deep=True)
|
| 255 |
-
|
| 256 |
-
def list_task_summaries(self) -> List[object]:
|
| 257 |
-
"""Return public task metadata."""
|
| 258 |
-
return list_task_summaries()
|
| 259 |
-
|
| 260 |
-
def get_task(self, task_id: str) -> object:
|
| 261 |
-
"""Return a single task descriptor."""
|
| 262 |
-
return get_task(task_id).to_descriptor()
|
| 263 |
-
|
| 264 |
-
def health(self) -> HealthResponse:
|
| 265 |
-
"""Return a simple health model."""
|
| 266 |
-
return HealthResponse(task_count=len(self._task_order))
|
| 267 |
-
|
| 268 |
-
def grade_task_submission(self, task_id: str, code: str) -> TaskGrade:
|
| 269 |
-
"""Expose deterministic grading outside of an active episode."""
|
| 270 |
-
return grade_task(code, get_task(task_id), include_hidden=True)
|
| 271 |
-
|
| 272 |
-
def _build_observation(self) -> PythonCodeReviewObservation:
|
| 273 |
-
"""Build current observation from state."""
|
| 274 |
-
return PythonCodeReviewObservation(
|
| 275 |
-
task_id=self._state.task_id or "",
|
| 276 |
-
title=self._task.title if self._task else "",
|
| 277 |
-
difficulty=self._state.difficulty or "easy",
|
| 278 |
-
task_kind=self._state.task_kind,
|
| 279 |
-
task_description=self._task.task_description if self._task else "",
|
| 280 |
-
current_code=self._state.current_code,
|
| 281 |
-
errors=self._state.errors,
|
| 282 |
-
test_results=self._state.test_results,
|
| 283 |
-
visible_tests=self._task.visible_tests if self._task else [],
|
| 284 |
-
history=self._state.history,
|
| 285 |
-
attempts_remaining=self._state.attempts_remaining,
|
| 286 |
-
last_action_status=self._last_status,
|
| 287 |
-
score=self._state.score,
|
| 288 |
-
reward_details=self._last_reward,
|
| 289 |
-
reward=self._last_reward.value,
|
| 290 |
-
done=self._state.done,
|
| 291 |
-
metadata={
|
| 292 |
-
"prev_score": self._last_reward.prev_score,
|
| 293 |
-
"curr_score": self._last_reward.curr_score,
|
| 294 |
-
},
|
| 295 |
-
)
|
| 296 |
-
|
| 297 |
-
def apply_action(self, action: PythonCodeReviewAction) -> str:
|
| 298 |
-
"""Return the code candidate produced by an action."""
|
| 299 |
-
if action.action_type == "edit_code":
|
| 300 |
-
return (action.code or "").strip() or self._state.current_code
|
| 301 |
-
return self._state.current_code
|
| 302 |
-
|
| 303 |
-
def run_tests(
|
| 304 |
-
self,
|
| 305 |
-
code: str,
|
| 306 |
-
include_hidden: bool = False,
|
| 307 |
-
) -> tuple[float, dict[str, int], TaskGrade]:
|
| 308 |
-
"""Grade code and return score plus simple test statistics."""
|
| 309 |
-
if self._task is None:
|
| 310 |
-
empty_results = {"passed": 0, "total": 0}
|
| 311 |
-
return 0.0, empty_results, TaskGrade(score=0.0)
|
| 312 |
-
|
| 313 |
-
grade = grade_task(code, self._task, include_hidden=include_hidden)
|
| 314 |
-
test_results = {
|
| 315 |
-
"passed": grade.tests_passed,
|
| 316 |
-
"total": grade.tests_total,
|
| 317 |
-
}
|
| 318 |
-
return grade.score, test_results, grade
|
| 319 |
-
|
| 320 |
-
def compute_reward(self, old_code, new_code, prev_score, curr_score, test_results):
|
| 321 |
-
# progress
|
| 322 |
-
progress = curr_score - prev_score
|
| 323 |
-
|
| 324 |
-
# test score
|
| 325 |
-
passed = test_results["passed"]
|
| 326 |
-
total = test_results["total"]
|
| 327 |
-
test_ratio = passed / total if total > 0 else 0
|
| 328 |
-
|
| 329 |
-
# syntax score
|
| 330 |
-
try:
|
| 331 |
-
compile(new_code, "<string>", "exec")
|
| 332 |
-
syntax_score = 1.0
|
| 333 |
-
except:
|
| 334 |
-
syntax_score = 0.0
|
| 335 |
-
|
| 336 |
-
# stagnation penalty
|
| 337 |
-
stagnation_penalty = 0.2 if new_code.strip() == old_code.strip() else 0.0
|
| 338 |
-
|
| 339 |
-
# regression penalty
|
| 340 |
-
regression_penalty = max(0.0, prev_score - curr_score)
|
| 341 |
-
|
| 342 |
-
# repetition penalty (track last 3 actions)
|
| 343 |
-
repetition_penalty = 0.1 if new_code == self.last_code else 0.0
|
| 344 |
-
|
| 345 |
-
# quality (simple heuristic)
|
| 346 |
-
length_penalty = 0.0
|
| 347 |
-
if len(new_code) > len(old_code) * 1.5:
|
| 348 |
-
length_penalty = 0.1
|
| 349 |
-
|
| 350 |
-
# final reward
|
| 351 |
-
reward = (
|
| 352 |
-
0.4 * progress
|
| 353 |
-
+ 0.3 * test_ratio
|
| 354 |
-
+ 0.2 * syntax_score
|
| 355 |
-
- stagnation_penalty
|
| 356 |
-
- regression_penalty
|
| 357 |
-
- repetition_penalty
|
| 358 |
-
- length_penalty
|
| 359 |
-
)
|
| 360 |
-
|
| 361 |
-
# clamp
|
| 362 |
-
reward = max(-1.0, min(1.0, reward))
|
| 363 |
-
|
| 364 |
-
return reward
|
| 365 |
-
|
| 366 |
-
def _apply_reward_randomization(self, reward: float) -> float:
|
| 367 |
-
"""Break repeated static rewards while keeping the result bounded."""
|
| 368 |
-
reward = max(-1.0, min(1.0, reward))
|
| 369 |
-
self.reward_history.append(reward)
|
| 370 |
-
if len(self.reward_history) >= 3 and len(set(self.reward_history[-3:])) == 1:
|
| 371 |
-
reward += random.uniform(-0.05, 0.05)
|
| 372 |
-
reward = max(-1.0, min(1.0, reward))
|
| 373 |
-
self.reward_history[-1] = reward
|
| 374 |
-
return reward
|
| 375 |
-
|
| 376 |
-
def _build_reward_details(
|
| 377 |
-
self,
|
| 378 |
-
old_code: str,
|
| 379 |
-
new_code: str,
|
| 380 |
-
prev_score: float,
|
| 381 |
-
curr_score: float,
|
| 382 |
-
test_results: dict[str, int],
|
| 383 |
-
reward_value: float,
|
| 384 |
-
reason: str,
|
| 385 |
-
) -> RewardDetails:
|
| 386 |
-
"""Build a reward payload that matches the scalar reward computation."""
|
| 387 |
-
passed = test_results["passed"]
|
| 388 |
-
total = test_results["total"]
|
| 389 |
-
test_ratio = passed / total if total > 0 else 0.0
|
| 390 |
-
try:
|
| 391 |
-
compile(new_code, "<string>", "exec")
|
| 392 |
-
syntax_score = 1.0
|
| 393 |
-
except SyntaxError:
|
| 394 |
-
syntax_score = 0.0
|
| 395 |
-
|
| 396 |
-
stagnation_penalty = 0.2 if new_code.strip() == old_code.strip() else 0.0
|
| 397 |
-
regression_penalty = max(0.0, prev_score - curr_score)
|
| 398 |
-
repetition_penalty = 0.1 if new_code == self.last_code else 0.0
|
| 399 |
-
length_penalty = 0.1 if len(new_code) > len(old_code) * 1.5 else 0.0
|
| 400 |
-
|
| 401 |
-
return RewardDetails(
|
| 402 |
-
value=reward_value,
|
| 403 |
-
progress_delta=0.4 * (curr_score - prev_score),
|
| 404 |
-
syntax_reward=0.2 * syntax_score,
|
| 405 |
-
test_reward=0.3 * test_ratio,
|
| 406 |
-
quality_bonus=-length_penalty,
|
| 407 |
-
stagnation_penalty=stagnation_penalty,
|
| 408 |
-
regression_penalty=regression_penalty + repetition_penalty,
|
| 409 |
-
reason=reason,
|
| 410 |
-
prev_score=round(prev_score, 6),
|
| 411 |
-
curr_score=round(curr_score, 6),
|
| 412 |
-
code_changed=new_code.strip() != old_code.strip(),
|
| 413 |
-
)
|
| 414 |
-
|
| 415 |
-
def _handle_analyze(self) -> tuple[RewardDetails, str]:
|
| 416 |
-
"""Analyze code for errors and test status."""
|
| 417 |
-
if self._task is None:
|
| 418 |
-
return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
|
| 419 |
-
|
| 420 |
-
old_code = self._state.current_code
|
| 421 |
-
prev_score = self._previous_score
|
| 422 |
-
curr_score, test_results, curr_grade = self.run_tests(old_code, include_hidden=False)
|
| 423 |
-
error = curr_grade.details.get("compile_error", "")
|
| 424 |
-
|
| 425 |
-
# Status message
|
| 426 |
-
if error:
|
| 427 |
-
self._state.errors = error
|
| 428 |
-
self._state.test_results = "Compilation failed. Fix syntax first."
|
| 429 |
-
summary = f"Syntax error detected: {error}"
|
| 430 |
-
else:
|
| 431 |
-
self._state.errors = ""
|
| 432 |
-
if self._task.task_kind == "syntax_fix":
|
| 433 |
-
self._state.test_results = "Code compiles successfully."
|
| 434 |
-
summary = "Code compiles. Ready to submit."
|
| 435 |
-
else:
|
| 436 |
-
visible_total = len(self._task.visible_tests)
|
| 437 |
-
visible_passed = curr_grade.tests_passed
|
| 438 |
-
self._state.test_results = f"Test run: {visible_passed}/{visible_total} passing."
|
| 439 |
-
summary = self._state.test_results
|
| 440 |
-
|
| 441 |
-
reward_value = self.compute_reward(old_code, old_code, prev_score, curr_score, test_results)
|
| 442 |
-
reward_value = self._apply_reward_randomization(reward_value)
|
| 443 |
-
reward = self._build_reward_details(
|
| 444 |
-
old_code=old_code,
|
| 445 |
-
new_code=old_code,
|
| 446 |
-
prev_score=prev_score,
|
| 447 |
-
curr_score=curr_score,
|
| 448 |
-
test_results=test_results,
|
| 449 |
-
reward_value=reward_value,
|
| 450 |
-
reason=summary,
|
| 451 |
-
)
|
| 452 |
-
|
| 453 |
-
# Update state
|
| 454 |
-
self._state.score = curr_score
|
| 455 |
-
self._state.errors = curr_grade.details.get("compile_error", "")
|
| 456 |
-
self._previous_score = curr_score
|
| 457 |
-
self.last_code = old_code
|
| 458 |
-
self._append_history("analyze_code", summary, reward.value)
|
| 459 |
-
return reward, summary
|
| 460 |
-
|
| 461 |
-
def _handle_edit(self, action: PythonCodeReviewAction) -> tuple[RewardDetails, str]:
|
| 462 |
-
"""Edit the code and compute reward for progress."""
|
| 463 |
-
if self._task is None:
|
| 464 |
-
return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
|
| 465 |
-
|
| 466 |
-
code = (action.code or "").strip()
|
| 467 |
-
if not code:
|
| 468 |
-
reward = RewardDetails(
|
| 469 |
-
value=-INVALID_ACTION_PENALTY,
|
| 470 |
-
invalid_action_penalty=INVALID_ACTION_PENALTY,
|
| 471 |
-
reason="Edit action requires non-empty code.",
|
| 472 |
-
)
|
| 473 |
-
status = "Invalid: edit_code requires code parameter."
|
| 474 |
-
self._append_history("edit_code", status, reward.value)
|
| 475 |
-
return reward, status
|
| 476 |
-
|
| 477 |
-
old_code = self._state.current_code
|
| 478 |
-
prev_score = self._previous_score
|
| 479 |
-
curr_score, test_results, curr_grade = self.run_tests(code, include_hidden=False)
|
| 480 |
-
|
| 481 |
-
# Update state
|
| 482 |
-
self._state.current_code = code
|
| 483 |
-
self._previous_code = code
|
| 484 |
-
self._state.errors = curr_grade.details.get("compile_error", "")
|
| 485 |
-
self._state.test_results = self._format_test_results(curr_grade)
|
| 486 |
-
self._state.score = curr_score
|
| 487 |
-
|
| 488 |
-
status = "Code updated."
|
| 489 |
-
if self._state.errors:
|
| 490 |
-
status = f"Code updated, but syntax issues remain: {self._state.errors}"
|
| 491 |
-
elif curr_grade.tests_total > 0:
|
| 492 |
-
status = self._state.test_results
|
| 493 |
-
|
| 494 |
-
reward_value = self.compute_reward(old_code, code, prev_score, curr_score, test_results)
|
| 495 |
-
reward_value = self._apply_reward_randomization(reward_value)
|
| 496 |
-
reward = self._build_reward_details(
|
| 497 |
-
old_code=old_code,
|
| 498 |
-
new_code=code,
|
| 499 |
-
prev_score=prev_score,
|
| 500 |
-
curr_score=curr_score,
|
| 501 |
-
test_results=test_results,
|
| 502 |
-
reward_value=reward_value,
|
| 503 |
-
reason=status,
|
| 504 |
-
)
|
| 505 |
-
|
| 506 |
-
self._previous_score = curr_score
|
| 507 |
-
self.last_code = code
|
| 508 |
-
self._append_history("edit_code", status, reward.value)
|
| 509 |
-
return reward, status
|
| 510 |
-
|
| 511 |
-
def _handle_run_tests(self) -> tuple[RewardDetails, str]:
|
| 512 |
-
"""Run tests and provide feedback."""
|
| 513 |
-
if self._task is None:
|
| 514 |
-
return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
|
| 515 |
-
|
| 516 |
-
old_code = self._state.current_code
|
| 517 |
-
prev_score = self._previous_score
|
| 518 |
-
curr_score, test_results, curr_grade = self.run_tests(old_code, include_hidden=False)
|
| 519 |
-
|
| 520 |
-
# Update state
|
| 521 |
-
self._state.errors = curr_grade.details.get("compile_error", "")
|
| 522 |
-
self._state.test_results = self._format_test_results(curr_grade)
|
| 523 |
-
self._state.score = curr_score
|
| 524 |
-
|
| 525 |
-
status = self._state.test_results if not self._state.errors else self._state.errors
|
| 526 |
-
reward_value = self.compute_reward(old_code, old_code, prev_score, curr_score, test_results)
|
| 527 |
-
reward_value = self._apply_reward_randomization(reward_value)
|
| 528 |
-
reward = self._build_reward_details(
|
| 529 |
-
old_code=old_code,
|
| 530 |
-
new_code=old_code,
|
| 531 |
-
prev_score=prev_score,
|
| 532 |
-
curr_score=curr_score,
|
| 533 |
-
test_results=test_results,
|
| 534 |
-
reward_value=reward_value,
|
| 535 |
-
reason=status,
|
| 536 |
-
)
|
| 537 |
-
|
| 538 |
-
self._previous_score = curr_score
|
| 539 |
-
self.last_code = old_code
|
| 540 |
-
self._append_history("run_tests", status, reward.value)
|
| 541 |
-
return reward, status
|
| 542 |
-
|
| 543 |
-
def _handle_submit(self) -> tuple[RewardDetails, str]:
|
| 544 |
-
"""Submit solution and finalize episode."""
|
| 545 |
-
if self._task is None:
|
| 546 |
-
return RewardDetails(value=0.0, reason="Invalid state"), "Error: task not loaded"
|
| 547 |
-
|
| 548 |
-
old_code = self._state.current_code
|
| 549 |
-
prev_score = self._previous_score
|
| 550 |
-
curr_score, test_results, curr_grade = self.run_tests(old_code, include_hidden=True)
|
| 551 |
-
|
| 552 |
-
# Update state
|
| 553 |
-
self._state.errors = curr_grade.details.get("compile_error", "")
|
| 554 |
-
self._state.test_results = self._format_test_results(curr_grade)
|
| 555 |
-
self._state.score = curr_score
|
| 556 |
-
self._previous_score = curr_score
|
| 557 |
-
self.last_code = old_code
|
| 558 |
-
self._finalize_episode(auto_submit=False, grade=curr_grade)
|
| 559 |
-
|
| 560 |
-
reward_value = self.compute_reward(old_code, old_code, prev_score, curr_score, test_results)
|
| 561 |
-
reward_value = self._apply_reward_randomization(reward_value)
|
| 562 |
-
status = f"Solution submitted. Final score: {curr_score:.3f}"
|
| 563 |
-
reward = self._build_reward_details(
|
| 564 |
-
old_code=old_code,
|
| 565 |
-
new_code=old_code,
|
| 566 |
-
prev_score=prev_score,
|
| 567 |
-
curr_score=curr_score,
|
| 568 |
-
test_results=test_results,
|
| 569 |
-
reward_value=reward_value,
|
| 570 |
-
reason=status,
|
| 571 |
-
)
|
| 572 |
-
|
| 573 |
-
self._append_history("submit_solution", status, reward_value)
|
| 574 |
-
return reward, status
|
| 575 |
-
|
| 576 |
-
def _compute_reward_components(
|
| 577 |
-
self,
|
| 578 |
-
curr_score: float,
|
| 579 |
-
prev_score: float,
|
| 580 |
-
curr_grade: TaskGrade,
|
| 581 |
-
code_changed: bool,
|
| 582 |
-
prev_grade_score: float = 0.0,
|
| 583 |
-
) -> dict:
|
| 584 |
-
"""Compute all six reward components and return combined result.
|
| 585 |
-
|
| 586 |
-
This method is the core of the reward system. It evaluates agent progress
|
| 587 |
-
across multiple dimensions and provides transparent, component-wise feedback.
|
| 588 |
-
|
| 589 |
-
REWARD COMPONENTS (6 total):
|
| 590 |
-
============================
|
| 591 |
-
|
| 592 |
-
1. PROGRESS REWARD (positive, max +0.25)
|
| 593 |
-
- Awarded when score improves from previous step
|
| 594 |
-
- Formula: min(PROGRESS_SCALE * score_delta, 0.25)
|
| 595 |
-
- Why: Encourages monotonic improvement
|
| 596 |
-
|
| 597 |
-
2. SYNTAX REWARD (positive, max +0.35)
|
| 598 |
-
- One-time bonus when code first compiles
|
| 599 |
-
- Transition: uncompilable → compilable
|
| 600 |
-
- Why: Acknowledges critical first step of valid code
|
| 601 |
-
|
| 602 |
-
3. TEST REWARD (positive, max +0.20)
|
| 603 |
-
- Based on improvement in test pass rate
|
| 604 |
-
- Formula: min(TEST_PASS_REWARD_SCALE * test_improvement, 0.20)
|
| 605 |
-
- Tracks best test rate seen in episode (monotonic)
|
| 606 |
-
- Why: Rewards incremental progress on passing tests
|
| 607 |
-
|
| 608 |
-
4. QUALITY REWARD (positive, max +0.15)
|
| 609 |
-
- Based on AST-detected code quality metrics
|
| 610 |
-
- Computed by deterministic grader (syntax_score, quality_score)
|
| 611 |
-
- Tracks best quality seen in episode (monotonic)
|
| 612 |
-
- Why: Teaches code structure and maintainability
|
| 613 |
-
|
| 614 |
-
5. STAGNATION PENALTY (negative, −0.10)
|
| 615 |
-
- Applied when action is taken but code doesn't change
|
| 616 |
-
- Exception: No penalty if code has compile errors (still debugging)
|
| 617 |
-
- Why: Encourages editing over repeated analysis
|
| 618 |
-
|
| 619 |
-
6. REGRESSION PENALTY (negative, scale −0.20)
|
| 620 |
-
- Applied when score decreases from previous step
|
| 621 |
-
- Formula: REGRESSION_PENALTY_SCALE * abs(score_delta)
|
| 622 |
-
- Special case: Timeout returns fixed TIMEOUT_PENALTY (−0.15)
|
| 623 |
-
- Why: Discourages actions that make code worse
|
| 624 |
-
|
| 625 |
-
FINAL REWARD:
|
| 626 |
-
=============
|
| 627 |
-
total = progress + syntax + test + quality - stagnation - regression
|
| 628 |
-
final_reward = clamp(total, -1.0, +1.0)
|
| 629 |
-
|
| 630 |
-
The result is always bounded for interpretability and stability.
|
| 631 |
-
|
| 632 |
-
Args:
|
| 633 |
-
curr_score: Current score after action (0.0 to 1.0)
|
| 634 |
-
prev_score: Score from previous step (0.0 to 1.0)
|
| 635 |
-
curr_grade: TaskGrade object with detailed metrics
|
| 636 |
-
code_changed: Boolean, whether the action modified code
|
| 637 |
-
prev_grade_score: Previous syntax_score for detecting first compile
|
| 638 |
-
|
| 639 |
-
Returns:
|
| 640 |
-
dict with keys: "progress", "syntax", "test", "quality",
|
| 641 |
-
"stagnation", "regression", "total"
|
| 642 |
-
All values are floats, with total clamped to [-1.0, +1.0]
|
| 643 |
-
"""
|
| 644 |
-
# Initialize all components to zero
|
| 645 |
-
components = {
|
| 646 |
-
"progress": 0.0,
|
| 647 |
-
"syntax": 0.0,
|
| 648 |
-
"test": 0.0,
|
| 649 |
-
"quality": 0.0,
|
| 650 |
-
"stagnation": 0.0,
|
| 651 |
-
"regression": 0.0,
|
| 652 |
-
"total": 0.0,
|
| 653 |
-
}
|
| 654 |
-
|
| 655 |
-
# ====================================================================
|
| 656 |
-
# COMPONENT 1: PROGRESS REWARD
|
| 657 |
-
# ====================================================================
|
| 658 |
-
# Reward score improvement. Encourages continuous progress towards goal.
|
| 659 |
-
score_delta = curr_score - prev_score
|
| 660 |
-
if score_delta > 0:
|
| 661 |
-
# Scale improvement by constant, cap at 0.25 to prevent dominance
|
| 662 |
-
components["progress"] = min(PROGRESS_SCALE * score_delta, 0.25)
|
| 663 |
-
|
| 664 |
-
# ====================================================================
|
| 665 |
-
# COMPONENT 2: SYNTAX REWARD
|
| 666 |
-
# ====================================================================
|
| 667 |
-
# One-time bonus for fixing syntax errors and making code compilable.
|
| 668 |
-
# This is tracked per episode with _syntax_reward_awarded flag.
|
| 669 |
-
if not self._syntax_reward_awarded and curr_grade.syntax_score >= 0.99:
|
| 670 |
-
# Only award if transitioning from non-compilable to compilable
|
| 671 |
-
if prev_grade_score < 0.99:
|
| 672 |
-
components["syntax"] = SYNTAX_FIX_BONUS
|
| 673 |
-
self._syntax_reward_awarded = True
|
| 674 |
-
|
| 675 |
-
# ====================================================================
|
| 676 |
-
# COMPONENT 3: TEST REWARD
|
| 677 |
-
# ====================================================================
|
| 678 |
-
# Reward improvement in test pass rate. Track best rate seen this episode.
|
| 679 |
-
if curr_grade.tests_total > 0:
|
| 680 |
-
# Fraction of visible tests currently passing
|
| 681 |
-
curr_test_frac = curr_grade.tests_passed / curr_grade.tests_total
|
| 682 |
-
# Improvement since best rate seen in episode
|
| 683 |
-
test_delta = curr_test_frac - self._best_visible_test_fraction
|
| 684 |
-
|
| 685 |
-
if test_delta > 0:
|
| 686 |
-
# Scale improvement, cap at 0.20 to prevent dominance
|
| 687 |
-
components["test"] = min(TEST_PASS_REWARD_SCALE * test_delta, 0.20)
|
| 688 |
-
# Update best rate seen in this episode (monotonic)
|
| 689 |
-
self._best_visible_test_fraction = max(
|
| 690 |
-
self._best_visible_test_fraction, curr_test_frac
|
| 691 |
-
)
|
| 692 |
-
|
| 693 |
-
# ====================================================================
|
| 694 |
-
# COMPONENT 4: QUALITY REWARD
|
| 695 |
-
# ====================================================================
|
| 696 |
-
# Reward improvements in code quality (AST-based metrics from grader).
|
| 697 |
-
# Track best quality metric seen in this episode.
|
| 698 |
-
quality_delta = curr_grade.quality_score - self._best_quality_score
|
| 699 |
-
if quality_delta > 0:
|
| 700 |
-
# Scale improvement, cap at 0.15 to prevent dominance
|
| 701 |
-
components["quality"] = min(QUALITY_BONUS_SCALE * quality_delta, 0.15)
|
| 702 |
-
# Update best quality seen in this episode (monotonic)
|
| 703 |
-
self._best_quality_score = max(
|
| 704 |
-
self._best_quality_score, curr_grade.quality_score
|
| 705 |
-
)
|
| 706 |
-
|
| 707 |
-
# ====================================================================
|
| 708 |
-
# COMPONENT 5: STAGNATION PENALTY
|
| 709 |
-
# ====================================================================
|
| 710 |
-
# Penalize when agent acts but doesn't change code (except during debugging).
|
| 711 |
-
# Exception: No penalty if code still has compile errors (debugging mode).
|
| 712 |
-
if not code_changed and not (curr_grade.details.get("compile_error") == ""):
|
| 713 |
-
components["stagnation"] = -STAGNATION_PENALTY
|
| 714 |
-
|
| 715 |
-
# ====================================================================
|
| 716 |
-
# COMPONENT 6: REGRESSION PENALTY
|
| 717 |
-
# ====================================================================
|
| 718 |
-
# Penalize when score decreases (regression).
|
| 719 |
-
# Special case: Timeout incurs fixed penalty instead of score-based.
|
| 720 |
-
if score_delta < 0:
|
| 721 |
-
# Scale penalty by magnitude of regression
|
| 722 |
-
components["regression"] = REGRESSION_PENALTY_SCALE * abs(score_delta)
|
| 723 |
-
|
| 724 |
-
# Timeout gets special fixed penalty
|
| 725 |
-
if curr_grade.timed_out:
|
| 726 |
-
components["regression"] = -TIMEOUT_PENALTY
|
| 727 |
-
|
| 728 |
-
# ====================================================================
|
| 729 |
-
# FINAL REWARD COMPUTATION
|
| 730 |
-
# ====================================================================
|
| 731 |
-
# Combine all components: sum positives, subtract negatives, clamp to [-1, 1]
|
| 732 |
-
total = (
|
| 733 |
-
components["progress"]
|
| 734 |
-
+ components["syntax"]
|
| 735 |
-
+ components["test"]
|
| 736 |
-
+ components["quality"]
|
| 737 |
-
- components["stagnation"]
|
| 738 |
-
- components["regression"]
|
| 739 |
-
)
|
| 740 |
-
|
| 741 |
-
# Clamp to [-1.0, +1.0] for bounded, interpretable rewards
|
| 742 |
-
components["total"] = max(-1.0, min(1.0, round(total, 6)))
|
| 743 |
-
|
| 744 |
-
return components
|
| 745 |
-
|
| 746 |
-
def _finalize_episode(self, auto_submit: bool, grade: Optional[TaskGrade] = None) -> None:
|
| 747 |
-
"""Mark episode as done and set final score."""
|
| 748 |
-
if grade is None:
|
| 749 |
-
if self._task is None:
|
| 750 |
-
return
|
| 751 |
-
grade = grade_task(self._state.current_code, self._task, include_hidden=True)
|
| 752 |
-
|
| 753 |
-
self._state.score = grade.score
|
| 754 |
-
self._done = True
|
| 755 |
-
self._state.done = True
|
| 756 |
-
|
| 757 |
-
def _format_test_results(self, grade: TaskGrade) -> str:
|
| 758 |
-
"""Format test results for display."""
|
| 759 |
-
if grade.tests_total == 0:
|
| 760 |
-
return "No tests available."
|
| 761 |
-
if grade.timed_out:
|
| 762 |
-
return "Test execution timed out."
|
| 763 |
-
return f"Tests: {grade.tests_passed}/{grade.tests_total} passing"
|
| 764 |
-
|
| 765 |
-
def _append_history(self, action_type: str, status: str, reward: float) -> None:
|
| 766 |
-
"""Append action to history."""
|
| 767 |
-
entry = HistoryEntry(
|
| 768 |
-
step=self._state.step_count,
|
| 769 |
-
action_type=action_type,
|
| 770 |
-
status=status,
|
| 771 |
-
reward=reward,
|
| 772 |
-
)
|
| 773 |
-
self._state.history.append(entry)
|
| 774 |
-
|
| 775 |
-
def _log_debug_step(self, reward: RewardDetails) -> None:
|
| 776 |
-
"""Log the scalar reward signal in a compact RL-friendly format."""
|
| 777 |
-
print(
|
| 778 |
-
f"""
|
| 779 |
-
Step Debug:
|
| 780 |
-
Prev Score: {reward.prev_score}
|
| 781 |
-
Curr Score: {reward.curr_score}
|
| 782 |
-
Reward: {reward.value}
|
| 783 |
-
Progress: {reward.curr_score - reward.prev_score}
|
| 784 |
-
"""
|
| 785 |
-
)
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
# Backwards-compatible aliases used elsewhere in the repo.
|
| 789 |
-
PythonEnvironment = PythonCodeReviewEnvironment
|
| 790 |
-
CodeReviewEnvironment = PythonCodeReviewEnvironment
|
|
|
|
| 1 |
+
from .env_safe import * # noqa: F401,F403
|
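The removed `compute_reward` above combines its signals as a fixed weighted sum, 0.4 * progress + 0.3 * test_ratio + 0.2 * syntax_score minus the four penalties, then clamps to [-1.0, 1.0]. A small self-contained rerun of that arithmetic with illustrative inputs (the numbers below are made up for the example, not from the repo):

# One hypothetical improving edit: score 0.50 -> 0.75, 3 of 4 visible
# tests passing, code compiles, and nothing stagnated, regressed, or
# repeated, so all penalty terms are zero.
progress = 0.75 - 0.50          # +0.25
test_ratio = 3 / 4              # 0.75
syntax_score = 1.0              # compile(new_code, "<string>", "exec") succeeded
penalties = 0.0

reward = 0.4 * progress + 0.3 * test_ratio + 0.2 * syntax_score - penalties
reward = max(-1.0, min(1.0, reward))  # same clamp as the method
print(reward)                   # 0.1 + 0.225 + 0.2 = 0.525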
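For the six-component scheme documented in the deleted `_compute_reward_components` above, the per-component caps make the result easy to check by hand: the best possible single-step positive sum is 0.25 + 0.35 + 0.20 + 0.15 = 0.95, which already sits inside the [-1.0, +1.0] clamp. A worked example with invented inputs (score 0.40 -> 0.60, first successful compile, visible tests going from 1/4 to 2/4, quality flat):

PROGRESS_SCALE = 0.25
SYNTAX_FIX_BONUS = 0.35
TEST_PASS_REWARD_SCALE = 0.30

progress = min(PROGRESS_SCALE * (0.60 - 0.40), 0.25)      # 0.05
syntax = SYNTAX_FIX_BONUS                                  # 0.35, first compile this episode
test = min(TEST_PASS_REWARD_SCALE * (0.50 - 0.25), 0.20)   # 0.075
quality = stagnation = regression = 0.0

total = progress + syntax + test + quality - stagnation - regression
print(max(-1.0, min(1.0, total)))                          # 0.475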
server/env_safe.py
ADDED

@@ -0,0 +1,492 @@
| 1 |
+
"""Safe OpenEnv environment for deterministic Python code repair tasks."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Optional
|
| 6 |
+
from uuid import uuid4
|
| 7 |
+
|
| 8 |
+
from compat import Environment
|
| 9 |
+
from graders import grade_task
|
| 10 |
+
from models import (
|
| 11 |
+
HealthResponse,
|
| 12 |
+
HistoryEntry,
|
| 13 |
+
PythonCodeReviewAction,
|
| 14 |
+
PythonCodeReviewObservation,
|
| 15 |
+
PythonCodeReviewState,
|
| 16 |
+
RewardDetails,
|
| 17 |
+
TaskGrade,
|
| 18 |
+
)
|
| 19 |
+
from tasks import TaskSpec, get_task as load_task, list_task_summaries, task_ids
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
INVALID_ACTION_PENALTY = 0.10
|
| 23 |
+
NO_PROGRESS_PENALTY = 0.08
|
| 24 |
+
REPEATED_ACTION_PENALTY = 0.05
|
| 25 |
+
BASE_STEP_PENALTY = 0.02
|
| 26 |
+
ANALYZE_STEP_PENALTY = 0.01
|
| 27 |
+
SUBMIT_COMPLETION_BONUS = 0.30
|
| 28 |
+
TIMEOUT_PENALTY = 0.12
|
| 29 |
+
VALID_ACTIONS = {"analyze_code", "edit_code", "run_tests", "submit_solution"}
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
|
| 33 |
+
"""Clamp a scalar to a bounded numeric interval."""
|
| 34 |
+
try:
|
| 35 |
+
return max(low, min(high, float(value)))
|
| 36 |
+
except Exception:
|
| 37 |
+
return low
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _safe_text(value: Any, default: str = "") -> str:
|
| 41 |
+
"""Convert values into short stable strings."""
|
| 42 |
+
try:
|
| 43 |
+
text = str(value)
|
| 44 |
+
except Exception:
|
| 45 |
+
return default
|
| 46 |
+
text = " ".join(text.split())
|
| 47 |
+
return text[:240] if text else default
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class PythonCodeReviewEnvironment(
|
| 51 |
+
Environment[PythonCodeReviewAction, PythonCodeReviewObservation, PythonCodeReviewState]
|
| 52 |
+
):
|
| 53 |
+
"""Deterministic, bounded, evaluator-safe environment for code repair tasks."""
|
| 54 |
+
|
| 55 |
+
SUPPORTS_CONCURRENT_SESSIONS = True
|
| 56 |
+
|
| 57 |
+
def __init__(self, verbose: bool = False) -> None:
|
| 58 |
+
super().__init__()
|
| 59 |
+
self._verbose = bool(verbose)
|
| 60 |
+
self._task_order = self._safe_task_order()
|
| 61 |
+
self._task_cursor = -1
|
| 62 |
+
self._task: Optional[TaskSpec] = None
|
| 63 |
+
self._state = PythonCodeReviewState(episode_id=str(uuid4()))
|
| 64 |
+
self._done = False
|
| 65 |
+
self._last_status = "Call reset() to start."
|
| 66 |
+
self._last_reward = RewardDetails(value=0.0, reason="Environment initialized.")
|
| 67 |
+
self._metrics = self._blank_metrics()
|
| 68 |
+
self._last_action_type = ""
|
| 69 |
+
|
| 70 |
+
def reset(
|
| 71 |
+
self,
|
| 72 |
+
seed: Optional[int] = None,
|
| 73 |
+
episode_id: Optional[str] = None,
|
| 74 |
+
task_id: Optional[str] = None,
|
| 75 |
+
**_: object,
|
| 76 |
+
) -> PythonCodeReviewObservation:
|
| 77 |
+
"""Reset the environment for a deterministic task and return an observation."""
|
| 78 |
+
del seed
|
| 79 |
+
try:
|
| 80 |
+
self._reset_rubric()
|
| 81 |
+
except Exception:
|
| 82 |
+
pass
|
| 83 |
+
|
| 84 |
+
task = self._select_task(task_id)
|
| 85 |
+
self._task = task
|
| 86 |
+
self._done = False
|
| 87 |
+
self._metrics = self._blank_metrics()
|
| 88 |
+
self._last_action_type = ""
|
| 89 |
+
self._last_status = "Inspect the code, run checks, edit the code, then submit."
|
| 90 |
+
self._last_reward = RewardDetails(
|
| 91 |
+
value=0.0,
|
| 92 |
+
reason="Episode reset.",
|
| 93 |
+
prev_score=0.0,
|
| 94 |
+
curr_score=0.0,
|
| 95 |
+
)
|
| 96 |
+
self._state = PythonCodeReviewState(
|
| 97 |
+
episode_id=episode_id or str(uuid4()),
|
| 98 |
+
step_count=0,
|
| 99 |
+
task_id=task.task_id,
|
| 100 |
+
difficulty=task.difficulty,
|
| 101 |
+
task_kind=task.task_kind,
|
| 102 |
+
attempts_remaining=max(int(task.max_steps), 1),
|
| 103 |
+
current_code=task.starter_code,
|
| 104 |
+
errors="",
|
| 105 |
+
test_results="No checks run yet.",
|
| 106 |
+
history=[],
|
| 107 |
+
score=0.0,
|
| 108 |
+
done=False,
|
| 109 |
+
)
|
| 110 |
+
return self._build_observation()
|
| 111 |
+
|
| 112 |
+
def step(
|
| 113 |
+
self,
|
| 114 |
+
action: PythonCodeReviewAction,
|
| 115 |
+
timeout_s: Optional[float] = None,
|
| 116 |
+
**_: object,
|
| 117 |
+
) -> PythonCodeReviewObservation:
|
| 118 |
+
"""Execute one safe environment step and always return a valid observation."""
|
| 119 |
+
del timeout_s
|
| 120 |
+
try:
|
| 121 |
+
if self._task is None:
|
| 122 |
+
return self.reset()
|
| 123 |
+
|
| 124 |
+
if self._done:
|
| 125 |
+
self._last_status = "Episode already completed. Call reset() to continue."
|
| 126 |
+
self._last_reward = RewardDetails(
|
| 127 |
+
value=-INVALID_ACTION_PENALTY,
|
| 128 |
+
invalid_action_penalty=INVALID_ACTION_PENALTY,
|
| 129 |
+
reason="Episode already completed.",
|
| 130 |
+
prev_score=self._metrics["score"],
|
| 131 |
+
curr_score=self._metrics["score"],
|
| 132 |
+
code_changed=False,
|
| 133 |
+
)
|
| 134 |
+
return self._build_observation()
|
| 135 |
+
|
| 136 |
+
self._state.step_count += 1
|
| 137 |
+
action_type = _safe_text(getattr(action, "action_type", "analyze_code"), "analyze_code")
|
| 138 |
+
code = getattr(action, "code", None)
|
| 139 |
+
|
| 140 |
+
if action_type == "analyze_code":
|
| 141 |
+
self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=False)
|
| 142 |
+
elif action_type == "run_tests":
|
| 143 |
+
self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=False)
|
| 144 |
+
elif action_type == "edit_code":
|
| 145 |
+
self._handle_edit(code)
|
| 146 |
+
elif action_type == "submit_solution":
|
| 147 |
+
self._handle_scored_action(action_type=action_type, candidate_code=self._state.current_code, include_hidden=True)
|
| 148 |
+
self._done = True
|
| 149 |
+
else:
|
| 150 |
+
self._apply_invalid_action(f"Unsupported action_type '{action_type}'.")
|
| 151 |
+
|
| 152 |
+
self._state.attempts_remaining = max(self._task.max_steps - self._state.step_count, 0)
|
| 153 |
+
if self._state.attempts_remaining == 0 and not self._done:
|
| 154 |
+
self._auto_submit()
|
| 155 |
+
|
| 156 |
+
self._state.done = self._done
|
| 157 |
+
return self._build_observation()
|
| 158 |
+
except Exception as exc:
|
| 159 |
+
self._apply_invalid_action(f"Step failure handled: {_safe_text(exc, 'unknown_error')}")
|
| 160 |
+
self._state.done = self._done
|
| 161 |
+
return self._build_observation()
|
| 162 |
+
|
| 163 |
+
@property
|
| 164 |
+
def state(self) -> PythonCodeReviewState:
|
| 165 |
+
"""Return a deep copy of the current environment state."""
|
| 166 |
+
try:
|
| 167 |
+
return self._state.model_copy(deep=True)
|
| 168 |
+
except Exception:
|
| 169 |
+
return PythonCodeReviewState(episode_id=str(uuid4()))
|
| 170 |
+
|
| 171 |
+
def list_task_summaries(self) -> list[object]:
|
| 172 |
+
"""Return public task summaries."""
|
| 173 |
+
try:
|
| 174 |
+
return list_task_summaries()
|
| 175 |
+
except Exception:
|
| 176 |
+
return []
|
| 177 |
+
|
| 178 |
+
def get_task(self, task_id: str) -> object:
|
| 179 |
+
"""Return a single public task descriptor."""
|
| 180 |
+
return self._select_task(task_id).to_descriptor()
|
| 181 |
+
|
| 182 |
+
def health(self) -> HealthResponse:
|
| 183 |
+
"""Return a simple health response."""
|
| 184 |
+
return HealthResponse(task_count=len(self._task_order))
|
| 185 |
+
|
| 186 |
+
def grade_task_submission(self, task_id: str, code: str) -> TaskGrade:
|
| 187 |
+
"""Grade a task submission outside an episode without raising."""
|
| 188 |
+
try:
|
| 189 |
+
task = self._select_task(task_id)
|
| 190 |
+
return self._safe_grade(task=task, candidate_code=code, include_hidden=True)
|
| 191 |
+
except Exception as exc:
|
| 192 |
+
return TaskGrade(score=0.0, details={"error": _safe_text(exc, "grading_failed")})
|
| 193 |
+
|
| 194 |
+
def run_tests(self, code: str, include_hidden: bool = False) -> tuple[float, dict[str, int], TaskGrade]:
|
| 195 |
+
"""Run deterministic grading and return score plus test summary."""
|
| 196 |
+
task = self._task or self._select_task(None)
|
| 197 |
+
grade = self._safe_grade(task=task, candidate_code=code, include_hidden=include_hidden)
|
| 198 |
+
return (
|
| 199 |
+
_clamp(grade.score),
|
| 200 |
+
{"passed": int(grade.tests_passed), "total": int(grade.tests_total)},
|
| 201 |
+
grade,
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
def apply_action(self, action: PythonCodeReviewAction) -> str:
|
| 205 |
+
"""Return the candidate code implied by the action."""
|
| 206 |
+
if getattr(action, "action_type", "") == "edit_code":
|
| 207 |
+
code = getattr(action, "code", None)
|
| 208 |
+
return str(code) if code is not None else self._state.current_code
|
| 209 |
+
return self._state.current_code
|
| 210 |
+
|
| 211 |
+
def compute_reward(
|
| 212 |
+
self,
|
| 213 |
+
action_type: str,
|
| 214 |
+
previous_metrics: dict[str, float],
|
| 215 |
+
current_metrics: dict[str, float],
|
| 216 |
+
grade: TaskGrade,
|
| 217 |
+
code_changed: bool,
|
| 218 |
+
invalid_action: bool = False,
|
| 219 |
+
) -> RewardDetails:
|
| 220 |
+
"""Compute a bounded dynamic reward with progress and efficiency shaping."""
|
| 221 |
+
prev_score = _clamp(previous_metrics.get("score", 0.0))
|
| 222 |
+
curr_score = _clamp(current_metrics.get("score", 0.0))
|
| 223 |
+
score_delta = curr_score - prev_score
|
| 224 |
+
test_delta = current_metrics.get("test_fraction", 0.0) - previous_metrics.get("test_fraction", 0.0)
|
| 225 |
+
syntax_delta = current_metrics.get("syntax_score", 0.0) - previous_metrics.get("syntax_score", 0.0)
|
| 226 |
+
quality_delta = current_metrics.get("quality_score", 0.0) - previous_metrics.get("quality_score", 0.0)
|
| 227 |
+
|
| 228 |
+
step_penalty = BASE_STEP_PENALTY + (ANALYZE_STEP_PENALTY if action_type == "analyze_code" else 0.0)
|
| 229 |
+
repeated_penalty = REPEATED_ACTION_PENALTY if action_type == self._last_action_type else 0.0
|
| 230 |
+
no_progress = (
|
| 231 |
+
score_delta <= 1e-9
|
| 232 |
+
and test_delta <= 1e-9
|
| 233 |
+
and syntax_delta <= 1e-9
|
| 234 |
+
and quality_delta <= 1e-9
|
| 235 |
+
and not code_changed
|
| 236 |
+
)
|
| 237 |
+
stagnation_penalty = NO_PROGRESS_PENALTY if no_progress and not invalid_action else 0.0
|
| 238 |
+
regression_penalty = max(-score_delta, 0.0) * 0.6 + repeated_penalty + step_penalty
|
| 239 |
+
invalid_penalty = INVALID_ACTION_PENALTY if invalid_action else 0.0
|
| 240 |
+
timeout_penalty = TIMEOUT_PENALTY if bool(grade.timed_out) else 0.0
|
| 241 |
+
|
| 242 |
+
progress_reward = max(score_delta, 0.0) * 0.7
|
| 243 |
+
syntax_reward = max(syntax_delta, 0.0) * 0.5
|
| 244 |
+
test_reward = max(test_delta, 0.0) * 1.0
|
| 245 |
+
quality_bonus = max(quality_delta, 0.0) * 0.2
|
| 246 |
+
correctness_bonus = SUBMIT_COMPLETION_BONUS if action_type == "submit_solution" and curr_score >= 0.999 else 0.0
|
| 247 |
+
|
| 248 |
+
reward_value = (
|
| 249 |
+
progress_reward
|
| 250 |
+
+ syntax_reward
|
| 251 |
+
+ test_reward
|
| 252 |
+
+ quality_bonus
|
| 253 |
+
+ correctness_bonus
|
| 254 |
+
- stagnation_penalty
|
| 255 |
+
- regression_penalty
|
| 256 |
+
- invalid_penalty
|
| 257 |
+
- timeout_penalty
|
| 258 |
+
)
|
| 259 |
+
reward_value = max(-1.0, min(1.0, round(reward_value, 6)))
|
| 260 |
+
return RewardDetails(
|
| 261 |
+
value=reward_value,
|
| 262 |
+
syntax_reward=round(syntax_reward, 6),
|
| 263 |
+
test_reward=round(test_reward, 6),
|
| 264 |
+
quality_bonus=round(quality_bonus, 6),
|
| 265 |
+
correctness_bonus=round(correctness_bonus, 6),
|
| 266 |
+
progress_delta=round(progress_reward, 6),
|
| 267 |
+
stagnation_penalty=round(stagnation_penalty, 6),
|
| 268 |
+
regression_penalty=round(regression_penalty, 6),
|
| 269 |
+
invalid_action_penalty=round(invalid_penalty, 6),
|
| 270 |
+
timeout_penalty=round(timeout_penalty, 6),
|
| 271 |
+
reason=f"{action_type} reward computed safely",
|
| 272 |
+
prev_score=round(prev_score, 6),
|
| 273 |
+
curr_score=round(curr_score, 6),
|
| 274 |
+
code_changed=bool(code_changed),
|
| 275 |
+
)
|
| 276 |
+
|
| 277 |
+
def _safe_task_order(self) -> list[str]:
|
| 278 |
+
"""Load deterministic task ids with a hard fallback."""
|
| 279 |
+
try:
|
| 280 |
+
loaded = list(task_ids())
|
| 281 |
+
if loaded:
|
| 282 |
+
return [str(task_id) for task_id in loaded]
|
| 283 |
+
except Exception:
|
| 284 |
+
pass
|
| 285 |
+
return ["syntax-fix-easy", "bug-fix-medium", "optimization-hard"]
|
| 286 |
+
|
| 287 |
+
def _blank_metrics(self) -> dict[str, float]:
|
| 288 |
+
"""Return an empty metric snapshot."""
|
| 289 |
+
return {
|
| 290 |
+
"score": 0.0,
|
| 291 |
+
"test_fraction": 0.0,
|
| 292 |
+
"syntax_score": 0.0,
|
| 293 |
+
"quality_score": 0.0,
|
| 294 |
+
}
|
| 295 |
+
|
| 296 |
+
def _select_task(self, task_id: Optional[str]) -> TaskSpec:
|
| 297 |
+
"""Select the requested task or advance deterministically."""
|
| 298 |
+
try:
|
| 299 |
+
if task_id:
|
| 300 |
+
task = load_task(task_id)
|
| 301 |
+
if task.task_id in self._task_order:
|
| 302 |
+
self._task_cursor = self._task_order.index(task.task_id)
|
| 303 |
+
return task
|
| 304 |
+
except Exception:
|
| 305 |
+
pass
|
| 306 |
+
|
| 307 |
+
try:
|
| 308 |
+
self._task_cursor = (self._task_cursor + 1) % len(self._task_order)
|
| 309 |
+
return load_task(self._task_order[self._task_cursor])
|
| 310 |
+
except Exception:
|
| 311 |
+
return load_task("syntax-fix-easy")
|
| 312 |
+
|
| 313 |
+
def _safe_grade(self, task: TaskSpec, candidate_code: str, include_hidden: bool) -> TaskGrade:
|
| 314 |
+
"""Run grading without allowing exceptions to escape."""
|
| 315 |
+
try:
|
| 316 |
+
return grade_task(candidate_code, task, include_hidden=include_hidden)
|
| 317 |
+
except Exception as exc:
|
| 318 |
+
return TaskGrade(
|
| 319 |
+
score=0.0,
|
| 320 |
+
syntax_score=0.0,
|
| 321 |
+
tests_passed=0,
|
| 322 |
+
tests_total=max(len(task.visible_tests), 1),
|
| 323 |
+
details={"compile_error": "", "error": _safe_text(exc, "grading_failed")},
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
def _metrics_from_grade(self, grade: TaskGrade) -> dict[str, float]:
|
| 327 |
+
"""Derive normalized reward metrics from a grading result."""
|
| 328 |
+
tests_total = max(int(grade.tests_total), 0)
|
| 329 |
+
tests_passed = max(int(grade.tests_passed), 0)
|
| 330 |
+
test_fraction = (tests_passed / tests_total) if tests_total else _clamp(grade.syntax_score)
|
| 331 |
+
return {
|
| 332 |
+
"score": _clamp(grade.score),
|
| 333 |
+
"test_fraction": _clamp(test_fraction),
|
| 334 |
+
"syntax_score": _clamp(grade.syntax_score),
|
| 335 |
+
"quality_score": _clamp(grade.quality_score),
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+

    def _format_test_results(self, grade: TaskGrade, include_hidden: bool) -> str:
        """Format test execution results for the observation."""
        compile_error = _safe_text(grade.details.get("compile_error", ""), "")
        scope = "all checks" if include_hidden else "visible checks"
        if compile_error:
            return f"{scope}: compile error: {compile_error}"
        if grade.timed_out:
            return f"{scope}: execution timed out"
        if self._task and self._task.task_kind == "syntax_fix":
            return "visible checks: code compiles successfully"
        return f"{scope}: {int(grade.tests_passed)}/{int(grade.tests_total)} passing"

    def _build_status(self, action_type: str, grade: TaskGrade) -> str:
        """Build a human-readable status message."""
        if action_type == "submit_solution":
            return f"Solution submitted. Final score: {_clamp(grade.score):.3f}"
        if action_type == "edit_code":
            if grade.details.get("compile_error"):
                return "Code updated, but syntax issues remain."
            return "Code updated and evaluated."
        if action_type == "run_tests":
            return "Test run completed."
        if action_type == "analyze_code":
            return "Analysis completed."
        return "Action handled safely."

    def _apply_grade_to_state(self, grade: TaskGrade, include_hidden: bool) -> None:
        """Update environment state from the latest grading result."""
        compile_error = _safe_text(grade.details.get("compile_error", ""), "")
        self._state.score = _clamp(grade.score)
        self._state.errors = compile_error
        self._state.test_results = self._format_test_results(grade, include_hidden=include_hidden)

    def _handle_scored_action(self, action_type: str, candidate_code: str, include_hidden: bool) -> None:
        """Grade code, update state, and compute reward for a valid action."""
        task = self._task or self._select_task(None)
        previous_metrics = dict(self._metrics)
        prior_code = self._state.current_code
        code_changed = candidate_code.strip() != prior_code.strip()
        if action_type == "edit_code":
            self._state.current_code = candidate_code
        grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=include_hidden)
        current_metrics = self._metrics_from_grade(grade)
        self._apply_grade_to_state(grade, include_hidden=include_hidden)
        self._last_reward = self.compute_reward(
            action_type=action_type,
            previous_metrics=previous_metrics,
            current_metrics=current_metrics,
            grade=grade,
            code_changed=code_changed,
            invalid_action=False,
        )
        self._last_status = self._build_status(action_type, grade)
        self._metrics = current_metrics
        self._last_action_type = action_type
        self._append_history(action_type, self._last_status, self._last_reward.value)

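    # Both the previous and the current metric dicts flow into compute_reward,
    # so the reward for edit_code can be shaped on the score delta rather than
    # on the absolute score (the observation echoes both as prev_score/curr_score).
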
    def _handle_edit(self, code: Optional[str]) -> None:
        """Validate edit input and evaluate the new candidate code."""
        safe_code = (code or "").strip()
        if not safe_code:
            self._apply_invalid_action("edit_code requires code parameter.")
            return
        self._handle_scored_action(action_type="edit_code", candidate_code=safe_code, include_hidden=False)

    def _apply_invalid_action(self, reason: str) -> None:
        """Record an invalid action without crashing the episode."""
        previous_metrics = dict(self._metrics)
        grade = TaskGrade(score=previous_metrics["score"], syntax_score=previous_metrics["syntax_score"])
        self._last_reward = self.compute_reward(
            action_type="invalid",
            previous_metrics=previous_metrics,
            current_metrics=previous_metrics,
            grade=grade,
            code_changed=False,
            invalid_action=True,
        )
        self._last_status = reason
        self._append_history("analyze_code", reason, self._last_reward.value)

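    # An invalid action passes the same metrics as both "previous" and
    # "current", so any delta-based shaping term is zero and only the
    # invalid-action handling inside compute_reward (if any) moves the reward.
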
    def _auto_submit(self) -> None:
        """Finalize the episode when attempts are exhausted."""
        task = self._task or self._select_task(None)
        grade = self._safe_grade(task=task, candidate_code=self._state.current_code, include_hidden=True)
        self._apply_grade_to_state(grade, include_hidden=True)
        self._done = True
        self._state.done = True
        self._last_status = f"Auto-submitted. Final score: {_clamp(grade.score):.3f}"

    def _append_history(self, action_type: str, status: str, reward: float) -> None:
        """Append one action record to the episode history."""
        try:
            stable_action = action_type if action_type in VALID_ACTIONS else "analyze_code"
            self._state.history.append(
                HistoryEntry(
                    step=max(int(self._state.step_count), 0),
                    action_type=stable_action,
                    status=_safe_text(status, "handled"),
                    reward=float(reward),
                )
            )
        except Exception:
            pass

    def _build_observation(self) -> PythonCodeReviewObservation:
        """Build a valid observation from current state."""
        task = self._task
        try:
            return PythonCodeReviewObservation(
                task_id=self._state.task_id or "",
                title=task.title if task else "",
                difficulty=self._state.difficulty or "easy",
                task_kind=self._state.task_kind,
                task_description=task.task_description if task else "",
                current_code=self._state.current_code,
                errors=self._state.errors,
                test_results=self._state.test_results,
                visible_tests=list(task.visible_tests) if task else [],
                history=list(self._state.history),
                attempts_remaining=max(int(self._state.attempts_remaining), 0),
                last_action_status=self._last_status,
                score=_clamp(self._state.score),
                reward_details=self._last_reward,
                reward=self._last_reward.value,
                done=bool(self._state.done),
                metadata={
                    "prev_score": self._last_reward.prev_score,
                    "curr_score": self._last_reward.curr_score,
                },
            )
        except Exception as exc:
            return PythonCodeReviewObservation(
                task_id=self._state.task_id or "",
                title="",
                difficulty="easy",
                task_kind=None,
                task_description="",
                current_code=getattr(self._state, "current_code", ""),
                errors=_safe_text(exc, "observation_build_failed"),
                test_results="visible checks: unavailable",
                visible_tests=[],
                history=[],
                attempts_remaining=0,
                last_action_status="Observation fallback returned safely.",
                score=0.0,
                reward_details=RewardDetails(value=0.0, reason="Observation fallback."),
                reward=0.0,
                done=bool(getattr(self._state, "done", False)),
                metadata={},
            )


PythonEnvironment = PythonCodeReviewEnvironment
CodeReviewEnvironment = PythonCodeReviewEnvironment
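
Every handler above degrades to a safe default instead of raising, so a malformed action or a grading failure can never kill an episode. A minimal standalone sketch of that pattern (SafeEnv, Observation, and _handle are illustrative names, not part of this repo):

from dataclasses import dataclass


@dataclass
class Observation:
    status: str = ""
    reward: float = 0.0
    errors: str = ""


class SafeEnv:
    """Toy environment demonstrating the crash-proof step pattern."""

    def step(self, action: dict) -> Observation:
        try:
            return self._handle(action)
        except Exception as exc:
            # Mirror of the fallback branch above: report the failure in the
            # observation instead of propagating the exception to the caller.
            return Observation(status="Action handled safely.", errors=str(exc))

    def _handle(self, action: dict) -> Observation:
        if action.get("type") not in {"edit_code", "run_tests", "analyze_code", "submit_solution"}:
            raise ValueError(f"unsupported action: {action.get('type')!r}")
        return Observation(status=f"{action['type']} completed.")

Calling SafeEnv().step({"type": "fly"}) then yields an error-bearing observation rather than an exception, which is how env_safe.py keeps RL rollouts alive.
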
tasks/task_bank.py
CHANGED
@@ -161,82 +161,66 @@ def calculate_invoice_total(line_items: Iterable[int], discount_percent: int) ->
 
 TASK_OPTIMIZATION = TaskSpec(
     task_id="optimization-hard",
-    title="Optimize inefficient
+    title="Optimize inefficient user activity summarization",
     difficulty="hard",
     task_kind="optimization",
     task_description=(
-        "Code review found that `
-        "The current implementation
-        "
+        "Code review found that `summarize_user_activity` is inefficient for large event streams. "
+        "The current implementation repeatedly scans the full event list for every user, making it O(n**2). "
+        "Refactor it to aggregate counts in one pass while preserving the sorted output contract. "
         "Style and code quality also matter: use idiomatic Python, proper types, and clear logic. "
         "All tests must pass, and the optimized version should be measurably faster."
     ),
-    starter_code='''from typing import
+    starter_code='''from typing import Iterable
 
 
-    Returns:
-        List with duplicates removed, order preserved.
-    """
-    result = []
-    for item in items:
-        if item not in result:  # O(n) lookup in list per iteration
-            result.append(item)
-    return result
+def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
+    """Aggregate user activity counts."""
+
+    ordered_users = []
+    for event in events:
+        user_id = event["user_id"]
+        if user_id not in ordered_users:
+            ordered_users.append(user_id)
+
+    summary = []
+    for user_id in ordered_users:
+        count = 0
+        for event in events:
+            if event["user_id"] == user_id:
+                count += 1
+        summary.append((user_id, count))
+    return sorted(summary, key=lambda item: (-item[1], item[0]))
 ''',
-    reference_code='''from
-    """Remove duplicates from list while preserving order.
-
-    Efficient set-based implementation with O(n) time complexity.
-
-    Args:
-        items: List that may contain duplicate elements.
-
-    Returns:
-        List with duplicates removed, order preserved.
-    """
-    seen: set = set()
-    result = []
-    for item in items:
-        if item not in seen:
-            seen.add(item)
-            result.append(item)
-    return result
+    reference_code='''from collections import Counter
+from typing import Iterable
+
+
+def summarize_user_activity(events: Iterable[dict]) -> list[tuple[str, int]]:
+    """Aggregate user activity counts in one pass."""
+
+    counts = Counter(event["user_id"] for event in events)
+    return sorted(counts.items(), key=lambda item: (-item[1], item[0]))
 ''',
     visible_tests=[
-        "
-        "
-        "
-        "
+        "summarize_user_activity([{'user_id': 'alice'}, {'user_id': 'bob'}, {'user_id': 'alice'}]) == [('alice', 2), ('bob', 1)]",
+        "summarize_user_activity([{'user_id': 'z'}, {'user_id': 'a'}]) == [('a', 1), ('z', 1)]",
+        "summarize_user_activity([]) == []",
+        "summarize_user_activity([{'user_id': 'solo'}]) == [('solo', 1)]",
     ],
     hidden_tests=[
-        "
+        "summarize_user_activity([{'user_id': 'u2'}, {'user_id': 'u1'}, {'user_id': 'u2'}, {'user_id': 'u2'}, {'user_id': 'u1'}]) == [('u2', 3), ('u1', 2)]",
     ],
     max_steps=10,
-    benchmark_entrypoint="
-    benchmark_builder=
+    benchmark_entrypoint="summarize_user_activity",
+    benchmark_builder='''def build_benchmark_events():
+    return [{"user_id": f"user_{index % 400}"} for index in range(6000)]''',
     benchmark_repeats=3,
     benchmark_timeout_s=1.0,
     style_max_line_length=88,
     expected_quality_markers=[
-        "
-        "
+        "Counter",
+        "sorted",
     ],
 )
 
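
The benchmark hook is what makes "measurably faster" enforceable: build_benchmark_events feeds 6000 events spread over 400 users into summarize_user_activity three times under a 1.0 s budget, which leaves little headroom for the quadratic starter. A rough standalone timing of the two implementations above (summarize_naive and summarize_fast are illustrative names; absolute numbers vary by machine):

import timeit
from collections import Counter


def summarize_naive(events: list[dict]) -> list[tuple[str, int]]:
    # Starter-style: one full scan of the events per distinct user -> O(n * u).
    ordered_users = []
    for event in events:
        if event["user_id"] not in ordered_users:
            ordered_users.append(event["user_id"])
    summary = []
    for user_id in ordered_users:
        count = sum(1 for event in events if event["user_id"] == user_id)
        summary.append((user_id, count))
    return sorted(summary, key=lambda item: (-item[1], item[0]))


def summarize_fast(events: list[dict]) -> list[tuple[str, int]]:
    # Reference-style: single Counter pass, then one sort of 400 entries.
    counts = Counter(event["user_id"] for event in events)
    return sorted(counts.items(), key=lambda item: (-item[1], item[0]))


events = [{"user_id": f"user_{index % 400}"} for index in range(6000)]
assert summarize_naive(events) == summarize_fast(events)
print("naive:", timeit.timeit(lambda: summarize_naive(events), number=3))
print("fast: ", timeit.timeit(lambda: summarize_fast(events), number=3))

The gap follows from the arithmetic: the starter performs on the order of 400 x 6000 comparisons per call, while the Counter version makes a single 6000-element pass, so the reference clears the 1.0 s budget with a margin the starter cannot match.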