Spaces:
Sleeping
Sleeping
commit
Browse files- environment/env.py +135 -0
environment/env.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
CodeReviewEnv — main environment logic.
|
| 3 |
+
Manages state, episode flow, reward accumulation, and per-step grading.
|
| 4 |
+
"""
|
| 5 |
+
from typing import Optional, List, Dict, Any
|
| 6 |
+
|
| 7 |
+
from environment.models import (
|
| 8 |
+
CodeReviewAction,
|
| 9 |
+
CodeReviewObservation,
|
| 10 |
+
StepResult,
|
| 11 |
+
ResetResult,
|
| 12 |
+
StateResult,
|
| 13 |
+
)
|
| 14 |
+
from environment.tasks import get_task, list_tasks as _list_tasks
|
| 15 |
+
from environment.graders import grade
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class CodeReviewEnv:
    """
    OpenEnv-compliant code-review environment.

    Episode flow
    ------------
    reset(task_id)  -> observation
    step(action)    -> (observation, reward, done, info)   [repeated <= max_steps]
    get_state()     -> current state snapshot
    """

    # Task loaded by reset() when the caller does not name one.
    _DEFAULT_TASK_ID: str = "easy_syntax"
    # A single-step reward at or above this value ends the episode early.
    _SUCCESS_REWARD_THRESHOLD: float = 0.95

    def __init__(self) -> None:
        """Create an environment with no task loaded; call reset() before step()."""
        self._task: Optional[dict] = None
        self._step_number: int = 0
        self._total_reward: float = 0.0
        self._actions_history: List[Dict[str, Any]] = []
        self._done: bool = False
        self._initialized: bool = False
        self._last_feedback: Optional[str] = None

    # -- public properties ----------------------------------------------------

    @property
    def is_initialized(self) -> bool:
        """True once reset() has completed successfully at least once."""
        return self._initialized

    # -- core API -------------------------------------------------------------

    def reset(self, task_id: Optional[str] = None) -> ResetResult:
        """Start a new episode. Defaults to the easy task.

        Args:
            task_id: Identifier handed to ``get_task``; when ``None`` the
                default task id is used.

        Returns:
            A ``ResetResult`` wrapping the first observation of the episode.

        The task lookup happens before any counter is cleared, so if
        ``get_task`` raises, the previous episode state is left untouched.
        """
        if task_id is None:
            task_id = self._DEFAULT_TASK_ID

        self._task = get_task(task_id)
        self._step_number = 0
        self._total_reward = 0.0
        self._actions_history = []
        self._done = False
        self._initialized = True
        self._last_feedback = None

        return ResetResult(observation=self._make_observation())

    def step(self, action: CodeReviewAction) -> StepResult:
        """Process one agent action and return (observation, reward, done, info).

        Args:
            action: The agent's review. This method reads its
                ``identified_issues``, ``suggested_fix`` and ``done``
                attributes and passes the whole object to ``grade``.

        Returns:
            A ``StepResult`` holding the next observation, this step's reward,
            the done flag, and an ``info`` dict with the keys ``feedback``,
            ``step``, ``total_reward`` and ``cumulative_score`` (mean reward
            per step so far); both numeric totals are rounded to 4 places.

        Raises:
            RuntimeError: If reset() has not been called yet, or the current
                episode has already finished.
        """
        if not self._initialized or self._done:
            raise RuntimeError("Call reset() before stepping, or episode is over.")

        task = self._task

        # Do every operation that can fail (grading, reading attributes off the
        # action) BEFORE touching episode state, so an exception here cannot
        # leave the step counter, reward total and history out of sync.
        reward, feedback = grade(task["task_id"], action, task["ground_truth"])
        step_number = self._step_number + 1
        history_entry = {
            "step": step_number,
            "num_issues_reported": len(action.identified_issues),
            "has_fix": action.suggested_fix is not None,
            "reward": reward,
        }

        # Episode ends when: agent says done, reward is perfect, or max steps reached
        done = (
            action.done
            or reward >= self._SUCCESS_REWARD_THRESHOLD
            or step_number >= task["max_steps"]
        )

        # Commit the step atomically now that nothing else can raise.
        self._step_number = step_number
        self._last_feedback = feedback
        self._total_reward += reward
        self._actions_history.append(history_entry)
        self._done = done

        return StepResult(
            observation=self._make_observation(),
            reward=reward,
            done=done,
            info={
                "feedback": feedback,
                "step": self._step_number,
                "total_reward": round(self._total_reward, 4),
                "cumulative_score": round(
                    # max(..., 1) guards the division; the counter is >= 1 here.
                    self._total_reward / max(self._step_number, 1), 4
                ),
            },
        )

    def get_state(self) -> StateResult:
        """Return a snapshot of the current episode state.

        ``task_id`` is the empty string when no task has been loaded yet.
        The action history is handed over as copies of the internal entries,
        so the result does not alias (and cannot mutate) the live history.
        """
        return StateResult(
            task_id=self._task["task_id"] if self._task else "",
            step_number=self._step_number,
            total_reward=round(self._total_reward, 4),
            actions_history=[dict(entry) for entry in self._actions_history],
            done=self._done,
            initialized=self._initialized,
        )

    def list_tasks(self) -> list:
        """Return the task listing produced by ``environment.tasks.list_tasks``."""
        return _list_tasks()

    # -- internal helpers -----------------------------------------------------

    def _make_observation(self) -> CodeReviewObservation:
        """Build the observation for the current task and step.

        Must only be called once a task is loaded (i.e. after reset()); the
        task dict is indexed directly. ``previous_feedback`` is ``None`` until
        the first step has been graded.
        """
        t = self._task
        return CodeReviewObservation(
            task_id=t["task_id"],
            task_name=t["task_name"],
            difficulty=t["difficulty"],
            language=t["language"],
            code_snippet=t["code_snippet"],
            context=t["context"],
            step_number=self._step_number,
            max_steps=t["max_steps"],
            previous_feedback=self._last_feedback,
        )
|