File size: 7,219 Bytes
0ee66d2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 | """
Tests for the core environment — reset, step, state.
"""
import pytest
from env.environment import DebuggerEnvironment
from env.models import Action
@pytest.fixture
def env():
    """Provide a newly constructed ``DebuggerEnvironment`` for the requesting test."""
    fresh_environment = DebuggerEnvironment()
    return fresh_environment
# ── Reset Tests ──────────────────────────────────────────────────────────────
def test_reset_easy_returns_observation(env):
    """Resetting to the "easy" task returns a fully populated, untouched observation."""
    observation = env.reset("easy")

    # Task identity and episode status.
    assert observation["task_id"] == "easy"
    assert observation["done"] is False

    # Counters as they stand before any step has been taken.
    expected_counters = {
        "tests_total": 8,
        "attempts_remaining": 5,
        "max_attempts": 5,
        "step_number": 0,
    }
    for key, expected in expected_counters.items():
        assert observation[key] == expected, key

    # The task material must not be empty strings.
    for key in ("buggy_code", "test_suite", "initial_error_output"):
        assert observation[key] != "", key

    # No attempt history yet.
    assert observation["previous_attempts"] == []
def test_reset_medium_returns_observation(env):
    """Resetting to the "medium" task reports its id, 10 tests and a 7-attempt budget."""
    observation = env.reset("medium")

    expected = {"task_id": "medium", "tests_total": 10, "max_attempts": 7}
    # Compare only the fields this test cares about.
    reported = {key: observation[key] for key in expected}
    assert reported == expected
def test_reset_hard_returns_observation(env):
    """Resetting to the "hard" task reports its id, 8 tests and a 10-attempt budget."""
    observation = env.reset("hard")

    expected = {"task_id": "hard", "tests_total": 8, "max_attempts": 10}
    # Compare only the fields this test cares about.
    reported = {key: observation[key] for key in expected}
    assert reported == expected
def test_reset_invalid_task_raises(env):
    """``reset`` raises ``ValueError`` matching "Unknown task_id" for an unknown task."""
    expected_failure = pytest.raises(ValueError, match="Unknown task_id")
    with expected_failure:
        env.reset("nonexistent")
def test_reset_clears_previous_state(env):
    """A second ``reset`` returns pristine counters and history after a prior step."""
    env.reset("easy")

    # Take one step so the episode is no longer in its initial state.
    env.step(
        Action(
            action_type="submit_fix",
            fixed_code="def binary_search(arr, target): return -1",
            hypothesis="test hypothesis",
        )
    )

    # Resetting again must wipe what the step recorded.
    fresh = env.reset("easy")
    assert fresh["step_number"] == 0
    assert fresh["previous_attempts"] == []
    assert fresh["attempts_remaining"] == 5
# ── Step Tests ───────────────────────────────────────────────────────────────
def test_step_submit_fix_without_hypothesis(env):
    """A ``submit_fix`` with no hypothesis gets a -0.10 step reward and an error message."""
    env.reset("easy")
    action = Action(
        action_type="submit_fix",
        fixed_code="def binary_search(arr, target): return -1",
    )
    result = env.step(action)

    # Rewards are floats: compare with a tolerance so the check does not hinge
    # on the exact binary representation of the computed value.
    assert result["reward"]["step_reward"] == pytest.approx(-0.10)

    error = result["info"]["error"]
    assert error is not None
    assert "hypothesis" in error.lower()
def test_step_submit_fix_with_valid_code(env):
    """A complete ``submit_fix`` step returns all four result sections and advances the step."""
    env.reset("easy")
    outcome = env.step(
        Action(
            action_type="submit_fix",
            fixed_code="def binary_search(arr, target): return -1",
            hypothesis="Testing a fix",
        )
    )

    # The step result must expose every top-level section.
    for section in ("observation", "reward", "done", "info"):
        assert section in outcome, section

    assert outcome["observation"]["step_number"] == 1
def test_step_submit_fix_solves_easy(env):
    """Submitting a correct binary search passes all 8 tests and ends the episode."""
    env.reset("easy")

    # The fix is passed to the environment as Python source text, so the
    # literal must carry its own indentation to be a valid function definition.
    fixed_code = '''def binary_search(arr: list, target: int) -> int:
    left, right = 0, len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1
'''
    action = Action(
        action_type="submit_fix",
        fixed_code=fixed_code,
        hypothesis="Off by one: should be left <= right",
    )
    result = env.step(action)

    # Show the environment's error output if not every test passed.
    assert result["observation"]["tests_passed"] == 8, result["observation"]["current_error_output"]
    assert result["done"] is True
    assert result["reward"]["grader_score"] > 0.0
def test_step_query_context_first_free(env):
    """The first ``query_context`` step has a zero step reward and returns a query result."""
    env.reset("easy")
    action = Action(
        action_type="query_context",
        query_type="error_explanation",
        query_target="binary_search",
    )
    result = env.step(action)

    # Rewards are floats: compare with a tolerance rather than exact equality.
    assert result["reward"]["step_reward"] == pytest.approx(0.0)
    assert result["info"]["query_result"] is not None
def test_step_query_context_second_costs(env):
    """Repeating a ``query_context`` step yields a -0.05 step reward on the second call."""
    env.reset("easy")
    action = Action(
        action_type="query_context",
        query_type="error_explanation",
    )

    env.step(action)  # First — free
    result = env.step(action)  # Second — costs -0.05

    # Rewards are floats: compare with a tolerance rather than exact equality.
    assert result["reward"]["step_reward"] == pytest.approx(-0.05)
def test_step_give_up(env):
    """A ``give_up`` action finishes the episode with a non-negative grader score."""
    env.reset("easy")
    surrender = Action(action_type="give_up", final_diagnosis="I cannot find the bug")
    outcome = env.step(surrender)

    assert outcome["done"] is True
    assert outcome["reward"]["grader_score"] >= 0.0
def test_step_after_done(env):
    """Stepping again after a ``give_up`` reports an "already done" error."""
    env.reset("easy")
    # First give-up: the step whose aftermath this test probes.
    env.step(Action(action_type="give_up", final_diagnosis="done"))

    late_result = env.step(Action(action_type="give_up"))
    error_message = late_result["info"]["error"]
    assert error_message is not None
    assert "already done" in error_message.lower()
def test_step_invalid_action_type(env):
    """An unrecognised ``action_type`` is reported through ``info["error"]``."""
    env.reset("easy")
    bogus_action = Action(action_type="invalid_action")
    info = env.step(bogus_action)["info"]
    assert info["error"] is not None
def test_step_invalid_query_type(env):
    """A ``query_context`` with an unknown ``query_type`` gets a -0.05 reward and an error."""
    env.reset("easy")
    action = Action(action_type="query_context", query_type="invalid_query")
    result = env.step(action)

    # Rewards are floats: compare with a tolerance rather than exact equality.
    assert result["reward"]["step_reward"] == pytest.approx(-0.05)
    assert result["info"]["error"] is not None
# ── State Tests ──────────────────────────────────────────────────────────────
def test_state_before_reset(env):
    """Before any ``reset``, ``state`` reports ``done`` as True and no task id."""
    snapshot = env.state()

    # A never-reset environment is expected to look like a finished episode.
    assert snapshot["done"] is True
    assert snapshot["task_id"] is None
def test_state_after_reset(env):
    """After ``reset("easy")``, ``state`` shows the task as active with zero attempts used."""
    env.reset("easy")
    snapshot = env.state()

    assert snapshot["task_id"] == "easy"
    assert snapshot["done"] is False
    assert snapshot["attempts_used"] == 0
def test_state_after_step(env):
    """One ``submit_fix`` step is reflected in the attempt, step and hypothesis counts."""
    env.reset("easy")
    env.step(
        Action(
            action_type="submit_fix",
            fixed_code="def binary_search(arr, target): return -1",
            hypothesis="Testing",
        )
    )

    snapshot = env.state()
    # Both counters should have advanced by exactly one.
    for counter in ("attempts_used", "step_number"):
        assert snapshot[counter] == 1, counter
    assert len(snapshot["all_hypotheses"]) == 1
# ── Attempts Exhaustion Tests ────────────────────────────────────────────────
def test_attempts_exhausted(env):
    """Five wrong fixes use up the easy task's budget; any further fix is rejected."""
    env.reset("easy")

    # Submit five distinct fixes, keeping the result of the last one.
    for index in range(5):
        outcome = env.step(
            Action(
                action_type="submit_fix",
                fixed_code=f"def binary_search(arr, target): return {index}",
                hypothesis=f"Attempt {index + 1}",
            )
        )

    # After 5 attempts the episode should be done (max_attempts=5), or at
    # least report that no attempts remain.
    assert outcome["done"] is True or outcome["observation"]["attempts_remaining"] == 0

    # Nothing more to check if the episode has already ended.
    if outcome["done"]:
        return

    # Otherwise one more fix must come back with an error.
    extra_outcome = env.step(
        Action(
            action_type="submit_fix",
            fixed_code="def binary_search(arr, target): return -1",
            hypothesis="Extra attempt",
        )
    )
    assert extra_outcome["info"]["error"] is not None
|