File size: 7,219 Bytes
0ee66d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
"""
Tests for the core environment — reset, step, state.
"""

import pytest
from env.environment import DebuggerEnvironment
from env.models import Action


@pytest.fixture
def env():
    """Provide a newly constructed ``DebuggerEnvironment`` to the requesting test."""
    environment = DebuggerEnvironment()
    return environment


# ── Reset Tests ──────────────────────────────────────────────────────────────

def test_reset_easy_returns_observation(env):
    """Check every field of the observation returned by ``reset("easy")``."""
    observation = env.reset("easy")

    assert observation["task_id"] == "easy"
    assert observation["done"] is False

    # Counters expected on a freshly reset "easy" episode.
    expected_counters = (
        ("tests_total", 8),
        ("attempts_remaining", 5),
        ("max_attempts", 5),
        ("step_number", 0),
    )
    for field, expected in expected_counters:
        assert observation[field] == expected

    # Text payloads must be populated.
    for field in ("buggy_code", "test_suite", "initial_error_output"):
        assert observation[field] != ""

    assert observation["previous_attempts"] == []


def test_reset_medium_returns_observation(env):
    """Check the task id, test count and attempt budget after ``reset("medium")``."""
    observation = env.reset("medium")
    expected_fields = (
        ("task_id", "medium"),
        ("tests_total", 10),
        ("max_attempts", 7),
    )
    for field, expected in expected_fields:
        assert observation[field] == expected


def test_reset_hard_returns_observation(env):
    """Check the task id, test count and attempt budget after ``reset("hard")``."""
    observation = env.reset("hard")
    expected_fields = (
        ("task_id", "hard"),
        ("tests_total", 8),
        ("max_attempts", 10),
    )
    for field, expected in expected_fields:
        assert observation[field] == expected


def test_reset_invalid_task_raises(env):
    """Resetting with an unrecognised task id must raise ``ValueError``."""
    expect_unknown_task = pytest.raises(ValueError, match="Unknown task_id")
    with expect_unknown_task:
        env.reset("nonexistent")


def test_reset_clears_previous_state(env):
    """A second ``reset`` after a step must report a pristine episode."""
    env.reset("easy")

    # Take one step so there is state to discard.
    env.step(
        Action(
            action_type="submit_fix",
            fixed_code="def binary_search(arr, target): return -1",
            hypothesis="test hypothesis",
        )
    )

    # The fresh observation should show no trace of the earlier step.
    fresh = env.reset("easy")
    assert fresh["step_number"] == 0
    assert fresh["previous_attempts"] == []
    assert fresh["attempts_remaining"] == 5


# ── Step Tests ───────────────────────────────────────────────────────────────

def test_step_submit_fix_without_hypothesis(env):
    """A ``submit_fix`` with no hypothesis is expected to be penalised.

    Asserts a step reward of -0.10 and an error message that mentions
    "hypothesis" (case-insensitive).
    """
    env.reset("easy")
    action = Action(action_type="submit_fix", fixed_code="def binary_search(arr, target): return -1")
    result = env.step(action)

    # Rewards are floats: compare with a tolerance so the test does not depend
    # on how the environment happens to compute or round the value.
    assert result["reward"]["step_reward"] == pytest.approx(-0.10)

    error = result["info"]["error"]
    assert error is not None
    assert "hypothesis" in error.lower()


def test_step_submit_fix_with_valid_code(env):
    """A well-formed ``submit_fix`` returns the full step payload and advances the step counter."""
    env.reset("easy")
    outcome = env.step(
        Action(
            action_type="submit_fix",
            fixed_code="def binary_search(arr, target): return -1",
            hypothesis="Testing a fix",
        )
    )

    # The step result must expose all four top-level sections.
    for section in ("observation", "reward", "done", "info"):
        assert section in outcome

    assert outcome["observation"]["step_number"] == 1


def test_step_submit_fix_solves_easy(env):
    """Submitting a correct binary search should pass all 8 tests and end the episode."""
    env.reset("easy")

    # Source of the candidate fix, spelled out line by line.
    solution = (
        "def binary_search(arr: list, target: int) -> int:\n"
        "    left, right = 0, len(arr) - 1\n"
        "    while left <= right:\n"
        "        mid = (left + right) // 2\n"
        "        if arr[mid] == target:\n"
        "            return mid\n"
        "        elif arr[mid] < target:\n"
        "            left = mid + 1\n"
        "        else:\n"
        "            right = mid - 1\n"
        "    return -1\n"
    )

    outcome = env.step(
        Action(
            action_type="submit_fix",
            fixed_code=solution,
            hypothesis="Off by one: should be left <= right",
        )
    )

    observation = outcome["observation"]
    # On failure, surface the environment's error output as the assertion message.
    assert observation["tests_passed"] == 8, observation["current_error_output"]
    assert outcome["done"] is True
    assert outcome["reward"]["grader_score"] > 0.0


def test_step_query_context_first_free(env):
    """The first ``query_context`` of an episode is expected to cost nothing.

    Asserts a step reward of 0.0 and a non-``None`` query result.
    """
    env.reset("easy")
    action = Action(
        action_type="query_context",
        query_type="error_explanation",
        query_target="binary_search",
    )
    result = env.step(action)

    # Compare the float reward with a tolerance, consistent with the other
    # reward assertions in this module.
    assert result["reward"]["step_reward"] == pytest.approx(0.0)
    assert result["info"]["query_result"] is not None


def test_step_query_context_second_costs(env):
    """A second ``query_context`` in the same episode is expected to cost -0.05."""
    env.reset("easy")
    action = Action(
        action_type="query_context",
        query_type="error_explanation",
    )
    env.step(action)  # First query: free
    result = env.step(action)  # Second query: costs -0.05

    # Rewards are floats: compare with a tolerance rather than exact equality.
    assert result["reward"]["step_reward"] == pytest.approx(-0.05)


def test_step_give_up(env):
    """A ``give_up`` action should finish the episode with a non-negative grader score."""
    env.reset("easy")
    outcome = env.step(
        Action(
            action_type="give_up",
            final_diagnosis="I cannot find the bug",
        )
    )
    assert outcome["done"] is True
    assert outcome["reward"]["grader_score"] >= 0.0


def test_step_after_done(env):
    """Stepping after the episode has ended should report an "already done" error."""
    env.reset("easy")

    # End the episode first.
    env.step(Action(action_type="give_up", final_diagnosis="done"))

    # Any further step is expected to be rejected with an error message.
    late_result = env.step(Action(action_type="give_up"))
    error = late_result["info"]["error"]
    assert error is not None
    assert "already done" in late_result["info"]["error"].lower()


def test_step_invalid_action_type(env):
    """An unrecognised ``action_type`` should yield an error in the step info."""
    env.reset("easy")
    outcome = env.step(Action(action_type="invalid_action"))
    assert outcome["info"]["error"] is not None


def test_step_invalid_query_type(env):
    """A ``query_context`` with an unknown ``query_type`` is expected to be penalised.

    Asserts a step reward of -0.05 and a non-``None`` error.
    """
    env.reset("easy")
    action = Action(action_type="query_context", query_type="invalid_query")
    result = env.step(action)

    # Rewards are floats: compare with a tolerance rather than exact equality.
    assert result["reward"]["step_reward"] == pytest.approx(-0.05)
    assert result["info"]["error"] is not None


# ── State Tests ──────────────────────────────────────────────────────────────

def test_state_before_reset(env):
    """Without any ``reset`` call, the state should be done with no task selected."""
    snapshot = env.state()
    assert snapshot["done"] is True
    assert snapshot["task_id"] is None


def test_state_after_reset(env):
    """After ``reset("easy")`` the state shows an active episode with no attempts used."""
    env.reset("easy")
    snapshot = env.state()
    assert snapshot["task_id"] == "easy"
    assert snapshot["done"] is False
    assert snapshot["attempts_used"] == 0


def test_state_after_step(env):
    """One ``submit_fix`` step should be reflected in the attempt, step and hypothesis counts."""
    env.reset("easy")
    env.step(
        Action(
            action_type="submit_fix",
            fixed_code="def binary_search(arr, target): return -1",
            hypothesis="Testing",
        )
    )

    snapshot = env.state()
    for counter in ("attempts_used", "step_number"):
        assert snapshot[counter] == 1
    assert len(snapshot["all_hypotheses"]) == 1


# ── Attempts Exhaustion Tests ────────────────────────────────────────────────

def test_attempts_exhausted(env):
    """Using all five attempts should end the episode or leave no attempts remaining.

    If the episode is not flagged done after the fifth attempt, one more
    submission is made and must come back with an error.
    """
    env.reset("easy")

    # Submit five distinct fixes, each with its own hypothesis.
    for attempt_index in range(5):
        result = env.step(
            Action(
                action_type="submit_fix",
                fixed_code=f"def binary_search(arr, target): return {attempt_index}",
                hypothesis=f"Attempt {attempt_index + 1}",
            )
        )

    # After 5 attempts, episode should be done (max_attempts=5)
    episode_finished = result["done"] is True
    assert episode_finished or result["observation"]["attempts_remaining"] == 0

    # Nothing more to check once the environment has closed the episode.
    if result["done"]:
        return

    # Otherwise an extra submission must be rejected with an error.
    result = env.step(
        Action(
            action_type="submit_fix",
            fixed_code="def binary_search(arr, target): return -1",
            hypothesis="Extra attempt",
        )
    )
    assert result["info"]["error"] is not None