AgentDebugger-training-v3 / tests /test_environment.py
shank
complete project
0ee66d2
"""
Tests for the core environment β€” reset, step, state.
"""
import pytest
from env.environment import DebuggerEnvironment
from env.models import Action
@pytest.fixture
def env():
return DebuggerEnvironment()
# ── Reset Tests ──────────────────────────────────────────────────────────────
def test_reset_easy_returns_observation(env):
obs = env.reset("easy")
assert obs["task_id"] == "easy"
assert obs["done"] is False
assert obs["tests_total"] == 8
assert obs["attempts_remaining"] == 5
assert obs["max_attempts"] == 5
assert obs["step_number"] == 0
assert obs["buggy_code"] != ""
assert obs["test_suite"] != ""
assert obs["initial_error_output"] != ""
assert obs["previous_attempts"] == []
def test_reset_medium_returns_observation(env):
obs = env.reset("medium")
assert obs["task_id"] == "medium"
assert obs["tests_total"] == 10
assert obs["max_attempts"] == 7
def test_reset_hard_returns_observation(env):
obs = env.reset("hard")
assert obs["task_id"] == "hard"
assert obs["tests_total"] == 8
assert obs["max_attempts"] == 10
def test_reset_invalid_task_raises(env):
with pytest.raises(ValueError, match="Unknown task_id"):
env.reset("nonexistent")
def test_reset_clears_previous_state(env):
env.reset("easy")
# Do a step
action = Action(
action_type="submit_fix",
fixed_code="def binary_search(arr, target): return -1",
hypothesis="test hypothesis",
)
env.step(action)
# Reset should clear everything
obs = env.reset("easy")
assert obs["step_number"] == 0
assert obs["previous_attempts"] == []
assert obs["attempts_remaining"] == 5
# ── Step Tests ───────────────────────────────────────────────────────────────
def test_step_submit_fix_without_hypothesis(env):
env.reset("easy")
action = Action(action_type="submit_fix", fixed_code="def binary_search(arr, target): return -1")
result = env.step(action)
assert result["reward"]["step_reward"] == -0.10
assert result["info"]["error"] is not None
assert "hypothesis" in result["info"]["error"].lower()
def test_step_submit_fix_with_valid_code(env):
env.reset("easy")
action = Action(
action_type="submit_fix",
fixed_code="def binary_search(arr, target): return -1",
hypothesis="Testing a fix",
)
result = env.step(action)
assert "observation" in result
assert "reward" in result
assert "done" in result
assert "info" in result
assert result["observation"]["step_number"] == 1
def test_step_submit_fix_solves_easy(env):
env.reset("easy")
fixed_code = '''def binary_search(arr: list, target: int) -> int:
left, right = 0, len(arr) - 1
while left <= right:
mid = (left + right) // 2
if arr[mid] == target:
return mid
elif arr[mid] < target:
left = mid + 1
else:
right = mid - 1
return -1
'''
action = Action(
action_type="submit_fix",
fixed_code=fixed_code,
hypothesis="Off by one: should be left <= right",
)
result = env.step(action)
assert result["observation"]["tests_passed"] == 8, result["observation"]["current_error_output"]
assert result["done"] is True
assert result["reward"]["grader_score"] > 0.0
def test_step_query_context_first_free(env):
env.reset("easy")
action = Action(
action_type="query_context",
query_type="error_explanation",
query_target="binary_search",
)
result = env.step(action)
assert result["reward"]["step_reward"] == 0.0
assert result["info"]["query_result"] is not None
def test_step_query_context_second_costs(env):
env.reset("easy")
action = Action(
action_type="query_context",
query_type="error_explanation",
)
env.step(action) # First β€” free
result = env.step(action) # Second β€” costs -0.05
assert result["reward"]["step_reward"] == -0.05
def test_step_give_up(env):
env.reset("easy")
action = Action(
action_type="give_up",
final_diagnosis="I cannot find the bug",
)
result = env.step(action)
assert result["done"] is True
assert result["reward"]["grader_score"] >= 0.0
def test_step_after_done(env):
env.reset("easy")
action = Action(action_type="give_up", final_diagnosis="done")
env.step(action)
result = env.step(Action(action_type="give_up"))
assert result["info"]["error"] is not None
assert "already done" in result["info"]["error"].lower()
def test_step_invalid_action_type(env):
env.reset("easy")
action = Action(action_type="invalid_action")
result = env.step(action)
assert result["info"]["error"] is not None
def test_step_invalid_query_type(env):
env.reset("easy")
action = Action(action_type="query_context", query_type="invalid_query")
result = env.step(action)
assert result["reward"]["step_reward"] == -0.05
assert result["info"]["error"] is not None
# ── State Tests ──────────────────────────────────────────────────────────────
def test_state_before_reset(env):
state = env.state()
assert state["done"] is True
assert state["task_id"] is None
def test_state_after_reset(env):
env.reset("easy")
state = env.state()
assert state["task_id"] == "easy"
assert state["done"] is False
assert state["attempts_used"] == 0
def test_state_after_step(env):
env.reset("easy")
action = Action(
action_type="submit_fix",
fixed_code="def binary_search(arr, target): return -1",
hypothesis="Testing",
)
env.step(action)
state = env.state()
assert state["attempts_used"] == 1
assert state["step_number"] == 1
assert len(state["all_hypotheses"]) == 1
# ── Attempts Exhaustion Tests ────────────────────────────────────────────────
def test_attempts_exhausted(env):
env.reset("easy")
for i in range(5):
action = Action(
action_type="submit_fix",
fixed_code=f"def binary_search(arr, target): return {i}",
hypothesis=f"Attempt {i + 1}",
)
result = env.step(action)
# After 5 attempts, episode should be done (max_attempts=5)
assert result["done"] is True or result["observation"]["attempts_remaining"] == 0
# Trying another fix should either fail or episode is done
if not result["done"]:
action = Action(
action_type="submit_fix",
fixed_code="def binary_search(arr, target): return -1",
hypothesis="Extra attempt",
)
result = env.step(action)
assert result["info"]["error"] is not None