# Spaces:

# agentDebugger
# /

# AgentDebugger-training-v3

# Running

# File size: 7,219 Bytes

# 0ee66d2

"""
Tests for the core environment — reset, step, state.
"""

import pytest
from env.environment import DebuggerEnvironment
from env.models import Action


@pytest.fixture
def env():
    """Build a new ``DebuggerEnvironment`` instance for the requesting test."""
    environment = DebuggerEnvironment()
    return environment


# ── Reset Tests ──────────────────────────────────────────────────────────────

def test_reset_easy_returns_observation(env):
    """Resetting to the "easy" task returns a populated initial observation."""
    observation = env.reset("easy")

    # Bookkeeping fields expected right after a reset.
    expected_values = {
        "task_id": "easy",
        "tests_total": 8,
        "attempts_remaining": 5,
        "max_attempts": 5,
        "step_number": 0,
        "previous_attempts": [],
    }
    for field, expected in expected_values.items():
        assert observation[field] == expected, field

    assert observation["done"] is False

    # The task material in the observation must not be empty strings.
    for field in ("buggy_code", "test_suite", "initial_error_output"):
        assert observation[field] != "", field


def test_reset_medium_returns_observation(env):
    """Resetting to "medium" reports 10 tests and a maximum of 7 attempts."""
    observation = env.reset("medium")
    reported = (
        observation["task_id"],
        observation["tests_total"],
        observation["max_attempts"],
    )
    assert reported == ("medium", 10, 7)


def test_reset_hard_returns_observation(env):
    """Resetting to "hard" reports 8 tests and a maximum of 10 attempts."""
    observation = env.reset("hard")
    expected = {"task_id": "hard", "tests_total": 8, "max_attempts": 10}
    # Compare only the fields this test cares about.
    actual = {key: observation[key] for key in expected}
    assert actual == expected


def test_reset_invalid_task_raises(env):
    """An unknown task id makes ``reset`` raise ``ValueError("Unknown task_id…")``."""
    unknown_task = "nonexistent"
    expect_error = pytest.raises(ValueError, match="Unknown task_id")
    with expect_error:
        env.reset(unknown_task)


def test_reset_clears_previous_state(env):
    """A second reset returns counters and history to their initial values."""
    # First episode: take one step so there is state to clear.
    env.reset("easy")
    env.step(
        Action(
            action_type="submit_fix",
            fixed_code="def binary_search(arr, target): return -1",
            hypothesis="test hypothesis",
        )
    )

    # Second episode: the observation must look like a brand-new one.
    fresh = env.reset("easy")
    assert (fresh["step_number"], fresh["attempts_remaining"]) == (0, 5)
    assert fresh["previous_attempts"] == []


# ── Step Tests ───────────────────────────────────────────────────────────────

def test_step_submit_fix_without_hypothesis(env):
    """Submitting a fix with no hypothesis is penalised and reports an error.

    The step must yield a ``step_reward`` of -0.10 and an ``info["error"]``
    message that mentions the hypothesis.
    """
    env.reset("easy")
    action = Action(action_type="submit_fix", fixed_code="def binary_search(arr, target): return -1")
    result = env.step(action)
    # The expected reward is a float; compare with a tolerance so the test
    # does not depend on how the environment arrives at the value.
    assert result["reward"]["step_reward"] == pytest.approx(-0.10)
    assert result["info"]["error"] is not None
    assert "hypothesis" in result["info"]["error"].lower()


def test_step_submit_fix_with_valid_code(env):
    """A ``submit_fix`` step with a hypothesis returns all four result sections."""
    env.reset("easy")
    submission = Action(
        action_type="submit_fix",
        fixed_code="def binary_search(arr, target): return -1",
        hypothesis="Testing a fix",
    )
    outcome = env.step(submission)

    # The step result exposes these top-level keys.
    for section in ("observation", "reward", "done", "info"):
        assert section in outcome
    assert outcome["observation"]["step_number"] == 1


def test_step_submit_fix_solves_easy(env):
    """Submitting this binary search passes all 8 tests and ends the episode."""
    env.reset("easy")

    # Submitted source, one literal per line of code.
    corrected_source = (
        "def binary_search(arr: list, target: int) -> int:\n"
        "    left, right = 0, len(arr) - 1\n"
        "    while left <= right:\n"
        "        mid = (left + right) // 2\n"
        "        if arr[mid] == target:\n"
        "            return mid\n"
        "        elif arr[mid] < target:\n"
        "            left = mid + 1\n"
        "        else:\n"
        "            right = mid - 1\n"
        "    return -1\n"
    )
    outcome = env.step(
        Action(
            action_type="submit_fix",
            fixed_code=corrected_source,
            hypothesis="Off by one: should be left <= right",
        )
    )

    observation = outcome["observation"]
    # On failure, show the environment's error output as the assertion message.
    assert observation["tests_passed"] == 8, observation["current_error_output"]
    assert outcome["done"] is True
    assert outcome["reward"]["grader_score"] > 0.0


def test_step_query_context_first_free(env):
    """The first ``query_context`` step has zero reward and returns a result.

    The step must yield a ``step_reward`` of 0.0 and a non-``None``
    ``info["query_result"]``.
    """
    env.reset("easy")
    action = Action(
        action_type="query_context",
        query_type="error_explanation",
        query_target="binary_search",
    )
    result = env.step(action)
    # Tolerant float comparison instead of exact equality.
    assert result["reward"]["step_reward"] == pytest.approx(0.0)
    assert result["info"]["query_result"] is not None


def test_step_query_context_second_costs(env):
    """A second ``query_context`` step in the same episode costs -0.05."""
    env.reset("easy")
    action = Action(
        action_type="query_context",
        query_type="error_explanation",
    )
    env.step(action)  # First — free
    result = env.step(action)  # Second — costs -0.05
    # Tolerant float comparison instead of exact equality.
    assert result["reward"]["step_reward"] == pytest.approx(-0.05)


def test_step_give_up(env):
    """A ``give_up`` step ends the episode with a non-negative grader score."""
    env.reset("easy")
    outcome = env.step(
        Action(action_type="give_up", final_diagnosis="I cannot find the bug")
    )
    grader_score = outcome["reward"]["grader_score"]
    assert outcome["done"] is True
    assert grader_score >= 0.0


def test_step_after_done(env):
    """A step taken after a ``give_up`` reports an "already done" error."""
    env.reset("easy")
    # First give_up; the test expects the episode to be over afterwards.
    env.step(Action(action_type="give_up", final_diagnosis="done"))

    late_result = env.step(Action(action_type="give_up"))
    error_message = late_result["info"]["error"]
    assert error_message is not None
    assert "already done" in error_message.lower()


def test_step_invalid_action_type(env):
    """An unrecognised ``action_type`` produces an error in the step info."""
    env.reset("easy")
    bogus_action = Action(action_type="invalid_action")
    outcome = env.step(bogus_action)
    reported_error = outcome["info"]["error"]
    assert reported_error is not None


def test_step_invalid_query_type(env):
    """A ``query_context`` step with an unknown query type costs -0.05 and errors.

    The step must yield a ``step_reward`` of -0.05 and a non-``None``
    ``info["error"]``.
    """
    env.reset("easy")
    action = Action(action_type="query_context", query_type="invalid_query")
    result = env.step(action)
    # Tolerant float comparison instead of exact equality.
    assert result["reward"]["step_reward"] == pytest.approx(-0.05)
    assert result["info"]["error"] is not None


# ── State Tests ──────────────────────────────────────────────────────────────

def test_state_before_reset(env):
    """Before any reset, the state is marked done and has no task id."""
    snapshot = env.state()
    assert snapshot["task_id"] is None
    assert snapshot["done"] is True


def test_state_after_reset(env):
    """Right after a reset the state names the task and shows no attempts used."""
    env.reset("easy")
    snapshot = env.state()

    expected = {"task_id": "easy", "attempts_used": 0}
    for key, value in expected.items():
        assert snapshot[key] == value, key
    assert snapshot["done"] is False


def test_state_after_step(env):
    """One ``submit_fix`` step shows up in the state's counters and hypotheses."""
    env.reset("easy")
    env.step(
        Action(
            action_type="submit_fix",
            fixed_code="def binary_search(arr, target): return -1",
            hypothesis="Testing",
        )
    )

    snapshot = env.state()
    assert snapshot["step_number"] == 1
    assert snapshot["attempts_used"] == 1
    recorded_hypotheses = snapshot["all_hypotheses"]
    assert len(recorded_hypotheses) == 1


# ── Attempts Exhaustion Tests ────────────────────────────────────────────────

def test_attempts_exhausted(env):
    """Using every allowed attempt ends the episode or leaves none remaining.

    Submits one constant-returning fix per allowed attempt. The number of
    attempts comes from the ``max_attempts`` value reported by ``reset``
    rather than a hard-coded count. If the environment has not marked the
    episode done after the last attempt, one further submission must be
    answered with an error.
    """
    initial = env.reset("easy")
    max_attempts = initial["max_attempts"]
    # Guard against a vacuous run: the loop below must execute at least once,
    # otherwise ``result`` would never be assigned.
    assert max_attempts > 0

    for i in range(max_attempts):
        action = Action(
            action_type="submit_fix",
            fixed_code=f"def binary_search(arr, target): return {i}",
            hypothesis=f"Attempt {i + 1}",
        )
        result = env.step(action)

    # After the last allowed attempt, the episode should be done
    # or report no attempts remaining.
    assert result["done"] is True or result["observation"]["attempts_remaining"] == 0

    # Trying another fix should either fail or episode is done
    if not result["done"]:
        action = Action(
            action_type="submit_fix",
            fixed_code="def binary_search(arr, target): return -1",
            hypothesis="Extra attempt",
        )
        result = env.step(action)
        assert result["info"]["error"] is not None