Spaces:

agentDebugger
/

AgentDebugger-training-v3

Running

File size: 3,401 Bytes

159a5fa

"""
AgentDebuggerEnv — Integration Tests
====================================
Verifies the full episode lifecycle: reset -> step -> end.
Exercises the DebuggerEnvironment class directly instead of a running server
(this tests the logic, not the HTTP layer, which is just a thin wrapper).
"""

import pytest
from env.environment import DebuggerEnvironment
from env.models import Action

def test_full_episode_easy():
    """Run the 'easy' task end to end: reset, submit the known fix, expect success."""
    debugger = DebuggerEnvironment()

    # Step 1 - a fresh episode starts unfinished, with at least one failing test.
    initial = debugger.reset("easy")
    assert initial["task_id"] == "easy"
    assert initial["done"] is False
    assert initial["tests_passed"] < initial["tests_total"]

    # Step 2 - submit the reference solution.
    # The easy task is a binary search whose loop uses 'left < right'
    # where it should use 'left <= right'.
    reference_solution = """
def binary_search(arr, target):
    left, right = 0, len(arr) - 1
    while left <= right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1
"""
    submission = Action(
        action_type="submit_fix",
        fixed_code=reference_solution,
        hypothesis="Binary search termination condition should be left <= right to include all elements.",
    )
    outcome = debugger.step(submission)

    # Step 3 - the episode must be over, every test green, and the grade high.
    assert outcome["done"] is True
    final_obs = outcome["observation"]
    assert final_obs["tests_passed"] == final_obs["tests_total"]
    assert outcome["reward"]["grader_score"] > 0.80

def test_query_hint_system():
    """Check that a 'test_suggestion' query on the 'hard' task returns a hint."""
    debugger = DebuggerEnvironment()
    debugger.reset("hard")

    hint_request = Action(action_type="query_context", query_type="test_suggestion")
    outcome = debugger.step(hint_request)

    # The hint text is reported under info -> query_result.
    hint_text = outcome["info"]["query_result"]
    assert "concurrent threads" in hint_text

    # The first query is free, so it must not affect the step reward.
    assert outcome["reward"]["step_reward"] == 0.0

def test_hard_grader_consensus():
    """
    Test that the hard grader runs multiple times and scores a flaky fix.

    ``execute_code`` is patched inside ``env.graders.grader_hard`` so that
    successive calls alternate between a passing and a failing concurrent
    run (3 passes out of 5). The resulting score is compared against the
    expected component breakdown listed next to the final assertion.
    """
    from unittest.mock import patch
    from env.graders.grader_hard import HardGrader

    grader = HardGrader()

    # Mock execute_code to return success 3/5 times
    # Sequence: PASS, FAIL, PASS, FAIL, PASS
    # NOTE(review): each tuple presumably mirrors execute_code's return value
    # (output, timed-out flag, duration) - confirm against the real helper.
    with patch("env.graders.grader_hard.execute_code") as mock_exec:
        mock_exec.side_effect = [
            ("CONCURRENT PASS", False, 100),
            ("CONCURRENT FAIL", False, 100),
            ("CONCURRENT PASS", False, 100),
            ("CONCURRENT FAIL", False, 100),
            ("CONCURRENT PASS", False, 100),
        ]

        score = grader.score(
            task_config={"task_id": "hard", "ground_truth": {"hypothesis_keywords": ["race"]}},
            attempts=[{"tests_passed": 8, "attempt_number": 1, "code_submitted": "..."}],
            best_tests_passed=8,
            tests_total=8,
            attempts_used=1,
            max_attempts=10,
            hypotheses=["race condition"],
        )

        # The docstring promises repeated execution, so check it explicitly;
        # without this the test would also pass for a single-run grader that
        # happened to return the same score.
        assert mock_exec.call_count > 1

        # 3/5 passes → should get partial credit (0.15) for concurrency
        # Sequential: 1.0 * 0.40 = 0.40
        # Concurrency: 0.15
        # Hypothesis: 1.0 * 0.20 = 0.20
        # Efficiency: (concurrent_score == 0.30) is False -> 0.0
        # Total: 0.75
        # The score is a sum of floats, so compare with a tolerance rather
        # than exact equality to stay independent of summation order.
        assert score == pytest.approx(0.75)