Spaces:

agentDebugger
/

AgentDebugger-training-v3

Running

AgentDebugger-training-v3 / tests /test_environment.py

shank

complete project

0ee66d2 27 days ago

7.22 kB

	"""
	Tests for the core environment — reset, step, state.
	"""

	import pytest
	from env.environment import DebuggerEnvironment
	from env.models import Action


	@pytest.fixture
	def env():
	return DebuggerEnvironment()


	# ── Reset Tests ──────────────────────────────────────────────────────────────

	def test_reset_easy_returns_observation(env):
	obs = env.reset("easy")
	assert obs["task_id"] == "easy"
	assert obs["done"] is False
	assert obs["tests_total"] == 8
	assert obs["attempts_remaining"] == 5
	assert obs["max_attempts"] == 5
	assert obs["step_number"] == 0
	assert obs["buggy_code"] != ""
	assert obs["test_suite"] != ""
	assert obs["initial_error_output"] != ""
	assert obs["previous_attempts"] == []


	def test_reset_medium_returns_observation(env):
	obs = env.reset("medium")
	assert obs["task_id"] == "medium"
	assert obs["tests_total"] == 10
	assert obs["max_attempts"] == 7


	def test_reset_hard_returns_observation(env):
	obs = env.reset("hard")
	assert obs["task_id"] == "hard"
	assert obs["tests_total"] == 8
	assert obs["max_attempts"] == 10


	def test_reset_invalid_task_raises(env):
	with pytest.raises(ValueError, match="Unknown task_id"):
	env.reset("nonexistent")


	def test_reset_clears_previous_state(env):
	env.reset("easy")
	# Do a step
	action = Action(
	action_type="submit_fix",
	fixed_code="def binary_search(arr, target): return -1",
	hypothesis="test hypothesis",
	)
	env.step(action)

	# Reset should clear everything
	obs = env.reset("easy")
	assert obs["step_number"] == 0
	assert obs["previous_attempts"] == []
	assert obs["attempts_remaining"] == 5


	# ── Step Tests ───────────────────────────────────────────────────────────────

	def test_step_submit_fix_without_hypothesis(env):
	env.reset("easy")
	action = Action(action_type="submit_fix", fixed_code="def binary_search(arr, target): return -1")
	result = env.step(action)
	assert result["reward"]["step_reward"] == -0.10
	assert result["info"]["error"] is not None
	assert "hypothesis" in result["info"]["error"].lower()


	def test_step_submit_fix_with_valid_code(env):
	env.reset("easy")
	action = Action(
	action_type="submit_fix",
	fixed_code="def binary_search(arr, target): return -1",
	hypothesis="Testing a fix",
	)
	result = env.step(action)
	assert "observation" in result
	assert "reward" in result
	assert "done" in result
	assert "info" in result
	assert result["observation"]["step_number"] == 1


	def test_step_submit_fix_solves_easy(env):
	env.reset("easy")
	fixed_code = '''def binary_search(arr: list, target: int) -> int:
	left, right = 0, len(arr) - 1
	while left <= right:
	mid = (left + right) // 2
	if arr[mid] == target:
	return mid
	elif arr[mid] < target:
	left = mid + 1
	else:
	right = mid - 1
	return -1
	'''
	action = Action(
	action_type="submit_fix",
	fixed_code=fixed_code,
	hypothesis="Off by one: should be left <= right",
	)
	result = env.step(action)
	assert result["observation"]["tests_passed"] == 8, result["observation"]["current_error_output"]
	assert result["done"] is True
	assert result["reward"]["grader_score"] > 0.0


	def test_step_query_context_first_free(env):
	env.reset("easy")
	action = Action(
	action_type="query_context",
	query_type="error_explanation",
	query_target="binary_search",
	)
	result = env.step(action)
	assert result["reward"]["step_reward"] == 0.0
	assert result["info"]["query_result"] is not None


	def test_step_query_context_second_costs(env):
	env.reset("easy")
	action = Action(
	action_type="query_context",
	query_type="error_explanation",
	)
	env.step(action) # First — free
	result = env.step(action) # Second — costs -0.05
	assert result["reward"]["step_reward"] == -0.05


	def test_step_give_up(env):
	env.reset("easy")
	action = Action(
	action_type="give_up",
	final_diagnosis="I cannot find the bug",
	)
	result = env.step(action)
	assert result["done"] is True
	assert result["reward"]["grader_score"] >= 0.0


	def test_step_after_done(env):
	env.reset("easy")
	action = Action(action_type="give_up", final_diagnosis="done")
	env.step(action)
	result = env.step(Action(action_type="give_up"))
	assert result["info"]["error"] is not None
	assert "already done" in result["info"]["error"].lower()


	def test_step_invalid_action_type(env):
	env.reset("easy")
	action = Action(action_type="invalid_action")
	result = env.step(action)
	assert result["info"]["error"] is not None


	def test_step_invalid_query_type(env):
	env.reset("easy")
	action = Action(action_type="query_context", query_type="invalid_query")
	result = env.step(action)
	assert result["reward"]["step_reward"] == -0.05
	assert result["info"]["error"] is not None


	# ── State Tests ──────────────────────────────────────────────────────────────

	def test_state_before_reset(env):
	state = env.state()
	assert state["done"] is True
	assert state["task_id"] is None


	def test_state_after_reset(env):
	env.reset("easy")
	state = env.state()
	assert state["task_id"] == "easy"
	assert state["done"] is False
	assert state["attempts_used"] == 0


	def test_state_after_step(env):
	env.reset("easy")
	action = Action(
	action_type="submit_fix",
	fixed_code="def binary_search(arr, target): return -1",
	hypothesis="Testing",
	)
	env.step(action)
	state = env.state()
	assert state["attempts_used"] == 1
	assert state["step_number"] == 1
	assert len(state["all_hypotheses"]) == 1


	# ── Attempts Exhaustion Tests ────────────────────────────────────────────────

	def test_attempts_exhausted(env):
	env.reset("easy")
	for i in range(5):
	action = Action(
	action_type="submit_fix",
	fixed_code=f"def binary_search(arr, target): return {i}",
	hypothesis=f"Attempt {i + 1}",
	)
	result = env.step(action)

	# After 5 attempts, episode should be done (max_attempts=5)
	assert result["done"] is True or result["observation"]["attempts_remaining"] == 0

	# Trying another fix should either fail or episode is done
	if not result["done"]:
	action = Action(
	action_type="submit_fix",
	fixed_code="def binary_search(arr, target): return -1",
	hypothesis="Extra attempt",
	)
	result = env.step(action)
	assert result["info"]["error"] is not None