shank committed on
Commit
b658e10
·
1 Parent(s): ee08016

fix: score floor for medium grader, add root and tasks endpoints

Browse files
env/graders/grader_medium.py CHANGED
@@ -48,7 +48,12 @@ class MediumGrader(BaseGrader):
48
  ground_truth = task_config["ground_truth"]
49
 
50
  # 1. Test pass ratio (weight: 0.60)
51
- test_pass_ratio = (best_tests_passed / tests_total) if tests_total > 0 else 0.0
 
 
 
 
 
52
  test_score = test_pass_ratio * 0.60
53
 
54
  # 2. Efficiency bonus (weight: 0.20)
 
48
  ground_truth = task_config["ground_truth"]
49
 
50
  # 1. Test pass ratio (weight: 0.60)
51
+
52
+ if attempts:
53
+ agent_best = max(a.get("tests_passed",0) for a in attempts)
54
+ else:
55
+ agent_best = 0
56
+ test_pass_ratio = (agent_best / tests_total) if tests_total > 0 else 0.0
57
  test_score = test_pass_ratio * 0.60
58
 
59
  # 2. Efficiency bonus (weight: 0.20)
env/server.py CHANGED
@@ -31,6 +31,78 @@ class ResetRequest(BaseModel):
31
  task_id: Optional[str] = "easy"
32
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  @app.get("/health")
35
  async def health():
36
  """Health check endpoint to verify server availability."""
 
31
  task_id: Optional[str] = "easy"
32
 
33
 
34
+ @app.get("/")
35
+ async def root():
36
+ return {
37
+ "name": "AgentDebuggerEnv",
38
+ "version": "1.0.0",
39
+ "description": (
40
+ "An OpenEnv-compliant environment where AI agents debug broken code "
41
+ "through iterative hypothesis-test-fix cycles. Unlike static benchmarks, "
42
+ "agents act in a live sandbox and observe real execution output each step."
43
+ ),
44
+ "openenv_compliant": True,
45
+ "domain": "software_engineering",
46
+ "endpoints": {
47
+ "GET /": "This overview",
48
+ "GET /health": "Health check — returns 200 if server is live",
49
+ "GET /tasks": "List all available tasks with metadata",
50
+ "GET /state": "Current episode state",
51
+ "POST /reset": "Start a new episode. Body: {\"task_id\": \"easy\"|\"medium\"|\"hard\"}",
52
+ "POST /step": "Submit one action. Body: Action JSON",
53
+ },
54
+ "tasks": list_tasks(),
55
+ "reward_type": "dense",
56
+ "action_types": ["submit_fix", "query_context", "give_up"],
57
+ }
58
+
59
+
60
+ @app.get("/tasks")
61
+ async def get_tasks():
62
+ return {
63
+ "tasks": [
64
+ {
65
+ "id": "easy",
66
+ "name": "Single Function Off-By-One Bug",
67
+ "difficulty": "easy",
68
+ "max_attempts": 5,
69
+ "max_steps": 8,
70
+ "tests_total": 8,
71
+ "description": (
72
+ "Binary search with an off-by-one termination condition. "
73
+ "Error message is clear and high-signal. 1-2 iterations expected."
74
+ ),
75
+ },
76
+ {
77
+ "id": "medium",
78
+ "name": "Red Herring Authentication Bug",
79
+ "difficulty": "medium",
80
+ "max_attempts": 7,
81
+ "max_steps": 15,
82
+ "tests_total": 10,
83
+ "description": (
84
+ "Authentication module where the error message points to the wrong "
85
+ "function. Agent must trace data flow backwards from symptom to root cause "
86
+ "and resist the red herring."
87
+ ),
88
+ },
89
+ {
90
+ "id": "hard",
91
+ "name": "Concurrency Race Condition",
92
+ "difficulty": "hard",
93
+ "max_attempts": 10,
94
+ "max_steps": 25,
95
+ "tests_total": 8,
96
+ "description": (
97
+ "Thread-safe counter with a race condition invisible to all sequential tests. "
98
+ "Agent must recognize that passing tests are insufficient proof of correctness, "
99
+ "design a concurrent stress test to surface the bug, then fix the atomicity issue."
100
+ ),
101
+ },
102
+ ]
103
+ }
104
+
105
+
106
  @app.get("/health")
107
  async def health():
108
  """Health check endpoint to verify server availability."""
env/tasks/task_medium.py CHANGED
@@ -16,49 +16,6 @@ and authenticate_user. Some tests are failing with errors pointing to authentica
16
  when it should return True. The module handles password hashing with MD5, password validation by comparing
17
  hashes, and user authentication against a user database. Debug the module to make all tests pass."""
18
 
19
- BUGGY_CODE = '''import hashlib
20
-
21
- def hash_password(password: str) -> str:
22
- """Hash a password using MD5 and return the hex digest string."""
23
- password_bytes = password.encode('utf-8')
24
- hash_obj = hashlib.md5(password_bytes)
25
- # BUG: str() wrapping of bytes adds "b'" prefix and "'" suffix
26
- return str(hash_obj.digest().hex()) # Looks correct but the intermediate .digest().hex()
27
- # differs subtly from .hexdigest() in edge cases involving the str() conversion path
28
-
29
- def validate_password(password: str, stored_hash: str) -> bool:
30
- """Check if password matches the stored hash."""
31
- computed_hash = hash_password(password)
32
- return computed_hash == stored_hash
33
-
34
- def authenticate_user(username: str, password: str, user_db: dict) -> bool:
35
- """Authenticate a user against the database.
36
-
37
- Args:
38
- username: The username to authenticate
39
- password: The password to validate
40
- user_db: Dict mapping usernames to {'password_hash': str, 'active': bool}
41
-
42
- Returns:
43
- True if user exists, is active, and password matches
44
- """
45
- if username not in user_db:
46
- return False
47
- user = user_db[username]
48
- if not user.get('active', False):
49
- return False
50
- return validate_password(password, user['password_hash'])
51
- '''
52
-
53
- # The actual bug we'll introduce: the hash function uses a different path
54
- # When user_db entries are created with hashlib.md5().hexdigest() directly,
55
- # but hash_password uses str(hashlib.md5().digest().hex()), the results differ
56
- # because digest().hex() and hexdigest() should be the same, BUT we make the bug
57
- # more obvious: hash_password actually does str(bytes(hexdigest, 'utf-8')) which
58
- # adds the b'' wrapping.
59
-
60
- # Let me redesign: the bug is that hash_password converts to bytes then back to str
61
- # which adds "b'" prefix. The user_db stores hashes created by a DIFFERENT code path.
62
 
63
  BUGGY_CODE = '''import hashlib
64
 
 
16
  when it should return True. The module handles password hashing with MD5, password validation by comparing
17
  hashes, and user authentication against a user database. Debug the module to make all tests pass."""
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  BUGGY_CODE = '''import hashlib
21