Spaces:

agentDebugger
/

AgentDebugger-training-v3

Running

App Files Community

shank committed 27 days ago

Commit

b658e10

1 Parent(s): ee08016

fix: score floor for medium grader, add root and tasks endpoints

Browse files

Files changed (3) hide show

env/graders/grader_medium.py +6 -1
env/server.py +72 -0
env/tasks/task_medium.py +0 -43

env/graders/grader_medium.py CHANGED Viewed

@@ -48,7 +48,12 @@ class MediumGrader(BaseGrader):
         ground_truth = task_config["ground_truth"]
         # 1. Test pass ratio (weight: 0.60)
-        test_pass_ratio = (best_tests_passed / tests_total) if tests_total > 0 else 0.0
         test_score = test_pass_ratio * 0.60
         # 2. Efficiency bonus (weight: 0.20)

         ground_truth = task_config["ground_truth"]
         # 1. Test pass ratio (weight: 0.60)
+        if attempts:
+            agent_best = max(a.get("tests_passed",0) for a in attempts)
+        else:
+            agent_best = 0
+        test_pass_ratio = (agent_best / tests_total) if tests_total > 0 else 0.0
         test_score = test_pass_ratio * 0.60
         # 2. Efficiency bonus (weight: 0.20)

env/server.py CHANGED Viewed

@@ -31,6 +31,78 @@ class ResetRequest(BaseModel):
     task_id: Optional[str] = "easy"
 @app.get("/health")
 async def health():
     """Health check endpoint to verify server availability."""

     task_id: Optional[str] = "easy"
+@app.get("/")
+async def root():
+    return {
+        "name": "AgentDebuggerEnv",
+        "version": "1.0.0",
+        "description": (
+            "An OpenEnv-compliant environment where AI agents debug broken code "
+            "through iterative hypothesis-test-fix cycles. Unlike static benchmarks, "
+            "agents act in a live sandbox and observe real execution output each step."
+        ),
+        "openenv_compliant": True,
+        "domain": "software_engineering",
+        "endpoints": {
+            "GET  /":        "This overview",
+            "GET  /health":  "Health check — returns 200 if server is live",
+            "GET  /tasks":   "List all available tasks with metadata",
+            "GET  /state":   "Current episode state",
+            "POST /reset":   "Start a new episode. Body: {\"task_id\": \"easy\"|\"medium\"|\"hard\"}",
+            "POST /step":    "Submit one action. Body: Action JSON",
+        },
+        "tasks": list_tasks(),
+        "reward_type": "dense",
+        "action_types": ["submit_fix", "query_context", "give_up"],
+    }
+@app.get("/tasks")
+async def get_tasks():
+    return {
+        "tasks": [
+            {
+                "id": "easy",
+                "name": "Single Function Off-By-One Bug",
+                "difficulty": "easy",
+                "max_attempts": 5,
+                "max_steps": 8,
+                "tests_total": 8,
+                "description": (
+                    "Binary search with an off-by-one termination condition. "
+                    "Error message is clear and high-signal. 1-2 iterations expected."
+                ),
+            },
+            {
+                "id": "medium",
+                "name": "Red Herring Authentication Bug",
+                "difficulty": "medium",
+                "max_attempts": 7,
+                "max_steps": 15,
+                "tests_total": 10,
+                "description": (
+                    "Authentication module where the error message points to the wrong "
+                    "function. Agent must trace data flow backwards from symptom to root cause "
+                    "and resist the red herring."
+                ),
+            },
+            {
+                "id": "hard",
+                "name": "Concurrency Race Condition",
+                "difficulty": "hard",
+                "max_attempts": 10,
+                "max_steps": 25,
+                "tests_total": 8,
+                "description": (
+                    "Thread-safe counter with a race condition invisible to all sequential tests. "
+                    "Agent must recognize that passing tests are insufficient proof of correctness, "
+                    "design a concurrent stress test to surface the bug, then fix the atomicity issue."
+                ),
+            },
+        ]
+    }
 @app.get("/health")
 async def health():
     """Health check endpoint to verify server availability."""

env/tasks/task_medium.py CHANGED Viewed

@@ -16,49 +16,6 @@ and authenticate_user. Some tests are failing with errors pointing to authentica
 when it should return True. The module handles password hashing with MD5, password validation by comparing
 hashes, and user authentication against a user database. Debug the module to make all tests pass."""
-BUGGY_CODE = '''import hashlib
-def hash_password(password: str) -> str:
-    """Hash a password using MD5 and return the hex digest string."""
-    password_bytes = password.encode('utf-8')
-    hash_obj = hashlib.md5(password_bytes)
-    # BUG: str() wrapping of bytes adds "b'" prefix and "'" suffix
-    return str(hash_obj.digest().hex())  # Looks correct but the intermediate .digest().hex()
-    # differs subtly from .hexdigest() in edge cases involving the str() conversion path
-def validate_password(password: str, stored_hash: str) -> bool:
-    """Check if password matches the stored hash."""
-    computed_hash = hash_password(password)
-    return computed_hash == stored_hash
-def authenticate_user(username: str, password: str, user_db: dict) -> bool:
-    """Authenticate a user against the database.
-    Args:
-        username: The username to authenticate
-        password: The password to validate
-        user_db: Dict mapping usernames to {'password_hash': str, 'active': bool}
-    Returns:
-        True if user exists, is active, and password matches
-    """
-    if username not in user_db:
-        return False
-    user = user_db[username]
-    if not user.get('active', False):
-        return False
-    return validate_password(password, user['password_hash'])
-'''
-# The actual bug we'll introduce: the hash function uses a different path
-# When user_db entries are created with hashlib.md5().hexdigest() directly,
-# but hash_password uses str(hashlib.md5().digest().hex()), the results differ
-# because digest().hex() and hexdigest() should be the same, BUT we make the bug
-# more obvious: hash_password actually does str(bytes(hexdigest, 'utf-8')) which
-# adds the b'' wrapping.
-# Let me redesign: the bug is that hash_password converts to bytes then back to str
-# which adds "b'" prefix. The user_db stores hashes created by a DIFFERENT code path.
 BUGGY_CODE = '''import hashlib

 when it should return True. The module handles password hashing with MD5, password validation by comparing
 hashes, and user authentication against a user database. Debug the module to make all tests pass."""
 BUGGY_CODE = '''import hashlib