omkarrr88 committed on
Commit ·
d8eeec6
1
Parent(s): 7336adb
minor changes
Browse files
- README.md +1 -1
- inference.py +17 -26
- pyproject.toml +1 -1
- server/app.py +1 -1
- test_results.txt +278 -0
- tests/test_endpoints.py +2 -2
README.md
CHANGED
|
@@ -183,7 +183,7 @@ API_BASE_URL=https://api.openai.com/v1 MODEL_NAME=gpt-4o OPENAI_API_KEY=sk-... p
|
|
| 183 |
|
| 184 |
| Endpoint | Method | Description |
|
| 185 |
|----------|--------|-------------|
|
| 186 |
-
| `/health` | GET | `{"status": "
|
| 187 |
| `/tasks` | GET | Task list with IDs, difficulties, action schema |
|
| 188 |
| `/grader` | POST | Score for last completed episode |
|
| 189 |
| `/baseline` | POST | Run heuristic on all tasks, return scores |
|
|
|
|
| 183 |
|
| 184 |
| Endpoint | Method | Description |
|
| 185 |
|----------|--------|-------------|
|
| 186 |
+
| `/health` | GET | `{"status": "healthy", "tasks": 7}` |
|
| 187 |
| `/tasks` | GET | Task list with IDs, difficulties, action schema |
|
| 188 |
| `/grader` | POST | Score for last completed episode |
|
| 189 |
| `/baseline` | POST | Run heuristic on all tasks, return scores |
|
inference.py
CHANGED
|
@@ -24,12 +24,12 @@ import asyncio
|
|
| 24 |
import json
|
| 25 |
import os
|
| 26 |
import sys
|
| 27 |
-
from typing import Optional
|
| 28 |
|
| 29 |
try:
|
| 30 |
from openai import OpenAI
|
| 31 |
except ImportError:
|
| 32 |
-
print("Error: openai package not installed. Run: pip install openai",
|
| 33 |
sys.exit(1)
|
| 34 |
|
| 35 |
from openenv.core import GenericAction, GenericEnvClient
|
|
@@ -74,7 +74,7 @@ def log_step(
|
|
| 74 |
)
|
| 75 |
|
| 76 |
|
| 77 |
-
def log_end(success: bool, steps: int, score: float, rewards:
|
| 78 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 79 |
print(
|
| 80 |
f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
|
|
@@ -155,7 +155,7 @@ def get_model_message(
|
|
| 155 |
step: int,
|
| 156 |
last_obs_summary: dict,
|
| 157 |
last_reward: float,
|
| 158 |
-
history:
|
| 159 |
) -> str:
|
| 160 |
"""Get next action from the LLM."""
|
| 161 |
history_ctx = "\n".join(history[-5:]) if history else "No previous steps."
|
|
@@ -179,7 +179,7 @@ def get_model_message(
|
|
| 179 |
text = (completion.choices[0].message.content or "").strip()
|
| 180 |
return text if text else FALLBACK_ACTION
|
| 181 |
except Exception as exc:
|
| 182 |
-
print(f"[DEBUG] Model request failed: {exc}",
|
| 183 |
return FALLBACK_ACTION
|
| 184 |
|
| 185 |
|
|
@@ -198,11 +198,11 @@ def parse_action(raw: str) -> str:
|
|
| 198 |
async def main() -> None:
|
| 199 |
if not API_KEY:
|
| 200 |
print(
|
| 201 |
-
"Error: OPENAI_API_KEY or HF_TOKEN required.",
|
| 202 |
)
|
| 203 |
sys.exit(1)
|
| 204 |
|
| 205 |
-
|
| 206 |
|
| 207 |
# Connect to environment via standard OpenEnv client
|
| 208 |
if IMAGE_NAME:
|
|
@@ -211,8 +211,8 @@ async def main() -> None:
|
|
| 211 |
env = GenericEnvClient(base_url=ENV_URL, message_timeout_s=120.0)
|
| 212 |
await env.connect()
|
| 213 |
|
| 214 |
-
history:
|
| 215 |
-
rewards:
|
| 216 |
steps_taken = 0
|
| 217 |
score = 0.0
|
| 218 |
success = False
|
|
@@ -220,9 +220,8 @@ async def main() -> None:
|
|
| 220 |
log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
|
| 221 |
|
| 222 |
try:
|
| 223 |
-
# Reset environment for the selected task
|
| 224 |
result = await env.reset(task_id=TASK_NAME, seed=42)
|
| 225 |
-
obs = result.observation
|
| 226 |
last_reward = 0.0
|
| 227 |
|
| 228 |
for step in range(1, MAX_STEPS + 1):
|
|
@@ -230,10 +229,9 @@ async def main() -> None:
|
|
| 230 |
break
|
| 231 |
|
| 232 |
obs_summary = _build_obs_summary(obs)
|
| 233 |
-
raw = get_model_message(
|
| 234 |
action_str = parse_action(raw)
|
| 235 |
|
| 236 |
-
# Step via standard OpenEnv client API
|
| 237 |
action = GenericAction(json.loads(action_str))
|
| 238 |
result = await env.step(action)
|
| 239 |
obs = result.observation
|
|
@@ -250,30 +248,23 @@ async def main() -> None:
|
|
| 250 |
steps_taken = step
|
| 251 |
last_reward = reward
|
| 252 |
|
| 253 |
-
log_step(
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
reward=reward,
|
| 257 |
-
done=done,
|
| 258 |
-
error=error,
|
| 259 |
-
)
|
| 260 |
-
history.append(f"Step {step}: {action_str} -> reward {reward:+.2f}")
|
| 261 |
|
| 262 |
if done:
|
| 263 |
break
|
| 264 |
|
| 265 |
score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
|
| 266 |
-
score = min(max(score, 0.0), 1.0)
|
| 267 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 268 |
|
| 269 |
finally:
|
| 270 |
try:
|
| 271 |
await env.close()
|
| 272 |
except Exception as e:
|
| 273 |
-
print(f"[DEBUG] env.close() error: {e}",
|
| 274 |
-
log_end(
|
| 275 |
-
success=success, steps=steps_taken, score=score, rewards=rewards
|
| 276 |
-
)
|
| 277 |
|
| 278 |
|
| 279 |
if __name__ == "__main__":
|
|
|
|
| 24 |
import json
|
| 25 |
import os
|
| 26 |
import sys
|
| 27 |
+
from typing import List, Optional
|
| 28 |
|
| 29 |
try:
|
| 30 |
from openai import OpenAI
|
| 31 |
except ImportError:
|
| 32 |
+
print("Error: openai package not installed. Run: pip install openai", flush=True)
|
| 33 |
sys.exit(1)
|
| 34 |
|
| 35 |
from openenv.core import GenericAction, GenericEnvClient
|
|
|
|
| 74 |
)
|
| 75 |
|
| 76 |
|
| 77 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
|
| 78 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 79 |
print(
|
| 80 |
f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
|
|
|
|
| 155 |
step: int,
|
| 156 |
last_obs_summary: dict,
|
| 157 |
last_reward: float,
|
| 158 |
+
history: List[str],
|
| 159 |
) -> str:
|
| 160 |
"""Get next action from the LLM."""
|
| 161 |
history_ctx = "\n".join(history[-5:]) if history else "No previous steps."
|
|
|
|
| 179 |
text = (completion.choices[0].message.content or "").strip()
|
| 180 |
return text if text else FALLBACK_ACTION
|
| 181 |
except Exception as exc:
|
| 182 |
+
print(f"[DEBUG] Model request failed: {exc}", flush=True)
|
| 183 |
return FALLBACK_ACTION
|
| 184 |
|
| 185 |
|
|
|
|
| 198 |
async def main() -> None:
|
| 199 |
if not API_KEY:
|
| 200 |
print(
|
| 201 |
+
"Error: OPENAI_API_KEY or HF_TOKEN required.", flush=True
|
| 202 |
)
|
| 203 |
sys.exit(1)
|
| 204 |
|
| 205 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 206 |
|
| 207 |
# Connect to environment via standard OpenEnv client
|
| 208 |
if IMAGE_NAME:
|
|
|
|
| 211 |
env = GenericEnvClient(base_url=ENV_URL, message_timeout_s=120.0)
|
| 212 |
await env.connect()
|
| 213 |
|
| 214 |
+
history: List[str] = []
|
| 215 |
+
rewards: List[float] = []
|
| 216 |
steps_taken = 0
|
| 217 |
score = 0.0
|
| 218 |
success = False
|
|
|
|
| 220 |
log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
|
| 221 |
|
| 222 |
try:
|
|
|
|
| 223 |
result = await env.reset(task_id=TASK_NAME, seed=42)
|
| 224 |
+
obs = result.observation
|
| 225 |
last_reward = 0.0
|
| 226 |
|
| 227 |
for step in range(1, MAX_STEPS + 1):
|
|
|
|
| 229 |
break
|
| 230 |
|
| 231 |
obs_summary = _build_obs_summary(obs)
|
| 232 |
+
raw = get_model_message(client, step, obs_summary, last_reward, history)
|
| 233 |
action_str = parse_action(raw)
|
| 234 |
|
|
|
|
| 235 |
action = GenericAction(json.loads(action_str))
|
| 236 |
result = await env.step(action)
|
| 237 |
obs = result.observation
|
|
|
|
| 248 |
steps_taken = step
|
| 249 |
last_reward = reward
|
| 250 |
|
| 251 |
+
log_step(step=step, action=action_str, reward=reward, done=done, error=error)
|
| 252 |
+
|
| 253 |
+
history.append(f"Step {step}: {action_str!r} -> reward {reward:+.2f}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
if done:
|
| 256 |
break
|
| 257 |
|
| 258 |
score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
|
| 259 |
+
score = min(max(score, 0.0), 1.0) # clamp to [0, 1]
|
| 260 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 261 |
|
| 262 |
finally:
|
| 263 |
try:
|
| 264 |
await env.close()
|
| 265 |
except Exception as e:
|
| 266 |
+
print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
|
| 267 |
+
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
|
|
|
|
|
|
| 268 |
|
| 269 |
|
| 270 |
if __name__ == "__main__":
|
pyproject.toml
CHANGED
|
@@ -4,7 +4,7 @@ version = "1.1.0"
|
|
| 4 |
description = "OpenEnv RL environment for PyTorch training failure debugging"
|
| 5 |
requires-python = ">=3.12"
|
| 6 |
dependencies = [
|
| 7 |
-
"torch",
|
| 8 |
"openenv-core",
|
| 9 |
"pydantic>=2.0",
|
| 10 |
"fastapi",
|
|
|
|
| 4 |
description = "OpenEnv RL environment for PyTorch training failure debugging"
|
| 5 |
requires-python = ">=3.12"
|
| 6 |
dependencies = [
|
| 7 |
+
"torch>=2.5.1",
|
| 8 |
"openenv-core",
|
| 9 |
"pydantic>=2.0",
|
| 10 |
"fastapi",
|
server/app.py
CHANGED
|
@@ -85,7 +85,7 @@ def root() -> RedirectResponse:
|
|
| 85 |
@app.get("/health")
|
| 86 |
def health_check() -> dict:
|
| 87 |
"""Health check — required by hackathon auto-validator."""
|
| 88 |
-
return {"status": "
|
| 89 |
|
| 90 |
|
| 91 |
@app.get("/dashboard", response_class=HTMLResponse)
|
|
|
|
| 85 |
@app.get("/health")
|
| 86 |
def health_check() -> dict:
|
| 87 |
"""Health check — required by hackathon auto-validator."""
|
| 88 |
+
return {"status": "healthy", "tasks": len(ALL_TASKS)}
|
| 89 |
|
| 90 |
|
| 91 |
@app.get("/dashboard", response_class=HTMLResponse)
|
test_results.txt
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
============================= test session starts ==============================
|
| 2 |
+
platform linux -- Python 3.12.3, pytest-9.0.2, pluggy-1.6.0 -- /home/omkar-kadam/Desktop/Rubacus/ML Debugger/.venv/bin/python3
|
| 3 |
+
cachedir: .pytest_cache
|
| 4 |
+
rootdir: /home/omkar-kadam/Desktop/Rubacus/ML Debugger
|
| 5 |
+
configfile: pyproject.toml
|
| 6 |
+
plugins: cov-7.1.0, anyio-4.13.0, asyncio-1.3.0
|
| 7 |
+
asyncio: mode=Mode.AUTO, debug=False, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function
|
| 8 |
+
collecting ... collected 245 items
|
| 9 |
+
|
| 10 |
+
tests/test_baseline_reproducibility.py::TestBaselineReproducibility::test_two_runs_identical PASSED [ 0%]
|
| 11 |
+
tests/test_baseline_reproducibility.py::TestBaselineReproducibility::test_all_scores_in_range PASSED [ 0%]
|
| 12 |
+
tests/test_baseline_reproducibility.py::TestBaselineReproducibility::test_scores_have_meaningful_variance PASSED [ 1%]
|
| 13 |
+
tests/test_client.py::TestMLTrainingEnvClient::test_can_instantiate PASSED [ 1%]
|
| 14 |
+
tests/test_client.py::TestMLTrainingEnvClient::test_is_generic_env_client PASSED [ 2%]
|
| 15 |
+
tests/test_code_templates.py::TestGenerateCodeSnippet::test_eval_mode PASSED [ 2%]
|
| 16 |
+
tests/test_code_templates.py::TestGenerateCodeSnippet::test_detach_loss PASSED [ 2%]
|
| 17 |
+
tests/test_code_templates.py::TestGenerateCodeSnippet::test_zero_grad_missing PASSED [ 3%]
|
| 18 |
+
tests/test_code_templates.py::TestGenerateCodeSnippet::test_inplace_relu PASSED [ 3%]
|
| 19 |
+
tests/test_code_templates.py::TestGenerateCodeSnippet::test_unknown_bug_raises PASSED [ 4%]
|
| 20 |
+
tests/test_code_templates.py::TestValidateFix::test_eval_mode_correct_fix PASSED [ 4%]
|
| 21 |
+
tests/test_code_templates.py::TestValidateFix::test_eval_mode_with_whitespace PASSED [ 4%]
|
| 22 |
+
tests/test_code_templates.py::TestValidateFix::test_eval_mode_wrong_fix PASSED [ 5%]
|
| 23 |
+
tests/test_code_templates.py::TestValidateFix::test_detach_loss_correct_fix PASSED [ 5%]
|
| 24 |
+
tests/test_code_templates.py::TestValidateFix::test_detach_loss_with_trailing_spaces PASSED [ 6%]
|
| 25 |
+
tests/test_code_templates.py::TestValidateFix::test_zero_grad_correct_fix PASSED [ 6%]
|
| 26 |
+
tests/test_code_templates.py::TestValidateFix::test_inplace_relu_correct_fix PASSED [ 6%]
|
| 27 |
+
tests/test_code_templates.py::TestValidateFix::test_wrong_line_number PASSED [ 7%]
|
| 28 |
+
tests/test_code_templates.py::TestValidateFix::test_unknown_bug_type PASSED [ 7%]
|
| 29 |
+
tests/test_code_templates_edge.py::TestNormalizeCode::test_strips_whitespace PASSED [ 8%]
|
| 30 |
+
tests/test_code_templates_edge.py::TestNormalizeCode::test_multiline PASSED [ 8%]
|
| 31 |
+
tests/test_code_templates_edge.py::TestTokenizeCompare::test_identical_tokens PASSED [ 8%]
|
| 32 |
+
tests/test_code_templates_edge.py::TestTokenizeCompare::test_whitespace_ignored PASSED [ 9%]
|
| 33 |
+
tests/test_code_templates_edge.py::TestTokenizeCompare::test_different_tokens PASSED [ 9%]
|
| 34 |
+
tests/test_code_templates_edge.py::TestTokenizeCompare::test_invalid_syntax PASSED [ 10%]
|
| 35 |
+
tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_eval_mode_ast_fallback_with_train_keyword PASSED [ 10%]
|
| 36 |
+
tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_detach_loss_ast_without_detach PASSED [ 11%]
|
| 37 |
+
tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_inplace_relu_ast_without_inplace PASSED [ 11%]
|
| 38 |
+
tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_eval_mode_line_zero_invalid PASSED [ 11%]
|
| 39 |
+
tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_detach_loss_syntax_error_rejected PASSED [ 12%]
|
| 40 |
+
tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_zero_grad_with_comment PASSED [ 12%]
|
| 41 |
+
tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_zero_grad_without_keyword PASSED [ 13%]
|
| 42 |
+
tests/test_code_templates_edge.py::TestValidateFixSemanticPatterns::test_eval_mode_semantic_train_present PASSED [ 13%]
|
| 43 |
+
tests/test_code_templates_edge.py::TestValidateFixSemanticPatterns::test_eval_mode_with_eval_keyword_fails PASSED [ 13%]
|
| 44 |
+
tests/test_code_templates_edge.py::TestValidateFixSemanticPatterns::test_detach_loss_criterion_without_detach PASSED [ 14%]
|
| 45 |
+
tests/test_code_templates_edge.py::TestValidateFixSemanticPatterns::test_inplace_relu_without_inplace_flag PASSED [ 14%]
|
| 46 |
+
tests/test_code_templates_edge.py::TestGenerateCodeSnippetHints::test_eval_mode_has_hint PASSED [ 15%]
|
| 47 |
+
tests/test_code_templates_edge.py::TestGenerateCodeSnippetHints::test_detach_loss_has_hint PASSED [ 15%]
|
| 48 |
+
tests/test_code_templates_edge.py::TestGenerateCodeSnippetHints::test_zero_grad_no_hint PASSED [ 15%]
|
| 49 |
+
tests/test_code_templates_edge.py::TestGenerateCodeSnippetHints::test_inplace_relu_no_hint PASSED [ 16%]
|
| 50 |
+
tests/test_endpoints.py::TestHealthEndpoint::test_returns_healthy PASSED [ 16%]
|
| 51 |
+
tests/test_endpoints.py::TestHealthEndpoint::test_task_count_matches_all_tasks PASSED [ 17%]
|
| 52 |
+
tests/test_endpoints.py::TestTasksEndpoint::test_returns_six_tasks PASSED [ 17%]
|
| 53 |
+
tests/test_endpoints.py::TestTasksEndpoint::test_tasks_have_action_schema PASSED [ 17%]
|
| 54 |
+
tests/test_endpoints.py::TestTasksEndpoint::test_tasks_have_difficulty_and_max_steps PASSED [ 18%]
|
| 55 |
+
tests/test_endpoints.py::TestGraderEndpoint::test_no_completed_episode PASSED [ 18%]
|
| 56 |
+
tests/test_endpoints.py::TestGraderEndpoint::test_grader_after_completed_episode PASSED [ 19%]
|
| 57 |
+
tests/test_endpoints.py::TestGraderEndpoint::test_grader_with_session_id PASSED [ 19%]
|
| 58 |
+
tests/test_endpoints.py::TestBaselineEndpoint::test_baseline_returns_scores PASSED [ 20%]
|
| 59 |
+
tests/test_endpoints.py::TestBaselineEndpoint::test_baseline_scores_in_valid_range PASSED [ 20%]
|
| 60 |
+
tests/test_endpoints.py::TestDashboardEndpoint::test_returns_html PASSED [ 20%]
|
| 61 |
+
tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_001_exploding PASSED [ 21%]
|
| 62 |
+
tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_002_vanishing PASSED [ 21%]
|
| 63 |
+
tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_003_leakage PASSED [ 22%]
|
| 64 |
+
tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_004_overfitting PASSED [ 22%]
|
| 65 |
+
tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_005_batchnorm PASSED [ 22%]
|
| 66 |
+
tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_006_code_bug PASSED [ 23%]
|
| 67 |
+
tests/test_endpoints.py::TestGetScore::test_no_session PASSED [ 23%]
|
| 68 |
+
tests/test_endpoints.py::TestGetScore::test_with_session PASSED [ 24%]
|
| 69 |
+
tests/test_endpoints.py::TestRunBaselineSync::test_returns_all_tasks PASSED [ 24%]
|
| 70 |
+
tests/test_endpoints.py::TestRunBaselineSync::test_reproducible PASSED [ 24%]
|
| 71 |
+
tests/test_episode_lifecycle.py::TestReset::test_reset_returns_valid_observation PASSED [ 25%]
|
| 72 |
+
tests/test_episode_lifecycle.py::TestReset::test_reset_initial_state PASSED [ 25%]
|
| 73 |
+
tests/test_episode_lifecycle.py::TestReset::test_reset_progressive_reveal PASSED [ 26%]
|
| 74 |
+
tests/test_episode_lifecycle.py::TestReset::test_reset_available_actions PASSED [ 26%]
|
| 75 |
+
tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_gradients_populates_stats PASSED [ 26%]
|
| 76 |
+
tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_gradients_gives_investigation_bonus PASSED [ 27%]
|
| 77 |
+
tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_data_batch_gives_investigation_bonus PASSED [ 27%]
|
| 78 |
+
tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_model_modes_gives_investigation_bonus PASSED [ 28%]
|
| 79 |
+
tests/test_episode_lifecycle.py::TestStepInspections::test_repeat_inspection_no_bonus PASSED [ 28%]
|
| 80 |
+
tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_data_batch PASSED [ 28%]
|
| 81 |
+
tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_model_modes PASSED [ 29%]
|
| 82 |
+
tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_model_weights PASSED [ 29%]
|
| 83 |
+
tests/test_episode_lifecycle.py::TestStepFixActions::test_modify_config PASSED [ 30%]
|
| 84 |
+
tests/test_episode_lifecycle.py::TestStepFixActions::test_restart_run_after_fix PASSED [ 30%]
|
| 85 |
+
tests/test_episode_lifecycle.py::TestStepDiagnosis::test_mark_diagnosed_ends_episode PASSED [ 31%]
|
| 86 |
+
tests/test_episode_lifecycle.py::TestStepDiagnosis::test_step_after_done PASSED [ 31%]
|
| 87 |
+
tests/test_episode_lifecycle.py::TestErrorHandling::test_invalid_action_type PASSED [ 31%]
|
| 88 |
+
tests/test_episode_lifecycle.py::TestErrorHandling::test_action_not_in_available PASSED [ 32%]
|
| 89 |
+
tests/test_episode_lifecycle.py::TestErrorHandling::test_modify_config_missing_target PASSED [ 32%]
|
| 90 |
+
tests/test_episode_lifecycle.py::TestErrorHandling::test_mark_diagnosed_missing_diagnosis PASSED [ 33%]
|
| 91 |
+
tests/test_episode_lifecycle.py::TestErrorHandling::test_mark_diagnosed_invalid_diagnosis PASSED [ 33%]
|
| 92 |
+
tests/test_episode_lifecycle.py::TestErrorHandling::test_step_before_reset PASSED [ 33%]
|
| 93 |
+
tests/test_episode_lifecycle.py::TestFullEpisodeFlow::test_task_001_full_flow PASSED [ 34%]
|
| 94 |
+
tests/test_episode_lifecycle.py::TestFullEpisodeFlow::test_task_005_context_gated_penalty PASSED [ 34%]
|
| 95 |
+
tests/test_episode_lifecycle.py::TestFullEpisodeFlow::test_task_003_data_leakage PASSED [ 35%]
|
| 96 |
+
tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_001] PASSED [ 35%]
|
| 97 |
+
tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_002] PASSED [ 35%]
|
| 98 |
+
tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_003] PASSED [ 36%]
|
| 99 |
+
tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_004] PASSED [ 36%]
|
| 100 |
+
tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_005] PASSED [ 37%]
|
| 101 |
+
tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_006] PASSED [ 37%]
|
| 102 |
+
tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_007] PASSED [ 37%]
|
| 103 |
+
tests/test_exploit_resistance.py::TestExploitResistance::test_hard_task_has_variance PASSED [ 38%]
|
| 104 |
+
tests/test_exploit_resistance.py::TestExploitResistance::test_deterministic_per_seed PASSED [ 38%]
|
| 105 |
+
tests/test_graders.py::TestGradeTask001::test_perfect_score PASSED [ 39%]
|
| 106 |
+
tests/test_graders.py::TestGradeTask001::test_wrong_diagnosis PASSED [ 39%]
|
| 107 |
+
tests/test_graders.py::TestGradeTask001::test_no_investigation PASSED [ 40%]
|
| 108 |
+
tests/test_graders.py::TestGradeTask001::test_score_in_range PASSED [ 40%]
|
| 109 |
+
tests/test_graders.py::TestGradeTask003::test_perfect_score PASSED [ 40%]
|
| 110 |
+
tests/test_graders.py::TestGradeTask003::test_wrong_diagnosis PASSED [ 41%]
|
| 111 |
+
tests/test_graders.py::TestGradeTask005::test_perfect_score_thorough PASSED [ 41%]
|
| 112 |
+
tests/test_graders.py::TestGradeTask005::test_quick_fix_partial_credit PASSED [ 42%]
|
| 113 |
+
tests/test_graders.py::TestGradeTask005::test_red_herring_chaser PASSED [ 42%]
|
| 114 |
+
tests/test_graders.py::TestGradeTask005::test_wrong_fix_penalty PASSED [ 42%]
|
| 115 |
+
tests/test_graders.py::TestGradeTask005::test_double_trap_devastates_score PASSED [ 43%]
|
| 116 |
+
tests/test_graders.py::TestGradeEpisode::test_dispatch_to_correct_grader PASSED [ 43%]
|
| 117 |
+
tests/test_graders.py::TestGradeEpisode::test_unknown_task_returns_zero PASSED [ 44%]
|
| 118 |
+
tests/test_graders.py::TestGradeTask006::test_perfect_score_thorough PASSED [ 44%]
|
| 119 |
+
tests/test_graders.py::TestGradeTask006::test_no_weights_inspection_partial PASSED [ 44%]
|
| 120 |
+
tests/test_graders.py::TestGradeTask006::test_minimal_investigation PASSED [ 45%]
|
| 121 |
+
tests/test_graders.py::TestGradeTask006::test_wrong_diagnosis PASSED [ 45%]
|
| 122 |
+
tests/test_graders.py::TestGradeTask006::test_score_in_range PASSED [ 46%]
|
| 123 |
+
tests/test_graders.py::TestGradeTask007::test_perfect_score_thorough PASSED [ 46%]
|
| 124 |
+
tests/test_graders.py::TestGradeTask007::test_no_weights_partial PASSED [ 46%]
|
| 125 |
+
tests/test_graders.py::TestGradeTask007::test_wrong_fix_penalty PASSED [ 47%]
|
| 126 |
+
tests/test_graders.py::TestGradeTask007::test_wrong_diagnosis PASSED [ 47%]
|
| 127 |
+
tests/test_graders.py::TestGradeTask007::test_score_in_range PASSED [ 48%]
|
| 128 |
+
tests/test_graders.py::TestSubmittedDiagnosis::test_finds_diagnosis PASSED [ 48%]
|
| 129 |
+
tests/test_graders.py::TestSubmittedDiagnosis::test_no_diagnosis PASSED [ 48%]
|
| 130 |
+
tests/test_graders.py::TestSubmittedDiagnosis::test_latest_diagnosis PASSED [ 49%]
|
| 131 |
+
tests/test_models.py::TestRootCauseDiagnosis::test_all_values_exist PASSED [ 49%]
|
| 132 |
+
tests/test_models.py::TestRootCauseDiagnosis::test_values_are_strings PASSED [ 50%]
|
| 133 |
+
tests/test_models.py::TestRootCauseDiagnosis::test_specific_values PASSED [ 50%]
|
| 134 |
+
tests/test_models.py::TestTrainingConfig::test_default_instantiation PASSED [ 51%]
|
| 135 |
+
tests/test_models.py::TestTrainingConfig::test_json_roundtrip PASSED [ 51%]
|
| 136 |
+
tests/test_models.py::TestGradientStats::test_exploding PASSED [ 51%]
|
| 137 |
+
tests/test_models.py::TestGradientStats::test_vanishing PASSED [ 52%]
|
| 138 |
+
tests/test_models.py::TestGradientStats::test_normal PASSED [ 52%]
|
| 139 |
+
tests/test_models.py::TestEpisodeState::test_fresh_state PASSED [ 53%]
|
| 140 |
+
tests/test_models.py::TestEpisodeState::test_available_actions_initial PASSED [ 53%]
|
| 141 |
+
tests/test_models.py::TestEpisodeState::test_fix_code_available_after_code_inspected PASSED [ 53%]
|
| 142 |
+
tests/test_models.py::TestEpisodeState::test_restart_run_available_after_fix PASSED [ 54%]
|
| 143 |
+
tests/test_models.py::TestEpisodeState::test_mark_diagnosed_disappears_after_submission PASSED [ 54%]
|
| 144 |
+
tests/test_models.py::TestMLTrainingObservation::test_extends_observation PASSED [ 55%]
|
| 145 |
+
tests/test_models.py::TestMLTrainingObservation::test_has_done_and_reward PASSED [ 55%]
|
| 146 |
+
tests/test_models.py::TestMLTrainingObservation::test_json_serialization PASSED [ 55%]
|
| 147 |
+
tests/test_models.py::TestMLTrainingAction::test_extends_action PASSED [ 56%]
|
| 148 |
+
tests/test_models.py::TestMLTrainingAction::test_basic_action PASSED [ 56%]
|
| 149 |
+
tests/test_models.py::TestMLTrainingAction::test_modify_config_action PASSED [ 57%]
|
| 150 |
+
tests/test_models.py::TestMLTrainingAction::test_mark_diagnosed_action PASSED [ 57%]
|
| 151 |
+
tests/test_models.py::TestMLTrainingAction::test_fix_code_action PASSED [ 57%]
|
| 152 |
+
tests/test_new_endpoints.py::TestCurriculumEndpoint::test_returns_curriculum PASSED [ 58%]
|
| 153 |
+
tests/test_new_endpoints.py::TestCurriculumEndpoint::test_curriculum_has_difficulty_levels PASSED [ 58%]
|
| 154 |
+
tests/test_new_endpoints.py::TestCurriculumEndpoint::test_curriculum_covers_all_tasks PASSED [ 59%]
|
| 155 |
+
tests/test_new_endpoints.py::TestLeaderboardEndpoint::test_returns_leaderboard PASSED [ 59%]
|
| 156 |
+
tests/test_new_endpoints.py::TestLeaderboardEndpoint::test_leaderboard_after_baseline PASSED [ 60%]
|
| 157 |
+
tests/test_new_endpoints.py::TestReplayEndpoint::test_missing_episode PASSED [ 60%]
|
| 158 |
+
tests/test_new_endpoints.py::TestReplayEndpoint::test_replay_after_baseline PASSED [ 60%]
|
| 159 |
+
tests/test_new_endpoints.py::TestValidationReportEndpoint::test_returns_real_report PASSED [ 61%]
|
| 160 |
+
tests/test_pytorch_engine.py::TestSimpleCNN::test_is_nn_module PASSED [ 61%]
|
| 161 |
+
tests/test_pytorch_engine.py::TestSimpleCNN::test_param_count PASSED [ 62%]
|
| 162 |
+
tests/test_pytorch_engine.py::TestSimpleCNN::test_forward_pass PASSED [ 62%]
|
| 163 |
+
tests/test_pytorch_engine.py::TestFaultInjection::test_task_001_exploding_gradients PASSED [ 62%]
|
| 164 |
+
tests/test_pytorch_engine.py::TestFaultInjection::test_task_005_eval_mode PASSED [ 63%]
|
| 165 |
+
tests/test_pytorch_engine.py::TestFaultInjection::test_task_005_gradients_not_exploding PASSED [ 63%]
|
| 166 |
+
tests/test_pytorch_engine.py::TestExtractGradientStats::test_returns_gradient_stats PASSED [ 64%]
|
| 167 |
+
tests/test_pytorch_engine.py::TestExtractWeightStats::test_returns_weight_stats PASSED [ 64%]
|
| 168 |
+
tests/test_pytorch_engine.py::TestExtractModelModes::test_train_mode PASSED [ 64%]
|
| 169 |
+
tests/test_pytorch_engine.py::TestExtractModelModes::test_eval_mode PASSED [ 65%]
|
| 170 |
+
tests/test_pytorch_engine.py::TestTask005RedHerrings::test_conv1_near_vanishing_red_herring PASSED [ 65%]
|
| 171 |
+
tests/test_pytorch_engine.py::TestTask005RedHerrings::test_fc_spike_not_exploding PASSED [ 66%]
|
| 172 |
+
tests/test_pytorch_engine.py::TestTask005RedHerrings::test_all_layers_not_exploding PASSED [ 66%]
|
| 173 |
+
tests/test_pytorch_engine.py::TestVanishingGradientInjection::test_task_002_vanishing PASSED [ 66%]
|
| 174 |
+
tests/test_pytorch_engine.py::TestVanishingGradientInjection::test_task_002_model_in_train_mode PASSED [ 67%]
|
| 175 |
+
tests/test_pytorch_engine.py::TestCodeBugFaultInjection::test_task_006_model_trains_normally PASSED [ 67%]
|
| 176 |
+
tests/test_pytorch_engine.py::TestDataLeakageFaultInjection::test_task_003_normal_model PASSED [ 68%]
|
| 177 |
+
tests/test_real_training.py::TestRunRealTraining::test_returns_20_epoch_curves PASSED [ 68%]
|
| 178 |
+
tests/test_real_training.py::TestRunRealTraining::test_all_values_are_floats PASSED [ 68%]
|
| 179 |
+
tests/test_real_training.py::TestRunRealTraining::test_caching_works PASSED [ 69%]
|
| 180 |
+
tests/test_real_training.py::TestRunRealTraining::test_reproducible_across_calls PASSED [ 69%]
|
| 181 |
+
tests/test_real_training.py::TestRunRealTraining::test_different_seeds_different_curves PASSED [ 70%]
|
| 182 |
+
tests/test_real_training.py::TestRunRealTraining::test_task_001_high_lr_instability PASSED [ 70%]
|
| 183 |
+
tests/test_real_training.py::TestRunRealTraining::test_task_002_vanishing_slow_learning PASSED [ 71%]
|
| 184 |
+
tests/test_real_training.py::TestRunRealTraining::test_task_003_data_leakage PASSED [ 71%]
|
| 185 |
+
tests/test_real_training.py::TestRunRealTraining::test_task_004_overfitting PASSED [ 71%]
|
| 186 |
+
tests/test_real_training.py::TestRunRealTraining::test_task_005_batchnorm_eval PASSED [ 72%]
|
| 187 |
+
tests/test_real_training.py::TestRunRealTraining::test_task_006_code_bug PASSED [ 72%]
|
| 188 |
+
tests/test_real_training.py::TestRunRealTraining::test_task_007_scheduler PASSED [ 73%]
|
| 189 |
+
tests/test_real_training.py::TestRunRealTraining::test_mlp_architecture PASSED [ 73%]
|
| 190 |
+
tests/test_real_training.py::TestSimpleMLP::test_is_nn_module PASSED [ 73%]
|
| 191 |
+
tests/test_real_training.py::TestSimpleMLP::test_param_count PASSED [ 74%]
|
| 192 |
+
tests/test_real_training.py::TestSimpleMLP::test_forward_pass PASSED [ 74%]
|
| 193 |
+
tests/test_real_training.py::TestSimpleMLP::test_has_batchnorm PASSED [ 75%]
|
| 194 |
+
tests/test_reward_engine.py::TestStepPenalty::test_flat_step_penalty PASSED [ 75%]
|
| 195 |
+
tests/test_reward_engine.py::TestStepPenalty::test_step_penalty_not_multiplied_by_step_count PASSED [ 75%]
|
| 196 |
+
tests/test_reward_engine.py::TestInvestigationBonus::test_first_time_bonus PASSED [ 76%]
|
| 197 |
+
tests/test_reward_engine.py::TestInvestigationBonus::test_no_bonus_on_repeat PASSED [ 76%]
|
| 198 |
+
tests/test_reward_engine.py::TestInvestigationBonus::test_each_inspection_type_gives_bonus PASSED [ 77%]
|
| 199 |
+
tests/test_reward_engine.py::TestContextGatedPenalty::test_no_penalty_before_inspection PASSED [ 77%]
|
| 200 |
+
tests/test_reward_engine.py::TestContextGatedPenalty::test_penalty_after_normal_gradients PASSED [ 77%]
|
| 201 |
+
tests/test_reward_engine.py::TestContextGatedPenalty::test_no_penalty_after_abnormal_gradients PASSED [ 78%]
|
| 202 |
+
tests/test_reward_engine.py::TestContextGatedPenalty::test_penalty_only_for_add_callback PASSED [ 78%]
|
| 203 |
+
tests/test_reward_engine.py::TestDiagnosisReward::test_correct_diagnosis PASSED [ 79%]
|
| 204 |
+
tests/test_reward_engine.py::TestDiagnosisReward::test_wrong_diagnosis PASSED [ 79%]
|
| 205 |
+
tests/test_reward_engine.py::TestTerminalConvergence::test_convergence_after_fix_and_restart PASSED [ 80%]
|
| 206 |
+
tests/test_reward_engine.py::TestTerminalConvergence::test_no_convergence_without_fix PASSED [ 80%]
|
| 207 |
+
tests/test_reward_engine.py::TestInvalidAction::test_invalid_action_penalty PASSED [ 80%]
|
| 208 |
+
tests/test_reward_engine.py::TestWrongCodeFix::test_wrong_code_fix_penalty PASSED [ 81%]
|
| 209 |
+
tests/test_reward_engine.py::TestRewardCap::test_reward_capped_at_one PASSED [ 81%]
|
| 210 |
+
tests/test_reward_engine.py::TestRewardCap::test_reward_capped_at_negative_one PASSED [ 82%]
|
| 211 |
+
tests/test_scenarios.py::TestSampleScenario::test_task_001_root_cause PASSED [ 82%]
|
| 212 |
+
tests/test_scenarios.py::TestSampleScenario::test_task_003_root_cause PASSED [ 82%]
|
| 213 |
+
tests/test_scenarios.py::TestSampleScenario::test_task_005_root_cause PASSED [ 83%]
|
| 214 |
+
tests/test_scenarios.py::TestSampleScenario::test_different_seeds_produce_different_params PASSED [ 83%]
|
| 215 |
+
tests/test_scenarios.py::TestSampleScenario::test_same_seed_same_params PASSED [ 84%]
|
| 216 |
+
tests/test_scenarios.py::TestSampleScenario::test_unknown_task_raises PASSED [ 84%]
|
| 217 |
+
tests/test_scenarios.py::TestSampleScenario::test_task_005_has_error_log PASSED [ 84%]
|
| 218 |
+
tests/test_scenarios.py::TestSampleScenario::test_task_003_has_notes PASSED [ 85%]
|
| 219 |
+
tests/test_simulation.py::TestGenLossHistory::test_returns_20_floats PASSED [ 85%]
|
| 220 |
+
tests/test_simulation.py::TestGenLossHistory::test_task_001_has_instability PASSED [ 86%]
|
| 221 |
+
tests/test_simulation.py::TestGenLossHistory::test_task_003_reasonable PASSED [ 86%]
|
| 222 |
+
tests/test_simulation.py::TestGenLossHistory::test_task_005_no_crash PASSED [ 86%]
|
| 223 |
+
tests/test_simulation.py::TestGenValAccuracy::test_returns_20_floats PASSED [ 87%]
|
| 224 |
+
tests/test_simulation.py::TestGenValAccuracy::test_task_003_leakage_shows_higher_acc PASSED [ 87%]
|
| 225 |
+
tests/test_simulation.py::TestGenValAccuracy::test_task_005_low_accuracy PASSED [ 88%]
|
| 226 |
+
tests/test_simulation.py::TestGenValLoss::test_returns_20_floats PASSED [ 88%]
|
| 227 |
+
tests/test_simulation.py::TestGenDataBatchStats::test_leakage_high_overlap PASSED [ 88%]
|
| 228 |
+
tests/test_simulation.py::TestGenDataBatchStats::test_normal_low_overlap PASSED [ 89%]
|
| 229 |
+
tests/test_simulation.py::TestGenDataBatchStats::test_confusion_matrix_present PASSED [ 89%]
|
| 230 |
+
tests/test_simulation_extended.py::TestVanishingGradients::test_loss_barely_decreases PASSED [ 90%]
|
| 231 |
+
tests/test_simulation_extended.py::TestVanishingGradients::test_val_acc_low PASSED [ 90%]
|
| 232 |
+
tests/test_simulation_extended.py::TestVanishingGradients::test_val_loss_present PASSED [ 91%]
|
| 233 |
+
tests/test_simulation_extended.py::TestOverfitting::test_loss_history_present PASSED [ 91%]
|
| 234 |
+
tests/test_simulation_extended.py::TestOverfitting::test_val_acc_present PASSED [ 91%]
|
| 235 |
+
tests/test_simulation_extended.py::TestOverfitting::test_val_loss_present PASSED [ 92%]
|
| 236 |
+
tests/test_simulation_extended.py::TestOverfitting::test_data_batch_stats_clean PASSED [ 92%]
|
| 237 |
+
tests/test_simulation_extended.py::TestCodeBug::test_loss_history PASSED [ 93%]
|
| 238 |
+
tests/test_simulation_extended.py::TestCodeBug::test_val_acc PASSED [ 93%]
|
| 239 |
+
tests/test_simulation_extended.py::TestCodeBug::test_val_loss PASSED [ 93%]
|
| 240 |
+
tests/test_simulation_extended.py::TestBatchNormEval::test_val_loss_present PASSED [ 94%]
|
| 241 |
+
tests/test_simulation_extended.py::TestBatchNormEval::test_val_acc_near_zero PASSED [ 94%]
|
| 242 |
+
tests/test_simulation_extended.py::TestSchedulerMisconfigured::test_loss_history PASSED [ 95%]
|
| 243 |
+
tests/test_simulation_extended.py::TestSchedulerMisconfigured::test_val_acc PASSED [ 95%]
|
| 244 |
+
tests/test_simulation_extended.py::TestSchedulerMisconfigured::test_val_loss PASSED [ 95%]
|
| 245 |
+
tests/test_websocket.py::TestWebSocketEndpoint::test_ws_endpoint_exists PASSED [ 96%]
|
| 246 |
+
tests/test_websocket.py::TestWebSocketEndpoint::test_ws_reset_returns_observation PASSED [ 96%]
|
| 247 |
+
tests/test_websocket.py::TestWebSocketEndpoint::test_ws_reset_with_task_selection PASSED [ 97%]
|
| 248 |
+
tests/test_websocket.py::TestWebSocketEndpoint::test_ws_task_selection_all_tasks PASSED [ 97%]
|
| 249 |
+
tests/test_websocket.py::TestWebSocketEndpoint::test_ws_step_inspect_gradients PASSED [ 97%]
|
| 250 |
+
tests/test_websocket.py::TestWebSocketEndpoint::test_ws_full_episode_flow PASSED [ 98%]
|
| 251 |
+
tests/test_websocket.py::TestWebSocketEndpoint::test_ws_task_005_red_herrings PASSED [ 98%]
|
| 252 |
+
tests/test_websocket.py::TestWebSocketEndpoint::test_ws_task_006_code_inspection PASSED [ 99%]
|
| 253 |
+
tests/test_websocket.py::TestWebSocketEndpoint::test_ws_invalid_message_returns_error PASSED [ 99%]
|
| 254 |
+
tests/test_websocket.py::TestWebSocketEndpoint::test_ws_step_data_batch PASSED [100%]
|
| 255 |
+
|
| 256 |
+
================================ tests coverage ================================
|
| 257 |
+
_______________ coverage: platform linux, python 3.12.3-final-0 ________________
|
| 258 |
+
|
| 259 |
+
Name Stmts Miss Cover Missing
|
| 260 |
+
----------------------------------------------------------------------
|
| 261 |
+
ml_training_debugger/__init__.py 1 0 100%
|
| 262 |
+
ml_training_debugger/client.py 4 0 100%
|
| 263 |
+
ml_training_debugger/code_templates.py 76 6 92% 224-225, 239, 241, 243-244
|
| 264 |
+
ml_training_debugger/graders.py 141 1 99% 39
|
| 265 |
+
ml_training_debugger/models.py 106 0 100%
|
| 266 |
+
ml_training_debugger/nn_models.py 42 0 100%
|
| 267 |
+
ml_training_debugger/pytorch_engine.py 203 4 98% 104, 113-114, 289
|
| 268 |
+
ml_training_debugger/reward_engine.py 38 0 100%
|
| 269 |
+
ml_training_debugger/scenarios.py 64 0 100%
|
| 270 |
+
ml_training_debugger/simulation.py 38 0 100%
|
| 271 |
+
server/__init__.py 0 0 100%
|
| 272 |
+
server/_baseline_results.py 15 0 100%
|
| 273 |
+
server/_heuristic.py 93 9 90% 123-126, 132, 140-146, 156, 176
|
| 274 |
+
server/app.py 96 6 94% 82, 113, 180, 196-198, 202
|
| 275 |
+
server/environment.py 242 27 89% 158-166, 305-318, 338, 380, 393, 443, 458, 508, 513-517, 535-540
|
| 276 |
+
----------------------------------------------------------------------
|
| 277 |
+
TOTAL 1159 53 95%
|
| 278 |
+
======================== 245 passed in 80.39s (0:01:20) ========================
|
tests/test_endpoints.py
CHANGED
|
@@ -27,11 +27,11 @@ def client():
|
|
| 27 |
|
| 28 |
|
| 29 |
class TestHealthEndpoint:
|
| 30 |
-
def
|
| 31 |
resp = client.get("/health")
|
| 32 |
assert resp.status_code == 200
|
| 33 |
data = resp.json()
|
| 34 |
-
assert data["status"] == "
|
| 35 |
assert data["tasks"] == 7
|
| 36 |
|
| 37 |
def test_task_count_matches_all_tasks(self, client):
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
class TestHealthEndpoint:
|
| 30 |
+
def test_returns_healthy(self, client):
|
| 31 |
resp = client.get("/health")
|
| 32 |
assert resp.status_code == 200
|
| 33 |
data = resp.json()
|
| 34 |
+
assert data["status"] == "healthy"
|
| 35 |
assert data["tasks"] == 7
|
| 36 |
|
| 37 |
def test_task_count_matches_all_tasks(self, client):
|