omkarrr88 committed on
Commit
d8eeec6
·
1 Parent(s): 7336adb

minor changes

Browse files
Files changed (6) hide show
  1. README.md +1 -1
  2. inference.py +17 -26
  3. pyproject.toml +1 -1
  4. server/app.py +1 -1
  5. test_results.txt +278 -0
  6. tests/test_endpoints.py +2 -2
README.md CHANGED
@@ -183,7 +183,7 @@ API_BASE_URL=https://api.openai.com/v1 MODEL_NAME=gpt-4o OPENAI_API_KEY=sk-... p
183
 
184
  | Endpoint | Method | Description |
185
  |----------|--------|-------------|
186
- | `/health` | GET | `{"status": "ready", "tasks": 7}` |
187
  | `/tasks` | GET | Task list with IDs, difficulties, action schema |
188
  | `/grader` | POST | Score for last completed episode |
189
  | `/baseline` | POST | Run heuristic on all tasks, return scores |
 
183
 
184
  | Endpoint | Method | Description |
185
  |----------|--------|-------------|
186
+ | `/health` | GET | `{"status": "healthy", "tasks": 7}` |
187
  | `/tasks` | GET | Task list with IDs, difficulties, action schema |
188
  | `/grader` | POST | Score for last completed episode |
189
  | `/baseline` | POST | Run heuristic on all tasks, return scores |
inference.py CHANGED
@@ -24,12 +24,12 @@ import asyncio
24
  import json
25
  import os
26
  import sys
27
- from typing import Optional
28
 
29
  try:
30
  from openai import OpenAI
31
  except ImportError:
32
- print("Error: openai package not installed. Run: pip install openai", file=sys.stderr)
33
  sys.exit(1)
34
 
35
  from openenv.core import GenericAction, GenericEnvClient
@@ -74,7 +74,7 @@ def log_step(
74
  )
75
 
76
 
77
- def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
78
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
79
  print(
80
  f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
@@ -155,7 +155,7 @@ def get_model_message(
155
  step: int,
156
  last_obs_summary: dict,
157
  last_reward: float,
158
- history: list[str],
159
  ) -> str:
160
  """Get next action from the LLM."""
161
  history_ctx = "\n".join(history[-5:]) if history else "No previous steps."
@@ -179,7 +179,7 @@ def get_model_message(
179
  text = (completion.choices[0].message.content or "").strip()
180
  return text if text else FALLBACK_ACTION
181
  except Exception as exc:
182
- print(f"[DEBUG] Model request failed: {exc}", file=sys.stderr)
183
  return FALLBACK_ACTION
184
 
185
 
@@ -198,11 +198,11 @@ def parse_action(raw: str) -> str:
198
  async def main() -> None:
199
  if not API_KEY:
200
  print(
201
- "Error: OPENAI_API_KEY or HF_TOKEN required.", file=sys.stderr
202
  )
203
  sys.exit(1)
204
 
205
- llm = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
206
 
207
  # Connect to environment via standard OpenEnv client
208
  if IMAGE_NAME:
@@ -211,8 +211,8 @@ async def main() -> None:
211
  env = GenericEnvClient(base_url=ENV_URL, message_timeout_s=120.0)
212
  await env.connect()
213
 
214
- history: list[str] = []
215
- rewards: list[float] = []
216
  steps_taken = 0
217
  score = 0.0
218
  success = False
@@ -220,9 +220,8 @@ async def main() -> None:
220
  log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
221
 
222
  try:
223
- # Reset environment for the selected task
224
  result = await env.reset(task_id=TASK_NAME, seed=42)
225
- obs = result.observation # dict
226
  last_reward = 0.0
227
 
228
  for step in range(1, MAX_STEPS + 1):
@@ -230,10 +229,9 @@ async def main() -> None:
230
  break
231
 
232
  obs_summary = _build_obs_summary(obs)
233
- raw = get_model_message(llm, step, obs_summary, last_reward, history)
234
  action_str = parse_action(raw)
235
 
236
- # Step via standard OpenEnv client API
237
  action = GenericAction(json.loads(action_str))
238
  result = await env.step(action)
239
  obs = result.observation
@@ -250,30 +248,23 @@ async def main() -> None:
250
  steps_taken = step
251
  last_reward = reward
252
 
253
- log_step(
254
- step=step,
255
- action=action_str,
256
- reward=reward,
257
- done=done,
258
- error=error,
259
- )
260
- history.append(f"Step {step}: {action_str} -> reward {reward:+.2f}")
261
 
262
  if done:
263
  break
264
 
265
  score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
266
- score = min(max(score, 0.0), 1.0)
267
  success = score >= SUCCESS_SCORE_THRESHOLD
268
 
269
  finally:
270
  try:
271
  await env.close()
272
  except Exception as e:
273
- print(f"[DEBUG] env.close() error: {e}", file=sys.stderr)
274
- log_end(
275
- success=success, steps=steps_taken, score=score, rewards=rewards
276
- )
277
 
278
 
279
  if __name__ == "__main__":
 
24
  import json
25
  import os
26
  import sys
27
+ from typing import List, Optional
28
 
29
  try:
30
  from openai import OpenAI
31
  except ImportError:
32
+ print("Error: openai package not installed. Run: pip install openai", flush=True)
33
  sys.exit(1)
34
 
35
  from openenv.core import GenericAction, GenericEnvClient
 
74
  )
75
 
76
 
77
+ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
78
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
79
  print(
80
  f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
 
155
  step: int,
156
  last_obs_summary: dict,
157
  last_reward: float,
158
+ history: List[str],
159
  ) -> str:
160
  """Get next action from the LLM."""
161
  history_ctx = "\n".join(history[-5:]) if history else "No previous steps."
 
179
  text = (completion.choices[0].message.content or "").strip()
180
  return text if text else FALLBACK_ACTION
181
  except Exception as exc:
182
+ print(f"[DEBUG] Model request failed: {exc}", flush=True)
183
  return FALLBACK_ACTION
184
 
185
 
 
198
  async def main() -> None:
199
  if not API_KEY:
200
  print(
201
+ "Error: OPENAI_API_KEY or HF_TOKEN required.", flush=True
202
  )
203
  sys.exit(1)
204
 
205
+ client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
206
 
207
  # Connect to environment via standard OpenEnv client
208
  if IMAGE_NAME:
 
211
  env = GenericEnvClient(base_url=ENV_URL, message_timeout_s=120.0)
212
  await env.connect()
213
 
214
+ history: List[str] = []
215
+ rewards: List[float] = []
216
  steps_taken = 0
217
  score = 0.0
218
  success = False
 
220
  log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
221
 
222
  try:
 
223
  result = await env.reset(task_id=TASK_NAME, seed=42)
224
+ obs = result.observation
225
  last_reward = 0.0
226
 
227
  for step in range(1, MAX_STEPS + 1):
 
229
  break
230
 
231
  obs_summary = _build_obs_summary(obs)
232
+ raw = get_model_message(client, step, obs_summary, last_reward, history)
233
  action_str = parse_action(raw)
234
 
 
235
  action = GenericAction(json.loads(action_str))
236
  result = await env.step(action)
237
  obs = result.observation
 
248
  steps_taken = step
249
  last_reward = reward
250
 
251
+ log_step(step=step, action=action_str, reward=reward, done=done, error=error)
252
+
253
+ history.append(f"Step {step}: {action_str!r} -> reward {reward:+.2f}")
 
 
 
 
 
254
 
255
  if done:
256
  break
257
 
258
  score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
259
+ score = min(max(score, 0.0), 1.0) # clamp to [0, 1]
260
  success = score >= SUCCESS_SCORE_THRESHOLD
261
 
262
  finally:
263
  try:
264
  await env.close()
265
  except Exception as e:
266
+ print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
267
+ log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 
 
268
 
269
 
270
  if __name__ == "__main__":
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ version = "1.1.0"
4
  description = "OpenEnv RL environment for PyTorch training failure debugging"
5
  requires-python = ">=3.12"
6
  dependencies = [
7
- "torch",
8
  "openenv-core",
9
  "pydantic>=2.0",
10
  "fastapi",
 
4
  description = "OpenEnv RL environment for PyTorch training failure debugging"
5
  requires-python = ">=3.12"
6
  dependencies = [
7
+ "torch>=2.5.1",
8
  "openenv-core",
9
  "pydantic>=2.0",
10
  "fastapi",
server/app.py CHANGED
@@ -85,7 +85,7 @@ def root() -> RedirectResponse:
85
  @app.get("/health")
86
  def health_check() -> dict:
87
  """Health check — required by hackathon auto-validator."""
88
- return {"status": "ready", "tasks": len(ALL_TASKS)}
89
 
90
 
91
  @app.get("/dashboard", response_class=HTMLResponse)
 
85
  @app.get("/health")
86
  def health_check() -> dict:
87
  """Health check — required by hackathon auto-validator."""
88
+ return {"status": "healthy", "tasks": len(ALL_TASKS)}
89
 
90
 
91
  @app.get("/dashboard", response_class=HTMLResponse)
test_results.txt ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ============================= test session starts ==============================
2
+ platform linux -- Python 3.12.3, pytest-9.0.2, pluggy-1.6.0 -- /home/omkar-kadam/Desktop/Rubacus/ML Debugger/.venv/bin/python3
3
+ cachedir: .pytest_cache
4
+ rootdir: /home/omkar-kadam/Desktop/Rubacus/ML Debugger
5
+ configfile: pyproject.toml
6
+ plugins: cov-7.1.0, anyio-4.13.0, asyncio-1.3.0
7
+ asyncio: mode=Mode.AUTO, debug=False, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function
8
+ collecting ... collected 245 items
9
+
10
+ tests/test_baseline_reproducibility.py::TestBaselineReproducibility::test_two_runs_identical PASSED [ 0%]
11
+ tests/test_baseline_reproducibility.py::TestBaselineReproducibility::test_all_scores_in_range PASSED [ 0%]
12
+ tests/test_baseline_reproducibility.py::TestBaselineReproducibility::test_scores_have_meaningful_variance PASSED [ 1%]
13
+ tests/test_client.py::TestMLTrainingEnvClient::test_can_instantiate PASSED [ 1%]
14
+ tests/test_client.py::TestMLTrainingEnvClient::test_is_generic_env_client PASSED [ 2%]
15
+ tests/test_code_templates.py::TestGenerateCodeSnippet::test_eval_mode PASSED [ 2%]
16
+ tests/test_code_templates.py::TestGenerateCodeSnippet::test_detach_loss PASSED [ 2%]
17
+ tests/test_code_templates.py::TestGenerateCodeSnippet::test_zero_grad_missing PASSED [ 3%]
18
+ tests/test_code_templates.py::TestGenerateCodeSnippet::test_inplace_relu PASSED [ 3%]
19
+ tests/test_code_templates.py::TestGenerateCodeSnippet::test_unknown_bug_raises PASSED [ 4%]
20
+ tests/test_code_templates.py::TestValidateFix::test_eval_mode_correct_fix PASSED [ 4%]
21
+ tests/test_code_templates.py::TestValidateFix::test_eval_mode_with_whitespace PASSED [ 4%]
22
+ tests/test_code_templates.py::TestValidateFix::test_eval_mode_wrong_fix PASSED [ 5%]
23
+ tests/test_code_templates.py::TestValidateFix::test_detach_loss_correct_fix PASSED [ 5%]
24
+ tests/test_code_templates.py::TestValidateFix::test_detach_loss_with_trailing_spaces PASSED [ 6%]
25
+ tests/test_code_templates.py::TestValidateFix::test_zero_grad_correct_fix PASSED [ 6%]
26
+ tests/test_code_templates.py::TestValidateFix::test_inplace_relu_correct_fix PASSED [ 6%]
27
+ tests/test_code_templates.py::TestValidateFix::test_wrong_line_number PASSED [ 7%]
28
+ tests/test_code_templates.py::TestValidateFix::test_unknown_bug_type PASSED [ 7%]
29
+ tests/test_code_templates_edge.py::TestNormalizeCode::test_strips_whitespace PASSED [ 8%]
30
+ tests/test_code_templates_edge.py::TestNormalizeCode::test_multiline PASSED [ 8%]
31
+ tests/test_code_templates_edge.py::TestTokenizeCompare::test_identical_tokens PASSED [ 8%]
32
+ tests/test_code_templates_edge.py::TestTokenizeCompare::test_whitespace_ignored PASSED [ 9%]
33
+ tests/test_code_templates_edge.py::TestTokenizeCompare::test_different_tokens PASSED [ 9%]
34
+ tests/test_code_templates_edge.py::TestTokenizeCompare::test_invalid_syntax PASSED [ 10%]
35
+ tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_eval_mode_ast_fallback_with_train_keyword PASSED [ 10%]
36
+ tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_detach_loss_ast_without_detach PASSED [ 11%]
37
+ tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_inplace_relu_ast_without_inplace PASSED [ 11%]
38
+ tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_eval_mode_line_zero_invalid PASSED [ 11%]
39
+ tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_detach_loss_syntax_error_rejected PASSED [ 12%]
40
+ tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_zero_grad_with_comment PASSED [ 12%]
41
+ tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_zero_grad_without_keyword PASSED [ 13%]
42
+ tests/test_code_templates_edge.py::TestValidateFixSemanticPatterns::test_eval_mode_semantic_train_present PASSED [ 13%]
43
+ tests/test_code_templates_edge.py::TestValidateFixSemanticPatterns::test_eval_mode_with_eval_keyword_fails PASSED [ 13%]
44
+ tests/test_code_templates_edge.py::TestValidateFixSemanticPatterns::test_detach_loss_criterion_without_detach PASSED [ 14%]
45
+ tests/test_code_templates_edge.py::TestValidateFixSemanticPatterns::test_inplace_relu_without_inplace_flag PASSED [ 14%]
46
+ tests/test_code_templates_edge.py::TestGenerateCodeSnippetHints::test_eval_mode_has_hint PASSED [ 15%]
47
+ tests/test_code_templates_edge.py::TestGenerateCodeSnippetHints::test_detach_loss_has_hint PASSED [ 15%]
48
+ tests/test_code_templates_edge.py::TestGenerateCodeSnippetHints::test_zero_grad_no_hint PASSED [ 15%]
49
+ tests/test_code_templates_edge.py::TestGenerateCodeSnippetHints::test_inplace_relu_no_hint PASSED [ 16%]
50
+ tests/test_endpoints.py::TestHealthEndpoint::test_returns_healthy PASSED [ 16%]
51
+ tests/test_endpoints.py::TestHealthEndpoint::test_task_count_matches_all_tasks PASSED [ 17%]
52
+ tests/test_endpoints.py::TestTasksEndpoint::test_returns_six_tasks PASSED [ 17%]
53
+ tests/test_endpoints.py::TestTasksEndpoint::test_tasks_have_action_schema PASSED [ 17%]
54
+ tests/test_endpoints.py::TestTasksEndpoint::test_tasks_have_difficulty_and_max_steps PASSED [ 18%]
55
+ tests/test_endpoints.py::TestGraderEndpoint::test_no_completed_episode PASSED [ 18%]
56
+ tests/test_endpoints.py::TestGraderEndpoint::test_grader_after_completed_episode PASSED [ 19%]
57
+ tests/test_endpoints.py::TestGraderEndpoint::test_grader_with_session_id PASSED [ 19%]
58
+ tests/test_endpoints.py::TestBaselineEndpoint::test_baseline_returns_scores PASSED [ 20%]
59
+ tests/test_endpoints.py::TestBaselineEndpoint::test_baseline_scores_in_valid_range PASSED [ 20%]
60
+ tests/test_endpoints.py::TestDashboardEndpoint::test_returns_html PASSED [ 20%]
61
+ tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_001_exploding PASSED [ 21%]
62
+ tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_002_vanishing PASSED [ 21%]
63
+ tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_003_leakage PASSED [ 22%]
64
+ tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_004_overfitting PASSED [ 22%]
65
+ tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_005_batchnorm PASSED [ 22%]
66
+ tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_006_code_bug PASSED [ 23%]
67
+ tests/test_endpoints.py::TestGetScore::test_no_session PASSED [ 23%]
68
+ tests/test_endpoints.py::TestGetScore::test_with_session PASSED [ 24%]
69
+ tests/test_endpoints.py::TestRunBaselineSync::test_returns_all_tasks PASSED [ 24%]
70
+ tests/test_endpoints.py::TestRunBaselineSync::test_reproducible PASSED [ 24%]
71
+ tests/test_episode_lifecycle.py::TestReset::test_reset_returns_valid_observation PASSED [ 25%]
72
+ tests/test_episode_lifecycle.py::TestReset::test_reset_initial_state PASSED [ 25%]
73
+ tests/test_episode_lifecycle.py::TestReset::test_reset_progressive_reveal PASSED [ 26%]
74
+ tests/test_episode_lifecycle.py::TestReset::test_reset_available_actions PASSED [ 26%]
75
+ tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_gradients_populates_stats PASSED [ 26%]
76
+ tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_gradients_gives_investigation_bonus PASSED [ 27%]
77
+ tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_data_batch_gives_investigation_bonus PASSED [ 27%]
78
+ tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_model_modes_gives_investigation_bonus PASSED [ 28%]
79
+ tests/test_episode_lifecycle.py::TestStepInspections::test_repeat_inspection_no_bonus PASSED [ 28%]
80
+ tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_data_batch PASSED [ 28%]
81
+ tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_model_modes PASSED [ 29%]
82
+ tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_model_weights PASSED [ 29%]
83
+ tests/test_episode_lifecycle.py::TestStepFixActions::test_modify_config PASSED [ 30%]
84
+ tests/test_episode_lifecycle.py::TestStepFixActions::test_restart_run_after_fix PASSED [ 30%]
85
+ tests/test_episode_lifecycle.py::TestStepDiagnosis::test_mark_diagnosed_ends_episode PASSED [ 31%]
86
+ tests/test_episode_lifecycle.py::TestStepDiagnosis::test_step_after_done PASSED [ 31%]
87
+ tests/test_episode_lifecycle.py::TestErrorHandling::test_invalid_action_type PASSED [ 31%]
88
+ tests/test_episode_lifecycle.py::TestErrorHandling::test_action_not_in_available PASSED [ 32%]
89
+ tests/test_episode_lifecycle.py::TestErrorHandling::test_modify_config_missing_target PASSED [ 32%]
90
+ tests/test_episode_lifecycle.py::TestErrorHandling::test_mark_diagnosed_missing_diagnosis PASSED [ 33%]
91
+ tests/test_episode_lifecycle.py::TestErrorHandling::test_mark_diagnosed_invalid_diagnosis PASSED [ 33%]
92
+ tests/test_episode_lifecycle.py::TestErrorHandling::test_step_before_reset PASSED [ 33%]
93
+ tests/test_episode_lifecycle.py::TestFullEpisodeFlow::test_task_001_full_flow PASSED [ 34%]
94
+ tests/test_episode_lifecycle.py::TestFullEpisodeFlow::test_task_005_context_gated_penalty PASSED [ 34%]
95
+ tests/test_episode_lifecycle.py::TestFullEpisodeFlow::test_task_003_data_leakage PASSED [ 35%]
96
+ tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_001] PASSED [ 35%]
97
+ tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_002] PASSED [ 35%]
98
+ tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_003] PASSED [ 36%]
99
+ tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_004] PASSED [ 36%]
100
+ tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_005] PASSED [ 37%]
101
+ tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_006] PASSED [ 37%]
102
+ tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_007] PASSED [ 37%]
103
+ tests/test_exploit_resistance.py::TestExploitResistance::test_hard_task_has_variance PASSED [ 38%]
104
+ tests/test_exploit_resistance.py::TestExploitResistance::test_deterministic_per_seed PASSED [ 38%]
105
+ tests/test_graders.py::TestGradeTask001::test_perfect_score PASSED [ 39%]
106
+ tests/test_graders.py::TestGradeTask001::test_wrong_diagnosis PASSED [ 39%]
107
+ tests/test_graders.py::TestGradeTask001::test_no_investigation PASSED [ 40%]
108
+ tests/test_graders.py::TestGradeTask001::test_score_in_range PASSED [ 40%]
109
+ tests/test_graders.py::TestGradeTask003::test_perfect_score PASSED [ 40%]
110
+ tests/test_graders.py::TestGradeTask003::test_wrong_diagnosis PASSED [ 41%]
111
+ tests/test_graders.py::TestGradeTask005::test_perfect_score_thorough PASSED [ 41%]
112
+ tests/test_graders.py::TestGradeTask005::test_quick_fix_partial_credit PASSED [ 42%]
113
+ tests/test_graders.py::TestGradeTask005::test_red_herring_chaser PASSED [ 42%]
114
+ tests/test_graders.py::TestGradeTask005::test_wrong_fix_penalty PASSED [ 42%]
115
+ tests/test_graders.py::TestGradeTask005::test_double_trap_devastates_score PASSED [ 43%]
116
+ tests/test_graders.py::TestGradeEpisode::test_dispatch_to_correct_grader PASSED [ 43%]
117
+ tests/test_graders.py::TestGradeEpisode::test_unknown_task_returns_zero PASSED [ 44%]
118
+ tests/test_graders.py::TestGradeTask006::test_perfect_score_thorough PASSED [ 44%]
119
+ tests/test_graders.py::TestGradeTask006::test_no_weights_inspection_partial PASSED [ 44%]
120
+ tests/test_graders.py::TestGradeTask006::test_minimal_investigation PASSED [ 45%]
121
+ tests/test_graders.py::TestGradeTask006::test_wrong_diagnosis PASSED [ 45%]
122
+ tests/test_graders.py::TestGradeTask006::test_score_in_range PASSED [ 46%]
123
+ tests/test_graders.py::TestGradeTask007::test_perfect_score_thorough PASSED [ 46%]
124
+ tests/test_graders.py::TestGradeTask007::test_no_weights_partial PASSED [ 46%]
125
+ tests/test_graders.py::TestGradeTask007::test_wrong_fix_penalty PASSED [ 47%]
126
+ tests/test_graders.py::TestGradeTask007::test_wrong_diagnosis PASSED [ 47%]
127
+ tests/test_graders.py::TestGradeTask007::test_score_in_range PASSED [ 48%]
128
+ tests/test_graders.py::TestSubmittedDiagnosis::test_finds_diagnosis PASSED [ 48%]
129
+ tests/test_graders.py::TestSubmittedDiagnosis::test_no_diagnosis PASSED [ 48%]
130
+ tests/test_graders.py::TestSubmittedDiagnosis::test_latest_diagnosis PASSED [ 49%]
131
+ tests/test_models.py::TestRootCauseDiagnosis::test_all_values_exist PASSED [ 49%]
132
+ tests/test_models.py::TestRootCauseDiagnosis::test_values_are_strings PASSED [ 50%]
133
+ tests/test_models.py::TestRootCauseDiagnosis::test_specific_values PASSED [ 50%]
134
+ tests/test_models.py::TestTrainingConfig::test_default_instantiation PASSED [ 51%]
135
+ tests/test_models.py::TestTrainingConfig::test_json_roundtrip PASSED [ 51%]
136
+ tests/test_models.py::TestGradientStats::test_exploding PASSED [ 51%]
137
+ tests/test_models.py::TestGradientStats::test_vanishing PASSED [ 52%]
138
+ tests/test_models.py::TestGradientStats::test_normal PASSED [ 52%]
139
+ tests/test_models.py::TestEpisodeState::test_fresh_state PASSED [ 53%]
140
+ tests/test_models.py::TestEpisodeState::test_available_actions_initial PASSED [ 53%]
141
+ tests/test_models.py::TestEpisodeState::test_fix_code_available_after_code_inspected PASSED [ 53%]
142
+ tests/test_models.py::TestEpisodeState::test_restart_run_available_after_fix PASSED [ 54%]
143
+ tests/test_models.py::TestEpisodeState::test_mark_diagnosed_disappears_after_submission PASSED [ 54%]
144
+ tests/test_models.py::TestMLTrainingObservation::test_extends_observation PASSED [ 55%]
145
+ tests/test_models.py::TestMLTrainingObservation::test_has_done_and_reward PASSED [ 55%]
146
+ tests/test_models.py::TestMLTrainingObservation::test_json_serialization PASSED [ 55%]
147
+ tests/test_models.py::TestMLTrainingAction::test_extends_action PASSED [ 56%]
148
+ tests/test_models.py::TestMLTrainingAction::test_basic_action PASSED [ 56%]
149
+ tests/test_models.py::TestMLTrainingAction::test_modify_config_action PASSED [ 57%]
150
+ tests/test_models.py::TestMLTrainingAction::test_mark_diagnosed_action PASSED [ 57%]
151
+ tests/test_models.py::TestMLTrainingAction::test_fix_code_action PASSED [ 57%]
152
+ tests/test_new_endpoints.py::TestCurriculumEndpoint::test_returns_curriculum PASSED [ 58%]
153
+ tests/test_new_endpoints.py::TestCurriculumEndpoint::test_curriculum_has_difficulty_levels PASSED [ 58%]
154
+ tests/test_new_endpoints.py::TestCurriculumEndpoint::test_curriculum_covers_all_tasks PASSED [ 59%]
155
+ tests/test_new_endpoints.py::TestLeaderboardEndpoint::test_returns_leaderboard PASSED [ 59%]
156
+ tests/test_new_endpoints.py::TestLeaderboardEndpoint::test_leaderboard_after_baseline PASSED [ 60%]
157
+ tests/test_new_endpoints.py::TestReplayEndpoint::test_missing_episode PASSED [ 60%]
158
+ tests/test_new_endpoints.py::TestReplayEndpoint::test_replay_after_baseline PASSED [ 60%]
159
+ tests/test_new_endpoints.py::TestValidationReportEndpoint::test_returns_real_report PASSED [ 61%]
160
+ tests/test_pytorch_engine.py::TestSimpleCNN::test_is_nn_module PASSED [ 61%]
161
+ tests/test_pytorch_engine.py::TestSimpleCNN::test_param_count PASSED [ 62%]
162
+ tests/test_pytorch_engine.py::TestSimpleCNN::test_forward_pass PASSED [ 62%]
163
+ tests/test_pytorch_engine.py::TestFaultInjection::test_task_001_exploding_gradients PASSED [ 62%]
164
+ tests/test_pytorch_engine.py::TestFaultInjection::test_task_005_eval_mode PASSED [ 63%]
165
+ tests/test_pytorch_engine.py::TestFaultInjection::test_task_005_gradients_not_exploding PASSED [ 63%]
166
+ tests/test_pytorch_engine.py::TestExtractGradientStats::test_returns_gradient_stats PASSED [ 64%]
167
+ tests/test_pytorch_engine.py::TestExtractWeightStats::test_returns_weight_stats PASSED [ 64%]
168
+ tests/test_pytorch_engine.py::TestExtractModelModes::test_train_mode PASSED [ 64%]
169
+ tests/test_pytorch_engine.py::TestExtractModelModes::test_eval_mode PASSED [ 65%]
170
+ tests/test_pytorch_engine.py::TestTask005RedHerrings::test_conv1_near_vanishing_red_herring PASSED [ 65%]
171
+ tests/test_pytorch_engine.py::TestTask005RedHerrings::test_fc_spike_not_exploding PASSED [ 66%]
172
+ tests/test_pytorch_engine.py::TestTask005RedHerrings::test_all_layers_not_exploding PASSED [ 66%]
173
+ tests/test_pytorch_engine.py::TestVanishingGradientInjection::test_task_002_vanishing PASSED [ 66%]
174
+ tests/test_pytorch_engine.py::TestVanishingGradientInjection::test_task_002_model_in_train_mode PASSED [ 67%]
175
+ tests/test_pytorch_engine.py::TestCodeBugFaultInjection::test_task_006_model_trains_normally PASSED [ 67%]
176
+ tests/test_pytorch_engine.py::TestDataLeakageFaultInjection::test_task_003_normal_model PASSED [ 68%]
177
+ tests/test_real_training.py::TestRunRealTraining::test_returns_20_epoch_curves PASSED [ 68%]
178
+ tests/test_real_training.py::TestRunRealTraining::test_all_values_are_floats PASSED [ 68%]
179
+ tests/test_real_training.py::TestRunRealTraining::test_caching_works PASSED [ 69%]
180
+ tests/test_real_training.py::TestRunRealTraining::test_reproducible_across_calls PASSED [ 69%]
181
+ tests/test_real_training.py::TestRunRealTraining::test_different_seeds_different_curves PASSED [ 70%]
182
+ tests/test_real_training.py::TestRunRealTraining::test_task_001_high_lr_instability PASSED [ 70%]
183
+ tests/test_real_training.py::TestRunRealTraining::test_task_002_vanishing_slow_learning PASSED [ 71%]
184
+ tests/test_real_training.py::TestRunRealTraining::test_task_003_data_leakage PASSED [ 71%]
185
+ tests/test_real_training.py::TestRunRealTraining::test_task_004_overfitting PASSED [ 71%]
186
+ tests/test_real_training.py::TestRunRealTraining::test_task_005_batchnorm_eval PASSED [ 72%]
187
+ tests/test_real_training.py::TestRunRealTraining::test_task_006_code_bug PASSED [ 72%]
188
+ tests/test_real_training.py::TestRunRealTraining::test_task_007_scheduler PASSED [ 73%]
189
+ tests/test_real_training.py::TestRunRealTraining::test_mlp_architecture PASSED [ 73%]
190
+ tests/test_real_training.py::TestSimpleMLP::test_is_nn_module PASSED [ 73%]
191
+ tests/test_real_training.py::TestSimpleMLP::test_param_count PASSED [ 74%]
192
+ tests/test_real_training.py::TestSimpleMLP::test_forward_pass PASSED [ 74%]
193
+ tests/test_real_training.py::TestSimpleMLP::test_has_batchnorm PASSED [ 75%]
194
+ tests/test_reward_engine.py::TestStepPenalty::test_flat_step_penalty PASSED [ 75%]
195
+ tests/test_reward_engine.py::TestStepPenalty::test_step_penalty_not_multiplied_by_step_count PASSED [ 75%]
196
+ tests/test_reward_engine.py::TestInvestigationBonus::test_first_time_bonus PASSED [ 76%]
197
+ tests/test_reward_engine.py::TestInvestigationBonus::test_no_bonus_on_repeat PASSED [ 76%]
198
+ tests/test_reward_engine.py::TestInvestigationBonus::test_each_inspection_type_gives_bonus PASSED [ 77%]
199
+ tests/test_reward_engine.py::TestContextGatedPenalty::test_no_penalty_before_inspection PASSED [ 77%]
200
+ tests/test_reward_engine.py::TestContextGatedPenalty::test_penalty_after_normal_gradients PASSED [ 77%]
201
+ tests/test_reward_engine.py::TestContextGatedPenalty::test_no_penalty_after_abnormal_gradients PASSED [ 78%]
202
+ tests/test_reward_engine.py::TestContextGatedPenalty::test_penalty_only_for_add_callback PASSED [ 78%]
203
+ tests/test_reward_engine.py::TestDiagnosisReward::test_correct_diagnosis PASSED [ 79%]
204
+ tests/test_reward_engine.py::TestDiagnosisReward::test_wrong_diagnosis PASSED [ 79%]
205
+ tests/test_reward_engine.py::TestTerminalConvergence::test_convergence_after_fix_and_restart PASSED [ 80%]
206
+ tests/test_reward_engine.py::TestTerminalConvergence::test_no_convergence_without_fix PASSED [ 80%]
207
+ tests/test_reward_engine.py::TestInvalidAction::test_invalid_action_penalty PASSED [ 80%]
208
+ tests/test_reward_engine.py::TestWrongCodeFix::test_wrong_code_fix_penalty PASSED [ 81%]
209
+ tests/test_reward_engine.py::TestRewardCap::test_reward_capped_at_one PASSED [ 81%]
210
+ tests/test_reward_engine.py::TestRewardCap::test_reward_capped_at_negative_one PASSED [ 82%]
211
+ tests/test_scenarios.py::TestSampleScenario::test_task_001_root_cause PASSED [ 82%]
212
+ tests/test_scenarios.py::TestSampleScenario::test_task_003_root_cause PASSED [ 82%]
213
+ tests/test_scenarios.py::TestSampleScenario::test_task_005_root_cause PASSED [ 83%]
214
+ tests/test_scenarios.py::TestSampleScenario::test_different_seeds_produce_different_params PASSED [ 83%]
215
+ tests/test_scenarios.py::TestSampleScenario::test_same_seed_same_params PASSED [ 84%]
216
+ tests/test_scenarios.py::TestSampleScenario::test_unknown_task_raises PASSED [ 84%]
217
+ tests/test_scenarios.py::TestSampleScenario::test_task_005_has_error_log PASSED [ 84%]
218
+ tests/test_scenarios.py::TestSampleScenario::test_task_003_has_notes PASSED [ 85%]
219
+ tests/test_simulation.py::TestGenLossHistory::test_returns_20_floats PASSED [ 85%]
220
+ tests/test_simulation.py::TestGenLossHistory::test_task_001_has_instability PASSED [ 86%]
221
+ tests/test_simulation.py::TestGenLossHistory::test_task_003_reasonable PASSED [ 86%]
222
+ tests/test_simulation.py::TestGenLossHistory::test_task_005_no_crash PASSED [ 86%]
223
+ tests/test_simulation.py::TestGenValAccuracy::test_returns_20_floats PASSED [ 87%]
224
+ tests/test_simulation.py::TestGenValAccuracy::test_task_003_leakage_shows_higher_acc PASSED [ 87%]
225
+ tests/test_simulation.py::TestGenValAccuracy::test_task_005_low_accuracy PASSED [ 88%]
226
+ tests/test_simulation.py::TestGenValLoss::test_returns_20_floats PASSED [ 88%]
227
+ tests/test_simulation.py::TestGenDataBatchStats::test_leakage_high_overlap PASSED [ 88%]
228
+ tests/test_simulation.py::TestGenDataBatchStats::test_normal_low_overlap PASSED [ 89%]
229
+ tests/test_simulation.py::TestGenDataBatchStats::test_confusion_matrix_present PASSED [ 89%]
230
+ tests/test_simulation_extended.py::TestVanishingGradients::test_loss_barely_decreases PASSED [ 90%]
231
+ tests/test_simulation_extended.py::TestVanishingGradients::test_val_acc_low PASSED [ 90%]
232
+ tests/test_simulation_extended.py::TestVanishingGradients::test_val_loss_present PASSED [ 91%]
233
+ tests/test_simulation_extended.py::TestOverfitting::test_loss_history_present PASSED [ 91%]
234
+ tests/test_simulation_extended.py::TestOverfitting::test_val_acc_present PASSED [ 91%]
235
+ tests/test_simulation_extended.py::TestOverfitting::test_val_loss_present PASSED [ 92%]
236
+ tests/test_simulation_extended.py::TestOverfitting::test_data_batch_stats_clean PASSED [ 92%]
237
+ tests/test_simulation_extended.py::TestCodeBug::test_loss_history PASSED [ 93%]
238
+ tests/test_simulation_extended.py::TestCodeBug::test_val_acc PASSED [ 93%]
239
+ tests/test_simulation_extended.py::TestCodeBug::test_val_loss PASSED [ 93%]
240
+ tests/test_simulation_extended.py::TestBatchNormEval::test_val_loss_present PASSED [ 94%]
241
+ tests/test_simulation_extended.py::TestBatchNormEval::test_val_acc_near_zero PASSED [ 94%]
242
+ tests/test_simulation_extended.py::TestSchedulerMisconfigured::test_loss_history PASSED [ 95%]
243
+ tests/test_simulation_extended.py::TestSchedulerMisconfigured::test_val_acc PASSED [ 95%]
244
+ tests/test_simulation_extended.py::TestSchedulerMisconfigured::test_val_loss PASSED [ 95%]
245
+ tests/test_websocket.py::TestWebSocketEndpoint::test_ws_endpoint_exists PASSED [ 96%]
246
+ tests/test_websocket.py::TestWebSocketEndpoint::test_ws_reset_returns_observation PASSED [ 96%]
247
+ tests/test_websocket.py::TestWebSocketEndpoint::test_ws_reset_with_task_selection PASSED [ 97%]
248
+ tests/test_websocket.py::TestWebSocketEndpoint::test_ws_task_selection_all_tasks PASSED [ 97%]
249
+ tests/test_websocket.py::TestWebSocketEndpoint::test_ws_step_inspect_gradients PASSED [ 97%]
250
+ tests/test_websocket.py::TestWebSocketEndpoint::test_ws_full_episode_flow PASSED [ 98%]
251
+ tests/test_websocket.py::TestWebSocketEndpoint::test_ws_task_005_red_herrings PASSED [ 98%]
252
+ tests/test_websocket.py::TestWebSocketEndpoint::test_ws_task_006_code_inspection PASSED [ 99%]
253
+ tests/test_websocket.py::TestWebSocketEndpoint::test_ws_invalid_message_returns_error PASSED [ 99%]
254
+ tests/test_websocket.py::TestWebSocketEndpoint::test_ws_step_data_batch PASSED [100%]
255
+
256
+ ================================ tests coverage ================================
257
+ _______________ coverage: platform linux, python 3.12.3-final-0 ________________
258
+
259
+ Name Stmts Miss Cover Missing
260
+ ----------------------------------------------------------------------
261
+ ml_training_debugger/__init__.py 1 0 100%
262
+ ml_training_debugger/client.py 4 0 100%
263
+ ml_training_debugger/code_templates.py 76 6 92% 224-225, 239, 241, 243-244
264
+ ml_training_debugger/graders.py 141 1 99% 39
265
+ ml_training_debugger/models.py 106 0 100%
266
+ ml_training_debugger/nn_models.py 42 0 100%
267
+ ml_training_debugger/pytorch_engine.py 203 4 98% 104, 113-114, 289
268
+ ml_training_debugger/reward_engine.py 38 0 100%
269
+ ml_training_debugger/scenarios.py 64 0 100%
270
+ ml_training_debugger/simulation.py 38 0 100%
271
+ server/__init__.py 0 0 100%
272
+ server/_baseline_results.py 15 0 100%
273
+ server/_heuristic.py 93 9 90% 123-126, 132, 140-146, 156, 176
274
+ server/app.py 96 6 94% 82, 113, 180, 196-198, 202
275
+ server/environment.py 242 27 89% 158-166, 305-318, 338, 380, 393, 443, 458, 508, 513-517, 535-540
276
+ ----------------------------------------------------------------------
277
+ TOTAL 1159 53 95%
278
+ ======================== 245 passed in 80.39s (0:01:20) ========================
tests/test_endpoints.py CHANGED
@@ -27,11 +27,11 @@ def client():
27
 
28
 
29
  class TestHealthEndpoint:
30
- def test_returns_ready(self, client):
31
  resp = client.get("/health")
32
  assert resp.status_code == 200
33
  data = resp.json()
34
- assert data["status"] == "ready"
35
  assert data["tasks"] == 7
36
 
37
  def test_task_count_matches_all_tasks(self, client):
 
27
 
28
 
29
  class TestHealthEndpoint:
30
+ def test_returns_healthy(self, client):
31
  resp = client.get("/health")
32
  assert resp.status_code == 200
33
  data = resp.json()
34
+ assert data["status"] == "healthy"
35
  assert data["tasks"] == 7
36
 
37
  def test_task_count_matches_all_tasks(self, client):