Spaces:

ujjwalpardeshi
/

pytorch-training-debugger

Sleeping

App Files Files Community

omkarrr88 committed 30 days ago

Commit

d8eeec6

1 Parent(s): 7336adb

minor changes

Browse files

Files changed (6) hide show

README.md +1 -1
inference.py +17 -26
pyproject.toml +1 -1
server/app.py +1 -1
test_results.txt +278 -0
tests/test_endpoints.py +2 -2

README.md CHANGED Viewed

@@ -183,7 +183,7 @@ API_BASE_URL=https://api.openai.com/v1 MODEL_NAME=gpt-4o OPENAI_API_KEY=sk-... p
 | Endpoint | Method | Description |
 |----------|--------|-------------|
-| `/health` | GET | `{"status": "ready", "tasks": 7}` |
 | `/tasks` | GET | Task list with IDs, difficulties, action schema |
 | `/grader` | POST | Score for last completed episode |
 | `/baseline` | POST | Run heuristic on all tasks, return scores |

 | Endpoint | Method | Description |
 |----------|--------|-------------|
+| `/health` | GET | `{"status": "healthy", "tasks": 7}` |
 | `/tasks` | GET | Task list with IDs, difficulties, action schema |
 | `/grader` | POST | Score for last completed episode |
 | `/baseline` | POST | Run heuristic on all tasks, return scores |

inference.py CHANGED Viewed

@@ -24,12 +24,12 @@ import asyncio
 import json
 import os
 import sys
-from typing import Optional
 try:
     from openai import OpenAI
 except ImportError:
-    print("Error: openai package not installed. Run: pip install openai", file=sys.stderr)
     sys.exit(1)
 from openenv.core import GenericAction, GenericEnvClient
@@ -74,7 +74,7 @@ def log_step(
     )
-def log_end(success: bool, steps: int, score: float, rewards: list[float]) -> None:
     rewards_str = ",".join(f"{r:.2f}" for r in rewards)
     print(
         f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
@@ -155,7 +155,7 @@ def get_model_message(
     step: int,
     last_obs_summary: dict,
     last_reward: float,
-    history: list[str],
 ) -> str:
     """Get next action from the LLM."""
     history_ctx = "\n".join(history[-5:]) if history else "No previous steps."
@@ -179,7 +179,7 @@ def get_model_message(
         text = (completion.choices[0].message.content or "").strip()
         return text if text else FALLBACK_ACTION
     except Exception as exc:
-        print(f"[DEBUG] Model request failed: {exc}", file=sys.stderr)
         return FALLBACK_ACTION
@@ -198,11 +198,11 @@ def parse_action(raw: str) -> str:
 async def main() -> None:
     if not API_KEY:
         print(
-            "Error: OPENAI_API_KEY or HF_TOKEN required.", file=sys.stderr
         )
         sys.exit(1)
-    llm = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
     # Connect to environment via standard OpenEnv client
     if IMAGE_NAME:
@@ -211,8 +211,8 @@ async def main() -> None:
         env = GenericEnvClient(base_url=ENV_URL, message_timeout_s=120.0)
         await env.connect()
-    history: list[str] = []
-    rewards: list[float] = []
     steps_taken = 0
     score = 0.0
     success = False
@@ -220,9 +220,8 @@ async def main() -> None:
     log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
     try:
-        # Reset environment for the selected task
         result = await env.reset(task_id=TASK_NAME, seed=42)
-        obs = result.observation  # dict
         last_reward = 0.0
         for step in range(1, MAX_STEPS + 1):
@@ -230,10 +229,9 @@ async def main() -> None:
                 break
             obs_summary = _build_obs_summary(obs)
-            raw = get_model_message(llm, step, obs_summary, last_reward, history)
             action_str = parse_action(raw)
-            # Step via standard OpenEnv client API
             action = GenericAction(json.loads(action_str))
             result = await env.step(action)
             obs = result.observation
@@ -250,30 +248,23 @@ async def main() -> None:
             steps_taken = step
             last_reward = reward
-            log_step(
-                step=step,
-                action=action_str,
-                reward=reward,
-                done=done,
-                error=error,
-            )
-            history.append(f"Step {step}: {action_str} -> reward {reward:+.2f}")
             if done:
                 break
         score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
-        score = min(max(score, 0.0), 1.0)
         success = score >= SUCCESS_SCORE_THRESHOLD
     finally:
         try:
             await env.close()
         except Exception as e:
-            print(f"[DEBUG] env.close() error: {e}", file=sys.stderr)
-        log_end(
-            success=success, steps=steps_taken, score=score, rewards=rewards
-        )
 if __name__ == "__main__":

 import json
 import os
 import sys
+from typing import List, Optional
 try:
     from openai import OpenAI
 except ImportError:
+    print("Error: openai package not installed. Run: pip install openai", flush=True)
     sys.exit(1)
 from openenv.core import GenericAction, GenericEnvClient
     )
+def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
     rewards_str = ",".join(f"{r:.2f}" for r in rewards)
     print(
         f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
     step: int,
     last_obs_summary: dict,
     last_reward: float,
+    history: List[str],
 ) -> str:
     """Get next action from the LLM."""
     history_ctx = "\n".join(history[-5:]) if history else "No previous steps."
         text = (completion.choices[0].message.content or "").strip()
         return text if text else FALLBACK_ACTION
     except Exception as exc:
+        print(f"[DEBUG] Model request failed: {exc}", flush=True)
         return FALLBACK_ACTION
 async def main() -> None:
     if not API_KEY:
         print(
+            "Error: OPENAI_API_KEY or HF_TOKEN required.", flush=True
         )
         sys.exit(1)
+    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
     # Connect to environment via standard OpenEnv client
     if IMAGE_NAME:
         env = GenericEnvClient(base_url=ENV_URL, message_timeout_s=120.0)
         await env.connect()
+    history: List[str] = []
+    rewards: List[float] = []
     steps_taken = 0
     score = 0.0
     success = False
     log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
     try:
         result = await env.reset(task_id=TASK_NAME, seed=42)
+        obs = result.observation
         last_reward = 0.0
         for step in range(1, MAX_STEPS + 1):
                 break
             obs_summary = _build_obs_summary(obs)
+            raw = get_model_message(client, step, obs_summary, last_reward, history)
             action_str = parse_action(raw)
             action = GenericAction(json.loads(action_str))
             result = await env.step(action)
             obs = result.observation
             steps_taken = step
             last_reward = reward
+            log_step(step=step, action=action_str, reward=reward, done=done, error=error)
+            history.append(f"Step {step}: {action_str!r} -> reward {reward:+.2f}")
             if done:
                 break
         score = sum(rewards) / MAX_TOTAL_REWARD if MAX_TOTAL_REWARD > 0 else 0.0
+        score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
         success = score >= SUCCESS_SCORE_THRESHOLD
     finally:
         try:
             await env.close()
         except Exception as e:
+            print(f"[DEBUG] env.close() error (container cleanup): {e}", flush=True)
+        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 if __name__ == "__main__":

pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ version = "1.1.0"
 description = "OpenEnv RL environment for PyTorch training failure debugging"
 requires-python = ">=3.12"
 dependencies = [
-    "torch",
     "openenv-core",
     "pydantic>=2.0",
     "fastapi",

 description = "OpenEnv RL environment for PyTorch training failure debugging"
 requires-python = ">=3.12"
 dependencies = [
+    "torch>=2.5.1",
     "openenv-core",
     "pydantic>=2.0",
     "fastapi",

server/app.py CHANGED Viewed

@@ -85,7 +85,7 @@ def root() -> RedirectResponse:
 @app.get("/health")
 def health_check() -> dict:
     """Health check — required by hackathon auto-validator."""
-    return {"status": "ready", "tasks": len(ALL_TASKS)}
 @app.get("/dashboard", response_class=HTMLResponse)

 @app.get("/health")
 def health_check() -> dict:
     """Health check — required by hackathon auto-validator."""
+    return {"status": "healthy", "tasks": len(ALL_TASKS)}
 @app.get("/dashboard", response_class=HTMLResponse)

test_results.txt ADDED Viewed

	@@ -0,0 +1,278 @@

+============================= test session starts ==============================
+platform linux -- Python 3.12.3, pytest-9.0.2, pluggy-1.6.0 -- /home/omkar-kadam/Desktop/Rubacus/ML Debugger/.venv/bin/python3
+cachedir: .pytest_cache
+rootdir: /home/omkar-kadam/Desktop/Rubacus/ML Debugger
+configfile: pyproject.toml
+plugins: cov-7.1.0, anyio-4.13.0, asyncio-1.3.0
+asyncio: mode=Mode.AUTO, debug=False, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function
+collecting ... collected 245 items
+tests/test_baseline_reproducibility.py::TestBaselineReproducibility::test_two_runs_identical PASSED [  0%]
+tests/test_baseline_reproducibility.py::TestBaselineReproducibility::test_all_scores_in_range PASSED [  0%]
+tests/test_baseline_reproducibility.py::TestBaselineReproducibility::test_scores_have_meaningful_variance PASSED [  1%]
+tests/test_client.py::TestMLTrainingEnvClient::test_can_instantiate PASSED [  1%]
+tests/test_client.py::TestMLTrainingEnvClient::test_is_generic_env_client PASSED [  2%]
+tests/test_code_templates.py::TestGenerateCodeSnippet::test_eval_mode PASSED [  2%]
+tests/test_code_templates.py::TestGenerateCodeSnippet::test_detach_loss PASSED [  2%]
+tests/test_code_templates.py::TestGenerateCodeSnippet::test_zero_grad_missing PASSED [  3%]
+tests/test_code_templates.py::TestGenerateCodeSnippet::test_inplace_relu PASSED [  3%]
+tests/test_code_templates.py::TestGenerateCodeSnippet::test_unknown_bug_raises PASSED [  4%]
+tests/test_code_templates.py::TestValidateFix::test_eval_mode_correct_fix PASSED [  4%]
+tests/test_code_templates.py::TestValidateFix::test_eval_mode_with_whitespace PASSED [  4%]
+tests/test_code_templates.py::TestValidateFix::test_eval_mode_wrong_fix PASSED [  5%]
+tests/test_code_templates.py::TestValidateFix::test_detach_loss_correct_fix PASSED [  5%]
+tests/test_code_templates.py::TestValidateFix::test_detach_loss_with_trailing_spaces PASSED [  6%]
+tests/test_code_templates.py::TestValidateFix::test_zero_grad_correct_fix PASSED [  6%]
+tests/test_code_templates.py::TestValidateFix::test_inplace_relu_correct_fix PASSED [  6%]
+tests/test_code_templates.py::TestValidateFix::test_wrong_line_number PASSED [  7%]
+tests/test_code_templates.py::TestValidateFix::test_unknown_bug_type PASSED [  7%]
+tests/test_code_templates_edge.py::TestNormalizeCode::test_strips_whitespace PASSED [  8%]
+tests/test_code_templates_edge.py::TestNormalizeCode::test_multiline PASSED [  8%]
+tests/test_code_templates_edge.py::TestTokenizeCompare::test_identical_tokens PASSED [  8%]
+tests/test_code_templates_edge.py::TestTokenizeCompare::test_whitespace_ignored PASSED [  9%]
+tests/test_code_templates_edge.py::TestTokenizeCompare::test_different_tokens PASSED [  9%]
+tests/test_code_templates_edge.py::TestTokenizeCompare::test_invalid_syntax PASSED [ 10%]
+tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_eval_mode_ast_fallback_with_train_keyword PASSED [ 10%]
+tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_detach_loss_ast_without_detach PASSED [ 11%]
+tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_inplace_relu_ast_without_inplace PASSED [ 11%]
+tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_eval_mode_line_zero_invalid PASSED [ 11%]
+tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_detach_loss_syntax_error_rejected PASSED [ 12%]
+tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_zero_grad_with_comment PASSED [ 12%]
+tests/test_code_templates_edge.py::TestValidateFixASTFallback::test_zero_grad_without_keyword PASSED [ 13%]
+tests/test_code_templates_edge.py::TestValidateFixSemanticPatterns::test_eval_mode_semantic_train_present PASSED [ 13%]
+tests/test_code_templates_edge.py::TestValidateFixSemanticPatterns::test_eval_mode_with_eval_keyword_fails PASSED [ 13%]
+tests/test_code_templates_edge.py::TestValidateFixSemanticPatterns::test_detach_loss_criterion_without_detach PASSED [ 14%]
+tests/test_code_templates_edge.py::TestValidateFixSemanticPatterns::test_inplace_relu_without_inplace_flag PASSED [ 14%]
+tests/test_code_templates_edge.py::TestGenerateCodeSnippetHints::test_eval_mode_has_hint PASSED [ 15%]
+tests/test_code_templates_edge.py::TestGenerateCodeSnippetHints::test_detach_loss_has_hint PASSED [ 15%]
+tests/test_code_templates_edge.py::TestGenerateCodeSnippetHints::test_zero_grad_no_hint PASSED [ 15%]
+tests/test_code_templates_edge.py::TestGenerateCodeSnippetHints::test_inplace_relu_no_hint PASSED [ 16%]
+tests/test_endpoints.py::TestHealthEndpoint::test_returns_healthy PASSED [ 16%]
+tests/test_endpoints.py::TestHealthEndpoint::test_task_count_matches_all_tasks PASSED [ 17%]
+tests/test_endpoints.py::TestTasksEndpoint::test_returns_six_tasks PASSED [ 17%]
+tests/test_endpoints.py::TestTasksEndpoint::test_tasks_have_action_schema PASSED [ 17%]
+tests/test_endpoints.py::TestTasksEndpoint::test_tasks_have_difficulty_and_max_steps PASSED [ 18%]
+tests/test_endpoints.py::TestGraderEndpoint::test_no_completed_episode PASSED [ 18%]
+tests/test_endpoints.py::TestGraderEndpoint::test_grader_after_completed_episode PASSED [ 19%]
+tests/test_endpoints.py::TestGraderEndpoint::test_grader_with_session_id PASSED [ 19%]
+tests/test_endpoints.py::TestBaselineEndpoint::test_baseline_returns_scores PASSED [ 20%]
+tests/test_endpoints.py::TestBaselineEndpoint::test_baseline_scores_in_valid_range PASSED [ 20%]
+tests/test_endpoints.py::TestDashboardEndpoint::test_returns_html PASSED [ 20%]
+tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_001_exploding PASSED [ 21%]
+tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_002_vanishing PASSED [ 21%]
+tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_003_leakage PASSED [ 22%]
+tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_004_overfitting PASSED [ 22%]
+tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_005_batchnorm PASSED [ 22%]
+tests/test_endpoints.py::TestRunHeuristicEpisode::test_task_006_code_bug PASSED [ 23%]
+tests/test_endpoints.py::TestGetScore::test_no_session PASSED            [ 23%]
+tests/test_endpoints.py::TestGetScore::test_with_session PASSED          [ 24%]
+tests/test_endpoints.py::TestRunBaselineSync::test_returns_all_tasks PASSED [ 24%]
+tests/test_endpoints.py::TestRunBaselineSync::test_reproducible PASSED   [ 24%]
+tests/test_episode_lifecycle.py::TestReset::test_reset_returns_valid_observation PASSED [ 25%]
+tests/test_episode_lifecycle.py::TestReset::test_reset_initial_state PASSED [ 25%]
+tests/test_episode_lifecycle.py::TestReset::test_reset_progressive_reveal PASSED [ 26%]
+tests/test_episode_lifecycle.py::TestReset::test_reset_available_actions PASSED [ 26%]
+tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_gradients_populates_stats PASSED [ 26%]
+tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_gradients_gives_investigation_bonus PASSED [ 27%]
+tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_data_batch_gives_investigation_bonus PASSED [ 27%]
+tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_model_modes_gives_investigation_bonus PASSED [ 28%]
+tests/test_episode_lifecycle.py::TestStepInspections::test_repeat_inspection_no_bonus PASSED [ 28%]
+tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_data_batch PASSED [ 28%]
+tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_model_modes PASSED [ 29%]
+tests/test_episode_lifecycle.py::TestStepInspections::test_inspect_model_weights PASSED [ 29%]
+tests/test_episode_lifecycle.py::TestStepFixActions::test_modify_config PASSED [ 30%]
+tests/test_episode_lifecycle.py::TestStepFixActions::test_restart_run_after_fix PASSED [ 30%]
+tests/test_episode_lifecycle.py::TestStepDiagnosis::test_mark_diagnosed_ends_episode PASSED [ 31%]
+tests/test_episode_lifecycle.py::TestStepDiagnosis::test_step_after_done PASSED [ 31%]
+tests/test_episode_lifecycle.py::TestErrorHandling::test_invalid_action_type PASSED [ 31%]
+tests/test_episode_lifecycle.py::TestErrorHandling::test_action_not_in_available PASSED [ 32%]
+tests/test_episode_lifecycle.py::TestErrorHandling::test_modify_config_missing_target PASSED [ 32%]
+tests/test_episode_lifecycle.py::TestErrorHandling::test_mark_diagnosed_missing_diagnosis PASSED [ 33%]
+tests/test_episode_lifecycle.py::TestErrorHandling::test_mark_diagnosed_invalid_diagnosis PASSED [ 33%]
+tests/test_episode_lifecycle.py::TestErrorHandling::test_step_before_reset PASSED [ 33%]
+tests/test_episode_lifecycle.py::TestFullEpisodeFlow::test_task_001_full_flow PASSED [ 34%]
+tests/test_episode_lifecycle.py::TestFullEpisodeFlow::test_task_005_context_gated_penalty PASSED [ 34%]
+tests/test_episode_lifecycle.py::TestFullEpisodeFlow::test_task_003_data_leakage PASSED [ 35%]
+tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_001] PASSED [ 35%]
+tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_002] PASSED [ 35%]
+tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_003] PASSED [ 36%]
+tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_004] PASSED [ 36%]
+tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_005] PASSED [ 37%]
+tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_006] PASSED [ 37%]
+tests/test_exploit_resistance.py::TestExploitResistance::test_multiple_seeds_produce_valid_scores[task_007] PASSED [ 37%]
+tests/test_exploit_resistance.py::TestExploitResistance::test_hard_task_has_variance PASSED [ 38%]
+tests/test_exploit_resistance.py::TestExploitResistance::test_deterministic_per_seed PASSED [ 38%]
+tests/test_graders.py::TestGradeTask001::test_perfect_score PASSED       [ 39%]
+tests/test_graders.py::TestGradeTask001::test_wrong_diagnosis PASSED     [ 39%]
+tests/test_graders.py::TestGradeTask001::test_no_investigation PASSED    [ 40%]
+tests/test_graders.py::TestGradeTask001::test_score_in_range PASSED      [ 40%]
+tests/test_graders.py::TestGradeTask003::test_perfect_score PASSED       [ 40%]
+tests/test_graders.py::TestGradeTask003::test_wrong_diagnosis PASSED     [ 41%]
+tests/test_graders.py::TestGradeTask005::test_perfect_score_thorough PASSED [ 41%]
+tests/test_graders.py::TestGradeTask005::test_quick_fix_partial_credit PASSED [ 42%]
+tests/test_graders.py::TestGradeTask005::test_red_herring_chaser PASSED  [ 42%]
+tests/test_graders.py::TestGradeTask005::test_wrong_fix_penalty PASSED   [ 42%]
+tests/test_graders.py::TestGradeTask005::test_double_trap_devastates_score PASSED [ 43%]
+tests/test_graders.py::TestGradeEpisode::test_dispatch_to_correct_grader PASSED [ 43%]
+tests/test_graders.py::TestGradeEpisode::test_unknown_task_returns_zero PASSED [ 44%]
+tests/test_graders.py::TestGradeTask006::test_perfect_score_thorough PASSED [ 44%]
+tests/test_graders.py::TestGradeTask006::test_no_weights_inspection_partial PASSED [ 44%]
+tests/test_graders.py::TestGradeTask006::test_minimal_investigation PASSED [ 45%]
+tests/test_graders.py::TestGradeTask006::test_wrong_diagnosis PASSED     [ 45%]
+tests/test_graders.py::TestGradeTask006::test_score_in_range PASSED      [ 46%]
+tests/test_graders.py::TestGradeTask007::test_perfect_score_thorough PASSED [ 46%]
+tests/test_graders.py::TestGradeTask007::test_no_weights_partial PASSED  [ 46%]
+tests/test_graders.py::TestGradeTask007::test_wrong_fix_penalty PASSED   [ 47%]
+tests/test_graders.py::TestGradeTask007::test_wrong_diagnosis PASSED     [ 47%]
+tests/test_graders.py::TestGradeTask007::test_score_in_range PASSED      [ 48%]
+tests/test_graders.py::TestSubmittedDiagnosis::test_finds_diagnosis PASSED [ 48%]
+tests/test_graders.py::TestSubmittedDiagnosis::test_no_diagnosis PASSED  [ 48%]
+tests/test_graders.py::TestSubmittedDiagnosis::test_latest_diagnosis PASSED [ 49%]
+tests/test_models.py::TestRootCauseDiagnosis::test_all_values_exist PASSED [ 49%]
+tests/test_models.py::TestRootCauseDiagnosis::test_values_are_strings PASSED [ 50%]
+tests/test_models.py::TestRootCauseDiagnosis::test_specific_values PASSED [ 50%]
+tests/test_models.py::TestTrainingConfig::test_default_instantiation PASSED [ 51%]
+tests/test_models.py::TestTrainingConfig::test_json_roundtrip PASSED     [ 51%]
+tests/test_models.py::TestGradientStats::test_exploding PASSED           [ 51%]
+tests/test_models.py::TestGradientStats::test_vanishing PASSED           [ 52%]
+tests/test_models.py::TestGradientStats::test_normal PASSED              [ 52%]
+tests/test_models.py::TestEpisodeState::test_fresh_state PASSED          [ 53%]
+tests/test_models.py::TestEpisodeState::test_available_actions_initial PASSED [ 53%]
+tests/test_models.py::TestEpisodeState::test_fix_code_available_after_code_inspected PASSED [ 53%]
+tests/test_models.py::TestEpisodeState::test_restart_run_available_after_fix PASSED [ 54%]
+tests/test_models.py::TestEpisodeState::test_mark_diagnosed_disappears_after_submission PASSED [ 54%]
+tests/test_models.py::TestMLTrainingObservation::test_extends_observation PASSED [ 55%]
+tests/test_models.py::TestMLTrainingObservation::test_has_done_and_reward PASSED [ 55%]
+tests/test_models.py::TestMLTrainingObservation::test_json_serialization PASSED [ 55%]
+tests/test_models.py::TestMLTrainingAction::test_extends_action PASSED   [ 56%]
+tests/test_models.py::TestMLTrainingAction::test_basic_action PASSED     [ 56%]
+tests/test_models.py::TestMLTrainingAction::test_modify_config_action PASSED [ 57%]
+tests/test_models.py::TestMLTrainingAction::test_mark_diagnosed_action PASSED [ 57%]
+tests/test_models.py::TestMLTrainingAction::test_fix_code_action PASSED  [ 57%]
+tests/test_new_endpoints.py::TestCurriculumEndpoint::test_returns_curriculum PASSED [ 58%]
+tests/test_new_endpoints.py::TestCurriculumEndpoint::test_curriculum_has_difficulty_levels PASSED [ 58%]
+tests/test_new_endpoints.py::TestCurriculumEndpoint::test_curriculum_covers_all_tasks PASSED [ 59%]
+tests/test_new_endpoints.py::TestLeaderboardEndpoint::test_returns_leaderboard PASSED [ 59%]
+tests/test_new_endpoints.py::TestLeaderboardEndpoint::test_leaderboard_after_baseline PASSED [ 60%]
+tests/test_new_endpoints.py::TestReplayEndpoint::test_missing_episode PASSED [ 60%]
+tests/test_new_endpoints.py::TestReplayEndpoint::test_replay_after_baseline PASSED [ 60%]
+tests/test_new_endpoints.py::TestValidationReportEndpoint::test_returns_real_report PASSED [ 61%]
+tests/test_pytorch_engine.py::TestSimpleCNN::test_is_nn_module PASSED    [ 61%]
+tests/test_pytorch_engine.py::TestSimpleCNN::test_param_count PASSED     [ 62%]
+tests/test_pytorch_engine.py::TestSimpleCNN::test_forward_pass PASSED    [ 62%]
+tests/test_pytorch_engine.py::TestFaultInjection::test_task_001_exploding_gradients PASSED [ 62%]
+tests/test_pytorch_engine.py::TestFaultInjection::test_task_005_eval_mode PASSED [ 63%]
+tests/test_pytorch_engine.py::TestFaultInjection::test_task_005_gradients_not_exploding PASSED [ 63%]
+tests/test_pytorch_engine.py::TestExtractGradientStats::test_returns_gradient_stats PASSED [ 64%]
+tests/test_pytorch_engine.py::TestExtractWeightStats::test_returns_weight_stats PASSED [ 64%]
+tests/test_pytorch_engine.py::TestExtractModelModes::test_train_mode PASSED [ 64%]
+tests/test_pytorch_engine.py::TestExtractModelModes::test_eval_mode PASSED [ 65%]
+tests/test_pytorch_engine.py::TestTask005RedHerrings::test_conv1_near_vanishing_red_herring PASSED [ 65%]
+tests/test_pytorch_engine.py::TestTask005RedHerrings::test_fc_spike_not_exploding PASSED [ 66%]
+tests/test_pytorch_engine.py::TestTask005RedHerrings::test_all_layers_not_exploding PASSED [ 66%]
+tests/test_pytorch_engine.py::TestVanishingGradientInjection::test_task_002_vanishing PASSED [ 66%]
+tests/test_pytorch_engine.py::TestVanishingGradientInjection::test_task_002_model_in_train_mode PASSED [ 67%]
+tests/test_pytorch_engine.py::TestCodeBugFaultInjection::test_task_006_model_trains_normally PASSED [ 67%]
+tests/test_pytorch_engine.py::TestDataLeakageFaultInjection::test_task_003_normal_model PASSED [ 68%]
+tests/test_real_training.py::TestRunRealTraining::test_returns_20_epoch_curves PASSED [ 68%]
+tests/test_real_training.py::TestRunRealTraining::test_all_values_are_floats PASSED [ 68%]
+tests/test_real_training.py::TestRunRealTraining::test_caching_works PASSED [ 69%]
+tests/test_real_training.py::TestRunRealTraining::test_reproducible_across_calls PASSED [ 69%]
+tests/test_real_training.py::TestRunRealTraining::test_different_seeds_different_curves PASSED [ 70%]
+tests/test_real_training.py::TestRunRealTraining::test_task_001_high_lr_instability PASSED [ 70%]
+tests/test_real_training.py::TestRunRealTraining::test_task_002_vanishing_slow_learning PASSED [ 71%]
+tests/test_real_training.py::TestRunRealTraining::test_task_003_data_leakage PASSED [ 71%]
+tests/test_real_training.py::TestRunRealTraining::test_task_004_overfitting PASSED [ 71%]
+tests/test_real_training.py::TestRunRealTraining::test_task_005_batchnorm_eval PASSED [ 72%]
+tests/test_real_training.py::TestRunRealTraining::test_task_006_code_bug PASSED [ 72%]
+tests/test_real_training.py::TestRunRealTraining::test_task_007_scheduler PASSED [ 73%]
+tests/test_real_training.py::TestRunRealTraining::test_mlp_architecture PASSED [ 73%]
+tests/test_real_training.py::TestSimpleMLP::test_is_nn_module PASSED     [ 73%]
+tests/test_real_training.py::TestSimpleMLP::test_param_count PASSED      [ 74%]
+tests/test_real_training.py::TestSimpleMLP::test_forward_pass PASSED     [ 74%]
+tests/test_real_training.py::TestSimpleMLP::test_has_batchnorm PASSED    [ 75%]
+tests/test_reward_engine.py::TestStepPenalty::test_flat_step_penalty PASSED [ 75%]
+tests/test_reward_engine.py::TestStepPenalty::test_step_penalty_not_multiplied_by_step_count PASSED [ 75%]
+tests/test_reward_engine.py::TestInvestigationBonus::test_first_time_bonus PASSED [ 76%]
+tests/test_reward_engine.py::TestInvestigationBonus::test_no_bonus_on_repeat PASSED [ 76%]
+tests/test_reward_engine.py::TestInvestigationBonus::test_each_inspection_type_gives_bonus PASSED [ 77%]
+tests/test_reward_engine.py::TestContextGatedPenalty::test_no_penalty_before_inspection PASSED [ 77%]
+tests/test_reward_engine.py::TestContextGatedPenalty::test_penalty_after_normal_gradients PASSED [ 77%]
+tests/test_reward_engine.py::TestContextGatedPenalty::test_no_penalty_after_abnormal_gradients PASSED [ 78%]
+tests/test_reward_engine.py::TestContextGatedPenalty::test_penalty_only_for_add_callback PASSED [ 78%]
+tests/test_reward_engine.py::TestDiagnosisReward::test_correct_diagnosis PASSED [ 79%]
+tests/test_reward_engine.py::TestDiagnosisReward::test_wrong_diagnosis PASSED [ 79%]
+tests/test_reward_engine.py::TestTerminalConvergence::test_convergence_after_fix_and_restart PASSED [ 80%]
+tests/test_reward_engine.py::TestTerminalConvergence::test_no_convergence_without_fix PASSED [ 80%]
+tests/test_reward_engine.py::TestInvalidAction::test_invalid_action_penalty PASSED [ 80%]
+tests/test_reward_engine.py::TestWrongCodeFix::test_wrong_code_fix_penalty PASSED [ 81%]
+tests/test_reward_engine.py::TestRewardCap::test_reward_capped_at_one PASSED [ 81%]
+tests/test_reward_engine.py::TestRewardCap::test_reward_capped_at_negative_one PASSED [ 82%]
+tests/test_scenarios.py::TestSampleScenario::test_task_001_root_cause PASSED [ 82%]
+tests/test_scenarios.py::TestSampleScenario::test_task_003_root_cause PASSED [ 82%]
+tests/test_scenarios.py::TestSampleScenario::test_task_005_root_cause PASSED [ 83%]
+tests/test_scenarios.py::TestSampleScenario::test_different_seeds_produce_different_params PASSED [ 83%]
+tests/test_scenarios.py::TestSampleScenario::test_same_seed_same_params PASSED [ 84%]
+tests/test_scenarios.py::TestSampleScenario::test_unknown_task_raises PASSED [ 84%]
+tests/test_scenarios.py::TestSampleScenario::test_task_005_has_error_log PASSED [ 84%]
+tests/test_scenarios.py::TestSampleScenario::test_task_003_has_notes PASSED [ 85%]
+tests/test_simulation.py::TestGenLossHistory::test_returns_20_floats PASSED [ 85%]
+tests/test_simulation.py::TestGenLossHistory::test_task_001_has_instability PASSED [ 86%]
+tests/test_simulation.py::TestGenLossHistory::test_task_003_reasonable PASSED [ 86%]
+tests/test_simulation.py::TestGenLossHistory::test_task_005_no_crash PASSED [ 86%]
+tests/test_simulation.py::TestGenValAccuracy::test_returns_20_floats PASSED [ 87%]
+tests/test_simulation.py::TestGenValAccuracy::test_task_003_leakage_shows_higher_acc PASSED [ 87%]
+tests/test_simulation.py::TestGenValAccuracy::test_task_005_low_accuracy PASSED [ 88%]
+tests/test_simulation.py::TestGenValLoss::test_returns_20_floats PASSED  [ 88%]
+tests/test_simulation.py::TestGenDataBatchStats::test_leakage_high_overlap PASSED [ 88%]
+tests/test_simulation.py::TestGenDataBatchStats::test_normal_low_overlap PASSED [ 89%]
+tests/test_simulation.py::TestGenDataBatchStats::test_confusion_matrix_present PASSED [ 89%]
+tests/test_simulation_extended.py::TestVanishingGradients::test_loss_barely_decreases PASSED [ 90%]
+tests/test_simulation_extended.py::TestVanishingGradients::test_val_acc_low PASSED [ 90%]
+tests/test_simulation_extended.py::TestVanishingGradients::test_val_loss_present PASSED [ 91%]
+tests/test_simulation_extended.py::TestOverfitting::test_loss_history_present PASSED [ 91%]
+tests/test_simulation_extended.py::TestOverfitting::test_val_acc_present PASSED [ 91%]
+tests/test_simulation_extended.py::TestOverfitting::test_val_loss_present PASSED [ 92%]
+tests/test_simulation_extended.py::TestOverfitting::test_data_batch_stats_clean PASSED [ 92%]
+tests/test_simulation_extended.py::TestCodeBug::test_loss_history PASSED [ 93%]
+tests/test_simulation_extended.py::TestCodeBug::test_val_acc PASSED      [ 93%]
+tests/test_simulation_extended.py::TestCodeBug::test_val_loss PASSED     [ 93%]
+tests/test_simulation_extended.py::TestBatchNormEval::test_val_loss_present PASSED [ 94%]
+tests/test_simulation_extended.py::TestBatchNormEval::test_val_acc_near_zero PASSED [ 94%]
+tests/test_simulation_extended.py::TestSchedulerMisconfigured::test_loss_history PASSED [ 95%]
+tests/test_simulation_extended.py::TestSchedulerMisconfigured::test_val_acc PASSED [ 95%]
+tests/test_simulation_extended.py::TestSchedulerMisconfigured::test_val_loss PASSED [ 95%]
+tests/test_websocket.py::TestWebSocketEndpoint::test_ws_endpoint_exists PASSED [ 96%]
+tests/test_websocket.py::TestWebSocketEndpoint::test_ws_reset_returns_observation PASSED [ 96%]
+tests/test_websocket.py::TestWebSocketEndpoint::test_ws_reset_with_task_selection PASSED [ 97%]
+tests/test_websocket.py::TestWebSocketEndpoint::test_ws_task_selection_all_tasks PASSED [ 97%]
+tests/test_websocket.py::TestWebSocketEndpoint::test_ws_step_inspect_gradients PASSED [ 97%]
+tests/test_websocket.py::TestWebSocketEndpoint::test_ws_full_episode_flow PASSED [ 98%]
+tests/test_websocket.py::TestWebSocketEndpoint::test_ws_task_005_red_herrings PASSED [ 98%]
+tests/test_websocket.py::TestWebSocketEndpoint::test_ws_task_006_code_inspection PASSED [ 99%]
+tests/test_websocket.py::TestWebSocketEndpoint::test_ws_invalid_message_returns_error PASSED [ 99%]
+tests/test_websocket.py::TestWebSocketEndpoint::test_ws_step_data_batch PASSED [100%]
+================================ tests coverage ================================
+_______________ coverage: platform linux, python 3.12.3-final-0 ________________
+Name                                     Stmts   Miss  Cover   Missing
+----------------------------------------------------------------------
+ml_training_debugger/__init__.py             1      0   100%
+ml_training_debugger/client.py               4      0   100%
+ml_training_debugger/code_templates.py      76      6    92%   224-225, 239, 241, 243-244
+ml_training_debugger/graders.py            141      1    99%   39
+ml_training_debugger/models.py             106      0   100%
+ml_training_debugger/nn_models.py           42      0   100%
+ml_training_debugger/pytorch_engine.py     203      4    98%   104, 113-114, 289
+ml_training_debugger/reward_engine.py       38      0   100%
+ml_training_debugger/scenarios.py           64      0   100%
+ml_training_debugger/simulation.py          38      0   100%
+server/__init__.py                           0      0   100%
+server/_baseline_results.py                 15      0   100%
+server/_heuristic.py                        93      9    90%   123-126, 132, 140-146, 156, 176
+server/app.py                               96      6    94%   82, 113, 180, 196-198, 202
+server/environment.py                      242     27    89%   158-166, 305-318, 338, 380, 393, 443, 458, 508, 513-517, 535-540
+----------------------------------------------------------------------
+TOTAL                                     1159     53    95%
+======================== 245 passed in 80.39s (0:01:20) ========================

tests/test_endpoints.py CHANGED Viewed

@@ -27,11 +27,11 @@ def client():
 class TestHealthEndpoint:
-    def test_returns_ready(self, client):
         resp = client.get("/health")
         assert resp.status_code == 200
         data = resp.json()
-        assert data["status"] == "ready"
         assert data["tasks"] == 7
     def test_task_count_matches_all_tasks(self, client):

 class TestHealthEndpoint:
+    def test_returns_healthy(self, client):
         resp = client.get("/health")
         assert resp.status_code == 200
         data = resp.json()
+        assert data["status"] == "healthy"
         assert data["tasks"] == 7
     def test_task_count_matches_all_tasks(self, client):