File size: 14,699 Bytes
b5cb5bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92bfe31
b5cb5bb
128a79a
 
 
 
 
 
b5cb5bb
92bfe31
b5cb5bb
 
92bfe31
b5cb5bb
 
92bfe31
b5cb5bb
 
 
 
 
 
92bfe31
b5cb5bb
128a79a
 
92bfe31
b5cb5bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92bfe31
128a79a
 
 
b5cb5bb
128a79a
b5cb5bb
128a79a
b5cb5bb
 
128a79a
92bfe31
b5cb5bb
128a79a
b5cb5bb
92bfe31
b5cb5bb
e2968a4
b5cb5bb
 
 
 
128a79a
 
 
 
 
 
 
 
 
 
b5cb5bb
 
 
 
 
 
92bfe31
128a79a
92bfe31
b5cb5bb
 
 
128a79a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5cb5bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128a79a
 
 
b5cb5bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92bfe31
b5cb5bb
 
 
 
 
 
92bfe31
b5cb5bb
 
 
 
92bfe31
b5cb5bb
 
 
92bfe31
b5cb5bb
 
92bfe31
b5cb5bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92bfe31
b5cb5bb
92bfe31
b5cb5bb
 
 
 
 
 
 
 
 
 
128a79a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5cb5bb
 
92bfe31
b5cb5bb
 
 
 
 
92bfe31
b5cb5bb
 
 
 
 
 
 
 
92bfe31
b5cb5bb
 
 
92bfe31
b5cb5bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92bfe31
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
"""
Startup validation for MathPulse AI backend.

This module validates all critical dependencies and configurations BEFORE
the FastAPI app starts, preventing indefinite restart loops.

If any critical check fails and STARTUP_VALIDATION_STRICT is enabled, the
process exits with a clear error message that's visible in HF Space logs;
otherwise the failure is logged and startup continues.
"""

import os
import sys
import logging
from pathlib import Path

# Logger used by every validation step in this module.
logger = logging.getLogger("mathpulse.startup")


class StartupError(Exception):
    """Raised by the validators when a critical startup check fails."""


def validate_imports() -> None:
    """Import every module the backend relies on and report the first failure.

    Project modules are imported by absolute name, mirroring the deployed
    layout. A missing ``firebase_admin`` only produces a warning.

    Raises:
        StartupError: If a required import fails, or if importing raises any
            other exception.
    """
    logger.info("๐Ÿ” Validating Python imports...")
    try:
        # Web framework stack
        import fastapi  # noqa
        import uvicorn  # noqa
        import pydantic  # noqa
        logger.info("   โœ“ FastAPI, Uvicorn, Pydantic OK")

        # Project services, imported absolutely as the deployed code does
        from services.inference_client import (
            InferenceClient,
            create_default_client,
            is_sequential_model,
            get_current_runtime_config,
            get_model_for_task,
            model_supports_thinking,
            set_runtime_model_profile,
            set_runtime_model_override,
            reset_runtime_overrides,
            _MODEL_PROFILES,
        )  # noqa
        logger.info("   โœ“ InferenceClient imports OK")

        from automation_engine import automation_engine  # noqa
        logger.info("   โœ“ automation_engine imports OK")

        from analytics import compute_competency_analysis  # noqa
        logger.info("   โœ“ analytics imports OK")

        # Firebase is optional: its absence is reported but does not abort.
        try:
            import firebase_admin  # noqa
        except ImportError:
            logger.warning("   โš  firebase_admin not available (OK if Firebase not needed)")
        else:
            logger.info("   โœ“ firebase_admin imports OK")

        # AI client
        from services.ai_client import get_deepseek_client, CHAT_MODEL, REASONER_MODEL  # noqa
        logger.info("   โœ“ DeepSeek AI client imports OK")

        logger.info("โœ… All critical imports validated")
    except ImportError as exc:
        details = (
            "โŒ IMPORT ERROR - Cannot start backend:\n"
            f"   {exc}\n"
            "\n"
            "This usually means:\n"
            "  - A Python package is missing (check requirements.txt)\n"
            "  - A relative import was used (must be absolute in container)\n"
            "  - A circular import exists\n"
            "\n"
            "Deploy will FAIL and backend will restart indefinitely.\n"
        )
        raise StartupError(details) from exc
    except Exception as exc:
        raise StartupError(f"โŒ Unexpected import error: {exc}") from exc


def validate_environment() -> None:
    """Log the inference-related environment settings and warn about risky ones.

    A missing DEEPSEEK_API_KEY and questionable flag combinations are reported
    as warnings rather than errors. The embedding-model check is delegated to
    ``_validate_embedding_model``.
    """
    truthy_values = {"1", "true", "yes", "on"}

    def flag(name: str, default: str) -> bool:
        # Interpret an env var as a boolean using the module's truthy spellings.
        return os.getenv(name, default).strip().lower() in truthy_values

    logger.info("๐Ÿ” Validating environment variables...")

    # The DeepSeek key is needed for inference, but its absence only warns.
    if os.environ.get("DEEPSEEK_API_KEY"):
        logger.info("   โœ“ DEEPSEEK_API_KEY is set")
    else:
        logger.warning(
            "โš   WARNING: DEEPSEEK_API_KEY is not set as an environment variable.\n"
            "   AI inference will fail without this token.\n"
            "   Use: Set DEEPSEEK_API_KEY in your .env or space secrets."
        )

    # Provider selection
    provider_name = os.getenv("INFERENCE_PROVIDER", "deepseek")
    logger.info(f"   โœ“ INFERENCE_PROVIDER: {provider_name}")

    # Chat model: the chat-specific ID wins, then the generic ID, then the default.
    configured_chat_model = (
        os.getenv("INFERENCE_CHAT_MODEL_ID")
        or os.getenv("INFERENCE_MODEL_ID")
        or "deepseek-chat"
    )
    logger.info(f"   โœ“ Chat model configured: {configured_chat_model}")

    # Model lock settings
    lock_enforced = flag("INFERENCE_ENFORCE_LOCK_MODEL", "true")
    locked_model = os.getenv("INFERENCE_LOCK_MODEL_ID", "deepseek-chat").strip() or "deepseek-chat"
    logger.info(f"   โœ“ INFERENCE_ENFORCE_LOCK_MODEL: {lock_enforced}")
    logger.info(f"   โœ“ INFERENCE_LOCK_MODEL_ID: {locked_model}")

    # Profile and per-task overrides
    profile = os.getenv("MODEL_PROFILE", "").strip().lower()
    quiz_model_id = os.getenv("HF_QUIZ_MODEL_ID", "").strip()
    rag_model_id = os.getenv("HF_RAG_MODEL_ID", "").strip()
    logger.info(f"   โœ“ MODEL_PROFILE: {profile or 'not set (using individual env vars)'}")
    logger.info(f"   โœ“ HF_QUIZ_MODEL_ID: {quiz_model_id or 'not set (using defaults)'}")
    logger.info(f"   โœ“ HF_RAG_MODEL_ID: {rag_model_id or 'not set (using defaults)'}")

    # Chat routing flags: warn on combinations that change the effective behavior.
    strict_chat = flag("INFERENCE_CHAT_STRICT_MODEL_ONLY", "true")
    hard_trigger = flag("INFERENCE_CHAT_HARD_TRIGGER_ENABLED", "false")
    if not strict_chat:
        logger.warning("   โš  Chat strict model lock is disabled; chat may fallback to alternate models")
    elif hard_trigger:
        logger.warning(
            "   โš  Chat hard trigger is enabled while strict chat lock is on; hard escalation will be bypassed"
        )

    _validate_embedding_model()

    logger.info("โœ… Environment variables OK")


# Embedding model ID that _validate_embedding_model() compares EMBEDDING_MODEL against.
EXPECTED_EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"

def _validate_embedding_model() -> None:
    """Warn when the EMBEDDING_MODEL env var is unset, unexpected, or a generation model.

    Every outcome is reported through the logger only. The value is compared
    with ``EXPECTED_EMBEDDING_MODEL`` and with the chat/reasoner model IDs
    exported by ``services.ai_client``.
    """
    configured = os.getenv("EMBEDDING_MODEL", "").strip()

    if configured and configured != EXPECTED_EMBEDDING_MODEL:
        logger.warning(
            f"WARNING: EMBEDDING_MODEL is set to '{configured}' โ€” "
            f"expected '{EXPECTED_EMBEDDING_MODEL}'. "
            "Confirm this is intentional before deploying."
        )
    elif not configured:
        logger.warning(
            "WARNING: EMBEDDING_MODEL env var is not set. "
            f"Expected: {EXPECTED_EMBEDDING_MODEL}. "
            "RAG retrieval will fail without an embedding model."
        )

    # Imported here rather than at module top, as in the original.
    from services.ai_client import CHAT_MODEL, REASONER_MODEL  # noqa

    if configured not in (CHAT_MODEL, REASONER_MODEL):
        logger.info(f"   EMBEDDING_MODEL: {configured or 'not set'}")
        return

    # The embedding slot points at a text-generation model.
    logger.warning(
        f"CRITICAL: EMBEDDING_MODEL is set to a generation model ('{configured}'). "
        "This will break RAG retrieval. Set it to 'BAAI/bge-small-en-v1.5'."
    )


def validate_config_files() -> None:
    """Locate a readable, non-empty model config and validate its fields.

    The candidate paths are tried in order and the first one that exists is
    used; its contents are then checked by ``_validate_model_config_fields``.

    Raises:
        StartupError: If the first existing candidate is unreadable or empty,
            or if no candidate exists at all.
    """
    logger.info("๐Ÿ” Validating configuration files...")

    # Either the runtime layout or the repository layout is acceptable.
    model_config_candidates = (
        "config/models.yaml",
        "backend/config/models.yaml",
    )

    for candidate in model_config_candidates:
        candidate_file = Path(candidate)
        if not candidate_file.exists():
            continue

        try:
            with open(candidate_file, 'r', encoding='utf-8') as handle:
                text = handle.read()
        except Exception as exc:
            raise StartupError(
                f"โŒ CONFIG ERROR: Cannot read {candidate}:\n"
                f"   {exc}\n"
            ) from exc

        if not text.strip():
            raise StartupError(
                f"โŒ CONFIG ERROR: {candidate} is empty!\n"
                "   This will cause model routing to fail.\n"
            )

        readable_model_config = candidate
        break
    else:
        # The loop finished without finding any existing candidate.
        raise StartupError(
            "โŒ CONFIG ERROR: No readable model config found.\n"
            f"   Checked: {', '.join(model_config_candidates)}\n"
        )

    logger.info(f"   โœ“ Using model config: {readable_model_config}")

    _validate_model_config_fields(readable_model_config)

    logger.info("โœ… Configuration files OK")


def validate_file_structure() -> None:
    """Check that each critical backend file exists in one of its known locations.

    Every entry lists alternative paths for the same file; the first one that
    exists satisfies the entry. Optional build files are only reported.

    Raises:
        StartupError: If none of the alternatives for a required file exists.
    """

    def first_existing(paths):
        # First path in ``paths`` that exists on disk, or None.
        return next((p for p in paths if Path(p).exists()), None)

    logger.info("๐Ÿ” Validating file structure...")

    required_path_sets = (
        ("main.py", "backend/main.py"),
        ("services/inference_client.py", "backend/services/inference_client.py"),
        ("analytics.py", "backend/analytics.py"),
        ("automation_engine.py", "backend/automation_engine.py"),
    )
    optional_path_sets = (
        ("Dockerfile", "backend/Dockerfile"),
    )

    for alternatives in required_path_sets:
        hit = first_existing(alternatives)
        if not hit:
            raise StartupError(
                f"โŒ FILE MISSING: {' or '.join(alternatives)}\n"
                "   Backend structure is broken for this deployment layout.\n"
            )
        logger.info(f"   โœ“ Found {hit}")

    for alternatives in optional_path_sets:
        hit = first_existing(alternatives)
        if hit:
            logger.info(f"   โœ“ Found optional build file {hit}")
        else:
            logger.info(
                f"   โ„น Optional build file not present at runtime: {' or '.join(alternatives)}"
            )

    logger.info("โœ… File structure OK")


def validate_inference_client_config() -> None:
    """Build the default InferenceClient and sanity-check its task routing.

    Checks that the client exposes ``task_model_map`` and ``task_provider_map``,
    that every required task has a model, and that the chat model chain has
    exactly one entry when the strict chat lock is on.

    Raises:
        StartupError: If any check fails, or if creating or inspecting the
            client raises any other exception.
    """
    logger.info("๐Ÿ” Validating InferenceClient configuration...")

    try:
        # Presumably this is where the client loads its YAML config, per the original comment.
        from services.inference_client import create_default_client
        inference_client = create_default_client()

        # Both routing maps must be present on the client.
        for attr_name in ("task_model_map", "task_provider_map"):
            if not hasattr(inference_client, attr_name):
                raise StartupError(f"โŒ InferenceClient missing {attr_name} attribute")

        # Every required task needs a model assignment.
        for task in ("chat", "verify_solution", "lesson_generation", "quiz_generation"):
            if task not in inference_client.task_model_map:
                raise StartupError(
                    f"โŒ Task '{task}' not in task_model_map.\n"
                    "   Check config/models.yaml\n"
                )
            task_model = inference_client.task_model_map[task]
            task_provider = inference_client.task_provider_map.get(task, 'unknown')
            logger.info(f"   โœ“ {task}: {task_model} ({task_provider})")

        primary_chat_model = inference_client.task_model_map.get("chat", inference_client.default_model)
        effective_chain = inference_client._model_chain_for_task("chat", primary_chat_model)
        chain_length = len(effective_chain)
        logger.info(
            f"   โœ“ chat strict lock: {inference_client.chat_strict_model_only}; "
            f"effective chat chain length={chain_length}"
        )
        # With the strict lock on, the chain must hold exactly one model.
        if inference_client.chat_strict_model_only and chain_length != 1:
            raise StartupError(
                "โŒ Chat strict model lock is enabled but effective chat model chain is not singular.\n"
                "   Check INFERENCE_CHAT_STRICT_MODEL_ONLY and routing.task_fallback_model_map.chat\n"
            )

        logger.info("โœ… InferenceClient configuration OK")

    except StartupError:
        # Already a clear startup failure; pass it through unwrapped.
        raise
    except Exception as exc:
        raise StartupError(
            "โŒ InferenceClient validation failed:\n"
            f"   {exc}\n"
            "   Check config/models.yaml and backend/config/models.yaml\n"
        ) from exc


def _validate_model_config_fields(config_path: str) -> None:
    """Check that the model config YAML contains the sections startup relies on.

    Verifies, in order: the file parses as YAML into a mapping, ``models`` is a
    mapping containing ``rag_primary`` and a ``model_capabilities`` mapping,
    and ``routing.task_model_map`` has an entry for every RAG task.

    Args:
        config_path: Path of the YAML model config to inspect.

    Raises:
        StartupError: If the file cannot be parsed, or if any required
            section, field or RAG task mapping is missing or has the wrong
            shape.
    """
    try:
        import yaml
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f) or {}
    except Exception as e:
        raise StartupError(f"โŒ Cannot parse {config_path} as YAML: {e}") from e

    # safe_load returns whatever the top-level node is (list, str, ...); anything
    # but a mapping would otherwise fail below with an opaque AttributeError.
    if not isinstance(config, dict):
        raise StartupError(
            f"โŒ {config_path}: top-level YAML content must be a mapping, "
            f"got {type(config).__name__}"
        )

    models = config.get("models", {})
    if not isinstance(models, dict):
        raise StartupError(f"โŒ {config_path}: 'models' section missing or invalid")

    if "rag_primary" not in models:
        raise StartupError(f"โŒ {config_path}: missing 'models.rag_primary' field")
    rag_primary = models["rag_primary"]
    if isinstance(rag_primary, dict):
        logger.info(f"   โœ“ rag_primary model: {rag_primary.get('id', 'UNSET')}")
    else:
        logger.warning("   โš  rag_primary is not a dict, may cause issues")

    capabilities = models.get("model_capabilities")
    if not isinstance(capabilities, dict):
        raise StartupError(f"โŒ {config_path}: missing 'models.model_capabilities' section")
    logger.info(
        f"   โœ“ model_capabilities: sequential_only={capabilities.get('sequential_only')}, "
        f"supports_thinking={capabilities.get('supports_thinking')}"
    )

    # A key that is present but empty (``routing:``) loads as None; treat it like
    # a missing section so the RAG check below reports what is actually absent.
    routing = config.get("routing") or {}
    if not isinstance(routing, dict):
        raise StartupError(f"โŒ {config_path}: 'routing' section is invalid (expected a mapping)")
    tasks = routing.get("task_model_map") or {}
    if not isinstance(tasks, dict):
        raise StartupError(
            f"โŒ {config_path}: 'routing.task_model_map' is invalid (expected a mapping)"
        )

    rag_tasks = {"rag_lesson", "rag_problem", "rag_analysis_context"}
    configured_tasks = {str(task).strip().lower() for task in tasks}
    missing_rag = rag_tasks - configured_tasks
    if missing_rag:
        # Sorted so the message is deterministic across runs.
        raise StartupError(
            f"โŒ {config_path}: missing RAG task mappings: {', '.join(sorted(missing_rag))}"
        )

    logger.info("   โœ“ All RAG task mappings present")


def run_all_validations() -> None:
    """Run every startup check in order and report the outcome.

    A failing check is logged. The process then exits with status 1 only when
    STARTUP_VALIDATION_STRICT is set to "1", "true", "yes" or "on"
    (case-insensitive); otherwise a warning is logged and startup continues.
    """
    banner = "=" * 70
    logger.info(banner)
    logger.info("๐Ÿš€ STARTUP VALIDATION - Checking all critical dependencies")
    logger.info(banner)

    strict_setting = os.getenv("STARTUP_VALIDATION_STRICT", "false").strip().lower()
    strict_mode = strict_setting in {"1", "true", "yes", "on"}

    try:
        validate_file_structure()
        validate_imports()
        validate_environment()
        validate_config_files()
        validate_inference_client_config()

        logger.info(banner)
        logger.info("โœ… ALL STARTUP VALIDATIONS PASSED")
        logger.info(banner)

    except StartupError as failure:
        logger.error(banner)
        logger.error(str(failure))
        logger.error(banner)
        if strict_mode:
            logger.error("\n๐Ÿ›‘ DEPLOYMENT WILL FAIL - Fix errors above and redeploy")
            sys.exit(1)
        logger.warning(
            "\nโš ๏ธ  Continuing startup because STARTUP_VALIDATION_STRICT is disabled. "
            "Set STARTUP_VALIDATION_STRICT=true to fail fast."
        )
    except Exception as unexpected:
        # Anything that is not a StartupError is logged with its traceback.
        logger.exception(f"Unexpected validation error: {unexpected}")
        if strict_mode:
            sys.exit(1)
        logger.warning(
            "โš ๏ธ  Continuing startup after unexpected validation error because "
            "STARTUP_VALIDATION_STRICT is disabled."
        )