"""
Startup validation for MathPulse AI backend.

This module validates all critical dependencies and configurations BEFORE
the FastAPI app starts, preventing indefinite restart loops.

If any critical check fails, a clear error message is written to the logs
(visible in HF Space logs) and, when STARTUP_VALIDATION_STRICT is enabled,
the process exits.
"""
import os
import sys
import logging
from pathlib import Path
# Module-level logger named "mathpulse.startup"; every validation step in
# this file reports its progress and failures through it.
logger = logging.getLogger("mathpulse.startup")
class StartupError(Exception):
    """Critical error during startup validation.

    Raised by the ``validate_*`` functions in this module when a check
    fails. ``run_all_validations`` catches it, logs the message, and exits
    the process only when strict mode is enabled.
    """
def validate_imports() -> None:
    """Verify all critical imports work. Use absolute imports.

    Each import is executed only to prove that the module loads; none of
    the imported names are used afterwards. ``firebase_admin`` is treated
    as optional: if it cannot be imported a warning is logged and
    validation continues.

    Raises:
        StartupError: If any required import raises ``ImportError``, or if
            importing a module raises any other exception. The original
            exception is chained as the cause.
    """
    logger.info("🔍 Validating Python imports...")
    try:
        # Core FastAPI stack
        import fastapi  # noqa: F401
        import uvicorn  # noqa: F401
        import pydantic  # noqa: F401
        logger.info(" ✓ FastAPI, Uvicorn, Pydantic OK")

        # Backend services (use ABSOLUTE imports like deployed code)
        from services.inference_client import (  # noqa: F401
            InferenceClient, create_default_client, is_sequential_model,
            get_current_runtime_config, get_model_for_task, model_supports_thinking,
            set_runtime_model_profile, set_runtime_model_override, reset_runtime_overrides,
            _MODEL_PROFILES,
        )
        logger.info(" ✓ InferenceClient imports OK")

        from automation_engine import automation_engine  # noqa: F401
        logger.info(" ✓ automation_engine imports OK")

        from analytics import compute_competency_analysis  # noqa: F401
        logger.info(" ✓ analytics imports OK")

        # Firebase is optional: a missing package only produces a warning.
        try:
            import firebase_admin  # noqa: F401
            logger.info(" ✓ firebase_admin imports OK")
        except ImportError:
            logger.warning(" ⚠ firebase_admin not available (OK if Firebase not needed)")

        # ML & inference
        from services.ai_client import get_deepseek_client, CHAT_MODEL, REASONER_MODEL  # noqa: F401
        logger.info(" ✓ DeepSeek AI client imports OK")

        logger.info("✅ All critical imports validated")
    except ImportError as e:
        raise StartupError(
            "❌ IMPORT ERROR - Cannot start backend:\n"
            f" {e}\n"
            "\n"
            "This usually means:\n"
            " - A Python package is missing (check requirements.txt)\n"
            " - A relative import was used (must be absolute in container)\n"
            " - A circular import exists\n"
            "\n"
            "Deploy will FAIL and backend will restart indefinitely.\n"
        ) from e
    except Exception as e:
        # Importing a module can run arbitrary top-level code, so anything
        # other than ImportError is still reported as a startup failure.
        raise StartupError(f"❌ Unexpected import error: {e}") from e
_TRUTHY_ENV_VALUES = frozenset({"1", "true", "yes", "on"})


def _env_flag(name: str, default: str) -> bool:
    """Return True when env var *name* (or *default* if unset) is a truthy token."""
    return os.getenv(name, default).strip().lower() in _TRUTHY_ENV_VALUES


def validate_environment() -> None:
    """Log the state of the environment variables the backend reads.

    Nothing in this function raises for a missing or unusual value: a
    missing ``DEEPSEEK_API_KEY`` and inconsistent chat-lock flags are
    reported as warnings, everything else is logged at info level. The
    embedding-model check is delegated to ``_validate_embedding_model``;
    any exception raised there propagates to the caller.
    """
    logger.info("🔐 Validating environment variables...")

    # CRITICAL: DEEPSEEK_API_KEY for inference.
    # A whitespace-only value is treated the same as an unset key.
    ds_api_key = (os.environ.get("DEEPSEEK_API_KEY") or "").strip()
    if not ds_api_key:
        logger.warning(
            "⚠ WARNING: DEEPSEEK_API_KEY is not set as an environment variable.\n"
            " AI inference will fail without this token.\n"
            " Use: Set DEEPSEEK_API_KEY in your .env or space secrets."
        )
    else:
        logger.info(" ✓ DEEPSEEK_API_KEY is set")

    # Check inference provider config
    inference_provider = os.getenv("INFERENCE_PROVIDER", "deepseek")
    logger.info(f" ✓ INFERENCE_PROVIDER: {inference_provider}")

    # Check model IDs: the chat-specific id wins, then the generic id,
    # then the built-in default.
    chat_model = (
        os.getenv("INFERENCE_CHAT_MODEL_ID")
        or os.getenv("INFERENCE_MODEL_ID")
        or "deepseek-chat"
    )
    logger.info(f" ✓ Chat model configured: {chat_model}")

    chat_strict = _env_flag("INFERENCE_CHAT_STRICT_MODEL_ONLY", "true")
    chat_hard_trigger = _env_flag("INFERENCE_CHAT_HARD_TRIGGER_ENABLED", "false")
    enforce_lock_model = _env_flag("INFERENCE_ENFORCE_LOCK_MODEL", "true")
    lock_model_id = os.getenv("INFERENCE_LOCK_MODEL_ID", "deepseek-chat").strip() or "deepseek-chat"
    logger.info(f" ✓ INFERENCE_ENFORCE_LOCK_MODEL: {enforce_lock_model}")
    logger.info(f" ✓ INFERENCE_LOCK_MODEL_ID: {lock_model_id}")

    model_profile = os.getenv("MODEL_PROFILE", "").strip().lower()
    quiz_model = os.getenv("HF_QUIZ_MODEL_ID", "").strip()
    rag_model = os.getenv("HF_RAG_MODEL_ID", "").strip()
    logger.info(f" ✓ MODEL_PROFILE: {model_profile or 'not set (using individual env vars)'}")
    logger.info(f" ✓ HF_QUIZ_MODEL_ID: {quiz_model or 'not set (using defaults)'}")
    logger.info(f" ✓ HF_RAG_MODEL_ID: {rag_model or 'not set (using defaults)'}")

    if not chat_strict:
        logger.warning(" ⚠ Chat strict model lock is disabled; chat may fallback to alternate models")
    if chat_strict and chat_hard_trigger:
        logger.warning(
            " ⚠ Chat hard trigger is enabled while strict chat lock is on; hard escalation will be bypassed"
        )

    _validate_embedding_model()
    logger.info("✅ Environment variables OK")
# Embedding model id that the EMBEDDING_MODEL environment variable is
# compared against.
EXPECTED_EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"


def _validate_embedding_model() -> None:
    """Log warnings about a missing or unexpected ``EMBEDDING_MODEL`` value.

    Three situations are reported, all through the logger only:
    the variable is unset, it differs from ``EXPECTED_EMBEDDING_MODEL``,
    or it equals one of the generation model ids exported by
    ``services.ai_client``. Otherwise the configured value is logged at
    info level.

    The function itself raises nothing, but a failure to import
    ``services.ai_client`` propagates to the caller.
    """
    embedding_model = os.getenv("EMBEDDING_MODEL", "").strip()
    if not embedding_model:
        logger.warning(
            "WARNING: EMBEDDING_MODEL env var is not set. "
            f"Expected: {EXPECTED_EMBEDDING_MODEL}. "
            "RAG retrieval will fail without an embedding model."
        )
    elif embedding_model != EXPECTED_EMBEDDING_MODEL:
        logger.warning(
            f"WARNING: EMBEDDING_MODEL is set to '{embedding_model}' — "
            f"expected '{EXPECTED_EMBEDDING_MODEL}'. "
            "Confirm this is intentional before deploying."
        )

    # Imported at call time rather than at module import time.
    from services.ai_client import CHAT_MODEL, REASONER_MODEL  # noqa

    generation_model_ids = (CHAT_MODEL, REASONER_MODEL)
    # Guard on a non-empty value so an unset variable can never "match" an
    # empty generation model id.
    if embedding_model and embedding_model in generation_model_ids:
        logger.warning(
            f"CRITICAL: EMBEDDING_MODEL is set to a generation model ('{embedding_model}'). "
            f"This will break RAG retrieval. Set it to '{EXPECTED_EMBEDDING_MODEL}'."
        )
    else:
        logger.info(f" EMBEDDING_MODEL: {embedding_model or 'not set'}")
def validate_config_files() -> None:
    """Verify config files exist and are readable.

    The candidate paths are tried in order, relative to the current
    working directory. The first one that exists must be readable as
    UTF-8 and non-empty; it is then passed to
    ``_validate_model_config_fields`` for content checks.

    Raises:
        StartupError: If the first existing candidate cannot be read or is
            empty, or if none of the candidates exists. Errors raised by
            ``_validate_model_config_fields`` propagate unchanged.
    """
    logger.info("📋 Validating configuration files...")

    # Accept either deployment/runtime path without warning when one valid path exists.
    model_config_candidates = [
        "config/models.yaml",
        "backend/config/models.yaml",
    ]

    readable_model_config = None
    for config_path in model_config_candidates:
        full_path = Path(config_path)
        if not full_path.exists():
            continue

        # Only the read itself is guarded, so the emptiness check below can
        # raise StartupError without a catch-and-reraise.
        try:
            content = full_path.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as e:
            raise StartupError(
                f"❌ CONFIG ERROR: Cannot read {config_path}:\n"
                f" {e}\n"
            ) from e

        if not content.strip():
            raise StartupError(
                f"❌ CONFIG ERROR: {config_path} is empty!\n"
                " This will cause model routing to fail.\n"
            )

        readable_model_config = config_path
        break

    if readable_model_config is None:
        joined_paths = ", ".join(model_config_candidates)
        raise StartupError(
            "❌ CONFIG ERROR: No readable model config found.\n"
            f" Checked: {joined_paths}\n"
        )

    logger.info(f" ✓ Using model config: {readable_model_config}")
    _validate_model_config_fields(readable_model_config)
    logger.info("✅ Configuration files OK")
def _first_existing(candidates):
    """Return the first path in *candidates* that exists on disk, else ``None``."""
    return next((path for path in candidates if Path(path).exists()), None)


def validate_file_structure() -> None:
    """Verify critical backend files exist.

    Each entry lists alternative locations for one file (paths are
    relative to the current working directory); finding any one of the
    alternatives satisfies that entry. Optional entries are only logged.

    Raises:
        StartupError: If none of the alternatives for a required file
            exists.
    """
    logger.info("📁 Validating file structure...")

    required_path_sets = [
        ["main.py", "backend/main.py"],
        ["services/inference_client.py", "backend/services/inference_client.py"],
        ["analytics.py", "backend/analytics.py"],
        ["automation_engine.py", "backend/automation_engine.py"],
    ]
    optional_path_sets = [
        ["Dockerfile", "backend/Dockerfile"],
    ]

    for candidates in required_path_sets:
        found = _first_existing(candidates)
        if found is None:
            joined = " or ".join(candidates)
            raise StartupError(
                f"❌ FILE MISSING: {joined}\n"
                " Backend structure is broken for this deployment layout.\n"
            )
        logger.info(f" ✓ Found {found}")

    for candidates in optional_path_sets:
        found = _first_existing(candidates)
        if found is not None:
            logger.info(f" ✓ Found optional build file {found}")
            continue
        joined = " or ".join(candidates)
        logger.info(
            f" ℹ Optional build file not present at runtime: {joined}"
        )

    logger.info("✅ File structure OK")
def validate_inference_client_config() -> None:
    """Validate InferenceClient can load its config.

    Builds a client with ``create_default_client`` and checks that it
    exposes ``task_model_map`` and ``task_provider_map``, that every
    required task has a model mapping, and that the effective chat model
    chain has exactly one entry when the client's
    ``chat_strict_model_only`` flag is set.

    Raises:
        StartupError: If any of the checks above fails, or if creating or
            inspecting the client raises any other exception (chained as
            the cause).
    """
    logger.info("🔧 Validating InferenceClient configuration...")
    try:
        # Try to create the client (this will load config from YAML)
        from services.inference_client import create_default_client

        client = create_default_client()

        # Verify critical attributes
        for attr in ("task_model_map", "task_provider_map"):
            if not hasattr(client, attr):
                raise StartupError(f"❌ InferenceClient missing {attr} attribute")

        # Check that required tasks are mapped
        required_tasks = ['chat', 'verify_solution', 'lesson_generation', 'quiz_generation']
        for task in required_tasks:
            if task not in client.task_model_map:
                raise StartupError(
                    f"❌ Task '{task}' not in task_model_map.\n"
                    " Check config/models.yaml\n"
                )
            model = client.task_model_map[task]
            provider = client.task_provider_map.get(task, 'unknown')
            logger.info(f" ✓ {task}: {model} ({provider})")

        chat_model = client.task_model_map.get("chat", client.default_model)
        chat_chain = client._model_chain_for_task("chat", chat_model)
        logger.info(
            f" ✓ chat strict lock: {client.chat_strict_model_only}; "
            f"effective chat chain length={len(chat_chain)}"
        )
        if client.chat_strict_model_only and len(chat_chain) != 1:
            raise StartupError(
                "❌ Chat strict model lock is enabled but effective chat model chain is not singular.\n"
                " Check INFERENCE_CHAT_STRICT_MODEL_ONLY and routing.task_fallback_model_map.chat\n"
            )

        logger.info("✅ InferenceClient configuration OK")
    except StartupError:
        # Already carries a specific message; do not wrap it again below.
        raise
    except Exception as e:
        raise StartupError(
            "❌ InferenceClient validation failed:\n"
            f" {e}\n"
            " Check config/models.yaml and backend/config/models.yaml\n"
        ) from e
def _validate_model_config_fields(config_path: str) -> None:
    """Check that the model config YAML contains the fields this module requires.

    Required: a ``models`` mapping containing ``rag_primary`` and a
    ``model_capabilities`` mapping, plus ``routing.task_model_map`` entries
    for the three RAG tasks (keys are compared after ``str()``, stripping
    and lower-casing).

    Args:
        config_path: Path of the YAML file to load.

    Raises:
        StartupError: If the file cannot be loaded as YAML (this includes
            PyYAML being unavailable), if the document root is not a
            mapping, or if any required section or task mapping is missing.
    """
    try:
        import yaml

        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f) or {}
    except Exception as e:
        raise StartupError(f"❌ Cannot parse {config_path} as YAML: {e}") from e

    # A YAML document may legally be a list or scalar; everything below
    # needs a mapping, so fail with a clear message instead of AttributeError.
    if not isinstance(config, dict):
        raise StartupError(f"❌ {config_path}: top-level YAML document must be a mapping")

    models = config.get("models", {})
    if not isinstance(models, dict):
        raise StartupError(f"❌ {config_path}: 'models' section missing or invalid")

    if "rag_primary" not in models:
        raise StartupError(f"❌ {config_path}: missing 'models.rag_primary' field")
    rag_primary = models["rag_primary"]
    if isinstance(rag_primary, dict):
        logger.info(f" ✓ rag_primary model: {rag_primary.get('id', 'UNSET')}")
    else:
        logger.warning(" ⚠ rag_primary is not a dict, may cause issues")

    capabilities = models.get("model_capabilities")
    if not isinstance(capabilities, dict):
        raise StartupError(f"❌ {config_path}: missing 'models.model_capabilities' section")
    logger.info(
        f" ✓ model_capabilities: sequential_only={capabilities.get('sequential_only')}, "
        f"supports_thinking={capabilities.get('supports_thinking')}"
    )

    # A key that is present but empty (``routing:``) loads as None; treat
    # that, and any non-mapping value, as "no task mappings".
    routing = config.get("routing")
    tasks = routing.get("task_model_map") if isinstance(routing, dict) else None
    if not isinstance(tasks, dict):
        tasks = {}

    rag_tasks = {"rag_lesson", "rag_problem", "rag_analysis_context"}
    configured_tasks = {str(task).strip().lower() for task in tasks}
    # Sorted so the error message is stable across runs.
    missing_rag = sorted(rag_tasks - configured_tasks)
    if missing_rag:
        raise StartupError(f"❌ {config_path}: missing RAG task mappings: {missing_rag}")
    logger.info(" ✓ All RAG task mappings present")
def run_all_validations() -> None:
    """Run comprehensive startup validation.

    The checks run in a fixed order (file structure, imports, environment,
    config files, inference client) and stop at the first failure, which is
    logged. What happens next depends on ``STARTUP_VALIDATION_STRICT``
    (default ``"false"``; ``1``/``true``/``yes``/``on`` enable it):

    * strict: the process exits with status 1;
    * non-strict: a warning is logged and the function returns normally.
    """
    banner = "=" * 70
    logger.info(banner)
    logger.info("🚀 STARTUP VALIDATION - Checking all critical dependencies")
    logger.info(banner)

    strict_mode = os.getenv("STARTUP_VALIDATION_STRICT", "false").strip().lower() in {"1", "true", "yes", "on"}

    try:
        validate_file_structure()
        validate_imports()
        validate_environment()
        validate_config_files()
        validate_inference_client_config()
    except StartupError as e:
        logger.error(banner)
        logger.error(str(e))
        logger.error(banner)
        if strict_mode:
            logger.error("\n🛑 DEPLOYMENT WILL FAIL - Fix errors above and redeploy")
            sys.exit(1)
        logger.warning(
            "\n⚠️ Continuing startup because STARTUP_VALIDATION_STRICT is disabled. "
            "Set STARTUP_VALIDATION_STRICT=true to fail fast."
        )
    except Exception as e:
        # Anything that is not a StartupError is unexpected; log it with
        # the traceback and apply the same strict/non-strict policy.
        logger.exception(f"Unexpected validation error: {e}")
        if strict_mode:
            sys.exit(1)
        logger.warning(
            "⚠️ Continuing startup after unexpected validation error because "
            "STARTUP_VALIDATION_STRICT is disabled."
        )
    else:
        logger.info(banner)
        logger.info("✅ ALL STARTUP VALIDATIONS PASSED")
        logger.info(banner)