""" Firebase Storage PDF loader for curriculum ingestion. Downloads PDFs from Firebase Storage and extracts text for ChromaDB indexing. """ from __future__ import annotations import logging import os from pathlib import Path from typing import Dict, List, Optional, Tuple logger = logging.getLogger("mathpulse.fb_storage_loader") _FIREBASE_INITIALIZED = False def _init_firebase_storage() -> Tuple[any, any]: global _FIREBASE_INITIALIZED if _FIREBASE_INITIALIZED: try: from firebase_admin import storage as fb_storage bucket = fb_storage.bucket() return fb_storage, bucket except Exception as e: logger.warning("Firebase storage unavailable: %s", e) _FIREBASE_INITIALIZED = False return None, None try: import firebase_admin from firebase_admin import credentials, storage except ImportError: logger.warning("firebase_admin not installed") return None, None if firebase_admin._apps: _FIREBASE_INITIALIZED = True try: bucket = storage.bucket() return storage, bucket except Exception as e: logger.warning("Firebase storage bucket unavailable: %s", e) return None, None sa_json = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON") sa_file = os.getenv("FIREBASE_SERVICE_ACCOUNT_FILE") bucket_name = os.getenv("FIREBASE_STORAGE_BUCKET", "mathpulse-ai-2026.firebasestorage.app") try: if sa_json: import json as _json creds = credentials.Certificate(_json.loads(sa_json)) elif sa_file and Path(sa_file).exists(): creds = credentials.Certificate(sa_file) else: creds = credentials.ApplicationDefault() firebase_admin.initialize_app(creds, {"storageBucket": bucket_name}) _FIREBASE_INITIALIZED = True bucket = storage.bucket() return storage, bucket except Exception as e: logger.warning("Firebase init failed: %s", e) return None, None def download_pdf_from_storage(storage_path: str, dest_path: Optional[str] = None) -> Optional[bytes]: """Download a PDF from Firebase Storage and return its bytes.""" _, bucket = _init_firebase_storage() if bucket is None: logger.warning("Firebase Storage not available, skipping download") return None try: blob = bucket.blob(storage_path) if not blob.exists(): logger.warning("Blob does not exist: %s", storage_path) return None bytes_data = blob.download_as_bytes() logger.info("Downloaded %s (%d bytes)", storage_path, len(bytes_data)) if dest_path: Path(dest_path).parent.mkdir(parents=True, exist_ok=True) with open(dest_path, "wb") as f: f.write(bytes_data) logger.info("Saved to %s", dest_path) return bytes_data except Exception as e: logger.error("Failed to download %s: %s", storage_path, e) return None def list_curriculum_blobs(prefix: str = "curriculum/") -> List[Dict[str, str]]: """List all blobs under a prefix in Firebase Storage.""" _, bucket = _init_firebase_storage() if bucket is None: return [] blobs = bucket.list_blobs(prefix=prefix) result = [] for blob in blobs: if blob.name.endswith(".pdf"): result.append({ "name": blob.name, "size": blob.size, "updated": str(blob.updated) if blob.updated else None, "download_url": f"https://storage.googleapis.com/{bucket.name}/{blob.name}", }) return result # NOTE: Curriculum guide PDFs (shaping papers) are stored in Firebase Storage # for system reference but are NOT included in RAG ingestion because they # contain only learning objectives and course descriptions — insufficient # content for lesson generation (typically <10 chunks each). # # Only SDO teaching modules (full lesson content with examples and problems) # are included in the RAG pipeline. PDF_METADATA: Dict[str, dict] = { # General Mathematics Q1 — SDO Navotas teaching module (100 pages, ~117k chars) "curriculum/gen_math_sdo/SDO_Navotas_Gen.Math_SHS_1stSem.FV.pdf": { "subject": "General Mathematics", "subjectId": "gen-math", "type": "sdo_module", "content_domain": "general", "quarter": 1, "storage_path": "curriculum/gen_math_sdo/SDO_Navotas_Gen.Math_SHS_1stSem.FV.pdf", }, # General Mathematics Q2 — Interest & Annuities modules (~27-35 pages each) "curriculum/general_math/genmath_q2_mod1_simpleandcompoundinterests_v2.pdf": { "subject": "General Mathematics", "subjectId": "gen-math", "type": "sdo_module", "content_domain": "general", "quarter": 2, "storage_path": "curriculum/general_math/genmath_q2_mod1_simpleandcompoundinterests_v2.pdf", }, "curriculum/general_math/genmath_q2_mod2_interestmaturityfutureandpresentvaluesinsimpleandcompoundinterests_v2.pdf": { "subject": "General Mathematics", "subjectId": "gen-math", "type": "sdo_module", "content_domain": "general", "quarter": 2, "storage_path": "curriculum/general_math/genmath_q2_mod2_interestmaturityfutureandpresentvaluesinsimpleandcompoundinterests_v2.pdf", }, "curriculum/general_math/genmath_q2_mod3_SolvingProblemsInvolvingSimpleandCompoundInterest_v2.pdf": { "subject": "General Mathematics", "subjectId": "gen-math", "type": "sdo_module", "content_domain": "general", "quarter": 2, "storage_path": "curriculum/general_math/genmath_q2_mod3_SolvingProblemsInvolvingSimpleandCompoundInterest_v2.pdf", }, "curriculum/general_math/genmath_q2_mod4_simpleandgeneralannuities_v2.pdf": { "subject": "General Mathematics", "subjectId": "gen-math", "type": "sdo_module", "content_domain": "general", "quarter": 2, "storage_path": "curriculum/general_math/genmath_q2_mod4_simpleandgeneralannuities_v2.pdf", }, # Statistics and Probability — Full textbook (331 pages, ~607k chars) "curriculum/stat_prob/Full.pdf": { "subject": "Statistics and Probability", "subjectId": "stats-prob", "type": "sdo_module", "content_domain": "statistics", "quarter": 1, "storage_path": "curriculum/stat_prob/Full.pdf", }, }