mathpulse-api-v3test / rag /firebase_storage_loader.py
github-actions[bot]
🚀 Auto-deploy backend from GitHub (a17792b)
8e2e5f4
"""
Firebase Storage PDF loader for curriculum ingestion.
Downloads PDFs from Firebase Storage and extracts text for ChromaDB indexing.
"""
from __future__ import annotations

import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import quote
# Module-level logger; the dotted name places its records under the "mathpulse" logger hierarchy.
logger = logging.getLogger("mathpulse.fb_storage_loader")
# Process-wide flag flipped to True by _init_firebase_storage() once a Firebase app
# has been found or created, so later calls can skip the setup work.
_FIREBASE_INITIALIZED = False
def _init_firebase_storage() -> Tuple[Any, Any]:
    """Return ``(storage_module, bucket)`` for Firebase Storage, initializing the SDK on demand.

    Credentials are resolved in this order:

    1. ``FIREBASE_SERVICE_ACCOUNT_JSON`` - inline service-account JSON.
    2. ``FIREBASE_SERVICE_ACCOUNT_FILE`` - path to a service-account file, if it exists.
    3. Application default credentials.

    The bucket name is read from ``FIREBASE_STORAGE_BUCKET`` and is only applied
    when this function creates the default app itself.

    Returns:
        ``(firebase_admin.storage, bucket)`` on success, or ``(None, None)`` when
        ``firebase_admin`` is not installed, initialization fails, or the bucket
        handle cannot be obtained. Failures are logged as warnings, never raised.
    """
    global _FIREBASE_INITIALIZED

    # Fast path: a previous call already found or created the app.
    if _FIREBASE_INITIALIZED:
        try:
            from firebase_admin import storage as fb_storage

            return fb_storage, fb_storage.bucket()
        except Exception as e:
            logger.warning("Firebase storage unavailable: %s", e)
            # Clear the flag so the next call re-checks whether an app still exists.
            _FIREBASE_INITIALIZED = False
            return None, None

    try:
        import firebase_admin
        from firebase_admin import credentials, storage
    except ImportError:
        logger.warning("firebase_admin not installed")
        return None, None

    # Reuse the default app if another module already created it. get_app() is
    # the public lookup and raises ValueError when the default app is missing;
    # unlike peeking at the private ``_apps`` registry, it is not fooled by
    # named-only apps, which storage.bucket() could not use anyway.
    try:
        firebase_admin.get_app()
    except ValueError:
        default_app_exists = False
    else:
        default_app_exists = True

    if default_app_exists:
        _FIREBASE_INITIALIZED = True
        try:
            return storage, storage.bucket()
        except Exception as e:
            logger.warning("Firebase storage bucket unavailable: %s", e)
            return None, None

    sa_json = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON")
    sa_file = os.getenv("FIREBASE_SERVICE_ACCOUNT_FILE")
    bucket_name = os.getenv("FIREBASE_STORAGE_BUCKET", "mathpulse-ai-2026.firebasestorage.app")

    try:
        if sa_json:
            import json as _json

            creds = credentials.Certificate(_json.loads(sa_json))
        elif sa_file and Path(sa_file).exists():
            creds = credentials.Certificate(sa_file)
        else:
            if sa_file:
                # A configured-but-missing key file is almost certainly a
                # deployment mistake; say so instead of falling through silently.
                logger.warning(
                    "FIREBASE_SERVICE_ACCOUNT_FILE not found: %s; "
                    "falling back to application default credentials",
                    sa_file,
                )
            creds = credentials.ApplicationDefault()

        firebase_admin.initialize_app(creds, {"storageBucket": bucket_name})
        _FIREBASE_INITIALIZED = True
        return storage, storage.bucket()
    except Exception as e:
        logger.warning("Firebase init failed: %s", e)
        return None, None
def download_pdf_from_storage(storage_path: str, dest_path: Optional[str] = None) -> Optional[bytes]:
    """Fetch one object from the Firebase Storage bucket and return its raw bytes.

    Args:
        storage_path: Object name inside the bucket.
        dest_path: Optional local path. When truthy, the downloaded bytes are
            also written there, creating parent directories as needed.

    Returns:
        The object's bytes, or ``None`` when storage is unavailable, the blob
        does not exist, or any step (download or local save) raises.
    """
    _storage_module, bucket = _init_firebase_storage()
    if bucket is None:
        logger.warning("Firebase Storage not available, skipping download")
        return None

    try:
        remote = bucket.blob(storage_path)
        if not remote.exists():
            logger.warning("Blob does not exist: %s", storage_path)
            return None

        payload = remote.download_as_bytes()
        logger.info("Downloaded %s (%d bytes)", storage_path, len(payload))

        if dest_path:
            target = Path(dest_path)
            target.parent.mkdir(parents=True, exist_ok=True)
            with target.open("wb") as out:
                out.write(payload)
            logger.info("Saved to %s", dest_path)
    except Exception as err:
        # Any failure, including the local write, is reported as a failed download.
        logger.error("Failed to download %s: %s", storage_path, err)
        return None

    return payload
def list_curriculum_blobs(prefix: str = "curriculum/") -> List[Dict[str, Any]]:
    """List the PDF objects stored under *prefix* in the Firebase Storage bucket.

    Args:
        prefix: Object-name prefix passed to the bucket listing.

    Returns:
        One dict per blob whose name ends in ``.pdf``, with keys ``name``,
        ``size``, ``updated`` (stringified timestamp, or ``None``) and
        ``download_url``. An empty list is returned when storage is
        unavailable or the listing fails; failures are logged, not raised.
    """
    _, bucket = _init_firebase_storage()
    if bucket is None:
        return []

    try:
        return [
            {
                "name": blob.name,
                "size": blob.size,
                "updated": str(blob.updated) if blob.updated else None,
                # Percent-encode the object name (keeping "/" separators) so names
                # containing spaces, "#", "?" or non-ASCII characters still yield
                # a valid URL.
                "download_url": (
                    f"https://storage.googleapis.com/{bucket.name}/{quote(blob.name)}"
                ),
            }
            # list_blobs() is consumed here, inside the try, so errors raised
            # while iterating are handled the same way as the initial call.
            for blob in bucket.list_blobs(prefix=prefix)
            if blob.name.endswith(".pdf")
        ]
    except Exception as e:
        # Match the module's log-and-degrade style instead of crashing the caller.
        logger.error("Failed to list blobs under %s: %s", prefix, e)
        return []
# NOTE: Curriculum guide PDFs (shaping papers) are stored in Firebase Storage
# for system reference but are NOT included in RAG ingestion because they
# contain only learning objectives and course descriptions — insufficient
# content for lesson generation (typically <10 chunks each).
#
# Only SDO teaching modules (full lesson content with examples and problems)
# are included in the RAG pipeline.
# Metadata for every PDF known to this module, keyed by its Firebase Storage path.
# Each row below is (storage path, subject, subjectId, content_domain, quarter);
# the comprehension expands it into the full record, copying the key into
# "storage_path" and tagging every entry as an "sdo_module".
PDF_METADATA: Dict[str, dict] = {
    storage_path: {
        "subject": subject,
        "subjectId": subject_id,
        "type": "sdo_module",
        "content_domain": content_domain,
        "quarter": quarter,
        "storage_path": storage_path,
    }
    for storage_path, subject, subject_id, content_domain, quarter in (
        # General Mathematics Q1 — SDO Navotas teaching module (100 pages, ~117k chars)
        (
            "curriculum/gen_math_sdo/SDO_Navotas_Gen.Math_SHS_1stSem.FV.pdf",
            "General Mathematics",
            "gen-math",
            "general",
            1,
        ),
        # General Mathematics Q2 — Interest & Annuities modules (~27-35 pages each)
        (
            "curriculum/general_math/genmath_q2_mod1_simpleandcompoundinterests_v2.pdf",
            "General Mathematics",
            "gen-math",
            "general",
            2,
        ),
        (
            "curriculum/general_math/genmath_q2_mod2_interestmaturityfutureandpresentvaluesinsimpleandcompoundinterests_v2.pdf",
            "General Mathematics",
            "gen-math",
            "general",
            2,
        ),
        (
            "curriculum/general_math/genmath_q2_mod3_SolvingProblemsInvolvingSimpleandCompoundInterest_v2.pdf",
            "General Mathematics",
            "gen-math",
            "general",
            2,
        ),
        (
            "curriculum/general_math/genmath_q2_mod4_simpleandgeneralannuities_v2.pdf",
            "General Mathematics",
            "gen-math",
            "general",
            2,
        ),
        # Statistics and Probability — Full textbook (331 pages, ~607k chars)
        (
            "curriculum/stat_prob/Full.pdf",
            "Statistics and Probability",
            "stats-prob",
            "statistics",
            1,
        ),
    )
}