"""
Firebase Storage PDF loader for curriculum ingestion.
Downloads PDFs from Firebase Storage and extracts text for ChromaDB indexing.
"""
from __future__ import annotations

import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# Module-level logger; the name is a fixed string rather than derived from __name__.
logger = logging.getLogger("mathpulse.fb_storage_loader")
# Process-wide flag written by _init_firebase_storage(): set to True once a
# Firebase app has been found or initialized in this process.
_FIREBASE_INITIALIZED = False
def _init_firebase_storage() -> Tuple[Optional[Any], Optional[Any]]:
    """Return the ``firebase_admin.storage`` module and a bucket handle.

    The default Firebase app is reused when one already exists in this
    process; otherwise one is initialized. Credentials are resolved in order:

    1. ``FIREBASE_SERVICE_ACCOUNT_JSON`` -- service-account JSON as a string.
    2. ``FIREBASE_SERVICE_ACCOUNT_FILE`` -- path to an existing key file.
    3. Application Default Credentials.

    The bucket name is read from ``FIREBASE_STORAGE_BUCKET`` and falls back to
    the hard-coded project bucket.

    Returns:
        ``(storage_module, bucket)`` on success, or ``(None, None)`` when
        ``firebase_admin`` cannot be imported, app initialization fails, or no
        bucket can be resolved. Failures are logged as warnings and reported
        through the return value.
    """
    global _FIREBASE_INITIALIZED

    try:
        import firebase_admin
        from firebase_admin import credentials, storage
    except ImportError:
        logger.warning("firebase_admin not installed")
        _FIREBASE_INITIALIZED = False
        return None, None

    bucket_name = os.getenv(
        "FIREBASE_STORAGE_BUCKET", "mathpulse-ai-2026.firebasestorage.app"
    )

    try:
        # Public check for the *default* app. The private ``_apps`` registry is
        # also truthy when only named apps exist, in which case
        # ``storage.bucket()`` cannot work and initialization must still run.
        firebase_admin.get_app()
    except ValueError:
        try:
            sa_json = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON")
            sa_file = os.getenv("FIREBASE_SERVICE_ACCOUNT_FILE")
            if sa_json:
                import json as _json

                creds = credentials.Certificate(_json.loads(sa_json))
            elif sa_file and Path(sa_file).exists():
                creds = credentials.Certificate(sa_file)
            else:
                if sa_file:
                    # A configured-but-missing key file is almost certainly a
                    # deployment mistake; say so instead of silently falling back.
                    logger.warning(
                        "FIREBASE_SERVICE_ACCOUNT_FILE does not exist: %s; "
                        "falling back to application default credentials",
                        sa_file,
                    )
                creds = credentials.ApplicationDefault()
            firebase_admin.initialize_app(creds, {"storageBucket": bucket_name})
        except Exception as e:
            logger.warning("Firebase init failed: %s", e)
            _FIREBASE_INITIALIZED = False
            return None, None

    _FIREBASE_INITIALIZED = True

    try:
        try:
            bucket = storage.bucket()
        except ValueError:
            # The app was initialized elsewhere without a ``storageBucket``
            # option; fall back to the explicitly configured bucket name.
            bucket = storage.bucket(bucket_name)
    except Exception as e:
        logger.warning("Firebase storage bucket unavailable: %s", e)
        return None, None
    return storage, bucket
def download_pdf_from_storage(storage_path: str, dest_path: Optional[str] = None) -> Optional[bytes]:
    """Fetch one object from Firebase Storage and hand back its raw bytes.

    Args:
        storage_path: Object name inside the bucket.
        dest_path: Optional local file path. When truthy, the bytes are also
            written there and missing parent directories are created.

    Returns:
        The object's bytes, or ``None`` when storage is unavailable, the blob
        does not exist, or any step (including the local write) raises.
    """
    bucket = _init_firebase_storage()[1]
    if bucket is None:
        logger.warning("Firebase Storage not available, skipping download")
        return None

    try:
        remote = bucket.blob(storage_path)
        if not remote.exists():
            logger.warning("Blob does not exist: %s", storage_path)
            return None

        payload = remote.download_as_bytes()
        logger.info("Downloaded %s (%d bytes)", storage_path, len(payload))

        if dest_path:
            target = Path(dest_path)
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_bytes(payload)
            logger.info("Saved to %s", dest_path)
    except Exception as exc:
        # Best-effort contract: every failure is logged and reported as None.
        logger.error("Failed to download %s: %s", storage_path, exc)
        return None
    return payload
def list_curriculum_blobs(prefix: str = "curriculum/") -> List[Dict[str, Any]]:
    """List the PDF objects stored under ``prefix`` in Firebase Storage.

    Args:
        prefix: Object-name prefix passed to ``bucket.list_blobs``.

    Returns:
        One dict per object whose name ends in ``.pdf`` (case-insensitive),
        with keys ``name``, ``size``, ``updated`` (stringified, or ``None``
        when the blob reports no update time) and ``download_url``. An empty
        list is returned when Firebase Storage is unavailable.
    """
    # Function-scope import keeps this fix independent of the file's import block.
    from urllib.parse import quote

    _, bucket = _init_firebase_storage()
    if bucket is None:
        return []

    result: List[Dict[str, Any]] = []
    for blob in bucket.list_blobs(prefix=prefix):
        # Case-insensitive so objects uploaded as "X.PDF" are not dropped.
        if not blob.name.lower().endswith(".pdf"):
            continue
        # Percent-encode the object name so spaces, '#', '?' and similar
        # characters yield a valid URL; '/' is kept as the path separator.
        encoded_name = quote(blob.name, safe="/~")
        result.append({
            "name": blob.name,
            "size": blob.size,
            "updated": str(blob.updated) if blob.updated else None,
            # NOTE(review): presumably only resolves for publicly readable
            # objects -- confirm against the bucket's access rules.
            "download_url": f"https://storage.googleapis.com/{bucket.name}/{encoded_name}",
        })
    return result
# NOTE: Curriculum guide PDFs (shaping papers) are stored in Firebase Storage
# for system reference but are NOT included in RAG ingestion because they
# contain only learning objectives and course descriptions — insufficient
# content for lesson generation (typically <10 chunks each).
#
# Only SDO teaching modules (full lesson content with examples and problems)
# are included in the RAG pipeline.
# Registry of the PDFs known to this module, keyed by Firebase Storage object
# path. Each entry repeats its own key under "storage_path"; keep the two in
# sync when adding entries.
PDF_METADATA: Dict[str, dict] = {
    # General Mathematics Q1 — SDO Navotas teaching module (100 pages, ~117k chars)
    "curriculum/gen_math_sdo/SDO_Navotas_Gen.Math_SHS_1stSem.FV.pdf": {
        "subject": "General Mathematics",
        "subjectId": "gen-math",
        "type": "sdo_module",
        "content_domain": "general",
        "quarter": 1,
        "storage_path": "curriculum/gen_math_sdo/SDO_Navotas_Gen.Math_SHS_1stSem.FV.pdf",
    },
    # General Mathematics Q2 — Interest & Annuities modules (~27-35 pages each)
    "curriculum/general_math/genmath_q2_mod1_simpleandcompoundinterests_v2.pdf": {
        "subject": "General Mathematics",
        "subjectId": "gen-math",
        "type": "sdo_module",
        "content_domain": "general",
        "quarter": 2,
        "storage_path": "curriculum/general_math/genmath_q2_mod1_simpleandcompoundinterests_v2.pdf",
    },
    "curriculum/general_math/genmath_q2_mod2_interestmaturityfutureandpresentvaluesinsimpleandcompoundinterests_v2.pdf": {
        "subject": "General Mathematics",
        "subjectId": "gen-math",
        "type": "sdo_module",
        "content_domain": "general",
        "quarter": 2,
        "storage_path": "curriculum/general_math/genmath_q2_mod2_interestmaturityfutureandpresentvaluesinsimpleandcompoundinterests_v2.pdf",
    },
    "curriculum/general_math/genmath_q2_mod3_SolvingProblemsInvolvingSimpleandCompoundInterest_v2.pdf": {
        "subject": "General Mathematics",
        "subjectId": "gen-math",
        "type": "sdo_module",
        "content_domain": "general",
        "quarter": 2,
        "storage_path": "curriculum/general_math/genmath_q2_mod3_SolvingProblemsInvolvingSimpleandCompoundInterest_v2.pdf",
    },
    "curriculum/general_math/genmath_q2_mod4_simpleandgeneralannuities_v2.pdf": {
        "subject": "General Mathematics",
        "subjectId": "gen-math",
        "type": "sdo_module",
        "content_domain": "general",
        "quarter": 2,
        "storage_path": "curriculum/general_math/genmath_q2_mod4_simpleandgeneralannuities_v2.pdf",
    },
    # Statistics and Probability — Full textbook (331 pages, ~607k chars)
    # NOTE(review): tagged "sdo_module" / quarter 1 although described as a
    # full textbook — confirm these values are intended.
    "curriculum/stat_prob/Full.pdf": {
        "subject": "Statistics and Probability",
        "subjectId": "stats-prob",
        "type": "sdo_module",
        "content_domain": "statistics",
        "quarter": 1,
        "storage_path": "curriculum/stat_prob/Full.pdf",
    },
}