# mathpulse-api-v3test / scripts / download_vectorstore_from_firebase.py
# github-actions[bot]
# 🚀 Auto-deploy backend from GitHub (c322c5f)
# 36e0175
"""
Download vectorstore directory from Firebase Storage at container startup.
Run: python /app/scripts/download_vectorstore_from_firebase.py
"""
from __future__ import annotations

import json
import logging
import os
import sys
from pathlib import Path
from typing import Any
# Module-level logger. When the file is run as a script, logging is configured
# by the entry point at the bottom of this file.
logger = logging.getLogger("mathpulse.download_vectorstore")

# Default object-name prefix in the bucket whose blobs are mirrored locally.
REMOTE_PREFIX = "vectorstore/"

# Process-wide flag: True once _init_firebase() has found or created a Firebase
# app, so later calls can go straight to fetching the bucket.
_FIREBASE_INITIALIZED = False
def _init_firebase() -> Any | None:
    """Return the default Firebase Storage bucket, initializing the SDK if needed.

    On the first call the default ``firebase_admin`` app is either adopted (if
    something else in the process already created it) or created here. The
    credentials used for creation are resolved in this order:

    1. ``FIREBASE_SERVICE_ACCOUNT_JSON`` - service-account key as a JSON string.
    2. ``FIREBASE_SERVICE_ACCOUNT_FILE`` - path to a key file, if that file exists.
    3. Application Default Credentials.

    The bucket name is read from ``FIREBASE_STORAGE_BUCKET`` and falls back to
    ``mathpulse-ai-2026.firebasestorage.app``.

    Returns:
        The object returned by ``firebase_admin.storage.bucket()``, or ``None``
        when the SDK is not installed, initialization fails, or the bucket
        cannot be obtained. Failures are logged and never raised.
    """
    global _FIREBASE_INITIALIZED

    # Fast path: an app is already known, so just hand out the bucket again.
    if _FIREBASE_INITIALIZED:
        try:
            from firebase_admin import storage as fb_storage

            return fb_storage.bucket()
        except Exception as e:
            logger.warning("Firebase storage unavailable: %s", e)
            # Forget the cached state so the next call runs the full setup again.
            _FIREBASE_INITIALIZED = False
            return None

    # Imported lazily so a missing SDK degrades to "no bucket" instead of
    # breaking the import of this module.
    try:
        import firebase_admin
        from firebase_admin import credentials, storage
    except ImportError:
        logger.warning("firebase_admin not installed")
        return None

    # Probe for an existing *default* app through the public API: get_app()
    # raises ValueError when it does not exist. The private ``_apps`` registry
    # is also non-empty when only named apps exist, which would wrongly skip
    # initialization and leave storage.bucket() without a default app.
    try:
        firebase_admin.get_app()
        default_app_exists = True
    except ValueError:
        default_app_exists = False

    if default_app_exists:
        _FIREBASE_INITIALIZED = True
        try:
            return storage.bucket()
        except Exception as e:
            logger.warning("Firebase storage bucket unavailable: %s", e)
            return None

    sa_json = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON")
    sa_file = os.getenv("FIREBASE_SERVICE_ACCOUNT_FILE")
    bucket_name = os.getenv("FIREBASE_STORAGE_BUCKET", "mathpulse-ai-2026.firebasestorage.app")

    # Best-effort boundary: any credential, JSON, or SDK error is logged and
    # reported to the caller as "no bucket" rather than propagated.
    try:
        if sa_json:
            creds = credentials.Certificate(json.loads(sa_json))
        elif sa_file and Path(sa_file).exists():
            creds = credentials.Certificate(sa_file)
        else:
            if sa_file:
                # A configured but missing key file is almost certainly a
                # deployment mistake; make the silent fallback visible.
                logger.warning(
                    "FIREBASE_SERVICE_ACCOUNT_FILE does not exist: %s; "
                    "falling back to application default credentials",
                    sa_file,
                )
            creds = credentials.ApplicationDefault()
        firebase_admin.initialize_app(creds, {"storageBucket": bucket_name})
        _FIREBASE_INITIALIZED = True
        return storage.bucket()
    except Exception as e:
        logger.error("Firebase init failed: %s", e)
        return None
def _resolve_dest_dir() -> Path:
    """Pick the local directory that receives the vectorstore files.

    The first non-empty value among ``CURRICULUM_VECTORSTORE_DIR`` and
    ``VECTORSTORE_DIR`` (checked in that order) wins; when neither is set the
    container default ``/app/datasets/vectorstore`` is used.
    """
    for env_name in ("CURRICULUM_VECTORSTORE_DIR", "VECTORSTORE_DIR"):
        configured = os.getenv(env_name)
        # An empty string counts as "not configured", same as an unset variable.
        if configured:
            return Path(configured)
    return Path("/app/datasets/vectorstore")
def download_vectorstore(dest_dir: Path, prefix: str = REMOTE_PREFIX) -> bool:
    """Mirror every blob under *prefix* in Firebase Storage into *dest_dir*.

    The part of each blob name after *prefix* becomes its path relative to
    *dest_dir*. A blob is skipped when a local file of the same size already
    exists; note this is a size-only check, so a same-size content change is
    not detected.

    Args:
        dest_dir: Local directory to populate; created if missing.
        prefix: Object-name prefix selecting the blobs to download.

    Returns:
        ``True`` if at least one blob was listed and none failed to download;
        ``False`` if storage is unavailable, listing fails, nothing is found
        under *prefix*, or any individual download fails.
    """
    bucket = _init_firebase()
    if bucket is None:
        logger.warning("Firebase Storage not available, vectorstore download skipped")
        return False

    dest_dir.mkdir(parents=True, exist_ok=True)

    # Listing hits the network; report a failure the same way as every other
    # failure in this function (log + False) instead of an uncaught traceback.
    try:
        blobs = list(bucket.list_blobs(prefix=prefix))
    except Exception as e:
        logger.error("Failed to list blobs under prefix %s: %s", prefix, e)
        return False

    if not blobs:
        logger.warning("No blobs found under prefix: %s", prefix)
        return False

    downloaded = 0
    skipped = 0
    errors = 0
    for blob in blobs:
        relative_path = blob.name[len(prefix):].lstrip("/")
        # Skip the prefix object itself and "folder" placeholder objects (names
        # ending in "/"). Writing a placeholder as a regular file would make
        # the later mkdir for its children fail with FileExistsError.
        if not relative_path or relative_path.endswith("/"):
            continue

        local_path = dest_dir / relative_path
        try:
            # Inside the try so one unusable path counts as a single error
            # rather than aborting the remaining downloads.
            local_path.parent.mkdir(parents=True, exist_ok=True)
            if local_path.exists() and blob.size is not None and local_path.stat().st_size == blob.size:
                logger.info("Skipped (already up-to-date): %s", blob.name)
                skipped += 1
                continue
            blob.download_to_filename(str(local_path))
            logger.info("Downloaded: %s (%d bytes)", blob.name, blob.size or 0)
            downloaded += 1
        except Exception as e:
            logger.error("Failed to download %s: %s", blob.name, e)
            errors += 1

    logger.info("Download complete: %d downloaded, %d skipped, %d errors", downloaded, skipped, errors)
    return errors == 0
def _main() -> None:
    """Script entry point: print the effective configuration, then download.

    Diagnostics go to stdout via ``print``; only the length of the
    service-account JSON is shown, never its contents.
    """
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    dest_dir = _resolve_dest_dir()
    print(f"INFO: Using vectorstore destination: {dest_dir}")
    print(f"INFO: CURRICULUM_VECTORSTORE_DIR env: {os.environ.get('CURRICULUM_VECTORSTORE_DIR', 'not set')}")
    print(f"INFO: VECTORSTORE_DIR env: {os.environ.get('VECTORSTORE_DIR', 'not set')}")
    print(f"INFO: FIREBASE_STORAGE_BUCKET env: {os.environ.get('FIREBASE_STORAGE_BUCKET', 'not set')}")
    print(f"INFO: FIREBASE_SERVICE_ACCOUNT_JSON length: {len(os.environ.get('FIREBASE_SERVICE_ACCOUNT_JSON', ''))}")

    succeeded = download_vectorstore(dest_dir, REMOTE_PREFIX)
    # NOTE(review): the process exits with status 0 even on failure -
    # presumably so container startup is not blocked; confirm this is intended.
    print(
        "SUCCESS: Vectorstore download completed"
        if succeeded
        else "FAILURE: Vectorstore download failed"
    )


if __name__ == "__main__":
    _main()