File size: 4,334 Bytes
9594951
 
92bfe31
9594951
 
 
 
92bfe31
9594951
 
 
 
 
 
 
92bfe31
 
36e0175
 
92bfe31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9594951
 
 
92bfe31
9594951
 
 
 
 
 
 
 
 
 
 
 
92bfe31
9594951
 
 
 
 
 
 
 
 
 
 
92bfe31
 
 
 
9594951
 
 
 
 
 
 
92bfe31
9594951
 
 
 
 
92bfe31
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""
Download vectorstore directory from Firebase Storage at container startup.
Run: python /app/scripts/download_vectorstore_from_firebase.py
"""

from __future__ import annotations

import json
import logging
import os
import sys
from pathlib import Path
from typing import Any

# Module logger; its level and format are set by logging.basicConfig() in the
# __main__ entry point at the bottom of this file.
logger = logging.getLogger("mathpulse.download_vectorstore")

# Default object-name prefix passed to bucket.list_blobs(); it is stripped from
# each blob name to derive the path relative to the destination directory.
REMOTE_PREFIX = "vectorstore/"
# Process-wide flag maintained by _init_firebase(): True once a Firebase app has
# been initialized or detected, reset to False if fetching the bucket later fails.
_FIREBASE_INITIALIZED = False


def _init_firebase() -> Any | None:
    """Return the default Firebase Storage bucket, initializing the SDK if needed.

    Resolution order:

    1. If this module already initialized (or detected) a Firebase app, fetch
       the bucket again from it.
    2. If a default Firebase app already exists in this process, reuse it.
    3. Otherwise initialize a new default app. Credentials come from, in order
       of preference, ``FIREBASE_SERVICE_ACCOUNT_JSON`` (inline JSON),
       ``FIREBASE_SERVICE_ACCOUNT_FILE`` (path to an existing key file), or
       application-default credentials. The bucket name is read from
       ``FIREBASE_STORAGE_BUCKET``.

    Returns:
        The object returned by ``firebase_admin.storage.bucket()``, or ``None``
        when ``firebase_admin`` is not installed, initialization fails, or the
        bucket cannot be obtained. Failures are logged rather than raised.
    """
    global _FIREBASE_INITIALIZED

    if _FIREBASE_INITIALIZED:
        try:
            from firebase_admin import storage as fb_storage
            return fb_storage.bucket()
        except Exception as e:
            logger.warning("Firebase storage unavailable: %s", e)
            # Drop the cached flag so the next call re-runs full detection.
            _FIREBASE_INITIALIZED = False
            return None

    try:
        import firebase_admin
        from firebase_admin import credentials, storage
    except ImportError:
        logger.warning("firebase_admin not installed")
        return None

    # Ask the public API whether a default app exists instead of peeking at the
    # private ``firebase_admin._apps`` registry; get_app() raises ValueError
    # when the default app has not been initialized.
    try:
        firebase_admin.get_app()
    except ValueError:
        default_app_exists = False
    else:
        default_app_exists = True

    if default_app_exists:
        _FIREBASE_INITIALIZED = True
        try:
            return storage.bucket()
        except Exception as e:
            logger.warning("Firebase storage bucket unavailable: %s", e)
            return None

    sa_json = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON")
    sa_file = os.getenv("FIREBASE_SERVICE_ACCOUNT_FILE")
    bucket_name = os.getenv("FIREBASE_STORAGE_BUCKET", "mathpulse-ai-2026.firebasestorage.app")

    try:
        if sa_json:
            creds = credentials.Certificate(json.loads(sa_json))
        elif sa_file and Path(sa_file).exists():
            creds = credentials.Certificate(sa_file)
        else:
            if sa_file:
                # A configured-but-missing key file is almost always a
                # deployment mistake; make the silent fallback visible.
                logger.warning(
                    "FIREBASE_SERVICE_ACCOUNT_FILE does not exist: %s; "
                    "falling back to application default credentials",
                    sa_file,
                )
            creds = credentials.ApplicationDefault()

        firebase_admin.initialize_app(creds, {"storageBucket": bucket_name})
        _FIREBASE_INITIALIZED = True
        return storage.bucket()
    except Exception as e:
        # Covers malformed JSON, bad credentials and SDK initialization errors.
        logger.error("Firebase init failed: %s", e)
        return None


def _resolve_dest_dir() -> Path:
    """Pick the local directory the vectorstore should be downloaded into.

    The first non-empty value among the ``CURRICULUM_VECTORSTORE_DIR`` and
    ``VECTORSTORE_DIR`` environment variables (checked in that order) wins;
    when neither is set, ``/app/datasets/vectorstore`` is used.
    """
    for env_name in ("CURRICULUM_VECTORSTORE_DIR", "VECTORSTORE_DIR"):
        configured = os.getenv(env_name)
        # An empty string counts as "not set", same as a missing variable.
        if configured:
            return Path(configured)
    return Path("/app/datasets/vectorstore")


def download_vectorstore(dest_dir: Path, prefix: str = REMOTE_PREFIX) -> bool:
    """Mirror every object under *prefix* in Firebase Storage into *dest_dir*.

    Each blob name has *prefix* stripped and is written to the corresponding
    path below *dest_dir*. A blob is skipped when a local file of the same size
    already exists. Per-blob failures are logged and counted; they do not stop
    the remaining downloads.

    Args:
        dest_dir: Local directory to populate; created if missing.
        prefix: Object-name prefix to list and to strip from blob names.

    Returns:
        ``True`` when no blob failed. ``False`` when Firebase Storage is
        unavailable, the listing fails, no blobs exist under *prefix*, or at
        least one blob could not be downloaded.
    """
    bucket = _init_firebase()
    if bucket is None:
        logger.warning("Firebase Storage not available, vectorstore download skipped")
        return False

    dest_dir.mkdir(parents=True, exist_ok=True)

    # Listing hits the network; report a failure the same way as every other
    # error path here (log + False) instead of letting the exception escape.
    try:
        blobs = list(bucket.list_blobs(prefix=prefix))
    except Exception as e:
        logger.error("Failed to list blobs under prefix %s: %s", prefix, e)
        return False

    if not blobs:
        logger.warning("No blobs found under prefix: %s", prefix)
        return False

    downloaded = 0
    skipped = 0
    errors = 0

    for blob in blobs:
        # Names ending in "/" are folder placeholder objects, not files.
        # Downloading one would create a regular file where a directory is
        # needed, which then breaks mkdir() for every blob inside that folder.
        if blob.name.endswith("/"):
            continue

        relative_path = blob.name[len(prefix):].lstrip("/")
        if not relative_path:
            continue

        local_path = dest_dir / relative_path

        try:
            # Inside the try so a filesystem error affects only this blob.
            local_path.parent.mkdir(parents=True, exist_ok=True)

            # Size equality is the only freshness check available without
            # reading the file; a missing remote size forces a re-download.
            if local_path.exists() and blob.size is not None and local_path.stat().st_size == blob.size:
                logger.info("Skipped (already up-to-date): %s", blob.name)
                skipped += 1
                continue
            blob.download_to_filename(str(local_path))
            logger.info("Downloaded: %s (%d bytes)", blob.name, blob.size or 0)
            downloaded += 1
        except Exception as e:
            logger.error("Failed to download %s: %s", blob.name, e)
            errors += 1

    logger.info("Download complete: %d downloaded, %d skipped, %d errors", downloaded, skipped, errors)
    return errors == 0


if __name__ == "__main__":
    # Script entry point: configure logging, echo the effective configuration
    # to stdout, run the download and report the outcome.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    dest_dir = _resolve_dest_dir()
    print(f"INFO: Using vectorstore destination: {dest_dir}")
    for env_name in ("CURRICULUM_VECTORSTORE_DIR", "VECTORSTORE_DIR", "FIREBASE_STORAGE_BUCKET"):
        print(f"INFO: {env_name} env: {os.environ.get(env_name, 'not set')}")
    # Report only the length of the service-account JSON, not its contents.
    sa_json_length = len(os.environ.get("FIREBASE_SERVICE_ACCOUNT_JSON", ""))
    print(f"INFO: FIREBASE_SERVICE_ACCOUNT_JSON length: {sa_json_length}")
    result = download_vectorstore(dest_dir, REMOTE_PREFIX)
    print("SUCCESS: Vectorstore download completed" if result else "FAILURE: Vectorstore download failed")