""" Upload DepEd curriculum PDFs to Firebase Storage. Run once during initial setup: python scripts/upload_curriculum_pdfs.py """ from __future__ import annotations import os import sys from pathlib import Path from typing import Dict, List sys.path.insert(0, str(Path(__file__).resolve().parents[1])) LOCAL_PDF_DIR = r"C:\Users\Deign\Downloads\Documents" PDF_METADATA: Dict[str, Dict[str, object]] = { "GENERAL-MATHEMATICS-1.pdf": { "subject": "General Mathematics", "type": "curriculum_guide", "strand": ["STEM", "ABM", "HUMSS", "GAS", "TVL"], "quarters": ["Q1", "Q2", "Q3", "Q4"], "storage_path": "curriculum/general_math/GENERAL-MATHEMATICS-1.pdf", }, "Finite-Mathematics-1-1.pdf": { "subject": "Finite Mathematics 1", "type": "curriculum_guide", "strand": ["STEM", "ABM"], "quarters": ["Q1", "Q2"], "storage_path": "curriculum/finite_math/Finite-Mathematics-1-1.pdf", }, "Finite-Mathematics-2-1.pdf": { "subject": "Finite Mathematics 2", "type": "curriculum_guide", "strand": ["STEM", "ABM"], "quarters": ["Q1", "Q2"], "storage_path": "curriculum/finite_math/Finite-Mathematics-2-1.pdf", }, "SDO_Navotas_Gen.Math_SHS_1stSem.FV.pdf": { "subject": "General Mathematics", "type": "sdo_module", "strand": ["STEM", "ABM", "HUMSS", "GAS", "TVL"], "quarters": ["Q1", "Q2"], "storage_path": "curriculum/gen_math_sdo/SDO_Navotas_Gen.Math_SHS_1stSem.FV.pdf", }, "SDO_Navotas_Bus.Math_SHS_1stSem.FV.pdf": { "subject": "Business Mathematics", "type": "sdo_module", "strand": ["ABM"], "quarters": ["Q1", "Q2"], "storage_path": "curriculum/business_math/SDO_Navotas_Bus.Math_SHS_1stSem.FV.pdf", }, "SDO_Navotas_SHS_ABM_OrgAndMngt_FirstSem_FV.pdf": { "subject": "Organization and Management", "type": "sdo_module", "strand": ["ABM"], "quarters": ["Q1", "Q2"], "storage_path": "curriculum/org_mgmt/SDO_Navotas_SHS_ABM_OrgAndMngt_FirstSem_FV.pdf", }, "SDO_Navotas_STAT_PROB_SHS_1stSem_FV.pdf": { "subject": "Statistics and Probability", "type": "sdo_module", "strand": ["STEM", "ABM"], "quarters": ["Q1", "Q2"], "storage_path": "curriculum/stat_prob/SDO_Navotas_STAT_PROB_SHS_1stSem_FV.pdf", }, } def chunk_text(text: str, chunk_size: int = 600, overlap: int = 100) -> List[str]: """Split text into overlapping chunks.""" words = text.split() chunks: List[str] = [] i = 0 while i < len(words): chunk = " ".join(words[i : i + chunk_size]) chunks.append(chunk) i += chunk_size - overlap return chunks def upload_pdfs(): """Upload PDFs from local directory to Firebase Storage.""" try: import firebase_admin from firebase_admin import credentials, storage, firestore except ImportError: print("ERROR: firebase-admin not installed. Run: pip install firebase-admin") return service_account_path = Path(__file__).resolve().parents[1] / "serviceAccountKey.json" if not service_account_path.exists(): print(f"ERROR: Service account key not found at {service_account_path}") return bucket_name = os.getenv("FIREBASE_STORAGE_BUCKET", "").strip() if not bucket_name: print("ERROR: FIREBASE_STORAGE_BUCKET not set in environment") return cred = credentials.Certificate(str(service_account_path)) firebase_admin.initialize_app(cred, {"storageBucket": bucket_name}) bucket = storage.bucket() db = firestore.client() print(f"Scanning: {LOCAL_PDF_DIR}") print("-" * 50) uploaded = 0 skipped = 0 for filename, meta in PDF_METADATA.items(): local_path = Path(LOCAL_PDF_DIR) / filename if not local_path.exists(): print(f"[SKIP] {filename} not found in {LOCAL_PDF_DIR}") skipped += 1 continue doc_ref = db.collection("curriculumDocs").document(filename) if doc_ref.get().exists: print(f"[SKIP] {filename} already uploaded") skipped += 1 continue try: blob = bucket.blob(meta["storage_path"]) blob.upload_from_filename(str(local_path), content_type="application/pdf") doc_ref.set( { "filename": filename, "subject": meta["subject"], "type": meta["type"], "strand": meta["strand"], "quarters": meta["quarters"], "storage_path": meta["storage_path"], "uploaded_at": firestore.SERVER_TIMESTAMP, "indexed": False, } ) print(f"[OK] Uploaded {filename}") uploaded += 1 except Exception as e: print(f"[ERROR] {filename}: {e}") print("-" * 50) print(f"Upload complete: {uploaded} uploaded, {skipped} skipped") def index_pdfs(): """Extract text from PDFs, chunk, embed, and store in ChromaDB.""" try: from pypdf import PdfReader import chromadb from sentence_transformers import SentenceTransformer from firebase_admin import firestore except ImportError: print("ERROR: Missing dependencies. Run: pip install pypdf chromadb sentence-transformers firebase-admin") return chroma_path = os.getenv("CHROMA_PERSIST_PATH", "./datasets/vectorstore") chroma_client = chromadb.PersistentClient(path=chroma_path) collection = chroma_client.get_or_create_collection( name="curriculum_chunks", metadata={"hnsw:space": "cosine"}, ) embedder = SentenceTransformer("BAAI/bge-base-en-v1.5") try: import firebase_admin from firebase_admin import firestore as FS db = FS.client() except Exception: db = None print(f"Indexing PDFs from: {LOCAL_PDF_DIR}") print("-" * 50) indexed = 0 skipped = 0 for filename, meta in PDF_METADATA.items(): if db: doc_ref = db.collection("curriculumDocs").document(filename) doc = doc_ref.get() if doc and doc.to_dict().get("indexed", False): print(f"[SKIP] {filename} already indexed") skipped += 1 continue local_path = Path(LOCAL_PDF_DIR) / filename if not local_path.exists(): print(f"[SKIP] {filename} not found") skipped += 1 continue try: reader = PdfReader(str(local_path)) full_text = "\n".join(page.extract_text() or "" for page in reader.pages) if not full_text.strip(): print(f"[WARN] {filename} has no extractable text") continue chunks = chunk_text(full_text) print(f"[INFO] {filename} -> {len(chunks)} chunks") for i, chunk in enumerate(chunks): chunk_id = f"{filename}_chunk_{i}" existing = collection.get(ids=[chunk_id]) if existing and existing.get("ids"): continue chunk_embedding = embedder.encode( chunk, normalize_embeddings=True, ).tolist() collection.add( embeddings=[chunk_embedding], documents=[chunk], metadatas=[ { "source_file": filename, "subject": meta["subject"], "strand": ",".join(meta["strand"]), "quarter": ",".join(meta["quarters"]), "chunk_index": i, "type": meta["type"], } ], ids=[chunk_id], ) if db: doc_ref.update({"indexed": True}) print(f"[OK] Indexed {filename}") indexed += 1 except Exception as e: print(f"[ERROR] {filename}: {e}") print("-" * 50) print(f"Indexing complete: {indexed} indexed, {skipped} skipped") print(f"Total chunks in ChromaDB: {collection.count()}") if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Upload and index DepEd curriculum PDFs") parser.add_argument("action", choices=["upload", "index", "both"], help="Action to perform") args = parser.parse_args() if args.action in ("upload", "both"): upload_pdfs() if args.action in ("index", "both"): index_pdfs()