Spaces:
Running
Running
| """ | |
| Upload DepEd curriculum PDFs to Firebase Storage and index them into ChromaDB. | |
| Run once during initial setup: python scripts/upload_curriculum_pdfs.py {upload,index,both} | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import sys | |
| from pathlib import Path | |
| from typing import Dict, List | |
| sys.path.insert(0, str(Path(__file__).resolve().parents[1])) | |
# Directory that holds the source PDFs. Set the CURRICULUM_PDF_DIR environment
# variable to point somewhere else; the fallback is the original hard-coded
# Windows folder, which only exists on the author's machine.
LOCAL_PDF_DIR = (
    os.getenv("CURRICULUM_PDF_DIR", "").strip()
    or r"C:\Users\Deign\Downloads\Documents"
)
# Catalogue of the curriculum PDFs this script knows about. Each row is
# (filename, subject, type, strand, quarters, storage_path); the rows are
# turned into the filename-keyed PDF_METADATA mapping below.
_PDF_ROWS = [
    (
        "GENERAL-MATHEMATICS-1.pdf",
        "General Mathematics",
        "curriculum_guide",
        ["STEM", "ABM", "HUMSS", "GAS", "TVL"],
        ["Q1", "Q2", "Q3", "Q4"],
        "curriculum/general_math/GENERAL-MATHEMATICS-1.pdf",
    ),
    (
        "Finite-Mathematics-1-1.pdf",
        "Finite Mathematics 1",
        "curriculum_guide",
        ["STEM", "ABM"],
        ["Q1", "Q2"],
        "curriculum/finite_math/Finite-Mathematics-1-1.pdf",
    ),
    (
        "Finite-Mathematics-2-1.pdf",
        "Finite Mathematics 2",
        "curriculum_guide",
        ["STEM", "ABM"],
        ["Q1", "Q2"],
        "curriculum/finite_math/Finite-Mathematics-2-1.pdf",
    ),
    (
        "SDO_Navotas_Gen.Math_SHS_1stSem.FV.pdf",
        "General Mathematics",
        "sdo_module",
        ["STEM", "ABM", "HUMSS", "GAS", "TVL"],
        ["Q1", "Q2"],
        "curriculum/gen_math_sdo/SDO_Navotas_Gen.Math_SHS_1stSem.FV.pdf",
    ),
    (
        "SDO_Navotas_Bus.Math_SHS_1stSem.FV.pdf",
        "Business Mathematics",
        "sdo_module",
        ["ABM"],
        ["Q1", "Q2"],
        "curriculum/business_math/SDO_Navotas_Bus.Math_SHS_1stSem.FV.pdf",
    ),
    (
        "SDO_Navotas_SHS_ABM_OrgAndMngt_FirstSem_FV.pdf",
        "Organization and Management",
        "sdo_module",
        ["ABM"],
        ["Q1", "Q2"],
        "curriculum/org_mgmt/SDO_Navotas_SHS_ABM_OrgAndMngt_FirstSem_FV.pdf",
    ),
    (
        "SDO_Navotas_STAT_PROB_SHS_1stSem_FV.pdf",
        "Statistics and Probability",
        "sdo_module",
        ["STEM", "ABM"],
        ["Q1", "Q2"],
        "curriculum/stat_prob/SDO_Navotas_STAT_PROB_SHS_1stSem_FV.pdf",
    ),
]

# Field names for the values that follow the filename in each row, in order.
_PDF_FIELDS = ("subject", "type", "strand", "quarters", "storage_path")

# filename -> {"subject", "type", "strand", "quarters", "storage_path"}
PDF_METADATA: Dict[str, Dict[str, object]] = {
    pdf_name: dict(zip(_PDF_FIELDS, values)) for pdf_name, *values in _PDF_ROWS
}
def chunk_text(text: str, chunk_size: int = 600, overlap: int = 100) -> List[str]:
    """Split text into overlapping, whitespace-delimited word chunks.

    Args:
        text: Text to split; words are separated on any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order, each a single-space-joined string of
        at most ``chunk_size`` words. Empty or whitespace-only text yields
        an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap that is not smaller than the chunk size would never
            advance through the text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )

    words = text.split()
    # Each chunk starts `step` words after the previous one, so consecutive
    # chunks share exactly `overlap` words.
    step = chunk_size - overlap
    return [
        " ".join(words[start : start + chunk_size])
        for start in range(0, len(words), step)
    ]
def upload_pdfs():
    """Upload the PDFs listed in ``PDF_METADATA`` to Firebase Storage.

    For every entry whose file exists under ``LOCAL_PDF_DIR`` and that has no
    ``curriculumDocs/<filename>`` document in Firestore yet, the file is
    uploaded to its ``storage_path`` and a Firestore document describing it
    is written with ``indexed`` set to ``False``.

    Prerequisites: the ``firebase-admin`` package, a ``serviceAccountKey.json``
    in the parent of this script's directory, and the
    ``FIREBASE_STORAGE_BUCKET`` environment variable. A missing prerequisite
    is reported on stdout and the function returns without raising.
    Per-file failures are printed and do not stop the remaining uploads.
    """
    try:
        import firebase_admin
        from firebase_admin import credentials, storage, firestore
    except ImportError:
        print("ERROR: firebase-admin not installed. Run: pip install firebase-admin")
        return

    service_account_path = Path(__file__).resolve().parents[1] / "serviceAccountKey.json"
    if not service_account_path.exists():
        print(f"ERROR: Service account key not found at {service_account_path}")
        return

    bucket_name = os.getenv("FIREBASE_STORAGE_BUCKET", "").strip()
    if not bucket_name:
        print("ERROR: FIREBASE_STORAGE_BUCKET not set in environment")
        return

    # initialize_app() raises ValueError when the default app already exists
    # (e.g. this function is called twice in one process), so reuse it.
    try:
        firebase_admin.get_app()
    except ValueError:
        cred = credentials.Certificate(str(service_account_path))
        firebase_admin.initialize_app(cred, {"storageBucket": bucket_name})

    # Name the bucket explicitly so a pre-existing app that was initialised
    # without a storageBucket option still resolves to the right bucket.
    bucket = storage.bucket(bucket_name)
    db = firestore.client()

    print(f"Scanning: {LOCAL_PDF_DIR}")
    print("-" * 50)
    uploaded = 0
    skipped = 0
    failed = 0
    for filename, meta in PDF_METADATA.items():
        local_path = Path(LOCAL_PDF_DIR) / filename
        if not local_path.exists():
            print(f"[SKIP] {filename} not found in {LOCAL_PDF_DIR}")
            skipped += 1
            continue

        doc_ref = db.collection("curriculumDocs").document(filename)
        try:
            # The existence check is a network call too; keep it inside the
            # try so one failing file does not abort the whole batch.
            if doc_ref.get().exists:
                print(f"[SKIP] {filename} already uploaded")
                skipped += 1
                continue

            blob = bucket.blob(meta["storage_path"])
            blob.upload_from_filename(str(local_path), content_type="application/pdf")
            doc_ref.set(
                {
                    "filename": filename,
                    "subject": meta["subject"],
                    "type": meta["type"],
                    "strand": meta["strand"],
                    "quarters": meta["quarters"],
                    "storage_path": meta["storage_path"],
                    "uploaded_at": firestore.SERVER_TIMESTAMP,
                    "indexed": False,
                }
            )
            print(f"[OK] Uploaded {filename}")
            uploaded += 1
        except Exception as e:
            print(f"[ERROR] {filename}: {e}")
            failed += 1

    print("-" * 50)
    print(f"Upload complete: {uploaded} uploaded, {skipped} skipped")
    if failed:
        print(f"[WARN] {failed} file(s) failed to upload")
def index_pdfs():
    """Extract text from the local PDFs, chunk and embed it, and store it in ChromaDB.

    For each entry in ``PDF_METADATA`` found under ``LOCAL_PDF_DIR``, the PDF
    text is split with ``chunk_text``, each chunk is embedded with the
    ``BAAI/bge-base-en-v1.5`` sentence-transformer (normalised embeddings) and
    added to the persistent ``curriculum_chunks`` collection under the id
    ``<filename>_chunk_<i>``. Chunks whose id is already present are skipped.

    The ChromaDB location comes from ``CHROMA_PERSIST_PATH`` (default
    ``./datasets/vectorstore``). When a Firestore client is available, files
    whose ``curriculumDocs`` document is already marked ``indexed`` are
    skipped, and the flag is set after a successful run; otherwise indexing
    proceeds without that bookkeeping.

    Missing dependencies are reported on stdout and the function returns
    without raising. Per-file failures are printed and do not stop the
    remaining files.
    """
    try:
        from pypdf import PdfReader
        import chromadb
        from sentence_transformers import SentenceTransformer
        from firebase_admin import firestore
    except ImportError:
        print("ERROR: Missing dependencies. Run: pip install pypdf chromadb sentence-transformers firebase-admin")
        return

    chroma_path = os.getenv("CHROMA_PERSIST_PATH", "./datasets/vectorstore")
    chroma_client = chromadb.PersistentClient(path=chroma_path)
    collection = chroma_client.get_or_create_collection(
        name="curriculum_chunks",
        metadata={"hnsw:space": "cosine"},
    )
    embedder = SentenceTransformer("BAAI/bge-base-en-v1.5")

    # Firestore bookkeeping is best-effort: client() fails when no default
    # Firebase app has been initialised in this process (for example when only
    # the "index" action is run). Say so instead of silently dropping it.
    try:
        db = firestore.client()
    except Exception as exc:
        print(f"[WARN] Firestore unavailable ({exc}); 'indexed' flags will not be read or updated")
        db = None

    print(f"Indexing PDFs from: {LOCAL_PDF_DIR}")
    print("-" * 50)
    indexed = 0
    skipped = 0
    for filename, meta in PDF_METADATA.items():
        # Stays None unless the file has an existing curriculumDocs document.
        doc_ref = None
        try:
            if db:
                candidate = db.collection("curriculumDocs").document(filename)
                snapshot = candidate.get()
                if snapshot.exists:
                    doc_ref = candidate
                    # to_dict() is None for a missing document, hence the
                    # exists guard above and the `or {}` fallback here.
                    if (snapshot.to_dict() or {}).get("indexed", False):
                        print(f"[SKIP] {filename} already indexed")
                        skipped += 1
                        continue

            local_path = Path(LOCAL_PDF_DIR) / filename
            if not local_path.exists():
                print(f"[SKIP] {filename} not found")
                skipped += 1
                continue

            reader = PdfReader(str(local_path))
            full_text = "\n".join(page.extract_text() or "" for page in reader.pages)
            if not full_text.strip():
                print(f"[WARN] {filename} has no extractable text")
                continue

            chunks = chunk_text(full_text)
            print(f"[INFO] {filename} -> {len(chunks)} chunks")

            # Everything except chunk_index is identical for all chunks of a
            # file, so build it once.
            base_metadata = {
                "source_file": filename,
                "subject": meta["subject"],
                "strand": ",".join(meta["strand"]),
                "quarter": ",".join(meta["quarters"]),
                "type": meta["type"],
            }
            for i, chunk in enumerate(chunks):
                chunk_id = f"{filename}_chunk_{i}"
                existing = collection.get(ids=[chunk_id])
                if existing and existing.get("ids"):
                    continue
                chunk_embedding = embedder.encode(
                    chunk,
                    normalize_embeddings=True,
                ).tolist()
                collection.add(
                    embeddings=[chunk_embedding],
                    documents=[chunk],
                    metadatas=[{**base_metadata, "chunk_index": i}],
                    ids=[chunk_id],
                )

            if doc_ref is not None:
                doc_ref.update({"indexed": True})
            elif db:
                # update() fails on a missing document, and creating one here
                # would make the upload step treat the file as already
                # uploaded, so only report it.
                print(f"[WARN] {filename} has no curriculumDocs entry; 'indexed' flag not recorded")
            print(f"[OK] Indexed {filename}")
            indexed += 1
        except Exception as e:
            print(f"[ERROR] {filename}: {e}")

    print("-" * 50)
    print(f"Indexing complete: {indexed} indexed, {skipped} skipped")
    print(f"Total chunks in ChromaDB: {collection.count()}")
if __name__ == "__main__":
    import argparse

    # Steps to run for each CLI action, in execution order.
    steps_by_action = {
        "upload": (upload_pdfs,),
        "index": (index_pdfs,),
        "both": (upload_pdfs, index_pdfs),
    }

    cli = argparse.ArgumentParser(description="Upload and index DepEd curriculum PDFs")
    cli.add_argument("action", choices=["upload", "index", "both"], help="Action to perform")
    requested = cli.parse_args().action

    for run_step in steps_by_action[requested]:
        run_step()