Spaces:

Deign86
/

mathpulse-api-v3test

Running

mathpulse-api-v3test / scripts /upload_curriculum_pdfs.py

github-actions[bot]

🚀 Auto-deploy backend from GitHub (93e7c2a)

92bfe31 6 days ago

8.84 kB

	"""
	Upload DepEd curriculum PDFs to Firebase Storage.
	Run once during initial setup: python scripts/upload_curriculum_pdfs.py {upload,index,both}
	"""

	from __future__ import annotations

	import os
	import sys
	from pathlib import Path
	from typing import Dict, List

	# Put the directory one level above this script's folder at the front of the
	# import path (presumably the project root, so project modules resolve when
	# the script is run directly — TODO confirm; the visible code imports nothing
	# from it).
	sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

	# Directory that is scanned for the source PDFs.
	# The default is the folder the script was originally written against; set the
	# CURRICULUM_PDF_DIR environment variable to run the script on another machine
	# without editing this file. A blank value falls back to the default.
	_DEFAULT_PDF_DIR = r"C:\Users\Deign\Downloads\Documents"
	LOCAL_PDF_DIR = os.getenv("CURRICULUM_PDF_DIR", "").strip() or _DEFAULT_PDF_DIR

	# Catalogue of the PDFs this script handles, keyed by local file name.
	# Each entry carries:
	#   subject      - subject title stored with the document and its chunks
	#   type         - "curriculum_guide" or "sdo_module"
	#   strand       - list of strand codes the document applies to
	#   quarters     - list of quarter codes the document covers
	#   storage_path - destination object path inside the storage bucket
	PDF_METADATA: Dict[str, Dict[str, object]] = {
	    "GENERAL-MATHEMATICS-1.pdf": {
	        "subject": "General Mathematics",
	        "type": "curriculum_guide",
	        "strand": ["STEM", "ABM", "HUMSS", "GAS", "TVL"],
	        "quarters": ["Q1", "Q2", "Q3", "Q4"],
	        "storage_path": "curriculum/general_math/GENERAL-MATHEMATICS-1.pdf",
	    },
	    "Finite-Mathematics-1-1.pdf": {
	        "subject": "Finite Mathematics 1",
	        "type": "curriculum_guide",
	        "strand": ["STEM", "ABM"],
	        "quarters": ["Q1", "Q2"],
	        "storage_path": "curriculum/finite_math/Finite-Mathematics-1-1.pdf",
	    },
	    "Finite-Mathematics-2-1.pdf": {
	        "subject": "Finite Mathematics 2",
	        "type": "curriculum_guide",
	        "strand": ["STEM", "ABM"],
	        "quarters": ["Q1", "Q2"],
	        "storage_path": "curriculum/finite_math/Finite-Mathematics-2-1.pdf",
	    },
	    "SDO_Navotas_Gen.Math_SHS_1stSem.FV.pdf": {
	        "subject": "General Mathematics",
	        "type": "sdo_module",
	        "strand": ["STEM", "ABM", "HUMSS", "GAS", "TVL"],
	        "quarters": ["Q1", "Q2"],
	        "storage_path": "curriculum/gen_math_sdo/SDO_Navotas_Gen.Math_SHS_1stSem.FV.pdf",
	    },
	    "SDO_Navotas_Bus.Math_SHS_1stSem.FV.pdf": {
	        "subject": "Business Mathematics",
	        "type": "sdo_module",
	        "strand": ["ABM"],
	        "quarters": ["Q1", "Q2"],
	        "storage_path": "curriculum/business_math/SDO_Navotas_Bus.Math_SHS_1stSem.FV.pdf",
	    },
	    "SDO_Navotas_SHS_ABM_OrgAndMngt_FirstSem_FV.pdf": {
	        "subject": "Organization and Management",
	        "type": "sdo_module",
	        "strand": ["ABM"],
	        "quarters": ["Q1", "Q2"],
	        "storage_path": "curriculum/org_mgmt/SDO_Navotas_SHS_ABM_OrgAndMngt_FirstSem_FV.pdf",
	    },
	    "SDO_Navotas_STAT_PROB_SHS_1stSem_FV.pdf": {
	        "subject": "Statistics and Probability",
	        "type": "sdo_module",
	        "strand": ["STEM", "ABM"],
	        "quarters": ["Q1", "Q2"],
	        "storage_path": "curriculum/stat_prob/SDO_Navotas_STAT_PROB_SHS_1stSem_FV.pdf",
	    },
	}


	def chunk_text(text: str, chunk_size: int = 600, overlap: int = 100) -> List[str]:
	    """Split text into overlapping chunks of whitespace-separated words.

	    Words are taken in windows of ``chunk_size``; each window starts
	    ``chunk_size - overlap`` words after the previous one, so consecutive
	    chunks share ``overlap`` words. Words inside a chunk are re-joined with
	    single spaces, so the original whitespace is not preserved.

	    Args:
	        text: Text to split. Empty or whitespace-only text yields ``[]``.
	        chunk_size: Maximum number of words per chunk; must be positive.
	        overlap: Number of words shared by consecutive chunks; must satisfy
	            ``0 <= overlap < chunk_size``.

	    Returns:
	        The chunks in document order. The last chunk may be shorter than
	        ``chunk_size``.

	    Raises:
	        ValueError: If ``chunk_size`` or ``overlap`` is out of range. A
	            non-positive step would otherwise never advance through the text.
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be positive")
	    if not 0 <= overlap < chunk_size:
	        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

	    words = text.split()
	    step = chunk_size - overlap
	    chunks: List[str] = []
	    for start in range(0, len(words), step):
	        chunks.append(" ".join(words[start : start + chunk_size]))
	        # Once a window reaches the last word, any further window would be a
	        # pure suffix of this one, i.e. duplicate content. Stop here.
	        if start + chunk_size >= len(words):
	            break
	    return chunks


	def upload_pdfs():
	    """Upload PDFs from local directory to Firebase Storage.

	    For every entry in ``PDF_METADATA`` the file is read from
	    ``LOCAL_PDF_DIR``, uploaded to its ``storage_path`` in the bucket named by
	    the ``FIREBASE_STORAGE_BUCKET`` environment variable, and recorded as a
	    ``curriculumDocs/<filename>`` Firestore document with ``indexed`` set to
	    ``False``. Files that are missing locally, or that already have a
	    Firestore document, are skipped.

	    Progress and the final summary are printed. The function returns ``None``;
	    if firebase-admin, ``serviceAccountKey.json`` (looked up one directory
	    above this script's folder) or the bucket variable is missing, it prints
	    an error and returns early. A failure on one file is reported and counted
	    without stopping the remaining files.
	    """
	    try:
	        import firebase_admin
	        from firebase_admin import credentials, storage, firestore
	    except ImportError:
	        print("ERROR: firebase-admin not installed. Run: pip install firebase-admin")
	        return

	    service_account_path = Path(__file__).resolve().parents[1] / "serviceAccountKey.json"
	    if not service_account_path.exists():
	        print(f"ERROR: Service account key not found at {service_account_path}")
	        return

	    bucket_name = os.getenv("FIREBASE_STORAGE_BUCKET", "").strip()
	    if not bucket_name:
	        print("ERROR: FIREBASE_STORAGE_BUCKET not set in environment")
	        return

	    # initialize_app() raises ValueError when the default app already exists
	    # (e.g. this function is called twice in one process), so reuse it if so.
	    try:
	        firebase_admin.get_app()
	    except ValueError:
	        cred = credentials.Certificate(str(service_account_path))
	        firebase_admin.initialize_app(cred, {"storageBucket": bucket_name})

	    # Name the bucket explicitly so a reused app that was initialised without
	    # a "storageBucket" option still resolves to the right bucket.
	    bucket = storage.bucket(bucket_name)
	    db = firestore.client()

	    print(f"Scanning: {LOCAL_PDF_DIR}")
	    print("-" * 50)

	    uploaded = 0
	    skipped = 0
	    failed = 0

	    for filename, meta in PDF_METADATA.items():
	        local_path = Path(LOCAL_PDF_DIR) / filename

	        if not local_path.exists():
	            print(f"[SKIP] {filename} not found in {LOCAL_PDF_DIR}")
	            skipped += 1
	            continue

	        try:
	            # The existence check is a network call; keep it inside the
	            # per-file handler so one failed lookup does not abort the run.
	            doc_ref = db.collection("curriculumDocs").document(filename)
	            if doc_ref.get().exists:
	                print(f"[SKIP] {filename} already uploaded")
	                skipped += 1
	                continue

	            blob = bucket.blob(meta["storage_path"])
	            blob.upload_from_filename(str(local_path), content_type="application/pdf")

	            # Written only after the upload succeeded, so the document's
	            # existence can serve as the "already uploaded" marker above.
	            doc_ref.set(
	                {
	                    "filename": filename,
	                    "subject": meta["subject"],
	                    "type": meta["type"],
	                    "strand": meta["strand"],
	                    "quarters": meta["quarters"],
	                    "storage_path": meta["storage_path"],
	                    "uploaded_at": firestore.SERVER_TIMESTAMP,
	                    "indexed": False,
	                }
	            )

	            print(f"[OK] Uploaded {filename}")
	            uploaded += 1
	        except Exception as e:
	            print(f"[ERROR] {filename}: {e}")
	            failed += 1

	    print("-" * 50)
	    print(f"Upload complete: {uploaded} uploaded, {skipped} skipped, {failed} failed")


	def index_pdfs():
	    """Extract text from PDFs, chunk, embed, and store in ChromaDB.

	    For every entry in ``PDF_METADATA`` the file is read from
	    ``LOCAL_PDF_DIR``, its page text is concatenated and split with
	    ``chunk_text``, and each chunk is embedded with the
	    ``BAAI/bge-base-en-v1.5`` sentence-transformer and added to the
	    ``curriculum_chunks`` collection of a persistent ChromaDB store located at
	    ``CHROMA_PERSIST_PATH`` (default ``./datasets/vectorstore``). Chunk ids
	    are ``<filename>_chunk_<i>``; ids already present in the collection are
	    not re-embedded.

	    Firestore tracking is best-effort. When a Firestore client is available
	    and a ``curriculumDocs/<filename>`` document exists, its ``indexed`` flag
	    is used to skip finished files and is set to ``True`` after a successful
	    run. When the client or the document is missing, the file is still
	    indexed and no flag is written.

	    Progress and the final summary are printed. The function returns ``None``;
	    if a required package is missing it prints an error and returns early. A
	    failure on one file is reported and counted without stopping the rest.
	    """
	    try:
	        from pypdf import PdfReader
	        import chromadb
	        from sentence_transformers import SentenceTransformer
	        from firebase_admin import firestore
	    except ImportError:
	        print("ERROR: Missing dependencies. Run: pip install pypdf chromadb sentence-transformers firebase-admin")
	        return

	    chroma_path = os.getenv("CHROMA_PERSIST_PATH", "./datasets/vectorstore")

	    chroma_client = chromadb.PersistentClient(path=chroma_path)
	    collection = chroma_client.get_or_create_collection(
	        name="curriculum_chunks",
	        metadata={"hnsw:space": "cosine"},
	    )
	    embedder = SentenceTransformer("BAAI/bge-base-en-v1.5")

	    # Indexing must still work without Firestore (for example when no Firebase
	    # app has been initialised in this process), but say so instead of
	    # silently dropping the progress tracking.
	    try:
	        db = firestore.client()
	    except Exception as exc:
	        print(f"[WARN] Firestore unavailable; indexed flags will not be read or updated: {exc}")
	        db = None

	    print(f"Indexing PDFs from: {LOCAL_PDF_DIR}")
	    print("-" * 50)

	    indexed = 0
	    skipped = 0
	    failed = 0

	    for filename, meta in PDF_METADATA.items():
	        try:
	            # doc_ref stays None unless the tracking document really exists.
	            # to_dict() returns None for a missing document and update() fails
	            # on one. Creating the document here instead would make the upload
	            # step treat the file as already uploaded.
	            doc_ref = None
	            if db is not None:
	                candidate = db.collection("curriculumDocs").document(filename)
	                snapshot = candidate.get()
	                if snapshot.exists:
	                    doc_ref = candidate
	                    if (snapshot.to_dict() or {}).get("indexed", False):
	                        print(f"[SKIP] {filename} already indexed")
	                        skipped += 1
	                        continue

	            local_path = Path(LOCAL_PDF_DIR) / filename
	            if not local_path.exists():
	                print(f"[SKIP] {filename} not found")
	                skipped += 1
	                continue

	            reader = PdfReader(str(local_path))
	            # extract_text() may yield nothing for a page; treat that as "".
	            full_text = "\n".join(page.extract_text() or "" for page in reader.pages)

	            if not full_text.strip():
	                print(f"[WARN] {filename} has no extractable text")
	                skipped += 1
	                continue

	            chunks = chunk_text(full_text)
	            print(f"[INFO] {filename} -> {len(chunks)} chunks")

	            # The list-valued metadata is stored as comma-separated strings.
	            strand_csv = ",".join(meta["strand"])
	            quarter_csv = ",".join(meta["quarters"])

	            for i, chunk in enumerate(chunks):
	                chunk_id = f"{filename}_chunk_{i}"

	                # Skip chunks stored by an earlier (possibly interrupted) run.
	                existing = collection.get(ids=[chunk_id])
	                if existing and existing.get("ids"):
	                    continue

	                chunk_embedding = embedder.encode(
	                    chunk,
	                    normalize_embeddings=True,
	                ).tolist()

	                collection.add(
	                    embeddings=[chunk_embedding],
	                    documents=[chunk],
	                    metadatas=[
	                        {
	                            "source_file": filename,
	                            "subject": meta["subject"],
	                            "strand": strand_csv,
	                            "quarter": quarter_csv,
	                            "chunk_index": i,
	                            "type": meta["type"],
	                        }
	                    ],
	                    ids=[chunk_id],
	                )

	            if doc_ref is not None:
	                doc_ref.update({"indexed": True})

	            print(f"[OK] Indexed {filename}")
	            indexed += 1
	        except Exception as e:
	            print(f"[ERROR] {filename}: {e}")
	            failed += 1

	    print("-" * 50)
	    print(f"Indexing complete: {indexed} indexed, {skipped} skipped, {failed} failed")
	    print(f"Total chunks in ChromaDB: {collection.count()}")


	if __name__ == "__main__":
	    import argparse

	    # Command-line entry point: one required positional "action" argument.
	    cli = argparse.ArgumentParser(description="Upload and index DepEd curriculum PDFs")
	    cli.add_argument("action", choices=["upload", "index", "both"], help="Action to perform")
	    requested = cli.parse_args().action

	    # Steps are listed in execution order; "both" selects every step, so the
	    # upload always runs before the indexing.
	    for step_name, run_step in (("upload", upload_pdfs), ("index", index_pdfs)):
	        if requested == step_name or requested == "both":
	            run_step()