from __future__ import annotations import os from pathlib import Path from threading import Lock from typing import Any, Dict, Tuple import chromadb from sentence_transformers import SentenceTransformer _VECTORSTORE_LOCK = Lock() _VECTORSTORE_SINGLETON: Tuple[Any, Any, SentenceTransformer] | None = None def reset_vectorstore_singleton() -> None: global _VECTORSTORE_SINGLETON with _VECTORSTORE_LOCK: _VECTORSTORE_SINGLETON = None def _resolve_vectorstore_dir() -> Path: raw = os.getenv("CURRICULUM_VECTORSTORE_DIR", "datasets/vectorstore") path = Path(raw) if path.is_absolute(): return path cwd_candidate = Path.cwd() / path if cwd_candidate.exists() or str(Path.cwd()).endswith("MATHPULSE-AI"): return cwd_candidate backend_candidate = Path(__file__).resolve().parents[2] / path return backend_candidate def get_vectorstore_components( collection_name: str = "curriculum_chunks", model_name: str = "BAAI/bge-base-en-v1.5", ): global _VECTORSTORE_SINGLETON if _VECTORSTORE_SINGLETON is None: with _VECTORSTORE_LOCK: if _VECTORSTORE_SINGLETON is None: vectorstore_dir = _resolve_vectorstore_dir() vectorstore_dir.mkdir(parents=True, exist_ok=True) client = chromadb.PersistentClient(path=str(vectorstore_dir)) collection = client.get_or_create_collection( name=collection_name, metadata={"hnsw:space": "cosine"}, ) embedder = SentenceTransformer(model_name) _VECTORSTORE_SINGLETON = (client, collection, embedder) return _VECTORSTORE_SINGLETON def get_vectorstore_health() -> Dict[str, Any]: _, collection, _ = get_vectorstore_components() payload = collection.get(include=["metadatas"]) metadatas = payload.get("metadatas") or [] subjects: Dict[str, int] = {} for md in metadatas: if not isinstance(md, dict): continue subject = str(md.get("subject") or "unknown") subjects[subject] = subjects.get(subject, 0) + 1 return { "chunkCount": len(payload.get("ids") or []), "subjects": subjects, "vectorstoreDir": str(_resolve_vectorstore_dir()), }