from __future__ import annotations

import os
from pathlib import Path
from threading import Lock
from typing import Any, Dict, Tuple

import chromadb
from sentence_transformers import SentenceTransformer

# Guards lazy construction of the process-wide singleton across threads.
_VECTORSTORE_LOCK = Lock()
# Cached (chroma client, collection, embedder) triple; None until first built.
_VECTORSTORE_SINGLETON: Tuple[Any, Any, SentenceTransformer] | None = None


def reset_vectorstore_singleton() -> None:
    """Drop the cached vector-store components.

    The next call to ``get_vectorstore_components`` will rebuild the
    client, collection, and embedder from scratch.
    """
    global _VECTORSTORE_SINGLETON
    with _VECTORSTORE_LOCK:
        _VECTORSTORE_SINGLETON = None


def _resolve_vectorstore_dir() -> Path:
    raw = os.getenv("CURRICULUM_VECTORSTORE_DIR", "datasets/vectorstore")
    path = Path(raw)
    if path.is_absolute():
        return path

    cwd_candidate = Path.cwd() / path
    if cwd_candidate.exists() or str(Path.cwd()).endswith("MATHPULSE-AI"):
        return cwd_candidate

    backend_candidate = Path(__file__).resolve().parents[2] / path
    return backend_candidate


def get_vectorstore_components(
    collection_name: str = "curriculum_chunks",
    model_name: str = "BAAI/bge-base-en-v1.5",
):
    """Return the shared ``(client, collection, embedder)`` triple.

    Built lazily on first call using double-checked locking; later calls
    return the cached triple. NOTE: because the result is cached, the
    ``collection_name`` / ``model_name`` arguments only take effect on the
    call that actually constructs the singleton.
    """
    global _VECTORSTORE_SINGLETON
    # Fast path: already built — no lock needed.
    if _VECTORSTORE_SINGLETON is not None:
        return _VECTORSTORE_SINGLETON
    with _VECTORSTORE_LOCK:
        # Re-check under the lock: another thread may have built it first.
        if _VECTORSTORE_SINGLETON is None:
            store_dir = _resolve_vectorstore_dir()
            store_dir.mkdir(parents=True, exist_ok=True)
            chroma_client = chromadb.PersistentClient(path=str(store_dir))
            chunk_collection = chroma_client.get_or_create_collection(
                name=collection_name,
                metadata={"hnsw:space": "cosine"},
            )
            embedding_model = SentenceTransformer(model_name)
            _VECTORSTORE_SINGLETON = (chroma_client, chunk_collection, embedding_model)
    return _VECTORSTORE_SINGLETON


def get_vectorstore_health() -> Dict[str, Any]:
    """Summarize the vector store: chunk count, per-subject counts, directory.

    Returns a dict with keys ``chunkCount`` (total stored ids), ``subjects``
    (subject label -> chunk count, ``"unknown"`` for missing subjects), and
    ``vectorstoreDir`` (resolved storage path as a string).
    """
    _, collection, _ = get_vectorstore_components()
    snapshot = collection.get(include=["metadatas"])

    per_subject: Dict[str, int] = {}
    for entry in snapshot.get("metadatas") or []:
        # Skip malformed metadata entries rather than crashing the health check.
        if isinstance(entry, dict):
            label = str(entry.get("subject") or "unknown")
            per_subject[label] = per_subject.get(label, 0) + 1

    return {
        "chunkCount": len(snapshot.get("ids") or []),
        "subjects": per_subject,
        "vectorstoreDir": str(_resolve_vectorstore_dir()),
    }