""" backend/app/services/sparse_encoder.py BM25 sparse encoder backed by FastEmbed's Qdrant/bm25 model. Used at ingestion time (ingest.py) and at query time (retrieve node). The model downloads a ~5 MB vocabulary file on first use. Subsequent calls are fully local. The module-level singleton is loaded lazily on first call to avoid startup delay in the API Space. Fallback: if fastembed is not installed, encode() returns empty sparse vectors so dense-only retrieval continues working unchanged. """ from __future__ import annotations import logging from typing import Any, Optional logger = logging.getLogger(__name__) _model: Optional[Any] = None _fastembed_available: Optional[bool] = None def _get_model() -> Optional[Any]: global _model, _fastembed_available # noqa: PLW0603 if _fastembed_available is False: return None if _model is not None: return _model try: from fastembed import SparseTextEmbedding # type: ignore[import] _model = SparseTextEmbedding(model_name="Qdrant/bm25") _fastembed_available = True logger.info("FastEmbed BM25 sparse encoder loaded (Qdrant/bm25).") return _model except Exception as exc: _fastembed_available = False logger.warning( "FastEmbed not available — sparse retrieval disabled, falling back to dense-only. (%s)", exc, ) return None class SparseEncoder: """ Wraps FastEmbed SparseTextEmbedding to produce BM25 sparse vectors. Returns list of (indices, values) tuples — one per input text. If FastEmbed is unavailable, returns empty ([], []) tuples so callers can gracefully skip sparse indexing without breaking the ingestion pipeline. """ def encode(self, texts: list[str]) -> list[tuple[list[int], list[float]]]: """Encode a batch of texts. Returns [(indices, values), ...] per text.""" if not texts: return [] model = _get_model() if model is None: return [([], []) for _ in texts] try: results = [] for emb in model.embed(texts): # fastembed SparseEmbedding exposes .indices and .values as numpy arrays. results.append((emb.indices.tolist(), emb.values.tolist())) return results except Exception as exc: logger.warning("BM25 encoding failed (%s); returning empty sparse vectors.", exc) return [([], []) for _ in texts] def encode_one(self, text: str) -> tuple[list[int], list[float]]: """Convenience wrapper for a single string.""" return self.encode([text])[0] @property def available(self) -> bool: """True if FastEmbed loaded successfully and sparse encoding is active.""" return _get_model() is not None