Spaces:
Running
Running
"""
backend/app/services/sparse_encoder.py

BM25 sparse encoder backed by FastEmbed's Qdrant/bm25 model.
Used at ingestion time (ingest.py) and at query time (retrieve node).

The model downloads a ~5 MB vocabulary file on first use. Subsequent calls
are fully local. The module-level singleton is loaded lazily on first call
to avoid startup delay in the API Space.

Fallback: if fastembed is not installed (or the model fails to load),
encode() returns empty sparse vectors so dense-only retrieval continues
working unchanged.
"""
| from __future__ import annotations | |
| import logging | |
| from typing import Any, Optional | |
# Logger for this module, named after the module itself.
logger = logging.getLogger(__name__)

# Tri-state flag maintained by _get_model():
#   None  -> no load attempt has happened yet
#   True  -> the model was loaded successfully
#   False -> a load attempt failed; _get_model() returns None from then on
_fastembed_available: Optional[bool] = None

# Cached model instance; stays None until _get_model() creates it.
_model: Optional[Any] = None
def _get_model() -> Optional[Any]:
    """Return the shared BM25 model, creating it on the first call.

    The instance is cached in the module-level ``_model``. If importing
    fastembed or constructing the model raises, ``_fastembed_available`` is
    set to ``False`` and this call, plus every later one, returns ``None``
    without retrying.

    Returns:
        The cached ``SparseTextEmbedding`` instance, or ``None`` when the
        model could not be loaded.
    """
    global _model, _fastembed_available  # noqa: PLW0603

    # A previous attempt already failed: do not try again.
    if _fastembed_available is False:
        return None

    if _model is None:
        try:
            # Imported here so the module itself imports without fastembed.
            from fastembed import SparseTextEmbedding  # type: ignore[import]

            _model = SparseTextEmbedding(model_name="Qdrant/bm25")
            _fastembed_available = True
            logger.info("FastEmbed BM25 sparse encoder loaded (Qdrant/bm25).")
        except Exception as load_error:
            # Remember the failure so callers fall back to dense-only retrieval.
            _fastembed_available = False
            logger.warning(
                "FastEmbed not available — sparse retrieval disabled, falling back to dense-only. (%s)",
                load_error,
            )
            return None

    return _model
class SparseEncoder:
    """
    Produce BM25 sparse vectors via the shared FastEmbed model.

    Each input text maps to one (indices, values) tuple. When the model is
    unavailable, or encoding raises, every text maps to an empty ([], [])
    tuple so callers can skip sparse indexing instead of failing.
    """

    def encode(self, texts: list[str]) -> list[tuple[list[int], list[float]]]:
        """Encode a batch of texts into [(indices, values), ...], one per text.

        Returns an empty list for empty input, and one ([], []) tuple per
        text when the model is unavailable or encoding fails.
        """
        if not texts:
            return []

        bm25 = _get_model()
        if bm25 is None:
            return [([], []) for _ in texts]

        try:
            # fastembed SparseEmbedding exposes .indices and .values as numpy
            # arrays; convert both to plain Python lists.
            return [
                (embedding.indices.tolist(), embedding.values.tolist())
                for embedding in bm25.embed(texts)
            ]
        except Exception as exc:
            logger.warning("BM25 encoding failed (%s); returning empty sparse vectors.", exc)
            return [([], []) for _ in texts]

    def encode_one(self, text: str) -> tuple[list[int], list[float]]:
        """Encode a single string and return its (indices, values) tuple."""
        vectors = self.encode([text])
        return vectors[0]

    def available(self) -> bool:
        """Report whether the model loaded; may trigger the load itself."""
        loaded_model = _get_model()
        return loaded_model is not None