"""
backend/app/services/sparse_encoder.py
BM25 sparse encoder backed by FastEmbed's Qdrant/bm25 model.
Used at ingestion time (ingest.py) and at query time (retrieve node).
The model downloads a ~5 MB vocabulary file on first use. Subsequent calls
are fully local. The module-level singleton is loaded lazily on first call
to avoid startup delay in the API Space.
Fallback: if fastembed is not installed, encode() returns empty sparse vectors
so dense-only retrieval continues working unchanged.
"""
from __future__ import annotations
import logging
from typing import Any, Optional
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Shared encoder instance; stays None until _get_model() loads it successfully.
_model: Optional[Any] = None
# Tri-state load flag set by _get_model():
#   None  -> no load attempt has completed yet
#   True  -> the model was loaded
#   False -> a load attempt failed; later calls return None without retrying
_fastembed_available: Optional[bool] = None
def _get_model() -> Optional[Any]:
    """Return the shared FastEmbed BM25 model, loading it on first use.

    The outcome of the first load attempt is remembered in module state:
    a successful load caches the model in ``_model``; any exception raised
    while importing or constructing it marks ``_fastembed_available`` as
    False, after which every call returns None without trying again.

    Returns:
        The cached ``SparseTextEmbedding`` instance, or None when it could
        not be loaded.
    """
    global _model, _fastembed_available  # noqa: PLW0603

    if _fastembed_available is False:
        # An earlier attempt already failed; stay in dense-only mode.
        return None

    if _model is None:
        try:
            # Imported here rather than at module top so a missing fastembed
            # package is handled by the except clause below.
            from fastembed import SparseTextEmbedding  # type: ignore[import]

            _model = SparseTextEmbedding(model_name="Qdrant/bm25")
            _fastembed_available = True
            logger.info("FastEmbed BM25 sparse encoder loaded (Qdrant/bm25).")
        except Exception as exc:
            _fastembed_available = False
            logger.warning(
                "FastEmbed not available — sparse retrieval disabled, falling back to dense-only. (%s)",
                exc,
            )
            return None

    return _model
class SparseEncoder:
    """
    BM25 sparse-vector encoder built on FastEmbed's SparseTextEmbedding.

    Every input text maps to one (indices, values) tuple. When the FastEmbed
    model cannot be obtained, or encoding raises, each text maps to an empty
    ([], []) tuple instead, so callers can skip sparse indexing without
    breaking the ingestion pipeline.
    """

    def encode(self, texts: list[str]) -> list[tuple[list[int], list[float]]]:
        """Encode a batch of texts into [(indices, values), ...], one per text.

        An empty batch returns [] without touching the model.
        """
        if not texts:
            return []

        bm25 = _get_model()
        if bm25 is not None:
            try:
                # fastembed SparseEmbedding exposes .indices and .values as
                # numpy arrays; .tolist() hands callers plain Python lists.
                return [
                    (vec.indices.tolist(), vec.values.tolist())
                    for vec in bm25.embed(texts)
                ]
            except Exception as exc:
                logger.warning("BM25 encoding failed (%s); returning empty sparse vectors.", exc)

        # Model unavailable or encoding failed: one empty vector per input text.
        return [([], []) for _ in texts]

    def encode_one(self, text: str) -> tuple[list[int], list[float]]:
        """Encode a single string and return its (indices, values) pair."""
        batch = self.encode([text])
        return batch[0]

    @property
    def available(self) -> bool:
        """True if the FastEmbed model could be obtained and sparse encoding is active.

        Reading this property calls _get_model(), so it may trigger the
        first (lazy) model load.
        """
        model = _get_model()
        return model is not None
|