Spaces:

1337XCode
/

personabot-api

Running

personabot-api / app /services /sparse_encoder.py

GitHub Actions

Deploy d8ad462

e7c9ee6 16 days ago

2.84 kB

	"""
	backend/app/services/sparse_encoder.py

	BM25 sparse encoder backed by FastEmbed's Qdrant/bm25 model.
	Used at ingestion time (ingest.py) and at query time (retrieve node).

	The model downloads a ~5 MB vocabulary file on first use. Subsequent calls
	are fully local. The module-level singleton is loaded lazily on first call
	to avoid startup delay in the API Space.

	Fallback: if fastembed is not installed, or the model fails to load or encode
	for any other reason, encode() returns empty sparse vectors so dense-only
	retrieval continues working unchanged.
	"""
	from __future__ import annotations

	import logging
	from typing import Any, Optional

	logger = logging.getLogger(__name__)

	_model: Optional[Any] = None
	_fastembed_available: Optional[bool] = None


	def _get_model() -> Optional[Any]:
	    """Return the shared FastEmbed BM25 model, creating it on first use.

	    The model instance is cached in the module-level ``_model``. The outcome
	    of the first load attempt is recorded in ``_fastembed_available`` so that
	    a failed load is never retried within the same process.

	    Returns:
	        The cached ``SparseTextEmbedding`` instance, or ``None`` when the
	        import or the model construction raised.
	    """
	    global _model, _fastembed_available  # noqa: PLW0603
	    if _fastembed_available is False:
	        # An earlier attempt already failed; skip the retry.
	        return None
	    if _model is not None:
	        return _model
	    try:
	        # Imported here rather than at module top so a missing fastembed
	        # package degrades to dense-only retrieval instead of breaking import.
	        from fastembed import SparseTextEmbedding  # type: ignore[import]

	        _model = SparseTextEmbedding(model_name="Qdrant/bm25")
	        _fastembed_available = True
	        logger.info("FastEmbed BM25 sparse encoder loaded (Qdrant/bm25).")
	        return _model
	    except Exception as exc:
	        # NOTE(review): this catches every failure, not only ImportError, so a
	        # one-off error while constructing the model also disables sparse
	        # encoding until the process restarts — confirm that is intended.
	        _fastembed_available = False
	        logger.warning(
	            "FastEmbed not available — sparse retrieval disabled, falling back to dense-only. (%s)",
	            exc,
	        )
	        return None


	class SparseEncoder:
	    """
	    Wraps FastEmbed SparseTextEmbedding to produce BM25 sparse vectors.

	    Returns list of (indices, values) tuples — one per input text. If FastEmbed
	    is unavailable, returns empty ([], []) tuples so callers can gracefully skip
	    sparse indexing without breaking the ingestion pipeline.

	    The class holds no per-instance state; the model comes from the
	    module-level ``_get_model()`` helper.
	    """

	    def encode(self, texts: list[str]) -> list[tuple[list[int], list[float]]]:
	        """Encode a batch of texts into sparse vectors.

	        Args:
	            texts: Strings to encode. An empty list short-circuits to ``[]``
	                without touching the model.

	        Returns:
	            ``[(indices, values), ...]``, one entry per embedding produced.
	            When the model is unavailable or encoding raises, one empty
	            ``([], [])`` pair per input text is returned instead.
	        """
	        if not texts:
	            return []
	        model = _get_model()
	        if model is None:
	            return [([], []) for _ in texts]
	        try:
	            # fastembed SparseEmbedding exposes .indices and .values as numpy
	            # arrays; .tolist() converts them to plain Python lists.
	            return [
	                (emb.indices.tolist(), emb.values.tolist())
	                for emb in model.embed(texts)
	            ]
	        except Exception as exc:
	            # Best-effort by design: an encoding failure degrades to empty
	            # sparse vectors rather than propagating to the caller.
	            logger.warning("BM25 encoding failed (%s); returning empty sparse vectors.", exc)
	            return [([], []) for _ in texts]

	    def encode_one(self, text: str) -> tuple[list[int], list[float]]:
	        """Encode a single string and return its (indices, values) pair."""
	        return self.encode([text])[0]

	    @property
	    def available(self) -> bool:
	        """True if FastEmbed loaded successfully and sparse encoding is active.

	        Reading this property calls ``_get_model()``, so the first access may
	        trigger the model load.
	        """
	        return _get_model() is not None