Spaces:
Sleeping
Sleeping
| """ | |
| FastEmbed-based Code Embedding Server | |
| Optimized for CPU Basic (2 vCPU, 16GB RAM) | |
| Models: | |
| - Dense: jinaai/jina-embeddings-v2-small-en (512 dim) | |
| - Sparse: Qdrant/bm25 (BM25, 0.01GB) | |
| - Reranker: jinaai/jina-reranker-v1-turbo-en (0.13GB) | |
| """ | |
| import time | |
| import uuid | |
| from typing import Any, Literal | |
| import numpy as np | |
| from fastapi import FastAPI | |
| from pydantic import BaseModel, ConfigDict, Field | |
| from fastembed import TextEmbedding, SparseTextEmbedding | |
| from fastembed.rerank.cross_encoder import TextCrossEncoder | |
# Model names (Hugging Face repo identifiers, resolved/downloaded by fastembed).
DENSE_MODEL = "jinaai/jina-embeddings-v2-small-en"
SPARSE_MODEL = "Qdrant/bm25"
RERANKER_MODEL = "jinaai/jina-reranker-v1-turbo-en"

# Global model cache (loaded once on first use, reused for every request).
# NOTE(review): lazy init via plain module globals is not thread-safe; worst
# case under concurrent first requests is a redundant model load — confirm
# that is acceptable for this deployment.
_dense_model: TextEmbedding | None = None
_sparse_model: SparseTextEmbedding | None = None
_reranker_model: TextCrossEncoder | None = None

app = FastAPI(
    title="FastEmbed Code Embeddings",
    summary="CPU-optimized code embeddings with BM25 sparse and reranking",
    version="2.0.0",
)
def _get_dense_model() -> TextEmbedding:
    """Return the shared dense embedding model, loading it on first use."""
    global _dense_model
    if _dense_model is not None:
        return _dense_model
    _dense_model = TextEmbedding(model_name=DENSE_MODEL)
    return _dense_model
def _get_sparse_model() -> SparseTextEmbedding:
    """Return the shared sparse BM25 model, loading it on first use."""
    global _sparse_model
    if _sparse_model is not None:
        return _sparse_model
    _sparse_model = SparseTextEmbedding(model_name=SPARSE_MODEL)
    return _sparse_model
def _get_reranker() -> TextCrossEncoder:
    """Return the shared cross-encoder reranker, loading it on first use."""
    global _reranker_model
    if _reranker_model is not None:
        return _reranker_model
    _reranker_model = TextCrossEncoder(model_name=RERANKER_MODEL)
    return _reranker_model
| # ==================== Request Models ==================== | |
class EmbeddingRequest(BaseModel):
    """OpenAI-style request body for dense embeddings."""

    # Tolerate unknown fields so OpenAI-compatible clients don't get a 422.
    model_config = ConfigDict(extra="allow")

    input: str | list[str]  # single text, or a batch of texts
    model: str = "code-embed"  # echoed back in the response; not used for routing
    encoding_format: Literal["float", "base64"] = "float"
    dimensions: int = 0  # 0 = full dimensions; >0 truncates each vector
class SparseEmbeddingRequest(BaseModel):
    """Request body for sparse BM25 embeddings."""

    # Tolerate unknown fields from OpenAI-compatible clients.
    model_config = ConfigDict(extra="allow")

    input: str | list[str]  # single text, or a batch of texts
    model: str = "bm25"  # echoed back in the response; not used for routing
class RerankRequest(BaseModel):
    """Request body for cross-encoder reranking of documents against a query."""

    # Tolerate unknown fields from OpenAI-compatible clients.
    model_config = ConfigDict(extra="allow")

    query: str = Field(..., max_length=8192)
    documents: list[str] = Field(..., min_length=1, max_length=256)
    return_documents: bool = False  # include each document's text in its result item
    raw_scores: bool = False  # NOTE(review): accepted but never read by rerank() — confirm intent
    model: str = "code-rerank"  # echoed back in the response; not used for routing
    top_n: int | None = None  # keep only the N highest-scoring results; None = all
class HybridRequest(BaseModel):
    """Request for hybrid search embeddings (dense + sparse in one call)."""

    # Tolerate unknown fields from OpenAI-compatible clients.
    model_config = ConfigDict(extra="allow")

    input: str | list[str]  # single text, or a batch of texts
    dense_model: str = "code-embed"  # label only; actual model is DENSE_MODEL
    sparse_model: str = "bm25"  # label only; actual model is SPARSE_MODEL
| # ==================== Helper Functions ==================== | |
| def _now_ts() -> int: | |
| return int(time.time()) | |
| def _make_id(prefix: str) -> str: | |
| return f"{prefix}-{uuid.uuid4().hex}" | |
| def _normalize_input(input: str | list[str]) -> list[str]: | |
| if isinstance(input, str): | |
| return [input] | |
| return input | |
| def _truncate_embedding(vector: np.ndarray, dimensions: int) -> np.ndarray: | |
| if dimensions > 0 and dimensions < len(vector): | |
| return vector[:dimensions] | |
| return vector | |
| def _vector_to_payload(vector: np.ndarray, encoding_format: str) -> list[float] | str: | |
| if encoding_format == "base64": | |
| import base64 | |
| return base64.b64encode(vector.astype(np.float32).tobytes()).decode() | |
| return vector.tolist() | |
| # ==================== API Endpoints ==================== | |
def health() -> dict[str, str]:
    """Liveness payload naming the three configured model identifiers."""
    models = f"{DENSE_MODEL} + {SPARSE_MODEL} + {RERANKER_MODEL}"
    return {"status": "ok", "models": models}
def embeddings(request: EmbeddingRequest) -> dict[str, Any]:
    """Generate dense embeddings (OpenAI-compatible response shape).

    Uses the module-level DENSE_MODEL (jina-embeddings-v2-small-en); the
    request's `model` field is only echoed back, never used for routing.
    The old docstring named jina-embeddings-v2-base-code, which contradicted
    the DENSE_MODEL constant.

    NOTE(review): no @app.post decorator is visible in this chunk — confirm
    the route is registered elsewhere.
    """
    texts = _normalize_input(request.input)
    model = _get_dense_model()

    # ONNX-optimized inference; the model instance is cached across requests.
    data = []
    for idx, embedding in enumerate(model.embed(texts)):
        vector = _truncate_embedding(embedding, request.dimensions)
        data.append({
            "object": "embedding",
            "embedding": _vector_to_payload(vector, request.encoding_format),
            "index": idx,
        })

    # Whitespace split is an approximation, not a tokenizer round-trip.
    # Fix: total_tokens previously hardcoded to 0 while prompt_tokens was
    # counted; mirror prompt_tokens, consistent with rerank()'s usage block.
    prompt_tokens = sum(len(t.split()) for t in texts)
    return {
        "object": "list",
        "data": data,
        "model": request.model,
        "usage": {"prompt_tokens": prompt_tokens, "total_tokens": prompt_tokens},
        "id": _make_id("emb"),
        "created": _now_ts(),
    }
def sparse_embeddings(request: SparseEmbeddingRequest) -> dict[str, Any]:
    """Generate sparse BM25 embeddings as (indices, values) pairs."""
    texts = _normalize_input(request.input)
    bm25 = _get_sparse_model()

    # Each embedding exposes parallel `indices`/`values` arrays.
    data = [
        {
            "object": "sparse_embedding",
            "indices": emb.indices.tolist(),
            "values": emb.values.tolist(),
            "index": pos,
        }
        for pos, emb in enumerate(bm25.embed(texts))
    ]

    return {
        "object": "list",
        "data": data,
        "model": request.model,
        "id": _make_id("sparse"),
        "created": _now_ts(),
    }
def rerank(request: RerankRequest) -> dict[str, Any]:
    """Score each document against the query with the cross-encoder, sorted best-first."""
    reranker = _get_reranker()
    scores = reranker.rerank(request.query, request.documents)

    results = []
    for position, score in enumerate(scores):
        entry: dict[str, Any] = {"index": position, "relevance_score": float(score)}
        if request.return_documents:
            entry["document"] = request.documents[position]
        results.append(entry)

    # Highest relevance first; each entry keeps its original document index.
    results.sort(key=lambda entry: entry["relevance_score"], reverse=True)
    if request.top_n is not None:
        results = results[: request.top_n]

    return {
        "object": "rerank",
        "results": results,
        "model": request.model,
        "usage": {
            # Whitespace token counts are an approximation.
            "prompt_tokens": len(request.query.split()),
            "total_tokens": sum(len(doc.split()) for doc in request.documents),
        },
        "id": _make_id("rerank"),
        "created": _now_ts(),
    }
def hybrid_embeddings(request: HybridRequest) -> dict[str, Any]:
    """Generate both dense and sparse embeddings for hybrid search."""
    texts = _normalize_input(request.input)
    dense_vectors = list(_get_dense_model().embed(texts))
    sparse_vectors = list(_get_sparse_model().embed(texts))

    data = []
    # Pair dense and sparse results positionally; both follow input order.
    for pos, (dense_vec, sparse_vec) in enumerate(zip(dense_vectors, sparse_vectors)):
        data.append({
            "object": "hybrid_embedding",
            "dense": {
                "vector": dense_vec.tolist(),
                "dim": len(dense_vec),
            },
            "sparse": {
                "indices": sparse_vec.indices.tolist(),
                "values": sparse_vec.values.tolist(),
            },
            "index": pos,
        })

    return {
        "object": "list",
        "data": data,
        "model": f"{request.dense_model} + {request.sparse_model}",
        "id": _make_id("hybrid"),
        "created": _now_ts(),
    }
| # ==================== Model Info ==================== | |
def list_models() -> dict[str, Any]:
    """List supported models and their specs.

    The values here must stay in sync with the DENSE_MODEL / SPARSE_MODEL /
    RERANKER_MODEL constants at the top of the module.
    """
    return {
        "dense": {
            "model": DENSE_MODEL,
            # Fix: jina-embeddings-v2-small-en is 512-dim (~0.12GB), per the
            # module docstring and fastembed's model listing; the previous
            # 768 / 0.64 figures described jina-embeddings-v2-base-code.
            "dim": 512,
            "size_gb": 0.12,
            # NOTE(review): small-en is a general English model, not
            # code-specific — confirm whether this label should change.
            "type": "code-optimized",
        },
        "sparse": {
            "model": SPARSE_MODEL,
            "type": "bm25",
            "size_gb": 0.01,
            "requires_idf": True,
        },
        "reranker": {
            "model": RERANKER_MODEL,
            "size_gb": 0.13,
            "type": "cross-encoder",
        },
    }
if __name__ == "__main__":
    # Dev/standalone entrypoint; port 7860 is the Hugging Face Spaces default.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)