Spaces:
Sleeping
Sleeping
fix(embedder): ensure unique indices in sparse vectors for Qdrant
Changed file: models/embedder.py (+11 −5)
|
@@ -64,18 +64,24 @@ class TextEmbedder:
Before:

 64      dense_vec = output['dense_vecs'].tolist()
 65      lexical_weights: Dict[str, float] = output['lexical_weights']
 66
 67 -    # 2. Sparse Vector Transformation
 68 -    [removed line — content lost in page extraction]
 69 -    [removed line — content lost in page extraction]
 70
 71      # Convert text tokens into unique IDs (integers) using the BGE-M3 tokenizer
 72      for token_str, weight in lexical_weights.items():
 73          # Get the ID of the string token through the tokenizer (vocab index)
 74          token_id = self.model.tokenizer.convert_tokens_to_ids(token_str)
 75          if token_id is not None:
 76 -            [removed line — content lost in page extraction]
 77 -            [removed line — content lost in page extraction]
 78
 79      return EmbedderResult(
 80          dense_vector=dense_vec,
 81          sparse_indices=sparse_indices,
|
After:

 64      dense_vec = output['dense_vecs'].tolist()
 65      lexical_weights: Dict[str, float] = output['lexical_weights']
 66
 67 +    # 2. Sparse Vector Transformation and Duplicate Index Removal
 68 +    # Prevent duplicates by storing in the format {token_id: max_weight}
 69 +    unique_sparse_data = {}
 70
 71      # Convert text tokens into unique IDs (integers) using the BGE-M3 tokenizer
 72      for token_str, weight in lexical_weights.items():
 73          # Get the ID of the string token through the tokenizer (vocab index)
 74          token_id = self.model.tokenizer.convert_tokens_to_ids(token_str)
 75 +        # If the same token_id appears, maintain the higher weight
 76          if token_id is not None:
 77 +            unique_sparse_data[token_id] = max(
 78 +                unique_sparse_data.get(token_id, 0.0),
 79 +                float(weight)
 80 +            )
 81
 82 +    sparse_indices = list(unique_sparse_data.keys())
 83 +    sparse_values = list(unique_sparse_data.values())
 84 +
 85      return EmbedderResult(
 86          dense_vector=dense_vec,
 87          sparse_indices=sparse_indices,