m97j committed on
Commit
ee15d5b
·
1 Parent(s): 1487b7f

fix(embedder): ensure unique indices in sparse vectors for Qdrant

Browse files
Files changed (1) hide show
  1. models/embedder.py +11 -5
models/embedder.py CHANGED
@@ -64,18 +64,24 @@ class TextEmbedder:
64
  dense_vec = output['dense_vecs'].tolist()
65
  lexical_weights: Dict[str, float] = output['lexical_weights']
66
 
67
- # 2. Sparse Vector Transformation (Qdrant specifications: token_id array, weight array)
68
- sparse_indices = []
69
- sparse_values = []
70
 
71
  # Convert text tokens into unique IDs (integers) using the BGE-M3 tokenizer
72
  for token_str, weight in lexical_weights.items():
73
  # Get the ID of the string token through the tokenizer (vocab index)
74
  token_id = self.model.tokenizer.convert_tokens_to_ids(token_str)
 
75
  if token_id is not None:
76
- sparse_indices.append(token_id)
77
- sparse_values.append(float(weight))
 
 
78
 
 
 
 
79
  return EmbedderResult(
80
  dense_vector=dense_vec,
81
  sparse_indices=sparse_indices,
 
64
  dense_vec = output['dense_vecs'].tolist()
65
  lexical_weights: Dict[str, float] = output['lexical_weights']
66
 
67
+ # 2. Sparse Vector Transformation and Duplicate Index Removal
68
+ # Prevent duplicates by storing in the format {token_id: max_weight}
69
+ unique_sparse_data = {}
70
 
71
  # Convert text tokens into unique IDs (integers) using the BGE-M3 tokenizer
72
  for token_str, weight in lexical_weights.items():
73
  # Get the ID of the string token through the tokenizer (vocab index)
74
  token_id = self.model.tokenizer.convert_tokens_to_ids(token_str)
75
+ # If the same token_id appears, maintain the higher weight
76
  if token_id is not None:
77
+ unique_sparse_data[token_id] = max(
78
+ unique_sparse_data.get(token_id, 0.0),
79
+ float(weight)
80
+ )
81
 
82
+ sparse_indices = list(unique_sparse_data.keys())
83
+ sparse_values = list(unique_sparse_data.values())
84
+
85
  return EmbedderResult(
86
  dense_vector=dense_vec,
87
  sparse_indices=sparse_indices,