| | import time |
| |
|
| | import pandas as pd |
| | from langchain_huggingface.embeddings import HuggingFaceEmbeddings |
| |
|
| | from src.db_utils.sql_utils import sql_fetch_batch |
| | from src.db_utils.qdrant_utils import qdrant_insert, qdrant_create_index |
| | from src.data.splitter import Splitter |
| |
|
| |
|
| |
|
| |
|
| | if __name__ == "__main__": |
| | splitter_mode = "recursive" |
| | model_name = "deepvk/USER-bge-m3" |
| | vector_index_name = f"{splitter_mode}_{model_name.split('/')[1]}" |
| |
|
| | |
| | splitter = Splitter(splitter_mode, chunk_size=256, chunk_overlap=64) |
| | emb = HuggingFaceEmbeddings( |
| | model_name=model_name, |
| | encode_kwargs={"normalize_embeddings": True}, |
| | ) |
| |
|
| | |
| | qdrant_create_index( |
| | index_name=vector_index_name, |
| | dim=len(emb.embed_documents(["None"])[0]), |
| | distance="cosine", |
| | ) |
| |
|
| | |
| | batch_size = 16 |
| | offset = 0 |
| | while True: |
| | rows = sql_fetch_batch(batch_size=batch_size, offset=offset) |
| | if not rows: |
| | break |
| | |
| | dfs = [] |
| | for r in rows: |
| | chunks = splitter.split_text(r["content"]) |
| | vectors = emb.embed_documents(chunks) |
| |
|
| | dfs.append(pd.DataFrame({"doc_id": r["ctid"], "text": chunks, "vector": vectors})) |
| | |
| | print(f"{offset} - {offset + batch_size}:", qdrant_insert(pd.concat(dfs), vector_index_name)) |
| | |
| | offset += batch_size |
| |
|
| | time.sleep(0.3) |
| |
|