| from typing import Optional |
|
|
| from core.rag.datasource.keyword.keyword_factory import Keyword |
| from core.rag.datasource.vdb.vector_factory import Vector |
| from core.rag.models.document import Document |
| from models.dataset import Dataset, DocumentSegment |
|
|
|
|
class VectorService:
    """Write document segments into a dataset's vector and keyword indexes."""

    @staticmethod
    def _build_document(segment: DocumentSegment) -> Document:
        """Wrap one segment in the ``Document`` shape passed to the indexes.

        The segment's ``index_node_id`` is stored as ``doc_id``; that same id
        is what ``update_segment_vector`` later passes to ``delete_by_ids``.
        """
        return Document(
            page_content=segment.content,
            metadata={
                "doc_id": segment.index_node_id,
                "doc_hash": segment.index_node_hash,
                "document_id": segment.document_id,
                "dataset_id": segment.dataset_id,
            },
        )

    @classmethod
    def create_segments_vector(
        cls, keywords_list: Optional[list[list[str]]], segments: list[DocumentSegment], dataset: Dataset
    ):
        """Index a batch of new segments for ``dataset``.

        Args:
            keywords_list: Optional keyword lists forwarded to the keyword
                index. Presumably one list per segment, in the same order as
                ``segments`` -- confirm against ``Keyword.add_texts``.
                ``None`` or an empty list means no keywords are forwarded.
            segments: Segments to index.
            dataset: Dataset that owns the indexes; its ``indexing_technique``
                decides whether the vector index is written.
        """
        documents = [cls._build_document(segment) for segment in segments]

        # The vector index is only written for "high_quality" datasets.
        if dataset.indexing_technique == "high_quality":
            vector = Vector(dataset=dataset)
            vector.add_texts(documents, duplicate_check=True)

        # NOTE(review): the keyword index is written for every dataset,
        # regardless of indexing technique -- confirm this nesting is intended.
        keyword = Keyword(dataset)
        if keywords_list:
            keyword.add_texts(documents, keywords_list=keywords_list)
        else:
            keyword.add_texts(documents)

    @classmethod
    def update_segment_vector(cls, keywords: Optional[list[str]], segment: DocumentSegment, dataset: Dataset):
        """Re-index a single segment by deleting its old entries and adding new ones.

        Args:
            keywords: Optional keywords for the segment, forwarded to the
                keyword index. ``None`` or an empty list means no keywords
                are forwarded.
            segment: Segment whose index entries are replaced.
            dataset: Dataset that owns the indexes; its ``indexing_technique``
                decides whether the vector index is touched.
        """
        document = cls._build_document(segment)
        node_ids = [segment.index_node_id]

        # Replace the vector entry: delete by node id first, then add again.
        if dataset.indexing_technique == "high_quality":
            vector = Vector(dataset=dataset)
            vector.delete_by_ids(node_ids)
            vector.add_texts([document], duplicate_check=True)

        # NOTE(review): the keyword index is refreshed for every dataset,
        # regardless of indexing technique -- confirm this nesting is intended.
        keyword = Keyword(dataset)
        keyword.delete_by_ids(node_ids)

        if keywords:
            # The keyword index takes one keyword list per document.
            keyword.add_texts([document], keywords_list=[keywords])
        else:
            keyword.add_texts([document])
|
|