| from pathlib import Path |
| from typing import List |
|
|
| from langchain.schema import Document |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from langchain.vectorstores.chroma import Chroma |
| from langchain_community.document_loaders import TextLoader |
| from langchain_openai import OpenAIEmbeddings |
|
|
| import configs |
|
|
# Module-level embeddings client, created once at import time and used by
# process_documents() as the embedding function for the Chroma store.
embeddings_model = OpenAIEmbeddings()
|
|
|
|
def process_documents(doc_storage_path: str) -> "Chroma":
    """Index every ``*.txt`` file of a directory into the persistent Chroma store.

    Each matching file is loaded with ``TextLoader``, split into chunks with a
    ``RecursiveCharacterTextSplitter`` configured from ``configs.CHUNK_SIZE``
    and ``configs.CHUNK_OVERLAP``, and added to the Chroma store whose
    persistence directory is ``configs.STORE_FILE``.

    Args:
        doc_storage_path: Directory scanned (non-recursively) for ``.txt`` files.

    Returns:
        The Chroma vector store the chunks were added to. When no file
        matches, the store is returned without any additions.
    """
    print("doc preprocessing...")
    doc_directory = Path(doc_storage_path)
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=configs.CHUNK_SIZE, chunk_overlap=configs.CHUNK_OVERLAP
    )
    doc_search = Chroma(
        persist_directory=configs.STORE_FILE, embedding_function=embeddings_model
    )
    for file_path in doc_directory.glob("*.txt"):
        documents = TextLoader(str(file_path)).load()
        chunks = text_splitter.split_documents(documents)
        if not chunks:
            # Nothing to embed for this file; avoid handing the store an
            # empty batch.
            continue
        # Add to the store opened above. ``from_documents`` is a classmethod,
        # so calling it here would build a brand-new store object per file
        # instead of extending this one.
        doc_search.add_documents(chunks)
    # Persist once, after all files have been added.
    doc_search.persist()
    print("doc preprocessing end.")
    return doc_search
|
|
|
|
def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with the contents in input order, delimited by
        two newline characters; an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    contents = []
    for doc in docs:
        contents.append(doc.page_content)
    return separator.join(contents)
|
|