| import os |
|
|
| from langchain import FAISS, OpenAI, HuggingFaceHub, Cohere, PromptTemplate |
| from langchain.chains import RetrievalQA, ConversationalRetrievalChain |
| from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings, CohereEmbeddings |
| from langchain.memory import ConversationBufferMemory |
| from langchain.schema import Document |
| from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, NLTKTextSplitter, \ |
| SpacyTextSplitter |
| from langchain.vectorstores import Chroma, ElasticVectorSearch |
| from pypdf import PdfReader |
|
|
| from schema import EmbeddingTypes, IndexerType, TransformType, BotType |
|
|
|
|
| class QnASystem: |
|
|
| def read_and_load_pdf(self, f_data): |
| pdf_data = PdfReader(f_data) |
| documents = [] |
| for idx, page in enumerate(pdf_data.pages): |
| documents.append(Document(page_content=page.extract_text(), |
| metadata={"page_no": idx, "source": f_data.name})) |
|
|
| self.documents = documents |
|
|
| def document_transformer(self, transform_type: TransformType): |
| match transform_type: |
| case TransformType.CharacterTransform: |
| t_type = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20) |
| case TransformType.RecursiveTransform: |
| t_type = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20) |
| case TransformType.NLTKTransform: |
| t_type = NLTKTextSplitter() |
| case TransformType.SpacyTransform: |
| t_type = SpacyTextSplitter() |
|
|
| case _: |
| raise IndexError("Invalid Transformer Type") |
|
|
| self.transformed_documents = t_type.split_documents(documents=self.documents) |
|
|
| def generate_embeddings(self, embedding_type: EmbeddingTypes = EmbeddingTypes.OPENAI, |
| indexer_type: IndexerType = IndexerType.FAISS, **kwargs): |
| temperature = kwargs.get("temperature", 0) |
| max_tokens = kwargs.get("max_tokens", 512) |
| match embedding_type: |
| case EmbeddingTypes.OPENAI: |
| os.environ["OPENAI_API_KEY"] = kwargs.get("api_key") or os.getenv("OPENAI_API_KEY") |
| embeddings = OpenAIEmbeddings() |
| llm = OpenAI(temperature=temperature, max_tokens=max_tokens) |
| case EmbeddingTypes.HUGGING_FACE: |
| embeddings = HuggingFaceEmbeddings(model_name=kwargs.get("model_name")) |
| llm = HuggingFaceHub(repo_id=kwargs.get("model_name"), |
| model_kwargs={"temperature": temperature, "max_tokens": max_tokens}) |
| case EmbeddingTypes.COHERE: |
| embeddings = CohereEmbeddings(model=kwargs.get("model_name"), cohere_api_key=kwargs.get("api_key")) |
| llm = Cohere(model=kwargs.get("model_name"), cohere_api_key=kwargs.get("api_key"), |
| model_kwargs={"temperature": temperature, |
| "max_tokens": max_tokens}) |
| case _: |
| raise IndexError("Invalid Embedding Type") |
|
|
| match indexer_type: |
| case IndexerType.FAISS: |
| indexer = FAISS |
| case IndexerType.CHROMA: |
| indexer = Chroma() |
|
|
| case IndexerType.ELASTICSEARCH: |
| indexer = ElasticVectorSearch(elasticsearch_url=kwargs.get("elasticsearch_url")) |
| case _: |
| raise IndexError("Invalid Indexer Function") |
|
|
| self.llm = llm |
| self.indexer = indexer |
| self.vector_store = indexer.from_documents(documents=self.transformed_documents, embedding=embeddings) |
|
|
| def get_retriever(self, search_type="similarity", top_k=5, **kwargs): |
| retriever = self.vector_store.as_retriever(search_type=search_type, search_kwargs={"k": top_k}) |
| self.retriever = retriever |
|
|
| def get_prompt(self, bot_type: BotType, **kwargs): |
| match bot_type: |
| case BotType.qna: |
| prompt = """ |
| You are a smart and helpful AI assistant, who answer the question given context |
| {context} |
| Question: {question} |
| """ |
| case BotType.conversational: |
| prompt = """ |
| Given the following conversation and a follow up question, |
| rephrase the follow up question to be a standalone question, in its original language. |
| \nChat History:\n{chat_history}\nFollow Up Input: {question}\nStandalone question: |
| """ |
| return PromptTemplate(input_variables=["context", "question", "chat_history"], template=prompt) |
|
|
| def build_qa(self, qa_type: BotType, chain_type="stuff", |
| return_documents: bool = True, **kwargs): |
| match qa_type: |
| case BotType.qna: |
| self.chain = RetrievalQA.from_chain_type(llm=self.llm, retriever=self.retriever, chain_type=chain_type, |
| return_source_documents=return_documents, verbose=True) |
|
|
| case BotType.conversational: |
| self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, |
| output_key="answer") |
| self.chain = ConversationalRetrievalChain.from_llm(llm=self.llm, retriever=self.retriever, |
| chain_type=chain_type, |
| return_source_documents=return_documents, |
| memory=self.memory, verbose=True) |
|
|
| case _: |
| raise IndexError("Invalid QA Type") |
|
|
| def ask_question(self, query): |
| if type(self.chain) == RetrievalQA: |
| data = {"query": query} |
| else: |
| data = {"question": query} |
| return self.chain(data) |
|
|
| def build_chain(self, transform_type, embedding_type, indexer_type, **kwargs): |
| if hasattr(self, "llm"): |
| return self.chain |
| self.document_transformer(transform_type) |
| self.generate_embeddings(embedding_type=embedding_type, |
| indexer_type=indexer_type, **kwargs) |
| self.get_retriever(**kwargs) |
| qa = self.build_qa(qa_type=kwargs.get("bot_type"), **kwargs) |
| return qa |
|
|
|
|
if __name__ == "__main__":
    # Build the pipeline over a local PDF and run a short two-turn conversation.
    system = QnASystem()
    with open("../docs/Doc A.pdf", "rb") as pdf_file:
        system.read_and_load_pdf(pdf_file)
    qa_chain = system.build_chain(
        transform_type=TransformType.RecursiveTransform,
        embedding_type=EmbeddingTypes.OPENAI,
        indexer_type=IndexerType.FAISS,
        chain_type="map_reduce",
        bot_type=BotType.conversational,
        return_documents=True,
    )
    # First turn primes the conversation memory; only the follow-up is printed.
    system.ask_question(query="Hi! Summarize the document.")
    answer = system.ask_question(query="What happened from June 1984 to September 1996")
    print(answer)
|
|