| | import gradio as gr |
| | import chromadb |
| | import os |
| | import tempfile |
| | from langchain.embeddings import HuggingFaceEmbeddings |
| | from langchain.vectorstores import Chroma |
| | from langchain.text_splitter import CharacterTextSplitter |
| | from langchain.document_loaders import PyPDFLoader |
| |
|
def process_pdf(file_binary):
    """Load an uploaded PDF, split it into chunks, and embed the chunks into
    an in-memory ChromaDB collection.

    Parameters
    ----------
    file_binary : bytes | None
        Raw PDF bytes as delivered by the gradio ``File`` component
        (``type="binary"``).

    Returns
    -------
    tuple[str, str]
        ``(status_message, log_text)`` where ``log_text`` is the
        newline-joined processing log shown in the UI. On failure the
        status is the literal string ``"Error"`` and the log carries the
        exception details.
    """
    log = []
    status_message = ""

    if not file_binary:
        return "No file uploaded.", "Error: No file was provided."

    temp_path = None
    try:
        log.append("Starting PDF upload and processing...")

        # PyPDFLoader only accepts a filesystem path, so persist the upload
        # to a temporary file first.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file_binary)
            temp_path = temp_file.name
        log.append(f"Temporary PDF path: {temp_path}")

        try:
            loader = PyPDFLoader(temp_path)
            documents = loader.load()
            log.append(f"Loaded {len(documents)} page(s) from PDF.")
        except Exception as e:
            raise RuntimeError(f"Error loading PDF: {e}")

        try:
            text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            splits = text_splitter.split_documents(documents)
            log.append(f"Text split into {len(splits)} chunk(s).")
        except Exception as e:
            raise RuntimeError(f"Error splitting text: {e}")

        try:
            log.append("Initializing in-memory ChromaDB...")
            chroma_client = chromadb.Client()
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
            # NOTE(review): the vector store built here is not kept anywhere —
            # this ephemeral client dies when the function returns, so a later
            # retrieve_context() call (which builds its own client) will not
            # see this data. A shared module-level client/vectorstore is
            # probably intended — confirm.
            Chroma.from_documents(
                splits,
                embeddings,
                client=chroma_client
            )
            log.append("Successfully stored PDF chunks in ChromaDB.")
        except Exception as e:
            raise RuntimeError(f"Error creating ChromaDB vector store: {e}")

        status_message = "PDF processed and stored in (ephemeral) ChromaDB successfully!"
        log.append(status_message)

    except Exception as e:
        status_message = "Error"
        log.append(f"Exception occurred: {str(e)}")
    finally:
        # Fix: the temp file was created with delete=False and previously was
        # never removed — one leaked file per upload. Clean it up even when
        # processing fails.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass

    return status_message, "\n".join(log)
| |
|
| |
|
def retrieve_context(query):
    """Run a similarity search against an in-memory ChromaDB and return the
    best-matching chunks as a single string.

    Parameters
    ----------
    query : str
        Free-text search query entered in the UI.

    Returns
    -------
    str
        The top matching chunks joined by blank lines, a "no context found"
        notice when the search is empty, or the accumulated error log when
        the lookup raises.
    """
    if not query:
        return "Error: No query provided."

    trace = []
    try:
        trace.append("Retrieving context from in-memory ChromaDB...")

        # NOTE(review): this constructs a *fresh* ephemeral chromadb client,
        # which does not share state with the one created inside
        # process_pdf() — so this search will most likely come back empty.
        # Verify whether a shared module-level client was intended.
        client = chromadb.Client()
        embedder = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        store = Chroma(embedding_function=embedder, client=client)

        matches = store.similarity_search(query, k=3)
        if not matches:
            trace.append("No matching context found in the current in-memory DB.")
            return "No relevant context found. Have you processed a PDF yet?"

        trace.append(f"Found {len(matches)} matching chunk(s).")
        return "\n\n".join(doc.page_content for doc in matches)

    except Exception as e:
        trace.append(f"Error retrieving context: {str(e)}")
        return "\n".join(trace)
| |
|
| |
|
# --- Gradio UI wiring ------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## PDF Context Retriever with ChromaDB (In-Memory)")

    # Upload + processing controls.
    with gr.Row():
        uploaded_pdf = gr.File(label="Upload PDF", type="binary")
        run_process = gr.Button("Process PDF")

    status_box = gr.Textbox(label="Processing Status")
    log_box = gr.Textbox(label="Log Output", interactive=False)

    # Processing emits both a short status and the full log.
    run_process.click(
        fn=process_pdf,
        inputs=uploaded_pdf,
        outputs=[status_box, log_box],
    )

    # Query + retrieval controls.
    question_box = gr.Textbox(label="Enter your query")
    run_retrieve = gr.Button("Retrieve Context")
    answer_box = gr.Textbox(label="Retrieved Context")

    run_retrieve.click(
        fn=retrieve_context,
        inputs=question_box,
        outputs=answer_box,
    )

demo.launch()
| |
|