| | import gradio as gr |
| | import chromadb |
| | import os |
| | import tempfile |
| | from langchain.embeddings import HuggingFaceEmbeddings |
| | from langchain.vectorstores import Chroma |
| | from langchain.text_splitter import CharacterTextSplitter |
| | from langchain.document_loaders import PyPDFLoader |
| |
|
def process_pdf(file_binary):
    """Load an uploaded PDF, split it into chunks, and embed the chunks into
    an in-memory ChromaDB collection.

    Parameters
    ----------
    file_binary : bytes | None
        Raw PDF bytes as delivered by the gradio ``File`` component
        (``type="binary"``).

    Returns
    -------
    tuple[str, str]
        ``(status_message, log_text)`` where ``log_text`` is the
        newline-joined processing log shown in the UI. On failure the
        status is the literal string ``"Error"`` and the log carries the
        exception details.
    """
    log = []
    status_message = ""

    if not file_binary:
        return "No file uploaded.", "Error: No file was provided."

    temp_path = None
    try:
        log.append("Starting PDF upload and processing...")

        # PyPDFLoader only accepts a filesystem path, so persist the upload
        # to a temporary file first.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file_binary)
            temp_path = temp_file.name
        log.append(f"Temporary PDF path: {temp_path}")

        try:
            loader = PyPDFLoader(temp_path)
            documents = loader.load()
            log.append(f"Loaded {len(documents)} page(s) from PDF.")
        except Exception as e:
            raise RuntimeError(f"Error loading PDF: {e}")

        try:
            text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            splits = text_splitter.split_documents(documents)
            log.append(f"Text split into {len(splits)} chunk(s).")
        except Exception as e:
            raise RuntimeError(f"Error splitting text: {e}")

        try:
            log.append("Initializing in-memory ChromaDB...")
            chroma_client = chromadb.Client()
            embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2"
            )
            # NOTE(review): the vector store built here is not kept anywhere —
            # this ephemeral client dies when the function returns, so a later
            # retrieve_context() call (which builds its own client) will not
            # see this data. A shared module-level client/vectorstore is
            # probably intended — confirm.
            Chroma.from_documents(
                splits,
                embeddings,
                client=chroma_client
            )
            log.append("Successfully stored PDF chunks in ChromaDB.")
        except Exception as e:
            raise RuntimeError(f"Error creating ChromaDB vector store: {e}")

        status_message = "PDF processed and stored in (ephemeral) ChromaDB successfully!"
        log.append(status_message)

    except Exception as e:
        status_message = "Error"
        log.append(f"Exception occurred: {str(e)}")
    finally:
        # Fix: the temp file was created with delete=False and previously was
        # never removed — one leaked file per upload. Clean it up even when
        # processing fails.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass

    return status_message, "\n".join(log)
| |
|
| |
|
def retrieve_context(query):
    """Run a similarity search against an in-memory ChromaDB and return the
    best-matching chunks as a single string.

    Parameters
    ----------
    query : str
        Free-text search query entered in the UI.

    Returns
    -------
    str
        The top matching chunks joined by blank lines, a "no context found"
        notice when the search is empty, or the accumulated error log when
        the lookup raises.
    """
    if not query:
        return "Error: No query provided."

    trace = []
    try:
        trace.append("Retrieving context from in-memory ChromaDB...")

        # NOTE(review): this constructs a *fresh* ephemeral chromadb client,
        # which does not share state with the one created inside
        # process_pdf() — so this search will most likely come back empty.
        # Verify whether a shared module-level client was intended.
        client = chromadb.Client()
        embedder = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        store = Chroma(embedding_function=embedder, client=client)

        matches = store.similarity_search(query, k=3)
        if not matches:
            trace.append("No matching context found in the current in-memory DB.")
            return "No relevant context found. Have you processed a PDF yet?"

        trace.append(f"Found {len(matches)} matching chunk(s).")
        return "\n\n".join(doc.page_content for doc in matches)

    except Exception as e:
        trace.append(f"Error retrieving context: {str(e)}")
        return "\n".join(trace)
| |
|
| |
|
# --- Gradio UI wiring ------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## PDF Context Retriever with ChromaDB (In-Memory)")

    # Upload + processing controls.
    with gr.Row():
        uploaded_pdf = gr.File(label="Upload PDF", type="binary")
        run_process = gr.Button("Process PDF")

    status_box = gr.Textbox(label="Processing Status")
    log_box = gr.Textbox(label="Log Output", interactive=False)

    # Processing emits both a short status and the full log.
    run_process.click(
        fn=process_pdf,
        inputs=uploaded_pdf,
        outputs=[status_box, log_box],
    )

    # Query + retrieval controls.
    question_box = gr.Textbox(label="Enter your query")
    run_retrieve = gr.Button("Retrieve Context")
    answer_box = gr.Textbox(label="Retrieved Context")

    run_retrieve.click(
        fn=retrieve_context,
        inputs=question_box,
        outputs=answer_box,
    )

demo.launch()
| |
|