Spaces:

agnixcode
/

chat_pDF

Sleeping

App Files Files Community

Dua Rajper committed on Feb 26, 2025

Commit

faea989

verified ·

1 Parent(s): 8bd7428

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -16

app.py CHANGED Viewed

@@ -2,9 +2,9 @@ import os
 import logging
 from dotenv import load_dotenv
 import streamlit as st
-from PyPDF2 import PdfReader
 from langchain.text_splitter import CharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceEmbeddings  # ✅ Fixed Import
 from langchain.vectorstores import FAISS
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
@@ -14,32 +14,49 @@ from langchain_groq import ChatGroq
 load_dotenv()
 # Set up logging
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 # Function to extract text from PDF files
 def get_pdf_text(pdf_docs):
     text = ""
     for pdf in pdf_docs:
-        pdf_reader = PdfReader(pdf)
-        for page in pdf_reader.pages:
-            text += page.extract_text() or ""  # Ensure it doesn't break if extract_text() returns None
     return text
-# Function to split extracted text into chunks
 def get_text_chunks(text):
-    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
-    return text_splitter.split_text(text)
-# Function to create a FAISS vectorstore using Hugging Face Embeddings
 def get_vectorstore(text_chunks):
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # ✅ Open-source model
     vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
     return vectorstore
 # Function to set up the conversational retrieval chain
 def get_conversation_chain(vectorstore):
     try:
-        llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.5)  # ✅ Uses GROQ LLaMA model
         memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
         conversation_chain = ConversationalRetrievalChain.from_llm(
@@ -71,21 +88,23 @@ def handle_userinput(user_question):
 # Main function to run the Streamlit app
 def main():
     load_dotenv()
-    st.set_page_config(page_title="Chat with PDFs", page_icon="📄")
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = None
-    st.header("Chat with your PDFs 📄🤖")
     user_question = st.text_input("Ask a question about your documents:")
     if user_question:
         handle_userinput(user_question)
     with st.sidebar:
-        st.subheader("Upload your PDFs")
-        pdf_docs = st.file_uploader("Upload PDFs and click 'Process'", accept_multiple_files=True)
         if st.button("Process"):
             with st.spinner("Processing..."):
                 raw_text = get_pdf_text(pdf_docs)

 import logging
 from dotenv import load_dotenv
 import streamlit as st
+from PyPDF2 import PdfReader
+from PyPDF2.errors import PdfReadError
 from langchain.text_splitter import CharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceInstructEmbeddings
 from langchain.vectorstores import FAISS
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 load_dotenv()
 # Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
 # Function to extract text from PDF files
 def get_pdf_text(pdf_docs):
     """Concatenate the text of every page of every supplied PDF.

     Args:
         pdf_docs: Iterable of PDF sources that ``PdfReader`` accepts.
             ``None`` is treated as "no documents".

     Returns:
         A single string holding all extracted page text, in input order.
         A file that cannot be read is reported with a warning and skipped;
         text from pages extracted before the failure is kept.
     """
     page_texts = []
     for pdf in pdf_docs or []:
         # Resolve the display name up front so that reporting an error
         # can never itself fail on a source without a ``name`` attribute.
         name = getattr(pdf, "name", "<unnamed PDF>")
         try:
             for page in PdfReader(pdf).pages:
                 # Guard against a page yielding None instead of a string,
                 # which would otherwise abort the rest of this file.
                 page_texts.append(page.extract_text() or "")
         except PdfReadError:
             st.warning(f"Could not read {name}. Skipping this file.")
             logging.warning("Could not read %s. Skipping.", name)
         except Exception as e:
             # Best-effort boundary: one bad upload must not stop the others.
             st.warning(f"Error processing {name}: {e}")
             logging.error("Error processing %s: %s", name, e)
     return "".join(page_texts)
+# Function to split the extracted text into chunks
 def get_text_chunks(text):
     """Split ``text`` into overlapping chunks for embedding.

     The splitter breaks on newlines, targets chunks of 1000 characters
     with a 200-character overlap, and measures length with ``len``.

     Args:
         text: The full extracted document text.

     Returns:
         The chunks produced by ``CharacterTextSplitter.split_text``.
     """
     splitter_options = {
         "separator": "\n",
         "chunk_size": 1000,
         "chunk_overlap": 200,
         "length_function": len,
     }
     return CharacterTextSplitter(**splitter_options).split_text(text)
+# Function to create a FAISS vectorstore
 def get_vectorstore(text_chunks):
     """Build a FAISS vectorstore from the given text chunks.

     Args:
         text_chunks: The text chunks to embed and index.

     Returns:
         The store created by ``FAISS.from_texts`` using
         ``HuggingFaceInstructEmbeddings`` with the
         ``hkunlp/instructor-xl`` model.
     """
     instructor = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
     return FAISS.from_texts(texts=text_chunks, embedding=instructor)
 # Function to set up the conversational retrieval chain
 def get_conversation_chain(vectorstore):
     try:
+        groq_api_key = os.getenv("GROQ_API_KEY")
+        llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.5, api_key=groq_api_key)
         memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
         conversation_chain = ConversationalRetrievalChain.from_llm(
 # Main function to run the Streamlit app
 def main():
     load_dotenv()
+    st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
     if "chat_history" not in st.session_state:
         st.session_state.chat_history = None
+    st.header("Chat with multiple PDFs :books:")
     user_question = st.text_input("Ask a question about your documents:")
     if user_question:
         handle_userinput(user_question)
     with st.sidebar:
+        st.subheader("Your documents")
+        pdf_docs = st.file_uploader(
+            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
+        )
         if st.button("Process"):
             with st.spinner("Processing..."):
                 raw_text = get_pdf_text(pdf_docs)