Dua Rajper committed on
Commit
faea989
·
verified ·
1 Parent(s): 8bd7428

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -16
app.py CHANGED
@@ -2,9 +2,9 @@ import os
2
  import logging
3
  from dotenv import load_dotenv
4
  import streamlit as st
5
- from PyPDF2 import PdfReader
6
  from langchain.text_splitter import CharacterTextSplitter
7
- from langchain_community.embeddings import HuggingFaceEmbeddings # ✅ Fixed Import
8
  from langchain.vectorstores import FAISS
9
  from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
@@ -14,32 +14,49 @@ from langchain_groq import ChatGroq
14
  load_dotenv()
15
 
16
  # Set up logging
17
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
 
 
18
 
19
  # Function to extract text from PDF files
20
  def get_pdf_text(pdf_docs):
21
  text = ""
22
  for pdf in pdf_docs:
23
- pdf_reader = PdfReader(pdf)
24
- for page in pdf_reader.pages:
25
- text += page.extract_text() or "" # Ensure it doesn't break if extract_text() returns None
 
 
 
 
 
 
 
26
  return text
27
 
28
- # Function to split extracted text into chunks
29
  def get_text_chunks(text):
30
- text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
31
- return text_splitter.split_text(text)
 
 
 
 
 
 
32
 
33
- # Function to create a FAISS vectorstore using Hugging Face Embeddings
34
  def get_vectorstore(text_chunks):
35
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") # ✅ Open-source model
36
  vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
37
  return vectorstore
38
 
39
  # Function to set up the conversational retrieval chain
40
  def get_conversation_chain(vectorstore):
41
  try:
42
- llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.5) # ✅ Uses GROQ LLaMA model
 
43
  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
44
 
45
  conversation_chain = ConversationalRetrievalChain.from_llm(
@@ -71,21 +88,23 @@ def handle_userinput(user_question):
71
  # Main function to run the Streamlit app
72
  def main():
73
  load_dotenv()
74
- st.set_page_config(page_title="Chat with PDFs", page_icon="📄")
75
 
76
  if "conversation" not in st.session_state:
77
  st.session_state.conversation = None
78
  if "chat_history" not in st.session_state:
79
  st.session_state.chat_history = None
80
 
81
- st.header("Chat with your PDFs 📄🤖")
82
  user_question = st.text_input("Ask a question about your documents:")
83
  if user_question:
84
  handle_userinput(user_question)
85
 
86
  with st.sidebar:
87
- st.subheader("Upload your PDFs")
88
- pdf_docs = st.file_uploader("Upload PDFs and click 'Process'", accept_multiple_files=True)
 
 
89
  if st.button("Process"):
90
  with st.spinner("Processing..."):
91
  raw_text = get_pdf_text(pdf_docs)
 
2
  import logging
3
  from dotenv import load_dotenv
4
  import streamlit as st
5
  from PyPDF2 import PdfReader
  from PyPDF2.errors import PdfReadError
6
  from langchain.text_splitter import CharacterTextSplitter
7
+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
8
  from langchain.vectorstores import FAISS
9
  from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
 
14
  load_dotenv()
15
 
16
  # Set up logging
17
+ logging.basicConfig(
18
+ level=logging.INFO,
19
+ format='%(asctime)s - %(levelname)s - %(message)s'
20
+ )
21
 
22
  # Function to extract text from PDF files
23
  def get_pdf_text(pdf_docs):
24
  text = ""
25
  for pdf in pdf_docs:
26
+ try:
27
+ pdf_reader = PdfReader(pdf)
28
+ for page in pdf_reader.pages:
29
+ text += page.extract_text()
30
+ except PdfReadError:
31
+ st.warning(f"Could not read {pdf.name}. Skipping this file.")
32
+ logging.warning(f"Could not read {pdf.name}. Skipping.")
33
+ except Exception as e:
34
+ st.warning(f"Error processing {pdf.name}: {e}")
35
+ logging.error(f"Error processing {pdf.name}: {e}")
36
  return text
37
 
38
+ # Function to split the extracted text into chunks
39
  def get_text_chunks(text):
40
+ text_splitter = CharacterTextSplitter(
41
+ separator="\n",
42
+ chunk_size=1000,
43
+ chunk_overlap=200,
44
+ length_function=len
45
+ )
46
+ chunks = text_splitter.split_text(text)
47
+ return chunks
48
 
49
+ # Function to create a FAISS vectorstore
50
  def get_vectorstore(text_chunks):
51
+ embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
52
  vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
53
  return vectorstore
54
 
55
  # Function to set up the conversational retrieval chain
56
  def get_conversation_chain(vectorstore):
57
  try:
58
+ groq_api_key = os.getenv("GROQ_API_KEY")
59
+ llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.5, api_key=groq_api_key)
60
  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
61
 
62
  conversation_chain = ConversationalRetrievalChain.from_llm(
 
88
  # Main function to run the Streamlit app
89
  def main():
90
  load_dotenv()
91
+ st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
92
 
93
  if "conversation" not in st.session_state:
94
  st.session_state.conversation = None
95
  if "chat_history" not in st.session_state:
96
  st.session_state.chat_history = None
97
 
98
+ st.header("Chat with multiple PDFs :books:")
99
  user_question = st.text_input("Ask a question about your documents:")
100
  if user_question:
101
  handle_userinput(user_question)
102
 
103
  with st.sidebar:
104
+ st.subheader("Your documents")
105
+ pdf_docs = st.file_uploader(
106
+ "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
107
+ )
108
  if st.button("Process"):
109
  with st.spinner("Processing..."):
110
  raw_text = get_pdf_text(pdf_docs)