PDF-Extractor / app.py
mfraz's picture
Update app.py
3cfed0b verified
Raw
History Blame Contribute Delete
2.55 kB
import os
import streamlit as st
from groq import Groq
from PyPDF2 import PdfReader
from docx import Document
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# Initialize the Groq API client. The key is read from the "Groq_Api"
# environment variable; os.environ.get() yields None when it is unset.
client = Groq(api_key=os.environ.get("Groq_Api"))

# Page title with a book icon (the emoji had been mis-decoded into mojibake).
st.title("📖 A&Q From a File")

# File upload widget, restricted to PDF and DOCX files.
uploaded_file = st.file_uploader("Upload a PDF or DOCX file", type=["pdf", "docx"])
if uploaded_file:
    st.write(f"**File Name:** {uploaded_file.name}")  # Display file name
# Extract Text
def extract_text(file):
    """Return the plain text of an uploaded PDF or DOCX file.

    Args:
        file: A file-like object exposing a ``name`` attribute (in this
            script, the object returned by ``st.file_uploader``).

    Returns:
        The extracted text joined with newlines, or ``""`` when the file
        extension is neither ``.pdf`` nor ``.docx`` or no text was found.
    """
    # Compare case-insensitively so names such as "REPORT.PDF" are handled.
    filename = file.name.lower()
    if filename.endswith(".pdf"):
        reader = PdfReader(file)
        # Extract each page exactly once; pages yielding no text are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(text for text in page_texts if text)
    if filename.endswith(".docx"):
        doc = Document(file)
        return "\n".join(para.text for para in doc.paragraphs)
    return ""
@st.cache_resource
def _load_embedding_model():
    """Load the sentence-embedding model once and reuse it across reruns."""
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


# Retrieval + answer pipeline: runs only once a file has been uploaded.
if uploaded_file:
    file_text = extract_text(uploaded_file)

    # Treat whitespace-only output (e.g. a DOCX of empty paragraphs) as
    # "no text" instead of reporting a successful extraction.
    if file_text.strip():
        st.success("File uploaded and text extracted successfully!")
        st.write("Ask a question about the file:")
        query = st.text_input("Enter your question")

        if query:
            # Streamlit re-executes the script on every interaction, so the
            # model is fetched through a cached loader rather than rebuilt.
            model = _load_embedding_model()

            # Chunk & embed text: fixed-size, non-overlapping character windows.
            chunk_size = 512
            chunks = [
                file_text[start:start + chunk_size]
                for start in range(0, len(file_text), chunk_size)
            ]
            embeddings = model.encode(chunks, convert_to_numpy=True)

            # Build a FAISS L2 index over the chunk embeddings.
            index = faiss.IndexFlatL2(embeddings.shape[1])
            index.add(embeddings)

            # Embed the query and retrieve up to 3 nearest chunks. Capping k
            # at the number of chunks avoids FAISS padding the result with
            # -1 labels, which would otherwise index chunks[-1] and silently
            # duplicate the last chunk for short documents.
            query_embedding = model.encode([query], convert_to_numpy=True)
            top_k = min(3, len(chunks))
            _, retrieved_idx = index.search(query_embedding, k=top_k)

            # Defensive filter: skip any "no result" (-1) labels.
            relevant_text = " ".join(chunks[i] for i in retrieved_idx[0] if i >= 0)

            # Query the Groq API with the relevant chunks only.
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "user", "content": f"Answer based on this document: {query}\n\n{relevant_text}"},
                ],
                model="llama-3.3-70b-versatile",
            )

            # Display the answer.
            answer = chat_completion.choices[0].message.content
            st.subheader("Answer:")
            st.write(answer)
    else:
        st.error("Failed to extract text from the file. Please check the format.")