Spaces:

prithvi1029
/

agentic-document-intelligence

Sleeping

App Files Files Community

agentic-document-intelligence / app.py

prithvi1029

Update app.py

e10610a verified 2 months ago

raw

history blame contribute delete

5.55 kB

	import os
	import re
	import numpy as np
	import faiss
	import gradio as gr

	from pypdf import PdfReader
	from sentence_transformers import SentenceTransformer
	from openai import OpenAI

	# -----------------------------
	# Stability
	# -----------------------------
	# Default the tokenizers parallelism flag to "false" unless the environment
	# already provides a value.
	os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")


	# -----------------------------
	# Config
	# -----------------------------
	def _env(name: str, default: str = "") -> str:
	    """Return environment variable *name* stripped of surrounding whitespace.

	    Falls back to *default* when the variable is unset, empty, or only
	    whitespace, so a blank setting never overrides a working default.
	    """
	    return (os.getenv(name) or "").strip() or default


	TOGETHER_API_KEY = _env("TOGETHER_API_KEY")
	TOGETHER_BASE_URL = _env("TOGETHER_BASE_URL", "https://api.together.xyz/v1")
	TOGETHER_MODEL = _env("TOGETHER_MODEL", "mistralai/Mixtral-8x7B-Instruct-v0.1")

	EMBED_MODEL_NAME = _env("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
	# A blank TOP_K falls back to 4; a non-numeric value still raises ValueError
	# here so a typo fails loudly at startup instead of at query time.
	TOP_K = int(_env("TOP_K", "4"))

	# Load embedder once
	embedder = SentenceTransformer(EMBED_MODEL_NAME)


	# -----------------------------
	# Helpers
	# -----------------------------
	def clean_text(s: str) -> str:
	    """Collapse every whitespace run in *s* to one space and trim both ends."""
	    collapsed = re.sub(r"\s+", " ", s)
	    return collapsed.strip()


	def chunk_text(text: str, chunk_size=900, overlap=150):
	    """Split *text* into overlapping character windows and clean each one.

	    Windows are ``chunk_size`` characters long and each new window starts
	    ``chunk_size - overlap`` characters after the previous one. Every window
	    is passed through ``clean_text``; windows whose cleaned length is 30
	    characters or fewer are dropped.

	    Args:
	        text: The text to split.
	        chunk_size: Window length in characters; must be positive.
	        overlap: Characters shared between consecutive windows; must be
	            smaller than ``chunk_size``.

	    Returns:
	        A list of cleaned chunk strings (possibly empty).

	    Raises:
	        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
	            smaller than ``chunk_size``. Such values would keep the window
	            from ever advancing.
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be a positive integer")
	    if overlap >= chunk_size:
	        raise ValueError("overlap must be smaller than chunk_size")

	    # With the checks above the stride is always >= 1, so the loop terminates.
	    stride = chunk_size - overlap
	    total = len(text)
	    windows = []
	    for begin in range(0, total, stride):
	        stop = min(total, begin + chunk_size)
	        windows.append(text[begin:stop])
	        if stop == total:
	            # The window reached the end of the text; a further window
	            # would only repeat the tail.
	            break

	    cleaned = (clean_text(window) for window in windows)
	    return [chunk for chunk in cleaned if len(chunk) > 30]


	def pdf_to_text(pdf_path: str) -> str:
	    """Return the text of the PDF at *pdf_path*, one page per line group.

	    Each page's extracted text is used as-is; pages whose extraction yields
	    nothing but whitespace (or nothing at all) are skipped. The remaining
	    page texts are joined with a single newline.
	    """
	    document = PdfReader(pdf_path)
	    page_texts = (page.extract_text() or "" for page in document.pages)
	    return "\n".join(body for body in page_texts if body.strip())


	def build_faiss_index(chunks):
	    """Embed *chunks* with the module-level embedder and index the vectors.

	    Returns a ``faiss.IndexFlatIP`` holding one float32 vector per chunk, in
	    the same order as *chunks*.
	    """
	    embeddings = embedder.encode(
	        chunks,
	        convert_to_numpy=True,
	        normalize_embeddings=True,
	    )
	    # Embeddings are requested normalized, so inner product acts as cosine
	    # similarity.
	    flat_index = faiss.IndexFlatIP(embeddings.shape[1])
	    flat_index.add(embeddings.astype(np.float32))
	    return flat_index


	def retrieve(query, index, chunks, k=TOP_K):
	    """Return up to *k* ``(score, chunk_text)`` pairs for *query*.

	    The query is embedded with the module-level embedder (normalized,
	    float32) and searched against *index*. Result ids equal to -1 are
	    skipped; every other id is used to look up the chunk text in *chunks*.
	    Pairs are returned in the order the index reports them.
	    """
	    query_vec = embedder.encode(
	        [query],
	        convert_to_numpy=True,
	        normalize_embeddings=True,
	    ).astype(np.float32)
	    scores, ids = index.search(query_vec, k)
	    # Only one query was submitted, so row 0 holds all of its results.
	    return [
	        (float(similarity), chunks[int(position)])
	        for similarity, position in zip(scores[0], ids[0])
	        if position != -1
	    ]


	def llm_generate(prompt: str) -> str:
	    """Send *prompt* to the configured chat model and return its reply text.

	    Never raises for API problems: when no API key is configured, or when
	    the request fails for any reason, a human-readable message starting
	    with "❌" is returned instead of the model's answer.
	    """
	    if not TOGETHER_API_KEY:
	        missing_key_help = (
	            "❌ TOGETHER_API_KEY not found.\n\n"
	            "Go to Space → Settings → Variables and secrets → New secret:\n"
	            "Name: TOGETHER_API_KEY\n"
	            "Value: your Together key\n"
	            "Then restart the Space."
	        )
	        return missing_key_help

	    chat_messages = [
	        {"role": "system", "content": "You are a helpful assistant. Follow instructions carefully."},
	        {"role": "user", "content": prompt},
	    ]
	    client = OpenAI(api_key=TOGETHER_API_KEY, base_url=TOGETHER_BASE_URL)

	    try:
	        completion = client.chat.completions.create(
	            model=TOGETHER_MODEL,
	            messages=chat_messages,
	            temperature=0.2,
	            top_p=0.9,
	            max_tokens=450,
	        )
	        reply = completion.choices[0].message.content
	        return (reply or "").strip()
	    except Exception as err:
	        # UI boundary: surface the failure as text rather than crash the handler.
	        failure_report = (
	            "❌ LLM call failed.\n\n"
	            f"Base URL: {TOGETHER_BASE_URL}\n"
	            f"Model: {TOGETHER_MODEL}\n"
	            f"Error: {type(err).__name__}: {err}"
	        )
	        return failure_report


	# -----------------------------
	# Space logic
	# -----------------------------
	def index_pdf(pdf_file):
	    """Build a retrieval index for an uploaded PDF.

	    Args:
	        pdf_file: Path of the uploaded PDF, or ``None`` when nothing is
	            uploaded.

	    Returns:
	        A ``(index, chunks, status)`` tuple. On success ``index`` is the
	        FAISS index, ``chunks`` the list of chunk strings and ``status`` a
	        confirmation message. On any failure ``index`` and ``chunks`` are
	        ``None`` and ``status`` explains what went wrong.
	    """
	    if pdf_file is None:
	        return None, None, "Please upload a PDF."

	    try:
	        text = pdf_to_text(pdf_file)
	    except Exception as exc:
	        # UI boundary: an unreadable, corrupt or encrypted file is reported
	        # as a status message, like the other failure paths, instead of
	        # raising out of the event handler.
	        return None, None, f"❌ Could not read PDF.\n\nError: {type(exc).__name__}: {exc}"

	    if not text.strip():
	        return None, None, "Could not extract text. If it’s scanned, you need OCR."

	    chunks = chunk_text(text)
	    if len(chunks) < 2:
	        return None, None, "Not enough text to build RAG index."

	    index = build_faiss_index(chunks)
	    return index, chunks, f"✅ Indexed {len(chunks)} chunks. Now ask a question."


	def answer_question(index, chunks, question):
	    """Answer *question* from the indexed document and list the sources used.

	    Returns a Markdown string. If no document is indexed yet, or the
	    question is empty or blank, a short instruction is returned instead.
	    Otherwise the top retrieved chunks are numbered into a context block,
	    the LLM is asked to answer from that context only, and the reply is
	    followed by each source's score and its first 700 characters.
	    """
	    if index is None or chunks is None:
	        return "Upload a PDF first and wait for indexing."
	    if not (question and question.strip()):
	        return "Type a question."

	    hits = retrieve(question, index, chunks, k=TOP_K)

	    numbered_passages = [
	        f"[{rank}] {passage}" for rank, (_, passage) in enumerate(hits, start=1)
	    ]
	    context = "\n\n".join(numbered_passages)

	    prompt = f"""You are a helpful assistant. Answer using ONLY the context.
	If the answer is not in the context, say: "I don't know from the provided document."

	Question: {question}

	Context:
	{context}

	Answer:"""

	    ans = llm_generate(prompt)

	    source_blocks = [
	        f"Source {rank} (score={score:.3f})\n{passage[:700]}..."
	        for rank, (score, passage) in enumerate(hits, start=1)
	    ]
	    sources = "\n\n".join(source_blocks)

	    return f"### Answer\n{ans}\n\n---\n### Retrieved Sources\n{sources}"


	# -----------------------------
	# UI (Gradio)
	# -----------------------------
	with gr.Blocks(title="PDF RAG (Together.ai)") as demo:
	    # Header: app title plus the configured LLM and embedding model names.
	    gr.Markdown(
	        "# 📄 PDF RAG (Together.ai)\n"
	        "Upload a PDF, build a FAISS index, and ask questions.\n\n"
	        f"LLM: `{TOGETHER_MODEL}` \n"
	        f"Embedder: `{EMBED_MODEL_NAME}`"
	    )

	    # Upload widget and the status line that reports indexing results.
	    pdf = gr.File(type="filepath", label="Upload PDF")
	    status = gr.Markdown()

	    # Per-session holders for the FAISS index and its chunk list.
	    index_state = gr.State(value=None)
	    chunks_state = gr.State(value=None)

	    # Re-index whenever the uploaded file changes.
	    pdf.change(
	        fn=index_pdf,
	        inputs=[pdf],
	        outputs=[index_state, chunks_state, status],
	    )

	    # Question box, answer area and the button that triggers retrieval + LLM.
	    question = gr.Textbox(
	        label="Question",
	        placeholder="e.g., Summarize the document",
	    )
	    out = gr.Markdown()
	    btn = gr.Button(value="Ask")

	    btn.click(
	        fn=answer_question,
	        inputs=[index_state, chunks_state, question],
	        outputs=[out],
	    )

	demo.launch()