| | import os |
| | import gradio as gr |
| | import fitz |
| | from sentence_transformers import SentenceTransformer |
| | import chromadb |
| | from chromadb.utils import embedding_functions |
| | import openai |
| |
|
| | |
# --- LLM backend ------------------------------------------------------------
# Groq serves an OpenAI-compatible API, so the `openai` module is pointed at
# Groq's base URL and authenticated with the key from the environment.
# os.getenv returns None when GROQ_API_KEY is unset.
openai.api_key = os.getenv("GROQ_API_KEY")
openai.api_base = "https://api.groq.com/openai/v1"

# --- Embedding model --------------------------------------------------------
# Loaded once at import time; the same model embeds both the stored chunks
# and the incoming queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# --- Vector store -----------------------------------------------------------
persist_path = "./chroma_db"
# Passing only `persist_directory` to chromadb.Client() yields an in-memory
# store, so indexed papers would not survive a restart. chromadb >= 0.4
# provides PersistentClient for on-disk storage; older releases need the
# "duckdb+parquet" backend selected explicitly.
if hasattr(chromadb, "PersistentClient"):
    db = chromadb.PersistentClient(path=persist_path)
else:
    db = chromadb.Client(
        chromadb.config.Settings(
            chroma_db_impl="duckdb+parquet",
            persist_directory=persist_path,
        )
    )
collection = db.get_or_create_collection("papers")
| |
|
| | |
def extract_text_from_pdf(file):
    """Return the concatenated plain text of every page of a PDF.

    Args:
        file: The PDF content, either as raw ``bytes``/``bytearray`` (what a
            Gradio ``File`` component configured with ``type="binary"``
            passes to its callback) or as a binary file-like object that
            exposes ``read()``.

    Returns:
        The text of all pages joined in page order; an empty string when no
        page yields any text.

    Errors raised by PyMuPDF while opening or reading the data are not
    caught here and propagate to the caller.
    """
    # Accept raw bytes as well as file-like objects: calling .read() on a
    # bytes payload would raise AttributeError.
    if isinstance(file, (bytes, bytearray)):
        data = bytes(file)
    else:
        data = file.read()

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(stream=data, filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
| |
|
| | |
def chunk_and_store(text, chunk_size=500):
    """Split *text* into fixed-size chunks, embed them and add them to the index.

    Args:
        text: The full document text to index.
        chunk_size: Number of characters per chunk (the last chunk may be
            shorter). Defaults to 500.

    Returns:
        None. Does nothing when *text* is empty.

    Raises:
        ValueError: If *chunk_size* is not a positive integer.
    """
    import uuid  # local import: only needed here to mint chunk ids

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    if not chunks:
        # Nothing to embed or store.
        return

    embeddings = embedder.encode(chunks).tolist()

    # Ids derived from the current collection size collide across uploads
    # (the size changes while records are being added), so each chunk gets
    # a random unique id instead.
    ids = [f"id_{uuid.uuid4().hex}" for _ in chunks]

    # One batched insert instead of one round-trip (plus a full collection
    # read) per chunk.
    collection.add(documents=chunks, ids=ids, embeddings=embeddings)

    # Explicit persist() only exists on older chromadb clients; newer
    # persistent clients write through automatically and have no such method.
    persist = getattr(db, "persist", None)
    if callable(persist):
        persist()
| |
|
| | |
def retrieve_and_ask(query):
    """Answer *query* using the most similar stored chunks as context.

    The query is embedded with the shared ``embedder``, up to three nearest
    chunks are fetched from ``collection``, and the chunks plus the question
    are sent to the chat model.

    Args:
        query: The user's question.

    Returns:
        The model's answer, or a human-readable message when the query is
        blank, when nothing has been indexed yet, or when the chat API call
        fails (``"Error: ..."``).
    """
    # Validate the input before touching the index or the LLM.
    if not query or not query.strip():
        return "Please enter a question."

    # count() avoids loading every stored document just to test emptiness.
    stored = collection.count()
    if stored == 0:
        return "Please upload a paper first."

    query_embedding = embedder.encode([query]).tolist()[0]
    # Never ask for more neighbours than the collection holds.
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=min(3, stored),
    )
    context = "\n".join(results["documents"][0])

    system_prompt = "You are an academic assistant helping students understand research papers."
    user_prompt = f"Based on the following context:\n{context}\n\nAnswer the question:\n{query}"

    # This is the UI boundary: any failure from the chat API is reported to
    # the user as text rather than raised into the Gradio callback.
    try:
        response = openai.ChatCompletion.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        return response['choices'][0]['message']['content']
    except Exception as e:
        return f"Error: {str(e)}"
| |
|
| | |
def handle_upload(file):
    """Gradio callback for the "Process" button: index an uploaded PDF.

    Args:
        file: The value delivered by the upload component, or ``None`` when
            nothing was uploaded.

    Returns:
        A status message for the UI: a prompt to upload a file, a notice
        that the PDF contained no text, an ``"Error: ..."`` message when
        extraction or indexing fails, or a success confirmation.
    """
    if file is None:
        return "Upload a valid PDF file."

    # Report failures as text, in the same "Error: ..." style that
    # retrieve_and_ask uses, instead of raising into the Gradio callback.
    try:
        text = extract_text_from_pdf(file)
        if not text.strip():
            # Nothing to index, so do not claim success.
            return "No extractable text found in this PDF."
        chunk_and_store(text)
    except Exception as e:
        return f"Error: {str(e)}"

    # The original literal was split across two lines by a mis-decoded
    # check-mark emoji; restored here.
    return "✅ Paper uploaded and processed."
| |
|
def handle_query(query):
    """Gradio callback for the "Ask" button.

    Forwards the submitted question to ``retrieve_and_ask`` and returns its
    result unchanged.
    """
    answer = retrieve_and_ask(query)
    return answer
| |
|
# Gradio UI: one row for uploading and indexing a PDF, one row for asking
# questions about the indexed content.
with gr.Blocks() as demo:
    # NOTE(review): the header emoji was mojibake ("π") in the source; the
    # document emoji below is a best-guess restoration -- confirm.
    gr.Markdown("### 📄 RAG Academic Assistant\nUpload a paper and ask questions.")

    with gr.Row():
        file = gr.File(label="Upload PDF", type="binary")
        upload_btn = gr.Button("Process")
        upload_output = gr.Textbox()

    with gr.Row():
        query = gr.Textbox(label="Ask a question")
        response = gr.Textbox(label="Answer")
        ask_btn = gr.Button("Ask")

    # Event wiring is declared inside the Blocks context.
    upload_btn.click(handle_upload, inputs=[file], outputs=[upload_output])
    ask_btn.click(handle_query, inputs=[query], outputs=[response])

# Start the server only when run as a script, so importing this module
# (e.g. for tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch()
| |
|