# Hugging Face Space: RAG chatbot over a machine-learning PDF
| import requests | |
| import fitz | |
| import numpy as np | |
| import faiss | |
| from sentence_transformers import SentenceTransformer | |
| from groq import Groq | |
| import gradio as gr | |
| import os | |
# =========================
# 1. LOAD API KEY (HF SECRET)
# =========================
# Read the key from the Space's secret store. If the secret is missing this
# is None and the first chat-completion call will fail with an auth error.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
client = Groq(api_key=GROQ_API_KEY)
# =========================
# 2. LOAD PDF
# =========================
pdf_url = "https://huggingface.co/datasets/HuzaifaTech/rag_file/resolve/main/Hands_On_Machine_Learning_with_Scikit_Le.pdf"
pdf_path = "file.pdf"
# Download once and cache on disk across restarts of the Space.
if not os.path.exists(pdf_path):
    # timeout: the bare requests.get() could hang forever on a network stall.
    response = requests.get(pdf_url, timeout=60)
    # Fail loudly on HTTP errors instead of silently saving an error page
    # as "file.pdf" and corrupting every later step.
    response.raise_for_status()
    with open(pdf_path, "wb") as f:
        f.write(response.content)
# =========================
# 3. EXTRACT TEXT
# =========================
# Join per-page text in one pass: repeated "text +=" is quadratic on large
# PDFs. The context manager also closes the document handle, which the
# original never did.
with fitz.open(pdf_path) as doc:
    text = "".join(page.get_text() for page in doc)
# =========================
# 4. CHUNKING
# =========================
def chunk_text(text, chunk_size=800):
    """Greedily pack newline-separated paragraphs into ~chunk_size chunks.

    Args:
        text: Full document text; paragraphs are assumed newline-delimited.
        chunk_size: Soft upper bound on chunk length in characters. A single
            paragraph longer than this still becomes one (oversize) chunk.

    Returns:
        List of non-empty, stripped chunk strings. For empty input returns [].
    """
    paragraphs = text.split("\n")
    chunks = []
    current = ""
    for para in paragraphs:
        if len(current) + len(para) < chunk_size:
            current += para + "\n"
        else:
            # Flush the buffer, but never emit an empty chunk. (The original
            # appended "".strip() == "" whenever the buffer was empty and the
            # next paragraph alone exceeded chunk_size.)
            if current.strip():
                chunks.append(current.strip())
            # Keep the trailing newline so following paragraphs stay
            # separated (the original dropped it on this branch).
            current = para + "\n"
    if current.strip():
        chunks.append(current.strip())
    return chunks
# Only the first 300 chunks are embedded — presumably to bound embedding
# time/memory on the Space; TODO confirm this covers enough of the book.
chunks = chunk_text(text)[:300]
# =========================
# 5. EMBEDDINGS
# =========================
model = SentenceTransformer("all-MiniLM-L6-v2")
# faiss.normalize_L2 (and index.add below) require a C-contiguous float32
# array; coerce explicitly rather than trusting the encoder's output
# dtype/layout.
embeddings = np.ascontiguousarray(model.encode(chunks, batch_size=32), dtype="float32")
# Unit-normalize so L2 distance ranks results identically to cosine similarity.
faiss.normalize_L2(embeddings)
# =========================
# 6. FAISS
# =========================
# Exact (brute-force) L2 index. Because the vectors were unit-normalized
# above, nearest-by-L2 is the same ordering as most-similar-by-cosine.
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
# =========================
# 7. RETRIEVAL
# =========================
def retrieve(query, k=4):
    """Embed *query* and return the k nearest text chunks from the index."""
    query_vec = model.encode([query])
    # Must match the normalization applied at index-build time.
    faiss.normalize_L2(query_vec)
    _scores, neighbor_ids = index.search(query_vec, k)
    return [chunks[pos] for pos in neighbor_ids[0]]
# =========================
# 8. GENERATION
# =========================
def generate_answer(query):
    """Retrieve context for *query* and ask the Groq LLM to answer from it.

    Returns the model's answer string, or an "Error: ..." string on API
    failure (kept as a string so the Gradio chat UI can display it).
    """
    docs = retrieve(query)
    context = "\n\n".join(docs)
    prompt = f"""
Context:
{context}
Question:
{query}
"""
    try:
        res = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {
                    "role": "system",
                    "content": "Answer ONLY from the provided context. If not found, say 'I don't know'."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            # temperature=0: deterministic, extractive-style answers
            temperature=0,
            max_tokens=500
        )
        return res.choices[0].message.content
    except Exception as e:
        # Broad catch is deliberate: surface any API failure in the chat UI
        # instead of crashing the Space.
        return f"Error: {str(e)}"
# =========================
# 9. UI (PROFESSIONAL)
# =========================
def chat(message, history):
    """Adapter for gr.ChatInterface: ignore history, answer the new message."""
    answer = generate_answer(message)
    return answer
# Theme must be passed to gr.Blocks(); gradio's launch() has no `theme`
# parameter, so the original demo.launch(theme=...) raises TypeError.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📚 RAG Chatbot (ML Book)")
    gr.Markdown("Ask questions from *Hands-On Machine Learning* PDF")
    chatbot = gr.ChatInterface(
        fn=chat,
        chatbot=gr.Chatbot(height=400),
        textbox=gr.Textbox(placeholder="Ask a question...", container=False),
    )

demo.launch()