trkn-hackrx

Sleeping

App Files Files Community

Nidhi-Phophaliya committed on Aug 4, 2025

Commit

5f36465

verified ·

1 Parent(s): d74cd48

Upload 12 files

Browse files

Files changed (12) hide show

app/__pycache__/main.cpython-313.pyc +0 -0
app/main.py +75 -0
app/models/__pycache__/schema.cpython-313.pyc +0 -0
app/models/schema.py +14 -0
app/utils/__pycache__/embedder.cpython-313.pyc +0 -0
app/utils/__pycache__/llm_decider.cpython-313.pyc +0 -0
app/utils/__pycache__/pdf_parser.cpython-313.pyc +0 -0
app/utils/__pycache__/search.cpython-313.pyc +0 -0
app/utils/embedder.py +38 -0
app/utils/llm_decider.py +64 -0
app/utils/pdf_parser.py +35 -0
app/utils/search.py +8 -0

app/__pycache__/main.cpython-313.pyc ADDED Viewed

Binary file (4.3 kB). View file

app/main.py ADDED Viewed

	@@ -0,0 +1,75 @@

+from fastapi import FastAPI, HTTPException, Request, Header
+from app.models.schema import QueryRequest, QueryResponse, JustificationItem
+from app.utils.search import SemanticSearch
+from app.utils.llm_decider import generate_decision
+from app.utils.embedder import Embedder
+import os
+from dotenv import load_dotenv
+# Read variables from a local .env file (if one is found) into the process environment.
+# NOTE(review): app.utils.llm_decider is imported above, i.e. before this call runs.
+load_dotenv()
+# Shared secret that /hackrx/run compares against the Bearer token; None when API_KEY is unset.
+API_KEY = os.getenv("API_KEY")
+app = FastAPI()
+# Build one Embedder at import time and load its FAISS index and pickled metadata
+# from the two Google Drive URLs below, so every request reuses the same instance.
+embedder = Embedder()
+embedder.load_from_drive(
+    index_url="https://drive.google.com/uc?id=1GOSzA4PiEsDZupMEeNsuIEhKpbRMWgxl",
+    metadata_url="https://drive.google.com/uc?id=1MPkhB5L0TkXNivb1SjRlhYejDpP9Mp6v"
+)
+# Semantic-search wrapper around the shared embedder; used by the /hackrx/run handler.
+search_engine = SemanticSearch(embedder)
+@app.post("/debug/test")
+def debug(payload: dict):
+    """Echo the posted JSON object back under the "echo" key."""
+    echoed = {"echo": payload}
+    return echoed
+@app.get("/")
+def health():
+    """Report that the service is up (GET /)."""
+    status_message = "HackRx API running 🚀"
+    return {"status": status_message}
+@app.head("/")
+def health_head():
+    """Answer HEAD / without producing a body."""
+    return None
+@app.middleware("http")
+async def log_all_requests(request: Request, call_next):
+    """Print each request's raw body and headers, then hand it to the next handler.
+
+    Args:
+        request: the incoming HTTP request.
+        call_next: callable that forwards the request down the stack.
+
+    Returns:
+        The response produced by ``call_next`` unchanged.
+    """
+    body = await request.body()
+    # errors="replace": a non-UTF-8 body (e.g. a binary upload) must not make
+    # the logger itself fail the request with UnicodeDecodeError.
+    print("📥 RAW Body:", body.decode("utf-8", errors="replace"))
+    # Mask credential headers so the API key never ends up in the logs.
+    logged_headers = {
+        name: "<redacted>" if name.lower() in ("authorization", "cookie") else value
+        for name, value in request.headers.items()
+    }
+    print("📥 Headers:", logged_headers)
+    return await call_next(request)
+@app.post("/hackrx/run", response_model=QueryResponse)
+def run_handler(request: Request, payload: QueryRequest, authorization: str = Header(None)):
+    """Authenticate the caller, retrieve relevant chunks and return the LLM decision.
+
+    Args:
+        request: the raw request (unused here; kept so the signature is unchanged).
+        payload: validated request body carrying the user's query.
+        authorization: value of the ``Authorization`` header, expected as
+            ``Bearer <API_KEY>``.
+
+    Returns:
+        A ``QueryResponse`` built from the dict returned by ``generate_decision``.
+
+    Raises:
+        HTTPException: 401 when the bearer token is missing, malformed or wrong;
+            404 when the search returns no rows; 500 for any other failure.
+    """
+    import hmac  # local import: only needed for the constant-time comparison below
+    print("📩 Incoming query:", payload.query)
+    token = None
+    if authorization and authorization.startswith("Bearer "):
+        parts = authorization.split()
+        # A bare "Bearer " has no second part; treat it as unauthorized
+        # instead of letting an IndexError escape as a 500.
+        if len(parts) > 1:
+            token = parts[1]
+    authorized = (
+        token is not None
+        and API_KEY is not None
+        # Constant-time comparison; bytes are used because compare_digest
+        # rejects non-ASCII str arguments.
+        and hmac.compare_digest(token.encode("utf-8"), API_KEY.encode("utf-8"))
+    )
+    if not authorized:
+        print("❌ Unauthorized request")
+        raise HTTPException(status_code=401, detail="Unauthorized")
+    try:
+        results_df = search_engine.search(payload.query)
+        print(f"🔍 Found {len(results_df)} relevant chunks")
+        if results_df.empty:
+            raise HTTPException(status_code=404, detail="No relevant information found")
+        top_chunks = results_df['text'].tolist()
+        print("📄 Preview chunk:", str(top_chunks[0])[:150])
+        # ✅ Use LLM
+        parsed = generate_decision(payload.query, top_chunks)
+        # "or []" covers an explicit JSON null for the justification list.
+        justification_items = [JustificationItem(**j) for j in parsed.get('justification') or []]
+        decision = parsed.get('decision')
+        amount = parsed.get('amount')
+        # The model may answer with a number or null; the response schema wants strings.
+        return QueryResponse(
+            decision="No decision provided" if decision is None else str(decision),
+            amount="N/A" if amount is None else str(amount),
+            justification=justification_items
+        )
+    except HTTPException:
+        # Deliberate HTTP errors (e.g. the 404 above) must not be rewritten as 500.
+        raise
+    except Exception as e:
+        print("🔥 Internal error:", str(e))
+        raise HTTPException(status_code=500, detail=f"Internal Error: {str(e)}") from e

app/models/__pycache__/schema.cpython-313.pyc ADDED Viewed

Binary file (1.01 kB). View file

app/models/schema.py ADDED Viewed

	@@ -0,0 +1,14 @@

+from pydantic import BaseModel
+from typing import List
+# Request body accepted by POST /hackrx/run.
+class QueryRequest(BaseModel):
+    # Free-text user query; the handler passes it to the semantic search and to the LLM prompt.
+    query: str
+# One entry of the "justification" list in the decision response.
+class JustificationItem(BaseModel):
+    # Clause the reason refers to (the LLM prompt asks for "<clause>").
+    clause: str
+    # Explanation tied to that clause (the LLM prompt asks for "<why>").
+    reason: str
+# Response body returned by POST /hackrx/run.
+class QueryResponse(BaseModel):
+    # Decision text; the LLM prompt requests "approved | rejected".
+    decision: str
+    # Amount as a string; the handler falls back to "N/A" when the LLM gives none.
+    amount: str
+    # Clause-level reasons backing the decision; may be empty.
+    justification: List[JustificationItem]

app/utils/__pycache__/embedder.cpython-313.pyc ADDED Viewed

Binary file (2.71 kB). View file

app/utils/__pycache__/llm_decider.cpython-313.pyc ADDED Viewed

Binary file (2.74 kB). View file

app/utils/__pycache__/pdf_parser.cpython-313.pyc ADDED Viewed

Binary file (1.72 kB). View file

app/utils/__pycache__/search.cpython-313.pyc ADDED Viewed

Binary file (754 Bytes). View file

app/utils/embedder.py ADDED Viewed

	@@ -0,0 +1,38 @@

+# app/utils/embedder.py
+import gdown
+import os
+import faiss
+import numpy as np
+import pickle
+from sentence_transformers import SentenceTransformer
+class Embedder:
+    """Sentence-embedding model paired with a FAISS index and row metadata.
+
+    ``index`` and ``metadata`` start as ``None`` and are filled by
+    ``load_from_files`` / ``load_from_drive``; ``query`` refuses to run until
+    both are set.
+    """
+    def __init__(self, model_name='paraphrase-MiniLM-L3-v2'):
+        """Load the SentenceTransformer named *model_name*; no index is loaded yet."""
+        self.model = SentenceTransformer(model_name)
+        self.index = None
+        self.metadata = None
+    def download_file(self, url, out_path):
+        """Download *url* to *out_path* with gdown unless the file already exists.
+
+        Raises:
+            RuntimeError: if the file is still missing after the download attempt.
+        """
+        if os.path.exists(out_path):
+            return
+        gdown.download(url, out_path, quiet=False)
+        # Fail here with a clear message rather than later inside the index reader.
+        if not os.path.exists(out_path):
+            raise RuntimeError(f"Download failed: {url} -> {out_path}")
+    def load_from_files(self, index_path, metadata_path):
+        """Read the FAISS index from *index_path* and unpickle metadata from *metadata_path*."""
+        self.index = faiss.read_index(index_path)
+        # SECURITY NOTE(review): pickle.load executes arbitrary code from the file;
+        # only load metadata files from a source you trust.
+        with open(metadata_path, "rb") as f:
+            self.metadata = pickle.load(f)
+    def load_from_drive(self, index_url, metadata_url):
+        """Fetch the index and metadata files (if not already on disk) and load them."""
+        self.download_file(index_url, "faiss_index.idx")
+        self.download_file(metadata_url, "metadata.pkl")
+        self.load_from_files("faiss_index.idx", "metadata.pkl")
+    def query(self, query_text, k=5):
+        """Return up to *k* metadata rows nearest to *query_text*, with a ``score`` column.
+
+        ``metadata`` is indexed with ``.iloc``, so it is assumed to be a pandas
+        DataFrame whose row order matches the index - TODO confirm against the
+        code that builds the pickle.
+
+        Raises:
+            ValueError: if the index or the metadata has not been loaded.
+        """
+        if self.index is None or self.metadata is None:
+            raise ValueError("Index or metadata not loaded")
+        query_embedding = self.model.encode([query_text]).astype('float32')
+        distances, indices = self.index.search(query_embedding, k)
+        # FAISS pads missing neighbours with label -1; iloc would read that as
+        # "last row", so drop those slots before the lookup.
+        found = indices[0] >= 0
+        results = self.metadata.iloc[indices[0][found]].copy()
+        results['score'] = distances[0][found]
+        return results

app/utils/llm_decider.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import os
+from dotenv import load_dotenv
+import google.generativeai as genai
+import json
+import re
+# Load environment variables.
+# This module is imported before main.py calls load_dotenv(), so load the
+# project's own .env here as well; otherwise GEMINI_API_KEY from that file is
+# not visible yet when the client below is created.
+load_dotenv()
+# Legacy developer-machine location. A missing file is ignored, and variables
+# that are already set are not overridden (load_dotenv's default).
+load_dotenv(dotenv_path="D:/Bajaj-HackRX/.env")
+api_key = os.getenv("GEMINI_API_KEY")
+print("🧪 Loaded Gemini Key:", "Yes" if api_key else "❌ Not Found")
+if api_key:
+    genai.configure(api_key=api_key)
+else:
+    print("Warning: GEMINI_API_KEY not found. Gemini client will not be initialized.")
+# None when no key is configured; generate_decision checks for this before use.
+client = genai.GenerativeModel("gemini-2.5-flash") if api_key else None
+def generate_decision(query, context_chunks):
+    """Ask Gemini for a claim decision and return it as a dict.
+
+    Args:
+        query: the user's question, inserted verbatim into the prompt.
+        context_chunks: iterable of policy text chunks, one per line of the prompt.
+
+    Returns:
+        The parsed JSON object; the prompt asks for the keys ``decision``,
+        ``amount`` and ``justification``, but their presence is not checked here.
+
+    Raises:
+        ValueError: if the Gemini client is not initialized, the reply is empty,
+            or the reply is valid JSON but not an object.
+        json.JSONDecodeError: if no JSON object can be parsed from the reply.
+    """
+    if client is None:
+        raise ValueError("❌ Gemini client is not initialized.")
+    # str() so a non-string chunk cannot break the join.
+    clauses = "\n".join(str(chunk) for chunk in context_chunks)
+    prompt = f'''
+Given the user query and policy clauses below, decide if the claim should be approved, estimate amount, and explain using clause references.
+Query:
+"{query}"
+Relevant Clauses:
+{clauses}
+Respond in JSON like:
+{{
+  "decision": "approved | rejected",
+  "amount": "<amount or N/A>",
+  "justification": [
+    {{ "clause": "<clause>", "reason": "<why>" }}
+  ]
+}}
+'''
+    print("🧠 Gemini Prompt Preview:\n", prompt[:500], "...\n")
+    response = client.generate_content(prompt)
+    raw_text = response.text
+    # Clean output (in case it's wrapped in ```json ``` blocks)
+    cleaned_text = re.sub(r"^```json\s*|```$", "", raw_text, flags=re.MULTILINE).strip()
+    try:
+        if not cleaned_text:
+            raise ValueError("❌ Gemini model returned an empty response.")
+        try:
+            parsed_json = json.loads(cleaned_text)
+        except json.JSONDecodeError:
+            # The model sometimes adds prose around the object; retry on the
+            # outermost {...} span before giving up.
+            start, end = cleaned_text.find("{"), cleaned_text.rfind("}")
+            if start == -1 or end <= start:
+                raise
+            parsed_json = json.loads(cleaned_text[start:end + 1])
+        # Callers use dict access (.get); reject arrays, strings and numbers here.
+        if not isinstance(parsed_json, dict):
+            raise ValueError("❌ Gemini response is not a JSON object.")
+    except json.JSONDecodeError as e:
+        print(f"❌ JSON decode error: {e}")
+        print(f"🧾 Raw Gemini response: {raw_text}")
+        raise
+    except Exception as e:
+        print(f"🔥 Unexpected error: {e}")
+        print(f"🧾 Raw Gemini response: {raw_text}")
+        raise
+    print("✅ Gemini response parsed successfully")
+    return parsed_json  # Return actual Python dict

app/utils/pdf_parser.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import fitz  # PyMuPDF
+import uuid
+import pandas as pd
+import os
+def chunk_text(text, chunk_size=500, overlap=100):
+    """Split *text* into overlapping chunks of whitespace-separated words.
+
+    Args:
+        text: input string; split on any whitespace.
+        chunk_size: maximum number of words per chunk (must be > 0).
+        overlap: number of words shared by consecutive chunks
+            (must satisfy ``0 <= overlap < chunk_size``).
+
+    Returns:
+        List of chunk strings with words joined by single spaces; empty when
+        *text* has no words.
+
+    Raises:
+        ValueError: if *chunk_size* or *overlap* is out of range.
+    """
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be a positive integer")
+    if not 0 <= overlap < chunk_size:
+        # Otherwise the window would not advance (zero or negative step).
+        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
+    words = text.split()
+    step = chunk_size - overlap
+    chunks = []
+    for start in range(0, len(words), step):
+        chunks.append(" ".join(words[start:start + chunk_size]))
+        # Once a window reaches the last word, any further window would only
+        # repeat words already covered by this chunk.
+        if start + chunk_size >= len(words):
+            break
+    return chunks
+def extract_chunks_from_pdfs(pdf_paths, output_csv_path):
+    """Chunk the text of every page of the given PDFs and write the chunks to a CSV.
+
+    Args:
+        pdf_paths: iterable of PDF file paths.
+        output_csv_path: destination CSV path (written without the index column).
+
+    Returns:
+        DataFrame with columns ``chunk_id``, ``source_doc``, ``page``,
+        ``chunk_index`` and ``text``; ``text`` is cut to 1000 characters.
+    """
+    columns = ["chunk_id", "source_doc", "page", "chunk_index", "text"]
+    all_chunks = []
+    for file_path in pdf_paths:
+        source_doc = os.path.basename(file_path)
+        # Context manager closes the document even if a page fails to parse.
+        with fitz.open(file_path) as doc:
+            for page_num, page in enumerate(doc, start=1):
+                for idx, chunk in enumerate(chunk_text(page.get_text())):
+                    all_chunks.append({
+                        "chunk_id": str(uuid.uuid4()),
+                        "source_doc": source_doc,
+                        "page": page_num,
+                        "chunk_index": idx,
+                        "text": chunk
+                    })
+    # Build the frame BEFORE touching df['text'] (the original used df first).
+    # Explicit columns keep the schema stable when no chunks were produced.
+    df = pd.DataFrame(all_chunks, columns=columns)
+    if not df.empty:
+        df['text'] = df['text'].str[:1000]
+    df.to_csv(output_csv_path, index=False)
+    return df

app/utils/search.py ADDED Viewed

	@@ -0,0 +1,8 @@

+# app/utils/search.py
+class SemanticSearch:
+    """Thin search facade that delegates every lookup to an embedder."""
+    def __init__(self, embedder):
+        """Keep a reference to *embedder*, whose ``query`` method does the work."""
+        self.embedder = embedder
+    def search(self, query_text, top_k=5):
+        """Return ``embedder.query(query_text, k=top_k)`` unchanged."""
+        matches = self.embedder.query(query_text, k=top_k)
+        return matches