Nidhi-Phophaliya committed on
Commit
5f36465
·
verified ·
1 Parent(s): d74cd48

Upload 12 files

Browse files
app/__pycache__/main.cpython-313.pyc ADDED
Binary file (4.3 kB). View file
 
app/main.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, Request, Header
2
+ from app.models.schema import QueryRequest, QueryResponse, JustificationItem
3
+ from app.utils.search import SemanticSearch
4
+ from app.utils.llm_decider import generate_decision
5
+ from app.utils.embedder import Embedder
6
+ import os
7
+ from dotenv import load_dotenv
8
+
9
# Pull variables from a local .env file into the process environment before
# any configuration value is read.
load_dotenv()

# Shared secret that /hackrx/run compares against the caller's bearer token.
API_KEY = os.environ.get("API_KEY")
app = FastAPI()

# ✅ Load the embedder once from Drive
# Both artifacts are fetched (if not already on disk) and loaded at import
# time, so every request reuses the same in-memory index.
_DRIVE_ARTIFACTS = {
    "index_url": "https://drive.google.com/uc?id=1GOSzA4PiEsDZupMEeNsuIEhKpbRMWgxl",
    "metadata_url": "https://drive.google.com/uc?id=1MPkhB5L0TkXNivb1SjRlhYejDpP9Mp6v",
}
embedder = Embedder()
embedder.load_from_drive(**_DRIVE_ARTIFACTS)

# ✅ Semantic Search wrapper
search_engine = SemanticSearch(embedder)
23
+
24
@app.post("/debug/test")
def debug(payload: dict):
    # Debug helper: sends the parsed JSON body straight back under "echo".
    # (Comment instead of a docstring so the route's OpenAPI description
    # stays empty, as before.)
    response_body = {"echo": payload}
    return response_body
27
+
28
@app.get("/")
def health():
    # Liveness probe for GET /: returns a fixed status message.
    status_payload = {"status": "HackRx API running 🚀"}
    return status_payload
31
+
32
@app.head("/")
def health_head():
    # HEAD counterpart of the health check; there is no body to return.
    return None
35
+
36
@app.middleware("http")
async def log_all_requests(request: Request, call_next):
    """Print the raw body and headers of every request, then pass it on.

    The body is decoded leniently so that a payload which is not valid UTF-8
    cannot make the logging hook itself fail the request, and credential
    headers are masked so secrets do not end up in the logs.
    """
    body = await request.body()
    # errors="replace": undecodable bytes become U+FFFD instead of raising
    # UnicodeDecodeError before the real handler ever runs.
    print("📥 RAW Body:", body.decode("utf-8", errors="replace"))

    # Never log bearer tokens or cookies verbatim.
    sensitive = ("authorization", "cookie")
    safe_headers = {
        name: "<redacted>" if name.lower() in sensitive else value
        for name, value in request.headers.items()
    }
    print("📥 Headers:", safe_headers)

    return await call_next(request)
43
+
44
@app.post("/hackrx/run", response_model=QueryResponse)
def run_handler(request: Request, payload: QueryRequest, authorization: str = Header(None)):
    # Authenticated query endpoint: checks the bearer token, retrieves the most
    # relevant chunks for payload.query, asks the LLM for a decision and maps
    # its answer onto QueryResponse.
    #
    # Responses: 401 for a missing/invalid token, 404 when the search returns
    # nothing, 500 for any other failure.
    #
    # (Documented with comments rather than a docstring so the generated
    # OpenAPI description of the route is unchanged.)
    import hmac  # function-scope import; only needed for the token check

    print("📩 Incoming query:", payload.query)

    # Validate "Bearer <token>" without indexing past the end of the split
    # result, and compare in constant time to avoid leaking the key via timing.
    parts = authorization.split() if authorization else []
    has_token = bool(parts) and authorization.startswith("Bearer ") and len(parts) > 1
    token_ok = (
        has_token
        and bool(API_KEY)
        and hmac.compare_digest(parts[1].encode("utf-8"), API_KEY.encode("utf-8"))
    )
    if not token_ok:
        print("❌ Unauthorized request")
        raise HTTPException(status_code=401, detail="Unauthorized")

    try:
        results_df = search_engine.search(payload.query)
        print(f"🔍 Found {len(results_df)} relevant chunks")

        if results_df.empty:
            raise HTTPException(status_code=404, detail="No relevant information found")

        top_chunks = results_df['text'].tolist()
        print("📄 Preview chunk:", top_chunks[0][:150])

        # ✅ Use LLM
        parsed = generate_decision(payload.query, top_chunks)

        justification_items = [JustificationItem(**j) for j in parsed.get('justification', [])]

        # The response schema declares `amount` as a string, but the model may
        # emit a bare number or null; normalise it here.
        amount = parsed.get('amount')
        amount = "N/A" if amount is None else str(amount)

        return QueryResponse(
            decision=parsed.get('decision', "No decision provided"),
            amount=amount,
            justification=justification_items
        )

    except HTTPException:
        # Deliberate HTTP errors (e.g. the 404 above) must reach the client
        # as-is instead of being rewritten into a 500 below.
        raise
    except Exception as e:
        print("🔥 Internal error:", str(e))
        raise HTTPException(status_code=500, detail=f"Internal Error: {str(e)}") from e
app/models/__pycache__/schema.cpython-313.pyc ADDED
Binary file (1.01 kB). View file
 
app/models/schema.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+ from typing import List
3
+
4
# Request body accepted by POST /hackrx/run.
class QueryRequest(BaseModel):
    # Free-text question used both for retrieval and in the LLM prompt.
    query: str
6
+
7
# One entry of the "justification" list in the LLM's answer.
class JustificationItem(BaseModel):
    # The clause the decision refers to.
    clause: str
    # Why that clause supports the decision.
    reason: str
10
+
11
# Response body returned by POST /hackrx/run.
class QueryResponse(BaseModel):
    # Decision text produced by the LLM.
    decision: str
    # Amount as text; the handler falls back to "N/A" when none is given.
    amount: str
    # Clause-level explanations backing the decision.
    justification: list[JustificationItem]
app/utils/__pycache__/embedder.cpython-313.pyc ADDED
Binary file (2.71 kB). View file
 
app/utils/__pycache__/llm_decider.cpython-313.pyc ADDED
Binary file (2.74 kB). View file
 
app/utils/__pycache__/pdf_parser.cpython-313.pyc ADDED
Binary file (1.72 kB). View file
 
app/utils/__pycache__/search.cpython-313.pyc ADDED
Binary file (754 Bytes). View file
 
app/utils/embedder.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/utils/embedder.py
2
+
3
+ import gdown
4
+ import os
5
+ import faiss
6
+ import numpy as np
7
+ import pickle
8
+ from sentence_transformers import SentenceTransformer
9
+
10
class Embedder:
    """Sentence-embedding model plus a FAISS index and its row metadata.

    ``index`` and ``metadata`` start as ``None`` and are filled by
    ``load_from_files`` / ``load_from_drive``; ``query`` refuses to run
    until both are present.
    """

    def __init__(self, model_name='paraphrase-MiniLM-L3-v2'):
        """Load the sentence-transformer model named *model_name*."""
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.metadata = None

    def download_file(self, url, out_path):
        """Download *url* to *out_path* unless that file already exists."""
        if not os.path.exists(out_path):
            gdown.download(url, out_path, quiet=False)

    def load_from_files(self, index_path, metadata_path):
        """Read the FAISS index and the pickled metadata from local files."""
        self.index = faiss.read_index(index_path)
        # SECURITY NOTE(review): unpickling executes arbitrary code embedded in
        # the file. load_from_drive feeds this a file downloaded from a URL, so
        # only point it at sources you fully trust (or move to a data-only
        # format such as CSV/Parquet).
        with open(metadata_path, "rb") as f:
            self.metadata = pickle.load(f)

    def load_from_drive(self, index_url, metadata_url):
        """Fetch index and metadata into the working directory, then load them."""
        self.download_file(index_url, "faiss_index.idx")
        self.download_file(metadata_url, "metadata.pkl")
        self.load_from_files("faiss_index.idx", "metadata.pkl")

    def query(self, query_text, k=5):
        """Return up to *k* metadata rows nearest to *query_text*.

        The result is a copy of the matching metadata rows with an added
        ``score`` column holding the values returned by the index search.

        Raises:
            ValueError: if the index or the metadata has not been loaded.
        """
        if self.index is None or self.metadata is None:
            raise ValueError("Index or metadata not loaded")

        query_embedding = self.model.encode([query_text]).astype('float32')
        distances, indices = self.index.search(query_embedding, k)

        # FAISS pads the result with label -1 when it finds fewer than k
        # neighbours; iloc[-1] would silently return the *last* metadata row,
        # so drop those slots before looking anything up.
        found = indices[0] >= 0
        results = self.metadata.iloc[indices[0][found]].copy()
        results['score'] = distances[0][found]
        return results
app/utils/llm_decider.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ import google.generativeai as genai
4
+ import json
5
+ import re
6
+
7
# Load environment variables.
# No explicit path: python-dotenv then looks for a ".env" file starting from
# this module's directory and walking up, so the project works wherever it is
# checked out instead of only under one developer's D: drive. This also matches
# the bare load_dotenv() call in app/main.py.
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")

print("🧪 Loaded Gemini Key:", "Yes" if api_key else "❌ Not Found")

# `client` stays None without a key; generate_decision checks for that and
# raises instead of calling the API unauthenticated.
if api_key:
    genai.configure(api_key=api_key)
    client = genai.GenerativeModel("gemini-2.5-flash")
else:
    print("Warning: GEMINI_API_KEY not found. Gemini client will not be initialized.")
    client = None
19
+
20
def generate_decision(query, context_chunks):
    """Ask Gemini for a claim decision and return it as a dict.

    Args:
        query: The user's question; inserted verbatim into the prompt.
        context_chunks: Iterable of clause strings, joined with newlines in
            the prompt.

    Returns:
        The model's answer parsed from JSON. It is guaranteed to be a dict;
        the prompt asks for "decision", "amount" and "justification" keys but
        their presence is not enforced here.

    Raises:
        ValueError: if the client is not initialised, the reply is empty, the
            reply is not valid JSON (``json.JSONDecodeError``), or the JSON is
            not an object.
    """
    if client is None:
        raise ValueError("❌ Gemini client is not initialized.")

    prompt = f'''
Given the user query and policy clauses below, decide if the claim should be approved, estimate amount, and explain using clause references.

Query:
"{query}"

Relevant Clauses:
{chr(10).join(context_chunks)}

Respond in JSON like:
{{
"decision": "approved | rejected",
"amount": "<amount or N/A>",
"justification": [
{{ "clause": "<clause>", "reason": "<why>" }}
]
}}
'''

    print("🧠 Gemini Prompt Preview:\n", prompt[:500], "...\n")

    response = client.generate_content(prompt)
    raw_text = response.text

    # Strip Markdown code fences: an opening ``` with an optional "json" tag in
    # any case, and a closing ``` at the end of a line.
    cleaned_text = re.sub(
        r"^```(?:json)?\s*|```\s*$",
        "",
        raw_text,
        flags=re.MULTILINE | re.IGNORECASE,
    ).strip()

    try:
        if not cleaned_text:
            raise ValueError("❌ Gemini model returned an empty response.")
        parsed_json = _parse_json_object(cleaned_text)
        # Callers use dict methods on the result; reject arrays/scalars here
        # with a clear message instead of failing later on `.get`.
        if not isinstance(parsed_json, dict):
            raise ValueError("❌ Gemini response is not a JSON object.")
        print("✅ Gemini response parsed successfully")
        return parsed_json  # Return actual Python dict
    except Exception as e:
        if isinstance(e, json.JSONDecodeError):
            print(f"❌ JSON decode error: {e}")
        else:
            print(f"🔥 Unexpected error: {e}")
        print(f"🧾 Raw Gemini response: {raw_text}")
        raise


def _parse_json_object(text):
    """Parse *text* as JSON, retrying on its outermost ``{...}`` span.

    The retry covers replies where the model wraps the JSON in a sentence of
    prose. If there is no brace pair, the original decode error propagates.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(text[start:end + 1])
app/utils/pdf_parser.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ import uuid
3
+ import pandas as pd
4
+ import os
5
+
6
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart and hold at
    most ``chunk_size`` words. Chunking stops as soon as a chunk reaches the
    last word, so no trailing chunk is produced that is already fully
    contained in the previous one.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (empty when *text* has no words).

    Raises:
        ValueError: if *chunk_size* or *overlap* is out of range. Without this
            check the window step would be zero or negative, which either
            crashes ``range`` or silently yields no chunks.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This window already covers the tail; a further window would only
        # repeat words from the overlap region.
        if start + chunk_size >= len(words):
            break
    return chunks
14
+
15
def extract_chunks_from_pdfs(pdf_paths, output_csv_path):
    """Chunk the text of every page of the given PDFs and save it as CSV.

    Each chunk becomes one row with a random ``chunk_id``, the PDF's base
    file name (``source_doc``), the 1-based ``page`` number, the chunk's
    index within that page (``chunk_index``) and its ``text`` truncated to
    1000 characters.

    Args:
        pdf_paths: Iterable of paths to PDF files.
        output_csv_path: Where the resulting table is written (no index
            column).

    Returns:
        The pandas DataFrame that was written; it has the five columns above
        even when no chunks were produced.
    """
    columns = ["chunk_id", "source_doc", "page", "chunk_index", "text"]
    all_chunks = []

    for file_path in pdf_paths:
        source_doc = os.path.basename(file_path)
        # Context manager so the document handle is released even if text
        # extraction fails part-way through.
        with fitz.open(file_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                for idx, chunk in enumerate(chunk_text(page.get_text())):
                    all_chunks.append({
                        "chunk_id": str(uuid.uuid4()),
                        "source_doc": source_doc,
                        "page": page_num,
                        "chunk_index": idx,
                        "text": chunk,
                    })

    # Build the frame BEFORE touching its columns (the truncation used to run
    # first and failed on the not-yet-assigned name). Explicit columns keep
    # the 'text' lookup valid for empty input.
    df = pd.DataFrame(all_chunks, columns=columns)
    df['text'] = df['text'].apply(lambda t: t[:1000])
    df.to_csv(output_csv_path, index=False)
    return df
app/utils/search.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # app/utils/search.py
2
+
3
class SemanticSearch:
    """Thin search facade that forwards every lookup to an embedder."""

    def __init__(self, embedder):
        """Keep a reference to *embedder*; its ``query`` method does the work."""
        self.embedder = embedder

    def search(self, query_text, top_k=5):
        """Return whatever ``embedder.query`` yields for *query_text*.

        *top_k* is passed through as the embedder's ``k`` argument.
        """
        backend = self.embedder
        hits = backend.query(query_text, k=top_k)
        return hits