Nidhi-Phophaliya committed on
Commit
c765e15
·
verified ·
1 Parent(s): 1edf6bc

Upload 8 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ faiss_index.idx filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Container image for the FastAPI app served by uvicorn (see CMD below).
FROM python:3.10-slim-bullseye

WORKDIR /app

# System dependencies.
# Everything runs in a single RUN so the package index fetched by
# `apt-get update` is removed in the same layer and never ends up in the image.
RUN apt-get update \
    && apt-get install -y git \
    && apt-get upgrade -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Python deps.
# requirements.txt is copied on its own first so this layer is only rebuilt
# when the dependency list changes, not on every source edit.
# --no-cache-dir keeps pip's download cache out of the image.
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# Copy source
COPY . .

# NOTE: PORT is exported to the process environment, but the exec-form CMD
# below does not expand variables and hard-codes the same value; keep the two
# in sync when changing the port.
ENV PORT=7860

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:399d91b475a5284cb6da0772c2698c9a87cc2935d596336134d4034a1ef2fd6a
3
+ size 652928
faiss_index.idx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b00ac992818054b7dbcb20ef43f3b62c9f24c7ce814bd0ef769738d4920e8cc
3
+ size 652845
generate_chunks.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
"""Build app/data/chunks.csv from every PDF found in app/data.

Run as a script from the repository root (paths are relative to the
current working directory):

    python generate_chunks.py
"""
import os

from app.utils.pdf_parser import extract_chunks_from_pdfs

pdf_dir = "app/data"


def main():
    """Collect the PDFs in ``pdf_dir`` and hand them to the chunk extractor."""
    # os.listdir() returns entries in arbitrary order; sorting makes the order
    # in which PDFs are passed to the extractor reproducible across runs and
    # machines.
    pdf_files = sorted(
        os.path.join(pdf_dir, name)
        for name in os.listdir(pdf_dir)
        if name.endswith(".pdf")
    )

    # The second argument is the destination CSV path
    # (presumably written by extract_chunks_from_pdfs; confirm in pdf_parser).
    extract_chunks_from_pdfs(pdf_files, "app/data/chunks.csv")
    print("✅ chunks.csv generated!")


if __name__ == "__main__":
    main()
metadata.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12f368e096c2788de86300c24ed8c4652aa7da370beec29087cd5124bb3b32e6
3
+ size 921211
precompute_embeddings.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Embed the text chunks in app/data/chunks.csv and build a FAISS index.

Reads the chunk table, encodes the ``text`` column with a
SentenceTransformer model, and writes four artifacts to the current
working directory:

* ``faiss_index.idx`` - flat L2 FAISS index over the embeddings
* ``embeddings.npy``  - the float32 embedding matrix
* ``texts.pkl``       - pickled list of chunk texts
* ``metadata.pkl``    - pickled DataFrame with chunk_id/source_doc/page/text

Row ``i`` of the index, the matrix, the text list and the metadata frame
all refer to the same chunk.
"""
import pickle

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer


def main():
    """Run the full load -> embed -> index -> save pipeline."""
    # Load CSV
    df = pd.read_csv("app/data/chunks.csv")

    # Empty text cells are read back as NaN (a float), which is not valid
    # encoder input. Drop those rows *before* deriving texts and metadata so
    # both stay aligned with the index row numbers.
    total_rows = len(df)
    df = df.dropna(subset=["text"]).reset_index(drop=True)
    dropped = total_rows - len(df)
    if dropped:
        print(f"Skipped {dropped} row(s) with missing text")
    if df.empty:
        # Without this guard the failure surfaces later as an opaque
        # IndexError when reading the embedding dimension.
        raise ValueError("app/data/chunks.csv contains no text chunks to embed")

    texts = df["text"].tolist()
    metadata = df[['chunk_id', 'source_doc', 'page', 'text']].reset_index(drop=True)

    # Embed
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(texts, show_progress_bar=True)
    # The matrix is passed to FAISS as a C-contiguous float32 array.
    embedding_matrix = np.ascontiguousarray(embeddings, dtype='float32')

    # Build FAISS index
    dimension = embedding_matrix.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embedding_matrix)

    # Save everything
    faiss.write_index(index, "faiss_index.idx")
    np.save("embeddings.npy", embedding_matrix)
    with open("texts.pkl", "wb") as f:
        pickle.dump(texts, f)
    # NOTE: metadata is pickled as a pandas DataFrame, so whatever loads it
    # needs pandas installed to unpickle it.
    with open("metadata.pkl", "wb") as f:
        pickle.dump(metadata, f)

    print("✅ Saved: faiss_index.idx, embeddings.npy, texts.pkl, metadata.pkl")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ python-dotenv
4
+ pandas
5
+ faiss-cpu
6
+ sentence-transformers
7
+ PyMuPDF
8
+ google-generativeai
9
+ gdown
texts.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3624250a9779ba5d446b69a9d2818a1d6ac47ebf9b1a00490e4672401a1dc3d
3
+ size 898273