import os
import pickle
import re
import tempfile

import faiss
import pymupdf4llm
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder, SentenceTransformer

USE_HNSW = True
USE_RERANKER = True
CHUNK_SIZE = 800
CHUNK_OVERLAP = 200

DB_FILE_INDEX = "vector.index"
DB_FILE_META = "metadata.pkl"
DB_FILE_BM25 = "bm25.pkl"

# In-memory state, persisted to disk via save_db() / load_db().
index = None
documents = []
metadata = []
bm25 = None

embedder = SentenceTransformer("all-MiniLM-L6-v2")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


def chunk_text(text):
    """Split text into sentence-aligned chunks of roughly CHUNK_SIZE characters."""
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks, current = [], ""
    for s in sentences:
        if len(current) + len(s) > CHUNK_SIZE and current:
            chunks.append(current.strip())
            # Carry the last CHUNK_OVERLAP characters into the next chunk.
            overlap = max(0, len(current) - CHUNK_OVERLAP)
            current = current[overlap:] + " " + s
        else:
            current += (" " + s) if current else s
    if current.strip():
        chunks.append(current.strip())
    return chunks


def save_db():
    if index is not None:
        faiss.write_index(index, DB_FILE_INDEX)
    if documents:
        with open(DB_FILE_META, "wb") as f:
            pickle.dump({"documents": documents, "metadata": metadata}, f)
    if bm25 is not None:
        with open(DB_FILE_BM25, "wb") as f:
            pickle.dump(bm25, f)


def load_db():
    global index, documents, metadata, bm25
    if os.path.exists(DB_FILE_INDEX) and os.path.exists(DB_FILE_META):
        index = faiss.read_index(DB_FILE_INDEX)
        with open(DB_FILE_META, "rb") as f:
            data = pickle.load(f)
            documents = data["documents"]
            metadata = data["metadata"]
        if os.path.exists(DB_FILE_BM25):
            with open(DB_FILE_BM25, "rb") as f:
                bm25 = pickle.load(f)
        elif documents:
            # Auto-backfill if documents exist but the BM25 index is missing.
            print("Backfilling BM25 index on first load...")
            tokenized_corpus = [doc.split(" ") for doc in documents]
            bm25 = BM25Okapi(tokenized_corpus)
            with open(DB_FILE_BM25, "wb") as f:
                pickle.dump(bm25, f)


load_db()


def clear_database():
    global index, documents, metadata, bm25
    index = None
    documents = []
    metadata = []
    bm25 = None
    if os.path.exists(DB_FILE_INDEX):
        os.remove(DB_FILE_INDEX)
    if os.path.exists(DB_FILE_META):
        os.remove(DB_FILE_META)
    if os.path.exists(DB_FILE_BM25):
        os.remove(DB_FILE_BM25)


def ingest_documents(files):
    global index, documents, metadata, bm25
    texts, meta = [], []
    for file in files:
        if file.filename.endswith(".pdf"):
            # pymupdf4llm needs a path on disk, so spill the upload to a temp file.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                tmp.write(file.file.read())
                tmp_path = tmp.name
            try:
                # Extract markdown (tables included) per page:
                # [{'text': '...', 'metadata': {'page': 1, ...}}, ...]
                pages_data = pymupdf4llm.to_markdown(tmp_path, page_chunks=True)
                for page_obj in pages_data:
                    p_text = page_obj["text"]
                    p_num = page_obj["metadata"].get("page", "N/A")
                    # Chunk within the page to preserve page context.
                    for chunk in chunk_text(p_text):
                        texts.append(chunk)
                        meta.append({"source": file.filename, "page": p_num})
            finally:
                os.remove(tmp_path)
        elif file.filename.endswith(".txt"):
            content = file.file.read().decode("utf-8", errors="ignore")
            for chunk in chunk_text(content):
                texts.append(chunk)
                meta.append({"source": file.filename, "page": "N/A"})
    if not texts:
        raise ValueError("No readable text found (OCR needed for scanned PDFs).")

    embeddings = embedder.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    if index is None:
        dim = embeddings.shape[1]
        index = faiss.IndexHNSWFlat(dim, 32) if USE_HNSW else faiss.IndexFlatIP(dim)
        if USE_HNSW:
            # Embeddings are L2-normalized, so HNSW's default L2 metric ranks like cosine.
            index.hnsw.efConstruction = 200
            index.hnsw.efSearch = 64
    index.add(embeddings)
    documents.extend(texts)
    metadata.extend(meta)

    # Rebuild the BM25 index over the full corpus.
[doc.split(" ") for doc in documents] bm25 = BM25Okapi(tokenized_corpus) save_db() return len(documents) def search_knowledge(query, top_k=8): if index is None: return [] # 1. Vector Search qvec = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True) scores, indices = index.search(qvec, top_k) vector_results = {} for i, (idx, score) in enumerate(zip(indices[0], scores[0])): if idx == -1: continue vector_results[idx] = i # Store rank (0-based) # 2. Keyword Search (BM25) bm25_results = {} if bm25: tokenized_query = query.split(" ") bm25_scores = bm25.get_scores(tokenized_query) # Get top_k indices top_n = sorted(range(len(bm25_scores)), key=lambda i: bm25_scores[i], reverse=True)[:top_k] for i, idx in enumerate(top_n): bm25_results[idx] = i # Store rank # 3. Reciprocal Rank Fusion (RRF) # score = 1 / (k + rank) k = 60 candidates_idx = set(vector_results.keys()) | set(bm25_results.keys()) merged_candidates = [] for idx in candidates_idx: v_rank = vector_results.get(idx, float('inf')) b_rank = bm25_results.get(idx, float('inf')) rrf_score = (1 / (k + v_rank)) + (1 / (k + b_rank)) merged_candidates.append({ "text": documents[idx], "metadata": metadata[idx], "score": rrf_score, # This is RRF score, not cosine/BM25 score "vector_rank": v_rank if v_rank != float('inf') else None, "bm25_rank": b_rank if b_rank != float('inf') else None }) # Sort by RRF score merged_candidates.sort(key=lambda x: x["score"], reverse=True) # 4. Rerank Top Candidates candidates = merged_candidates[:10] # Take top 10 for reranking if USE_RERANKER and candidates: pairs = [(query, c["text"]) for c in candidates] rerank_scores = reranker.predict(pairs) for c, rs in zip(candidates, rerank_scores): c["rerank"] = float(rs) candidates.sort(key=lambda x: x["rerank"], reverse=True) return candidates[:5] def get_all_chunks(limit=80): return [{"text": t, "metadata": m} for t, m in zip(documents[:limit], metadata[:limit])]