Spaces:

ZacBl
/

Note_Retriever

Sleeping

App Files Files Community

ZacBl commited on May 19, 2025

Commit

35bda59

verified ·

1 Parent(s): ca35b85

Upload 9 files

Browse files

Files changed (9) hide show

README.md +3 -10
app.py +51 -0
doc_preprocessing.py +200 -0
dockerfile +34 -0
imb.sh +4 -0
llm_interaction.py +11 -0
pyproject.toml +18 -0
uv.lock +0 -0
vector_DB.py +217 -0

README.md CHANGED Viewed

@@ -1,11 +1,4 @@
----
-title: Note Retriever
-emoji: 💻
-colorFrom: gray
-colorTo: blue
-sdk: docker
-pinned: false
-short_description: This project aims to help long notes writers to locate previ
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+## This project aims to help writers of long notes locate previously written scripts that are buried in large volumes of text
+docker build --no-cache -t notes_retriever .
+docker run -d -p 127.0.0.1:8501:8501 notes_retriever

app.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import streamlit as st
+import numpy as np
+from doc_preprocessing import process_files, get_embeddings
+from vector_DB import VectorDatabase  # Import the class
+from llm_interaction import get_answer
+# Module-level FAISS-backed vector store used by main() and process_query().
+vector_database = VectorDatabase()
+# Metadata of the most recently processed chunks; main() rebinds this global.
+chunks_metadata = []
+def main():
+    st.title("Document Query App")
+    uploaded_files = st.file_uploader(
+        "Upload PDF or Word files", accept_multiple_files=True, type=["pdf", "docx"]
+    )
+    query = st.text_input("Enter your query:")
+    if uploaded_files:
+        global chunks_metadata
+        all_chunks, all_embeddings, chunks_metadata = process_files(uploaded_files)
+        vector_database.add_data(all_embeddings, all_chunks, chunks_metadata) # use the method
+        st.session_state.files_processed = True
+    if query:
+        results = process_query(query)
+        display_results(results)
+def process_query(query):
+    if vector_database.is_empty(): #Use the method
+        return "Please upload files first."
+    # query_embedding = get_embeddings([query])[0]
+    # results = vector_database.query(query_embedding, k=3) # use the method
+    query_embedding = get_embeddings([query])[0]  # Get the embedding for the query
+    results = vector_database.query(query_embedding, k=3)  # Get the top 2 results
+    return results
+def display_results(results):
+    for result in results:
+        st.subheader("Answer")
+        st.subheader("Source")
+        st.write(f"File: {result['file_name']}, Chunk: {result['chunk_index']}")
+        st.subheader("Citations depuis le document :")
+        st.write(result["chunk_text"])
+if __name__ == "__main__":
+    main()

doc_preprocessing.py ADDED Viewed

	@@ -0,0 +1,200 @@

+# from pypdf import PdfReader
+# import docx
+# from transformers.pipelines import pipeline
+# import streamlit as st
+# def extract_text(file):
+#     text = ""
+#     if file.name.endswith(".pdf"):
+#         try:
+#             reader = PdfReader(file)
+#             for page in reader.pages:
+#                 text += page.extract_text() + "\n"
+#         except Exception as e:
+#             st.error(f"Error reading PDF {file.name}: {e}")
+#             return ""
+#     elif file.name.endswith(".docx"):
+#         try:
+#             document = docx.Document(file)
+#             for paragraph in document.paragraphs:
+#                 text += paragraph.text + "\n"
+#         except Exception as e:
+#             st.error(f"Error reading DOCX {file.name}: {e}")
+#             return ""
+#     return text
+# def chunk_text(text, chunk_size=500, overlap=50):
+#     chunks = []
+#     start = 0
+#     while start < len(text):
+#         end = start + chunk_size
+#         chunk = text[start:end]
+#         chunks.append(chunk)
+#         start = end - overlap
+#     return chunks
+# def get_embeddings(texts):
+#     try:
+#         embedding_model = pipeline(
+#             'document-question-answering',
+#             "sentence-transformers/all-MiniLM-L6-v2"
+#         )  # Example model
+#         embeddings = embedding_model(texts)
+#         return embeddings
+#     except Exception as e:
+#         st.error(f"Error generating embeddings: {e}")
+#         return []
+# def process_files(files):
+#     all_chunks = []
+#     all_embeddings = []
+#     chunks_metadata = []
+#     for file in files:
+#         text = extract_text(file)
+#         if not text:  # Skip files that failed to process
+#             continue
+#         chunks = chunk_text(text)
+#         embeddings = get_embeddings(chunks)
+#         if not embeddings: # Skip files that failed to embed
+#             continue
+#         all_chunks.extend(chunks)
+#         all_embeddings.extend(embeddings)
+#         for i, chunk in enumerate(chunks):
+#             chunks_metadata.append({"file_name": file.name, "chunk_index": i})
+#     print(f"Processed {len(files)} files, {len(all_chunks)} chunks generated.")
+#     return all_chunks, all_embeddings, chunks_metadata
+import pypdf
+from docx import Document
+from transformers.pipelines import pipeline
+from sentence_transformers import SentenceTransformer
+import streamlit as st
+import numpy as np
+import os
+def extract_text(file):
+    text = ""
+    # Check if the input is a file path (string) or a file-like object
+    if isinstance(file, str):
+        file_name = os.path.basename(file)
+        try:
+            with open(file, 'rb') as f: # Open in binary mode
+                if file_name.endswith(".pdf"):
+                    print('Processing pdf file.................\n')
+                    reader = pypdf.PdfReader(f)
+                    for page in reader.pages:
+                        text += page.extract_text() + "\\n"
+                elif file_name.endswith(".docx"):
+                    document = Document(f)
+                    print('Processing DOCX file.................\n')
+                    for paragraph in document.paragraphs:
+                        if paragraph.text.strip():  # Check if the paragraph is not empty
+                            text += paragraph.text + "\\n"
+        except FileNotFoundError:
+            st.error(f"Error: File not found at {file}")
+            return ""
+        except Exception as e:
+            st.error(f"Error reading {file_name}: {e}")
+            return ""
+    else: # Assume it's a file-like object (e.g., from Streamlit file_uploader)
+        file_name = file.name
+        try:
+            if file_name.endswith(".pdf"):
+                reader = pypdf.PdfReader(file)
+                for page in reader.pages:
+                    text += page.extract_text() + "\\n"
+            elif file_name.endswith(".docx"):
+                document = Document(file)
+                for paragraph in document.paragraphs:
+                    text += paragraph.text + "\\n"
+        except Exception as e:
+            st.error(f"Error reading {file_name}: {e}")
+            return ""
+    return text
+def chunk_text(text, chunk_size=1500, overlap=150):
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = start + chunk_size
+        chunk = text[start:end]
+        chunks.append(chunk)
+        start = end - overlap
+    return chunks
+def get_embeddings(texts)-> np.ndarray:
+    try:
+        # embedding_model = pipeline(
+        #     "sentence-transformers/all-MiniLM-L6-v2"
+        # )  # Example model
+        # embeddings = embedding_model(texts)
+        model = SentenceTransformer('dangvantuan/sentence-camembert-large')
+        embeddings = model.encode(texts)
+        print(f"Generated {len(embeddings)} embeddings.")
+        return embeddings
+    except Exception as e:
+        st.error(f"Error generating embeddings: {e}")
+        return []
+def process_files(files):
+    all_chunks = []
+    all_embeddings = []
+    chunks_metadata = []
+    for file in files:
+        print(f"Processing file: {file.name if hasattr(file, 'name') else os.path.basename(file)}")
+        text = extract_text(file)
+        if not text:  # Skip files that failed to process
+            print(f"Skipping file {file.name if hasattr(file, 'name') else os.path.basename(file)} due to extraction error.")
+            continue
+        print(f"Chunking text...{file.name if hasattr(file, 'name') else os.path.basename(file)}\n")
+        chunks = chunk_text(text)
+        embeddings = get_embeddings(chunks)
+        if not embeddings.any(): # Skip files that failed to embed
+            continue
+        all_chunks.extend(chunks)
+        all_embeddings.extend(embeddings)
+        for i, chunk in enumerate(chunks):
+            chunks_metadata.append({"file_name": file.name if hasattr(file, 'name') else os.path.basename(file), "chunk_index": i})
+    return all_chunks, all_embeddings, chunks_metadata
+if __name__ == "__main__":
+    # Example usage
+    dummy_files = ['/Users/zac/Downloads/Janna/verbatimprocs/FZ- revenante - sept24.docx']
+    all_chunks, all_embeddings, chunks_metadata = process_files(dummy_files)
+    print("Chunks ex:")
+    print(f"Chunk 0: {all_chunks[0]}")
+    print("\nEmbeddings:")
+    print(f"Embedding 0: {all_embeddings[0][:10]}... (shape: {all_embeddings[0].shape})")  # Print only the first 5 elements for brevity
+    print("\nMetadata:")
+    for i, metadata in enumerate(chunks_metadata[0:5]):
+        print(f"Metadata {i}: {metadata}")
+"""
+Key improvements and explanations:
+    Clear Function Definitions: Each function has a specific purpose with comprehensive docstrings.
+    Error Handling: The extract_text and get_embeddings functions include try...except blocks to handle potential errors during file processing and embedding generation. Errors are displayed using st.error.
+    File Type Handling: The extract_text function correctly handles both .pdf and .docx files.
+    Chunking Strategy: The chunk_text function splits the text into smaller, overlapping chunks, which is a common strategy for RAG.
+    Embedding Generation: The get_embeddings function uses a SentenceTransformer model to generate embeddings. You can easily swap out the model if needed.
+    Metadata: The process_files function now generates a list of metadata dictionaries, containing the file name and chunk index for each chunk. This is crucial for providing source attribution when answering queries.
+    Testing: The if __name__ == "__main__": block provides example usage and testing of the functions. This is good practice for ensuring your code works as expected. It runs the pipeline on a hard-coded local sample file.
+    Efficiency: The code avoids unnecessary computations and handles files and text efficiently.
+"""

dockerfile ADDED Viewed

	@@ -0,0 +1,34 @@

+# Use an official Python runtime as a parent image.
+# Debian "buster" is end-of-life, so build on the maintained bookworm variant.
+FROM python:3.11-slim-bookworm
+# Set the working directory to /app
+WORKDIR /app
+# Copy the build context (sources, pyproject.toml and uv.lock) into /app.
+# This single COPY already includes every file the app needs.
+COPY . /app/
+# Install uv
+RUN pip install --no-cache-dir uv
+# Create a virtual environment with uv
+RUN uv venv .venv
+# Put the virtual environment first on PATH so later commands (and CMD) use it.
+ENV PATH="/app/.venv/bin:$PATH"
+# Install project dependencies exactly as pinned in uv.lock
+RUN uv sync --locked
+# Make port 8501 available to the world outside this container
+EXPOSE 8501
+# curl is not installed in the slim base image, so probe the Streamlit health
+# endpoint with the Python standard library instead.
+HEALTHCHECK CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8501/_stcore/health', timeout=5)" || exit 1
+# Run app.py when the container launches
+CMD ["streamlit", "run", "app.py","--server.port=8501", "--server.address=0.0.0.0"]

imb.sh ADDED Viewed

	@@ -0,0 +1,4 @@

+#!/bin/bash
+docker stop $(docker ps -q --filter ancestor=notes_retriever)
+docker build --no-cache -t notes_retriever .
+docker run -d -p 127.0.0.1:8501:8501 notes_retriever

llm_interaction.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from transformers.pipelines import pipeline
+import streamlit as st
+def get_answer(query, context):
+    try:
+        qa_model = pipeline("document-question-answering", model="distilbert-base-cased-distilled-squad")
+        result = qa_model(question=query, context=context)
+        return result["answer"]
+    except Exception as e:
+        st.error(f"Error generating answer: {e}")
+        return "Sorry, I could not process your query."

pyproject.toml ADDED Viewed

	@@ -0,0 +1,18 @@

+# Project metadata and runtime dependencies for the Streamlit app.
+# The dockerfile installs with `uv sync --locked`, so uv.lock must be
+# regenerated whenever this dependency list changes.
+[project]
+name = "notes-retriever"
+version = "0.1.0"
+description = "This project aims to help long notes writers to locate previous scripts written and drown in massive texts"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    # NOTE(review): "docx" appears to be a separate distribution from
+    # "python-docx" that provides the same `docx` import name - confirm which
+    # one is intended and whether both can be installed together.
+    "docx>=0.2.4",
+    "faiss-cpu>=1.11.0",
+    # NOTE(review): none of the Python files in this commit import pdfreader or
+    # processfiles - verify they are needed.
+    "pdfreader>=0.1.15",
+    "processfiles>=0.1.4",
+    "pypdf>=5.5.0",
+    "python-docx>=1.1.2",
+    "sentence-transformers>=4.1.0",
+    "streamlit>=1.45.1",
+    "torch==2.2.0",
+    "transformers>=4.51.3",
+]

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

vector_DB.py ADDED Viewed

	@@ -0,0 +1,217 @@

+# import faiss
+# import numpy as np
+# import streamlit as st
+# class VectorDatabase:
+#     def __init__(self):
+#         self.index = None
+#         self.chunks = []
+#         self.chunks_metadata = []
+#     def add_data(self, embeddings, chunks, chunks_metadata):
+#         if not embeddings:
+#             st.error("No embeddings to add to the database.")
+#             return
+#         dimension = len(embeddings[0])
+#         self.index = faiss.IndexFlatL2(dimension)
+#         self.index.add(np.array(embeddings), x=np.float32)
+#         self.chunks = chunks
+#         self.chunks_metadata = chunks_metadata
+#     def query(self, query_embedding, k=3):
+#         if self.index is None:
+#             st.error("Vector database is empty. Please upload files and process them first.")
+#             return []
+#         _, indices = self.index.search(np.array([query_embedding]), k=k)
+#         results = []
+#         for i in indices[0]:
+#             chunk_text = self.chunks[i]
+#             metadata = self.chunks_metadata[i]
+#             answer = get_answer(query, chunk_text)  # Corrected call
+#             results.append({
+#                 "answer": answer,
+#                 "chunk_text": chunk_text,
+#                 "file_name": metadata["file_name"],
+#                 "chunk_index": metadata["chunk_index"],
+#             })
+#         return results
+#     def is_empty(self):
+#         return self.index is None
+# from llm_interaction import get_answer
+import faiss
+import numpy as np
+import streamlit as st
+from typing import List, Dict, Optional
+from doc_preprocessing import process_files
+class VectorDatabase:
+    """
+    A class to manage a vector database using FAISS for efficient similarity search.
+    """
+    def __init__(self, dimension: int = 0):
+        """
+        Initializes the VectorDatabase.
+        Args:
+            dimension (int, optional): The dimension of the embeddings. If None, the
+                index is not initialized until data is added. Defaults to None.
+        """
+        self.dimension = dimension
+        self.index: Optional[faiss.Index] = None
+        self.chunks: List[str] = []
+        self.chunks_metadata: List[Dict] = []
+    def add_data(self, embeddings: List[np.ndarray], chunks: List[str], chunks_metadata: List[Dict]):
+        """
+        Adds embeddings, text chunks, and metadata to the vector database.
+        Args:
+            embeddings (List[List[float]]): A list of embeddings (each a list or numpy array).
+            chunks (List[str]): A list of corresponding text chunks.
+            chunks_metadata (List[Dict]): A list of metadata dictionaries, one for each chunk.
+        """
+        if not embeddings:
+            st.error("No embeddings to add to the database.")
+            return
+        # Ensure embeddings are numpy arrays
+        embeddings = [np.array(emb) for emb in embeddings]
+        if self.dimension == 0:
+            self.dimension = embeddings[0].shape[0]
+            self.index = faiss.IndexFlatL2(self.dimension)  # Use L2 distance
+        elif self.dimension != embeddings[0].shape[0]:
+            st.error(f"Embedding dimension ({embeddings[0].shape[0]}) does not match database dimension ({self.dimension}).")
+            return
+        # Convert embeddings to a float32 numpy array for FAISS
+        embeddings_np = np.array(embeddings, dtype=np.float32)
+        if self.index is None:
+            self.index = faiss.IndexFlatL2(self.dimension)
+        self.index.add(embeddings_np)
+        self.chunks = chunks
+        self.chunks_metadata = chunks_metadata
+    def query(self, query_embedding: List[float], k: int = 3) -> List[Dict]:
+        """
+        Queries the vector database for the most similar chunks to a query embedding.
+        Args:
+            query_embedding (List[float]): The embedding of the query.
+            k (int, optional): The number of nearest neighbors to retrieve. Defaults to 3.
+        Returns:
+            List[Dict]: A list of dictionaries, where each dictionary contains:
+                - "chunk_text" (str): The text of the retrieved chunk.
+                - "file_name" (str): The name of the file the chunk came from.
+                - "chunk_index" (int): The index of the chunk in the file.
+        """
+        if self.index is None:
+            st.error("Vector database is empty. Please upload files and process them first.")
+            return []
+        # Ensure query_embedding is a numpy array
+        query_embedding = np.array(query_embedding, dtype=np.float32).reshape(1, -1)  # Reshape for FAISS
+        _, indices = self.index.search(query_embedding, k=k)
+        results = []
+        for i in indices[0]:
+            chunk_text = self.chunks[i]
+            metadata = self.chunks_metadata[i]
+            results.append({
+                "chunk_text": chunk_text,
+                "file_name": metadata["file_name"],
+                "chunk_index": metadata["chunk_index"],
+            })
+        return results
+    def is_empty(self) -> bool:
+        """
+        Checks if the vector database is empty.
+        Returns:
+            bool: True if the database is empty, False otherwise.
+        """
+        return self.index is None
+if __name__ == "__main__":
+    # This part is for testing the VectorDatabase class.
+    #  It will only run if you execute this file directly: python vector_database.py
+    # # Create some dummy data
+    # embeddings = [
+    #     np.array([1.0, 2.0, 3.0]),
+    #     np.array([4.0, 5.0, 6.0]),
+    #     np.array([7.0, 8.0, 9.0]),
+    #     np.array([10.0, 11.0, 12.0]),
+    # ]
+    # chunks = [
+    #     "This is chunk 1 from file A.",
+    #     "This is chunk 2 from file A.",
+    #     "This is chunk 1 from file B.",
+    #     "This is chunk 2 from file B.",
+    # ]
+    # chunks_metadata = [
+    #     {"file_name": "file_a.pdf", "chunk_index": 0},
+    #     {"file_name": "file_a.pdf", "chunk_index": 1},
+    #     {"file_name": "file_b.docx", "chunk_index": 0},
+    #     {"file_name": "file_b.docx", "chunk_index": 1},
+    # ]
+    dummy_files = ['/Users/zac/Downloads/Janna/verbatimprocs/FZ- revenante - sept24.docx']
+    chunks, embeddings, chunks_metadata = process_files(dummy_files)
+    # 1. Initialize the VectorDatabase
+    vector_db = VectorDatabase(dimension=embeddings[0].shape[0]) # Initialize with dimension
+    # 2. Add data to the VectorDatabase
+    vector_db.add_data(embeddings, chunks, chunks_metadata)
+    print("Data added to VectorDatabase.")
+    # 3. Perform a query
+    query_embedding = np.random.rand(embeddings[0].shape[0]).astype(np.float32)  # Random query embedding
+    results = vector_db.query(query_embedding, k=2)  # Get the top 2 results
+    print("\nQuery results:")
+    for result in results:
+        print(f"Chunk: {result['chunk_text']}")
+        print(f"  File: {result['file_name']}")
+        print(f"  Index: {result['chunk_index']}")
+    # 4. Check if the database is empty
+    print(f"\nIs the database empty? {vector_db.is_empty()}") # Check is_empty method
+    # 5.  Initialize without dimension and then add data
+    vector_db2 = VectorDatabase()
+    vector_db2.add_data(embeddings, chunks, chunks_metadata)
+    print("\nData added to VectorDatabase2 (without initial dimension).")
+    query_embedding_2 = np.random.rand(embeddings[0].shape[0]).astype(np.float32)  # Random query embedding
+    results_2 = vector_db2.query(query_embedding_2, k=1)
+    print("\nQuery results from VectorDatabase2:")
+    for result in results_2:
+        print(f"Chunk: {result['chunk_text']}")
+        print(f"  File: {result['file_name']}")
+        print(f"  Index: {result['chunk_index']}")
+"""
+Key improvements and explanations:
+    Class Structure: The VectorDatabase class encapsulates the FAISS index, chunks, and metadata, providing a clean and organized way to manage the vector database.
+    Initialization: The __init__ method now takes an optional dimension argument. If not provided during initialization, the dimension is inferred when the first data is added. This provides more flexibility.
+    Data Handling: The add_data method takes lists of embeddings, chunks, and metadata, and stores them in the object. It also converts the embeddings to a float32 numpy array, which is the format FAISS expects, and checks for dimension consistency.
+    Querying: The query method performs a similarity search using FAISS and returns a list of dictionaries containing the relevant information. It also handles the case where the database is empty.
+    Error Handling: The add_data and query methods include error handling for invalid input or an empty database.
+    Clarity: The code is well-commented and easy to understand.
+    Testing: The if __name__ == "__main__": block provides a comprehensive test of the VectorDatabase class, demonstrating how to add data, perform queries, and check if the database is empty. I've added a test for initializing the database without a dimension.
+"""