Spaces:

ZacBl
/

Note_Retriever

Sleeping

App Files Files Community

ZacBl committed on May 21, 2025

Commit

76894b4

verified ·

1 Parent(s): 580ab28

Upload 3 files

Browse files

Files changed (3) hide show

app.py +10 -9
doc_preprocessing.py +4 -11
vector_DB.py +6 -48

app.py CHANGED Viewed

@@ -37,24 +37,25 @@ def process_query(query):
     # query_embedding = get_embeddings([query])[0]
     # results = vector_database.query(query_embedding, k=3) # use the method
     query_embedding = get_embeddings([query])[0]  # Get the embedding for the query
-    results = vector_database.query(query_embedding, k=3)  # Get the top 2 results
     return results
 def normalize_line_breaks(text):
-    # text = text.replace("\n", "  \n ")
-    # text = text.replace('\n', '  \n ')
     text = text.replace("\\n", "  \n ")
     return text
 def display_results(results):
-       for i, result in enumerate(results):
-        st.subheader(f"Réponse {i} :")
-        st.write(f"Source File: {result['file_name']}, Chunk: {result['chunk_index']}")
-        st.subheader("Citations depuis le document :")
-        st.write(normalize_line_breaks(result["chunk_text"]))
-        st_copy_to_clipboard(normalize_line_breaks(result["chunk_text"]))
 if __name__ == "__main__":

     # query_embedding = get_embeddings([query])[0]
     # results = vector_database.query(query_embedding, k=3) # use the method
     query_embedding = get_embeddings([query])[0]  # Get the embedding for the query
+    results = vector_database.query(query_embedding, k=10)  # Get the top 2 results
     return results
 def normalize_line_breaks(text):
     """Turn escaped newline sequences into Markdown hard line breaks.
     Every literal two-character sequence backslash + ``n`` in *text* is
     replaced by two spaces, a real newline and one space. Real newline
     characters already present in *text* are left untouched.
     Args:
         text: The string to convert.
     Returns:
         A new string with each escaped ``\\n`` sequence substituted.
     """
     # Splitting on the escaped sequence and re-joining with the Markdown
     # break is equivalent to a global, non-overlapping replace.
     pieces = text.split("\\n")
     return "  \n ".join(pieces)
 def display_results(results, max_distance=0.5):
     """Render the retrieved chunks whose score is below a cutoff.
     Each kept result gets a numbered heading (starting at 1), a line with
     its source file, chunk index and a percentage computed as
     ``(1 - score) * 100``, the chunk text itself, and a copy-to-clipboard
     widget holding the same text.
     Args:
         results: Iterable of dicts with the keys ``score``, ``file_name``,
             ``chunk_index`` and ``chunk_text``.
         max_distance: Results are shown only when ``score < max_distance``.
             Defaults to 0.5, the previously hard-coded threshold.
             NOTE(review): the score appears to be a distance from the
             vector index search (smaller = closer) -- confirm.
     Returns:
         None. The function only writes to the Streamlit page.
     """
     shown = 0
     for result in results:
         score = result['score']
         # Same comparison as before (`score < cutoff`) so that a NaN
         # score is still skipped.
         if not score < max_distance:
             continue
         # Count only the results that are actually displayed, and number
         # them from 1 (the heading used to start at "Réponse 2").
         shown += 1
         # Normalize once and reuse for both the display and the clipboard.
         chunk_text = normalize_line_breaks(result["chunk_text"])
         st.subheader(f"Réponse {shown} :")
         st.write(f"Source File: {result['file_name']}, Chunk: {result['chunk_index']}, Score: {round((1-score)*100,2)}%")
         st.subheader("Citations depuis le document :")
         st.write(chunk_text)
         st_copy_to_clipboard(chunk_text)
 if __name__ == "__main__":

doc_preprocessing.py CHANGED Viewed

@@ -6,8 +6,8 @@ import streamlit as st
 import numpy as np
 import os
-emb_model = "sujet-ai/Marsilia-Embeddings-FR-Base"
 def extract_text(file):
     text = ""
     # Check if the input is a file path (string) or a file-like object
@@ -60,12 +60,6 @@ def chunk_text(text, chunk_size=500, overlap=50):
 def get_embeddings(texts)-> np.ndarray:
     try:
-        # embedding_model = pipeline(
-        #     "sentence-transformers/all-MiniLM-L6-v2"
-        # )  # Example model
-        # embeddings = embedding_model(texts)
         model = SentenceTransformer(emb_model, trust_remote_code=True)
         embeddings = model.encode(texts)
@@ -73,7 +67,7 @@ def get_embeddings(texts)-> np.ndarray:
         return embeddings
     except Exception as e:
         st.error(f"Error generating embeddings: {e}")
-        return []
 def process_files(files):
     all_chunks = []
@@ -96,5 +90,4 @@ def process_files(files):
         all_embeddings.extend(embeddings)
         for i, chunk in enumerate(chunks):
             chunks_metadata.append({"file_name": file.name if hasattr(file, 'name') else os.path.basename(file), "chunk_index": i})
-    return all_chunks, all_embeddings, chunks_metadata

 import numpy as np
 import os
+emb_model = "intfloat/multilingual-e5-large-instruct"
+emb_model2 = "DeepPavlov/distilrubert-small-cased-conversational"
 def extract_text(file):
     text = ""
     # Check if the input is a file path (string) or a file-like object
 def get_embeddings(texts)-> np.ndarray:
     try:
         model = SentenceTransformer(emb_model, trust_remote_code=True)
         embeddings = model.encode(texts)
         return embeddings
     except Exception as e:
         st.error(f"Error generating embeddings: {e}")
+        return np.array([])
 def process_files(files):
     all_chunks = []
         all_embeddings.extend(embeddings)
         for i, chunk in enumerate(chunks):
             chunks_metadata.append({"file_name": file.name if hasattr(file, 'name') else os.path.basename(file), "chunk_index": i})
+    return all_chunks, all_embeddings, chunks_metadata

vector_DB.py CHANGED Viewed

@@ -1,50 +1,7 @@
-# import faiss
-# import numpy as np
-# import streamlit as st
-# class VectorDatabase:
-#     def __init__(self):
-#         self.index = None
-#         self.chunks = []
-#         self.chunks_metadata = []
-#     def add_data(self, embeddings, chunks, chunks_metadata):
-#         if not embeddings:
-#             st.error("No embeddings to add to the database.")
-#             return
-#         dimension = len(embeddings[0])
-#         self.index = faiss.IndexFlatL2(dimension)
-#         self.index.add(np.array(embeddings), x=np.float32)
-#         self.chunks = chunks
-#         self.chunks_metadata = chunks_metadata
-#     def query(self, query_embedding, k=3):
-#         if self.index is None:
-#             st.error("Vector database is empty. Please upload files and process them first.")
-#             return []
-#         _, indices = self.index.search(np.array([query_embedding]), k=k)
-#         results = []
-#         for i in indices[0]:
-#             chunk_text = self.chunks[i]
-#             metadata = self.chunks_metadata[i]
-#             answer = get_answer(query, chunk_text)  # Corrected call
-#             results.append({
-#                 "answer": answer,
-#                 "chunk_text": chunk_text,
-#                 "file_name": metadata["file_name"],
-#                 "chunk_index": metadata["chunk_index"],
-#             })
-#         return results
-#     def is_empty(self):
-#         return self.index is None
-# from llm_interaction import get_answer
 import faiss
 import numpy as np
 import streamlit as st
-from typing import List, Dict, Optional
 from doc_preprocessing import process_files
@@ -96,12 +53,12 @@ class VectorDatabase:
         self.chunks = chunks
         self.chunks_metadata = chunks_metadata
-    def query(self, query_embedding: List[float], k: int = 3) -> List[Dict]:
         """
         Queries the vector database for the most similar chunks to a query embedding.
         Args:
-            query_embedding (List[float]): The embedding of the query.
             k (int, optional): The number of nearest neighbors to retrieve. Defaults to 3.
         Returns:
@@ -117,15 +74,16 @@ class VectorDatabase:
         # Ensure query_embedding is a numpy array
         query_embedding = np.array(query_embedding, dtype=np.float32).reshape(1, -1)  # Reshape for FAISS
-        _, indices = self.index.search(query_embedding, k=k)
         results = []
-        for i in indices[0]:
             chunk_text = self.chunks[i]
             metadata = self.chunks_metadata[i]
             results.append({
                 "chunk_text": chunk_text,
                 "file_name": metadata["file_name"],
                 "chunk_index": metadata["chunk_index"],
             })
         return results

 import faiss
 import numpy as np
 import streamlit as st
+from typing import List, Dict, Optional, Union
 from doc_preprocessing import process_files
         self.chunks = chunks
         self.chunks_metadata = chunks_metadata
+    def query(self, query_embedding: Union[List[float], np.ndarray], k: int = 3) -> List[Dict]:
         """
         Queries the vector database for the most similar chunks to a query embedding.
         Args:
+            query_embedding (List[float] or np.ndarray): The embedding of the query.
             k (int, optional): The number of nearest neighbors to retrieve. Defaults to 3.
         Returns:
         # Ensure query_embedding is a numpy array
         query_embedding = np.array(query_embedding, dtype=np.float32).reshape(1, -1)  # Reshape for FAISS
+        dist, indices = self.index.search(query_embedding, k=k)
         results = []
+        for (i, j) in zip(indices[0], dist[0]):
             chunk_text = self.chunks[i]
             metadata = self.chunks_metadata[i]
             results.append({
                 "chunk_text": chunk_text,
                 "file_name": metadata["file_name"],
                 "chunk_index": metadata["chunk_index"],
+                "score": j
             })
         return results