Spaces:

ZacBl
/

Note_Retriever

Sleeping

App Files Files Community

ZacBl committed on May 20, 2025

Commit

9ff2eb6

verified ·

1 Parent(s): c846056

Upload 5 files

Browse files

Files changed (2) hide show

app.py +14 -2
doc_preprocessing.py +4 -39

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import streamlit as st
 import numpy as np
 from doc_preprocessing import process_files, get_embeddings
 from vector_DB import VectorDatabase  # Import the class
@@ -39,13 +40,24 @@ def process_query(query):
     return results
 def display_results(results):
     for result in results:
         st.subheader("Answer")
         st.subheader("Source")
         st.write(f"File: {result['file_name']}, Chunk: {result['chunk_index']}")
         st.subheader("Citations depuis le document :")
-        st.write(result["chunk_text"])
 if __name__ == "__main__":
     main()

 import streamlit as st
+import re
 import numpy as np
 from doc_preprocessing import process_files, get_embeddings
 from vector_DB import VectorDatabase  # Import the class
     return results
def normalize_line_breaks(text):
    r"""Rewrite every line break in *text* as two spaces plus a blank line.

    Windows line endings (``\r\n``) are first converted to ``\n``. Each run
    of one or more ``\n`` is then replaced by ``'  \n\n'``, so a single line
    break and a multi-newline paragraph break produce the same output.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string. An empty string is returned unchanged.
    """
    # Collapse Windows line endings so only '\n' has to be handled below.
    text = text.replace('\r\n', '\n')
    # One substitution covers single and repeated newlines alike. No
    # placeholder token is needed, so input that happens to contain the
    # literal text '<PARA_BREAK>' is left untouched.
    return re.sub(r'\n+', '  \n\n', text)
 # Render each retrieved result on the Streamlit page.
 # `results` is iterated, and each item is indexed with the keys
 # 'file_name', 'chunk_index' and 'chunk_text'.
 def display_results(results):
     for result in results:
         # NOTE(review): nothing is written under the "Answer" heading in the
         # visible code; confirm whether an answer body is meant to follow.
         st.subheader("Answer")
         st.subheader("Source")
         st.write(f"File: {result['file_name']}, Chunk: {result['chunk_index']}")
         st.subheader("Citations depuis le document :")
         # The chunk text is passed through normalize_line_breaks before being
         # handed to st.markdown.
+        st.markdown(normalize_line_breaks(result["chunk_text"]))
         # NOTE(review): the same text is normalized a second time and echoed to
         # stdout; this looks like leftover debug output, confirm it is wanted.
+        print(normalize_line_breaks(result["chunk_text"]))
 if __name__ == "__main__":
     main()

doc_preprocessing.py CHANGED Viewed

@@ -113,7 +113,7 @@ def extract_text(file):
             return ""
     return text
-def chunk_text(text, chunk_size=1500, overlap=150):
     chunks = []
     start = 0
     while start < len(text):
@@ -131,7 +131,7 @@ def get_embeddings(texts)-> np.ndarray:
         # embeddings = embedding_model(texts)
-        model = SentenceTransformer('dangvantuan/sentence-camembert-large')
         embeddings = model.encode(texts)
         print(f"Generated {len(embeddings)} embeddings.")
@@ -154,8 +154,8 @@ def process_files(files):
         print(f"Chunking text...{file.name if hasattr(file, 'name') else os.path.basename(file)}\n")
         chunks = chunk_text(text)
         embeddings = get_embeddings(chunks)
-        if not embeddings.any(): # Skip files that failed to embed
-            continue
         all_chunks.extend(chunks)
         all_embeddings.extend(embeddings)
@@ -163,38 +163,3 @@ def process_files(files):
             chunks_metadata.append({"file_name": file.name if hasattr(file, 'name') else os.path.basename(file), "chunk_index": i})
     return all_chunks, all_embeddings, chunks_metadata
-if __name__ == "__main__":
-    # Example usage
-    dummy_files = ['/Users/zac/Downloads/Janna/verbatimprocs/FZ- revenante - sept24.docx']
-    all_chunks, all_embeddings, chunks_metadata = process_files(dummy_files)
-    print("Chunks ex:")
-    print(f"Chunk 0: {all_chunks[0]}")
-    print("\nEmbeddings:")
-    print(f"Embedding 0: {all_embeddings[0][:10]}... (shape: {all_embeddings[0].shape})")  # Print only the first 5 elements for brevity
-    print("\nMetadata:")
-    for i, metadata in enumerate(chunks_metadata[0:5]):
-        print(f"Metadata {i}: {metadata}")
-"""
-Key improvements and explanations:
-    Clear Function Definitions: Each function has a specific purpose with comprehensive docstrings.
-    Error Handling: The extract_text and get_embeddings functions include try...except blocks to handle potential errors during file processing and embedding generation. Errors are displayed using st.error.
-    File Type Handling: The extract_text function correctly handles both .pdf and .docx files.
-    Chunking Strategy: The chunk_text function splits the text into smaller, overlapping chunks, which is a common strategy for RAG.
-    Embedding Generation: The get_embeddings function uses the Hugging Face pipeline to generate embeddings. You can easily swap out the model if needed.
-    Metadata: The process_files function now generates a list of metadata dictionaries, containing the file name and chunk index for each chunk. This is crucial for providing source attribution when answering queries.
-    Testing: The if __name__ == "__main__": block provides example usage and testing of the functions. This is good practice for ensuring your code works as expected. I've added dummy file creation for testing.
-    Efficiency: The code avoids unnecessary computations and handles files and text efficiently.
-"""

             return ""
     return text
+def chunk_text(text, chunk_size=500, overlap=50):
     chunks = []
     start = 0
     while start < len(text):
         # embeddings = embedding_model(texts)
+        model = SentenceTransformer("sujet-ai/Marsilia-Embeddings-FR-Base")
         embeddings = model.encode(texts)
         print(f"Generated {len(embeddings)} embeddings.")
         print(f"Chunking text...{file.name if hasattr(file, 'name') else os.path.basename(file)}\n")
         chunks = chunk_text(text)
         embeddings = get_embeddings(chunks)
+        # if not embeddings: # Skip files that failed to embed
+        #     continue
         all_chunks.extend(chunks)
         all_embeddings.extend(embeddings)
             chunks_metadata.append({"file_name": file.name if hasattr(file, 'name') else os.path.basename(file), "chunk_index": i})
     return all_chunks, all_embeddings, chunks_metadata