Spaces:
Sleeping
Sleeping
Upload 5 files
Browse files- app.py +14 -2
- doc_preprocessing.py +4 -39
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import streamlit as st
|
|
|
|
| 2 |
import numpy as np
|
| 3 |
from doc_preprocessing import process_files, get_embeddings
|
| 4 |
from vector_DB import VectorDatabase # Import the class
|
|
@@ -39,13 +40,24 @@ def process_query(query):
|
|
| 39 |
|
| 40 |
return results
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
def display_results(results):
|
| 43 |
for result in results:
|
| 44 |
st.subheader("Answer")
|
| 45 |
st.subheader("Source")
|
| 46 |
st.write(f"File: {result['file_name']}, Chunk: {result['chunk_index']}")
|
| 47 |
st.subheader("Citations depuis le document :")
|
| 48 |
-
st.
|
| 49 |
-
|
| 50 |
if __name__ == "__main__":
|
| 51 |
main()
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import re
|
| 3 |
import numpy as np
|
| 4 |
from doc_preprocessing import process_files, get_embeddings
|
| 5 |
from vector_DB import VectorDatabase # Import the class
|
|
|
|
| 40 |
|
| 41 |
return results
|
| 42 |
|
| 43 |
+
def normalize_line_breaks(text):
    r"""Turn every line break in *text* into a blank-line separated break.

    Windows line endings (``\r\n``) are first converted to ``\n``. Every run
    of one or more consecutive ``\n`` characters is then replaced by a single
    ``" \n\n"`` (one space followed by two newlines), so a lone line break and
    a multi-newline paragraph break produce the same output.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string. An empty string is returned unchanged.
    """
    # Normalize Windows line endings so only "\n" has to be handled below.
    text = text.replace('\r\n', '\n')
    # Single line breaks and longer newline runs map to the same output, so one
    # substitution covers both. Doing it in a single pass avoids a temporary
    # "<PARA_BREAK>" placeholder, which would also rewrite any input that
    # already contains that literal text.
    return re.sub(r'\n+', ' \n\n', text)
|
| 53 |
+
|
| 54 |
def display_results(results):
    """Render each search result on the Streamlit page.

    For every entry in *results* this writes an "Answer" subheader, a "Source"
    subheader followed by the file name and chunk index, and a citations
    subheader followed by the chunk text rendered as Markdown. The same
    normalized chunk text is also printed to standard output.

    Args:
        results: An iterable of mappings, each providing the keys
            ``"file_name"``, ``"chunk_index"`` and ``"chunk_text"``.
    """
    for result in results:
        st.subheader("Answer")
        st.subheader("Source")
        st.write(f"File: {result['file_name']}, Chunk: {result['chunk_index']}")
        st.subheader("Citations depuis le document :")
        # Normalize once and reuse the value for both the page and the console
        # echo instead of running the same conversion twice per result.
        chunk_markdown = normalize_line_breaks(result["chunk_text"])
        st.markdown(chunk_markdown)
        print(chunk_markdown)
|
| 62 |
if __name__ == "__main__":
|
| 63 |
main()
|
doc_preprocessing.py
CHANGED
|
@@ -113,7 +113,7 @@ def extract_text(file):
|
|
| 113 |
return ""
|
| 114 |
return text
|
| 115 |
|
| 116 |
-
def chunk_text(text, chunk_size=
|
| 117 |
chunks = []
|
| 118 |
start = 0
|
| 119 |
while start < len(text):
|
|
@@ -131,7 +131,7 @@ def get_embeddings(texts)-> np.ndarray:
|
|
| 131 |
# embeddings = embedding_model(texts)
|
| 132 |
|
| 133 |
|
| 134 |
-
model = SentenceTransformer(
|
| 135 |
embeddings = model.encode(texts)
|
| 136 |
|
| 137 |
print(f"Generated {len(embeddings)} embeddings.")
|
|
@@ -154,8 +154,8 @@ def process_files(files):
|
|
| 154 |
print(f"Chunking text...{file.name if hasattr(file, 'name') else os.path.basename(file)}\n")
|
| 155 |
chunks = chunk_text(text)
|
| 156 |
embeddings = get_embeddings(chunks)
|
| 157 |
-
if not embeddings
|
| 158 |
-
|
| 159 |
|
| 160 |
all_chunks.extend(chunks)
|
| 161 |
all_embeddings.extend(embeddings)
|
|
@@ -163,38 +163,3 @@ def process_files(files):
|
|
| 163 |
chunks_metadata.append({"file_name": file.name if hasattr(file, 'name') else os.path.basename(file), "chunk_index": i})
|
| 164 |
return all_chunks, all_embeddings, chunks_metadata
|
| 165 |
|
| 166 |
-
|
| 167 |
-
if __name__ == "__main__":
|
| 168 |
-
# Example usage
|
| 169 |
-
dummy_files = ['/Users/zac/Downloads/Janna/verbatimprocs/FZ- revenante - sept24.docx']
|
| 170 |
-
|
| 171 |
-
all_chunks, all_embeddings, chunks_metadata = process_files(dummy_files)
|
| 172 |
-
|
| 173 |
-
print("Chunks ex:")
|
| 174 |
-
print(f"Chunk 0: {all_chunks[0]}")
|
| 175 |
-
|
| 176 |
-
print("\nEmbeddings:")
|
| 177 |
-
print(f"Embedding 0: {all_embeddings[0][:10]}... (shape: {all_embeddings[0].shape})") # Print only the first 10 elements for brevity
|
| 178 |
-
|
| 179 |
-
print("\nMetadata:")
|
| 180 |
-
for i, metadata in enumerate(chunks_metadata[0:5]):
|
| 181 |
-
print(f"Metadata {i}: {metadata}")
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
"""
|
| 187 |
-
|
| 188 |
-
Key improvements and explanations:
|
| 189 |
-
|
| 190 |
-
Clear Function Definitions: Each function has a specific purpose with comprehensive docstrings.
|
| 191 |
-
Error Handling: The extract_text and get_embeddings functions include try...except blocks to handle potential errors during file processing and embedding generation. Errors are displayed using st.error.
|
| 192 |
-
File Type Handling: The extract_text function correctly handles both .pdf and .docx files.
|
| 193 |
-
Chunking Strategy: The chunk_text function splits the text into smaller, overlapping chunks, which is a common strategy for RAG.
|
| 194 |
-
Embedding Generation: The get_embeddings function loads a SentenceTransformer model and calls model.encode to generate embeddings. You can swap out the model by changing the model name.
|
| 195 |
-
Metadata: The process_files function now generates a list of metadata dictionaries, containing the file name and chunk index for each chunk. This is crucial for providing source attribution when answering queries.
|
| 196 |
-
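Testing: The if __name__ == "__main__": block provides example usage of the functions: it runs process_files on a hard-coded example path and prints the first chunk, the first embedding, and the first few metadata entries. It does not create any dummy files, so that path must exist on the machine running it.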
|
| 197 |
-
Efficiency: The code avoids unnecessary computations and handles files and text efficiently.
|
| 198 |
-
|
| 199 |
-
"""
|
| 200 |
-
|
|
|
|
| 113 |
return ""
|
| 114 |
return text
|
| 115 |
|
| 116 |
+
def chunk_text(text, chunk_size=500, overlap=50):
|
| 117 |
chunks = []
|
| 118 |
start = 0
|
| 119 |
while start < len(text):
|
|
|
|
| 131 |
# embeddings = embedding_model(texts)
|
| 132 |
|
| 133 |
|
| 134 |
+
model = SentenceTransformer("sujet-ai/Marsilia-Embeddings-FR-Base")
|
| 135 |
embeddings = model.encode(texts)
|
| 136 |
|
| 137 |
print(f"Generated {len(embeddings)} embeddings.")
|
|
|
|
| 154 |
print(f"Chunking text...{file.name if hasattr(file, 'name') else os.path.basename(file)}\n")
|
| 155 |
chunks = chunk_text(text)
|
| 156 |
embeddings = get_embeddings(chunks)
|
| 157 |
+
# if not embeddings: # Skip files that failed to embed
|
| 158 |
+
# continue
|
| 159 |
|
| 160 |
all_chunks.extend(chunks)
|
| 161 |
all_embeddings.extend(embeddings)
|
|
|
|
| 163 |
chunks_metadata.append({"file_name": file.name if hasattr(file, 'name') else os.path.basename(file), "chunk_index": i})
|
| 164 |
return all_chunks, all_embeddings, chunks_metadata
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|