ZacBl committed on
Commit
9ff2eb6
·
verified ·
1 Parent(s): c846056

Upload 5 files

Browse files
Files changed (2) hide show
  1. app.py +14 -2
  2. doc_preprocessing.py +4 -39
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import streamlit as st
 
2
  import numpy as np
3
  from doc_preprocessing import process_files, get_embeddings
4
  from vector_DB import VectorDatabase # Import the class
@@ -39,13 +40,24 @@ def process_query(query):
39
 
40
  return results
41
 
 
 
 
 
 
 
 
 
 
 
 
42
  def display_results(results):
43
  for result in results:
44
  st.subheader("Answer")
45
  st.subheader("Source")
46
  st.write(f"File: {result['file_name']}, Chunk: {result['chunk_index']}")
47
  st.subheader("Citations depuis le document :")
48
- st.write(result["chunk_text"])
49
-
50
  if __name__ == "__main__":
51
  main()
 
1
  import streamlit as st
2
+ import re
3
  import numpy as np
4
  from doc_preprocessing import process_files, get_embeddings
5
  from vector_DB import VectorDatabase # Import the class
 
40
 
41
  return results
42
 
43
def normalize_line_breaks(text):
    r"""Turn every line break in ``text`` into a Markdown paragraph break.

    Windows line endings (``\r\n``) are first converted to ``\n``. Each run
    of one or more consecutive ``\n`` characters is then replaced by a single
    ``' \n\n'`` (space + blank line), so both soft line breaks and existing
    paragraph breaks render as separate paragraphs.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    unix_text = text.replace('\r\n', '\n')
    # Single newlines and multi-newline runs both map to the same output, so
    # one substitution covers both cases. This avoids a placeholder token,
    # which would corrupt any input that already contains that token.
    return re.sub(r'\n+', ' \n\n', unix_text)
53
+
54
  def display_results(results):
55
  for result in results:
56
  st.subheader("Answer")
57
  st.subheader("Source")
58
  st.write(f"File: {result['file_name']}, Chunk: {result['chunk_index']}")
59
  st.subheader("Citations depuis le document :")
60
+ st.markdown(normalize_line_breaks(result["chunk_text"]))
61
+ print(normalize_line_breaks(result["chunk_text"]))
62
  if __name__ == "__main__":
63
  main()
doc_preprocessing.py CHANGED
@@ -113,7 +113,7 @@ def extract_text(file):
113
  return ""
114
  return text
115
 
116
- def chunk_text(text, chunk_size=1500, overlap=150):
117
  chunks = []
118
  start = 0
119
  while start < len(text):
@@ -131,7 +131,7 @@ def get_embeddings(texts)-> np.ndarray:
131
  # embeddings = embedding_model(texts)
132
 
133
 
134
- model = SentenceTransformer('dangvantuan/sentence-camembert-large')
135
  embeddings = model.encode(texts)
136
 
137
  print(f"Generated {len(embeddings)} embeddings.")
@@ -154,8 +154,8 @@ def process_files(files):
154
  print(f"Chunking text...{file.name if hasattr(file, 'name') else os.path.basename(file)}\n")
155
  chunks = chunk_text(text)
156
  embeddings = get_embeddings(chunks)
157
- if not embeddings.any(): # Skip files that failed to embed
158
- continue
159
 
160
  all_chunks.extend(chunks)
161
  all_embeddings.extend(embeddings)
@@ -163,38 +163,3 @@ def process_files(files):
163
  chunks_metadata.append({"file_name": file.name if hasattr(file, 'name') else os.path.basename(file), "chunk_index": i})
164
  return all_chunks, all_embeddings, chunks_metadata
165
 
166
-
167
- if __name__ == "__main__":
168
- # Example usage
169
- dummy_files = ['/Users/zac/Downloads/Janna/verbatimprocs/FZ- revenante - sept24.docx']
170
-
171
- all_chunks, all_embeddings, chunks_metadata = process_files(dummy_files)
172
-
173
- print("Chunks ex:")
174
- print(f"Chunk 0: {all_chunks[0]}")
175
-
176
- print("\nEmbeddings:")
177
- print(f"Embedding 0: {all_embeddings[0][:10]}... (shape: {all_embeddings[0].shape})") # Print only the first 5 elements for brevity
178
-
179
- print("\nMetadata:")
180
- for i, metadata in enumerate(chunks_metadata[0:5]):
181
- print(f"Metadata {i}: {metadata}")
182
-
183
-
184
-
185
-
186
- """
187
-
188
- Key improvements and explanations:
189
-
190
- Clear Function Definitions: Each function has a specific purpose with comprehensive docstrings.
191
- Error Handling: The extract_text and get_embeddings functions include try...except blocks to handle potential errors during file processing and embedding generation. Errors are displayed using st.error.
192
- File Type Handling: The extract_text function correctly handles both .pdf and .docx files.
193
- Chunking Strategy: The chunk_text function splits the text into smaller, overlapping chunks, which is a common strategy for RAG.
194
- Embedding Generation: The get_embeddings function uses the Hugging Face pipeline to generate embeddings. You can easily swap out the model if needed.
195
- Metadata: The process_files function now generates a list of metadata dictionaries, containing the file name and chunk index for each chunk. This is crucial for providing source attribution when answering queries.
196
- Testing: The if __name__ == "__main__": block provides example usage and testing of the functions. This is good practice for ensuring your code works as expected. I've added dummy file creation for testing.
197
- Efficiency: The code avoids unnecessary computations and handles files and text efficiently.
198
-
199
- """
200
-
 
113
  return ""
114
  return text
115
 
116
+ def chunk_text(text, chunk_size=500, overlap=50):
117
  chunks = []
118
  start = 0
119
  while start < len(text):
 
131
  # embeddings = embedding_model(texts)
132
 
133
 
134
+ model = SentenceTransformer("sujet-ai/Marsilia-Embeddings-FR-Base")
135
  embeddings = model.encode(texts)
136
 
137
  print(f"Generated {len(embeddings)} embeddings.")
 
154
  print(f"Chunking text...{file.name if hasattr(file, 'name') else os.path.basename(file)}\n")
155
  chunks = chunk_text(text)
156
  embeddings = get_embeddings(chunks)
157
+ # if not embeddings: # Skip files that failed to embed
158
+ # continue
159
 
160
  all_chunks.extend(chunks)
161
  all_embeddings.extend(embeddings)
 
163
  chunks_metadata.append({"file_name": file.name if hasattr(file, 'name') else os.path.basename(file), "chunk_index": i})
164
  return all_chunks, all_embeddings, chunks_metadata
165