ZacBl committed on
Commit
76894b4
·
verified ·
1 Parent(s): 580ab28

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +10 -9
  2. doc_preprocessing.py +4 -11
  3. vector_DB.py +6 -48
app.py CHANGED
@@ -37,24 +37,25 @@ def process_query(query):
37
  # query_embedding = get_embeddings([query])[0]
38
  # results = vector_database.query(query_embedding, k=3) # use the method
39
  query_embedding = get_embeddings([query])[0] # Get the embedding for the query
40
- results = vector_database.query(query_embedding, k=3) # Get the top 2 results
41
 
42
  return results
43
 
44
  def normalize_line_breaks(text):
45
- # text = text.replace("\n", " \n ")
46
- # text = text.replace('\n', ' \n ')
47
  text = text.replace("\\n", " \n ")
48
 
49
  return text
50
 
51
  def display_results(results):
52
- for i, result in enumerate(results):
53
- st.subheader(f"Réponse {i} :")
54
- st.write(f"Source File: {result['file_name']}, Chunk: {result['chunk_index']}")
55
- st.subheader("Citations depuis le document :")
56
- st.write(normalize_line_breaks(result["chunk_text"]))
57
- st_copy_to_clipboard(normalize_line_breaks(result["chunk_text"]))
 
 
 
58
 
59
 
60
  if __name__ == "__main__":
 
37
  # query_embedding = get_embeddings([query])[0]
38
  # results = vector_database.query(query_embedding, k=3) # use the method
39
  query_embedding = get_embeddings([query])[0] # Get the embedding for the query
40
+ results = vector_database.query(query_embedding, k=10) # Get the top 10 results
41
 
42
  return results
43
 
44
  def normalize_line_breaks(text):
 
 
45
  text = text.replace("\\n", " \n ")
46
 
47
  return text
48
 
49
  def display_results(results):
50
+ cpt = 1
51
+ for result in (results):
52
+ if result['score'] < 0.5:
53
+ st.subheader(f"Réponse {cpt+1} :")
54
+ st.write(f"Source File: {result['file_name']}, Chunk: {result['chunk_index']}, Score: {round((1-result['score'])*100,2)}%")
55
+ st.subheader("Citations depuis le document :")
56
+ st.write(normalize_line_breaks(result["chunk_text"]))
57
+ st_copy_to_clipboard(normalize_line_breaks(result["chunk_text"]))
58
+ cpt += 1
59
 
60
 
61
  if __name__ == "__main__":
doc_preprocessing.py CHANGED
@@ -6,8 +6,8 @@ import streamlit as st
6
  import numpy as np
7
  import os
8
 
9
- emb_model = "sujet-ai/Marsilia-Embeddings-FR-Base"
10
-
11
  def extract_text(file):
12
  text = ""
13
  # Check if the input is a file path (string) or a file-like object
@@ -60,12 +60,6 @@ def chunk_text(text, chunk_size=500, overlap=50):
60
 
61
  def get_embeddings(texts)-> np.ndarray:
62
  try:
63
- # embedding_model = pipeline(
64
- # "sentence-transformers/all-MiniLM-L6-v2"
65
- # ) # Example model
66
- # embeddings = embedding_model(texts)
67
-
68
-
69
  model = SentenceTransformer(emb_model, trust_remote_code=True)
70
  embeddings = model.encode(texts)
71
 
@@ -73,7 +67,7 @@ def get_embeddings(texts)-> np.ndarray:
73
  return embeddings
74
  except Exception as e:
75
  st.error(f"Error generating embeddings: {e}")
76
- return []
77
 
78
  def process_files(files):
79
  all_chunks = []
@@ -96,5 +90,4 @@ def process_files(files):
96
  all_embeddings.extend(embeddings)
97
  for i, chunk in enumerate(chunks):
98
  chunks_metadata.append({"file_name": file.name if hasattr(file, 'name') else os.path.basename(file), "chunk_index": i})
99
- return all_chunks, all_embeddings, chunks_metadata
100
-
 
6
  import numpy as np
7
  import os
8
 
9
+ emb_model = "intfloat/multilingual-e5-large-instruct"
10
+ emb_model2 = "DeepPavlov/distilrubert-small-cased-conversational"
11
  def extract_text(file):
12
  text = ""
13
  # Check if the input is a file path (string) or a file-like object
 
60
 
61
  def get_embeddings(texts)-> np.ndarray:
62
  try:
 
 
 
 
 
 
63
  model = SentenceTransformer(emb_model, trust_remote_code=True)
64
  embeddings = model.encode(texts)
65
 
 
67
  return embeddings
68
  except Exception as e:
69
  st.error(f"Error generating embeddings: {e}")
70
+ return np.array([])
71
 
72
  def process_files(files):
73
  all_chunks = []
 
90
  all_embeddings.extend(embeddings)
91
  for i, chunk in enumerate(chunks):
92
  chunks_metadata.append({"file_name": file.name if hasattr(file, 'name') else os.path.basename(file), "chunk_index": i})
93
+ return all_chunks, all_embeddings, chunks_metadata
 
vector_DB.py CHANGED
@@ -1,50 +1,7 @@
1
- # import faiss
2
- # import numpy as np
3
- # import streamlit as st
4
-
5
- # class VectorDatabase:
6
- # def __init__(self):
7
- # self.index = None
8
- # self.chunks = []
9
- # self.chunks_metadata = []
10
-
11
- # def add_data(self, embeddings, chunks, chunks_metadata):
12
- # if not embeddings:
13
- # st.error("No embeddings to add to the database.")
14
- # return
15
- # dimension = len(embeddings[0])
16
- # self.index = faiss.IndexFlatL2(dimension)
17
- # self.index.add(np.array(embeddings), x=np.float32)
18
- # self.chunks = chunks
19
- # self.chunks_metadata = chunks_metadata
20
-
21
- # def query(self, query_embedding, k=3):
22
- # if self.index is None:
23
- # st.error("Vector database is empty. Please upload files and process them first.")
24
- # return []
25
- # _, indices = self.index.search(np.array([query_embedding]), k=k)
26
- # results = []
27
- # for i in indices[0]:
28
- # chunk_text = self.chunks[i]
29
- # metadata = self.chunks_metadata[i]
30
- # answer = get_answer(query, chunk_text) # Corrected call
31
- # results.append({
32
- # "answer": answer,
33
- # "chunk_text": chunk_text,
34
- # "file_name": metadata["file_name"],
35
- # "chunk_index": metadata["chunk_index"],
36
- # })
37
- # return results
38
-
39
- # def is_empty(self):
40
- # return self.index is None
41
- # from llm_interaction import get_answer
42
-
43
-
44
  import faiss
45
  import numpy as np
46
  import streamlit as st
47
- from typing import List, Dict, Optional
48
 
49
  from doc_preprocessing import process_files
50
 
@@ -96,12 +53,12 @@ class VectorDatabase:
96
  self.chunks = chunks
97
  self.chunks_metadata = chunks_metadata
98
 
99
- def query(self, query_embedding: List[float], k: int = 3) -> List[Dict]:
100
  """
101
  Queries the vector database for the most similar chunks to a query embedding.
102
 
103
  Args:
104
- query_embedding (List[float]): The embedding of the query.
105
  k (int, optional): The number of nearest neighbors to retrieve. Defaults to 3.
106
 
107
  Returns:
@@ -117,15 +74,16 @@ class VectorDatabase:
117
  # Ensure query_embedding is a numpy array
118
  query_embedding = np.array(query_embedding, dtype=np.float32).reshape(1, -1) # Reshape for FAISS
119
 
120
- _, indices = self.index.search(query_embedding, k=k)
121
  results = []
122
- for i in indices[0]:
123
  chunk_text = self.chunks[i]
124
  metadata = self.chunks_metadata[i]
125
  results.append({
126
  "chunk_text": chunk_text,
127
  "file_name": metadata["file_name"],
128
  "chunk_index": metadata["chunk_index"],
 
129
  })
130
  return results
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import faiss
2
  import numpy as np
3
  import streamlit as st
4
+ from typing import List, Dict, Optional, Union
5
 
6
  from doc_preprocessing import process_files
7
 
 
53
  self.chunks = chunks
54
  self.chunks_metadata = chunks_metadata
55
 
56
+ def query(self, query_embedding: Union[List[float], np.ndarray], k: int = 3) -> List[Dict]:
57
  """
58
  Queries the vector database for the most similar chunks to a query embedding.
59
 
60
  Args:
61
+ query_embedding (List[float] or np.ndarray): The embedding of the query.
62
  k (int, optional): The number of nearest neighbors to retrieve. Defaults to 3.
63
 
64
  Returns:
 
74
  # Ensure query_embedding is a numpy array
75
  query_embedding = np.array(query_embedding, dtype=np.float32).reshape(1, -1) # Reshape for FAISS
76
 
77
+ dist, indices = self.index.search(query_embedding, k=k)
78
  results = []
79
+ for (i, j) in zip(indices[0], dist[0]):
80
  chunk_text = self.chunks[i]
81
  metadata = self.chunks_metadata[i]
82
  results.append({
83
  "chunk_text": chunk_text,
84
  "file_name": metadata["file_name"],
85
  "chunk_index": metadata["chunk_index"],
86
+ "score": j
87
  })
88
  return results
89