File size: 3,277 Bytes
35bda59
071bde7
9ff2eb6
35bda59
 
 
 
 
 
 
 
 
 
 
79a5c64
35bda59
 
 
 
 
 
79a5c64
35bda59
79a5c64
76894b4
35bda59
 
 
9ff2eb6
071bde7
 
9ff2eb6
 
79a5c64
76894b4
 
 
79a5c64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
071bde7
79a5c64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
071bde7
35bda59
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import streamlit as st
from st_copy_to_clipboard import st_copy_to_clipboard
import re
import numpy as np
from doc_preprocessing import process_files, get_embeddings
from vector_DB import VectorDatabase  # Import the class
from llm_interaction import get_answer

# Module-level vector store (FAISS-backed, per the original author's note),
# constructed with default arguments when this module is executed and shared
# by process_query() and main().
vector_database = VectorDatabase()
# Chunk metadata for the uploaded files; main() rebinds this global with the
# third value returned by process_files() whenever files are uploaded.
chunks_metadata = []



@st.cache_data
def _embed_query(query):
    """Return the embedding for ``query``, memoised on the query text.

    Only this step is cached: the embedding is computed from the query string
    alone, so a cached value cannot go stale when the indexed files change.
    """
    return get_embeddings([query])[0]


def process_query(query):
    """Search the vector database for the chunks closest to ``query``.

    The emptiness check and the database lookup run on every call. Caching
    them keyed only on the query text would replay a stale answer (including
    the "upload first" message) after the set of uploaded files changes.

    Args:
        query: The user's free-text question.

    Returns:
        The string ``"Please upload files first."`` when the database is
        empty; otherwise the value returned by ``vector_database.query`` for
        the 10 nearest neighbours of the query embedding.
    """
    if vector_database.is_empty():
        return "Please upload files first."

    print('Query:', query)
    query_embedding = _embed_query(query)
    print('Asking Queries..................')
    # Ask for the 10 best matches; display_results() filters them by score.
    return vector_database.query(query_embedding, k=10)

def normalize_line_breaks(text):
    """Turn literal backslash-n sequences into Markdown hard line breaks.

    Every two-character sequence ``\\n`` (a backslash followed by ``n``) in
    ``text`` becomes two spaces, a real newline and one space. Real newline
    characters already present in ``text`` are left untouched.
    """
    markdown_break = "  \n "
    pieces = text.split("\\n")
    return markdown_break.join(pieces)

def display_results(results, chunks):
    """Render the search hits whose score is below 0.5 in the Streamlit page.

    For each displayed hit this shows a numbered header, the source file and
    a percentage derived from the score, two buttons that prepend/append the
    neighbouring chunk, the (possibly extended) text, and a copy-to-clipboard
    widget.

    Args:
        results: Either an iterable of dicts with the keys ``score``,
            ``file_name``, ``chunk_text`` and ``chunk_index``, or a plain
            message string (process_query() returns one when nothing has
            been indexed yet).
        chunks: Sequence of chunk texts that ``chunk_index`` points into.

    Returns:
        None.
    """
    # A string is a status message, not a list of hits: show it instead of
    # iterating over its characters and failing on result['score'].
    if isinstance(results, str):
        st.warning(results)
        return

    # Characters trimmed where two neighbouring chunks are stitched together.
    # NOTE(review): presumably the chunk overlap used by process_files();
    # confirm against doc_preprocessing.
    overlap = 50

    # The counter advances for every hit, shown or not, so the displayed
    # number is the hit's rank in `results` and the widget keys stay unique.
    for cpt, result in enumerate(results, start=1):
        # Written as "not <" so a NaN score is skipped, as with "if score < 0.5".
        if not result['score'] < 0.5:
            continue

        st.subheader(f"Réponse {cpt} :")
        st.write(f"Source File: {result['file_name']}, Score: {round(1. / (1 + result['score']) * 100, 2)}%")

        text_to_display = result['chunk_text']
        col1, col2 = st.columns(2)

        with col1:
            previous_chunk_index = result['chunk_index'] - 1
            # Explicit bounds check: no button when there is no such chunk.
            if 0 <= previous_chunk_index < len(chunks):
                previous_chunk_text = chunks[previous_chunk_index]
                if st.button("Ajouter la portion de texte précédente", key=f"before_{cpt}"):
                    text_to_display = previous_chunk_text[:-overlap] + result['chunk_text']

        with col2:
            next_chunk_index = result['chunk_index'] + 1
            # The lower bound stops a negative index from wrapping around.
            if 0 <= next_chunk_index < len(chunks):
                next_chunk_text = chunks[next_chunk_index]
                if st.button("Ajouter la portion de texte suivante", key=f"after_{cpt}"):
                    text_to_display = result['chunk_text'] + next_chunk_text[overlap:]

        st.write("Citations depuis le document :")
        rendered_text = normalize_line_breaks(text_to_display)
        st.write(rendered_text)
        st_copy_to_clipboard(rendered_text)


def main():
    """Build the Streamlit page: file upload, query box and result display.

    Uploaded PDF/Word files are processed and added to the module-level
    vector database; a non-empty query is then run against it and the hits
    are rendered. When the query cannot be answered because nothing has been
    indexed, the message returned by process_query() is shown as a warning.
    """
    global chunks_metadata

    st.title("Document Query App")

    uploaded_files = st.file_uploader(
        "Upload PDF or Word files", accept_multiple_files=True, type=["pdf", "docx"]
    )

    query = st.text_input("Enter your query:")

    # Defined up front so a query typed before any upload cannot hit an
    # unbound local below.
    all_chunks = []

    if uploaded_files:
        all_chunks, all_embeddings, chunks_metadata = process_files(uploaded_files)
        vector_database.add_data(all_embeddings, all_chunks, chunks_metadata)

        st.session_state.files_processed = True

    if query:
        results = process_query(query)
        # process_query() returns a plain message string when the database
        # is empty; that is a status to show, not a list of hits to render.
        if isinstance(results, str):
            st.warning(results)
        else:
            display_results(results, all_chunks)

# Build the page only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()