File size: 3,638 Bytes
35bda59
 
 
 
 
 
 
 
71880a4
79a5c64
4fcb1d4
35bda59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ff2eb6
35bda59
 
 
 
 
 
 
 
 
 
 
732359a
35bda59
 
 
 
 
 
76894b4
35bda59
1e44973
071bde7
35bda59
 
 
 
071bde7
 
 
 
 
 
 
 
 
 
 
35bda59
071bde7
 
 
 
76894b4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import pypdf
from docx import Document
from transformers.pipelines import pipeline
from sentence_transformers import SentenceTransformer
import streamlit as st
import numpy as np
import os


# Model identifier handed to SentenceTransformer when embeddings are generated.
emb_model = "intfloat/multilingual-e5-base"

def _read_document(stream, file_name, announce=False):
    """Return the text of an open PDF or DOCX binary stream.

    Args:
        stream: Binary file-like object positioned at the start of the document.
        file_name: Name used only to pick the parser from its extension
            (matched case-insensitively).
        announce: When true, print a progress line before parsing.

    Returns:
        The extracted text with one newline after every PDF page or non-blank
        DOCX paragraph, or "" when the extension is neither .pdf nor .docx.
    """
    lowered = file_name.lower()
    if lowered.endswith(".pdf"):
        if announce:
            print('Processing pdf file.................\n')
        reader = pypdf.PdfReader(stream)
        # `or ""` guards against a page that yields no text at all.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
    if lowered.endswith(".docx"):
        document = Document(stream)
        if announce:
            print('Processing DOCX file.................\n')
        # Blank paragraphs carry no content and would only add empty lines.
        return "".join(
            paragraph.text + "\n"
            for paragraph in document.paragraphs
            if paragraph.text.strip()
        )
    return ""


def extract_text(file):
    """Extract plain text from a PDF or DOCX document.

    Args:
        file: Either a filesystem path (str) or a binary file-like object that
            exposes a ``name`` attribute (e.g. a Streamlit upload).

    Returns:
        The document text, with a real newline after each page/paragraph.
        Returns "" when the file type is unsupported or when reading fails;
        failures are reported through ``st.error`` instead of being raised.
    """
    if isinstance(file, str):
        file_name = os.path.basename(file)
        try:
            with open(file, 'rb') as f:  # Parsers need the raw bytes
                return _read_document(f, file_name, announce=True)
        except FileNotFoundError:
            st.error(f"Error: File not found at {file}")
            return ""
        except Exception as e:
            st.error(f"Error reading {file_name}: {e}")
            return ""

    # Anything else is treated as an already-open upload object.
    file_name = file.name
    try:
        return _read_document(file, file_name)
    except Exception as e:
        st.error(f"Error reading {file_name}: {e}")
        return ""

def chunk_text(text, chunk_size=500, overlap=50):
    """Split *text* into fixed-size character windows that overlap.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks; must
            be smaller than ``chunk_size`` so that every step moves forward.

    Returns:
        A list of chunks in document order. The final chunk may be shorter
        than ``chunk_size``. An empty *text* yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    total = len(text)
    chunks = []
    for start in range(0, total, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, any further window would
        # only repeat the tail already contained in this chunk.
        if end >= total:
            break
    return chunks

# Loaded SentenceTransformer instances keyed by model name, so the model is
# built once per process rather than on every get_embeddings() call.
_EMBEDDING_MODELS = {}


def _load_embedding_model(model_name):
    """Return the SentenceTransformer for *model_name*, creating it on first use."""
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        # NOTE(review): trust_remote_code=True lets the model repository run
        # its own Python code on load; confirm this model actually needs it.
        model = SentenceTransformer(model_name, trust_remote_code=True)
        _EMBEDDING_MODELS[model_name] = model
    return model


def get_embeddings(texts)-> np.ndarray:
    """Encode *texts* with the module's embedding model.

    Args:
        texts: The sentences/chunks to embed, passed straight to
            ``SentenceTransformer.encode``.

    Returns:
        The embeddings produced by ``encode`` (one row per input text), or an
        empty array when *texts* is an empty list/tuple or when loading or
        encoding fails. Failures are reported through ``st.error``.
    """
    # Nothing to encode: skip the (expensive) model load entirely.
    if isinstance(texts, (list, tuple)) and not texts:
        return np.array([])
    try:
        model = _load_embedding_model(emb_model)
        embeddings = model.encode(texts)

        print(f"Generated {len(embeddings)} embeddings.")
        return embeddings
    except Exception as e:
        st.error(f"Error generating embeddings: {e}")
        return np.array([])

@st.cache_data
def process_files(files):
    """Extract, chunk and embed every file, returning index-aligned results.

    Args:
        files: Iterable of file paths (str) and/or file-like objects exposing
            a ``name`` attribute.

    Returns:
        A tuple ``(all_chunks, all_embeddings, chunks_metadata)`` of three
        lists with the same length: entry *i* of each list describes the same
        chunk. Each metadata entry is a dict with ``file_name`` and
        ``chunk_index`` (the chunk's position within its file). Files whose
        text extraction or embedding fails are skipped entirely.
    """
    all_chunks = []
    all_embeddings = []
    chunks_metadata = []

    for file in files:
        # Uploaded objects carry .name; plain paths are reduced to their basename.
        source_name = file.name if hasattr(file, 'name') else os.path.basename(file)
        print(f"Processing file: {source_name}")

        text = extract_text(file)
        if not text:  # Skip files that failed to process
            print(f"Skipping file {source_name} due to extraction error.")
            continue

        print(f"Chunking text...{source_name}\n")
        chunks = chunk_text(text)
        embeddings = get_embeddings(chunks)
        # get_embeddings returns an empty array on failure. Compare lengths
        # (array truthiness is ambiguous) so chunks never lose their vectors.
        if len(embeddings) != len(chunks):
            print(f"Skipping file {source_name} due to embedding error.")
            continue

        all_chunks.extend(chunks)
        all_embeddings.extend(embeddings)
        chunks_metadata.extend(
            {"file_name": source_name, "chunk_index": index}
            for index, _ in enumerate(chunks)
        )
    return all_chunks, all_embeddings, chunks_metadata