File size: 4,230 Bytes
f7db2f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# utils.py - Utility Functions
import PyPDF2
import io
import difflib
from typing import List, Dict
import hashlib
import json
import os
from pathlib import Path

class DocumentProcessor:
    """Document processing utilities"""

    @staticmethod
    def extract_text_from_pdf(pdf_bytes: bytes) -> str:
        """Extract text from PDF bytes.

        Args:
            pdf_bytes: Raw content of a PDF file.

        Returns:
            The text of every page, in page order, with a newline appended
            after each page.

        Raises:
            Exception: If the PDF cannot be opened or a page cannot be
                processed. The underlying error is attached as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
            # `or ""` keeps a page whose extractor hands back None from
            # aborting the whole document; join avoids repeated `+=`.
            return "".join(
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            )
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Error extracting PDF text: {str(e)}") from e

    @staticmethod
    def compute_hash(content: bytes) -> str:
        """Compute SHA-256 hash of content and return it as a hex string."""
        return hashlib.sha256(content).hexdigest()

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 1000,
                   overlap: int = 200) -> List[str]:
        """Split text into overlapping chunks.

        Consecutive chunks start ``chunk_size - overlap`` characters apart
        and are at most ``chunk_size`` characters long. Chunks that contain
        only whitespace are dropped. The final chunk(s) may be shorter than
        ``chunk_size`` and may lie entirely inside the previous chunk's
        overlap region.

        Args:
            text: The text to split.
            chunk_size: Maximum number of characters per chunk.
            overlap: Number of characters shared by consecutive chunks.

        Returns:
            The non-blank chunks in order of appearance.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not smaller than ``chunk_size``; such values would never
                advance through the text.
        """
        step = chunk_size - overlap
        if chunk_size <= 0 or step <= 0:
            raise ValueError(
                "chunk_size must be positive and overlap must be smaller "
                f"than chunk_size (got chunk_size={chunk_size}, "
                f"overlap={overlap})"
            )

        windows = (
            text[start:start + chunk_size]
            for start in range(0, len(text), step)
        )
        # Only keep non-empty chunks
        return [window for window in windows if window.strip()]

class ChangeDetector:
    """Detect changes between document versions"""

    @staticmethod
    def compute_diff(old_text: str, new_text: str) -> Dict:
        """Compute line-level differences between two text versions.

        Both texts are split on newlines and compared with
        ``difflib.Differ``. Each output line is sorted by its two-character
        prefix, and the prefix is stripped before the line is stored.

        Returns:
            A dict with three lists: ``'additions'`` (``'+ '`` lines),
            ``'deletions'`` (``'- '`` lines) and ``'modifications'``
            (the ``'? '`` lines emitted by ``Differ``).
        """
        # One bucket per Differ prefix we care about; unchanged lines
        # (prefix '  ') have no bucket and are skipped.
        buckets = {'+ ': [], '- ': [], '? ': []}

        comparison = difflib.Differ().compare(
            old_text.split('\n'), new_text.split('\n')
        )
        for entry in comparison:
            target = buckets.get(entry[:2])
            if target is not None:
                target.append(entry[2:])

        return {
            'additions': buckets['+ '],
            'deletions': buckets['- '],
            'modifications': buckets['? '],
        }

    @staticmethod
    def semantic_change_detection(old_text: str, new_text: str,
                                 embeddings) -> List[Dict]:
        """Detect semantic changes using embeddings.

        Both texts are chunked with ``DocumentProcessor.chunk_text`` and
        each chunk list is passed to ``embeddings.embed_documents``. The
        resulting vectors are not compared yet, so the returned list is
        always empty.

        Args:
            old_text: Previous version of the document text.
            new_text: Current version of the document text.
            embeddings: Object exposing ``embed_documents(chunks)``.

        Returns:
            An empty list, both on success and when embedding fails (the
            failure is printed).
        """
        previous_chunks = DocumentProcessor.chunk_text(old_text)
        current_chunks = DocumentProcessor.chunk_text(new_text)

        try:
            # Simplified placeholder: the vectors are requested but no
            # comparison algorithm consumes them yet.
            embeddings.embed_documents(previous_chunks)
            embeddings.embed_documents(current_chunks)
        except Exception as e:
            print(f"Error in semantic change detection: {e}")
            return []

        return []

class PersistentStorage:
    """Handle persistent storage of metadata"""

    def __init__(self, user_id: str):
        """Create the per-user storage directory if it does not exist.

        Args:
            user_id: Identifier used to build the directory name
                ``./user_data_<user_id>`` relative to the current working
                directory.
        """
        self.user_id = user_id
        # NOTE(review): user_id is interpolated into the path unchanged. If
        # it can come from untrusted input, separators or ".." would let it
        # point outside the working directory - validate it upstream.
        self.storage_dir = Path(f"./user_data_{user_id}")
        self.storage_dir.mkdir(exist_ok=True)
        self.metadata_file = self.storage_dir / "uploaded_files.json"

    def save_metadata(self, metadata: Dict):
        """Save uploaded files metadata.

        The metadata is serialized before any file is touched and then
        written to a temporary sibling file that replaces the real file in
        one step, so a failed save never truncates previously saved data.
        Errors are printed rather than raised (best-effort save).

        Args:
            metadata: JSON-serializable mapping to persist.
        """
        tmp_file = self.metadata_file.with_name(
            self.metadata_file.name + ".tmp"
        )
        try:
            # Serialize first: a non-serializable value fails here, before
            # the existing file could be damaged.
            payload = json.dumps(metadata, indent=2)
            with open(tmp_file, 'w', encoding='utf-8') as f:
                f.write(payload)
            os.replace(tmp_file, self.metadata_file)
        except Exception as e:
            print(f"Error saving metadata: {e}")
            # Do not leave a half-written temporary file behind.
            try:
                tmp_file.unlink()
            except OSError:
                pass

    def load_metadata(self) -> Dict:
        """Load uploaded files metadata.

        Returns:
            The parsed content of the metadata file, or an empty dict when
            the file does not exist or cannot be read/parsed (the error is
            printed in the latter case).
        """
        try:
            with open(self.metadata_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            # Nothing has been saved yet.
            return {}
        except Exception as e:
            print(f"Error loading metadata: {e}")
            return {}

    def clear_metadata(self):
        """Clear all metadata by deleting the metadata file, if present."""
        try:
            self.metadata_file.unlink()
        except FileNotFoundError:
            # Already cleared; nothing to do.
            pass