# utils.py - Utility Functions
"""Utility helpers for PDF text extraction, chunking, diffing and metadata storage."""

import difflib
import hashlib
import io
import json
import logging
import os
from pathlib import Path
from typing import Dict, List

try:
    import PyPDF2
except ImportError:
    # Only extract_text_from_pdf needs PyPDF2; the other helpers stay usable
    # without it, and the missing dependency is reported when it is needed.
    PyPDF2 = None

logger = logging.getLogger(__name__)


class DocumentProcessor:
    """Document processing utilities."""

    @staticmethod
    def extract_text_from_pdf(pdf_bytes: bytes) -> str:
        """Extract the text of every page of a PDF.

        Args:
            pdf_bytes: Raw content of the PDF file.

        Returns:
            The text of all pages, each page followed by a newline.

        Raises:
            Exception: If PyPDF2 is unavailable or the PDF cannot be parsed.
                The message always starts with "Error extracting PDF text:".
        """
        if PyPDF2 is None:
            raise Exception("Error extracting PDF text: PyPDF2 is not installed")
        try:
            reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
            # extract_text() may yield None for pages without a text layer in
            # some PyPDF2 versions; treat that as an empty page.
            return "".join(
                (page.extract_text() or "") + "\n" for page in reader.pages
            )
        except Exception as e:
            raise Exception(f"Error extracting PDF text: {str(e)}") from e

    @staticmethod
    def compute_hash(content: bytes) -> str:
        """Return the hexadecimal SHA-256 digest of ``content``."""
        return hashlib.sha256(content).hexdigest()

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
        """Split text into overlapping chunks.

        Consecutive chunks start ``chunk_size - overlap`` characters apart.
        Chunks consisting only of whitespace are skipped.

        Args:
            text: The text to split.
            chunk_size: Maximum number of characters per chunk (> 0).
            overlap: Number of characters shared by neighbouring chunks
                (0 <= overlap < chunk_size).

        Returns:
            The list of chunks in document order; empty for empty input.

        Raises:
            ValueError: If ``chunk_size`` or ``overlap`` is out of range. A
                non-positive step would otherwise never advance.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        step = chunk_size - overlap
        total = len(text)
        chunks = []
        for start in range(0, total, step):
            end = start + chunk_size
            chunk = text[start:end]
            if chunk.strip():  # Only add non-empty chunks
                chunks.append(chunk)
            if end >= total:
                # This chunk already reaches the end of the text; a further
                # chunk would only repeat the tail of this one.
                break
        return chunks


class ChangeDetector:
    """Detect changes between document versions."""

    @staticmethod
    def compute_diff(old_text: str, new_text: str) -> Dict:
        """Compute a line-based diff between two text versions.

        Args:
            old_text: The previous version of the text.
            new_text: The current version of the text.

        Returns:
            A dict with three lists of strings:
            ``'additions'`` (lines only in ``new_text``),
            ``'deletions'`` (lines only in ``old_text``) and
            ``'modifications'`` (the payload of ``difflib.Differ``'s "? "
            guide lines, i.e. the marker text that points at intraline
            differences, not the changed lines themselves).
        """
        old_lines = old_text.split('\n')
        new_lines = new_text.split('\n')

        # Map each two-character Differ prefix to the list collecting it;
        # unchanged lines ("  " prefix) are ignored.
        buckets = {'+ ': [], '- ': [], '? ': []}
        for line in difflib.Differ().compare(old_lines, new_lines):
            bucket = buckets.get(line[:2])
            if bucket is not None:
                bucket.append(line[2:])

        return {
            'additions': buckets['+ '],
            'deletions': buckets['- '],
            'modifications': buckets['? '],
        }

    @staticmethod
    def semantic_change_detection(old_text: str, new_text: str, embeddings) -> List[Dict]:
        """Detect semantic changes using embeddings (placeholder).

        Both texts are chunked and passed to ``embeddings.embed_documents``,
        but no comparison is implemented yet, so the result is always empty.

        Args:
            old_text: The previous version of the text.
            new_text: The current version of the text.
            embeddings: Object exposing ``embed_documents(list_of_str)``.

        Returns:
            An empty list; also returned when the embedding call fails
            (the error is logged rather than raised).
        """
        old_chunks = DocumentProcessor.chunk_text(old_text)
        new_chunks = DocumentProcessor.chunk_text(new_text)

        try:
            # NOTE(review): the embeddings are computed but not yet compared.
            # TODO: derive change records from them once the result schema
            # is defined.
            old_embeddings = embeddings.embed_documents(old_chunks)
            new_embeddings = embeddings.embed_documents(new_chunks)

            # This is a simplified version - can be enhanced with
            # more sophisticated change detection algorithms
            changes = []
            return changes
        except Exception as e:
            # Best-effort boundary around an external embedding backend.
            logger.error("Error in semantic change detection: %s", e)
            return []


class PersistentStorage:
    """Handle persistent storage of per-user upload metadata as JSON."""

    def __init__(self, user_id: str):
        """Create (if needed) the storage directory for ``user_id``.

        The directory is ``./user_data_<user_id>`` relative to the current
        working directory and holds ``uploaded_files.json``.

        Raises:
            ValueError: If ``user_id`` contains a path separator, which would
                let the storage directory escape its intended location.
        """
        separators = [sep for sep in (os.sep, os.altsep) if sep]
        if any(sep in str(user_id) for sep in separators):
            raise ValueError("user_id must not contain path separators")

        self.user_id = user_id
        self.storage_dir = Path(f"./user_data_{user_id}")
        self.storage_dir.mkdir(exist_ok=True)
        self.metadata_file = self.storage_dir / "uploaded_files.json"

    def save_metadata(self, metadata: Dict):
        """Save uploaded files metadata.

        Failures are logged, not raised. The metadata is serialised before
        the file is opened so that a serialisation error cannot truncate
        previously saved metadata.
        """
        try:
            payload = json.dumps(metadata, indent=2)
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                f.write(payload)
        except (OSError, TypeError, ValueError) as e:
            logger.error("Error saving metadata: %s", e)

    def load_metadata(self) -> Dict:
        """Load uploaded files metadata.

        Returns:
            The stored metadata dict, or ``{}`` when the file is missing,
            unreadable, not valid JSON, or does not contain a JSON object.
        """
        try:
            with open(self.metadata_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            return {}
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            logger.error("Error loading metadata: %s", e)
            return {}

        if not isinstance(data, dict):
            logger.error("Error loading metadata: %s", "top-level JSON value is not an object")
            return {}
        return data

    def clear_metadata(self):
        """Clear all metadata by deleting the metadata file, if present."""
        try:
            self.metadata_file.unlink()
        except FileNotFoundError:
            pass