Spaces:
Sleeping
Sleeping
| # utils.py - Utility Functions | |
| import PyPDF2 | |
| import io | |
| import difflib | |
| from typing import List, Dict | |
| import hashlib | |
| import json | |
| import os | |
| from pathlib import Path | |
class DocumentProcessor:
    """Stateless helpers for reading, hashing and chunking document content.

    Every method is a ``@staticmethod`` so it can be called on the class
    (``DocumentProcessor.chunk_text(...)``) as well as on an instance.
    """

    @staticmethod
    def extract_text_from_pdf(pdf_bytes: bytes) -> str:
        """Extract the text of every page of an in-memory PDF.

        Args:
            pdf_bytes: Raw bytes of the PDF document.

        Returns:
            The concatenated page texts, each page followed by a newline.

        Raises:
            Exception: If the PDF cannot be parsed or a page cannot be read.
                The underlying error is chained as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
            # ``or ""`` keeps a page whose extraction yields None from
            # aborting the whole document with a TypeError.
            return "".join(
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            )
        except Exception as e:
            raise Exception(f"Error extracting PDF text: {str(e)}") from e

    @staticmethod
    def compute_hash(content: bytes) -> str:
        """Return the hexadecimal SHA-256 digest of ``content``."""
        return hashlib.sha256(content).hexdigest()

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 1000,
                   overlap: int = 200) -> List[str]:
        """Split ``text`` into overlapping, fixed-size chunks.

        Consecutive chunks start ``chunk_size - overlap`` characters apart.
        Chunks consisting only of whitespace are dropped. The last chunk may
        be shorter than ``chunk_size``.

        Args:
            text: The text to split.
            chunk_size: Maximum number of characters per chunk; must be > 0.
            overlap: Number of characters shared by consecutive chunks; must
                be smaller than ``chunk_size``.

        Returns:
            The non-blank chunks in document order.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not smaller than ``chunk_size``. Either would prevent the
                window from ever advancing.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")

        step = chunk_size - overlap
        chunks = []
        for start in range(0, len(text), step):
            chunk = text[start:start + chunk_size]
            if chunk.strip():  # Only keep chunks with visible content
                chunks.append(chunk)
        return chunks
class ChangeDetector:
    """Detect changes between document versions.

    Every method is a ``@staticmethod`` so it can be called on the class or
    on an instance.
    """

    @staticmethod
    def compute_diff(old_text: str, new_text: str) -> Dict:
        """Compute a line-based diff between two versions of a text.

        Both texts are split on ``'\\n'`` and compared with
        ``difflib.Differ``.

        Args:
            old_text: The previous version of the text.
            new_text: The current version of the text.

        Returns:
            A dict with three lists of strings:

            * ``'additions'``: lines present only in ``new_text``.
            * ``'deletions'``: lines present only in ``old_text``.
            * ``'modifications'``: the ``'? '`` hint lines emitted by
              ``difflib.Differ`` (intraline change markers), with the
              two-character prefix removed. These are markers, not the
              changed text itself.
        """
        old_lines = old_text.split('\n')
        new_lines = new_text.split('\n')

        additions = []
        deletions = []
        modifications = []
        # Map each Differ prefix to the list collecting it; unchanged lines
        # (prefix '  ') match nothing and are skipped.
        buckets = {'+ ': additions, '- ': deletions, '? ': modifications}

        for line in difflib.Differ().compare(old_lines, new_lines):
            bucket = buckets.get(line[:2])
            if bucket is not None:
                bucket.append(line[2:])

        return {
            'additions': additions,
            'deletions': deletions,
            'modifications': modifications
        }

    @staticmethod
    def semantic_change_detection(old_text: str, new_text: str,
                                  embeddings) -> List[Dict]:
        """Detect semantic changes using embeddings (placeholder).

        Both texts are chunked with ``DocumentProcessor.chunk_text`` and the
        chunks are passed to ``embeddings.embed_documents``. The resulting
        vectors are not compared yet, so the returned list is currently
        always empty.

        Args:
            old_text: The previous version of the text.
            new_text: The current version of the text.
            embeddings: Object exposing ``embed_documents(list_of_str)``.

        Returns:
            A list of detected changes. It is always ``[]`` at present, both
            on success and when embedding fails (the error is printed).
        """
        old_chunks = DocumentProcessor.chunk_text(old_text)
        new_chunks = DocumentProcessor.chunk_text(new_text)
        try:
            # The vectors are computed but not consumed yet; they are kept so
            # that embedding failures still surface through the handler below.
            old_embeddings = embeddings.embed_documents(old_chunks)
            new_embeddings = embeddings.embed_documents(new_chunks)
            # This is a simplified version - can be enhanced with
            # more sophisticated change detection algorithms
            changes = []
            return changes
        except Exception as e:
            print(f"Error in semantic change detection: {e}")
            return []
class PersistentStorage:
    """Persist per-user upload metadata as a JSON file on disk.

    The metadata lives in ``./user_data_<user_id>/uploaded_files.json``,
    relative to the current working directory.
    """

    def __init__(self, user_id: str):
        """Create the user's storage directory if it does not exist yet.

        Args:
            user_id: Identifier used to build the storage directory name.
        """
        self.user_id = user_id
        # NOTE(review): user_id is interpolated into the path unchecked.
        # Confirm callers never pass values containing path separators
        # or "..".
        self.storage_dir = Path(f"./user_data_{user_id}")
        self.storage_dir.mkdir(exist_ok=True)
        self.metadata_file = self.storage_dir / "uploaded_files.json"

    def save_metadata(self, metadata: Dict):
        """Save uploaded files metadata, replacing any previous content.

        The metadata is serialized before anything is written, and the file
        is swapped in via a temporary sibling. A failure (for example a
        value that is not JSON-serializable) therefore leaves the previously
        stored metadata intact. Errors are printed, not raised.

        Args:
            metadata: JSON-serializable mapping to store.
        """
        try:
            payload = json.dumps(metadata, indent=2)
            tmp_file = self.metadata_file.with_name(
                self.metadata_file.name + ".tmp"
            )
            tmp_file.write_text(payload, encoding="utf-8")
            # Rename over the target so readers never see a half-written file.
            tmp_file.replace(self.metadata_file)
        except Exception as e:
            print(f"Error saving metadata: {e}")

    def load_metadata(self) -> Dict:
        """Load uploaded files metadata.

        Returns:
            The stored metadata. Returns ``{}`` when no metadata file exists
            or when it cannot be read or parsed (the error is printed).
        """
        try:
            with open(self.metadata_file, 'r', encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            # Nothing has been saved yet.
            return {}
        except Exception as e:
            print(f"Error loading metadata: {e}")
            return {}

    def clear_metadata(self):
        """Delete the metadata file; do nothing if it does not exist."""
        try:
            self.metadata_file.unlink()
        except FileNotFoundError:
            pass