# utils.py - Utility Functions
import PyPDF2
import io
import difflib
from typing import List, Dict
import hashlib
import json
import os
from pathlib import Path
class DocumentProcessor:
    """Stateless helpers for reading, hashing, and chunking documents."""

    @staticmethod
    def extract_text_from_pdf(pdf_bytes: bytes) -> str:
        """Extract the text of every page of an in-memory PDF.

        Each page's text is followed by a newline, in page order.

        Args:
            pdf_bytes: Raw bytes of the PDF file.

        Returns:
            The concatenated page text.

        Raises:
            Exception: If the PDF cannot be parsed or a page cannot be
                read. The underlying error is chained as ``__cause__``.
        """
        try:
            reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
            # ``or ''`` keeps a page that yields no text (None or empty) from
            # failing the whole document.
            # NOTE(review): whether extract_text() can return None depends on
            # the installed PyPDF2 release - confirm.
            return "".join(
                f"{page.extract_text() or ''}\n" for page in reader.pages
            )
        except Exception as e:
            raise Exception(f"Error extracting PDF text: {str(e)}") from e

    @staticmethod
    def compute_hash(content: bytes) -> str:
        """Return the SHA-256 digest of ``content`` as a hex string."""
        return hashlib.sha256(content).hexdigest()

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 1000,
                   overlap: int = 200) -> List[str]:
        """Split text into overlapping chunks.

        Windows of ``chunk_size`` characters are taken every
        ``chunk_size - overlap`` characters, starting at index 0, for as
        long as the window start lies inside ``text``. Windows that are
        empty or whitespace-only are dropped.

        Args:
            text: The text to split.
            chunk_size: Maximum number of characters per chunk.
            overlap: Number of characters shared by consecutive chunks.

        Returns:
            The chunks, in order of appearance.

        Raises:
            ValueError: If ``overlap`` is not smaller than ``chunk_size``.
                The window would never advance, which previously caused
                an endless loop on non-empty text.
        """
        step = chunk_size - overlap
        if step <= 0:
            raise ValueError(
                "overlap must be smaller than chunk_size "
                f"(got chunk_size={chunk_size}, overlap={overlap})"
            )
        windows = (
            text[start:start + chunk_size]
            for start in range(0, len(text), step)
        )
        # Only keep non-empty chunks.
        return [window for window in windows if window.strip()]
class ChangeDetector:
    """Detect changes between document versions."""

    @staticmethod
    def compute_diff(old_text: str, new_text: str) -> Dict:
        """Compute a line-level diff between two text versions.

        Both texts are split on ``'\\n'`` and compared with
        ``difflib.Differ``.

        Args:
            old_text: The previous version of the text.
            new_text: The current version of the text.

        Returns:
            A dict with three lists of strings:

            * ``'additions'``: lines only present in ``new_text``.
            * ``'deletions'``: lines only present in ``old_text``.
            * ``'modifications'``: the ``'? '`` guide lines emitted by
              ``difflib.Differ`` with the two-character prefix removed.
              These are intraline marker strings, not the changed text.
        """
        # An empty document has zero lines. ''.split('\n') yields [''], which
        # would show up as a phantom blank-line addition or deletion.
        old_lines = old_text.split('\n') if old_text else []
        new_lines = new_text.split('\n') if new_text else []

        result = {'additions': [], 'deletions': [], 'modifications': []}
        # Differ prefixes every output line with a two-character code; lines
        # common to both versions ('  ') are ignored.
        bucket_for_prefix = {
            '+ ': 'additions',
            '- ': 'deletions',
            '? ': 'modifications',
        }
        for entry in difflib.Differ().compare(old_lines, new_lines):
            bucket = bucket_for_prefix.get(entry[:2])
            if bucket is not None:
                result[bucket].append(entry[2:])
        return result

    @staticmethod
    def semantic_change_detection(old_text: str, new_text: str,
                                  embeddings) -> List[Dict]:
        """Detect semantic changes using embeddings (placeholder).

        Both texts are chunked with ``DocumentProcessor.chunk_text`` and
        each chunk list is passed to ``embeddings.embed_documents``. The
        resulting vectors are not compared yet, so the result is always
        an empty list.

        Args:
            old_text: The previous version of the text.
            new_text: The current version of the text.
            embeddings: Object exposing ``embed_documents(list_of_str)``.

        Returns:
            An empty list. Embedding errors are printed, not raised.
        """
        old_chunks = DocumentProcessor.chunk_text(old_text)
        new_chunks = DocumentProcessor.chunk_text(new_text)
        try:
            # NOTE(review): the vectors are computed and then discarded. The
            # calls are kept so behaviour (including the error path below)
            # is unchanged until a real comparison is implemented.
            embeddings.embed_documents(old_chunks)
            embeddings.embed_documents(new_chunks)
            # This is a simplified version - can be enhanced with
            # more sophisticated change detection algorithms
            return []
        except Exception as e:
            print(f"Error in semantic change detection: {e}")
            return []
class PersistentStorage:
    """Persist per-user upload metadata as a JSON file.

    The file is ``./user_data_<user_id>/uploaded_files.json``, relative to
    the current working directory.
    """

    def __init__(self, user_id: str):
        """Create the user's storage directory if it does not exist yet.

        Args:
            user_id: Identifier embedded in the storage directory name.
        """
        self.user_id = user_id
        self.storage_dir = Path(f"./user_data_{user_id}")
        self.storage_dir.mkdir(exist_ok=True)
        self.metadata_file = self.storage_dir / "uploaded_files.json"

    def save_metadata(self, metadata: Dict):
        """Save uploaded files metadata.

        Failures are printed, not raised. The metadata is serialised
        before the file is opened, so a payload that is not
        JSON-serialisable leaves the previously saved file intact.

        Args:
            metadata: JSON-serialisable metadata to store.
        """
        try:
            payload = json.dumps(metadata, indent=2)
            self.metadata_file.write_text(payload, encoding="utf-8")
        except Exception as e:
            print(f"Error saving metadata: {e}")

    def load_metadata(self) -> Dict:
        """Load uploaded files metadata.

        Returns:
            The parsed JSON content, or an empty dict when the file does
            not exist or cannot be read or parsed (the error is printed).
        """
        if not self.metadata_file.exists():
            return {}
        try:
            with open(self.metadata_file, 'r', encoding="utf-8") as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading metadata: {e}")
            return {}

    def clear_metadata(self):
        """Delete the metadata file; do nothing if it is already absent."""
        try:
            self.metadata_file.unlink()
        except FileNotFoundError:
            # Already gone, possibly removed between two calls.
            pass