# VersionRAG / utils.py
# shahbazdev0's picture
# Upload 9 files
# f7db2f9 verified
# utils.py - Utility Functions
import PyPDF2
import io
import difflib
from typing import List, Dict
import hashlib
import json
import os
from pathlib import Path
class DocumentProcessor:
    """Document processing utilities: PDF text extraction, hashing, chunking."""

    @staticmethod
    def extract_text_from_pdf(pdf_bytes: bytes) -> str:
        """Extract text from PDF bytes.

        Args:
            pdf_bytes: Raw contents of a PDF file.

        Returns:
            The extracted text of every page, each page followed by a
            newline.

        Raises:
            Exception: If the PDF cannot be opened or parsed. The message is
                prefixed with "Error extracting PDF text: " and the
                underlying error is chained as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
            # Join once instead of repeated ``+=``. ``or ""`` is defensive:
            # a page whose extraction yields nothing (or None) contributes
            # only its trailing newline instead of aborting the document.
            return "".join(
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            )
        except Exception as e:
            # Keep the generic exception type callers already catch, but
            # preserve the original traceback via explicit chaining.
            raise Exception(f"Error extracting PDF text: {str(e)}") from e

    @staticmethod
    def compute_hash(content: bytes) -> str:
        """Compute the SHA-256 hash of ``content`` as a hex string."""
        return hashlib.sha256(content).hexdigest()

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 1000,
                   overlap: int = 200) -> List[str]:
        """Split text into overlapping chunks.

        Consecutive chunks start ``chunk_size - overlap`` characters apart,
        so each chunk repeats the last ``overlap`` characters of the previous
        one. Chunks near the end of the text may be shorter than
        ``chunk_size``. Chunks consisting only of whitespace are dropped.

        Args:
            text: The text to split.
            chunk_size: Maximum number of characters per chunk; must be > 0.
            overlap: Number of characters shared by neighbouring chunks;
                must be smaller than ``chunk_size``.

        Returns:
            The list of non-blank chunks, in document order. Empty input
            yields an empty list.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not smaller than ``chunk_size`` (either would prevent the
                window from ever advancing).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")

        # Distance between the starts of two consecutive windows.
        step = chunk_size - overlap
        windows = (
            text[start:start + chunk_size]
            for start in range(0, len(text), step)
        )
        # Only keep non-empty (non-whitespace) chunks.
        return [chunk for chunk in windows if chunk.strip()]
class ChangeDetector:
    """Helpers for detecting changes between two versions of a document."""

    @staticmethod
    def compute_diff(old_text: str, new_text: str) -> Dict:
        """Compute a line-based diff between two text versions.

        Both texts are split on ``'\\n'`` and compared with
        ``difflib.Differ``. Each output line is sorted by its two-character
        prefix, and the prefix is stripped before the line is stored.

        Args:
            old_text: The earlier version of the text.
            new_text: The later version of the text.

        Returns:
            A dict with three lists:
            ``'additions'`` (lines prefixed ``'+ '``),
            ``'deletions'`` (lines prefixed ``'- '``) and
            ``'modifications'`` (the ``'? '`` lines emitted by ``Differ``).
            Unchanged lines are not reported.
        """
        # One bucket per Differ prefix we care about; any other prefix
        # (e.g. unchanged lines) has no bucket and is skipped.
        buckets = {'+ ': [], '- ': [], '? ': []}
        comparison = difflib.Differ().compare(
            old_text.split('\n'),
            new_text.split('\n'),
        )
        for entry in comparison:
            bucket = buckets.get(entry[:2])
            if bucket is not None:
                bucket.append(entry[2:])

        return {
            'additions': buckets['+ '],
            'deletions': buckets['- '],
            'modifications': buckets['? '],
        }

    @staticmethod
    def semantic_change_detection(old_text: str, new_text: str,
                                  embeddings) -> List[Dict]:
        """Detect semantic changes using embeddings (placeholder).

        Both texts are chunked with ``DocumentProcessor.chunk_text`` and the
        chunks are passed to ``embeddings.embed_documents``. The resulting
        vectors are not compared yet, so the returned list is always empty.

        Args:
            old_text: The earlier version of the text.
            new_text: The later version of the text.
            embeddings: Object exposing ``embed_documents(list_of_str)``.

        Returns:
            An empty list. Embedding errors are printed, not raised.
        """
        old_chunks = DocumentProcessor.chunk_text(old_text)
        new_chunks = DocumentProcessor.chunk_text(new_text)
        try:
            # The vectors are currently unused; the calls are kept so the
            # embedding backend is exercised exactly as before.
            embeddings.embed_documents(old_chunks)
            embeddings.embed_documents(new_chunks)
        except Exception as e:
            print(f"Error in semantic change detection: {e}")
        # This is a simplified version - can be enhanced with more
        # sophisticated change detection algorithms.
        return []
class PersistentStorage:
    """Handle persistent storage of a user's uploaded-files metadata.

    Metadata is kept as a single JSON file, ``uploaded_files.json``, inside
    a per-user directory ``./user_data_<user_id>`` relative to the current
    working directory.
    """

    def __init__(self, user_id: str):
        """Create the user's storage directory if it does not exist yet.

        Args:
            user_id: Identifier used to build the storage directory name.
        """
        self.user_id = user_id
        # NOTE(review): user_id is interpolated into the path unchecked;
        # confirm callers only pass trusted identifiers without separators.
        self.storage_dir = Path(f"./user_data_{user_id}")
        self.storage_dir.mkdir(exist_ok=True)
        self.metadata_file = self.storage_dir / "uploaded_files.json"

    def save_metadata(self, metadata: Dict):
        """Save uploaded files metadata.

        The data is serialised before any file is touched and then swapped
        into place with ``os.replace``, so a failed save leaves the previous
        metadata file untouched. Errors are printed, not raised.

        Args:
            metadata: JSON-serialisable mapping to persist.
        """
        try:
            # Serialise first: a non-serialisable value must fail here,
            # before the existing file could be truncated.
            payload = json.dumps(metadata, indent=2)
            tmp_file = self.metadata_file.with_name(
                self.metadata_file.name + ".tmp"
            )
            with open(tmp_file, 'w', encoding='utf-8') as f:
                f.write(payload)
            # Replace the live file in one step so readers never see a
            # half-written document.
            os.replace(tmp_file, self.metadata_file)
        except Exception as e:
            print(f"Error saving metadata: {e}")

    def load_metadata(self) -> Dict:
        """Load uploaded files metadata.

        Returns:
            The parsed JSON content, or an empty dict when the file is
            missing or cannot be read/parsed (the error is printed in the
            latter case).
        """
        try:
            with open(self.metadata_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            # Nothing saved yet: not an error.
            return {}
        except Exception as e:
            print(f"Error loading metadata: {e}")
            return {}

    def clear_metadata(self):
        """Clear all metadata by deleting the metadata file, if present."""
        try:
            self.metadata_file.unlink()
        except FileNotFoundError:
            # Already absent; nothing to clear.
            pass