# Spaces:
#
# shahbazdev0
# /
#
# VersionRAG
#
# Sleeping
#
# File size: 4,230 Bytes
#
# f7db2f9

# utils.py - Utility Functions
import PyPDF2
import io
import difflib
from typing import List, Dict
import hashlib
import json
import os
from pathlib import Path

class DocumentProcessor:
    """Stateless helpers for turning uploaded documents into text chunks."""

    @staticmethod
    def extract_text_from_pdf(pdf_bytes: bytes) -> str:
        """Extract the text of every page of a PDF.

        Args:
            pdf_bytes: Raw content of the PDF file.

        Returns:
            The text of all pages in page order, each page followed by a
            newline.

        Raises:
            Exception: If the PDF cannot be read or parsed. The underlying
                error is chained as ``__cause__``.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
            # `or ""` keeps a page with no extractable text from breaking the
            # whole document (assumes extract_text() may return None on some
            # PyPDF2 releases - TODO confirm against the pinned version).
            return "".join(
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            )
        except Exception as e:
            raise Exception(f"Error extracting PDF text: {str(e)}") from e

    @staticmethod
    def compute_hash(content: bytes) -> str:
        """Return the SHA-256 digest of ``content`` as a hex string."""
        return hashlib.sha256(content).hexdigest()

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 1000,
                   overlap: int = 200) -> List[str]:
        """Split text into fixed-size chunks that overlap their neighbours.

        Args:
            text: The text to split.
            chunk_size: Maximum number of characters per chunk; must be
                positive.
            overlap: Number of characters each chunk shares with the
                previous one; must be smaller than ``chunk_size``.

        Returns:
            The chunks in document order. Whitespace-only chunks are
            omitted; an empty ``text`` yields an empty list.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not smaller than ``chunk_size``. Either case would keep the
                window from ever advancing.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")

        chunks = []
        text_length = len(text)
        start = 0

        while start < text_length:
            end = start + chunk_size
            chunk = text[start:end]
            if chunk.strip():  # Only add non-empty chunks
                chunks.append(chunk)
            if end >= text_length:
                # This chunk already reaches the end of the text; stepping
                # back by `overlap` would only re-emit a suffix of it.
                break
            start = end - overlap

        return chunks

class ChangeDetector:
    """Utilities for comparing two versions of a document."""

    @staticmethod
    def compute_diff(old_text: str, new_text: str) -> Dict:
        """Run a line-by-line diff of two texts with ``difflib.Differ``.

        Both texts are split on ``'\\n'`` and compared. Each output line of
        the differ is sorted by its two-character prefix, and the prefix is
        stripped before the line is stored.

        Returns:
            A dict with three lists, each in differ output order:
            ``'additions'`` (``'+ '`` lines), ``'deletions'`` (``'- '``
            lines) and ``'modifications'`` (``'? '`` lines, which are the
            differ's own intraline marker lines rather than text from
            either input).
        """
        # One bucket per Differ prefix we care about; unchanged lines
        # (prefix '  ') match no bucket and are dropped.
        buckets = {'+ ': [], '- ': [], '? ': []}

        comparison = difflib.Differ().compare(
            old_text.split('\n'),
            new_text.split('\n'),
        )
        for entry in comparison:
            bucket = buckets.get(entry[:2])
            if bucket is not None:
                bucket.append(entry[2:])

        return {
            'additions': buckets['+ '],
            'deletions': buckets['- '],
            'modifications': buckets['? '],
        }

    @staticmethod
    def semantic_change_detection(old_text: str, new_text: str,
                                 embeddings) -> List[Dict]:
        """Placeholder for embedding-based change detection.

        Chunks both texts and asks ``embeddings.embed_documents`` to embed
        each set of chunks, but does not compare the results yet.

        Args:
            old_text: Previous version of the document.
            new_text: New version of the document.
            embeddings: Object exposing ``embed_documents(list_of_str)``.

        Returns:
            Always an empty list. If embedding raises, the error is printed
            and an empty list is returned as well.
        """
        previous_chunks = DocumentProcessor.chunk_text(old_text)
        current_chunks = DocumentProcessor.chunk_text(new_text)

        try:
            # The vectors are requested but intentionally unused for now.
            embeddings.embed_documents(previous_chunks)
            embeddings.embed_documents(current_chunks)
        except Exception as e:
            print(f"Error in semantic change detection: {e}")
            return []

        # Simplified version: no comparison algorithm is implemented, so no
        # changes are ever reported.
        return []

class PersistentStorage:
    """Persist one user's uploaded-file metadata as a JSON file on disk."""

    def __init__(self, user_id: str):
        """Create the user's storage directory and record the file path.

        Args:
            user_id: Identifier used to name the storage directory
                (``./user_data_<user_id>``, relative to the working
                directory).
        """
        self.user_id = user_id
        # NOTE(review): user_id is interpolated into the path unmodified;
        # confirm callers never pass untrusted values containing path
        # separators.
        self.storage_dir = Path(f"./user_data_{user_id}")
        self.storage_dir.mkdir(exist_ok=True)
        self.metadata_file = self.storage_dir / "uploaded_files.json"

    def save_metadata(self, metadata: Dict):
        """Write ``metadata`` to the metadata file as indented JSON.

        Best-effort: any failure is printed, not raised. The data is
        serialized before the file is opened, so metadata that cannot be
        serialized leaves the previously saved file untouched.
        """
        try:
            # Serialize first: opening with 'w' truncates the file, so a
            # serialization error after that point would destroy the old
            # contents.
            payload = json.dumps(metadata, indent=2)
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                f.write(payload)
        except Exception as e:
            print(f"Error saving metadata: {e}")

    def load_metadata(self) -> Dict:
        """Read the metadata file.

        Returns:
            The parsed JSON content, or an empty dict when the file does
            not exist or cannot be read/parsed (the error is printed in the
            latter case).
        """
        try:
            with open(self.metadata_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            # Nothing has been saved yet (or it was cleared).
            return {}
        except Exception as e:
            print(f"Error loading metadata: {e}")
            return {}

    def clear_metadata(self):
        """Delete the metadata file; do nothing if it does not exist."""
        try:
            self.metadata_file.unlink()
        except FileNotFoundError:
            # Already gone, possibly removed between two calls.
            pass