Spaces:

shahbazdev0
/

VersionRAG

Sleeping

App Files Files Community

VersionRAG / utils.py

shahbazdev0

Upload 9 files

f7db2f9 verified 3 months ago

raw

history blame contribute delete

4.23 kB

	# utils.py - Utility Functions
	import PyPDF2
	import io
	import difflib
	from typing import List, Dict
	import hashlib
	import json
	import os
	from pathlib import Path

	class DocumentProcessor:
	    """Stateless helpers for reading, fingerprinting and chunking documents."""

	    @staticmethod
	    def extract_text_from_pdf(pdf_bytes: bytes) -> str:
	        """Extract the text of every page of an in-memory PDF.

	        Args:
	            pdf_bytes: Raw bytes of the PDF file.

	        Returns:
	            The text of all pages in order, each page followed by a newline.

	        Raises:
	            Exception: If the PDF cannot be parsed or read. The message is
	                prefixed with "Error extracting PDF text: " and the
	                underlying error is chained as ``__cause__``.
	        """
	        try:
	            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
	            # ``or ""`` guards the concatenation in case extract_text()
	            # yields None for a page without extractable text.
	            return "".join(
	                (page.extract_text() or "") + "\n"
	                for page in pdf_reader.pages
	            )
	        except Exception as e:
	            raise Exception(f"Error extracting PDF text: {str(e)}") from e

	    @staticmethod
	    def compute_hash(content: bytes) -> str:
	        """Return the hexadecimal SHA-256 digest of ``content``."""
	        return hashlib.sha256(content).hexdigest()

	    @staticmethod
	    def chunk_text(text: str, chunk_size: int = 1000,
	                   overlap: int = 200) -> List[str]:
	        """Split text into fixed-size windows that overlap their neighbour.

	        Args:
	            text: The text to split.
	            chunk_size: Maximum number of characters per chunk (> 0).
	            overlap: Number of characters shared by consecutive chunks;
	                must satisfy ``0 <= overlap < chunk_size``.

	        Returns:
	            The chunks in document order. Whitespace-only chunks are
	            dropped. The last chunk is the first window that reaches the
	            end of the text, so no chunk is a pure repeat of the previous
	            chunk's tail.

	        Raises:
	            ValueError: If ``chunk_size`` or ``overlap`` would keep the
	                window from advancing (previously an endless loop).
	        """
	        if chunk_size <= 0:
	            raise ValueError("chunk_size must be a positive integer")
	        if overlap < 0 or overlap >= chunk_size:
	            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

	        step = chunk_size - overlap
	        chunks = []
	        for start in range(0, len(text), step):
	            end = start + chunk_size
	            chunk = text[start:end]
	            if chunk.strip():  # Only add non-empty chunks
	                chunks.append(chunk)
	            if end >= len(text):
	                # This window already covers the tail of the text; any
	                # further window would only duplicate part of it.
	                break

	        return chunks

	class ChangeDetector:
	    """Helpers for comparing two versions of a document's text."""

	    @staticmethod
	    def compute_diff(old_text: str, new_text: str) -> Dict:
	        """Produce a line-based diff of two texts with ``difflib.Differ``.

	        Both texts are split on ``'\\n'`` and compared line by line.

	        Returns:
	            A dict with three lists, each holding diff lines with the
	            two-character marker removed:
	            ``'additions'`` (``'+ '`` lines), ``'deletions'`` (``'- '``
	            lines) and ``'modifications'`` (``'? '`` lines emitted by
	            ``Differ``). Lines common to both texts are not reported.
	        """
	        # Map each Differ marker we care about to the list it feeds.
	        buckets = {'+ ': [], '- ': [], '? ': []}

	        comparison = difflib.Differ().compare(
	            old_text.split('\n'), new_text.split('\n')
	        )
	        for entry in comparison:
	            target = buckets.get(entry[:2])
	            if target is not None:
	                target.append(entry[2:])

	        return {
	            'additions': buckets['+ '],
	            'deletions': buckets['- '],
	            'modifications': buckets['? '],
	        }

	    @staticmethod
	    def semantic_change_detection(old_text: str, new_text: str,
	                                  embeddings) -> List[Dict]:
	        """Placeholder for embedding-based change detection.

	        Both texts are chunked with ``DocumentProcessor.chunk_text`` and
	        each chunk list is passed to ``embeddings.embed_documents``. The
	        resulting vectors are not used yet, so the result is always an
	        empty list.

	        Args:
	            old_text: Text of the earlier version.
	            new_text: Text of the later version.
	            embeddings: Object providing ``embed_documents(list_of_str)``.

	        Returns:
	            An empty list, both on success and when embedding fails (the
	            failure is printed rather than raised).
	        """
	        chunk_sets = (
	            DocumentProcessor.chunk_text(old_text),
	            DocumentProcessor.chunk_text(new_text),
	        )

	        try:
	            # Old version first, then new.
	            for chunk_set in chunk_sets:
	                embeddings.embed_documents(chunk_set)
	        except Exception as err:
	            print(f"Error in semantic change detection: {err}")
	            return []

	        # This is a simplified version - can be enhanced with
	        # more sophisticated change detection algorithms
	        return []

	class PersistentStorage:
	    """Persist per-user upload metadata as a JSON file on local disk."""

	    def __init__(self, user_id: str):
	        """Create the user's storage directory if it does not exist yet.

	        Args:
	            user_id: Identifier used to name the storage directory.
	        """
	        self.user_id = user_id
	        # NOTE(review): user_id is interpolated into the path as-is, so a
	        # value containing path separators or ".." lands outside the
	        # intended directory - confirm callers only pass trusted ids.
	        self.storage_dir = Path(f"./user_data_{user_id}")
	        self.storage_dir.mkdir(exist_ok=True)
	        # JSON file holding the metadata dict for this user.
	        self.metadata_file = self.storage_dir / "uploaded_files.json"

	    def save_metadata(self, metadata: Dict):
	        """Save uploaded files metadata, replacing any previous file.

	        The data is serialised first and written to a sibling temporary
	        file that is then moved over the real one, so a failure (for
	        example a non-JSON-serialisable value) leaves the previously saved
	        metadata intact. Errors are printed, never raised.

	        Args:
	            metadata: JSON-serialisable mapping to store.
	        """
	        tmp_path = self.metadata_file.with_name(
	            self.metadata_file.name + ".tmp"
	        )
	        try:
	            # Serialise before touching the disk: opening the target with
	            # 'w' would truncate it even if encoding fails afterwards.
	            payload = json.dumps(metadata, indent=2)
	            with open(tmp_path, 'w', encoding='utf-8') as f:
	                f.write(payload)
	            os.replace(tmp_path, self.metadata_file)
	        except Exception as e:
	            print(f"Error saving metadata: {e}")
	            # Best-effort removal of a partially written temp file; it
	            # may not exist if serialisation itself failed.
	            try:
	                tmp_path.unlink()
	            except OSError:
	                pass

	    def load_metadata(self) -> Dict:
	        """Load uploaded files metadata.

	        Returns:
	            The parsed JSON content, or ``{}`` when the file is missing or
	            cannot be read or parsed (the error is printed in that case).
	        """
	        try:
	            with open(self.metadata_file, 'r', encoding='utf-8') as f:
	                return json.load(f)
	        except FileNotFoundError:
	            # Nothing saved yet - not an error.
	            return {}
	        except Exception as e:
	            print(f"Error loading metadata: {e}")
	            return {}

	    def clear_metadata(self):
	        """Delete the metadata file; a missing file is not an error."""
	        try:
	            self.metadata_file.unlink()
	        except FileNotFoundError:
	            pass