""" Document Loader Loads and renders PDF and image documents for processing. Supports page-by-page rendering with configurable DPI. """ import os import hashlib from pathlib import Path from typing import List, Tuple, Optional, Union, BinaryIO from dataclasses import dataclass from loguru import logger import numpy as np from PIL import Image # PDF support via PyMuPDF (fitz) try: import fitz # PyMuPDF HAS_PYMUPDF = True except ImportError: HAS_PYMUPDF = False logger.warning("PyMuPDF not installed. PDF support disabled. Install with: pip install pymupdf") # Alternative PDF support via pdf2image try: from pdf2image import convert_from_path, convert_from_bytes HAS_PDF2IMAGE = True except ImportError: HAS_PDF2IMAGE = False @dataclass class PageInfo: """Information about a document page.""" page_number: int width: int height: int dpi: int has_text: bool = False rotation: int = 0 @dataclass class LoadedDocument: """ A loaded document ready for processing. """ document_id: str source_path: str filename: str file_type: str file_size_bytes: int num_pages: int pages_info: List[PageInfo] # Raw document handle (for lazy page rendering) _doc_handle: Optional[object] = None def get_page_image(self, page_number: int, dpi: int = 300) -> np.ndarray: """Render a specific page as an image.""" raise NotImplementedError("Subclasses must implement get_page_image") def close(self): """Close document handle and free resources.""" pass class PDFDocument(LoadedDocument): """Loaded PDF document with PyMuPDF backend.""" def get_page_image(self, page_number: int, dpi: int = 300) -> np.ndarray: """Render PDF page as numpy array.""" if not HAS_PYMUPDF or self._doc_handle is None: raise RuntimeError("PyMuPDF not available or document not loaded") if page_number < 0 or page_number >= self.num_pages: raise ValueError(f"Page {page_number} out of range (0-{self.num_pages - 1})") doc = self._doc_handle page = doc[page_number] # Calculate zoom factor for desired DPI zoom = dpi / 72.0 matrix = fitz.Matrix(zoom, zoom) # Render page to pixmap pixmap = page.get_pixmap(matrix=matrix, alpha=False) # Convert to numpy array img_array = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape( pixmap.height, pixmap.width, 3 ) return img_array def get_page_text(self, page_number: int) -> str: """Extract text from PDF page using PyMuPDF.""" if not HAS_PYMUPDF or self._doc_handle is None: return "" if page_number < 0 or page_number >= self.num_pages: return "" page = self._doc_handle[page_number] return page.get_text() def close(self): """Close PDF document.""" if self._doc_handle is not None: self._doc_handle.close() self._doc_handle = None class ImageDocument(LoadedDocument): """Loaded image document (single page).""" _image: Optional[np.ndarray] = None def get_page_image(self, page_number: int = 0, dpi: int = 300) -> np.ndarray: """Return the image (images are single-page).""" if page_number != 0: raise ValueError("Image documents have only one page (page 0)") if self._image is None: # Load image with Image.open(self.source_path) as img: if img.mode != "RGB": img = img.convert("RGB") self._image = np.array(img) return self._image def close(self): """Clear image from memory.""" self._image = None class DocumentLoader: """ Document loader with support for PDF and image files. """ SUPPORTED_EXTENSIONS = { ".pdf": "pdf", ".png": "image", ".jpg": "image", ".jpeg": "image", ".tiff": "image", ".tif": "image", ".bmp": "image", ".webp": "image", } def __init__(self, default_dpi: int = 300, cache_enabled: bool = True): """ Initialize document loader. Args: default_dpi: Default DPI for PDF rendering cache_enabled: Whether to cache rendered pages """ self.default_dpi = default_dpi self.cache_enabled = cache_enabled # Check available backends if not HAS_PYMUPDF and not HAS_PDF2IMAGE: logger.warning("No PDF backend available. PDF loading will fail.") def load( self, source: Union[str, Path, BinaryIO], document_id: Optional[str] = None, ) -> LoadedDocument: """ Load a document from file path or file object. Args: source: File path or file-like object document_id: Optional document ID (generated from hash if not provided) Returns: LoadedDocument instance """ # Handle file path if isinstance(source, (str, Path)): path = Path(source) if not path.exists(): raise FileNotFoundError(f"Document not found: {path}") source_path = str(path.absolute()) filename = path.name file_size = path.stat().st_size ext = path.suffix.lower() # Generate document ID from file hash if not provided if document_id is None: document_id = self._generate_doc_id(source_path) else: raise ValueError("File-like objects not yet supported. Please provide a file path.") # Determine file type if ext not in self.SUPPORTED_EXTENSIONS: raise ValueError(f"Unsupported file type: {ext}") file_type = self.SUPPORTED_EXTENSIONS[ext] # Load based on type if file_type == "pdf": return self._load_pdf(source_path, filename, file_size, document_id) else: return self._load_image(source_path, filename, file_size, document_id) def _load_pdf( self, source_path: str, filename: str, file_size: int, document_id: str, ) -> PDFDocument: """Load a PDF document.""" if not HAS_PYMUPDF: raise RuntimeError("PyMuPDF required for PDF loading. Install with: pip install pymupdf") logger.info(f"Loading PDF: {filename}") doc = fitz.open(source_path) num_pages = len(doc) # Collect page info pages_info = [] for i in range(num_pages): page = doc[i] rect = page.rect has_text = len(page.get_text().strip()) > 0 pages_info.append(PageInfo( page_number=i, width=int(rect.width), height=int(rect.height), dpi=72, # PDF native resolution has_text=has_text, rotation=page.rotation, )) return PDFDocument( document_id=document_id, source_path=source_path, filename=filename, file_type="pdf", file_size_bytes=file_size, num_pages=num_pages, pages_info=pages_info, _doc_handle=doc, ) def _load_image( self, source_path: str, filename: str, file_size: int, document_id: str, ) -> ImageDocument: """Load an image document.""" logger.info(f"Loading image: {filename}") with Image.open(source_path) as img: width, height = img.size pages_info = [PageInfo( page_number=0, width=width, height=height, dpi=self.default_dpi, has_text=False, )] return ImageDocument( document_id=document_id, source_path=source_path, filename=filename, file_type="image", file_size_bytes=file_size, num_pages=1, pages_info=pages_info, ) def _generate_doc_id(self, source_path: str) -> str: """Generate document ID from file path and modification time.""" stat = os.stat(source_path) content = f"{source_path}:{stat.st_mtime}:{stat.st_size}" return hashlib.sha256(content.encode()).hexdigest()[:16] # Module-level convenience functions _default_loader: Optional[DocumentLoader] = None def get_loader() -> DocumentLoader: """Get or create the default document loader.""" global _default_loader if _default_loader is None: _default_loader = DocumentLoader() return _default_loader def load_document( source: Union[str, Path, BinaryIO], document_id: Optional[str] = None, ) -> LoadedDocument: """Load a document using the default loader.""" return get_loader().load(source, document_id) def load_pdf(source: Union[str, Path], document_id: Optional[str] = None) -> PDFDocument: """Load a PDF document.""" doc = load_document(source, document_id) if not isinstance(doc, PDFDocument): raise ValueError(f"Expected PDF, got {doc.file_type}") return doc def load_image(source: Union[str, Path], document_id: Optional[str] = None) -> ImageDocument: """Load an image document.""" doc = load_document(source, document_id) if not isinstance(doc, ImageDocument): raise ValueError(f"Expected image, got {doc.file_type}") return doc def render_page( document: LoadedDocument, page_number: int, dpi: int = 300, ) -> np.ndarray: """Render a document page as a numpy array.""" return document.get_page_image(page_number, dpi)