""" Base IO Classes for Document Intelligence Abstract interfaces for document loading and page rendering. """ from abc import ABC, abstractmethod from dataclasses import dataclass, field from enum import Enum from pathlib import Path from typing import Any, Dict, Iterator, List, Optional, Tuple, Union import numpy as np from PIL import Image class DocumentFormat(str, Enum): """Supported document formats.""" PDF = "pdf" IMAGE = "image" # JPEG, PNG, TIFF, etc. TIFF_MULTIPAGE = "tiff_multipage" UNKNOWN = "unknown" @classmethod def from_path(cls, path: Union[str, Path]) -> "DocumentFormat": """Detect format from file path.""" path = Path(path) suffix = path.suffix.lower() if suffix == ".pdf": return cls.PDF elif suffix in {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp"}: return cls.IMAGE elif suffix in {".tif", ".tiff"}: # Could be single or multipage return cls.TIFF_MULTIPAGE else: return cls.UNKNOWN @dataclass class PageInfo: """Information about a document page.""" page_number: int # 1-indexed width_pixels: int height_pixels: int width_points: Optional[float] = None # PDF points (1/72 inch) height_points: Optional[float] = None dpi: int = 72 rotation: int = 0 # Degrees (0, 90, 180, 270) has_text: bool = False has_images: bool = False @dataclass class DocumentInfo: """Metadata about a loaded document.""" path: Path format: DocumentFormat num_pages: int pages: List[PageInfo] = field(default_factory=list) # Document metadata title: Optional[str] = None author: Optional[str] = None subject: Optional[str] = None creator: Optional[str] = None creation_date: Optional[str] = None modification_date: Optional[str] = None # File info file_size_bytes: int = 0 is_encrypted: bool = False is_digitally_signed: bool = False # Content flags has_text_layer: bool = False is_scanned: bool = False has_forms: bool = False has_annotations: bool = False @property def doc_id(self) -> str: """Generate a stable document ID from path and size.""" import hashlib content = f"{self.path.name}_{self.file_size_bytes}_{self.num_pages}" return hashlib.sha256(content.encode()).hexdigest()[:16] @dataclass class RenderOptions: """Options for page rendering.""" dpi: int = 200 color_mode: str = "RGB" # "RGB", "L" (grayscale), "RGBA" background_color: Tuple[int, ...] = (255, 255, 255) # White antialias: bool = True include_annotations: bool = True include_forms: bool = True class DocumentLoader(ABC): """ Abstract base class for document loaders. Handles opening documents and extracting metadata. """ @abstractmethod def load(self, path: Union[str, Path]) -> DocumentInfo: """ Load a document and extract metadata. Args: path: Path to the document file Returns: DocumentInfo with document metadata """ pass @abstractmethod def close(self) -> None: """Release resources and close the document.""" pass @abstractmethod def is_loaded(self) -> bool: """Check if a document is currently loaded.""" pass @property @abstractmethod def info(self) -> Optional[DocumentInfo]: """Get information about the loaded document.""" pass def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() return False class PageRenderer(ABC): """ Abstract base class for page rendering. Converts document pages to images for processing. """ @abstractmethod def render_page( self, page_number: int, options: Optional[RenderOptions] = None ) -> np.ndarray: """ Render a single page to an image. Args: page_number: 1-indexed page number options: Rendering options Returns: Page image as numpy array (H, W, C) """ pass def render_pages( self, page_numbers: Optional[List[int]] = None, options: Optional[RenderOptions] = None ) -> Iterator[Tuple[int, np.ndarray]]: """ Render multiple pages. Args: page_numbers: List of 1-indexed page numbers (None = all pages) options: Rendering options Yields: Tuples of (page_number, image_array) """ if page_numbers is None: # Subclasses should override to provide total pages raise NotImplementedError("Subclass must provide page iteration") for page_num in page_numbers: yield page_num, self.render_page(page_num, options) def render_region( self, page_number: int, region: Tuple[float, float, float, float], options: Optional[RenderOptions] = None, normalized: bool = True ) -> np.ndarray: """ Render a specific region of a page. Args: page_number: 1-indexed page number region: (x_min, y_min, x_max, y_max) coordinates options: Rendering options normalized: Whether coordinates are normalized (0-1) Returns: Region image as numpy array """ # Default: render full page and crop full_page = self.render_page(page_number, options) h, w = full_page.shape[:2] x_min, y_min, x_max, y_max = region if normalized: x_min, x_max = int(x_min * w), int(x_max * w) y_min, y_max = int(y_min * h), int(y_max * h) else: x_min, y_min = int(x_min), int(y_min) x_max, y_max = int(x_max), int(y_max) # Clip to valid range x_min = max(0, min(x_min, w)) x_max = max(0, min(x_max, w)) y_min = max(0, min(y_min, h)) y_max = max(0, min(y_max, h)) return full_page[y_min:y_max, x_min:x_max] class DocumentProcessor(ABC): """ Combined document loader and renderer. Convenience class that combines loading and rendering. """ def __init__(self, loader: DocumentLoader, renderer: PageRenderer): self.loader = loader self.renderer = renderer @abstractmethod def process( self, path: Union[str, Path], options: Optional[RenderOptions] = None, page_range: Optional[Tuple[int, int]] = None ) -> Iterator[Tuple[int, np.ndarray, PageInfo]]: """ Load and render document pages. Args: path: Document path options: Rendering options page_range: Optional (start, end) page range (1-indexed, inclusive) Yields: Tuples of (page_number, image, page_info) """ pass