|
|
""" |
|
|
Base IO Classes for Document Intelligence |
|
|
|
|
|
Abstract interfaces for document loading and page rendering. |
|
|
""" |
|
|
|
|
|
from abc import ABC, abstractmethod |
|
|
from dataclasses import dataclass, field |
|
|
from enum import Enum |
|
|
from pathlib import Path |
|
|
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union |
|
|
|
|
|
import numpy as np |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
class DocumentFormat(str, Enum):
    """Document container types distinguished by the IO layer.

    Members are also ``str`` instances, so each compares equal to its
    lowercase string value (e.g. ``DocumentFormat.PDF == "pdf"``).
    """

    PDF = "pdf"
    IMAGE = "image"
    TIFF_MULTIPAGE = "tiff_multipage"
    UNKNOWN = "unknown"

    @classmethod
    def from_path(cls, path: Union[str, Path]) -> "DocumentFormat":
        """Guess the document format from a file name's extension.

        Only the final suffix of ``path`` is examined, case-insensitively;
        the file itself is never opened.

        Args:
            path: File path as a string or ``Path``.

        Returns:
            The matching member, or ``UNKNOWN`` for unrecognised or
            missing extensions.
        """
        extension = Path(path).suffix.lower()

        by_extension = {
            ".pdf": cls.PDF,
            ".jpg": cls.IMAGE,
            ".jpeg": cls.IMAGE,
            ".png": cls.IMAGE,
            ".bmp": cls.IMAGE,
            ".gif": cls.IMAGE,
            ".webp": cls.IMAGE,
            # Every TIFF is reported as multipage; the page count is not
            # inspected here.
            ".tif": cls.TIFF_MULTIPAGE,
            ".tiff": cls.TIFF_MULTIPAGE,
        }
        return by_extension.get(extension, cls.UNKNOWN)
|
|
|
|
|
|
|
|
@dataclass
class PageInfo:
    """Per-page metadata record.

    The first three fields are required; all others have defaults.
    """

    # Position of the page within its document.
    # NOTE(review): presumably 1-indexed like the renderer APIs - confirm.
    page_number: int

    # Page size in pixels.
    width_pixels: int
    height_pixels: int

    # Page size in points; None when not provided.
    width_points: Optional[float] = None
    height_points: Optional[float] = None

    # Resolution associated with the pixel dimensions (defaults to 72).
    dpi: int = 72

    # Page rotation (defaults to 0).
    # NOTE(review): unit is presumably degrees - confirm with producers.
    rotation: int = 0

    # Content flags; both default to False.
    has_text: bool = False
    has_images: bool = False
|
|
|
|
|
|
|
|
@dataclass
class DocumentInfo:
    """Document-level metadata record.

    ``path``, ``format`` and ``num_pages`` are required; every other
    field has a default.
    """

    # --- Identity -------------------------------------------------------
    path: Path
    format: DocumentFormat
    num_pages: int
    # Per-page records; each instance gets its own fresh list.
    pages: List[PageInfo] = field(default_factory=list)

    # --- Descriptive metadata (None when unavailable) -------------------
    title: Optional[str] = None
    author: Optional[str] = None
    subject: Optional[str] = None
    creator: Optional[str] = None
    creation_date: Optional[str] = None
    modification_date: Optional[str] = None

    # --- File properties ------------------------------------------------
    file_size_bytes: int = 0
    is_encrypted: bool = False
    is_digitally_signed: bool = False

    # --- Content characteristics ----------------------------------------
    has_text_layer: bool = False
    is_scanned: bool = False
    has_forms: bool = False
    has_annotations: bool = False

    @property
    def doc_id(self) -> str:
        """Return a short deterministic identifier for this document.

        The ID is the first 16 hex characters of the SHA-256 digest of
        ``"<file name>_<file_size_bytes>_<num_pages>"``. Only the final
        path component is used, so the ID does not depend on the
        directory; files sharing name, size and page count share an ID.
        """
        from hashlib import sha256

        fingerprint = f"{self.path.name}_{self.file_size_bytes}_{self.num_pages}"
        digest = sha256(fingerprint.encode()).hexdigest()
        return digest[:16]
|
|
|
|
|
|
|
|
@dataclass
class RenderOptions:
    """Settings passed to renderers when rasterising a page.

    Every field has a default, so ``RenderOptions()`` is a valid
    configuration.
    """

    # Target resolution in dots per inch (defaults to 200).
    dpi: int = 200

    # Output colour mode name (defaults to "RGB").
    # NOTE(review): presumably a PIL-style mode string - confirm.
    color_mode: str = "RGB"

    # Background colour as a tuple of ints; defaults to (255, 255, 255).
    background_color: Tuple[int, ...] = (255, 255, 255)

    # Rendering toggles; all default to True.
    antialias: bool = True
    include_annotations: bool = True
    include_forms: bool = True
|
|
|
|
|
|
|
|
class DocumentLoader(ABC):
    """
    Interface for objects that open documents and expose their metadata.

    Concrete loaders implement ``load``, ``close``, ``is_loaded`` and the
    ``info`` property. Instances work as context managers: leaving the
    ``with`` block always calls ``close``.
    """

    @abstractmethod
    def load(self, path: Union[str, Path]) -> DocumentInfo:
        """
        Open the document at ``path`` and collect its metadata.

        Args:
            path: Location of the document file

        Returns:
            DocumentInfo describing the document
        """
        ...

    @abstractmethod
    def close(self) -> None:
        """Close the document and free any resources held for it."""
        ...

    @abstractmethod
    def is_loaded(self) -> bool:
        """Report whether a document is currently open."""
        ...

    @property
    @abstractmethod
    def info(self) -> Optional[DocumentInfo]:
        """Metadata of the currently loaded document."""
        ...

    def __enter__(self):
        # The loader itself is the managed resource.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        # Returning False lets any exception from the with-body propagate.
        return False
|
|
|
|
|
|
|
|
class PageRenderer(ABC):
    """
    Abstract base class for page rendering.

    Converts document pages to images for processing. Subclasses must
    implement ``render_page``; ``render_pages`` and ``render_region`` are
    built on top of it.
    """

    @abstractmethod
    def render_page(
        self,
        page_number: int,
        options: Optional[RenderOptions] = None
    ) -> np.ndarray:
        """
        Render a single page to an image.

        Args:
            page_number: 1-indexed page number
            options: Rendering options

        Returns:
            Page image as numpy array (H, W, C)
        """
        ...

    def render_pages(
        self,
        page_numbers: Optional[List[int]] = None,
        options: Optional[RenderOptions] = None
    ) -> Iterator[Tuple[int, np.ndarray]]:
        """
        Render multiple pages lazily.

        The base class does not know how many pages a document has, so it
        cannot expand ``page_numbers=None`` into "all pages"; subclasses
        that support that must override this method.

        Args:
            page_numbers: List of 1-indexed page numbers (None = all pages,
                supported only by subclasses that override this method)
            options: Rendering options

        Returns:
            Iterator of (page_number, image_array) tuples; each page is
            rendered only when the iterator reaches it.

        Raises:
            NotImplementedError: If ``page_numbers`` is None. Raised when
                this method is called, not when the result is iterated.
        """
        # Validate eagerly. This is deliberately not a generator function,
        # otherwise the error would be deferred until the first next().
        if page_numbers is None:
            raise NotImplementedError("Subclass must provide page iteration")

        return (
            (page_num, self.render_page(page_num, options))
            for page_num in page_numbers
        )

    def render_region(
        self,
        page_number: int,
        region: Tuple[float, float, float, float],
        options: Optional[RenderOptions] = None,
        normalized: bool = True
    ) -> np.ndarray:
        """
        Render a specific region of a page.

        The whole page is rendered through ``render_page`` and then cropped.

        Args:
            page_number: 1-indexed page number
            region: (x_min, y_min, x_max, y_max) coordinates
            options: Rendering options
            normalized: Whether coordinates are normalized (0-1); when
                False they are taken as pixel coordinates

        Returns:
            Region image as numpy array. It is a slice of the rendered
            page, and is empty along an axis whose clamped max is not
            greater than its min.
        """
        full_page = self.render_page(page_number, options)
        h, w = full_page.shape[:2]

        x_min, y_min, x_max, y_max = region
        if normalized:
            # Scale fractions to pixels; int() truncates toward zero.
            x_min, x_max = int(x_min * w), int(x_max * w)
            y_min, y_max = int(y_min * h), int(y_max * h)
        else:
            x_min, y_min = int(x_min), int(y_min)
            x_max, y_max = int(x_max), int(y_max)

        # Clamp to the page so out-of-range input cannot wrap around via
        # negative indices or exceed the image bounds.
        x_min = max(0, min(x_min, w))
        x_max = max(0, min(x_max, w))
        y_min = max(0, min(y_min, h))
        y_max = max(0, min(y_max, h))

        return full_page[y_min:y_max, x_min:x_max]
|
|
|
|
|
|
|
|
class DocumentProcessor(ABC):
    """
    Pairs a document loader with a page renderer.

    Holds both collaborators so that subclasses can implement the
    load-then-render pipeline in ``process``.
    """

    def __init__(self, loader: DocumentLoader, renderer: PageRenderer):
        # Collaborators are stored as given; nothing is opened here.
        self.loader, self.renderer = loader, renderer

    @abstractmethod
    def process(
        self,
        path: Union[str, Path],
        options: Optional[RenderOptions] = None,
        page_range: Optional[Tuple[int, int]] = None
    ) -> Iterator[Tuple[int, np.ndarray, PageInfo]]:
        """
        Load a document and render its pages.

        Args:
            path: Location of the document
            options: Rendering options
            page_range: Optional (start, end) page range (1-indexed, inclusive)

        Yields:
            Tuples of (page_number, image, page_info)
        """
        ...
|
|
|