# Provenance (web-page residue): author MHamdan.
# Commit: "Initial commit: SPARKNET framework" (d520909)
"""
Base IO Classes for Document Intelligence
Abstract interfaces for document loading and page rendering.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
import numpy as np
from PIL import Image
class DocumentFormat(str, Enum):
    """Kinds of document distinguished by the IO layer.

    Every member is also a ``str``, so it compares equal to its lowercase
    string value (for example ``DocumentFormat.PDF == "pdf"``).
    """

    PDF = "pdf"
    IMAGE = "image"  # raster files such as JPEG, PNG, BMP, GIF, WebP
    TIFF_MULTIPAGE = "tiff_multipage"
    UNKNOWN = "unknown"

    @classmethod
    def from_path(cls, path: Union[str, Path]) -> "DocumentFormat":
        """Guess the format from the file extension alone.

        The file is never opened. Only the final suffix is inspected and the
        comparison is case-insensitive.

        Args:
            path: File name or path to classify.

        Returns:
            The member matching the extension, or ``UNKNOWN`` when the
            extension is missing or not recognised.
        """
        extension_to_format = {
            ".pdf": cls.PDF,
            ".jpg": cls.IMAGE,
            ".jpeg": cls.IMAGE,
            ".png": cls.IMAGE,
            ".bmp": cls.IMAGE,
            ".gif": cls.IMAGE,
            ".webp": cls.IMAGE,
            # A TIFF may hold one frame or many; the extension cannot tell,
            # so every TIFF is reported as the multipage variant.
            ".tif": cls.TIFF_MULTIPAGE,
            ".tiff": cls.TIFF_MULTIPAGE,
        }
        extension = Path(path).suffix.lower()
        return extension_to_format.get(extension, cls.UNKNOWN)
@dataclass
class PageInfo:
    """Per-page description of a document.

    Attributes:
        page_number: Position of the page in the document, counted from 1.
        width_pixels: Page width in pixels.
        height_pixels: Page height in pixels.
        width_points: Page width in PDF points (1/72 inch), when known.
        height_points: Page height in PDF points (1/72 inch), when known.
        dpi: Resolution associated with the pixel dimensions.
        rotation: Page rotation in degrees (0, 90, 180 or 270).
        has_text: Whether the page carries text.
        has_images: Whether the page carries images.
    """

    # Required fields (positional order is part of the constructor contract).
    page_number: int
    width_pixels: int
    height_pixels: int

    # Physical size; left as None when no point dimensions are available.
    width_points: Optional[float] = None
    height_points: Optional[float] = None

    dpi: int = 72
    rotation: int = 0

    # Content flags.
    has_text: bool = False
    has_images: bool = False
@dataclass
class DocumentInfo:
    """Metadata about a loaded document.

    Only ``path``, ``format`` and ``num_pages`` are required; every other
    field has a neutral default that a loader may fill in.
    """

    path: Path
    format: DocumentFormat
    num_pages: int
    pages: List[PageInfo] = field(default_factory=list)

    # Document metadata
    title: Optional[str] = None
    author: Optional[str] = None
    subject: Optional[str] = None
    creator: Optional[str] = None
    creation_date: Optional[str] = None
    modification_date: Optional[str] = None

    # File info
    file_size_bytes: int = 0
    is_encrypted: bool = False
    is_digitally_signed: bool = False

    # Content flags
    has_text_layer: bool = False
    is_scanned: bool = False
    has_forms: bool = False
    has_annotations: bool = False

    @property
    def doc_id(self) -> str:
        """Return a short, deterministic identifier for the document.

        The identifier is the first 16 hex characters of the SHA-256 digest
        of ``"<file name>_<file_size_bytes>_<num_pages>"``. Only the final
        path component is used, so the containing directory does not affect
        the result. Two different files that share name, size and page
        count therefore receive the same identifier.

        Returns:
            A 16-character lowercase hexadecimal string.
        """
        import hashlib

        # The dataclass does not enforce the ``Path`` annotation, so coerce
        # here; a plain string path would otherwise fail on ``.name``.
        file_name = Path(self.path).name
        fingerprint = f"{file_name}_{self.file_size_bytes}_{self.num_pages}"
        return hashlib.sha256(fingerprint.encode()).hexdigest()[:16]
@dataclass
class RenderOptions:
    """Settings passed to a renderer when rasterizing a page.

    This class only stores the values; how each one is honoured is up to
    the concrete renderer.

    Attributes:
        dpi: Requested rendering resolution.
        color_mode: Output mode name: "RGB", "L" (grayscale) or "RGBA".
        background_color: Background colour as a tuple of channel values;
            the default is white.
        antialias: Whether antialiasing is requested.
        include_annotations: Whether annotations should be drawn.
        include_forms: Whether form content should be drawn.
    """

    dpi: int = 200
    color_mode: str = "RGB"
    background_color: Tuple[int, ...] = (255, 255, 255)
    antialias: bool = True
    include_annotations: bool = True
    include_forms: bool = True
class DocumentLoader(ABC):
    """Interface for objects that open a document and expose its metadata.

    Concrete loaders implement ``load``, ``close``, ``is_loaded`` and the
    ``info`` property. The class also works as a context manager: leaving
    the ``with`` block calls ``close()``.
    """

    @abstractmethod
    def load(self, path: Union[str, Path]) -> DocumentInfo:
        """Open the document at ``path`` and collect its metadata.

        Args:
            path: Location of the document file.

        Returns:
            A ``DocumentInfo`` describing the document.
        """

    @abstractmethod
    def close(self) -> None:
        """Close the document and free whatever resources it holds."""

    @abstractmethod
    def is_loaded(self) -> bool:
        """Report whether a document is currently open."""

    @property
    @abstractmethod
    def info(self) -> Optional[DocumentInfo]:
        """Metadata of the currently loaded document."""

    def __enter__(self):
        # The loader itself is the managed resource.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        # A falsy result lets any in-flight exception keep propagating.
        return False
class PageRenderer(ABC):
    """Interface for turning document pages into image arrays.

    Subclasses must implement ``render_page``. ``render_pages`` and
    ``render_region`` are built on top of it and may be overridden with
    more efficient versions.
    """

    @abstractmethod
    def render_page(
        self,
        page_number: int,
        options: Optional[RenderOptions] = None
    ) -> np.ndarray:
        """Render a single page to an image.

        Args:
            page_number: 1-indexed page number.
            options: Rendering options.

        Returns:
            Page image as numpy array (H, W, C).
        """

    def render_pages(
        self,
        page_numbers: Optional[List[int]] = None,
        options: Optional[RenderOptions] = None
    ) -> Iterator[Tuple[int, np.ndarray]]:
        """Render several pages lazily, one per iteration step.

        Args:
            page_numbers: 1-indexed page numbers to render, in the order
                given. ``None`` means "all pages", which this base class
                cannot resolve because it does not know the page count.
            options: Rendering options forwarded to ``render_page``.

        Yields:
            Tuples of (page_number, image_array).

        Raises:
            NotImplementedError: If ``page_numbers`` is ``None``. Because
                this is a generator, the error surfaces when iteration
                starts, not when the method is called.
        """
        if page_numbers is None:
            # Subclasses should override to provide total pages
            raise NotImplementedError("Subclass must provide page iteration")
        for page_num in page_numbers:
            yield page_num, self.render_page(page_num, options)

    def render_region(
        self,
        page_number: int,
        region: Tuple[float, float, float, float],
        options: Optional[RenderOptions] = None,
        normalized: bool = True
    ) -> np.ndarray:
        """Render a rectangular region of a page.

        The default implementation renders the whole page and crops it.

        Args:
            page_number: 1-indexed page number.
            region: (x_min, y_min, x_max, y_max) coordinates.
            options: Rendering options forwarded to ``render_page``.
            normalized: If true, ``region`` is expressed as fractions of the
                page width/height (0-1) and is scaled to the nearest pixel.
                If false, ``region`` is in pixels and each value is
                truncated to an integer.

        Returns:
            The cropped image, taken as a slice of the rendered page. The
            region is clipped to the page bounds; a region with
            ``x_max <= x_min`` or ``y_max <= y_min`` gives an empty array.
        """
        full_page = self.render_page(page_number, options)
        h, w = full_page.shape[:2]
        x_min, y_min, x_max, y_max = region
        if normalized:
            # Round instead of truncating: a fraction derived from a pixel
            # position can land just below the integer after scaling
            # (0.29 * 100 == 28.999999999999996), and int() would then
            # drop a pixel.
            x_min, x_max = int(round(x_min * w)), int(round(x_max * w))
            y_min, y_max = int(round(y_min * h)), int(round(y_max * h))
        else:
            x_min, y_min = int(x_min), int(y_min)
            x_max, y_max = int(x_max), int(y_max)
        # Clip to the page so negative values cannot wrap around as
        # from-the-end indices and oversized values stop at the edge.
        x_min = max(0, min(x_min, w))
        x_max = max(0, min(x_max, w))
        y_min = max(0, min(y_min, h))
        y_max = max(0, min(y_max, h))
        return full_page[y_min:y_max, x_min:x_max]
class DocumentProcessor(ABC):
    """Pairs a document loader with a page renderer.

    The constructor only stores the two collaborators. Subclasses supply
    ``process``, which ties loading and rendering together.
    """

    def __init__(self, loader: DocumentLoader, renderer: PageRenderer):
        """Keep references to the loader and renderer to be used.

        Args:
            loader: Object responsible for opening documents.
            renderer: Object responsible for rendering pages.
        """
        self.loader, self.renderer = loader, renderer

    @abstractmethod
    def process(
        self,
        path: Union[str, Path],
        options: Optional[RenderOptions] = None,
        page_range: Optional[Tuple[int, int]] = None
    ) -> Iterator[Tuple[int, np.ndarray, PageInfo]]:
        """Load a document and render its pages.

        Args:
            path: Document path.
            options: Rendering options.
            page_range: Optional (start, end) pair selecting pages;
                both ends are 1-indexed and inclusive.

        Yields:
            Tuples of (page_number, image, page_info).
        """