| """ | |
| Document Intelligence IO Module | |
| Document loading, rendering, and caching: | |
| - PDF loading with PyMuPDF | |
| - Image loading (JPEG, PNG, TIFF) | |
| - Page rendering at configurable DPI | |
| - File-based caching with LRU eviction | |
| """ | |
| from .base import ( | |
| # Format detection | |
| DocumentFormat, | |
| # Metadata | |
| PageInfo, | |
| DocumentInfo, | |
| # Options | |
| RenderOptions, | |
| # Base classes | |
| DocumentLoader, | |
| PageRenderer, | |
| DocumentProcessor, | |
| ) | |
| from .pdf import ( | |
| PDFLoader, | |
| PDFRenderer, | |
| PDFTextExtractor, | |
| load_pdf, | |
| ) | |
| from .image import ( | |
| ImageLoader, | |
| ImageRenderer, | |
| load_image, | |
| ) | |
| from .cache import ( | |
| CacheConfig, | |
| CacheEntry, | |
| DocumentCache, | |
| get_document_cache, | |
| cached_page, | |
| ) | |
# Public API of this package: the names exported by ``from <package> import *``.
# Section headings mirror the grouping of the import statements above.
__all__ = [
    # Format
    "DocumentFormat",
    # Metadata
    "PageInfo",
    "DocumentInfo",
    # Options
    "RenderOptions",
    # Base
    "DocumentLoader",
    "PageRenderer",
    "DocumentProcessor",
    # PDF
    "PDFLoader",
    "PDFRenderer",
    "PDFTextExtractor",
    "load_pdf",
    # Image
    "ImageLoader",
    "ImageRenderer",
    "load_image",
    # Cache
    "CacheConfig",
    "CacheEntry",
    "DocumentCache",
    "get_document_cache",
    "cached_page",
    # Convenience (defined in this module rather than re-exported)
    "load_document",
]
def load_document(path):
    """
    Open a document with the loader matching its detected format.

    Format detection is delegated to ``DocumentFormat.from_path``.

    Args:
        path: Location of the document file; it is converted to a
            ``pathlib.Path`` before use.

    Returns:
        The result of ``load_pdf`` when the format is
        ``DocumentFormat.PDF``, or of ``load_image`` when it is
        ``DocumentFormat.IMAGE`` or ``DocumentFormat.TIFF_MULTIPAGE``
        (described by this module as a tuple of (loader, renderer)).

    Raises:
        ValueError: If the detected format is none of the above.
    """
    import pathlib

    doc_path = pathlib.Path(path)
    detected = DocumentFormat.from_path(doc_path)

    # PDFs have a dedicated loader; handle them first.
    if detected == DocumentFormat.PDF:
        return load_pdf(doc_path)

    # Everything else must be one of the image-backed formats.
    image_formats = {DocumentFormat.IMAGE, DocumentFormat.TIFF_MULTIPAGE}
    if detected not in image_formats:
        raise ValueError(f"Unsupported document format: {doc_path.suffix}")

    return load_image(doc_path)