File size: 1,784 Bytes
d520909 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
"""
Document Intelligence IO Module
Document loading, rendering, and caching:
- PDF loading with PyMuPDF
- Image loading (JPEG, PNG, TIFF)
- Page rendering at configurable DPI
- File-based caching with LRU eviction
"""
from .base import (
# Format detection
DocumentFormat,
# Metadata
PageInfo,
DocumentInfo,
# Options
RenderOptions,
# Base classes
DocumentLoader,
PageRenderer,
DocumentProcessor,
)
from .pdf import (
PDFLoader,
PDFRenderer,
PDFTextExtractor,
load_pdf,
)
from .image import (
ImageLoader,
ImageRenderer,
load_image,
)
from .cache import (
CacheConfig,
CacheEntry,
DocumentCache,
get_document_cache,
cached_page,
)
# Names exported by ``from <package> import *``. Keep this list in sync with
# the imports above and with the helpers defined in this module.
__all__ = [
    # Format
    "DocumentFormat",
    # Metadata
    "PageInfo",
    "DocumentInfo",
    "RenderOptions",
    # Base
    "DocumentLoader",
    "PageRenderer",
    "DocumentProcessor",
    # PDF
    "PDFLoader",
    "PDFRenderer",
    "PDFTextExtractor",
    "load_pdf",
    # Image
    "ImageLoader",
    "ImageRenderer",
    "load_image",
    # Cache
    "CacheConfig",
    "CacheEntry",
    "DocumentCache",
    "get_document_cache",
    "cached_page",
    # Convenience entry point defined in this module
    "load_document",
]
def load_document(path):
    """
    Open a document with the loader that matches its format.

    The format is determined by ``DocumentFormat.from_path`` and the call is
    forwarded to ``load_pdf`` or ``load_image`` accordingly.

    Args:
        path: Location of the document file; it is converted to a
            ``pathlib.Path`` before use.

    Returns:
        Whatever the selected loader function returns (documented upstream
        as a ``(loader, renderer)`` tuple).

    Raises:
        ValueError: If the detected format is neither PDF nor one of the
            image formats.
    """
    import pathlib

    path = pathlib.Path(path)
    detected = DocumentFormat.from_path(path)

    # PDFs are checked first, mirroring the dispatch order callers rely on.
    if detected == DocumentFormat.PDF:
        return load_pdf(path)

    # Single images and multi-page TIFFs share the image loader.
    image_formats = {DocumentFormat.IMAGE, DocumentFormat.TIFF_MULTIPAGE}
    if detected in image_formats:
        return load_image(path)

    raise ValueError(f"Unsupported document format: {path.suffix}")
|