# Initial commit: SPARKNET framework (d520909) - MHamdan
"""
Document Intelligence IO Module
Document loading, rendering, and caching:
- PDF loading with PyMuPDF
- Image loading (JPEG, PNG, TIFF)
- Page rendering at configurable DPI
- File-based caching with LRU eviction
"""
from .base import (
# Format detection
DocumentFormat,
# Metadata
PageInfo,
DocumentInfo,
# Options
RenderOptions,
# Base classes
DocumentLoader,
PageRenderer,
DocumentProcessor,
)
from .pdf import (
PDFLoader,
PDFRenderer,
PDFTextExtractor,
load_pdf,
)
from .image import (
ImageLoader,
ImageRenderer,
load_image,
)
from .cache import (
CacheConfig,
CacheEntry,
DocumentCache,
get_document_cache,
cached_page,
)
# Public API of this package: the names exported by a star-import.
__all__ = [
    # Format
    "DocumentFormat",
    # Metadata
    "PageInfo",
    "DocumentInfo",
    # Options
    "RenderOptions",
    # Base
    "DocumentLoader",
    "PageRenderer",
    "DocumentProcessor",
    # PDF
    "PDFLoader",
    "PDFRenderer",
    "PDFTextExtractor",
    "load_pdf",
    # Image
    "ImageLoader",
    "ImageRenderer",
    "load_image",
    # Cache
    "CacheConfig",
    "CacheEntry",
    "DocumentCache",
    "get_document_cache",
    "cached_page",
    # Convenience (defined at the bottom of this module); listed here so a
    # star-import exports it together with the format-specific loaders.
    "load_document",
]
def load_document(path):
    """
    Open a document with the loader that matches its detected format.

    The format is resolved by ``DocumentFormat.from_path`` (per the original
    documentation this is based on the file extension) and the call is then
    forwarded to ``load_pdf`` or ``load_image``.

    Args:
        path: Location of the document file. It is wrapped in a
            ``pathlib.Path`` before being used.

    Returns:
        Whatever the selected helper returns: ``load_pdf`` for PDF files,
        ``load_image`` for images and multi-page TIFFs. Presumably a
        ``(loader, renderer)`` tuple -- confirm against those helpers.

    Raises:
        ValueError: If the detected format is neither PDF nor one of the
            image formats.
    """
    # Imported locally, as in the original, so no new module-level name
    # is added to the package namespace.
    import pathlib

    doc_path = pathlib.Path(path)
    detected = DocumentFormat.from_path(doc_path)

    # PDFs have their own dedicated loader.
    if detected == DocumentFormat.PDF:
        return load_pdf(doc_path)

    # Single images and multi-page TIFFs share the image loader; anything
    # else is rejected with the offending suffix in the message.
    image_formats = {DocumentFormat.IMAGE, DocumentFormat.TIFF_MULTIPAGE}
    if detected not in image_formats:
        raise ValueError(f"Unsupported document format: {doc_path.suffix}")
    return load_image(doc_path)