# File size: 1,784 Bytes
# d520909
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
Document Intelligence IO Module

Document loading, rendering, and caching:
- PDF loading with PyMuPDF
- Image loading (JPEG, PNG, TIFF)
- Page rendering at configurable DPI
- File-based caching with LRU eviction
"""

from .base import (
    # Format detection
    DocumentFormat,
    # Metadata
    PageInfo,
    DocumentInfo,
    # Options
    RenderOptions,
    # Base classes
    DocumentLoader,
    PageRenderer,
    DocumentProcessor,
)

from .pdf import (
    PDFLoader,
    PDFRenderer,
    PDFTextExtractor,
    load_pdf,
)

from .image import (
    ImageLoader,
    ImageRenderer,
    load_image,
)

from .cache import (
    CacheConfig,
    CacheEntry,
    DocumentCache,
    get_document_cache,
    cached_page,
)

# Public API of the package: the names bound by ``from <package> import *``.
# Every entry must be a name that exists at module level in this file.
__all__ = [
    # Format
    "DocumentFormat",
    # Metadata
    "PageInfo",
    "DocumentInfo",
    "RenderOptions",
    # Base
    "DocumentLoader",
    "PageRenderer",
    "DocumentProcessor",
    # PDF
    "PDFLoader",
    "PDFRenderer",
    "PDFTextExtractor",
    "load_pdf",
    # Image
    "ImageLoader",
    "ImageRenderer",
    "load_image",
    # Cache
    "CacheConfig",
    "CacheEntry",
    "DocumentCache",
    "get_document_cache",
    "cached_page",
    # Format-dispatching loader defined in this module
    "load_document",
]


def load_document(path):
    """
    Open a document with the loader that matches its format.

    The format is resolved with ``DocumentFormat.from_path`` and the call is
    then delegated to ``load_pdf`` or ``load_image``.

    Args:
        path: Path to document file (wrapped in ``pathlib.Path`` before use)

    Returns:
        The result of ``load_pdf`` for PDF documents, or of ``load_image``
        for IMAGE and TIFF_MULTIPAGE documents

    Raises:
        ValueError: If the detected format is none of the above
    """
    import pathlib

    path = pathlib.Path(path)
    detected = DocumentFormat.from_path(path)

    # Guard clauses: hand off to the matching loader as soon as it is known.
    if detected == DocumentFormat.PDF:
        return load_pdf(path)

    image_formats = {DocumentFormat.IMAGE, DocumentFormat.TIFF_MULTIPAGE}
    if detected in image_formats:
        return load_image(path)

    raise ValueError(f"Unsupported document format: {path.suffix}")