# Listing metadata carried over from the source viewer: file size 5,727 bytes; revisions e2ec8a2, bbbfba8.
"""Readiness validator — computes how ready a document is for export.

Produces AltoReadiness / PageXmlReadiness per page and DocumentReadiness
at document level.  Does NOT decide whether to allow export — that's the
export eligibility validator's job.
"""

from __future__ import annotations

from typing import TYPE_CHECKING

from src.app.domain.models.readiness import (
    AltoReadiness,
    DocumentReadiness,
    PageXmlReadiness,
)
from src.app.domain.models.status import (
    GeometryStatus,
    MissingCapability,
    ReadinessLevel,
)

if TYPE_CHECKING:
    from src.app.domain.models import CanonicalDocument, Page


def compute_page_alto_readiness(page: Page) -> AltoReadiness:
    """Compute ALTO readiness for a single page.

    ALTO full requires: page dimensions, block bbox, line bbox, word bbox, word text.

    The page is walked region -> line -> word and every capability found
    lacking is recorded once, in the order it is first observed.
    PAGE_DIMENSIONS, WORD_TEXT, WORD_GEOMETRY and LINE_GEOMETRY are passed
    to ``_level_from_missing`` as the critical set; the remaining gaps
    (BLOCK_GEOMETRY, CONFIDENCE, READING_ORDER) are non-critical.

    Args:
        page: Page to inspect. Reads ``width``, ``height``, ``text_regions``
            and ``reading_order``.

    Returns:
        An ``AltoReadiness`` holding the computed level and the list of
        missing capabilities.
    """
    missing: list[MissingCapability] = []

    def note(capability: MissingCapability) -> None:
        # Keep each capability at most once, in first-seen order.
        if capability not in missing:
            missing.append(capability)

    if page.width <= 0 or page.height <= 0:
        note(MissingCapability.PAGE_DIMENSIONS)

    has_blocks = len(page.text_regions) > 0
    has_lines = False
    has_words = False
    # Word-level flags start optimistic; the first offending word clears them.
    has_word_geo = True
    has_word_text = True
    has_confidence = True

    for region in page.text_regions:
        if region.geometry.status == GeometryStatus.UNKNOWN:
            note(MissingCapability.BLOCK_GEOMETRY)
        for line in region.lines:
            has_lines = True
            if line.geometry.status == GeometryStatus.UNKNOWN:
                note(MissingCapability.LINE_GEOMETRY)
            for word in line.words:
                has_words = True
                if word.geometry.status == GeometryStatus.UNKNOWN:
                    has_word_geo = False
                if not word.text:
                    has_word_text = False
                if word.confidence is None:
                    has_confidence = False

    # Structural gaps: a level that is entirely absent is reported as missing.
    if not has_blocks:
        # A page without any text region has no block geometry either; this
        # mirrors what the PAGE XML readiness check reports for the same page.
        note(MissingCapability.BLOCK_GEOMETRY)
    if not has_words:
        note(MissingCapability.WORD_TEXT)
    if not has_lines:
        note(MissingCapability.LINE_GEOMETRY)

    # Per-word gaps collected during the walk.
    if not has_word_geo:
        note(MissingCapability.WORD_GEOMETRY)
    if not has_word_text:
        note(MissingCapability.WORD_TEXT)
    if not has_confidence:
        note(MissingCapability.CONFIDENCE)

    if not page.reading_order:
        note(MissingCapability.READING_ORDER)

    level = _level_from_missing(missing, critical={
        MissingCapability.PAGE_DIMENSIONS,
        MissingCapability.WORD_TEXT,
        MissingCapability.WORD_GEOMETRY,
        MissingCapability.LINE_GEOMETRY,
    })

    return AltoReadiness(level=level, missing=missing)


def compute_page_pagexml_readiness(page: Page) -> PageXmlReadiness:
    """Compute PAGE XML readiness for a single page.

    Only page dimensions, regions and lines are inspected here; words are
    not looked at.  PAGE_DIMENSIONS and BLOCK_GEOMETRY form the critical
    set handed to ``_level_from_missing``; LINE_GEOMETRY and READING_ORDER
    are recorded but are not in that set.

    Args:
        page: Page to inspect. Reads ``width``, ``height``, ``text_regions``
            and ``reading_order``.

    Returns:
        A ``PageXmlReadiness`` with the computed level and the list of
        missing capabilities (each listed once, in first-seen order).
    """
    gaps: list[MissingCapability] = []

    def record(capability: MissingCapability) -> None:
        # De-duplicating append.
        if capability not in gaps:
            gaps.append(capability)

    if page.width <= 0 or page.height <= 0:
        record(MissingCapability.PAGE_DIMENSIONS)

    no_regions = len(page.text_regions) == 0
    saw_line = False

    for block in page.text_regions:
        if block.geometry.status == GeometryStatus.UNKNOWN:
            record(MissingCapability.BLOCK_GEOMETRY)
        for text_line in block.lines:
            saw_line = True
            if text_line.geometry.status == GeometryStatus.UNKNOWN:
                record(MissingCapability.LINE_GEOMETRY)

    # A level that is entirely absent counts as missing geometry for it.
    if no_regions:
        record(MissingCapability.BLOCK_GEOMETRY)
    if not saw_line:
        record(MissingCapability.LINE_GEOMETRY)

    if not page.reading_order:
        record(MissingCapability.READING_ORDER)

    blocking = {
        MissingCapability.PAGE_DIMENSIONS,
        MissingCapability.BLOCK_GEOMETRY,
    }
    return PageXmlReadiness(
        level=_level_from_missing(gaps, critical=blocking),
        missing=gaps,
    )


def compute_document_readiness(doc: CanonicalDocument) -> DocumentReadiness:
    """Aggregate per-page ALTO readiness into a document-level result.

    Only ``compute_page_alto_readiness`` is consulted for each page.
    Rules, first match wins:

    * no pages                -> NONE (returned without per-page levels)
    * every page is FULL      -> FULL
    * every page is NONE      -> NONE
    * at least one page NONE  -> DEGRADED
    * anything else           -> PARTIAL

    Args:
        doc: Document whose ``pages`` are evaluated in order.

    Returns:
        A ``DocumentReadiness`` with the overall level and, when the
        document has pages, the per-page levels in page order.
    """
    levels: list[ReadinessLevel] = [
        compute_page_alto_readiness(pg).level for pg in doc.pages
    ]
    if not levels:
        return DocumentReadiness(level=ReadinessLevel.NONE)

    total = len(levels)
    none_count = levels.count(ReadinessLevel.NONE)

    if levels.count(ReadinessLevel.FULL) == total:
        verdict = ReadinessLevel.FULL
    elif none_count == total:
        verdict = ReadinessLevel.NONE
    elif none_count:
        # Mixed document with at least one unusable page.
        verdict = ReadinessLevel.DEGRADED
    else:
        verdict = ReadinessLevel.PARTIAL

    return DocumentReadiness(level=verdict, page_readiness=levels)


def _level_from_missing(
    missing: list[MissingCapability],
    critical: set[MissingCapability],
) -> ReadinessLevel:
    """Map a list of missing capabilities to a readiness level.

    Args:
        missing: Capabilities found lacking.
        critical: Capabilities whose absence rules out readiness entirely.

    Returns:
        ``FULL`` when nothing is missing, ``NONE`` when any missing
        capability is in ``critical``, otherwise ``PARTIAL``.
    """
    if not missing:
        return ReadinessLevel.FULL

    for capability in missing:
        if capability in critical:
            # One critical gap is enough; no need to inspect the rest.
            return ReadinessLevel.NONE

    return ReadinessLevel.PARTIAL