Spaces:
Sleeping
Sleeping
File size: 5,727 Bytes
"""Readiness validator — computes how ready a document is for export.
Produces AltoReadiness / PageXmlReadiness per page and DocumentReadiness
at document level. Does NOT decide whether to allow export — that's the
export eligibility validator's job.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from src.app.domain.models.readiness import (
AltoReadiness,
DocumentReadiness,
PageXmlReadiness,
)
from src.app.domain.models.status import (
GeometryStatus,
MissingCapability,
ReadinessLevel,
)
if TYPE_CHECKING:
from src.app.domain.models import CanonicalDocument, Page
def compute_page_alto_readiness(page: Page) -> AltoReadiness:
    """Assess how complete a single page is for ALTO export.

    A fully ready ALTO page needs page dimensions, block, line and word
    bounding boxes, and word text. Every capability found lacking is
    collected (once each, in discovery order) and the readiness level is
    derived by ``_level_from_missing``, treating page dimensions, word
    text, word geometry and line geometry as critical.

    Args:
        page: The page whose regions, lines and words are inspected.

    Returns:
        An ``AltoReadiness`` carrying the computed level and the list of
        missing capabilities.
    """
    gaps: list[MissingCapability] = []

    def note(capability: MissingCapability) -> None:
        # Record a capability at most once, keeping first-seen order.
        if capability not in gaps:
            gaps.append(capability)

    if page.width <= 0 or page.height <= 0:
        gaps.append(MissingCapability.PAGE_DIMENSIONS)

    any_block = len(page.text_regions) > 0
    any_line = False
    any_word = False
    # These start optimistic and are cleared by the first offending word.
    words_located = True
    words_have_text = True
    words_scored = True

    unknown = GeometryStatus.UNKNOWN
    for block in page.text_regions:
        if block.geometry.status == unknown:
            note(MissingCapability.BLOCK_GEOMETRY)
        for text_line in block.lines:
            any_line = True
            if text_line.geometry.status == unknown:
                note(MissingCapability.LINE_GEOMETRY)
            for token in text_line.words:
                any_word = True
                if token.geometry.status == unknown:
                    words_located = False
                if not token.text:
                    words_have_text = False
                if token.confidence is None:
                    words_scored = False

    # A page lacking blocks, lines or words cannot carry word text at all.
    if not (any_block and any_line and any_word):
        if not any_word:
            gaps.append(MissingCapability.WORD_TEXT)
        if not any_line:
            gaps.append(MissingCapability.LINE_GEOMETRY)

    if not words_located:
        note(MissingCapability.WORD_GEOMETRY)
    if not words_have_text:
        note(MissingCapability.WORD_TEXT)
    if not words_scored:
        note(MissingCapability.CONFIDENCE)

    if not page.reading_order:
        gaps.append(MissingCapability.READING_ORDER)

    blocking = {
        MissingCapability.PAGE_DIMENSIONS,
        MissingCapability.WORD_TEXT,
        MissingCapability.WORD_GEOMETRY,
        MissingCapability.LINE_GEOMETRY,
    }
    readiness_level = _level_from_missing(gaps, critical=blocking)
    return AltoReadiness(level=readiness_level, missing=gaps)
def compute_page_pagexml_readiness(page: Page) -> PageXmlReadiness:
    """Assess how complete a single page is for PAGE XML export.

    PAGE XML is more lenient than ALTO: regions and lines are often
    sufficient, and words are not inspected here. Only missing page
    dimensions or missing block geometry are treated as critical when the
    level is derived by ``_level_from_missing``.

    Args:
        page: The page whose regions and lines are inspected.

    Returns:
        A ``PageXmlReadiness`` carrying the computed level and the list of
        missing capabilities.
    """
    gaps: list[MissingCapability] = []

    def note(capability: MissingCapability) -> None:
        # Record a capability at most once, keeping first-seen order.
        if capability not in gaps:
            gaps.append(capability)

    if page.width <= 0 or page.height <= 0:
        gaps.append(MissingCapability.PAGE_DIMENSIONS)

    region_count = len(page.text_regions)
    line_seen = False

    unknown = GeometryStatus.UNKNOWN
    for block in page.text_regions:
        if block.geometry.status == unknown:
            note(MissingCapability.BLOCK_GEOMETRY)
        for text_line in block.lines:
            line_seen = True
            if text_line.geometry.status == unknown:
                note(MissingCapability.LINE_GEOMETRY)

    # With no regions the loop never ran, so nothing was recorded yet.
    if region_count == 0:
        gaps.append(MissingCapability.BLOCK_GEOMETRY)
    if not line_seen:
        note(MissingCapability.LINE_GEOMETRY)

    if not page.reading_order:
        gaps.append(MissingCapability.READING_ORDER)

    blocking = {
        MissingCapability.PAGE_DIMENSIONS,
        MissingCapability.BLOCK_GEOMETRY,
    }
    readiness_level = _level_from_missing(gaps, critical=blocking)
    return PageXmlReadiness(level=readiness_level, missing=gaps)
def compute_document_readiness(doc: CanonicalDocument) -> DocumentReadiness:
    """Aggregate per-page ALTO readiness into a document-level readiness.

    Each page is scored with ``compute_page_alto_readiness``; the levels
    are then combined as follows:

    * no pages at all      -> NONE (no per-page levels are passed along)
    * every page FULL      -> FULL
    * every page NONE      -> NONE
    * some page NONE       -> DEGRADED
    * otherwise            -> PARTIAL

    Args:
        doc: The document whose pages are evaluated.

    Returns:
        A ``DocumentReadiness`` with the overall level and, when the
        document has pages, the list of per-page levels.
    """
    levels: list[ReadinessLevel] = [
        compute_page_alto_readiness(current).level for current in doc.pages
    ]

    if not levels:
        return DocumentReadiness(level=ReadinessLevel.NONE)

    total = len(levels)
    full_count = sum(1 for lv in levels if lv == ReadinessLevel.FULL)
    none_count = sum(1 for lv in levels if lv == ReadinessLevel.NONE)

    if full_count == total:
        verdict = ReadinessLevel.FULL
    elif none_count == total:
        verdict = ReadinessLevel.NONE
    elif none_count > 0:
        # A mix that includes at least one unusable page.
        verdict = ReadinessLevel.DEGRADED
    else:
        verdict = ReadinessLevel.PARTIAL

    return DocumentReadiness(level=verdict, page_readiness=levels)
def _level_from_missing(
    missing: list[MissingCapability],
    critical: set[MissingCapability],
) -> ReadinessLevel:
    """Map a list of missing capabilities to a readiness level.

    Args:
        missing: Capabilities found lacking for a page.
        critical: Capabilities whose absence rules out readiness entirely.

    Returns:
        ``FULL`` when nothing is missing, ``NONE`` when at least one
        missing capability is critical, and ``PARTIAL`` otherwise.
    """
    if not missing:
        return ReadinessLevel.FULL

    for capability in missing:
        if capability in critical:
            # One critical gap is enough to rule the page out.
            return ReadinessLevel.NONE

    return ReadinessLevel.PARTIAL
|