# XmLLM/src/app/validators/readiness_validator.py
# Claude
# Code quality: fix all ruff warnings, add CI/CD, improve test coverage
# bbbfba8 (unverified)
"""Readiness validator — computes how ready a document is for export.
Produces AltoReadiness / PageXmlReadiness per page and DocumentReadiness
at document level. Does NOT decide whether to allow export — that's the
export eligibility validator's job.
"""
from __future__ import annotations
from typing import TYPE_CHECKING
from src.app.domain.models.readiness import (
AltoReadiness,
DocumentReadiness,
PageXmlReadiness,
)
from src.app.domain.models.status import (
GeometryStatus,
MissingCapability,
ReadinessLevel,
)
if TYPE_CHECKING:
from src.app.domain.models import CanonicalDocument, Page
def compute_page_alto_readiness(page: Page) -> AltoReadiness:
    """Assess how complete a single page is for ALTO export.

    The page is walked once (regions -> lines -> words) and every gap that
    is found is recorded at most once, in the order it is first seen:

    * a non-positive page width or height,
    * a text region or a line whose geometry status is ``UNKNOWN``,
    * no words at all (and, additionally, no lines at all),
    * any word with ``UNKNOWN`` geometry, falsy text, or a ``None``
      confidence,
    * a falsy ``page.reading_order``.

    Missing page dimensions, word text, word geometry and line geometry are
    handed to ``_level_from_missing`` as the critical set; block geometry,
    confidence and reading order are not part of that set.

    Args:
        page: The page to inspect.

    Returns:
        An ``AltoReadiness`` carrying the derived level and the list of
        missing capabilities.
    """
    gaps: list[MissingCapability] = []

    def report(capability: MissingCapability) -> None:
        # Keep the list duplicate-free while preserving first-seen order.
        if capability not in gaps:
            gaps.append(capability)

    if page.width <= 0 or page.height <= 0:
        report(MissingCapability.PAGE_DIMENSIONS)

    saw_line = False
    saw_word = False
    word_without_geometry = False
    word_without_text = False
    word_without_confidence = False

    for block in page.text_regions:
        if block.geometry.status == GeometryStatus.UNKNOWN:
            report(MissingCapability.BLOCK_GEOMETRY)
        for text_line in block.lines:
            saw_line = True
            if text_line.geometry.status == GeometryStatus.UNKNOWN:
                report(MissingCapability.LINE_GEOMETRY)
            for token in text_line.words:
                saw_word = True
                if token.geometry.status == GeometryStatus.UNKNOWN:
                    word_without_geometry = True
                if not token.text:
                    word_without_text = True
                if token.confidence is None:
                    word_without_confidence = True

    # Words are only reachable through lines, and lines through regions, so
    # "no words" already covers the "no regions" and "no lines" situations.
    if not saw_word:
        report(MissingCapability.WORD_TEXT)
        if not saw_line:
            report(MissingCapability.LINE_GEOMETRY)

    if word_without_geometry:
        report(MissingCapability.WORD_GEOMETRY)
    if word_without_text:
        report(MissingCapability.WORD_TEXT)
    if word_without_confidence:
        report(MissingCapability.CONFIDENCE)

    if not page.reading_order:
        report(MissingCapability.READING_ORDER)

    blocking = {
        MissingCapability.PAGE_DIMENSIONS,
        MissingCapability.WORD_TEXT,
        MissingCapability.WORD_GEOMETRY,
        MissingCapability.LINE_GEOMETRY,
    }
    return AltoReadiness(
        level=_level_from_missing(gaps, critical=blocking),
        missing=gaps,
    )
def compute_page_pagexml_readiness(page: Page) -> PageXmlReadiness:
    """Assess how complete a single page is for PAGE XML export.

    Only page dimensions, regions, lines and reading order are examined;
    words are not looked at. Gaps are recorded at most once, in the order
    they are first seen:

    * a non-positive page width or height,
    * a text region or a line whose geometry status is ``UNKNOWN``,
    * no text regions at all (reported as missing block geometry),
    * no lines at all (reported as missing line geometry),
    * a falsy ``page.reading_order``.

    Missing page dimensions and block geometry are handed to
    ``_level_from_missing`` as the critical set; line geometry and reading
    order are not part of that set.

    Args:
        page: The page to inspect.

    Returns:
        A ``PageXmlReadiness`` carrying the derived level and the list of
        missing capabilities.
    """
    gaps: list[MissingCapability] = []

    def report(capability: MissingCapability) -> None:
        # Record each capability once, in first-seen order.
        if capability not in gaps:
            gaps.append(capability)

    if page.width <= 0 or page.height <= 0:
        report(MissingCapability.PAGE_DIMENSIONS)

    page_is_empty = len(page.text_regions) == 0
    line_count = 0

    for block in page.text_regions:
        if block.geometry.status == GeometryStatus.UNKNOWN:
            report(MissingCapability.BLOCK_GEOMETRY)
        for text_line in block.lines:
            line_count += 1
            if text_line.geometry.status == GeometryStatus.UNKNOWN:
                report(MissingCapability.LINE_GEOMETRY)

    if page_is_empty:
        report(MissingCapability.BLOCK_GEOMETRY)
    if line_count == 0:
        report(MissingCapability.LINE_GEOMETRY)
    if not page.reading_order:
        report(MissingCapability.READING_ORDER)

    blocking = {
        MissingCapability.PAGE_DIMENSIONS,
        MissingCapability.BLOCK_GEOMETRY,
    }
    return PageXmlReadiness(
        level=_level_from_missing(gaps, critical=blocking),
        missing=gaps,
    )
def compute_document_readiness(doc: CanonicalDocument) -> DocumentReadiness:
    """Roll per-page ALTO readiness levels up into one document-level result.

    Each page in ``doc.pages`` is scored with ``compute_page_alto_readiness``
    and the resulting levels are combined as follows:

    * no pages at all        -> ``NONE`` (no per-page list is passed),
    * every page ``FULL``    -> ``FULL``,
    * every page ``NONE``    -> ``NONE``,
    * some page ``NONE``     -> ``DEGRADED``,
    * anything else          -> ``PARTIAL``.

    Args:
        doc: The document whose pages are evaluated.

    Returns:
        A ``DocumentReadiness`` with the overall level and, when the
        document has pages, the per-page levels in page order.
    """
    levels: list[ReadinessLevel] = [
        compute_page_alto_readiness(page).level for page in doc.pages
    ]
    if not levels:
        return DocumentReadiness(level=ReadinessLevel.NONE)

    page_total = len(levels)
    full_pages = sum(1 for lv in levels if lv == ReadinessLevel.FULL)
    unusable_pages = sum(1 for lv in levels if lv == ReadinessLevel.NONE)

    if full_pages == page_total:
        verdict = ReadinessLevel.FULL
    elif unusable_pages == page_total:
        verdict = ReadinessLevel.NONE
    elif unusable_pages > 0:
        # A mix of unusable and usable pages.
        verdict = ReadinessLevel.DEGRADED
    else:
        verdict = ReadinessLevel.PARTIAL

    return DocumentReadiness(level=verdict, page_readiness=levels)
def _level_from_missing(
    missing: list[MissingCapability],
    critical: set[MissingCapability],
) -> ReadinessLevel:
    """Map a list of missing capabilities onto a readiness level.

    Args:
        missing: Capabilities found to be absent.
        critical: Capabilities whose absence rules out readiness entirely.

    Returns:
        ``FULL`` when nothing is missing, ``NONE`` when at least one missing
        capability is in ``critical``, and ``PARTIAL`` otherwise.
    """
    if missing:
        for capability in missing:
            if capability in critical:
                # One critical gap is enough to rule the page out.
                return ReadinessLevel.NONE
        return ReadinessLevel.PARTIAL
    return ReadinessLevel.FULL