Spaces:
Sleeping
Sprint 4: validation layer and document policy
Four validators and two policy modules that control the pipeline:
Validators:
- structural_validator: ID uniqueness, reading_order ref validity, bbox
containment (word ⊂ line ⊂ region ⊂ page) with configurable tolerance
- readiness_validator: computes AltoReadiness / PageXmlReadiness per page
and DocumentReadiness at document level. ALTO requires word text+geometry;
PAGE XML is more lenient (regions+lines sufficient)
- export_eligibility_validator: produces ExportEligibility (alto/page/viewer
each independently full/partial/none). Strict policy downgrades partial→none
- schema_validator: wraps Pydantic validation as explicit service, returns
(document | None, ValidationReport) instead of raising exceptions
Policies:
- document_policy: centralised business rules (3 modes: strict/standard/
permissive). Controls what the system may infer, repair, or export.
Text invention and bbox invention always forbidden
- export_policy: go/no-go decisions per format, consuming eligibility + policy
Infrastructure:
- ValidationReport with entries (validator, severity, path, message, code)
- Severity enum (error/warning/info)
- is_valid, error_count, warning_count, merge()
48 new tests, 331 total passing.
https://claude.ai/code/session_01Cuzvc9Pjfo5u46eT3ta2Cg
- src/app/domain/errors/__init__.py +59 -0
- src/app/policies/document_policy.py +97 -0
- src/app/policies/export_policy.py +90 -0
- src/app/validators/__init__.py +1 -0
- src/app/validators/export_eligibility_validator.py +79 -0
- src/app/validators/readiness_validator.py +165 -0
- src/app/validators/schema_validator.py +42 -0
- src/app/validators/structural_validator.py +122 -0
- tests/unit/test_document_policy.py +98 -0
- tests/unit/test_export_eligibility.py +181 -0
- tests/unit/test_readiness_validator.py +194 -0
- tests/unit/test_schema_validator.py +95 -0
- tests/unit/test_structural_validator.py +201 -0
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Domain errors for validation and export."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from enum import Enum
|
| 6 |
+
|
| 7 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class Severity(str, Enum):
    """How serious a validation finding is.

    Backed by plain strings so entries serialize cleanly to JSON.
    """

    ERROR = "error"      # blocks the pipeline / export
    WARNING = "warning"  # suspicious but not fatal
    INFO = "info"        # purely informational
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ValidationEntry(BaseModel):
    """A single validation finding produced by one validator.

    Instances are frozen (immutable) so they can be shared freely
    between reports after creation.
    """

    model_config = ConfigDict(frozen=True)

    # Name of the validator that produced this entry, e.g. "structural".
    validator: str = Field(min_length=1)
    # Severity of the finding (error/warning/info).
    severity: Severity
    path: str = Field(min_length=1, description="Path in the document, e.g. pages[0].text_regions[1].lines[3]")
    # Human-readable explanation of the finding.
    message: str = Field(min_length=1)
    # Optional machine-readable code, e.g. "duplicate_id".
    code: str | None = None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ValidationReport(BaseModel):
    """Mutable collection of validation findings from any number of validators."""

    entries: list[ValidationEntry] = Field(default_factory=list)

    def _with_severity(self, severity: Severity) -> list[ValidationEntry]:
        """Entries filtered down to a single severity level."""
        return [entry for entry in self.entries if entry.severity == severity]

    @property
    def errors(self) -> list[ValidationEntry]:
        """All ERROR-severity entries."""
        return self._with_severity(Severity.ERROR)

    @property
    def warnings(self) -> list[ValidationEntry]:
        """All WARNING-severity entries."""
        return self._with_severity(Severity.WARNING)

    @property
    def is_valid(self) -> bool:
        """True when no errors were recorded (warnings are tolerated)."""
        return not self.errors

    @property
    def error_count(self) -> int:
        """Number of ERROR entries."""
        return len(self.errors)

    @property
    def warning_count(self) -> int:
        """Number of WARNING entries."""
        return len(self.warnings)

    def add(self, entry: ValidationEntry) -> None:
        """Append a single finding to the report."""
        self.entries.append(entry)

    def merge(self, other: ValidationReport) -> None:
        """Absorb all findings from another report, preserving order."""
        self.entries.extend(other.entries)
|
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Document policy — centralised business rules for the pipeline.
|
| 2 |
+
|
| 3 |
+
This layer prevents critical decisions from being scattered across
|
| 4 |
+
adapters, validators, and serializers. A policy is a named configuration
|
| 5 |
+
that controls what the system may or may not do.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from enum import Enum
|
| 11 |
+
|
| 12 |
+
from pydantic import BaseModel, ConfigDict
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class PolicyMode(str, Enum):
    """Named preset identifying a policy configuration."""

    STRICT = "strict"          # no inference, no partial exports
    STANDARD = "standard"      # balanced defaults
    PERMISSIVE = "permissive"  # maximum inference, generous tolerances
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class DocumentPolicy(BaseModel):
    """Immutable policy controlling what the pipeline may infer, repair, or export.

    Field defaults correspond to the STANDARD preset; the strict_policy()
    and permissive_policy() factories build the other presets.
    """

    model_config = ConfigDict(frozen=True)

    mode: PolicyMode = PolicyMode.STANDARD

    # --- Text rules ----------------------------------------------------------
    # Inventing text the provider never produced is always forbidden.
    allow_text_invention: bool = False

    # --- Geometry rules ------------------------------------------------------
    # Deriving a bbox from an existing polygon (enricher step).
    allow_polygon_to_bbox: bool = True
    # Inferring a bbox from context, e.g. a line bbox from its word bboxes.
    allow_bbox_inference: bool = True
    # Inventing a bbox without any geometric basis is always forbidden.
    allow_bbox_invention: bool = False

    # --- Language rules ------------------------------------------------------
    # Propagating a language tag from parent nodes down to children.
    allow_lang_propagation: bool = True

    # --- Export rules --------------------------------------------------------
    # ALTO export requires at least line-level geometry.
    require_lines_for_alto: bool = True
    # ALTO export requires word-level text and geometry.
    require_words_for_alto: bool = True
    # Permit ALTO export when readiness is only partial.
    allow_partial_alto: bool = True
    # Permit PAGE XML export when readiness is only partial.
    allow_partial_page: bool = True

    # --- Enricher rules ------------------------------------------------------
    # Inferring reading order from spatial position.
    allow_reading_order_inference: bool = True
    # Detecting word hyphenation at line boundaries.
    allow_hyphenation_detection: bool = True

    # --- Tolerance -----------------------------------------------------------
    # Pixels of overflow tolerated by bbox containment checks.
    bbox_containment_tolerance: float = 5.0

    @property
    def strict_mode(self) -> bool:
        """Whether this policy runs in the STRICT preset."""
        return self.mode == PolicyMode.STRICT
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def strict_policy() -> DocumentPolicy:
    """Build the STRICT preset: no inference, no partial exports."""
    overrides = {
        "mode": PolicyMode.STRICT,
        "allow_bbox_inference": False,
        "allow_partial_alto": False,
        "allow_partial_page": False,
        "allow_reading_order_inference": False,
        "allow_hyphenation_detection": False,
    }
    return DocumentPolicy(**overrides)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def permissive_policy() -> DocumentPolicy:
    """Build the PERMISSIVE preset: inference and partial exports allowed."""
    overrides = {
        "mode": PolicyMode.PERMISSIVE,
        "allow_bbox_inference": True,
        "allow_partial_alto": True,
        "allow_partial_page": True,
        "bbox_containment_tolerance": 10.0,
    }
    return DocumentPolicy(**overrides)
|
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Export policy — decides whether a specific export should proceed.
|
| 2 |
+
|
| 3 |
+
Uses the document policy and export eligibility to make a final go/no-go
|
| 4 |
+
decision for each export format.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from dataclasses import dataclass
|
| 10 |
+
|
| 11 |
+
from src.app.domain.models.readiness import ExportEligibility
|
| 12 |
+
from src.app.domain.models.status import ReadinessLevel
|
| 13 |
+
from src.app.policies.document_policy import DocumentPolicy
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@dataclass(frozen=True)
class ExportDecision:
    """Immutable result of an export policy check."""

    # Whether the export may proceed.
    allowed: bool
    # The readiness level the decision was based on.
    level: ReadinessLevel
    # Human-readable justification ("OK" when allowed).
    reason: str
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _decide(
    level: ReadinessLevel,
    *,
    allow_partial: bool,
    none_reason: str,
    partial_reason: str,
    degraded_reason: str,
) -> ExportDecision:
    """Shared go/no-go logic for a single export format.

    Args:
        level: Aggregated readiness level for the format.
        allow_partial: Whether the active policy accepts partial exports.
        none_reason: Message used when readiness is NONE.
        partial_reason: Message used when partial exports are forbidden.
        degraded_reason: Message used when readiness is DEGRADED.

    Returns:
        An ExportDecision; allowed only for FULL, or PARTIAL when permitted.
    """
    if level == ReadinessLevel.NONE:
        return ExportDecision(allowed=False, level=level, reason=none_reason)
    if level == ReadinessLevel.PARTIAL and not allow_partial:
        return ExportDecision(allowed=False, level=level, reason=partial_reason)
    if level == ReadinessLevel.DEGRADED:
        return ExportDecision(allowed=False, level=level, reason=degraded_reason)
    return ExportDecision(allowed=True, level=level, reason="OK")


def check_alto_export(
    eligibility: ExportEligibility,
    policy: DocumentPolicy | None = None,
) -> ExportDecision:
    """Check if ALTO export should proceed.

    Args:
        eligibility: Pre-computed per-format export eligibility.
        policy: Active document policy; the default policy is used when None.

    Returns:
        The go/no-go decision for the ALTO format.
    """
    if policy is None:
        policy = DocumentPolicy()
    return _decide(
        eligibility.alto_export,
        allow_partial=policy.allow_partial_alto,
        none_reason="ALTO export not possible: missing required data (word text/geometry or line geometry)",
        partial_reason="ALTO export is partial but policy does not allow partial exports",
        degraded_reason="ALTO export is degraded: too much data missing",
    )


def check_page_export(
    eligibility: ExportEligibility,
    policy: DocumentPolicy | None = None,
) -> ExportDecision:
    """Check if PAGE XML export should proceed.

    Args:
        eligibility: Pre-computed per-format export eligibility.
        policy: Active document policy; the default policy is used when None.

    Returns:
        The go/no-go decision for the PAGE XML format.
    """
    if policy is None:
        policy = DocumentPolicy()
    return _decide(
        eligibility.page_export,
        allow_partial=policy.allow_partial_page,
        none_reason="PAGE export not possible: missing required data",
        partial_reason="PAGE export is partial but policy does not allow partial exports",
        degraded_reason="PAGE export is degraded: too much data missing",
    )
|
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Validators — structural, readiness, schema, export eligibility."""
|
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Export eligibility validator — decides what can be exported.
|
| 2 |
+
|
| 3 |
+
Consumes readiness assessments and document policy to produce
|
| 4 |
+
an ExportEligibility decision for the whole document.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from src.app.domain.models import CanonicalDocument
|
| 10 |
+
from src.app.domain.models.readiness import ExportEligibility
|
| 11 |
+
from src.app.domain.models.status import ReadinessLevel
|
| 12 |
+
from src.app.policies.document_policy import DocumentPolicy
|
| 13 |
+
from src.app.validators.readiness_validator import (
|
| 14 |
+
compute_page_alto_readiness,
|
| 15 |
+
compute_page_pagexml_readiness,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def compute_export_eligibility(
    doc: CanonicalDocument,
    policy: DocumentPolicy | None = None,
) -> ExportEligibility:
    """Compute per-format export eligibility for a whole document.

    Args:
        doc: The canonical document to assess.
        policy: Active document policy; the default policy is used when None.

    Returns:
        ExportEligibility with one readiness level per export target.
    """
    active_policy = policy if policy is not None else DocumentPolicy()

    alto_export = _aggregate_levels(
        [compute_page_alto_readiness(page).level for page in doc.pages]
    )
    page_export = _aggregate_levels(
        [compute_page_pagexml_readiness(page).level for page in doc.pages]
    )

    # Strict policies refuse partial results outright: downgrade to NONE.
    if active_policy.strict_mode:
        if alto_export == ReadinessLevel.PARTIAL:
            alto_export = ReadinessLevel.NONE
        if page_export == ReadinessLevel.PARTIAL:
            page_export = ReadinessLevel.NONE

    # The viewer tolerates far less data than the XML exports do.
    if alto_export != ReadinessLevel.NONE or page_export != ReadinessLevel.NONE:
        viewer_render = ReadinessLevel.FULL
    elif any(page.text_regions for page in doc.pages):
        viewer_render = ReadinessLevel.DEGRADED
    else:
        viewer_render = ReadinessLevel.NONE

    return ExportEligibility(
        alto_export=alto_export,
        page_export=page_export,
        viewer_render=viewer_render,
    )
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _aggregate_levels(levels: list[ReadinessLevel]) -> ReadinessLevel:
    """Collapse per-page readiness levels into one document-level value.

    An empty page list yields NONE; uniform FULL/NONE pass through;
    any usable page (FULL or PARTIAL) yields PARTIAL; otherwise DEGRADED.
    """
    if not levels:
        return ReadinessLevel.NONE

    distinct = set(levels)
    if distinct == {ReadinessLevel.FULL}:
        return ReadinessLevel.FULL
    if distinct == {ReadinessLevel.NONE}:
        return ReadinessLevel.NONE
    if distinct & {ReadinessLevel.FULL, ReadinessLevel.PARTIAL}:
        return ReadinessLevel.PARTIAL
    return ReadinessLevel.DEGRADED
|
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Readiness validator — computes how ready a document is for export.
|
| 2 |
+
|
| 3 |
+
Produces AltoReadiness / PageXmlReadiness per page and DocumentReadiness
|
| 4 |
+
at document level. Does NOT decide whether to allow export — that's the
|
| 5 |
+
export eligibility validator's job.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from src.app.domain.models import CanonicalDocument, Page
|
| 11 |
+
from src.app.domain.models.readiness import (
|
| 12 |
+
AltoReadiness,
|
| 13 |
+
DocumentReadiness,
|
| 14 |
+
PageXmlReadiness,
|
| 15 |
+
)
|
| 16 |
+
from src.app.domain.models.status import (
|
| 17 |
+
GeometryStatus,
|
| 18 |
+
MissingCapability,
|
| 19 |
+
ReadinessLevel,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def compute_page_alto_readiness(page: Page) -> AltoReadiness:
    """Compute ALTO readiness for a single page.

    ALTO full requires: page dimensions, block bbox, line bbox, word bbox, word text.
    Missing critical capabilities yield NONE; any other gap yields PARTIAL.
    """
    missing: list[MissingCapability] = []

    # Page dimensions are mandatory for any ALTO output.
    if page.width <= 0 or page.height <= 0:
        missing.append(MissingCapability.PAGE_DIMENSIONS)

    has_blocks = len(page.text_regions) > 0
    has_lines = False
    has_words = False
    # These start optimistic and are falsified by the first offending node.
    has_word_geo = True
    has_word_text = True
    has_confidence = True

    for region in page.text_regions:
        # Duplicate-guard each append so `missing` stays a de-duplicated,
        # first-occurrence-ordered list.
        if region.geometry.status == GeometryStatus.UNKNOWN:
            if MissingCapability.BLOCK_GEOMETRY not in missing:
                missing.append(MissingCapability.BLOCK_GEOMETRY)
        for line in region.lines:
            has_lines = True
            if line.geometry.status == GeometryStatus.UNKNOWN:
                if MissingCapability.LINE_GEOMETRY not in missing:
                    missing.append(MissingCapability.LINE_GEOMETRY)
            for word in line.words:
                has_words = True
                if word.geometry.status == GeometryStatus.UNKNOWN:
                    has_word_geo = False
                if not word.text:
                    has_word_text = False
                if word.confidence is None:
                    has_confidence = False

    # An empty hierarchy cannot satisfy ALTO's word-level model; record the
    # word/line gaps (both are critical below).
    if not has_blocks or not has_lines or not has_words:
        if not has_words:
            missing.append(MissingCapability.WORD_TEXT)
        if not has_lines:
            missing.append(MissingCapability.LINE_GEOMETRY)

    if not has_word_geo:
        if MissingCapability.WORD_GEOMETRY not in missing:
            missing.append(MissingCapability.WORD_GEOMETRY)

    if not has_word_text:
        if MissingCapability.WORD_TEXT not in missing:
            missing.append(MissingCapability.WORD_TEXT)

    # Confidence is nice-to-have: non-critical, so it can only cause PARTIAL.
    if not has_confidence:
        if MissingCapability.CONFIDENCE not in missing:
            missing.append(MissingCapability.CONFIDENCE)

    if not page.reading_order:
        missing.append(MissingCapability.READING_ORDER)

    # Critical gaps downgrade readiness to NONE; any other gap to PARTIAL.
    level = _level_from_missing(missing, critical={
        MissingCapability.PAGE_DIMENSIONS,
        MissingCapability.WORD_TEXT,
        MissingCapability.WORD_GEOMETRY,
        MissingCapability.LINE_GEOMETRY,
    })

    return AltoReadiness(level=level, missing=missing)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def compute_page_pagexml_readiness(page: Page) -> PageXmlReadiness:
    """Compute PAGE XML readiness for a single page.

    PAGE XML is more lenient than ALTO: regions plus lines are usually
    sufficient, and word-level data is a nice-to-have rather than required.
    """
    missing: list[MissingCapability] = []

    def add_once(capability: MissingCapability) -> None:
        # Preserve first-occurrence order without duplicates.
        if capability not in missing:
            missing.append(capability)

    if page.width <= 0 or page.height <= 0:
        missing.append(MissingCapability.PAGE_DIMENSIONS)

    region_count = len(page.text_regions)
    line_seen = False

    for region in page.text_regions:
        if region.geometry.status == GeometryStatus.UNKNOWN:
            add_once(MissingCapability.BLOCK_GEOMETRY)
        for line in region.lines:
            line_seen = True
            if line.geometry.status == GeometryStatus.UNKNOWN:
                add_once(MissingCapability.LINE_GEOMETRY)

    if region_count == 0:
        missing.append(MissingCapability.BLOCK_GEOMETRY)
    if not line_seen:
        add_once(MissingCapability.LINE_GEOMETRY)
    if not page.reading_order:
        missing.append(MissingCapability.READING_ORDER)

    # Only dimensions and block geometry are critical for PAGE XML.
    level = _level_from_missing(missing, critical={
        MissingCapability.PAGE_DIMENSIONS,
        MissingCapability.BLOCK_GEOMETRY,
    })

    return PageXmlReadiness(level=level, missing=missing)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def compute_document_readiness(doc: CanonicalDocument) -> DocumentReadiness:
    """Aggregate per-page ALTO readiness into a document-level readiness."""
    page_levels: list[ReadinessLevel] = [
        compute_page_alto_readiness(page).level for page in doc.pages
    ]

    if not page_levels:
        return DocumentReadiness(level=ReadinessLevel.NONE)

    total = len(page_levels)
    full_pages = sum(1 for level in page_levels if level == ReadinessLevel.FULL)
    none_pages = sum(1 for level in page_levels if level == ReadinessLevel.NONE)

    if full_pages == total:
        overall = ReadinessLevel.FULL
    elif none_pages == total:
        overall = ReadinessLevel.NONE
    elif none_pages > 0:
        # A mix that includes unusable pages counts as degraded.
        overall = ReadinessLevel.DEGRADED
    else:
        overall = ReadinessLevel.PARTIAL

    return DocumentReadiness(level=overall, page_readiness=page_levels)
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _level_from_missing(
    missing: list[MissingCapability],
    critical: set[MissingCapability],
) -> ReadinessLevel:
    """Map missing capabilities to a readiness level.

    No gaps means FULL, any critical gap means NONE, otherwise PARTIAL.
    """
    if not missing:
        return ReadinessLevel.FULL
    if critical.intersection(missing):
        return ReadinessLevel.NONE
    return ReadinessLevel.PARTIAL
|
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Schema validator — validates a dict/JSON against the CanonicalDocument schema.
|
| 2 |
+
|
| 3 |
+
This wraps Pydantic validation as an explicit service, producing a
|
| 4 |
+
ValidationReport rather than raising exceptions.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from typing import Any
|
| 10 |
+
|
| 11 |
+
from pydantic import ValidationError
|
| 12 |
+
|
| 13 |
+
from src.app.domain.errors import Severity, ValidationEntry, ValidationReport
|
| 14 |
+
from src.app.domain.models import CanonicalDocument
|
| 15 |
+
|
| 16 |
+
VALIDATOR_NAME = "schema"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def validate_schema(data: dict[str, Any]) -> tuple[CanonicalDocument | None, ValidationReport]:
    """Validate raw data against the CanonicalDocument schema.

    Returns:
        (document, empty report) when validation succeeds, or
        (None, report) with one ERROR entry per Pydantic error on failure.
    """
    report = ValidationReport()
    try:
        document = CanonicalDocument.model_validate(data)
    except ValidationError as exc:
        for pydantic_error in exc.errors():
            parts = [str(part) for part in pydantic_error["loc"]]
            location = ".".join(parts) if parts else "root"
            report.add(ValidationEntry(
                validator=VALIDATOR_NAME,
                severity=Severity.ERROR,
                path=location,
                message=pydantic_error["msg"],
                code=pydantic_error["type"],
            ))
        return None, report
    else:
        return document, report
|
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Structural validator — checks internal consistency of a CanonicalDocument.
|
| 2 |
+
|
| 3 |
+
Checks:
|
| 4 |
+
- ID uniqueness across the entire document
|
| 5 |
+
- reading_order references existing region IDs
|
| 6 |
+
- bbox containment: word ⊂ line ⊂ region ⊂ page (with tolerance)
|
| 7 |
+
- spatial ordering: words in a line, lines in a region
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
from src.app.domain.errors import Severity, ValidationEntry, ValidationReport
|
| 13 |
+
from src.app.domain.models import CanonicalDocument
|
| 14 |
+
from src.app.geometry.bbox import contains
|
| 15 |
+
|
| 16 |
+
VALIDATOR_NAME = "structural"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def validate_structure(
    doc: CanonicalDocument,
    *,
    bbox_tolerance: float = 5.0,
) -> ValidationReport:
    """Run every structural check on a document and collect the findings.

    Args:
        doc: The document to check.
        bbox_tolerance: Pixels of overflow tolerated by containment checks.

    Returns:
        A ValidationReport accumulating all structural findings.
    """
    report = ValidationReport()
    # Checks run in fixed order so entry order is deterministic.
    for check in (_check_id_uniqueness, _check_reading_order):
        check(doc, report)
    _check_bbox_containment(doc, report, bbox_tolerance)
    return report
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _check_id_uniqueness(doc: CanonicalDocument, report: ValidationReport) -> None:
    """Report an error for every ID that appears more than once in the document."""
    first_seen: dict[str, str] = {}  # id -> path of first occurrence
    for page_idx, page in enumerate(doc.pages):
        page_path = f"pages[{page_idx}]"
        _register_id(page.id, page_path, first_seen, report)
        for region_idx, region in enumerate(page.text_regions):
            region_path = f"{page_path}.text_regions[{region_idx}]"
            _register_id(region.id, region_path, first_seen, report)
            for line_idx, line in enumerate(region.lines):
                line_path = f"{region_path}.lines[{line_idx}]"
                _register_id(line.id, line_path, first_seen, report)
                for word_idx, word in enumerate(line.words):
                    word_path = f"{line_path}.words[{word_idx}]"
                    _register_id(word.id, word_path, first_seen, report)
        for ntr_idx, non_text in enumerate(page.non_text_regions):
            ntr_path = f"{page_path}.non_text_regions[{ntr_idx}]"
            _register_id(non_text.id, ntr_path, first_seen, report)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _register_id(
    node_id: str, path: str, seen: dict[str, str], report: ValidationReport
) -> None:
    """Record node_id at path, reporting a duplicate-ID error when already seen."""
    previous = seen.get(node_id)
    if previous is None:
        seen[node_id] = path
        return
    report.add(ValidationEntry(
        validator=VALIDATOR_NAME,
        severity=Severity.ERROR,
        path=path,
        message=f"Duplicate ID '{node_id}', first seen at {previous}",
        code="duplicate_id",
    ))
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _check_reading_order(doc: CanonicalDocument, report: ValidationReport) -> None:
    """Flag reading_order entries that point at nonexistent region IDs."""
    for page_idx, page in enumerate(doc.pages):
        known_ids = frozenset(region.id for region in page.text_regions)
        for order_idx, referenced in enumerate(page.reading_order):
            if referenced in known_ids:
                continue
            report.add(ValidationEntry(
                validator=VALIDATOR_NAME,
                severity=Severity.ERROR,
                path=f"pages[{page_idx}].reading_order[{order_idx}]",
                message=f"reading_order references unknown region ID '{referenced}'",
                code="invalid_reading_order_ref",
            ))
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _check_bbox_containment(
    doc: CanonicalDocument, report: ValidationReport, tolerance: float
) -> None:
    """Check that child bboxes are contained within parent bboxes.

    Violations are reported as WARNINGs (not errors): overflow is suspicious
    but does not block the pipeline.

    NOTE(review): this reads `geometry.bbox` unconditionally — assumes every
    node has a usable bbox even when its geometry status is UNKNOWN; confirm
    how `contains` behaves if bbox is None/unset.
    """
    for pi, page in enumerate(doc.pages):
        # The page itself acts as the root bounding box at origin (0, 0).
        page_bbox = (0.0, 0.0, page.width, page.height)

        for ri, region in enumerate(page.text_regions):
            rpath = f"pages[{pi}].text_regions[{ri}]"

            # region must fit inside the page (within tolerance pixels)
            if not contains(page_bbox, region.geometry.bbox, tolerance):
                report.add(ValidationEntry(
                    validator=VALIDATOR_NAME,
                    severity=Severity.WARNING,
                    path=rpath,
                    message=f"Region bbox {region.geometry.bbox} exceeds page bounds ({page.width}x{page.height}) beyond tolerance {tolerance}px",
                    code="region_exceeds_page",
                ))

            for li, line in enumerate(region.lines):
                lpath = f"{rpath}.lines[{li}]"

                # line must fit inside its region
                if not contains(region.geometry.bbox, line.geometry.bbox, tolerance):
                    report.add(ValidationEntry(
                        validator=VALIDATOR_NAME,
                        severity=Severity.WARNING,
                        path=lpath,
                        message=f"Line bbox exceeds region bbox beyond tolerance {tolerance}px",
                        code="line_exceeds_region",
                    ))

                for wi, word in enumerate(line.words):
                    wpath = f"{lpath}.words[{wi}]"

                    # word must fit inside its line
                    if not contains(line.geometry.bbox, word.geometry.bbox, tolerance):
                        report.add(ValidationEntry(
                            validator=VALIDATOR_NAME,
                            severity=Severity.WARNING,
                            path=wpath,
                            message=f"Word bbox exceeds line bbox beyond tolerance {tolerance}px",
                            code="word_exceeds_line",
                        ))
|
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for document policy and validation report."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from src.app.domain.errors import Severity, ValidationEntry, ValidationReport
|
| 6 |
+
from src.app.policies.document_policy import (
|
| 7 |
+
DocumentPolicy,
|
| 8 |
+
PolicyMode,
|
| 9 |
+
permissive_policy,
|
| 10 |
+
strict_policy,
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class TestDocumentPolicy:
    """Contracts of the three policy modes (strict / standard / permissive)."""

    def test_default_is_standard(self) -> None:
        policy = DocumentPolicy()
        assert policy.mode == PolicyMode.STANDARD
        assert not policy.strict_mode

    def test_strict(self) -> None:
        policy = strict_policy()
        assert policy.strict_mode
        assert not policy.allow_bbox_inference
        assert not policy.allow_partial_alto

    def test_permissive(self) -> None:
        policy = permissive_policy()
        assert policy.mode == PolicyMode.PERMISSIVE
        assert policy.allow_bbox_inference
        assert policy.bbox_containment_tolerance == 10.0

    def test_never_allows_text_invention(self) -> None:
        # Text invention must be forbidden in every mode — no exceptions.
        for make in (DocumentPolicy, strict_policy, permissive_policy):
            assert make().allow_text_invention is False

    def test_never_allows_bbox_invention(self) -> None:
        # Bbox invention is likewise forbidden across all modes.
        for make in (DocumentPolicy, strict_policy, permissive_policy):
            assert make().allow_bbox_invention is False

    def test_frozen(self) -> None:
        import pytest
        from pydantic import ValidationError

        policy = DocumentPolicy()
        with pytest.raises(ValidationError):
            policy.mode = PolicyMode.STRICT  # type: ignore[misc]
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class TestValidationReport:
    """ValidationReport accounting: validity, counts, merge, filtering."""

    @staticmethod
    def _entry(validator: str, severity: Severity, path: str, message: str) -> ValidationEntry:
        """Build a ValidationEntry without repeating keyword boilerplate."""
        return ValidationEntry(
            validator=validator, severity=severity, path=path, message=message,
        )

    def test_empty_is_valid(self) -> None:
        report = ValidationReport()
        assert report.is_valid
        assert report.error_count == 0
        assert report.warning_count == 0

    def test_with_error(self) -> None:
        report = ValidationReport()
        report.add(self._entry("test", Severity.ERROR, "pages[0]", "bad"))
        assert not report.is_valid
        assert report.error_count == 1

    def test_warnings_dont_invalidate(self) -> None:
        report = ValidationReport()
        report.add(self._entry("test", Severity.WARNING, "pages[0]", "meh"))
        assert report.is_valid
        assert report.warning_count == 1

    def test_merge(self) -> None:
        left = ValidationReport()
        left.add(self._entry("a", Severity.ERROR, "x", "e1"))
        right = ValidationReport()
        right.add(self._entry("b", Severity.WARNING, "y", "w1"))
        left.merge(right)
        assert left.error_count == 1
        assert left.warning_count == 1
        assert len(left.entries) == 2

    def test_errors_property(self) -> None:
        report = ValidationReport()
        report.add(self._entry("a", Severity.ERROR, "x", "e"))
        report.add(self._entry("b", Severity.WARNING, "y", "w"))
        report.add(self._entry("c", Severity.INFO, "z", "i"))
        assert len(report.errors) == 1
        assert len(report.warnings) == 1
|
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for export eligibility and export policy."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from src.app.domain.models import (
|
| 6 |
+
AltoReadiness,
|
| 7 |
+
CanonicalDocument,
|
| 8 |
+
EvidenceType,
|
| 9 |
+
Geometry,
|
| 10 |
+
GeometryStatus,
|
| 11 |
+
Page,
|
| 12 |
+
PageXmlReadiness,
|
| 13 |
+
Provenance,
|
| 14 |
+
ReadinessLevel,
|
| 15 |
+
Source,
|
| 16 |
+
TextLine,
|
| 17 |
+
TextRegion,
|
| 18 |
+
Word,
|
| 19 |
+
)
|
| 20 |
+
from src.app.domain.models.status import InputType
|
| 21 |
+
from src.app.policies.document_policy import DocumentPolicy, strict_policy
|
| 22 |
+
from src.app.policies.export_policy import check_alto_export, check_page_export
|
| 23 |
+
from src.app.validators.export_eligibility_validator import compute_export_eligibility
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _prov() -> Provenance:
    """Minimal provider-native provenance stub shared by all fixtures."""
    fields = {
        "provider": "test",
        "adapter": "v1",
        "source_ref": "$",
        "evidence_type": EvidenceType.PROVIDER_NATIVE,
    }
    return Provenance(**fields)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _geo() -> Geometry:
    """Exact-status geometry with a fixed test bbox."""
    box = (10, 10, 100, 30)
    return Geometry(status=GeometryStatus.EXACT, bbox=box)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _complete_doc() -> CanonicalDocument:
    """Single-page document with full readiness and one confident word."""
    word = Word(id="w1", text="Hello", geometry=_geo(),
                provenance=_prov(), confidence=0.95)
    line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
    region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(), lines=[line])
    page = Page(
        id="p1", page_index=0, width=2480, height=3508,
        alto_readiness=AltoReadiness(level=ReadinessLevel.FULL),
        page_readiness=PageXmlReadiness(level=ReadinessLevel.FULL),
        reading_order=["tb1"],
        text_regions=[region],
    )
    return CanonicalDocument(
        document_id="test",
        source=Source(input_type=InputType.IMAGE),
        pages=[page],
    )
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _empty_doc() -> CanonicalDocument:
    """Document with a single blank page: no regions, no readiness info."""
    blank_page = Page(id="p1", page_index=0, width=2480, height=3508)
    return CanonicalDocument(
        document_id="test",
        source=Source(input_type=InputType.IMAGE),
        pages=[blank_page],
    )
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class TestExportEligibility:
    """compute_export_eligibility over complete, empty, and partial docs."""

    def test_complete_doc_full_eligible(self) -> None:
        eligibility = compute_export_eligibility(_complete_doc())
        assert eligibility.alto_export == ReadinessLevel.FULL
        assert eligibility.page_export == ReadinessLevel.FULL
        assert eligibility.viewer_render == ReadinessLevel.FULL

    def test_empty_doc_none(self) -> None:
        eligibility = compute_export_eligibility(_empty_doc())
        assert eligibility.alto_export == ReadinessLevel.NONE
        assert eligibility.page_export == ReadinessLevel.NONE
        assert eligibility.viewer_render == ReadinessLevel.NONE

    def test_strict_policy_downgrades_partial(self) -> None:
        # A doc with missing confidence → partial ALTO readiness.
        word = Word(id="w1", text="Hello", geometry=_geo(),
                    provenance=_prov(), confidence=None)
        line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
        region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(), lines=[line])
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=[Page(
                id="p1", page_index=0, width=2480, height=3508,
                reading_order=["tb1"],
                text_regions=[region],
            )],
        )
        eligibility = compute_export_eligibility(doc, strict_policy())
        # Strict mode downgrades partial to none.
        assert eligibility.alto_export == ReadinessLevel.NONE

    def test_viewer_degraded_for_regions_without_exports(self) -> None:
        # Unknown word geometry: ALTO becomes impossible, yet the viewer
        # can still render region/line boxes.
        word = Word(
            id="w1", text="Hello",
            geometry=Geometry(bbox=(10, 10, 100, 30), status=GeometryStatus.UNKNOWN),
            provenance=_prov(),
        )
        line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
        region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(), lines=[line])
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=[Page(
                id="p1", page_index=0, width=2480, height=3508,
                text_regions=[region],
            )],
        )
        eligibility = compute_export_eligibility(doc)
        assert eligibility.viewer_render in (ReadinessLevel.FULL, ReadinessLevel.DEGRADED)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
class TestExportPolicy:
    """Go/no-go export decisions derived from eligibility + policy.

    The "partial" document (a word missing its confidence) was previously
    built inline, verbatim, in two separate tests; it is extracted into
    ``_partial_doc`` so the fixture cannot drift between them.
    """

    @staticmethod
    def _partial_doc() -> CanonicalDocument:
        """One-page doc whose single word lacks confidence → ALTO partial."""
        word = Word(id="w1", text="Hello", geometry=_geo(),
                    provenance=_prov(), confidence=None)
        line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
        region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(), lines=[line])
        return CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=[Page(
                id="p1", page_index=0, width=2480, height=3508,
                reading_order=["tb1"],
                text_regions=[region],
            )],
        )

    def test_alto_allowed_full(self) -> None:
        elig = compute_export_eligibility(_complete_doc())
        decision = check_alto_export(elig)
        assert decision.allowed is True
        assert decision.reason == "OK"

    def test_alto_refused_none(self) -> None:
        elig = compute_export_eligibility(_empty_doc())
        decision = check_alto_export(elig)
        assert decision.allowed is False
        assert "not possible" in decision.reason

    def test_alto_partial_default_allowed(self) -> None:
        # Default (standard) policy tolerates a partial ALTO export.
        elig = compute_export_eligibility(self._partial_doc())
        decision = check_alto_export(elig)
        assert decision.allowed is True

    def test_alto_partial_strict_refused(self) -> None:
        # Strict policy refuses anything short of full eligibility.
        policy = strict_policy()
        elig = compute_export_eligibility(self._partial_doc(), policy)
        decision = check_alto_export(elig, policy)
        assert decision.allowed is False

    def test_page_allowed_full(self) -> None:
        elig = compute_export_eligibility(_complete_doc())
        decision = check_page_export(elig)
        assert decision.allowed is True

    def test_page_refused_none(self) -> None:
        elig = compute_export_eligibility(_empty_doc())
        decision = check_page_export(elig)
        assert decision.allowed is False
|
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the readiness validator."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from src.app.domain.models import (
|
| 6 |
+
AltoReadiness,
|
| 7 |
+
CanonicalDocument,
|
| 8 |
+
EvidenceType,
|
| 9 |
+
Geometry,
|
| 10 |
+
GeometryStatus,
|
| 11 |
+
Page,
|
| 12 |
+
PageXmlReadiness,
|
| 13 |
+
Provenance,
|
| 14 |
+
ReadinessLevel,
|
| 15 |
+
Source,
|
| 16 |
+
TextLine,
|
| 17 |
+
TextRegion,
|
| 18 |
+
Word,
|
| 19 |
+
)
|
| 20 |
+
from src.app.domain.models.status import InputType, MissingCapability
|
| 21 |
+
from src.app.validators.readiness_validator import (
|
| 22 |
+
compute_document_readiness,
|
| 23 |
+
compute_page_alto_readiness,
|
| 24 |
+
compute_page_pagexml_readiness,
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _prov() -> Provenance:
    """Minimal provider-native provenance stub shared by all fixtures."""
    fields = {
        "provider": "test",
        "adapter": "v1",
        "source_ref": "$",
        "evidence_type": EvidenceType.PROVIDER_NATIVE,
    }
    return Provenance(**fields)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _geo(status: GeometryStatus = GeometryStatus.EXACT) -> Geometry:
    """Fixed test bbox with a configurable geometry status."""
    box = (10, 10, 100, 30)
    return Geometry(status=status, bbox=box)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _complete_page() -> Page:
    """A fully complete page with all data."""
    word = Word(id="w1", text="Hello", geometry=_geo(),
                provenance=_prov(), confidence=0.95)
    line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
    region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(),
                        lang="fra", lines=[line])
    return Page(
        id="p1", page_index=0, width=2480, height=3508,
        alto_readiness=AltoReadiness(level=ReadinessLevel.FULL),
        page_readiness=PageXmlReadiness(level=ReadinessLevel.FULL),
        reading_order=["tb1"],
        text_regions=[region],
    )
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class TestAltoReadiness:
    """ALTO readiness: word text + geometry are required; missing
    confidence or reading order only degrades the level to PARTIAL.

    The three "one word on a page" tests previously duplicated the full
    region/line scaffolding; it is extracted into ``_one_word_page`` so
    each test states only what actually varies.
    """

    @staticmethod
    def _one_word_page(word: Word, reading_order: list[str] | None = None) -> Page:
        """Wrap a single word in region tb1 / line tl1 on a standard page.

        ``reading_order`` is only passed through when given, so the
        Page default is preserved for tests that omit it.
        """
        region = TextRegion(
            id="tb1", geometry=_geo(), provenance=_prov(),
            lines=[TextLine(id="tl1", geometry=_geo(), provenance=_prov(),
                            words=[word])],
        )
        kwargs: dict = dict(id="p1", page_index=0, width=2480, height=3508,
                            text_regions=[region])
        if reading_order is not None:
            kwargs["reading_order"] = reading_order
        return Page(**kwargs)

    def test_complete_page_is_full(self) -> None:
        r = compute_page_alto_readiness(_complete_page())
        assert r.level == ReadinessLevel.FULL
        assert r.missing == []

    def test_missing_word_geometry_is_none(self) -> None:
        # ALTO cannot place a word without geometry → level NONE.
        word = Word(id="w1", text="Hello",
                    geometry=_geo(GeometryStatus.UNKNOWN), provenance=_prov())
        r = compute_page_alto_readiness(self._one_word_page(word))
        assert r.level == ReadinessLevel.NONE
        assert MissingCapability.WORD_GEOMETRY in r.missing

    def test_missing_confidence_is_partial(self) -> None:
        word = Word(id="w1", text="Hello", geometry=_geo(),
                    provenance=_prov(), confidence=None)
        r = compute_page_alto_readiness(
            self._one_word_page(word, reading_order=["tb1"]))
        assert r.level == ReadinessLevel.PARTIAL
        assert MissingCapability.CONFIDENCE in r.missing

    def test_no_reading_order_is_partial(self) -> None:
        word = Word(id="w1", text="Hello", geometry=_geo(),
                    provenance=_prov(), confidence=0.9)
        r = compute_page_alto_readiness(
            self._one_word_page(word, reading_order=[]))
        assert r.level == ReadinessLevel.PARTIAL
        assert MissingCapability.READING_ORDER in r.missing

    def test_empty_page_is_none(self) -> None:
        page = Page(id="p1", page_index=0, width=2480, height=3508)
        r = compute_page_alto_readiness(page)
        assert r.level == ReadinessLevel.NONE
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
class TestPageXmlReadiness:
    """PAGE XML readiness is more lenient than ALTO: regions + lines suffice."""

    def test_complete_page_is_full(self) -> None:
        result = compute_page_pagexml_readiness(_complete_page())
        assert result.level == ReadinessLevel.FULL

    def test_no_regions_is_none(self) -> None:
        bare = Page(id="p1", page_index=0, width=2480, height=3508)
        result = compute_page_pagexml_readiness(bare)
        assert result.level == ReadinessLevel.NONE

    def test_regions_without_word_geo_still_ok(self) -> None:
        """PAGE XML is more lenient — word geometry is not critical."""
        word = Word(id="w1", text="Hello",
                    geometry=_geo(GeometryStatus.UNKNOWN), provenance=_prov())
        line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
        region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(), lines=[line])
        page = Page(
            id="p1", page_index=0, width=2480, height=3508,
            reading_order=["tb1"],
            text_regions=[region],
        )
        result = compute_page_pagexml_readiness(page)
        # PAGE doesn't require word geometry — should still be achievable.
        assert result.level in (ReadinessLevel.FULL, ReadinessLevel.PARTIAL)
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
class TestDocumentReadiness:
    """Document-level readiness aggregates per-page readiness."""

    @staticmethod
    def _doc(pages: list[Page]) -> CanonicalDocument:
        """Wrap pages in a minimal image-sourced document."""
        return CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=pages,
        )

    def test_single_full_page(self) -> None:
        readiness = compute_document_readiness(self._doc([_complete_page()]))
        assert readiness.level == ReadinessLevel.FULL

    def test_mixed_pages(self) -> None:
        # One full page plus one empty page → document is DEGRADED.
        pages = [
            _complete_page(),
            Page(id="p2", page_index=1, width=2480, height=3508),
        ]
        readiness = compute_document_readiness(self._doc(pages))
        assert readiness.level == ReadinessLevel.DEGRADED
        assert len(readiness.page_readiness) == 2
|
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the schema validator."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from src.app.validators.schema_validator import validate_schema
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class TestSchemaValidator:
    """validate_schema returns (document | None, report) instead of raising.

    The raw provenance payload was previously repeated verbatim at three
    nesting levels of ``test_valid_document``; it is extracted into
    ``_prov_dict`` so a schema change only has to be reflected once.
    """

    @staticmethod
    def _prov_dict() -> dict:
        """Minimal raw provenance payload reused across nesting levels."""
        return {
            "provider": "test", "adapter": "v1",
            "source_ref": "$", "evidence_type": "provider_native",
            "derived_from": [],
        }

    def test_valid_document(self) -> None:
        data = {
            "schema_version": "1.0.0",
            "document_id": "doc1",
            "source": {"input_type": "image"},
            "pages": [{
                "id": "p1",
                "page_index": 0,
                "width": 2480,
                "height": 3508,
                "text_regions": [{
                    "id": "tb1",
                    "geometry": {"bbox": [100, 200, 300, 50], "status": "exact"},
                    "provenance": self._prov_dict(),
                    "lines": [{
                        "id": "tl1",
                        "geometry": {"bbox": [100, 200, 300, 50], "status": "exact"},
                        "provenance": self._prov_dict(),
                        "words": [{
                            "id": "w1",
                            "text": "Hello",
                            "geometry": {"bbox": [100, 200, 50, 30], "status": "exact"},
                            "provenance": self._prov_dict(),
                        }],
                    }],
                }],
            }],
        }
        doc, report = validate_schema(data)
        assert doc is not None
        assert report.is_valid

    def test_missing_required_field(self) -> None:
        # document_id omitted entirely.
        doc, report = validate_schema({
            "source": {"input_type": "image"},
            "pages": [],
        })
        assert doc is None
        assert not report.is_valid
        assert report.error_count > 0

    def test_invalid_schema_version(self) -> None:
        doc, report = validate_schema({
            "schema_version": "bad",
            "document_id": "doc1",
            "source": {"input_type": "image"},
            "pages": [{"id": "p1", "page_index": 0, "width": 100, "height": 100}],
        })
        assert doc is None
        assert not report.is_valid

    def test_empty_pages(self) -> None:
        doc, report = validate_schema({
            "document_id": "doc1",
            "source": {"input_type": "image"},
            "pages": [],
        })
        assert doc is None
        assert report.error_count > 0

    def test_error_paths_populated(self) -> None:
        doc, report = validate_schema({
            "document_id": "",
            "source": {"input_type": "image"},
            "pages": [],
        })
        assert doc is None
        # Every error entry must carry a path, a message, and the
        # validator name so callers can render actionable diagnostics.
        for entry in report.errors:
            assert entry.path
            assert entry.message
            assert entry.validator == "schema"
|
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the structural validator."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from src.app.domain.models import (
|
| 6 |
+
AltoReadiness,
|
| 7 |
+
CanonicalDocument,
|
| 8 |
+
EvidenceType,
|
| 9 |
+
Geometry,
|
| 10 |
+
GeometryStatus,
|
| 11 |
+
NonTextRegion,
|
| 12 |
+
Page,
|
| 13 |
+
PageXmlReadiness,
|
| 14 |
+
Provenance,
|
| 15 |
+
ReadinessLevel,
|
| 16 |
+
Source,
|
| 17 |
+
TextLine,
|
| 18 |
+
TextRegion,
|
| 19 |
+
Word,
|
| 20 |
+
)
|
| 21 |
+
from src.app.domain.models.status import BlockRole, InputType, NonTextKind
|
| 22 |
+
from src.app.validators.structural_validator import validate_structure
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _prov() -> Provenance:
    """Minimal provider-native provenance stub shared by all fixtures."""
    fields = {
        "provider": "test",
        "adapter": "v1",
        "source_ref": "$",
        "evidence_type": EvidenceType.PROVIDER_NATIVE,
    }
    return Provenance(**fields)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _geo(x: float, y: float, w: float, h: float) -> Geometry:
    """Exact geometry from x / y / width / height."""
    return Geometry(status=GeometryStatus.EXACT, bbox=(x, y, w, h))
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _word(wid: str, x: float, y: float, w: float, h: float) -> Word:
    """Word fixture with fixed text at the given bbox."""
    return Word(id=wid, provenance=_prov(), geometry=_geo(x, y, w, h), text="word")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _line(lid: str, x: float, y: float, w: float, h: float, words: list[Word]) -> TextLine:
    """TextLine fixture wrapping the given words at the given bbox."""
    return TextLine(id=lid, provenance=_prov(), geometry=_geo(x, y, w, h), words=words)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _region(rid: str, x: float, y: float, w: float, h: float, lines: list[TextLine]) -> TextRegion:
    """TextRegion fixture wrapping the given lines at the given bbox."""
    return TextRegion(id=rid, provenance=_prov(), geometry=_geo(x, y, w, h), lines=lines)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def _doc(regions: list[TextRegion], width: float = 1000, height: float = 1000,
         reading_order: list[str] | None = None,
         non_text: list[NonTextRegion] | None = None) -> CanonicalDocument:
    """One-page document fixture.

    When ``reading_order`` is not supplied, it defaults to the regions'
    own order; ``non_text`` defaults to no non-text regions.
    """
    if reading_order is None:
        reading_order = [region.id for region in regions]
    page = Page(
        id="p1", page_index=0, width=width, height=height,
        alto_readiness=AltoReadiness(level=ReadinessLevel.FULL),
        page_readiness=PageXmlReadiness(level=ReadinessLevel.FULL),
        reading_order=reading_order,
        text_regions=regions,
        non_text_regions=non_text or [],
    )
    return CanonicalDocument(
        document_id="test",
        source=Source(input_type=InputType.IMAGE),
        pages=[page],
    )
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class TestIdUniqueness:
    """IDs must be unique across every structural level of a page."""

    @staticmethod
    def _simple_regions() -> list[TextRegion]:
        """One region / one line / one word, all with distinct IDs."""
        return [_region("tb1", 0, 0, 500, 200, [
            _line("tl1", 0, 0, 500, 40, [_word("w1", 0, 0, 50, 30)]),
        ])]

    def test_all_unique_passes(self) -> None:
        report = validate_structure(_doc(self._simple_regions()))
        assert report.is_valid

    def test_duplicate_word_ids(self) -> None:
        regions = [_region("tb1", 0, 0, 500, 200, [
            _line("tl1", 0, 0, 500, 40, [
                _word("w1", 0, 0, 50, 30),
                _word("w1", 60, 0, 50, 30),  # duplicate
            ]),
        ])]
        report = validate_structure(_doc(regions))
        assert not report.is_valid
        assert any("Duplicate ID 'w1'" in e.message for e in report.errors)

    def test_duplicate_across_levels(self) -> None:
        # A line sharing its ID with its own region is still a duplicate.
        regions = [_region("same_id", 0, 0, 500, 200, [
            _line("same_id", 0, 0, 500, 40, [_word("w1", 0, 0, 50, 30)]),
        ])]
        report = validate_structure(_doc(regions))
        assert not report.is_valid

    def test_duplicate_with_non_text_region(self) -> None:
        # Non-text regions share the same ID namespace as text regions.
        ntr = NonTextRegion(
            id="tb1", kind=NonTextKind.ILLUSTRATION,
            geometry=_geo(600, 0, 100, 100), provenance=_prov(),
        )
        report = validate_structure(_doc(self._simple_regions(), non_text=[ntr]))
        assert not report.is_valid
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
class TestReadingOrder:
    """reading_order entries must reference existing region IDs."""

    @staticmethod
    def _regions() -> list[TextRegion]:
        """Single region tb1 with one line and one word."""
        return [_region("tb1", 0, 0, 500, 200, [
            _line("tl1", 0, 0, 500, 40, [_word("w1", 0, 0, 50, 30)]),
        ])]

    def test_valid_references(self) -> None:
        report = validate_structure(_doc(self._regions(), reading_order=["tb1"]))
        assert report.is_valid

    def test_invalid_reference(self) -> None:
        report = validate_structure(
            _doc(self._regions(), reading_order=["tb1", "tb_nonexistent"]))
        assert not report.is_valid
        assert any("unknown region ID" in e.message for e in report.errors)
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
class TestBboxContainment:
    """Word ⊂ line ⊂ region ⊂ page, within a configurable pixel tolerance."""

    @staticmethod
    def _nested(region_box: tuple[float, float, float, float],
                line_box: tuple[float, float, float, float],
                word_box: tuple[float, float, float, float]) -> list[TextRegion]:
        """One region/line/word chain with explicit boxes at each level."""
        rx, ry, rw, rh = region_box
        lx, ly, lw, lh = line_box
        wx, wy, ww, wh = word_box
        return [_region("tb1", rx, ry, rw, rh, [
            _line("tl1", lx, ly, lw, lh, [_word("w1", wx, wy, ww, wh)]),
        ])]

    def test_all_contained_passes(self) -> None:
        doc = _doc(self._nested((10, 10, 200, 100), (20, 20, 150, 30), (25, 22, 50, 25)))
        report = validate_structure(doc)
        assert report.warning_count == 0

    def test_word_exceeds_line(self) -> None:
        # Word is twice as wide as its line.
        doc = _doc(self._nested((10, 10, 400, 100), (20, 20, 100, 30), (20, 20, 200, 30)))
        report = validate_structure(doc, bbox_tolerance=0)
        assert report.warning_count > 0
        assert any("word_exceeds_line" in (e.code or "") for e in report.warnings)

    def test_tolerance_allows_small_overflow(self) -> None:
        # 3px overflow sits inside the 5px tolerance.
        doc = _doc(self._nested((10, 10, 200, 100), (20, 20, 100, 30), (20, 20, 103, 30)))
        report = validate_structure(doc, bbox_tolerance=5)
        assert report.warning_count == 0

    def test_tolerance_rejects_large_overflow(self) -> None:
        # 20px overflow exceeds the 5px tolerance.
        doc = _doc(self._nested((10, 10, 200, 100), (20, 20, 100, 30), (20, 20, 120, 30)))
        report = validate_structure(doc, bbox_tolerance=5)
        assert report.warning_count > 0

    def test_region_exceeds_page(self) -> None:
        # Region at (900, 900) sized 200x200 spills past the 1000x1000 page.
        doc = _doc(
            self._nested((900, 900, 200, 200), (900, 900, 100, 30), (900, 900, 50, 25)),
            width=1000, height=1000,
        )
        report = validate_structure(doc, bbox_tolerance=0)
        assert any("region_exceeds_page" in (e.code or "") for e in report.warnings)

    def test_line_exceeds_region(self) -> None:
        # Line is twice as wide as its region.
        doc = _doc(self._nested((10, 10, 100, 50), (10, 10, 200, 30), (10, 10, 50, 25)))
        report = validate_structure(doc, bbox_tolerance=0)
        assert any("line_exceeds_region" in (e.code or "") for e in report.warnings)
|