Claude committed on
Commit
e2ec8a2
·
unverified ·
1 Parent(s): 1cbec06

Sprint 4: validation layer and document policy

Browse files

Four validators and two policy modules that control the pipeline:

Validators:
- structural_validator: ID uniqueness, reading_order ref validity, bbox
containment (word ⊂ line ⊂ region ⊂ page) with configurable tolerance
- readiness_validator: computes AltoReadiness / PageXmlReadiness per page
and DocumentReadiness at document level. ALTO requires word text+geometry;
PAGE XML is more lenient (regions+lines sufficient)
- export_eligibility_validator: produces ExportEligibility (alto/page/viewer
each independently full/partial/none). Strict policy downgrades partial→none
- schema_validator: wraps Pydantic validation as explicit service, returns
(document | None, ValidationReport) instead of raising exceptions

Policies:
- document_policy: centralised business rules (3 modes: strict/standard/
permissive). Controls what the system may infer, repair, or export.
Text invention and bbox invention always forbidden
- export_policy: go/no-go decisions per format, consuming eligibility + policy

Infrastructure:
- ValidationReport with entries (validator, severity, path, message, code)
- Severity enum (error/warning/info)
- is_valid, error_count, warning_count, merge()

48 new tests, 331 total passing.

https://claude.ai/code/session_01Cuzvc9Pjfo5u46eT3ta2Cg

src/app/domain/errors/__init__.py CHANGED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Domain errors for validation and export."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import Enum
6
+
7
+ from pydantic import BaseModel, ConfigDict, Field
8
+
9
+
10
class Severity(str, Enum):
    """How serious a validation finding is.

    Mixes in ``str`` so severities serialize naturally (e.g. in JSON
    reports) and compare equal to their plain-string values.
    """

    ERROR = "error"
    WARNING = "warning"
    INFO = "info"
16
+
17
+
18
class ValidationEntry(BaseModel):
    """A single validation finding.

    Immutable value object emitted by one validator and collected into a
    ValidationReport. ``validator`` names the emitting validator,
    ``path`` locates the finding inside the document tree, and ``code``
    is an optional machine-readable identifier (e.g. "duplicate_id").
    """

    # Frozen: entries are value objects and must never change after creation.
    model_config = ConfigDict(frozen=True)

    validator: str = Field(min_length=1)
    severity: Severity
    path: str = Field(min_length=1, description="Path in the document, e.g. pages[0].text_regions[1].lines[3]")
    message: str = Field(min_length=1)
    code: str | None = None
28
+
29
+
30
class ValidationReport(BaseModel):
    """Aggregated results from all validators.

    Mutable accumulator of ValidationEntry records. A report is valid
    as long as it contains no ERROR-severity entries; warnings and info
    entries never invalidate it.
    """

    entries: list[ValidationEntry] = Field(default_factory=list)

    def _with_severity(self, wanted: Severity) -> list[ValidationEntry]:
        """Return all entries whose severity matches *wanted*."""
        return [entry for entry in self.entries if entry.severity == wanted]

    @property
    def errors(self) -> list[ValidationEntry]:
        return self._with_severity(Severity.ERROR)

    @property
    def warnings(self) -> list[ValidationEntry]:
        return self._with_severity(Severity.WARNING)

    @property
    def is_valid(self) -> bool:
        # Only errors invalidate a report.
        return not self.errors

    @property
    def error_count(self) -> int:
        return len(self.errors)

    @property
    def warning_count(self) -> int:
        return len(self.warnings)

    def add(self, entry: ValidationEntry) -> None:
        """Append a single finding to the report."""
        self.entries.append(entry)

    def merge(self, other: ValidationReport) -> None:
        """Absorb all findings from *other* into this report."""
        self.entries.extend(other.entries)
src/app/policies/document_policy.py CHANGED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Document policy — centralised business rules for the pipeline.
2
+
3
+ This layer prevents critical decisions from being scattered across
4
+ adapters, validators, and serializers. A policy is a named configuration
5
+ that controls what the system may or may not do.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from enum import Enum
11
+
12
+ from pydantic import BaseModel, ConfigDict
13
+
14
+
15
class PolicyMode(str, Enum):
    """Named policy presets, ordered from most to least restrictive."""

    STRICT = "strict"
    STANDARD = "standard"
    PERMISSIVE = "permissive"
21
+
22
+
23
class DocumentPolicy(BaseModel):
    """Concrete policy controlling pipeline behaviour.

    Immutable (frozen) configuration consumed by validators, enrichers,
    and export checks. The field defaults correspond to STANDARD mode;
    strict_policy() / permissive_policy() build the other presets.
    Note that allow_text_invention and allow_bbox_invention default to
    False and are never enabled by any preset in this module.
    """

    model_config = ConfigDict(frozen=True)

    mode: PolicyMode = PolicyMode.STANDARD

    # -- Text rules -----------------------------------------------------------
    allow_text_invention: bool = False
    """Never invent text that wasn't in the provider output."""

    # -- Geometry rules -------------------------------------------------------
    allow_polygon_to_bbox: bool = True
    """Allow deriving bbox from polygon (enricher)."""

    allow_bbox_inference: bool = True
    """Allow inferring bbox from context (e.g. line bbox from word bboxes)."""

    allow_bbox_invention: bool = False
    """Never invent bbox without any geometric basis."""

    # -- Language rules -------------------------------------------------------
    allow_lang_propagation: bool = True
    """Allow propagating language from parent to child nodes."""

    # -- Export rules ---------------------------------------------------------
    require_lines_for_alto: bool = True
    """ALTO export requires at least line-level geometry."""

    require_words_for_alto: bool = True
    """ALTO export requires word-level text and geometry."""

    allow_partial_alto: bool = True
    """Allow ALTO export with partial readiness."""

    allow_partial_page: bool = True
    """Allow PAGE export with partial readiness."""

    # -- Enricher rules -------------------------------------------------------
    allow_reading_order_inference: bool = True
    """Allow inferring reading order from spatial position."""

    allow_hyphenation_detection: bool = True
    """Allow detecting word hyphenation at line boundaries."""

    # -- Tolerance ------------------------------------------------------------
    bbox_containment_tolerance: float = 5.0
    """Pixels of allowed overflow for bbox containment checks."""

    @property
    def strict_mode(self) -> bool:
        """True when this policy uses the STRICT preset."""
        return self.mode == PolicyMode.STRICT
75
+
76
+
77
def strict_policy() -> DocumentPolicy:
    """Build the STRICT preset: no inference, no partial exports."""
    # Only the deviations from the STANDARD defaults are listed.
    overrides = dict(
        mode=PolicyMode.STRICT,
        allow_bbox_inference=False,
        allow_partial_alto=False,
        allow_partial_page=False,
        allow_reading_order_inference=False,
        allow_hyphenation_detection=False,
    )
    return DocumentPolicy(**overrides)
87
+
88
+
89
def permissive_policy() -> DocumentPolicy:
    """Build the PERMISSIVE preset: inference and partial exports allowed."""
    # Only the deviations from the STANDARD defaults are listed; the
    # geometry containment tolerance is doubled (5px -> 10px).
    overrides = dict(
        mode=PolicyMode.PERMISSIVE,
        allow_bbox_inference=True,
        allow_partial_alto=True,
        allow_partial_page=True,
        bbox_containment_tolerance=10.0,
    )
    return DocumentPolicy(**overrides)
src/app/policies/export_policy.py CHANGED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Export policy — decides whether a specific export should proceed.
2
+
3
+ Uses the document policy and export eligibility to make a final go/no-go
4
+ decision for each export format.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+
11
+ from src.app.domain.models.readiness import ExportEligibility
12
+ from src.app.domain.models.status import ReadinessLevel
13
+ from src.app.policies.document_policy import DocumentPolicy
14
+
15
+
16
@dataclass(frozen=True)
class ExportDecision:
    """Outcome of an export policy check.

    Attributes:
        allowed: whether the export should proceed.
        level: the readiness level the decision was based on.
        reason: human-readable explanation ("OK" when allowed).
    """

    allowed: bool
    level: ReadinessLevel
    reason: str
23
+
24
+
25
def check_alto_export(
    eligibility: ExportEligibility,
    policy: DocumentPolicy | None = None,
) -> ExportDecision:
    """Decide whether ALTO export should proceed.

    Args:
        eligibility: Pre-computed export eligibility for the document.
        policy: Document policy; defaults to the STANDARD policy.

    Returns:
        An ExportDecision; refused when readiness is NONE or DEGRADED,
        or PARTIAL while the policy forbids partial ALTO exports.
    """
    effective = DocumentPolicy() if policy is None else policy
    level = eligibility.alto_export

    # Collect the first applicable refusal reason, if any.
    refusal: str | None = None
    if level == ReadinessLevel.NONE:
        refusal = "ALTO export not possible: missing required data (word text/geometry or line geometry)"
    elif level == ReadinessLevel.PARTIAL and not effective.allow_partial_alto:
        refusal = "ALTO export is partial but policy does not allow partial exports"
    elif level == ReadinessLevel.DEGRADED:
        refusal = "ALTO export is degraded: too much data missing"

    if refusal is not None:
        return ExportDecision(allowed=False, level=level, reason=refusal)
    return ExportDecision(allowed=True, level=level, reason="OK")
57
+
58
+
59
def check_page_export(
    eligibility: ExportEligibility,
    policy: DocumentPolicy | None = None,
) -> ExportDecision:
    """Decide whether PAGE XML export should proceed.

    Args:
        eligibility: Pre-computed export eligibility for the document.
        policy: Document policy; defaults to the STANDARD policy.

    Returns:
        An ExportDecision; refused when readiness is NONE or DEGRADED,
        or PARTIAL while the policy forbids partial PAGE exports.
    """
    effective = DocumentPolicy() if policy is None else policy
    level = eligibility.page_export

    # Collect the first applicable refusal reason, if any.
    refusal: str | None = None
    if level == ReadinessLevel.NONE:
        refusal = "PAGE export not possible: missing required data"
    elif level == ReadinessLevel.PARTIAL and not effective.allow_partial_page:
        refusal = "PAGE export is partial but policy does not allow partial exports"
    elif level == ReadinessLevel.DEGRADED:
        refusal = "PAGE export is degraded: too much data missing"

    if refusal is not None:
        return ExportDecision(allowed=False, level=level, reason=refusal)
    return ExportDecision(allowed=True, level=level, reason="OK")
src/app/validators/__init__.py CHANGED
@@ -0,0 +1 @@
 
 
1
+ """Validators — structural, readiness, schema, export eligibility."""
src/app/validators/export_eligibility_validator.py CHANGED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Export eligibility validator — decides what can be exported.
2
+
3
+ Consumes readiness assessments and document policy to produce
4
+ an ExportEligibility decision for the whole document.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from src.app.domain.models import CanonicalDocument
10
+ from src.app.domain.models.readiness import ExportEligibility
11
+ from src.app.domain.models.status import ReadinessLevel
12
+ from src.app.policies.document_policy import DocumentPolicy
13
+ from src.app.validators.readiness_validator import (
14
+ compute_page_alto_readiness,
15
+ compute_page_pagexml_readiness,
16
+ )
17
+
18
+
19
def compute_export_eligibility(
    doc: CanonicalDocument,
    policy: DocumentPolicy | None = None,
) -> ExportEligibility:
    """Compute export eligibility for a document.

    Aggregates per-page ALTO / PAGE XML readiness into document-level
    levels, applies policy constraints (strict mode downgrades PARTIAL
    to NONE), and derives a lenient viewer readiness.

    Args:
        doc: The canonical document.
        policy: Document policy (uses default if None).

    Returns:
        ExportEligibility with per-format readiness levels.
    """
    effective = policy if policy is not None else DocumentPolicy()

    alto_export = _aggregate_levels(
        [compute_page_alto_readiness(page).level for page in doc.pages]
    )
    page_export = _aggregate_levels(
        [compute_page_pagexml_readiness(page).level for page in doc.pages]
    )

    # Strict mode refuses "good enough": PARTIAL is downgraded to NONE.
    if effective.strict_mode:
        if alto_export == ReadinessLevel.PARTIAL:
            alto_export = ReadinessLevel.NONE
        if page_export == ReadinessLevel.PARTIAL:
            page_export = ReadinessLevel.NONE

    # The viewer is more lenient — it can render degraded content.
    if alto_export != ReadinessLevel.NONE or page_export != ReadinessLevel.NONE:
        viewer_render = ReadinessLevel.FULL
    elif any(page.text_regions for page in doc.pages):
        viewer_render = ReadinessLevel.DEGRADED
    else:
        viewer_render = ReadinessLevel.NONE

    return ExportEligibility(
        alto_export=alto_export,
        page_export=page_export,
        viewer_render=viewer_render,
    )
66
+
67
+
68
def _aggregate_levels(levels: list[ReadinessLevel]) -> ReadinessLevel:
    """Aggregate per-page readiness into a single document-level readiness.

    Rules: empty input → NONE; uniformly FULL → FULL; uniformly NONE →
    NONE; any FULL or PARTIAL page in a mixed set → PARTIAL; otherwise
    DEGRADED.
    """
    if not levels:
        return ReadinessLevel.NONE

    distinct = set(levels)
    if distinct == {ReadinessLevel.FULL}:
        return ReadinessLevel.FULL
    if distinct == {ReadinessLevel.NONE}:
        return ReadinessLevel.NONE
    if distinct & {ReadinessLevel.FULL, ReadinessLevel.PARTIAL}:
        return ReadinessLevel.PARTIAL
    return ReadinessLevel.DEGRADED
src/app/validators/readiness_validator.py CHANGED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Readiness validator — computes how ready a document is for export.
2
+
3
+ Produces AltoReadiness / PageXmlReadiness per page and DocumentReadiness
4
+ at document level. Does NOT decide whether to allow export — that's the
5
+ export eligibility validator's job.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from src.app.domain.models import CanonicalDocument, Page
11
+ from src.app.domain.models.readiness import (
12
+ AltoReadiness,
13
+ DocumentReadiness,
14
+ PageXmlReadiness,
15
+ )
16
+ from src.app.domain.models.status import (
17
+ GeometryStatus,
18
+ MissingCapability,
19
+ ReadinessLevel,
20
+ )
21
+
22
+
23
def compute_page_alto_readiness(page: Page) -> AltoReadiness:
    """Compute ALTO readiness for a single page.

    ALTO full requires: page dimensions, block bbox, line bbox, word bbox,
    word text. Missing confidence or reading order only reduces the level
    to PARTIAL; missing dimensions, word text, word geometry, or line
    geometry is critical and yields NONE.
    """
    missing: list[MissingCapability] = []

    def note(cap: MissingCapability) -> None:
        # Record each capability at most once, preserving discovery order.
        if cap not in missing:
            missing.append(cap)

    if page.width <= 0 or page.height <= 0:
        note(MissingCapability.PAGE_DIMENSIONS)

    has_blocks = bool(page.text_regions)
    has_lines = False
    has_words = False
    has_word_geo = True
    has_word_text = True
    has_confidence = True

    # Single pass over the hierarchy: flag structural presence and
    # per-word gaps (geometry, text, confidence).
    for region in page.text_regions:
        if region.geometry.status == GeometryStatus.UNKNOWN:
            note(MissingCapability.BLOCK_GEOMETRY)
        for line in region.lines:
            has_lines = True
            if line.geometry.status == GeometryStatus.UNKNOWN:
                note(MissingCapability.LINE_GEOMETRY)
            for word in line.words:
                has_words = True
                if word.geometry.status == GeometryStatus.UNKNOWN:
                    has_word_geo = False
                if not word.text:
                    has_word_text = False
                if word.confidence is None:
                    has_confidence = False

    # Structural gaps: missing levels of the hierarchy entirely.
    if not (has_blocks and has_lines and has_words):
        if not has_words:
            note(MissingCapability.WORD_TEXT)
        if not has_lines:
            note(MissingCapability.LINE_GEOMETRY)

    if not has_word_geo:
        note(MissingCapability.WORD_GEOMETRY)
    if not has_word_text:
        note(MissingCapability.WORD_TEXT)
    if not has_confidence:
        note(MissingCapability.CONFIDENCE)
    if not page.reading_order:
        note(MissingCapability.READING_ORDER)

    level = _level_from_missing(
        missing,
        critical={
            MissingCapability.PAGE_DIMENSIONS,
            MissingCapability.WORD_TEXT,
            MissingCapability.WORD_GEOMETRY,
            MissingCapability.LINE_GEOMETRY,
        },
    )
    return AltoReadiness(level=level, missing=missing)
87
+
88
+
89
def compute_page_pagexml_readiness(page: Page) -> PageXmlReadiness:
    """Compute PAGE XML readiness for a single page.

    PAGE XML is more lenient than ALTO: regions + lines are often
    sufficient and word-level data is nice-to-have. Only missing page
    dimensions or block geometry is critical (yields NONE).
    """
    missing: list[MissingCapability] = []

    def note(cap: MissingCapability) -> None:
        # Record each capability at most once, preserving discovery order.
        if cap not in missing:
            missing.append(cap)

    if page.width <= 0 or page.height <= 0:
        note(MissingCapability.PAGE_DIMENSIONS)

    has_regions = bool(page.text_regions)
    has_lines = False

    # Single pass: flag unknown region/line geometry.
    for region in page.text_regions:
        if region.geometry.status == GeometryStatus.UNKNOWN:
            note(MissingCapability.BLOCK_GEOMETRY)
        for line in region.lines:
            has_lines = True
            if line.geometry.status == GeometryStatus.UNKNOWN:
                note(MissingCapability.LINE_GEOMETRY)

    # Structural gaps: no regions or no lines at all.
    if not has_regions:
        note(MissingCapability.BLOCK_GEOMETRY)
    if not has_lines:
        note(MissingCapability.LINE_GEOMETRY)
    if not page.reading_order:
        note(MissingCapability.READING_ORDER)

    level = _level_from_missing(
        missing,
        critical={
            MissingCapability.PAGE_DIMENSIONS,
            MissingCapability.BLOCK_GEOMETRY,
        },
    )
    return PageXmlReadiness(level=level, missing=missing)
129
+
130
+
131
def compute_document_readiness(doc: CanonicalDocument) -> DocumentReadiness:
    """Compute overall document readiness from per-page readiness.

    NOTE(review): the document level is derived from per-page ALTO
    readiness only (the stricter format); PAGE XML readiness does not
    feed into it. Any fully unready page in a mixed document degrades
    the whole document.
    """
    page_levels = [compute_page_alto_readiness(page).level for page in doc.pages]

    if not page_levels:
        return DocumentReadiness(level=ReadinessLevel.NONE)

    distinct = set(page_levels)
    if distinct == {ReadinessLevel.FULL}:
        overall = ReadinessLevel.FULL
    elif distinct == {ReadinessLevel.NONE}:
        overall = ReadinessLevel.NONE
    elif ReadinessLevel.NONE in distinct:
        overall = ReadinessLevel.DEGRADED
    else:
        overall = ReadinessLevel.PARTIAL

    return DocumentReadiness(level=overall, page_readiness=page_levels)
151
+
152
+
153
def _level_from_missing(
    missing: list[MissingCapability],
    critical: set[MissingCapability],
) -> ReadinessLevel:
    """Map missing capabilities to a readiness level.

    No gaps → FULL; any critical gap → NONE; only non-critical gaps →
    PARTIAL.
    """
    if not missing:
        return ReadinessLevel.FULL
    if critical.intersection(missing):
        return ReadinessLevel.NONE
    return ReadinessLevel.PARTIAL
src/app/validators/schema_validator.py CHANGED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Schema validator — validates a dict/JSON against the CanonicalDocument schema.
2
+
3
+ This wraps Pydantic validation as an explicit service, producing a
4
+ ValidationReport rather than raising exceptions.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any
10
+
11
+ from pydantic import ValidationError
12
+
13
+ from src.app.domain.errors import Severity, ValidationEntry, ValidationReport
14
+ from src.app.domain.models import CanonicalDocument
15
+
16
+ VALIDATOR_NAME = "schema"
17
+
18
+
19
def validate_schema(data: dict[str, Any]) -> tuple[CanonicalDocument | None, ValidationReport]:
    """Validate raw data against the CanonicalDocument schema.

    Returns:
        A tuple of (parsed document or None, validation report).
        On success the document is returned alongside an empty report.
        On failure None is returned and each Pydantic error becomes an
        ERROR entry in the report, with its location joined into a
        dotted path ("root" when the error has no location).
    """
    try:
        document = CanonicalDocument.model_validate(data)
    except ValidationError as exc:
        report = ValidationReport()
        for err in exc.errors():
            parts = [str(part) for part in err["loc"]]
            report.add(ValidationEntry(
                validator=VALIDATOR_NAME,
                severity=Severity.ERROR,
                path=".".join(parts) if parts else "root",
                message=err["msg"],
                code=err["type"],
            ))
        return None, report
    return document, ValidationReport()
src/app/validators/structural_validator.py CHANGED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Structural validator — checks internal consistency of a CanonicalDocument.
2
+
3
+ Checks:
4
+ - ID uniqueness across the entire document
5
+ - reading_order references existing region IDs
6
+ - bbox containment: word ⊂ line ⊂ region ⊂ page (with tolerance)
7
+ - spatial ordering: words in a line, lines in a region
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from src.app.domain.errors import Severity, ValidationEntry, ValidationReport
13
+ from src.app.domain.models import CanonicalDocument
14
+ from src.app.geometry.bbox import contains
15
+
16
+ VALIDATOR_NAME = "structural"
17
+
18
+
19
def validate_structure(
    doc: CanonicalDocument,
    *,
    bbox_tolerance: float = 5.0,
) -> ValidationReport:
    """Run all structural checks on a CanonicalDocument.

    Checks ID uniqueness, reading-order reference validity, and bbox
    containment (allowing *bbox_tolerance* pixels of overflow).
    """
    report = ValidationReport()
    for check in (_check_id_uniqueness, _check_reading_order):
        check(doc, report)
    _check_bbox_containment(doc, report, bbox_tolerance)
    return report
30
+
31
+
32
def _check_id_uniqueness(doc: CanonicalDocument, report: ValidationReport) -> None:
    """Every ID in the document must be unique."""
    seen: dict[str, str] = {}  # id → path where it was first seen

    def walk():
        """Yield (node_id, path) for every addressable node, in document order."""
        for pi, page in enumerate(doc.pages):
            yield page.id, f"pages[{pi}]"
            for ri, region in enumerate(page.text_regions):
                rpath = f"pages[{pi}].text_regions[{ri}]"
                yield region.id, rpath
                for li, line in enumerate(region.lines):
                    lpath = f"{rpath}.lines[{li}]"
                    yield line.id, lpath
                    for wi, word in enumerate(line.words):
                        yield word.id, f"{lpath}.words[{wi}]"
            for ni, ntr in enumerate(page.non_text_regions):
                yield ntr.id, f"pages[{pi}].non_text_regions[{ni}]"

    for node_id, path in walk():
        _register_id(node_id, path, seen, report)
49
+
50
+
51
def _register_id(
    node_id: str, path: str, seen: dict[str, str], report: ValidationReport
) -> None:
    """Record *node_id* at *path*; report an ERROR if it was already registered."""
    first_path = seen.get(node_id)
    if first_path is None:
        seen[node_id] = path
        return
    report.add(ValidationEntry(
        validator=VALIDATOR_NAME,
        severity=Severity.ERROR,
        path=path,
        message=f"Duplicate ID '{node_id}', first seen at {first_path}",
        code="duplicate_id",
    ))
64
+
65
+
66
def _check_reading_order(doc: CanonicalDocument, report: ValidationReport) -> None:
    """reading_order entries must reference existing region IDs."""
    for pi, page in enumerate(doc.pages):
        known_ids = {region.id for region in page.text_regions}
        for idx, ref_id in enumerate(page.reading_order):
            if ref_id in known_ids:
                continue
            report.add(ValidationEntry(
                validator=VALIDATOR_NAME,
                severity=Severity.ERROR,
                path=f"pages[{pi}].reading_order[{idx}]",
                message=f"reading_order references unknown region ID '{ref_id}'",
                code="invalid_reading_order_ref",
            ))
79
+
80
+
81
def _check_bbox_containment(
    doc: CanonicalDocument, report: ValidationReport, tolerance: float
) -> None:
    """Check that child bboxes nest inside parents (word ⊂ line ⊂ region ⊂ page).

    Violations are WARNINGs, not errors — providers frequently emit
    slightly overflowing geometry, hence the pixel *tolerance*.
    """

    def warn(path: str, message: str, code: str) -> None:
        report.add(ValidationEntry(
            validator=VALIDATOR_NAME,
            severity=Severity.WARNING,
            path=path,
            message=message,
            code=code,
        ))

    for pi, page in enumerate(doc.pages):
        # The page itself spans (0, 0) .. (width, height).
        page_bbox = (0.0, 0.0, page.width, page.height)
        for ri, region in enumerate(page.text_regions):
            rpath = f"pages[{pi}].text_regions[{ri}]"
            if not contains(page_bbox, region.geometry.bbox, tolerance):
                warn(
                    rpath,
                    f"Region bbox {region.geometry.bbox} exceeds page bounds ({page.width}x{page.height}) beyond tolerance {tolerance}px",
                    "region_exceeds_page",
                )
            for li, line in enumerate(region.lines):
                lpath = f"{rpath}.lines[{li}]"
                if not contains(region.geometry.bbox, line.geometry.bbox, tolerance):
                    warn(
                        lpath,
                        f"Line bbox exceeds region bbox beyond tolerance {tolerance}px",
                        "line_exceeds_region",
                    )
                for wi, word in enumerate(line.words):
                    wpath = f"{lpath}.words[{wi}]"
                    if not contains(line.geometry.bbox, word.geometry.bbox, tolerance):
                        warn(
                            wpath,
                            f"Word bbox exceeds line bbox beyond tolerance {tolerance}px",
                            "word_exceeds_line",
                        )
tests/unit/test_document_policy.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for document policy and validation report."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from src.app.domain.errors import Severity, ValidationEntry, ValidationReport
6
+ from src.app.policies.document_policy import (
7
+ DocumentPolicy,
8
+ PolicyMode,
9
+ permissive_policy,
10
+ strict_policy,
11
+ )
12
+
13
+
14
class TestDocumentPolicy:
    """Behavioural contract of the policy presets."""

    def test_default_is_standard(self) -> None:
        policy = DocumentPolicy()
        assert policy.mode == PolicyMode.STANDARD
        assert not policy.strict_mode

    def test_strict(self) -> None:
        policy = strict_policy()
        assert policy.strict_mode
        assert not policy.allow_bbox_inference
        assert not policy.allow_partial_alto

    def test_permissive(self) -> None:
        policy = permissive_policy()
        assert policy.mode == PolicyMode.PERMISSIVE
        assert policy.allow_bbox_inference
        assert policy.bbox_containment_tolerance == 10.0

    def test_never_allows_text_invention(self) -> None:
        # No preset may ever enable text invention.
        for factory in (DocumentPolicy, strict_policy, permissive_policy):
            assert factory().allow_text_invention is False

    def test_never_allows_bbox_invention(self) -> None:
        # No preset may ever enable bbox invention.
        for factory in (DocumentPolicy, strict_policy, permissive_policy):
            assert factory().allow_bbox_invention is False

    def test_frozen(self) -> None:
        import pytest
        from pydantic import ValidationError

        policy = DocumentPolicy()
        with pytest.raises(ValidationError):
            policy.mode = PolicyMode.STRICT  # type: ignore[misc]
49
+
50
+
51
class TestValidationReport:
    """Aggregation semantics of ValidationReport."""

    def test_empty_is_valid(self) -> None:
        report = ValidationReport()
        assert report.is_valid
        assert report.error_count == 0
        assert report.warning_count == 0

    def test_with_error(self) -> None:
        report = ValidationReport()
        report.add(ValidationEntry(
            validator="test", severity=Severity.ERROR,
            path="pages[0]", message="bad",
        ))
        assert not report.is_valid
        assert report.error_count == 1

    def test_warnings_dont_invalidate(self) -> None:
        report = ValidationReport()
        report.add(ValidationEntry(
            validator="test", severity=Severity.WARNING,
            path="pages[0]", message="meh",
        ))
        assert report.is_valid
        assert report.warning_count == 1

    def test_merge(self) -> None:
        first = ValidationReport()
        first.add(ValidationEntry(
            validator="a", severity=Severity.ERROR,
            path="x", message="e1",
        ))
        second = ValidationReport()
        second.add(ValidationEntry(
            validator="b", severity=Severity.WARNING,
            path="y", message="w1",
        ))
        first.merge(second)
        assert first.error_count == 1
        assert first.warning_count == 1
        assert len(first.entries) == 2

    def test_errors_property(self) -> None:
        report = ValidationReport()
        report.add(ValidationEntry(validator="a", severity=Severity.ERROR, path="x", message="e"))
        report.add(ValidationEntry(validator="b", severity=Severity.WARNING, path="y", message="w"))
        report.add(ValidationEntry(validator="c", severity=Severity.INFO, path="z", message="i"))
        assert len(report.errors) == 1
        assert len(report.warnings) == 1
tests/unit/test_export_eligibility.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for export eligibility and export policy."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from src.app.domain.models import (
6
+ AltoReadiness,
7
+ CanonicalDocument,
8
+ EvidenceType,
9
+ Geometry,
10
+ GeometryStatus,
11
+ Page,
12
+ PageXmlReadiness,
13
+ Provenance,
14
+ ReadinessLevel,
15
+ Source,
16
+ TextLine,
17
+ TextRegion,
18
+ Word,
19
+ )
20
+ from src.app.domain.models.status import InputType
21
+ from src.app.policies.document_policy import DocumentPolicy, strict_policy
22
+ from src.app.policies.export_policy import check_alto_export, check_page_export
23
+ from src.app.validators.export_eligibility_validator import compute_export_eligibility
24
+
25
+
26
def _prov() -> Provenance:
    """Minimal provenance stamp shared by all fixtures in this module."""
    return Provenance(
        provider="test", adapter="v1", source_ref="$",
        evidence_type=EvidenceType.PROVIDER_NATIVE,
    )
31
+
32
+
33
def _geo() -> Geometry:
    """Small bbox with EXACT status (geometry straight from the provider)."""
    return Geometry(bbox=(10, 10, 100, 30), status=GeometryStatus.EXACT)
35
+
36
+
37
def _complete_doc() -> CanonicalDocument:
    """One-page document with full word-level text, geometry and confidence."""
    return CanonicalDocument(
        document_id="test",
        source=Source(input_type=InputType.IMAGE),
        pages=[Page(
            id="p1", page_index=0, width=2480, height=3508,
            alto_readiness=AltoReadiness(level=ReadinessLevel.FULL),
            page_readiness=PageXmlReadiness(level=ReadinessLevel.FULL),
            reading_order=["tb1"],
            text_regions=[
                TextRegion(id="tb1", geometry=_geo(), provenance=_prov(),
                           lines=[TextLine(id="tl1", geometry=_geo(), provenance=_prov(),
                                           words=[Word(id="w1", text="Hello", geometry=_geo(),
                                                       provenance=_prov(), confidence=0.95)])])],
        )],
    )
53
+
54
+
55
def _empty_doc() -> CanonicalDocument:
    """One-page document with no text regions at all — nothing exportable."""
    return CanonicalDocument(
        document_id="test",
        source=Source(input_type=InputType.IMAGE),
        pages=[Page(id="p1", page_index=0, width=2480, height=3508)],
    )
61
+
62
+
63
class TestExportEligibility:
    """compute_export_eligibility across complete, empty, and partial documents."""

    def test_complete_doc_full_eligible(self) -> None:
        doc = _complete_doc()
        elig = compute_export_eligibility(doc)
        assert elig.alto_export == ReadinessLevel.FULL
        assert elig.page_export == ReadinessLevel.FULL
        assert elig.viewer_render == ReadinessLevel.FULL

    def test_empty_doc_none(self) -> None:
        doc = _empty_doc()
        elig = compute_export_eligibility(doc)
        assert elig.alto_export == ReadinessLevel.NONE
        assert elig.page_export == ReadinessLevel.NONE
        assert elig.viewer_render == ReadinessLevel.NONE

    def test_strict_policy_downgrades_partial(self) -> None:
        # A doc with missing confidence → partial
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=[Page(
                id="p1", page_index=0, width=2480, height=3508,
                reading_order=["tb1"],
                text_regions=[
                    TextRegion(id="tb1", geometry=_geo(), provenance=_prov(),
                               lines=[TextLine(id="tl1", geometry=_geo(), provenance=_prov(),
                                               words=[Word(id="w1", text="Hello", geometry=_geo(),
                                                           provenance=_prov(), confidence=None)])])],
            )],
        )
        policy = strict_policy()
        elig = compute_export_eligibility(doc, policy)
        # Strict mode downgrades partial to none
        assert elig.alto_export == ReadinessLevel.NONE

    def test_viewer_degraded_for_regions_without_exports(self) -> None:
        # Doc with unknown word geometry — ALTO none, but viewer shows something
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=[Page(
                id="p1", page_index=0, width=2480, height=3508,
                text_regions=[
                    TextRegion(id="tb1", geometry=_geo(), provenance=_prov(),
                               lines=[TextLine(id="tl1", geometry=_geo(), provenance=_prov(),
                                               words=[Word(id="w1", text="Hello",
                                                           geometry=Geometry(bbox=(10, 10, 100, 30),
                                                                             status=GeometryStatus.UNKNOWN),
                                                           provenance=_prov())])])],
            )],
        )
        elig = compute_export_eligibility(doc)
        # ALTO is none (missing word geo), but viewer can show regions
        assert elig.viewer_render in (ReadinessLevel.FULL, ReadinessLevel.DEGRADED)
117
+
118
+
119
class TestExportPolicy:
    """Go/no-go decisions of check_alto_export / check_page_export."""

    def test_alto_allowed_full(self) -> None:
        doc = _complete_doc()
        elig = compute_export_eligibility(doc)
        decision = check_alto_export(elig)
        assert decision.allowed is True
        assert decision.reason == "OK"

    def test_alto_refused_none(self) -> None:
        doc = _empty_doc()
        elig = compute_export_eligibility(doc)
        decision = check_alto_export(elig)
        assert decision.allowed is False
        assert "not possible" in decision.reason

    def test_alto_partial_default_allowed(self) -> None:
        # Missing confidence → PARTIAL; default policy allows partial ALTO.
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=[Page(
                id="p1", page_index=0, width=2480, height=3508,
                reading_order=["tb1"],
                text_regions=[
                    TextRegion(id="tb1", geometry=_geo(), provenance=_prov(),
                               lines=[TextLine(id="tl1", geometry=_geo(), provenance=_prov(),
                                               words=[Word(id="w1", text="Hello", geometry=_geo(),
                                                           provenance=_prov(), confidence=None)])])],
            )],
        )
        elig = compute_export_eligibility(doc)
        decision = check_alto_export(elig)
        assert decision.allowed is True

    def test_alto_partial_strict_refused(self) -> None:
        # Same PARTIAL document, but the strict policy refuses partial exports.
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=[Page(
                id="p1", page_index=0, width=2480, height=3508,
                reading_order=["tb1"],
                text_regions=[
                    TextRegion(id="tb1", geometry=_geo(), provenance=_prov(),
                               lines=[TextLine(id="tl1", geometry=_geo(), provenance=_prov(),
                                               words=[Word(id="w1", text="Hello", geometry=_geo(),
                                                           provenance=_prov(), confidence=None)])])],
            )],
        )
        policy = strict_policy()
        elig = compute_export_eligibility(doc, policy)
        decision = check_alto_export(elig, policy)
        assert decision.allowed is False

    def test_page_allowed_full(self) -> None:
        doc = _complete_doc()
        elig = compute_export_eligibility(doc)
        decision = check_page_export(elig)
        assert decision.allowed is True

    def test_page_refused_none(self) -> None:
        doc = _empty_doc()
        elig = compute_export_eligibility(doc)
        decision = check_page_export(elig)
        assert decision.allowed is False
tests/unit/test_readiness_validator.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the readiness validator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from src.app.domain.models import (
6
+ AltoReadiness,
7
+ CanonicalDocument,
8
+ EvidenceType,
9
+ Geometry,
10
+ GeometryStatus,
11
+ Page,
12
+ PageXmlReadiness,
13
+ Provenance,
14
+ ReadinessLevel,
15
+ Source,
16
+ TextLine,
17
+ TextRegion,
18
+ Word,
19
+ )
20
+ from src.app.domain.models.status import InputType, MissingCapability
21
+ from src.app.validators.readiness_validator import (
22
+ compute_document_readiness,
23
+ compute_page_alto_readiness,
24
+ compute_page_pagexml_readiness,
25
+ )
26
+
27
+
28
def _prov() -> Provenance:
    """Minimal provider-native provenance for fixtures."""
    return Provenance(
        provider="test",
        adapter="v1",
        source_ref="$",
        evidence_type=EvidenceType.PROVIDER_NATIVE,
    )
33
+
34
+
35
def _geo(status: GeometryStatus = GeometryStatus.EXACT) -> Geometry:
    """A small fixed bbox carrying the given geometry *status*."""
    return Geometry(status=status, bbox=(10, 10, 100, 30))
37
+
38
+
39
def _complete_page() -> Page:
    """A fully complete page with all data (text, geometry, confidence, order)."""
    word = Word(
        id="w1", text="Hello", geometry=_geo(),
        provenance=_prov(), confidence=0.95,
    )
    line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
    region = TextRegion(
        id="tb1", geometry=_geo(), provenance=_prov(), lang="fra", lines=[line],
    )
    return Page(
        id="p1", page_index=0, width=2480, height=3508,
        alto_readiness=AltoReadiness(level=ReadinessLevel.FULL),
        page_readiness=PageXmlReadiness(level=ReadinessLevel.FULL),
        reading_order=["tb1"],
        text_regions=[region],
    )
61
+
62
+
63
class TestAltoReadiness:
    """Per-page ALTO readiness levels and their missing-capability reasons."""

    def test_complete_page_is_full(self) -> None:
        readiness = compute_page_alto_readiness(_complete_page())
        assert readiness.level == ReadinessLevel.FULL
        assert readiness.missing == []

    def test_missing_word_geometry_is_none(self) -> None:
        # A word whose geometry status is UNKNOWN blocks ALTO entirely.
        word = Word(
            id="w1", text="Hello",
            geometry=_geo(GeometryStatus.UNKNOWN), provenance=_prov(),
        )
        line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
        region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(), lines=[line])
        page = Page(
            id="p1", page_index=0, width=2480, height=3508,
            text_regions=[region],
        )
        readiness = compute_page_alto_readiness(page)
        assert readiness.level == ReadinessLevel.NONE
        assert MissingCapability.WORD_GEOMETRY in readiness.missing

    def test_missing_confidence_is_partial(self) -> None:
        # Text and geometry are present but no confidence: degraded, not blocked.
        word = Word(
            id="w1", text="Hello", geometry=_geo(),
            provenance=_prov(), confidence=None,
        )
        line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
        region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(), lines=[line])
        page = Page(
            id="p1", page_index=0, width=2480, height=3508,
            reading_order=["tb1"], text_regions=[region],
        )
        readiness = compute_page_alto_readiness(page)
        assert readiness.level == ReadinessLevel.PARTIAL
        assert MissingCapability.CONFIDENCE in readiness.missing

    def test_no_reading_order_is_partial(self) -> None:
        # Full word data but an empty reading order only degrades the page.
        word = Word(
            id="w1", text="Hello", geometry=_geo(),
            provenance=_prov(), confidence=0.9,
        )
        line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
        region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(), lines=[line])
        page = Page(
            id="p1", page_index=0, width=2480, height=3508,
            reading_order=[], text_regions=[region],
        )
        readiness = compute_page_alto_readiness(page)
        assert readiness.level == ReadinessLevel.PARTIAL
        assert MissingCapability.READING_ORDER in readiness.missing

    def test_empty_page_is_none(self) -> None:
        bare = Page(id="p1", page_index=0, width=2480, height=3508)
        assert compute_page_alto_readiness(bare).level == ReadinessLevel.NONE
137
+
138
+
139
class TestPageXmlReadiness:
    """PAGE XML readiness: regions and lines suffice; word geometry is optional."""

    def test_complete_page_is_full(self) -> None:
        result = compute_page_pagexml_readiness(_complete_page())
        assert result.level == ReadinessLevel.FULL

    def test_no_regions_is_none(self) -> None:
        bare = Page(id="p1", page_index=0, width=2480, height=3508)
        assert compute_page_pagexml_readiness(bare).level == ReadinessLevel.NONE

    def test_regions_without_word_geo_still_ok(self) -> None:
        """PAGE XML is more lenient — word geometry is not critical."""
        word = Word(
            id="w1", text="Hello",
            geometry=_geo(GeometryStatus.UNKNOWN), provenance=_prov(),
        )
        line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
        region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(), lines=[line])
        page = Page(
            id="p1", page_index=0, width=2480, height=3508,
            reading_order=["tb1"], text_regions=[region],
        )
        result = compute_page_pagexml_readiness(page)
        # PAGE doesn't require word geometry — should still be achievable.
        assert result.level in (ReadinessLevel.FULL, ReadinessLevel.PARTIAL)
172
+
173
+
174
class TestDocumentReadiness:
    """Document-level readiness aggregates the per-page results."""

    def test_single_full_page(self) -> None:
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=[_complete_page()],
        )
        assert compute_document_readiness(doc).level == ReadinessLevel.FULL

    def test_mixed_pages(self) -> None:
        # One complete page plus one empty page degrades the whole document.
        pages = [
            _complete_page(),
            Page(id="p2", page_index=1, width=2480, height=3508),
        ]
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=pages,
        )
        result = compute_document_readiness(doc)
        assert result.level == ReadinessLevel.DEGRADED
        assert len(result.page_readiness) == 2
tests/unit/test_schema_validator.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the schema validator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from src.app.validators.schema_validator import validate_schema
6
+
7
+
8
class TestSchemaValidator:
    """validate_schema returns (document | None, ValidationReport) without raising.

    Fix: test_error_paths_populated previously iterated report.errors without
    asserting the list was non-empty, so its per-entry assertions could pass
    vacuously on an empty report. It now asserts error_count > 0 first.
    """

    @staticmethod
    def _prov_payload() -> dict:
        """Provider-native provenance payload shared by the nested fixtures."""
        return {
            "provider": "test", "adapter": "v1",
            "source_ref": "$", "evidence_type": "provider_native",
            "derived_from": [],
        }

    def test_valid_document(self) -> None:
        word = {
            "id": "w1",
            "text": "Hello",
            "geometry": {"bbox": [100, 200, 50, 30], "status": "exact"},
            "provenance": self._prov_payload(),
        }
        line = {
            "id": "tl1",
            "geometry": {"bbox": [100, 200, 300, 50], "status": "exact"},
            "provenance": self._prov_payload(),
            "words": [word],
        }
        region = {
            "id": "tb1",
            "geometry": {"bbox": [100, 200, 300, 50], "status": "exact"},
            "provenance": self._prov_payload(),
            "lines": [line],
        }
        data = {
            "schema_version": "1.0.0",
            "document_id": "doc1",
            "source": {"input_type": "image"},
            "pages": [{
                "id": "p1",
                "page_index": 0,
                "width": 2480,
                "height": 3508,
                "text_regions": [region],
            }],
        }
        doc, report = validate_schema(data)
        assert doc is not None
        assert report.is_valid

    def test_missing_required_field(self) -> None:
        # document_id is absent entirely.
        data = {
            "source": {"input_type": "image"},
            "pages": [],
        }
        doc, report = validate_schema(data)
        assert doc is None
        assert not report.is_valid
        assert report.error_count > 0

    def test_invalid_schema_version(self) -> None:
        data = {
            "schema_version": "bad",
            "document_id": "doc1",
            "source": {"input_type": "image"},
            "pages": [{"id": "p1", "page_index": 0, "width": 100, "height": 100}],
        }
        doc, report = validate_schema(data)
        assert doc is None
        assert not report.is_valid

    def test_empty_pages(self) -> None:
        data = {
            "document_id": "doc1",
            "source": {"input_type": "image"},
            "pages": [],
        }
        doc, report = validate_schema(data)
        assert doc is None
        assert report.error_count > 0

    def test_error_paths_populated(self) -> None:
        # Empty document_id and empty pages both violate the schema.
        data = {
            "document_id": "",
            "source": {"input_type": "image"},
            "pages": [],
        }
        doc, report = validate_schema(data)
        assert doc is None
        # Guard against a vacuous loop: there must be at least one error entry.
        assert report.error_count > 0
        for entry in report.errors:
            assert entry.path
            assert entry.message
            assert entry.validator == "schema"
tests/unit/test_structural_validator.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the structural validator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from src.app.domain.models import (
6
+ AltoReadiness,
7
+ CanonicalDocument,
8
+ EvidenceType,
9
+ Geometry,
10
+ GeometryStatus,
11
+ NonTextRegion,
12
+ Page,
13
+ PageXmlReadiness,
14
+ Provenance,
15
+ ReadinessLevel,
16
+ Source,
17
+ TextLine,
18
+ TextRegion,
19
+ Word,
20
+ )
21
+ from src.app.domain.models.status import BlockRole, InputType, NonTextKind
22
+ from src.app.validators.structural_validator import validate_structure
23
+
24
+
25
def _prov() -> Provenance:
    """Minimal provider-native provenance for fixtures."""
    return Provenance(
        provider="test",
        adapter="v1",
        source_ref="$",
        evidence_type=EvidenceType.PROVIDER_NATIVE,
    )
30
+
31
+
32
def _geo(x: float, y: float, w: float, h: float) -> Geometry:
    """An EXACT geometry with bbox ``(x, y, w, h)``."""
    return Geometry(status=GeometryStatus.EXACT, bbox=(x, y, w, h))
34
+
35
+
36
def _word(wid: str, x: float, y: float, w: float, h: float) -> Word:
    """A word with id *wid*, text 'word' and the given bbox."""
    return Word(id=wid, text="word", provenance=_prov(), geometry=_geo(x, y, w, h))
38
+
39
+
40
def _line(lid: str, x: float, y: float, w: float, h: float, words: list[Word]) -> TextLine:
    """A text line with id *lid*, the given bbox and the given words."""
    return TextLine(id=lid, provenance=_prov(), geometry=_geo(x, y, w, h), words=words)
42
+
43
+
44
def _region(rid: str, x: float, y: float, w: float, h: float, lines: list[TextLine]) -> TextRegion:
    """A text region with id *rid*, the given bbox and the given lines."""
    return TextRegion(id=rid, provenance=_prov(), geometry=_geo(x, y, w, h), lines=lines)
46
+
47
+
48
def _doc(regions: list[TextRegion], width: float = 1000, height: float = 1000,
         reading_order: list[str] | None = None,
         non_text: list[NonTextRegion] | None = None) -> CanonicalDocument:
    """A one-page document wrapping *regions*.

    reading_order defaults to the region IDs in order; non_text defaults to [].
    """
    order = [region.id for region in regions] if reading_order is None else reading_order
    page = Page(
        id="p1", page_index=0, width=width, height=height,
        alto_readiness=AltoReadiness(level=ReadinessLevel.FULL),
        page_readiness=PageXmlReadiness(level=ReadinessLevel.FULL),
        reading_order=order,
        text_regions=regions,
        non_text_regions=non_text if non_text else [],
    )
    return CanonicalDocument(
        document_id="test",
        source=Source(input_type=InputType.IMAGE),
        pages=[page],
    )
64
+
65
+
66
class TestIdUniqueness:
    """Element IDs must be unique across the whole document."""

    def test_all_unique_passes(self) -> None:
        line = _line("tl1", 0, 0, 500, 40, [_word("w1", 0, 0, 50, 30)])
        doc = _doc([_region("tb1", 0, 0, 500, 200, [line])])
        assert validate_structure(doc).is_valid

    def test_duplicate_word_ids(self) -> None:
        # Two words sharing the same ID inside a single line.
        clashing = [_word("w1", 0, 0, 50, 30), _word("w1", 60, 0, 50, 30)]
        line = _line("tl1", 0, 0, 500, 40, clashing)
        doc = _doc([_region("tb1", 0, 0, 500, 200, [line])])
        report = validate_structure(doc)
        assert not report.is_valid
        assert any("Duplicate ID 'w1'" in entry.message for entry in report.errors)

    def test_duplicate_across_levels(self) -> None:
        # A line reusing its parent region's ID is still a collision.
        line = _line("same_id", 0, 0, 500, 40, [_word("w1", 0, 0, 50, 30)])
        doc = _doc([_region("same_id", 0, 0, 500, 200, [line])])
        assert not validate_structure(doc).is_valid

    def test_duplicate_with_non_text_region(self) -> None:
        # A non-text region colliding with a text region ID.
        figure = NonTextRegion(
            id="tb1", kind=NonTextKind.ILLUSTRATION,
            geometry=_geo(600, 0, 100, 100), provenance=_prov(),
        )
        line = _line("tl1", 0, 0, 500, 40, [_word("w1", 0, 0, 50, 30)])
        doc = _doc([_region("tb1", 0, 0, 500, 200, [line])], non_text=[figure])
        assert not validate_structure(doc).is_valid
112
+
113
+
114
class TestReadingOrder:
    """reading_order entries must reference existing region IDs."""

    def test_valid_references(self) -> None:
        line = _line("tl1", 0, 0, 500, 40, [_word("w1", 0, 0, 50, 30)])
        doc = _doc([_region("tb1", 0, 0, 500, 200, [line])], reading_order=["tb1"])
        assert validate_structure(doc).is_valid

    def test_invalid_reference(self) -> None:
        line = _line("tl1", 0, 0, 500, 40, [_word("w1", 0, 0, 50, 30)])
        doc = _doc(
            [_region("tb1", 0, 0, 500, 200, [line])],
            reading_order=["tb1", "tb_nonexistent"],
        )
        report = validate_structure(doc)
        assert not report.is_valid
        assert any("unknown region ID" in entry.message for entry in report.errors)
133
+
134
+
135
class TestBboxContainment:
    """Containment checks (word in line in region in page) produce warnings."""

    @staticmethod
    def _nested(
        region_box: tuple[float, float, float, float],
        line_box: tuple[float, float, float, float],
        word_box: tuple[float, float, float, float],
        **doc_kwargs: object,
    ) -> CanonicalDocument:
        """One-region/one-line/one-word document with the given bboxes."""
        word = _word("w1", *word_box)
        line = _line("tl1", *line_box, [word])
        region = _region("tb1", *region_box, [line])
        return _doc([region], **doc_kwargs)

    def test_all_contained_passes(self) -> None:
        doc = self._nested((10, 10, 200, 100), (20, 20, 150, 30), (25, 22, 50, 25))
        assert validate_structure(doc).warning_count == 0

    def test_word_exceeds_line(self) -> None:
        # Word is wider than its containing line.
        doc = self._nested((10, 10, 400, 100), (20, 20, 100, 30), (20, 20, 200, 30))
        report = validate_structure(doc, bbox_tolerance=0)
        assert report.warning_count > 0
        assert any("word_exceeds_line" in (w.code or "") for w in report.warnings)

    def test_tolerance_allows_small_overflow(self) -> None:
        # 3px overflow against a 5px tolerance: accepted.
        doc = self._nested((10, 10, 200, 100), (20, 20, 100, 30), (20, 20, 103, 30))
        assert validate_structure(doc, bbox_tolerance=5).warning_count == 0

    def test_tolerance_rejects_large_overflow(self) -> None:
        # 20px overflow against a 5px tolerance: flagged.
        doc = self._nested((10, 10, 200, 100), (20, 20, 100, 30), (20, 20, 120, 30))
        assert validate_structure(doc, bbox_tolerance=5).warning_count > 0

    def test_region_exceeds_page(self) -> None:
        # Region pokes past the 1000x1000 page boundary.
        doc = self._nested(
            (900, 900, 200, 200), (900, 900, 100, 30), (900, 900, 50, 25),
            width=1000, height=1000,
        )
        report = validate_structure(doc, bbox_tolerance=0)
        assert any("region_exceeds_page" in (w.code or "") for w in report.warnings)

    def test_line_exceeds_region(self) -> None:
        # Line is wider than its containing region.
        doc = self._nested((10, 10, 100, 50), (10, 10, 200, 30), (10, 10, 50, 25))
        report = validate_structure(doc, bbox_tolerance=0)
        assert any("line_exceeds_region" in (w.code or "") for w in report.warnings)