Claude committed on
Commit
e2ec8a2
·
unverified ·
1 Parent(s): 1cbec06

Sprint 4: validation layer and document policy

Browse files

Four validators and two policy modules that control the pipeline:

Validators:
- structural_validator: ID uniqueness, reading_order ref validity, bbox
containment (word ⊂ line ⊂ region ⊂ page) with configurable tolerance
- readiness_validator: computes AltoReadiness / PageXmlReadiness per page
and DocumentReadiness at document level. ALTO requires word text+geometry;
PAGE XML is more lenient (regions+lines sufficient)
- export_eligibility_validator: produces ExportEligibility (alto/page/viewer
each independently full/partial/none). Strict policy downgrades partial→none
- schema_validator: wraps Pydantic validation as explicit service, returns
(document | None, ValidationReport) instead of raising exceptions

Policies:
- document_policy: centralised business rules (3 modes: strict/standard/
permissive). Controls what the system may infer, repair, or export.
Text invention and bbox invention always forbidden
- export_policy: go/no-go decisions per format, consuming eligibility + policy

Infrastructure:
- ValidationReport with entries (validator, severity, path, message, code)
- Severity enum (error/warning/info)
- is_valid, error_count, warning_count, merge()

48 new tests, 331 total passing.

https://claude.ai/code/session_01Cuzvc9Pjfo5u46eT3ta2Cg

src/app/domain/errors/__init__.py CHANGED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Domain errors for validation and export."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import Enum
6
+
7
+ from pydantic import BaseModel, ConfigDict, Field
8
+
9
+
10
class Severity(str, Enum):
    """How serious a validation finding is.

    Mixes in ``str`` so severities serialize naturally (e.g. in JSON
    reports) and compare equal to their plain-string values.
    """

    ERROR = "error"
    WARNING = "warning"
    INFO = "info"
16
+
17
+
18
class ValidationEntry(BaseModel):
    """A single validation finding.

    Immutable value object emitted by one validator and collected into a
    ValidationReport. ``validator`` names the emitting validator,
    ``path`` locates the finding inside the document tree, and ``code``
    is an optional machine-readable identifier (e.g. "duplicate_id").
    """

    # Frozen: entries are value objects and must never change after creation.
    model_config = ConfigDict(frozen=True)

    validator: str = Field(min_length=1)
    severity: Severity
    path: str = Field(min_length=1, description="Path in the document, e.g. pages[0].text_regions[1].lines[3]")
    message: str = Field(min_length=1)
    code: str | None = None
28
+
29
+
30
class ValidationReport(BaseModel):
    """Aggregated results from all validators.

    Mutable accumulator of ValidationEntry records. A report is valid
    as long as it contains no ERROR-severity entries; warnings and info
    entries never invalidate it.
    """

    entries: list[ValidationEntry] = Field(default_factory=list)

    def _with_severity(self, wanted: Severity) -> list[ValidationEntry]:
        """Return all entries whose severity matches *wanted*."""
        return [entry for entry in self.entries if entry.severity == wanted]

    @property
    def errors(self) -> list[ValidationEntry]:
        return self._with_severity(Severity.ERROR)

    @property
    def warnings(self) -> list[ValidationEntry]:
        return self._with_severity(Severity.WARNING)

    @property
    def is_valid(self) -> bool:
        # Only errors invalidate a report.
        return not self.errors

    @property
    def error_count(self) -> int:
        return len(self.errors)

    @property
    def warning_count(self) -> int:
        return len(self.warnings)

    def add(self, entry: ValidationEntry) -> None:
        """Append a single finding to the report."""
        self.entries.append(entry)

    def merge(self, other: ValidationReport) -> None:
        """Absorb all findings from *other* into this report."""
        self.entries.extend(other.entries)
src/app/policies/document_policy.py CHANGED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Document policy — centralised business rules for the pipeline.
2
+
3
+ This layer prevents critical decisions from being scattered across
4
+ adapters, validators, and serializers. A policy is a named configuration
5
+ that controls what the system may or may not do.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from enum import Enum
11
+
12
+ from pydantic import BaseModel, ConfigDict
13
+
14
+
15
class PolicyMode(str, Enum):
    """Named policy presets, ordered from most to least restrictive."""

    STRICT = "strict"
    STANDARD = "standard"
    PERMISSIVE = "permissive"
21
+
22
+
23
class DocumentPolicy(BaseModel):
    """Concrete policy controlling pipeline behaviour.

    Immutable (frozen) configuration consumed by validators, enrichers,
    and export checks. The field defaults correspond to STANDARD mode;
    strict_policy() / permissive_policy() build the other presets.
    Note that allow_text_invention and allow_bbox_invention default to
    False and are never enabled by any preset in this module.
    """

    model_config = ConfigDict(frozen=True)

    mode: PolicyMode = PolicyMode.STANDARD

    # -- Text rules -----------------------------------------------------------
    allow_text_invention: bool = False
    """Never invent text that wasn't in the provider output."""

    # -- Geometry rules -------------------------------------------------------
    allow_polygon_to_bbox: bool = True
    """Allow deriving bbox from polygon (enricher)."""

    allow_bbox_inference: bool = True
    """Allow inferring bbox from context (e.g. line bbox from word bboxes)."""

    allow_bbox_invention: bool = False
    """Never invent bbox without any geometric basis."""

    # -- Language rules -------------------------------------------------------
    allow_lang_propagation: bool = True
    """Allow propagating language from parent to child nodes."""

    # -- Export rules ---------------------------------------------------------
    require_lines_for_alto: bool = True
    """ALTO export requires at least line-level geometry."""

    require_words_for_alto: bool = True
    """ALTO export requires word-level text and geometry."""

    allow_partial_alto: bool = True
    """Allow ALTO export with partial readiness."""

    allow_partial_page: bool = True
    """Allow PAGE export with partial readiness."""

    # -- Enricher rules -------------------------------------------------------
    allow_reading_order_inference: bool = True
    """Allow inferring reading order from spatial position."""

    allow_hyphenation_detection: bool = True
    """Allow detecting word hyphenation at line boundaries."""

    # -- Tolerance ------------------------------------------------------------
    bbox_containment_tolerance: float = 5.0
    """Pixels of allowed overflow for bbox containment checks."""

    @property
    def strict_mode(self) -> bool:
        """True when this policy uses the STRICT preset."""
        return self.mode == PolicyMode.STRICT
75
+
76
+
77
def strict_policy() -> DocumentPolicy:
    """Build the STRICT preset: no inference, no partial exports."""
    # Only the deviations from the STANDARD defaults are listed.
    overrides = dict(
        mode=PolicyMode.STRICT,
        allow_bbox_inference=False,
        allow_partial_alto=False,
        allow_partial_page=False,
        allow_reading_order_inference=False,
        allow_hyphenation_detection=False,
    )
    return DocumentPolicy(**overrides)
87
+
88
+
89
def permissive_policy() -> DocumentPolicy:
    """Build the PERMISSIVE preset: inference and partial exports allowed."""
    # Only the deviations from the STANDARD defaults are listed; the
    # geometry containment tolerance is doubled (5px -> 10px).
    overrides = dict(
        mode=PolicyMode.PERMISSIVE,
        allow_bbox_inference=True,
        allow_partial_alto=True,
        allow_partial_page=True,
        bbox_containment_tolerance=10.0,
    )
    return DocumentPolicy(**overrides)
src/app/policies/export_policy.py CHANGED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Export policy — decides whether a specific export should proceed.
2
+
3
+ Uses the document policy and export eligibility to make a final go/no-go
4
+ decision for each export format.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+
11
+ from src.app.domain.models.readiness import ExportEligibility
12
+ from src.app.domain.models.status import ReadinessLevel
13
+ from src.app.policies.document_policy import DocumentPolicy
14
+
15
+
16
@dataclass(frozen=True)
class ExportDecision:
    """Outcome of an export policy check.

    Attributes:
        allowed: whether the export should proceed.
        level: the readiness level the decision was based on.
        reason: human-readable explanation ("OK" when allowed).
    """

    allowed: bool
    level: ReadinessLevel
    reason: str
23
+
24
+
25
def check_alto_export(
    eligibility: ExportEligibility,
    policy: DocumentPolicy | None = None,
) -> ExportDecision:
    """Decide whether ALTO export should proceed.

    Args:
        eligibility: Pre-computed export eligibility for the document.
        policy: Document policy; defaults to the STANDARD policy.

    Returns:
        An ExportDecision; refused when readiness is NONE or DEGRADED,
        or PARTIAL while the policy forbids partial ALTO exports.
    """
    effective = DocumentPolicy() if policy is None else policy
    level = eligibility.alto_export

    # Collect the first applicable refusal reason, if any.
    refusal: str | None = None
    if level == ReadinessLevel.NONE:
        refusal = "ALTO export not possible: missing required data (word text/geometry or line geometry)"
    elif level == ReadinessLevel.PARTIAL and not effective.allow_partial_alto:
        refusal = "ALTO export is partial but policy does not allow partial exports"
    elif level == ReadinessLevel.DEGRADED:
        refusal = "ALTO export is degraded: too much data missing"

    if refusal is not None:
        return ExportDecision(allowed=False, level=level, reason=refusal)
    return ExportDecision(allowed=True, level=level, reason="OK")
57
+
58
+
59
def check_page_export(
    eligibility: ExportEligibility,
    policy: DocumentPolicy | None = None,
) -> ExportDecision:
    """Decide whether PAGE XML export should proceed.

    Args:
        eligibility: Pre-computed export eligibility for the document.
        policy: Document policy; defaults to the STANDARD policy.

    Returns:
        An ExportDecision; refused when readiness is NONE or DEGRADED,
        or PARTIAL while the policy forbids partial PAGE exports.
    """
    effective = DocumentPolicy() if policy is None else policy
    level = eligibility.page_export

    # Collect the first applicable refusal reason, if any.
    refusal: str | None = None
    if level == ReadinessLevel.NONE:
        refusal = "PAGE export not possible: missing required data"
    elif level == ReadinessLevel.PARTIAL and not effective.allow_partial_page:
        refusal = "PAGE export is partial but policy does not allow partial exports"
    elif level == ReadinessLevel.DEGRADED:
        refusal = "PAGE export is degraded: too much data missing"

    if refusal is not None:
        return ExportDecision(allowed=False, level=level, reason=refusal)
    return ExportDecision(allowed=True, level=level, reason="OK")
src/app/validators/__init__.py CHANGED
@@ -0,0 +1 @@
 
 
1
+ """Validators — structural, readiness, schema, export eligibility."""
src/app/validators/export_eligibility_validator.py CHANGED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Export eligibility validator — decides what can be exported.
2
+
3
+ Consumes readiness assessments and document policy to produce
4
+ an ExportEligibility decision for the whole document.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from src.app.domain.models import CanonicalDocument
10
+ from src.app.domain.models.readiness import ExportEligibility
11
+ from src.app.domain.models.status import ReadinessLevel
12
+ from src.app.policies.document_policy import DocumentPolicy
13
+ from src.app.validators.readiness_validator import (
14
+ compute_page_alto_readiness,
15
+ compute_page_pagexml_readiness,
16
+ )
17
+
18
+
19
def compute_export_eligibility(
    doc: CanonicalDocument,
    policy: DocumentPolicy | None = None,
) -> ExportEligibility:
    """Compute export eligibility for a document.

    Aggregates per-page ALTO / PAGE XML readiness into document-level
    levels, applies policy constraints (strict mode downgrades PARTIAL
    to NONE), and derives a lenient viewer readiness.

    Args:
        doc: The canonical document.
        policy: Document policy (uses default if None).

    Returns:
        ExportEligibility with per-format readiness levels.
    """
    effective = policy if policy is not None else DocumentPolicy()

    alto_export = _aggregate_levels(
        [compute_page_alto_readiness(page).level for page in doc.pages]
    )
    page_export = _aggregate_levels(
        [compute_page_pagexml_readiness(page).level for page in doc.pages]
    )

    # Strict mode refuses "good enough": PARTIAL is downgraded to NONE.
    if effective.strict_mode:
        if alto_export == ReadinessLevel.PARTIAL:
            alto_export = ReadinessLevel.NONE
        if page_export == ReadinessLevel.PARTIAL:
            page_export = ReadinessLevel.NONE

    # The viewer is more lenient — it can render degraded content.
    if alto_export != ReadinessLevel.NONE or page_export != ReadinessLevel.NONE:
        viewer_render = ReadinessLevel.FULL
    elif any(page.text_regions for page in doc.pages):
        viewer_render = ReadinessLevel.DEGRADED
    else:
        viewer_render = ReadinessLevel.NONE

    return ExportEligibility(
        alto_export=alto_export,
        page_export=page_export,
        viewer_render=viewer_render,
    )
66
+
67
+
68
def _aggregate_levels(levels: list[ReadinessLevel]) -> ReadinessLevel:
    """Aggregate per-page readiness into a single document-level readiness.

    Rules: empty input → NONE; uniformly FULL → FULL; uniformly NONE →
    NONE; any FULL or PARTIAL page in a mixed set → PARTIAL; otherwise
    DEGRADED.
    """
    if not levels:
        return ReadinessLevel.NONE

    distinct = set(levels)
    if distinct == {ReadinessLevel.FULL}:
        return ReadinessLevel.FULL
    if distinct == {ReadinessLevel.NONE}:
        return ReadinessLevel.NONE
    if distinct & {ReadinessLevel.FULL, ReadinessLevel.PARTIAL}:
        return ReadinessLevel.PARTIAL
    return ReadinessLevel.DEGRADED
src/app/validators/readiness_validator.py CHANGED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Readiness validator — computes how ready a document is for export.
2
+
3
+ Produces AltoReadiness / PageXmlReadiness per page and DocumentReadiness
4
+ at document level. Does NOT decide whether to allow export — that's the
5
+ export eligibility validator's job.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from src.app.domain.models import CanonicalDocument, Page
11
+ from src.app.domain.models.readiness import (
12
+ AltoReadiness,
13
+ DocumentReadiness,
14
+ PageXmlReadiness,
15
+ )
16
+ from src.app.domain.models.status import (
17
+ GeometryStatus,
18
+ MissingCapability,
19
+ ReadinessLevel,
20
+ )
21
+
22
+
23
def compute_page_alto_readiness(page: Page) -> AltoReadiness:
    """Compute ALTO readiness for a single page.

    ALTO full requires: page dimensions, block bbox, line bbox, word bbox,
    word text. Missing confidence or reading order only reduces the level
    to PARTIAL; missing dimensions, word text, word geometry, or line
    geometry is critical and yields NONE.
    """
    missing: list[MissingCapability] = []

    def note(cap: MissingCapability) -> None:
        # Record each capability at most once, preserving discovery order.
        if cap not in missing:
            missing.append(cap)

    if page.width <= 0 or page.height <= 0:
        note(MissingCapability.PAGE_DIMENSIONS)

    has_blocks = bool(page.text_regions)
    has_lines = False
    has_words = False
    has_word_geo = True
    has_word_text = True
    has_confidence = True

    # Single pass over the hierarchy: flag structural presence and
    # per-word gaps (geometry, text, confidence).
    for region in page.text_regions:
        if region.geometry.status == GeometryStatus.UNKNOWN:
            note(MissingCapability.BLOCK_GEOMETRY)
        for line in region.lines:
            has_lines = True
            if line.geometry.status == GeometryStatus.UNKNOWN:
                note(MissingCapability.LINE_GEOMETRY)
            for word in line.words:
                has_words = True
                if word.geometry.status == GeometryStatus.UNKNOWN:
                    has_word_geo = False
                if not word.text:
                    has_word_text = False
                if word.confidence is None:
                    has_confidence = False

    # Structural gaps: missing levels of the hierarchy entirely.
    if not (has_blocks and has_lines and has_words):
        if not has_words:
            note(MissingCapability.WORD_TEXT)
        if not has_lines:
            note(MissingCapability.LINE_GEOMETRY)

    if not has_word_geo:
        note(MissingCapability.WORD_GEOMETRY)
    if not has_word_text:
        note(MissingCapability.WORD_TEXT)
    if not has_confidence:
        note(MissingCapability.CONFIDENCE)
    if not page.reading_order:
        note(MissingCapability.READING_ORDER)

    level = _level_from_missing(
        missing,
        critical={
            MissingCapability.PAGE_DIMENSIONS,
            MissingCapability.WORD_TEXT,
            MissingCapability.WORD_GEOMETRY,
            MissingCapability.LINE_GEOMETRY,
        },
    )
    return AltoReadiness(level=level, missing=missing)
87
+
88
+
89
def compute_page_pagexml_readiness(page: Page) -> PageXmlReadiness:
    """Compute PAGE XML readiness for a single page.

    PAGE XML is more lenient than ALTO: regions + lines are often
    sufficient and word-level data is nice-to-have. Only missing page
    dimensions or block geometry is critical (yields NONE).
    """
    missing: list[MissingCapability] = []

    def note(cap: MissingCapability) -> None:
        # Record each capability at most once, preserving discovery order.
        if cap not in missing:
            missing.append(cap)

    if page.width <= 0 or page.height <= 0:
        note(MissingCapability.PAGE_DIMENSIONS)

    has_regions = bool(page.text_regions)
    has_lines = False

    # Single pass: flag unknown region/line geometry.
    for region in page.text_regions:
        if region.geometry.status == GeometryStatus.UNKNOWN:
            note(MissingCapability.BLOCK_GEOMETRY)
        for line in region.lines:
            has_lines = True
            if line.geometry.status == GeometryStatus.UNKNOWN:
                note(MissingCapability.LINE_GEOMETRY)

    # Structural gaps: no regions or no lines at all.
    if not has_regions:
        note(MissingCapability.BLOCK_GEOMETRY)
    if not has_lines:
        note(MissingCapability.LINE_GEOMETRY)
    if not page.reading_order:
        note(MissingCapability.READING_ORDER)

    level = _level_from_missing(
        missing,
        critical={
            MissingCapability.PAGE_DIMENSIONS,
            MissingCapability.BLOCK_GEOMETRY,
        },
    )
    return PageXmlReadiness(level=level, missing=missing)
129
+
130
+
131
def compute_document_readiness(doc: CanonicalDocument) -> DocumentReadiness:
    """Compute overall document readiness from per-page readiness.

    NOTE(review): the document level is derived from per-page ALTO
    readiness only (the stricter format); PAGE XML readiness does not
    feed into it. Any fully unready page in a mixed document degrades
    the whole document.
    """
    page_levels = [compute_page_alto_readiness(page).level for page in doc.pages]

    if not page_levels:
        return DocumentReadiness(level=ReadinessLevel.NONE)

    distinct = set(page_levels)
    if distinct == {ReadinessLevel.FULL}:
        overall = ReadinessLevel.FULL
    elif distinct == {ReadinessLevel.NONE}:
        overall = ReadinessLevel.NONE
    elif ReadinessLevel.NONE in distinct:
        overall = ReadinessLevel.DEGRADED
    else:
        overall = ReadinessLevel.PARTIAL

    return DocumentReadiness(level=overall, page_readiness=page_levels)
151
+
152
+
153
def _level_from_missing(
    missing: list[MissingCapability],
    critical: set[MissingCapability],
) -> ReadinessLevel:
    """Map missing capabilities to a readiness level.

    No gaps → FULL; any critical gap → NONE; only non-critical gaps →
    PARTIAL.
    """
    if not missing:
        return ReadinessLevel.FULL
    if critical.intersection(missing):
        return ReadinessLevel.NONE
    return ReadinessLevel.PARTIAL
src/app/validators/schema_validator.py CHANGED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Schema validator — validates a dict/JSON against the CanonicalDocument schema.
2
+
3
+ This wraps Pydantic validation as an explicit service, producing a
4
+ ValidationReport rather than raising exceptions.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any
10
+
11
+ from pydantic import ValidationError
12
+
13
+ from src.app.domain.errors import Severity, ValidationEntry, ValidationReport
14
+ from src.app.domain.models import CanonicalDocument
15
+
16
+ VALIDATOR_NAME = "schema"
17
+
18
+
19
def validate_schema(data: dict[str, Any]) -> tuple[CanonicalDocument | None, ValidationReport]:
    """Validate raw data against the CanonicalDocument schema.

    Returns:
        A tuple of (parsed document or None, validation report).
        On success the document is returned alongside an empty report.
        On failure None is returned and each Pydantic error becomes an
        ERROR entry in the report, with its location joined into a
        dotted path ("root" when the error has no location).
    """
    try:
        document = CanonicalDocument.model_validate(data)
    except ValidationError as exc:
        report = ValidationReport()
        for err in exc.errors():
            parts = [str(part) for part in err["loc"]]
            report.add(ValidationEntry(
                validator=VALIDATOR_NAME,
                severity=Severity.ERROR,
                path=".".join(parts) if parts else "root",
                message=err["msg"],
                code=err["type"],
            ))
        return None, report
    return document, ValidationReport()
src/app/validators/structural_validator.py CHANGED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Structural validator — checks internal consistency of a CanonicalDocument.
2
+
3
+ Checks:
4
+ - ID uniqueness across the entire document
5
+ - reading_order references existing region IDs
6
+ - bbox containment: word ⊂ line ⊂ region ⊂ page (with tolerance)
7
+ - spatial ordering: words in a line, lines in a region
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from src.app.domain.errors import Severity, ValidationEntry, ValidationReport
13
+ from src.app.domain.models import CanonicalDocument
14
+ from src.app.geometry.bbox import contains
15
+
16
+ VALIDATOR_NAME = "structural"
17
+
18
+
19
def validate_structure(
    doc: CanonicalDocument,
    *,
    bbox_tolerance: float = 5.0,
) -> ValidationReport:
    """Run all structural checks on a CanonicalDocument.

    Checks ID uniqueness, reading-order reference validity, and bbox
    containment (allowing *bbox_tolerance* pixels of overflow).
    """
    report = ValidationReport()
    for check in (_check_id_uniqueness, _check_reading_order):
        check(doc, report)
    _check_bbox_containment(doc, report, bbox_tolerance)
    return report
30
+
31
+
32
def _check_id_uniqueness(doc: CanonicalDocument, report: ValidationReport) -> None:
    """Every ID in the document must be unique."""
    seen: dict[str, str] = {}  # id → path where it was first seen

    def walk():
        """Yield (node_id, path) for every addressable node, in document order."""
        for pi, page in enumerate(doc.pages):
            yield page.id, f"pages[{pi}]"
            for ri, region in enumerate(page.text_regions):
                rpath = f"pages[{pi}].text_regions[{ri}]"
                yield region.id, rpath
                for li, line in enumerate(region.lines):
                    lpath = f"{rpath}.lines[{li}]"
                    yield line.id, lpath
                    for wi, word in enumerate(line.words):
                        yield word.id, f"{lpath}.words[{wi}]"
            for ni, ntr in enumerate(page.non_text_regions):
                yield ntr.id, f"pages[{pi}].non_text_regions[{ni}]"

    for node_id, path in walk():
        _register_id(node_id, path, seen, report)
49
+
50
+
51
def _register_id(
    node_id: str, path: str, seen: dict[str, str], report: ValidationReport
) -> None:
    """Record *node_id* at *path*; report an ERROR if it was already registered."""
    first_path = seen.get(node_id)
    if first_path is None:
        seen[node_id] = path
        return
    report.add(ValidationEntry(
        validator=VALIDATOR_NAME,
        severity=Severity.ERROR,
        path=path,
        message=f"Duplicate ID '{node_id}', first seen at {first_path}",
        code="duplicate_id",
    ))
64
+
65
+
66
def _check_reading_order(doc: CanonicalDocument, report: ValidationReport) -> None:
    """reading_order entries must reference existing region IDs."""
    for pi, page in enumerate(doc.pages):
        known_ids = {region.id for region in page.text_regions}
        for idx, ref_id in enumerate(page.reading_order):
            if ref_id in known_ids:
                continue
            report.add(ValidationEntry(
                validator=VALIDATOR_NAME,
                severity=Severity.ERROR,
                path=f"pages[{pi}].reading_order[{idx}]",
                message=f"reading_order references unknown region ID '{ref_id}'",
                code="invalid_reading_order_ref",
            ))
79
+
80
+
81
def _check_bbox_containment(
    doc: CanonicalDocument, report: ValidationReport, tolerance: float
) -> None:
    """Check that child bboxes nest inside parents (word ⊂ line ⊂ region ⊂ page).

    Violations are WARNINGs, not errors — providers frequently emit
    slightly overflowing geometry, hence the pixel *tolerance*.
    """

    def warn(path: str, message: str, code: str) -> None:
        report.add(ValidationEntry(
            validator=VALIDATOR_NAME,
            severity=Severity.WARNING,
            path=path,
            message=message,
            code=code,
        ))

    for pi, page in enumerate(doc.pages):
        # The page itself spans (0, 0) .. (width, height).
        page_bbox = (0.0, 0.0, page.width, page.height)
        for ri, region in enumerate(page.text_regions):
            rpath = f"pages[{pi}].text_regions[{ri}]"
            if not contains(page_bbox, region.geometry.bbox, tolerance):
                warn(
                    rpath,
                    f"Region bbox {region.geometry.bbox} exceeds page bounds ({page.width}x{page.height}) beyond tolerance {tolerance}px",
                    "region_exceeds_page",
                )
            for li, line in enumerate(region.lines):
                lpath = f"{rpath}.lines[{li}]"
                if not contains(region.geometry.bbox, line.geometry.bbox, tolerance):
                    warn(
                        lpath,
                        f"Line bbox exceeds region bbox beyond tolerance {tolerance}px",
                        "line_exceeds_region",
                    )
                for wi, word in enumerate(line.words):
                    wpath = f"{lpath}.words[{wi}]"
                    if not contains(line.geometry.bbox, word.geometry.bbox, tolerance):
                        warn(
                            wpath,
                            f"Word bbox exceeds line bbox beyond tolerance {tolerance}px",
                            "word_exceeds_line",
                        )
tests/unit/test_document_policy.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for document policy and validation report."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from src.app.domain.errors import Severity, ValidationEntry, ValidationReport
6
+ from src.app.policies.document_policy import (
7
+ DocumentPolicy,
8
+ PolicyMode,
9
+ permissive_policy,
10
+ strict_policy,
11
+ )
12
+
13
+
14
class TestDocumentPolicy:
    """Behavioural contract of the policy presets."""

    def test_default_is_standard(self) -> None:
        policy = DocumentPolicy()
        assert policy.mode == PolicyMode.STANDARD
        assert not policy.strict_mode

    def test_strict(self) -> None:
        policy = strict_policy()
        assert policy.strict_mode
        assert not policy.allow_bbox_inference
        assert not policy.allow_partial_alto

    def test_permissive(self) -> None:
        policy = permissive_policy()
        assert policy.mode == PolicyMode.PERMISSIVE
        assert policy.allow_bbox_inference
        assert policy.bbox_containment_tolerance == 10.0

    def test_never_allows_text_invention(self) -> None:
        # No preset may ever enable text invention.
        for factory in (DocumentPolicy, strict_policy, permissive_policy):
            assert factory().allow_text_invention is False

    def test_never_allows_bbox_invention(self) -> None:
        # No preset may ever enable bbox invention.
        for factory in (DocumentPolicy, strict_policy, permissive_policy):
            assert factory().allow_bbox_invention is False

    def test_frozen(self) -> None:
        import pytest
        from pydantic import ValidationError

        policy = DocumentPolicy()
        with pytest.raises(ValidationError):
            policy.mode = PolicyMode.STRICT  # type: ignore[misc]
49
+
50
+
51
class TestValidationReport:
    """Aggregation semantics of ValidationReport."""

    def test_empty_is_valid(self) -> None:
        report = ValidationReport()
        assert report.is_valid
        assert report.error_count == 0
        assert report.warning_count == 0

    def test_with_error(self) -> None:
        report = ValidationReport()
        report.add(ValidationEntry(
            validator="test", severity=Severity.ERROR,
            path="pages[0]", message="bad",
        ))
        assert not report.is_valid
        assert report.error_count == 1

    def test_warnings_dont_invalidate(self) -> None:
        report = ValidationReport()
        report.add(ValidationEntry(
            validator="test", severity=Severity.WARNING,
            path="pages[0]", message="meh",
        ))
        assert report.is_valid
        assert report.warning_count == 1

    def test_merge(self) -> None:
        first = ValidationReport()
        first.add(ValidationEntry(
            validator="a", severity=Severity.ERROR,
            path="x", message="e1",
        ))
        second = ValidationReport()
        second.add(ValidationEntry(
            validator="b", severity=Severity.WARNING,
            path="y", message="w1",
        ))
        first.merge(second)
        assert first.error_count == 1
        assert first.warning_count == 1
        assert len(first.entries) == 2

    def test_errors_property(self) -> None:
        report = ValidationReport()
        report.add(ValidationEntry(validator="a", severity=Severity.ERROR, path="x", message="e"))
        report.add(ValidationEntry(validator="b", severity=Severity.WARNING, path="y", message="w"))
        report.add(ValidationEntry(validator="c", severity=Severity.INFO, path="z", message="i"))
        assert len(report.errors) == 1
        assert len(report.warnings) == 1
tests/unit/test_export_eligibility.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for export eligibility and export policy."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from src.app.domain.models import (
6
+ AltoReadiness,
7
+ CanonicalDocument,
8
+ EvidenceType,
9
+ Geometry,
10
+ GeometryStatus,
11
+ Page,
12
+ PageXmlReadiness,
13
+ Provenance,
14
+ ReadinessLevel,
15
+ Source,
16
+ TextLine,
17
+ TextRegion,
18
+ Word,
19
+ )
20
+ from src.app.domain.models.status import InputType
21
+ from src.app.policies.document_policy import DocumentPolicy, strict_policy
22
+ from src.app.policies.export_policy import check_alto_export, check_page_export
23
+ from src.app.validators.export_eligibility_validator import compute_export_eligibility
24
+
25
+
26
def _prov() -> Provenance:
    """Minimal provenance stamp shared by all fixtures in this module."""
    return Provenance(
        provider="test", adapter="v1", source_ref="$",
        evidence_type=EvidenceType.PROVIDER_NATIVE,
    )
31
+
32
+
33
def _geo() -> Geometry:
    """Small bbox with EXACT status (geometry straight from the provider)."""
    return Geometry(bbox=(10, 10, 100, 30), status=GeometryStatus.EXACT)
35
+
36
+
37
def _complete_doc() -> CanonicalDocument:
    """One-page document with full word-level text, geometry and confidence."""
    return CanonicalDocument(
        document_id="test",
        source=Source(input_type=InputType.IMAGE),
        pages=[Page(
            id="p1", page_index=0, width=2480, height=3508,
            alto_readiness=AltoReadiness(level=ReadinessLevel.FULL),
            page_readiness=PageXmlReadiness(level=ReadinessLevel.FULL),
            reading_order=["tb1"],
            text_regions=[
                TextRegion(id="tb1", geometry=_geo(), provenance=_prov(),
                           lines=[TextLine(id="tl1", geometry=_geo(), provenance=_prov(),
                                           words=[Word(id="w1", text="Hello", geometry=_geo(),
                                                       provenance=_prov(), confidence=0.95)])])],
        )],
    )
53
+
54
+
55
def _empty_doc() -> CanonicalDocument:
    """One-page document with no text regions at all — nothing exportable."""
    return CanonicalDocument(
        document_id="test",
        source=Source(input_type=InputType.IMAGE),
        pages=[Page(id="p1", page_index=0, width=2480, height=3508)],
    )
61
+
62
+
63
class TestExportEligibility:
    """compute_export_eligibility across complete, empty, and partial documents."""

    def test_complete_doc_full_eligible(self) -> None:
        doc = _complete_doc()
        elig = compute_export_eligibility(doc)
        assert elig.alto_export == ReadinessLevel.FULL
        assert elig.page_export == ReadinessLevel.FULL
        assert elig.viewer_render == ReadinessLevel.FULL

    def test_empty_doc_none(self) -> None:
        doc = _empty_doc()
        elig = compute_export_eligibility(doc)
        assert elig.alto_export == ReadinessLevel.NONE
        assert elig.page_export == ReadinessLevel.NONE
        assert elig.viewer_render == ReadinessLevel.NONE

    def test_strict_policy_downgrades_partial(self) -> None:
        # A doc with missing confidence → partial
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=[Page(
                id="p1", page_index=0, width=2480, height=3508,
                reading_order=["tb1"],
                text_regions=[
                    TextRegion(id="tb1", geometry=_geo(), provenance=_prov(),
                               lines=[TextLine(id="tl1", geometry=_geo(), provenance=_prov(),
                                               words=[Word(id="w1", text="Hello", geometry=_geo(),
                                                           provenance=_prov(), confidence=None)])])],
            )],
        )
        policy = strict_policy()
        elig = compute_export_eligibility(doc, policy)
        # Strict mode downgrades partial to none
        assert elig.alto_export == ReadinessLevel.NONE

    def test_viewer_degraded_for_regions_without_exports(self) -> None:
        # Doc with unknown word geometry — ALTO none, but viewer shows something
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=[Page(
                id="p1", page_index=0, width=2480, height=3508,
                text_regions=[
                    TextRegion(id="tb1", geometry=_geo(), provenance=_prov(),
                               lines=[TextLine(id="tl1", geometry=_geo(), provenance=_prov(),
                                               words=[Word(id="w1", text="Hello",
                                                           geometry=Geometry(bbox=(10, 10, 100, 30),
                                                                             status=GeometryStatus.UNKNOWN),
                                                           provenance=_prov())])])],
            )],
        )
        elig = compute_export_eligibility(doc)
        # ALTO is none (missing word geo), but viewer can show regions
        assert elig.viewer_render in (ReadinessLevel.FULL, ReadinessLevel.DEGRADED)
117
+
118
+
119
class TestExportPolicy:
    """Go/no-go decisions of check_alto_export / check_page_export."""

    def test_alto_allowed_full(self) -> None:
        doc = _complete_doc()
        elig = compute_export_eligibility(doc)
        decision = check_alto_export(elig)
        assert decision.allowed is True
        assert decision.reason == "OK"

    def test_alto_refused_none(self) -> None:
        doc = _empty_doc()
        elig = compute_export_eligibility(doc)
        decision = check_alto_export(elig)
        assert decision.allowed is False
        assert "not possible" in decision.reason

    def test_alto_partial_default_allowed(self) -> None:
        # Missing confidence → PARTIAL; default policy allows partial ALTO.
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=[Page(
                id="p1", page_index=0, width=2480, height=3508,
                reading_order=["tb1"],
                text_regions=[
                    TextRegion(id="tb1", geometry=_geo(), provenance=_prov(),
                               lines=[TextLine(id="tl1", geometry=_geo(), provenance=_prov(),
                                               words=[Word(id="w1", text="Hello", geometry=_geo(),
                                                           provenance=_prov(), confidence=None)])])],
            )],
        )
        elig = compute_export_eligibility(doc)
        decision = check_alto_export(elig)
        assert decision.allowed is True

    def test_alto_partial_strict_refused(self) -> None:
        # Same PARTIAL document, but the strict policy refuses partial exports.
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=[Page(
                id="p1", page_index=0, width=2480, height=3508,
                reading_order=["tb1"],
                text_regions=[
                    TextRegion(id="tb1", geometry=_geo(), provenance=_prov(),
                               lines=[TextLine(id="tl1", geometry=_geo(), provenance=_prov(),
                                               words=[Word(id="w1", text="Hello", geometry=_geo(),
                                                           provenance=_prov(), confidence=None)])])],
            )],
        )
        policy = strict_policy()
        elig = compute_export_eligibility(doc, policy)
        decision = check_alto_export(elig, policy)
        assert decision.allowed is False

    def test_page_allowed_full(self) -> None:
        doc = _complete_doc()
        elig = compute_export_eligibility(doc)
        decision = check_page_export(elig)
        assert decision.allowed is True

    def test_page_refused_none(self) -> None:
        doc = _empty_doc()
        elig = compute_export_eligibility(doc)
        decision = check_page_export(elig)
        assert decision.allowed is False
tests/unit/test_readiness_validator.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the readiness validator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from src.app.domain.models import (
6
+ AltoReadiness,
7
+ CanonicalDocument,
8
+ EvidenceType,
9
+ Geometry,
10
+ GeometryStatus,
11
+ Page,
12
+ PageXmlReadiness,
13
+ Provenance,
14
+ ReadinessLevel,
15
+ Source,
16
+ TextLine,
17
+ TextRegion,
18
+ Word,
19
+ )
20
+ from src.app.domain.models.status import InputType, MissingCapability
21
+ from src.app.validators.readiness_validator import (
22
+ compute_document_readiness,
23
+ compute_page_alto_readiness,
24
+ compute_page_pagexml_readiness,
25
+ )
26
+
27
+
28
def _prov() -> Provenance:
    """Minimal provider-native provenance for fixtures."""
    return Provenance(
        provider="test",
        adapter="v1",
        source_ref="$",
        evidence_type=EvidenceType.PROVIDER_NATIVE,
    )
33
+
34
+
35
def _geo(status: GeometryStatus = GeometryStatus.EXACT) -> Geometry:
    """A small fixed bbox carrying the given geometry *status*."""
    return Geometry(status=status, bbox=(10, 10, 100, 30))
37
+
38
+
39
def _complete_page() -> Page:
    """A fully complete page with all data (text, geometry, confidence, order)."""
    word = Word(
        id="w1", text="Hello", geometry=_geo(),
        provenance=_prov(), confidence=0.95,
    )
    line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
    region = TextRegion(
        id="tb1", geometry=_geo(), provenance=_prov(), lang="fra", lines=[line],
    )
    return Page(
        id="p1", page_index=0, width=2480, height=3508,
        alto_readiness=AltoReadiness(level=ReadinessLevel.FULL),
        page_readiness=PageXmlReadiness(level=ReadinessLevel.FULL),
        reading_order=["tb1"],
        text_regions=[region],
    )
61
+
62
+
63
class TestAltoReadiness:
    """Per-page ALTO readiness levels and their missing-capability reasons."""

    def test_complete_page_is_full(self) -> None:
        readiness = compute_page_alto_readiness(_complete_page())
        assert readiness.level == ReadinessLevel.FULL
        assert readiness.missing == []

    def test_missing_word_geometry_is_none(self) -> None:
        # A word whose geometry status is UNKNOWN blocks ALTO entirely.
        word = Word(
            id="w1", text="Hello",
            geometry=_geo(GeometryStatus.UNKNOWN), provenance=_prov(),
        )
        line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
        region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(), lines=[line])
        page = Page(
            id="p1", page_index=0, width=2480, height=3508,
            text_regions=[region],
        )
        readiness = compute_page_alto_readiness(page)
        assert readiness.level == ReadinessLevel.NONE
        assert MissingCapability.WORD_GEOMETRY in readiness.missing

    def test_missing_confidence_is_partial(self) -> None:
        # Text and geometry are present but no confidence: degraded, not blocked.
        word = Word(
            id="w1", text="Hello", geometry=_geo(),
            provenance=_prov(), confidence=None,
        )
        line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
        region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(), lines=[line])
        page = Page(
            id="p1", page_index=0, width=2480, height=3508,
            reading_order=["tb1"], text_regions=[region],
        )
        readiness = compute_page_alto_readiness(page)
        assert readiness.level == ReadinessLevel.PARTIAL
        assert MissingCapability.CONFIDENCE in readiness.missing

    def test_no_reading_order_is_partial(self) -> None:
        # Full word data but an empty reading order only degrades the page.
        word = Word(
            id="w1", text="Hello", geometry=_geo(),
            provenance=_prov(), confidence=0.9,
        )
        line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
        region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(), lines=[line])
        page = Page(
            id="p1", page_index=0, width=2480, height=3508,
            reading_order=[], text_regions=[region],
        )
        readiness = compute_page_alto_readiness(page)
        assert readiness.level == ReadinessLevel.PARTIAL
        assert MissingCapability.READING_ORDER in readiness.missing

    def test_empty_page_is_none(self) -> None:
        bare = Page(id="p1", page_index=0, width=2480, height=3508)
        assert compute_page_alto_readiness(bare).level == ReadinessLevel.NONE
137
+
138
+
139
class TestPageXmlReadiness:
    """PAGE XML readiness: regions and lines suffice; word geometry is optional."""

    def test_complete_page_is_full(self) -> None:
        result = compute_page_pagexml_readiness(_complete_page())
        assert result.level == ReadinessLevel.FULL

    def test_no_regions_is_none(self) -> None:
        bare = Page(id="p1", page_index=0, width=2480, height=3508)
        assert compute_page_pagexml_readiness(bare).level == ReadinessLevel.NONE

    def test_regions_without_word_geo_still_ok(self) -> None:
        """PAGE XML is more lenient — word geometry is not critical."""
        word = Word(
            id="w1", text="Hello",
            geometry=_geo(GeometryStatus.UNKNOWN), provenance=_prov(),
        )
        line = TextLine(id="tl1", geometry=_geo(), provenance=_prov(), words=[word])
        region = TextRegion(id="tb1", geometry=_geo(), provenance=_prov(), lines=[line])
        page = Page(
            id="p1", page_index=0, width=2480, height=3508,
            reading_order=["tb1"], text_regions=[region],
        )
        result = compute_page_pagexml_readiness(page)
        # PAGE doesn't require word geometry — should still be achievable.
        assert result.level in (ReadinessLevel.FULL, ReadinessLevel.PARTIAL)
172
+
173
+
174
class TestDocumentReadiness:
    """Document-level readiness aggregates the per-page results."""

    def test_single_full_page(self) -> None:
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=[_complete_page()],
        )
        assert compute_document_readiness(doc).level == ReadinessLevel.FULL

    def test_mixed_pages(self) -> None:
        # One complete page plus one empty page degrades the whole document.
        pages = [
            _complete_page(),
            Page(id="p2", page_index=1, width=2480, height=3508),
        ]
        doc = CanonicalDocument(
            document_id="test",
            source=Source(input_type=InputType.IMAGE),
            pages=pages,
        )
        result = compute_document_readiness(doc)
        assert result.level == ReadinessLevel.DEGRADED
        assert len(result.page_readiness) == 2
tests/unit/test_schema_validator.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the schema validator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from src.app.validators.schema_validator import validate_schema
6
+
7
+
8
class TestSchemaValidator:
    """validate_schema returns (document | None, ValidationReport) without raising.

    Fix: test_error_paths_populated previously iterated report.errors without
    asserting the list was non-empty, so its per-entry assertions could pass
    vacuously on an empty report. It now asserts error_count > 0 first.
    """

    @staticmethod
    def _prov_payload() -> dict:
        """Provider-native provenance payload shared by the nested fixtures."""
        return {
            "provider": "test", "adapter": "v1",
            "source_ref": "$", "evidence_type": "provider_native",
            "derived_from": [],
        }

    def test_valid_document(self) -> None:
        word = {
            "id": "w1",
            "text": "Hello",
            "geometry": {"bbox": [100, 200, 50, 30], "status": "exact"},
            "provenance": self._prov_payload(),
        }
        line = {
            "id": "tl1",
            "geometry": {"bbox": [100, 200, 300, 50], "status": "exact"},
            "provenance": self._prov_payload(),
            "words": [word],
        }
        region = {
            "id": "tb1",
            "geometry": {"bbox": [100, 200, 300, 50], "status": "exact"},
            "provenance": self._prov_payload(),
            "lines": [line],
        }
        data = {
            "schema_version": "1.0.0",
            "document_id": "doc1",
            "source": {"input_type": "image"},
            "pages": [{
                "id": "p1",
                "page_index": 0,
                "width": 2480,
                "height": 3508,
                "text_regions": [region],
            }],
        }
        doc, report = validate_schema(data)
        assert doc is not None
        assert report.is_valid

    def test_missing_required_field(self) -> None:
        # document_id is absent entirely.
        data = {
            "source": {"input_type": "image"},
            "pages": [],
        }
        doc, report = validate_schema(data)
        assert doc is None
        assert not report.is_valid
        assert report.error_count > 0

    def test_invalid_schema_version(self) -> None:
        data = {
            "schema_version": "bad",
            "document_id": "doc1",
            "source": {"input_type": "image"},
            "pages": [{"id": "p1", "page_index": 0, "width": 100, "height": 100}],
        }
        doc, report = validate_schema(data)
        assert doc is None
        assert not report.is_valid

    def test_empty_pages(self) -> None:
        data = {
            "document_id": "doc1",
            "source": {"input_type": "image"},
            "pages": [],
        }
        doc, report = validate_schema(data)
        assert doc is None
        assert report.error_count > 0

    def test_error_paths_populated(self) -> None:
        # Empty document_id and empty pages both violate the schema.
        data = {
            "document_id": "",
            "source": {"input_type": "image"},
            "pages": [],
        }
        doc, report = validate_schema(data)
        assert doc is None
        # Guard against a vacuous loop: there must be at least one error entry.
        assert report.error_count > 0
        for entry in report.errors:
            assert entry.path
            assert entry.message
            assert entry.validator == "schema"
tests/unit/test_structural_validator.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for the structural validator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from src.app.domain.models import (
6
+ AltoReadiness,
7
+ CanonicalDocument,
8
+ EvidenceType,
9
+ Geometry,
10
+ GeometryStatus,
11
+ NonTextRegion,
12
+ Page,
13
+ PageXmlReadiness,
14
+ Provenance,
15
+ ReadinessLevel,
16
+ Source,
17
+ TextLine,
18
+ TextRegion,
19
+ Word,
20
+ )
21
+ from src.app.domain.models.status import BlockRole, InputType, NonTextKind
22
+ from src.app.validators.structural_validator import validate_structure
23
+
24
+
25
def _prov() -> Provenance:
    """Minimal provider-native provenance for fixtures."""
    return Provenance(
        provider="test",
        adapter="v1",
        source_ref="$",
        evidence_type=EvidenceType.PROVIDER_NATIVE,
    )
30
+
31
+
32
def _geo(x: float, y: float, w: float, h: float) -> Geometry:
    """An EXACT geometry with bbox ``(x, y, w, h)``."""
    return Geometry(status=GeometryStatus.EXACT, bbox=(x, y, w, h))
34
+
35
+
36
def _word(wid: str, x: float, y: float, w: float, h: float) -> Word:
    """A word with id *wid*, text 'word' and the given bbox."""
    return Word(id=wid, text="word", provenance=_prov(), geometry=_geo(x, y, w, h))
38
+
39
+
40
def _line(lid: str, x: float, y: float, w: float, h: float, words: list[Word]) -> TextLine:
    """A text line with id *lid*, the given bbox and the given words."""
    return TextLine(id=lid, provenance=_prov(), geometry=_geo(x, y, w, h), words=words)
42
+
43
+
44
def _region(rid: str, x: float, y: float, w: float, h: float, lines: list[TextLine]) -> TextRegion:
    """A text region with id *rid*, the given bbox and the given lines."""
    return TextRegion(id=rid, provenance=_prov(), geometry=_geo(x, y, w, h), lines=lines)
46
+
47
+
48
def _doc(regions: list[TextRegion], width: float = 1000, height: float = 1000,
         reading_order: list[str] | None = None,
         non_text: list[NonTextRegion] | None = None) -> CanonicalDocument:
    """A one-page document wrapping *regions*.

    reading_order defaults to the region IDs in order; non_text defaults to [].
    """
    order = [region.id for region in regions] if reading_order is None else reading_order
    page = Page(
        id="p1", page_index=0, width=width, height=height,
        alto_readiness=AltoReadiness(level=ReadinessLevel.FULL),
        page_readiness=PageXmlReadiness(level=ReadinessLevel.FULL),
        reading_order=order,
        text_regions=regions,
        non_text_regions=non_text if non_text else [],
    )
    return CanonicalDocument(
        document_id="test",
        source=Source(input_type=InputType.IMAGE),
        pages=[page],
    )
64
+
65
+
66
class TestIdUniqueness:
    """Element IDs must be unique across the whole document."""

    def test_all_unique_passes(self) -> None:
        line = _line("tl1", 0, 0, 500, 40, [_word("w1", 0, 0, 50, 30)])
        doc = _doc([_region("tb1", 0, 0, 500, 200, [line])])
        assert validate_structure(doc).is_valid

    def test_duplicate_word_ids(self) -> None:
        # Two words sharing the same ID inside a single line.
        clashing = [_word("w1", 0, 0, 50, 30), _word("w1", 60, 0, 50, 30)]
        line = _line("tl1", 0, 0, 500, 40, clashing)
        doc = _doc([_region("tb1", 0, 0, 500, 200, [line])])
        report = validate_structure(doc)
        assert not report.is_valid
        assert any("Duplicate ID 'w1'" in entry.message for entry in report.errors)

    def test_duplicate_across_levels(self) -> None:
        # A line reusing its parent region's ID is still a collision.
        line = _line("same_id", 0, 0, 500, 40, [_word("w1", 0, 0, 50, 30)])
        doc = _doc([_region("same_id", 0, 0, 500, 200, [line])])
        assert not validate_structure(doc).is_valid

    def test_duplicate_with_non_text_region(self) -> None:
        # A non-text region colliding with a text region ID.
        figure = NonTextRegion(
            id="tb1", kind=NonTextKind.ILLUSTRATION,
            geometry=_geo(600, 0, 100, 100), provenance=_prov(),
        )
        line = _line("tl1", 0, 0, 500, 40, [_word("w1", 0, 0, 50, 30)])
        doc = _doc([_region("tb1", 0, 0, 500, 200, [line])], non_text=[figure])
        assert not validate_structure(doc).is_valid
112
+
113
+
114
class TestReadingOrder:
    """reading_order entries must reference existing region IDs."""

    def test_valid_references(self) -> None:
        line = _line("tl1", 0, 0, 500, 40, [_word("w1", 0, 0, 50, 30)])
        doc = _doc([_region("tb1", 0, 0, 500, 200, [line])], reading_order=["tb1"])
        assert validate_structure(doc).is_valid

    def test_invalid_reference(self) -> None:
        line = _line("tl1", 0, 0, 500, 40, [_word("w1", 0, 0, 50, 30)])
        doc = _doc(
            [_region("tb1", 0, 0, 500, 200, [line])],
            reading_order=["tb1", "tb_nonexistent"],
        )
        report = validate_structure(doc)
        assert not report.is_valid
        assert any("unknown region ID" in entry.message for entry in report.errors)
133
+
134
+
135
class TestBboxContainment:
    """Containment checks (word in line in region in page) produce warnings."""

    @staticmethod
    def _nested(
        region_box: tuple[float, float, float, float],
        line_box: tuple[float, float, float, float],
        word_box: tuple[float, float, float, float],
        **doc_kwargs: object,
    ) -> CanonicalDocument:
        """One-region/one-line/one-word document with the given bboxes."""
        word = _word("w1", *word_box)
        line = _line("tl1", *line_box, [word])
        region = _region("tb1", *region_box, [line])
        return _doc([region], **doc_kwargs)

    def test_all_contained_passes(self) -> None:
        doc = self._nested((10, 10, 200, 100), (20, 20, 150, 30), (25, 22, 50, 25))
        assert validate_structure(doc).warning_count == 0

    def test_word_exceeds_line(self) -> None:
        # Word is wider than its containing line.
        doc = self._nested((10, 10, 400, 100), (20, 20, 100, 30), (20, 20, 200, 30))
        report = validate_structure(doc, bbox_tolerance=0)
        assert report.warning_count > 0
        assert any("word_exceeds_line" in (w.code or "") for w in report.warnings)

    def test_tolerance_allows_small_overflow(self) -> None:
        # 3px overflow against a 5px tolerance: accepted.
        doc = self._nested((10, 10, 200, 100), (20, 20, 100, 30), (20, 20, 103, 30))
        assert validate_structure(doc, bbox_tolerance=5).warning_count == 0

    def test_tolerance_rejects_large_overflow(self) -> None:
        # 20px overflow against a 5px tolerance: flagged.
        doc = self._nested((10, 10, 200, 100), (20, 20, 100, 30), (20, 20, 120, 30))
        assert validate_structure(doc, bbox_tolerance=5).warning_count > 0

    def test_region_exceeds_page(self) -> None:
        # Region pokes past the 1000x1000 page boundary.
        doc = self._nested(
            (900, 900, 200, 200), (900, 900, 100, 30), (900, 900, 50, 25),
            width=1000, height=1000,
        )
        report = validate_structure(doc, bbox_tolerance=0)
        assert any("region_exceeds_page" in (w.code or "") for w in report.warnings)

    def test_line_exceeds_region(self) -> None:
        # Line is wider than its containing region.
        doc = self._nested((10, 10, 100, 50), (10, 10, 200, 30), (10, 10, 50, 25))
        report = validate_structure(doc, bbox_tolerance=0)
        assert any("line_exceeds_region" in (w.code or "") for w in report.warnings)