Spaces:
Sleeping
Sleeping
| """Tests for the structural validator.""" | |
| from __future__ import annotations | |
| from src.app.domain.models import ( | |
| AltoReadiness, | |
| CanonicalDocument, | |
| EvidenceType, | |
| Geometry, | |
| GeometryStatus, | |
| NonTextRegion, | |
| Page, | |
| PageXmlReadiness, | |
| Provenance, | |
| ReadinessLevel, | |
| Source, | |
| TextLine, | |
| TextRegion, | |
| Word, | |
| ) | |
| from src.app.domain.models.status import InputType, NonTextKind | |
| from src.app.validators.structural_validator import validate_structure | |
def _prov() -> Provenance:
    """Return the fixed provider-native provenance shared by all test fixtures."""
    provenance = Provenance(
        provider="test",
        adapter="v1",
        source_ref="$",
        evidence_type=EvidenceType.PROVIDER_NATIVE,
    )
    return provenance
def _geo(x: float, y: float, w: float, h: float) -> Geometry:
    """Build a ``Geometry`` with status ``EXACT`` whose bbox is ``(x, y, w, h)``."""
    bbox = (x, y, w, h)
    return Geometry(bbox=bbox, status=GeometryStatus.EXACT)
def _word(wid: str, x: float, y: float, w: float, h: float) -> Word:
    """Build a ``Word`` with ID *wid*, the fixed text ``"word"`` and the given bbox."""
    geometry = _geo(x, y, w, h)
    provenance = _prov()
    return Word(id=wid, text="word", geometry=geometry, provenance=provenance)
def _line(
    lid: str,
    x: float,
    y: float,
    w: float,
    h: float,
    words: list[Word],
) -> TextLine:
    """Build a ``TextLine`` with ID *lid*, the given bbox and the supplied *words*."""
    geometry = _geo(x, y, w, h)
    provenance = _prov()
    return TextLine(id=lid, geometry=geometry, provenance=provenance, words=words)
def _region(
    rid: str,
    x: float,
    y: float,
    w: float,
    h: float,
    lines: list[TextLine],
) -> TextRegion:
    """Build a ``TextRegion`` with ID *rid*, the given bbox and the supplied *lines*."""
    geometry = _geo(x, y, w, h)
    provenance = _prov()
    return TextRegion(id=rid, geometry=geometry, provenance=provenance, lines=lines)
def _doc(
    regions: list[TextRegion],
    width: float = 1000,
    height: float = 1000,
    reading_order: list[str] | None = None,
    non_text: list[NonTextRegion] | None = None,
) -> CanonicalDocument:
    """Wrap *regions* in a single-page ``CanonicalDocument`` for validation tests.

    Args:
        regions: Text regions placed on the only page.
        width: Page width passed to ``Page``.
        height: Page height passed to ``Page``.
        reading_order: Region IDs to use as the page reading order. When
            ``None``, the IDs of *regions* are used in list order; any
            explicitly passed list (including an empty one) is used as-is.
        non_text: Non-text regions for the page; a falsy value becomes ``[]``.

    Returns:
        A document with ID ``"test"``, an image source and one page ``"p1"``
        whose ALTO and PAGE XML readiness are both ``ReadinessLevel.FULL``.
    """
    # Only None triggers the default; an explicit empty list must be kept.
    if reading_order is None:
        order = [region.id for region in regions]
    else:
        order = reading_order

    source = Source(input_type=InputType.IMAGE)
    page = Page(
        id="p1",
        page_index=0,
        width=width,
        height=height,
        alto_readiness=AltoReadiness(level=ReadinessLevel.FULL),
        page_readiness=PageXmlReadiness(level=ReadinessLevel.FULL),
        reading_order=order,
        text_regions=regions,
        non_text_regions=non_text or [],
    )
    return CanonicalDocument(document_id="test", source=source, pages=[page])
class TestIdUniqueness:
    """ID-uniqueness checks across regions, lines, words and non-text regions."""

    def test_all_unique_passes(self) -> None:
        """A document whose IDs are all distinct is reported as valid."""
        word = _word("w1", 0, 0, 50, 30)
        line = _line("tl1", 0, 0, 500, 40, [word])
        region = _region("tb1", 0, 0, 500, 200, [line])

        result = validate_structure(_doc([region]))

        assert result.is_valid

    def test_duplicate_word_ids(self) -> None:
        """Two words sharing an ID in one line make the document invalid."""
        first = _word("w1", 0, 0, 50, 30)
        clash = _word("w1", 60, 0, 50, 30)  # same ID as `first`
        line = _line("tl1", 0, 0, 500, 40, [first, clash])
        region = _region("tb1", 0, 0, 500, 200, [line])

        result = validate_structure(_doc([region]))

        assert not result.is_valid
        assert any("Duplicate ID 'w1'" in err.message for err in result.errors)

    def test_duplicate_across_levels(self) -> None:
        """A line reusing its parent region's ID makes the document invalid."""
        word = _word("w1", 0, 0, 50, 30)
        line = _line("same_id", 0, 0, 500, 40, [word])
        region = _region("same_id", 0, 0, 500, 200, [line])

        result = validate_structure(_doc([region]))

        assert not result.is_valid

    def test_duplicate_with_non_text_region(self) -> None:
        """A non-text region reusing a text region's ID makes the document invalid."""
        illustration = NonTextRegion(
            id="tb1",
            kind=NonTextKind.ILLUSTRATION,
            geometry=_geo(600, 0, 100, 100),
            provenance=_prov(),
        )
        word = _word("w1", 0, 0, 50, 30)
        line = _line("tl1", 0, 0, 500, 40, [word])
        region = _region("tb1", 0, 0, 500, 200, [line])

        result = validate_structure(_doc([region], non_text=[illustration]))

        assert not result.is_valid
class TestReadingOrder:
    """Checks that reading-order entries must refer to regions on the page."""

    def test_valid_references(self) -> None:
        """A reading order naming only the existing region is reported as valid."""
        word = _word("w1", 0, 0, 50, 30)
        line = _line("tl1", 0, 0, 500, 40, [word])
        region = _region("tb1", 0, 0, 500, 200, [line])
        document = _doc([region], reading_order=["tb1"])

        result = validate_structure(document)

        assert result.is_valid

    def test_invalid_reference(self) -> None:
        """A reading order naming a missing region yields an 'unknown region ID' error."""
        word = _word("w1", 0, 0, 50, 30)
        line = _line("tl1", 0, 0, 500, 40, [word])
        region = _region("tb1", 0, 0, 500, 200, [line])
        document = _doc([region], reading_order=["tb1", "tb_nonexistent"])

        result = validate_structure(document)

        assert not result.is_valid
        assert any("unknown region ID" in err.message for err in result.errors)
class TestBboxContainment:
    """Bounding-box containment checks: word in line, line in region, region in page."""

    def test_all_contained_passes(self) -> None:
        """Properly nested boxes produce no warnings."""
        word = _word("w1", 25, 22, 50, 25)
        line = _line("tl1", 20, 20, 150, 30, [word])
        region = _region("tb1", 10, 10, 200, 100, [line])

        result = validate_structure(_doc([region]))

        assert result.warning_count == 0

    def test_word_exceeds_line(self) -> None:
        """With zero tolerance, a word wider than its line is warned about."""
        wide_word = _word("w1", 20, 20, 200, 30)  # word wider than line
        line = _line("tl1", 20, 20, 100, 30, [wide_word])
        region = _region("tb1", 10, 10, 400, 100, [line])

        result = validate_structure(_doc([region]), bbox_tolerance=0)

        assert result.warning_count > 0
        assert any(
            "word_exceeds_line" in (warning.code or "")
            for warning in result.warnings
        )

    def test_tolerance_allows_small_overflow(self) -> None:
        """A 3px overflow produces no warning when the tolerance is 5."""
        word = _word("w1", 20, 20, 103, 30)  # 3px overflow
        line = _line("tl1", 20, 20, 100, 30, [word])
        region = _region("tb1", 10, 10, 200, 100, [line])

        result = validate_structure(_doc([region]), bbox_tolerance=5)

        assert result.warning_count == 0

    def test_tolerance_rejects_large_overflow(self) -> None:
        """A 20px overflow still produces a warning when the tolerance is 5."""
        word = _word("w1", 20, 20, 120, 30)  # 20px overflow
        line = _line("tl1", 20, 20, 100, 30, [word])
        region = _region("tb1", 10, 10, 200, 100, [line])

        result = validate_structure(_doc([region]), bbox_tolerance=5)

        assert result.warning_count > 0

    def test_region_exceeds_page(self) -> None:
        """With zero tolerance, a region extending past the page is warned about."""
        word = _word("w1", 900, 900, 50, 25)
        line = _line("tl1", 900, 900, 100, 30, [word])
        region = _region("tb1", 900, 900, 200, 200, [line])  # exceeds 1000x1000 page
        document = _doc([region], width=1000, height=1000)

        result = validate_structure(document, bbox_tolerance=0)

        assert any(
            "region_exceeds_page" in (warning.code or "")
            for warning in result.warnings
        )

    def test_line_exceeds_region(self) -> None:
        """With zero tolerance, a line wider than its region is warned about."""
        word = _word("w1", 10, 10, 50, 25)
        wide_line = _line("tl1", 10, 10, 200, 30, [word])  # line wider than region
        region = _region("tb1", 10, 10, 100, 50, [wide_line])

        result = validate_structure(_doc([region]), bbox_tolerance=0)

        assert any(
            "line_exceeds_region" in (warning.code or "")
            for warning in result.warnings
        )