Spaces:

Ma-Ri-Ba-Ku
/

XmLLM

Sleeping

File size: 6,910 Bytes

e2ec8a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bbbfba8
e2ec8a2

"""Tests for the structural validator."""

from __future__ import annotations

from src.app.domain.models import (
    AltoReadiness,
    CanonicalDocument,
    EvidenceType,
    Geometry,
    GeometryStatus,
    NonTextRegion,
    Page,
    PageXmlReadiness,
    Provenance,
    ReadinessLevel,
    Source,
    TextLine,
    TextRegion,
    Word,
)
from src.app.domain.models.status import InputType, NonTextKind
from src.app.validators.structural_validator import validate_structure


def _prov() -> Provenance:
    """Build the placeholder ``Provenance`` attached to every fixture element.

    Returns:
        A new ``Provenance`` with fixed provider/adapter/source values and
        ``EvidenceType.PROVIDER_NATIVE`` as its evidence type.
    """
    fields = {
        "provider": "test",
        "adapter": "v1",
        "source_ref": "$",
        "evidence_type": EvidenceType.PROVIDER_NATIVE,
    }
    return Provenance(**fields)


def _geo(x: float, y: float, w: float, h: float) -> Geometry:
    """Wrap four box numbers in a ``Geometry`` marked ``GeometryStatus.EXACT``.

    Args:
        x: First bbox component (horizontal origin).
        y: Second bbox component (vertical origin).
        w: Third bbox component (width).
        h: Fourth bbox component (height).

    Returns:
        A ``Geometry`` whose ``bbox`` is the tuple ``(x, y, w, h)``.
    """
    box = (x, y, w, h)
    return Geometry(bbox=box, status=GeometryStatus.EXACT)


def _word(wid: str, x: float, y: float, w: float, h: float) -> Word:
    """Create a ``Word`` fixture with the constant text ``"word"``.

    Args:
        wid: ID assigned to the word.
        x, y, w, h: Box values forwarded to ``_geo``.

    Returns:
        A ``Word`` carrying the given ID, the built geometry and the
        placeholder provenance from ``_prov``.
    """
    box = _geo(x, y, w, h)
    origin = _prov()
    return Word(id=wid, text="word", geometry=box, provenance=origin)


def _line(lid: str, x: float, y: float, w: float, h: float, words: list[Word]) -> TextLine:
    """Create a ``TextLine`` fixture that holds the supplied words.

    Args:
        lid: ID assigned to the line.
        x, y, w, h: Box values forwarded to ``_geo``.
        words: Words stored on the line, passed through unchanged.

    Returns:
        A ``TextLine`` with the built geometry and placeholder provenance.
    """
    box = _geo(x, y, w, h)
    origin = _prov()
    return TextLine(id=lid, geometry=box, provenance=origin, words=words)


def _region(rid: str, x: float, y: float, w: float, h: float, lines: list[TextLine]) -> TextRegion:
    """Create a ``TextRegion`` fixture that holds the supplied lines.

    Args:
        rid: ID assigned to the region.
        x, y, w, h: Box values forwarded to ``_geo``.
        lines: Text lines stored in the region, passed through unchanged.

    Returns:
        A ``TextRegion`` with the built geometry and placeholder provenance.
    """
    box = _geo(x, y, w, h)
    origin = _prov()
    return TextRegion(id=rid, geometry=box, provenance=origin, lines=lines)


def _doc(regions: list[TextRegion], width: float = 1000, height: float = 1000,
         reading_order: list[str] | None = None,
         non_text: list[NonTextRegion] | None = None) -> CanonicalDocument:
    """Assemble a single-page ``CanonicalDocument`` around the given regions.

    Args:
        regions: Text regions placed on the page.
        width: Page width.
        height: Page height.
        reading_order: Region IDs used as the page reading order. When
            ``None``, the IDs of ``regions`` are used in list order.
        non_text: Non-text regions for the page; a falsy value becomes an
            empty list.

    Returns:
        A document with ID ``"test"``, an image source and one page ``"p1"``
        whose ALTO and PAGE readiness are both ``ReadinessLevel.FULL``.
    """
    if reading_order is None:
        order = [region.id for region in regions]
    else:
        order = reading_order

    image_source = Source(input_type=InputType.IMAGE)
    single_page = Page(
        id="p1",
        page_index=0,
        width=width,
        height=height,
        alto_readiness=AltoReadiness(level=ReadinessLevel.FULL),
        page_readiness=PageXmlReadiness(level=ReadinessLevel.FULL),
        reading_order=order,
        text_regions=regions,
        non_text_regions=non_text or [],
    )
    return CanonicalDocument(
        document_id="test",
        source=image_source,
        pages=[single_page],
    )


class TestIdUniqueness:
    """How ``validate_structure`` reports element IDs that are reused."""

    def test_all_unique_passes(self) -> None:
        """Distinct region, line and word IDs give a valid report."""
        word = _word("w1", 0, 0, 50, 30)
        line = _line("tl1", 0, 0, 500, 40, [word])
        region = _region("tb1", 0, 0, 500, 200, [line])

        outcome = validate_structure(_doc([region]))

        assert outcome.is_valid

    def test_duplicate_word_ids(self) -> None:
        """Two words sharing ``w1`` invalidate the report and name the ID."""
        first = _word("w1", 0, 0, 50, 30)
        clash = _word("w1", 60, 0, 50, 30)  # same ID as `first`
        line = _line("tl1", 0, 0, 500, 40, [first, clash])
        region = _region("tb1", 0, 0, 500, 200, [line])

        outcome = validate_structure(_doc([region]))

        assert not outcome.is_valid
        messages = [err.message for err in outcome.errors]
        assert any("Duplicate ID 'w1'" in text for text in messages)

    def test_duplicate_across_levels(self) -> None:
        """A line reusing its parent region's ID invalidates the report."""
        shared = "same_id"  # used for both the region and its line
        word = _word("w1", 0, 0, 50, 30)
        line = _line(shared, 0, 0, 500, 40, [word])
        region = _region(shared, 0, 0, 500, 200, [line])

        outcome = validate_structure(_doc([region]))

        assert not outcome.is_valid

    def test_duplicate_with_non_text_region(self) -> None:
        """A non-text region reusing a text region's ID invalidates the report."""
        illustration = NonTextRegion(
            id="tb1",
            kind=NonTextKind.ILLUSTRATION,
            geometry=_geo(600, 0, 100, 100),
            provenance=_prov(),
        )
        word = _word("w1", 0, 0, 50, 30)
        line = _line("tl1", 0, 0, 500, 40, [word])
        region = _region("tb1", 0, 0, 500, 200, [line])

        outcome = validate_structure(_doc([region], non_text=[illustration]))

        assert not outcome.is_valid


class TestReadingOrder:
    """How ``validate_structure`` treats the page reading-order entries."""

    def test_valid_references(self) -> None:
        """A reading order listing only the existing region is valid."""
        word = _word("w1", 0, 0, 50, 30)
        line = _line("tl1", 0, 0, 500, 40, [word])
        region = _region("tb1", 0, 0, 500, 200, [line])

        outcome = validate_structure(_doc([region], reading_order=["tb1"]))

        assert outcome.is_valid

    def test_invalid_reference(self) -> None:
        """A reading order naming a missing region produces an error."""
        word = _word("w1", 0, 0, 50, 30)
        line = _line("tl1", 0, 0, 500, 40, [word])
        region = _region("tb1", 0, 0, 500, 200, [line])
        order = ["tb1", "tb_nonexistent"]  # second entry matches no region

        outcome = validate_structure(_doc([region], reading_order=order))

        assert not outcome.is_valid
        messages = [err.message for err in outcome.errors]
        assert any("unknown region ID" in text for text in messages)


class TestBboxContainment:
    """Warnings from ``validate_structure`` when a box overflows its parent."""

    def test_all_contained_passes(self) -> None:
        """Word inside line inside region produces no warnings."""
        word = _word("w1", 25, 22, 50, 25)
        line = _line("tl1", 20, 20, 150, 30, [word])
        region = _region("tb1", 10, 10, 200, 100, [line])

        outcome = validate_structure(_doc([region]))

        assert outcome.warning_count == 0

    def test_word_exceeds_line(self) -> None:
        """With zero tolerance, an over-wide word raises ``word_exceeds_line``."""
        word = _word("w1", 20, 20, 200, 30)  # word wider than line
        line = _line("tl1", 20, 20, 100, 30, [word])
        region = _region("tb1", 10, 10, 400, 100, [line])

        outcome = validate_structure(_doc([region]), bbox_tolerance=0)

        assert outcome.warning_count > 0
        codes = [(warn.code or "") for warn in outcome.warnings]
        assert any("word_exceeds_line" in code for code in codes)

    def test_tolerance_allows_small_overflow(self) -> None:
        """A 3px overflow is accepted when the tolerance is 5."""
        word = _word("w1", 20, 20, 103, 30)  # 3px overflow
        line = _line("tl1", 20, 20, 100, 30, [word])
        region = _region("tb1", 10, 10, 200, 100, [line])

        outcome = validate_structure(_doc([region]), bbox_tolerance=5)

        assert outcome.warning_count == 0

    def test_tolerance_rejects_large_overflow(self) -> None:
        """A 20px overflow still warns when the tolerance is 5."""
        word = _word("w1", 20, 20, 120, 30)  # 20px overflow
        line = _line("tl1", 20, 20, 100, 30, [word])
        region = _region("tb1", 10, 10, 200, 100, [line])

        outcome = validate_structure(_doc([region]), bbox_tolerance=5)

        assert outcome.warning_count > 0

    def test_region_exceeds_page(self) -> None:
        """A region past the page edge raises ``region_exceeds_page``."""
        word = _word("w1", 900, 900, 50, 25)
        line = _line("tl1", 900, 900, 100, 30, [word])
        region = _region("tb1", 900, 900, 200, 200, [line])  # exceeds 1000x1000 page
        document = _doc([region], width=1000, height=1000)

        outcome = validate_structure(document, bbox_tolerance=0)

        codes = [(warn.code or "") for warn in outcome.warnings]
        assert any("region_exceeds_page" in code for code in codes)

    def test_line_exceeds_region(self) -> None:
        """A line wider than its region raises ``line_exceeds_region``."""
        word = _word("w1", 10, 10, 50, 25)
        line = _line("tl1", 10, 10, 200, 30, [word])  # line wider than region
        region = _region("tb1", 10, 10, 100, 50, [line])

        outcome = validate_structure(_doc([region]), bbox_tolerance=0)

        codes = [(warn.code or "") for warn in outcome.warnings]
        assert any("line_exceeds_region" in code for code in codes)