Spaces:

MHamdan
/

SPARKNET

Sleeping

File size: 15,109 Bytes

d520909

"""
Unit Tests for Document Intelligence Subsystem

Tests core components:
- BoundingBox operations
- Chunk models (document and table chunks)
- Schema and extraction
- Evidence building
- Semantic chunking
- Extraction validation
"""

import pytest
from pathlib import Path


class TestBoundingBox:
    """Tests for BoundingBox model.

    Covers construction, derived geometry properties, conversion between
    normalized and pixel coordinates, IoU, and point containment.
    ``BoundingBox`` is imported at function scope inside each test.
    """

    def test_create_bbox(self):
        """Constructor arguments are exposed unchanged as attributes."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=0.1,
            y_min=0.2,
            x_max=0.5,
            y_max=0.6,
            normalized=True
        )

        # Stored literals (no arithmetic involved), so exact equality is safe.
        assert bbox.x_min == 0.1
        assert bbox.y_min == 0.2
        assert bbox.x_max == 0.5
        assert bbox.y_max == 0.6
        assert bbox.normalized is True

    def test_bbox_properties(self):
        """width/height/area/center/xyxy are derived from the corner coordinates."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=10,
            y_min=20,
            x_max=50,
            y_max=80,
            normalized=False
        )

        assert bbox.width == 40
        assert bbox.height == 60
        assert bbox.area == 2400
        assert bbox.center == (30, 50)
        assert bbox.xyxy == (10, 20, 50, 80)

    def test_bbox_to_pixel(self):
        """to_pixel(1000, 800) scales x by 1000 and y by 800."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=0.1,
            y_min=0.2,
            x_max=0.5,
            y_max=0.6,
            normalized=True
        )

        pixel_bbox = bbox.to_pixel(1000, 800)

        # The expected values are products of binary floats (e.g. 0.6 * 800),
        # so compare with a tolerance rather than relying on exact equality.
        assert pixel_bbox.x_min == pytest.approx(100)
        assert pixel_bbox.y_min == pytest.approx(160)
        assert pixel_bbox.x_max == pytest.approx(500)
        assert pixel_bbox.y_max == pytest.approx(480)
        assert pixel_bbox.normalized is False

    def test_bbox_to_normalized(self):
        """to_normalized(1000, 800) is the inverse scaling of to_pixel."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=100,
            y_min=160,
            x_max=500,
            y_max=480,
            normalized=False
        )

        norm_bbox = bbox.to_normalized(1000, 800)

        # Same 0.001 absolute tolerance as before, expressed via pytest.approx
        # so a failure reports the actual and expected values.
        assert norm_bbox.x_min == pytest.approx(0.1, abs=1e-3)
        assert norm_bbox.y_min == pytest.approx(0.2, abs=1e-3)
        assert norm_bbox.x_max == pytest.approx(0.5, abs=1e-3)
        assert norm_bbox.y_max == pytest.approx(0.6, abs=1e-3)
        assert norm_bbox.normalized is True

    def test_bbox_iou(self):
        """IoU of two 100x100 boxes offset by (50, 50) is 2500 / 17500."""
        from src.document_intelligence.chunks import BoundingBox

        bbox1 = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        bbox2 = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)

        # Intersection: 50x50 = 2500
        # Union: 100x100 + 100x100 - 2500 = 17500
        # IoU = 2500/17500 ≈ 0.143
        iou = bbox1.iou(bbox2)

        # Pin the derived value; the former 0.1..0.2 window would also have
        # accepted clearly wrong results.
        assert iou == pytest.approx(2500 / 17500, abs=1e-3)

    def test_bbox_contains(self):
        """contains() accepts interior and corner points, rejects outside ones."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)

        assert bbox.contains((50, 50)) is True
        # Both extreme corners are expected to count as inside (inclusive bounds).
        assert bbox.contains((0, 0)) is True
        assert bbox.contains((100, 100)) is True
        assert bbox.contains((150, 50)) is False


class TestDocumentChunk:
    """Tests for DocumentChunk model."""

    def test_create_chunk(self):
        """A chunk built from explicit fields reports those fields back."""
        from src.document_intelligence.chunks import (
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        region = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3, normalized=True)

        # Collect the constructor arguments first, then build the chunk.
        fields = dict(
            chunk_id="test_chunk_001",
            doc_id="doc_001",
            chunk_type=ChunkType.PARAGRAPH,
            text="This is a test paragraph.",
            page=1,
            bbox=region,
            confidence=0.95,
            sequence_index=0,
        )
        paragraph = DocumentChunk(**fields)

        assert paragraph.chunk_id == "test_chunk_001"
        assert paragraph.chunk_type == ChunkType.PARAGRAPH
        assert paragraph.text == "This is a test paragraph."
        assert paragraph.page == 1
        assert paragraph.confidence == 0.95

    def test_generate_chunk_id(self):
        """generate_chunk_id returns the same 16-character id for equal inputs."""
        from src.document_intelligence.chunks import (
            DocumentChunk,
            BoundingBox,
        )

        region = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3, normalized=True)
        id_inputs = {
            "doc_id": "doc_001",
            "page": 1,
            "bbox": region,
            "chunk_type_str": "paragraph",
        }

        first_id = DocumentChunk.generate_chunk_id(**id_inputs)
        # Calling again with identical inputs must reproduce the id.
        second_id = DocumentChunk.generate_chunk_id(**id_inputs)

        assert first_id == second_id
        # 16 characters; per the original note this is an md5 hex prefix.
        assert len(first_id) == 16


class TestTableChunk:
    """Tests for TableChunk model."""

    @staticmethod
    def _two_by_two(labels):
        """Build a 2x2 TableChunk whose cells carry *labels* in row-major order.

        Each cell gets its own (0, 0, 1, 1) BoundingBox; the table metadata
        (ids, page, confidence, outer box) is the same for every caller.
        """
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        outer = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8)

        positions = [(0, 0), (0, 1), (1, 0), (1, 1)]
        grid = [
            TableCell(row=r, col=c, text=label,
                      bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1))
            for (r, c), label in zip(positions, labels)
        ]

        return TableChunk(
            chunk_id="table_001",
            doc_id="doc_001",
            text="Table",
            page=1,
            bbox=outer,
            confidence=0.9,
            sequence_index=0,
            cells=grid,
            num_rows=2,
            num_cols=2,
        )

    def test_create_table_chunk(self):
        """A table with a header row and a value row keeps its shape and cells."""
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        outer = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8)

        # (cell keyword arguments, (x_min, y_min, x_max, y_max)) per cell.
        # Value cells deliberately omit is_header so the model default applies.
        layout = [
            ({"row": 0, "col": 0, "text": "Header 1", "is_header": True},
             (0.1, 0.2, 0.5, 0.3)),
            ({"row": 0, "col": 1, "text": "Header 2", "is_header": True},
             (0.5, 0.2, 0.9, 0.3)),
            ({"row": 1, "col": 0, "text": "Value 1"},
             (0.1, 0.3, 0.5, 0.4)),
            ({"row": 1, "col": 1, "text": "Value 2"},
             (0.5, 0.3, 0.9, 0.4)),
        ]
        grid = [
            TableCell(**attrs,
                      bbox=BoundingBox(x_min=x0, y_min=y0, x_max=x1, y_max=y1))
            for attrs, (x0, y0, x1, y1) in layout
        ]

        table = TableChunk(
            chunk_id="table_001",
            doc_id="doc_001",
            text="Table content",
            page=1,
            bbox=outer,
            confidence=0.9,
            sequence_index=0,
            cells=grid,
            num_rows=2,
            num_cols=2,
        )

        assert table.num_rows == 2
        assert table.num_cols == 2
        assert len(table.cells) == 4

    def test_table_get_cell(self):
        """get_cell(row, col) returns the cell stored at that grid position."""
        table = self._two_by_two(["A", "B", "C", "D"])

        expected_text = {(0, 0): "A", (0, 1): "B", (1, 0): "C", (1, 1): "D"}
        for (row, col), text in expected_text.items():
            assert table.get_cell(row, col).text == text

    def test_table_to_markdown(self):
        """to_markdown renders a header row, a separator row, and the data row."""
        md = self._two_by_two(["Name", "Value", "A", "100"]).to_markdown()

        for expected_line in ("| Name | Value |", "| --- | --- |", "| A | 100 |"):
            assert expected_line in md


class TestExtractionSchema:
    """Tests for ExtractionSchema.

    Covers building a schema through the ``add_*_field`` helpers, the
    round trip to and from JSON Schema dictionaries, and the preset
    invoice/receipt/contract schema factories.
    """

    def test_create_schema(self):
        """Fields added through the helpers are retrievable with their settings."""
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            FieldType,
        )

        schema = ExtractionSchema(name="TestSchema")
        schema.add_string_field("name", "Person name", required=True)
        schema.add_number_field("age", "Person age", required=False, is_integer=True)
        schema.add_date_field("birth_date", "Date of birth")

        assert schema.name == "TestSchema"
        assert len(schema.fields) == 3
        assert schema.get_field("name").required is True
        # is_integer=True is expected to select the INTEGER field type.
        assert schema.get_field("age").field_type == FieldType.INTEGER

    def test_schema_to_json_schema(self):
        """to_json_schema emits an object schema listing properties and required."""
        from src.document_intelligence.extraction import ExtractionSchema

        schema = ExtractionSchema(name="Invoice")
        schema.add_string_field("invoice_number", required=True)
        schema.add_currency_field("total_amount", required=True)

        json_schema = schema.to_json_schema()

        assert json_schema["type"] == "object"
        assert "invoice_number" in json_schema["properties"]
        assert "total_amount" in json_schema["properties"]
        # Both fields were declared required=True, so both must be listed.
        assert "invoice_number" in json_schema["required"]
        assert "total_amount" in json_schema["required"]

    def test_schema_from_json_schema(self):
        """from_json_schema maps properties to fields and honours 'required'."""
        from src.document_intelligence.extraction import ExtractionSchema

        json_schema = {
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "Name"},
                "value": {"type": "number", "minimum": 0},
            },
            "required": ["name"],
        }

        schema = ExtractionSchema.from_json_schema(json_schema, name="Test")

        assert len(schema.fields) == 2
        assert schema.get_field("name").required is True
        # "value" is absent from the "required" list above.
        assert schema.get_field("value").required is False

    def test_preset_schemas(self):
        """Each preset factory returns a schema containing its key fields."""
        from src.document_intelligence.extraction import (
            create_invoice_schema,
            create_receipt_schema,
            create_contract_schema,
        )

        invoice = create_invoice_schema()
        assert invoice.get_field("invoice_number") is not None
        assert invoice.get_field("total_amount") is not None

        receipt = create_receipt_schema()
        assert receipt.get_field("merchant_name") is not None

        contract = create_contract_schema()
        assert contract.get_field("effective_date") is not None


class TestEvidenceBuilder:
    """Tests for EvidenceBuilder."""

    def test_create_evidence(self):
        """Evidence built from a chunk points back at that chunk and quotes the value."""
        from src.document_intelligence.grounding import EvidenceBuilder
        from src.document_intelligence.chunks import (
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        source_chunk = DocumentChunk(
            chunk_id="chunk_001",
            doc_id="doc_001",
            chunk_type=ChunkType.PARAGRAPH,
            text="The total amount is $500.00.",
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3),
            confidence=0.9,
            sequence_index=0,
        )

        evidence = EvidenceBuilder().create_evidence(
            chunk=source_chunk,
            value="$500.00",
            field_name="total_amount"
        )

        assert evidence.chunk_id == "chunk_001"
        assert evidence.page == 1
        # Accept either the full formatted amount or just its digits.
        accepted_fragments = ("$500.00", "500")
        assert any(fragment in evidence.snippet for fragment in accepted_fragments)


class TestSemanticChunker:
    """Tests for SemanticChunker."""

    def test_chunk_text(self):
        """Short markdown-like text yields chunks that meet the minimum length."""
        from src.document_intelligence.parsing import SemanticChunker, ChunkingConfig

        config = ChunkingConfig(
            min_chunk_chars=10,
            max_chunk_chars=100,
            target_chunk_chars=50,
        )
        splitter = SemanticChunker(config)

        # Two headed sections; the literal is flush-left so it has no indentation.
        text = """# Heading 1

This is the first paragraph with some text content.

This is the second paragraph with more content.

# Heading 2

Another section with different content.
"""

        pieces = splitter.chunk_text(text)

        assert len(pieces) > 0
        for piece in pieces:
            assert "text" in piece
            assert len(piece["text"]) >= config.min_chunk_chars

    def test_chunk_long_text(self):
        """Fifty sentences are split into several chunks near the size limit."""
        from src.document_intelligence.parsing import SemanticChunker, ChunkingConfig

        config = ChunkingConfig(
            min_chunk_chars=10,
            max_chunk_chars=200,
            target_chunk_chars=100,
        )
        splitter = SemanticChunker(config)

        # Create a long text
        text = " ".join(f"This is sentence number {i}." for i in range(50))

        pieces = splitter.chunk_text(text)

        assert len(pieces) > 1
        size_limit = config.max_chunk_chars * 1.1  # Allow some slack
        for piece in pieces:
            assert len(piece["text"]) <= size_limit


class TestValidation:
    """Tests for extraction validation.

    Each test builds an ExtractionSchema and an ExtractionResult and checks
    the outcome reported by ``ExtractionValidator.validate``.
    """

    def test_validate_extraction(self):
        """A result supplying every required field validates without errors."""
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            ExtractionValidator,
        )
        from src.document_intelligence.chunks import ExtractionResult

        schema = ExtractionSchema(name="Test")
        schema.add_string_field("name", required=True)
        schema.add_number_field("value", required=False, is_integer=True)

        result = ExtractionResult(
            data={"name": "Test Name", "value": 42},
            fields=[],
            evidence=[],
            overall_confidence=0.8,
            abstained_fields=[],
        )

        validator = ExtractionValidator()
        validation = validator.validate(result, schema)

        assert validation.is_valid is True
        assert validation.error_count == 0

    def test_validate_missing_required(self):
        """A result lacking a required field is invalid with at least one error."""
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            ExtractionValidator,
        )
        from src.document_intelligence.chunks import ExtractionResult

        schema = ExtractionSchema(name="Test")
        schema.add_string_field("name", required=True)
        schema.add_string_field("description", required=True)

        result = ExtractionResult(
            data={"name": "Test"},  # Missing 'description'
            fields=[],
            evidence=[],
            overall_confidence=0.5,
            abstained_fields=["description"],
        )

        validator = ExtractionValidator()
        validation = validator.validate(result, schema)

        assert validation.is_valid is False
        assert validation.error_count >= 1


if __name__ == "__main__":
    # pytest.main returns the session exit code; propagate it so that running
    # this file directly reports test failures to the calling shell or CI job
    # instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))