""" Unit Tests for Document Intelligence Subsystem Tests core components: - BoundingBox operations - Chunk models - Schema and extraction - Evidence building """ import pytest from pathlib import Path class TestBoundingBox: """Tests for BoundingBox model.""" def test_create_bbox(self): from src.document_intelligence.chunks import BoundingBox bbox = BoundingBox( x_min=0.1, y_min=0.2, x_max=0.5, y_max=0.6, normalized=True ) assert bbox.x_min == 0.1 assert bbox.y_min == 0.2 assert bbox.x_max == 0.5 assert bbox.y_max == 0.6 assert bbox.normalized is True def test_bbox_properties(self): from src.document_intelligence.chunks import BoundingBox bbox = BoundingBox( x_min=10, y_min=20, x_max=50, y_max=80, normalized=False ) assert bbox.width == 40 assert bbox.height == 60 assert bbox.area == 2400 assert bbox.center == (30, 50) assert bbox.xyxy == (10, 20, 50, 80) def test_bbox_to_pixel(self): from src.document_intelligence.chunks import BoundingBox bbox = BoundingBox( x_min=0.1, y_min=0.2, x_max=0.5, y_max=0.6, normalized=True ) pixel_bbox = bbox.to_pixel(1000, 800) assert pixel_bbox.x_min == 100 assert pixel_bbox.y_min == 160 assert pixel_bbox.x_max == 500 assert pixel_bbox.y_max == 480 assert pixel_bbox.normalized is False def test_bbox_to_normalized(self): from src.document_intelligence.chunks import BoundingBox bbox = BoundingBox( x_min=100, y_min=160, x_max=500, y_max=480, normalized=False ) norm_bbox = bbox.to_normalized(1000, 800) assert abs(norm_bbox.x_min - 0.1) < 0.001 assert abs(norm_bbox.y_min - 0.2) < 0.001 assert abs(norm_bbox.x_max - 0.5) < 0.001 assert abs(norm_bbox.y_max - 0.6) < 0.001 assert norm_bbox.normalized is True def test_bbox_iou(self): from src.document_intelligence.chunks import BoundingBox bbox1 = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100) bbox2 = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150) # Intersection: 50x50 = 2500 # Union: 100x100 + 100x100 - 2500 = 17500 # IoU = 2500/17500 ≈ 0.143 iou = bbox1.iou(bbox2) assert 0.1 < iou < 0.2 def test_bbox_contains(self): from src.document_intelligence.chunks import BoundingBox bbox = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100) assert bbox.contains((50, 50)) is True assert bbox.contains((0, 0)) is True assert bbox.contains((100, 100)) is True assert bbox.contains((150, 50)) is False class TestDocumentChunk: """Tests for DocumentChunk model.""" def test_create_chunk(self): from src.document_intelligence.chunks import ( DocumentChunk, ChunkType, BoundingBox, ) bbox = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3, normalized=True) chunk = DocumentChunk( chunk_id="test_chunk_001", doc_id="doc_001", chunk_type=ChunkType.PARAGRAPH, text="This is a test paragraph.", page=1, bbox=bbox, confidence=0.95, sequence_index=0, ) assert chunk.chunk_id == "test_chunk_001" assert chunk.chunk_type == ChunkType.PARAGRAPH assert chunk.text == "This is a test paragraph." assert chunk.page == 1 assert chunk.confidence == 0.95 def test_generate_chunk_id(self): from src.document_intelligence.chunks import ( DocumentChunk, BoundingBox, ) bbox = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3, normalized=True) chunk_id = DocumentChunk.generate_chunk_id( doc_id="doc_001", page=1, bbox=bbox, chunk_type_str="paragraph" ) # Should be deterministic chunk_id_2 = DocumentChunk.generate_chunk_id( doc_id="doc_001", page=1, bbox=bbox, chunk_type_str="paragraph" ) assert chunk_id == chunk_id_2 assert len(chunk_id) == 16 # md5 hex prefix class TestTableChunk: """Tests for TableChunk model.""" def test_create_table_chunk(self): from src.document_intelligence.chunks import ( TableChunk, TableCell, BoundingBox, ) bbox = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8) cells = [ TableCell(row=0, col=0, text="Header 1", is_header=True, bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.5, y_max=0.3)), TableCell(row=0, col=1, text="Header 2", is_header=True, bbox=BoundingBox(x_min=0.5, y_min=0.2, x_max=0.9, y_max=0.3)), TableCell(row=1, col=0, text="Value 1", bbox=BoundingBox(x_min=0.1, y_min=0.3, x_max=0.5, y_max=0.4)), TableCell(row=1, col=1, text="Value 2", bbox=BoundingBox(x_min=0.5, y_min=0.3, x_max=0.9, y_max=0.4)), ] table = TableChunk( chunk_id="table_001", doc_id="doc_001", text="Table content", page=1, bbox=bbox, confidence=0.9, sequence_index=0, cells=cells, num_rows=2, num_cols=2, ) assert table.num_rows == 2 assert table.num_cols == 2 assert len(table.cells) == 4 def test_table_get_cell(self): from src.document_intelligence.chunks import ( TableChunk, TableCell, BoundingBox, ) bbox = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8) cells = [ TableCell(row=0, col=0, text="A", bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)), TableCell(row=0, col=1, text="B", bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)), TableCell(row=1, col=0, text="C", bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)), TableCell(row=1, col=1, text="D", bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)), ] table = TableChunk( chunk_id="table_001", doc_id="doc_001", text="Table", page=1, bbox=bbox, confidence=0.9, sequence_index=0, cells=cells, num_rows=2, num_cols=2, ) assert table.get_cell(0, 0).text == "A" assert table.get_cell(0, 1).text == "B" assert table.get_cell(1, 0).text == "C" assert table.get_cell(1, 1).text == "D" def test_table_to_markdown(self): from src.document_intelligence.chunks import ( TableChunk, TableCell, BoundingBox, ) bbox = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8) cells = [ TableCell(row=0, col=0, text="Name", bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)), TableCell(row=0, col=1, text="Value", bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)), TableCell(row=1, col=0, text="A", bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)), TableCell(row=1, col=1, text="100", bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)), ] table = TableChunk( chunk_id="table_001", doc_id="doc_001", text="Table", page=1, bbox=bbox, confidence=0.9, sequence_index=0, cells=cells, num_rows=2, num_cols=2, ) md = table.to_markdown() assert "| Name | Value |" in md assert "| --- | --- |" in md assert "| A | 100 |" in md class TestExtractionSchema: """Tests for ExtractionSchema.""" def test_create_schema(self): from src.document_intelligence.extraction import ( ExtractionSchema, FieldSpec, FieldType, ) schema = ExtractionSchema(name="TestSchema") schema.add_string_field("name", "Person name", required=True) schema.add_number_field("age", "Person age", required=False, is_integer=True) schema.add_date_field("birth_date", "Date of birth") assert schema.name == "TestSchema" assert len(schema.fields) == 3 assert schema.get_field("name").required is True assert schema.get_field("age").field_type == FieldType.INTEGER def test_schema_to_json_schema(self): from src.document_intelligence.extraction import ExtractionSchema schema = ExtractionSchema(name="Invoice") schema.add_string_field("invoice_number", required=True) schema.add_currency_field("total_amount", required=True) json_schema = schema.to_json_schema() assert json_schema["type"] == "object" assert "invoice_number" in json_schema["properties"] assert "total_amount" in json_schema["properties"] assert "invoice_number" in json_schema["required"] def test_schema_from_json_schema(self): from src.document_intelligence.extraction import ExtractionSchema json_schema = { "type": "object", "properties": { "name": {"type": "string", "description": "Name"}, "value": {"type": "number", "minimum": 0}, }, "required": ["name"], } schema = ExtractionSchema.from_json_schema(json_schema, name="Test") assert len(schema.fields) == 2 assert schema.get_field("name").required is True assert schema.get_field("value").required is False def test_preset_schemas(self): from src.document_intelligence.extraction import ( create_invoice_schema, create_receipt_schema, create_contract_schema, ) invoice = create_invoice_schema() assert invoice.get_field("invoice_number") is not None assert invoice.get_field("total_amount") is not None receipt = create_receipt_schema() assert receipt.get_field("merchant_name") is not None contract = create_contract_schema() assert contract.get_field("effective_date") is not None class TestEvidenceBuilder: """Tests for EvidenceBuilder.""" def test_create_evidence(self): from src.document_intelligence.grounding import EvidenceBuilder from src.document_intelligence.chunks import ( DocumentChunk, ChunkType, BoundingBox, ) chunk = DocumentChunk( chunk_id="chunk_001", doc_id="doc_001", chunk_type=ChunkType.PARAGRAPH, text="The total amount is $500.00.", page=1, bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3), confidence=0.9, sequence_index=0, ) builder = EvidenceBuilder() evidence = builder.create_evidence( chunk=chunk, value="$500.00", field_name="total_amount" ) assert evidence.chunk_id == "chunk_001" assert evidence.page == 1 assert "$500.00" in evidence.snippet or "500" in evidence.snippet class TestSemanticChunker: """Tests for SemanticChunker.""" def test_chunk_text(self): from src.document_intelligence.parsing import SemanticChunker, ChunkingConfig config = ChunkingConfig( min_chunk_chars=10, max_chunk_chars=100, target_chunk_chars=50, ) chunker = SemanticChunker(config) text = """# Heading 1 This is the first paragraph with some text content. This is the second paragraph with more content. # Heading 2 Another section with different content. """ chunks = chunker.chunk_text(text) assert len(chunks) > 0 for chunk in chunks: assert "text" in chunk assert len(chunk["text"]) >= config.min_chunk_chars def test_chunk_long_text(self): from src.document_intelligence.parsing import SemanticChunker, ChunkingConfig config = ChunkingConfig( min_chunk_chars=10, max_chunk_chars=200, target_chunk_chars=100, ) chunker = SemanticChunker(config) # Create a long text text = " ".join(["This is sentence number {}.".format(i) for i in range(50)]) chunks = chunker.chunk_text(text) assert len(chunks) > 1 for chunk in chunks: assert len(chunk["text"]) <= config.max_chunk_chars * 1.1 # Allow some slack class TestValidation: """Tests for extraction validation.""" def test_validate_extraction(self): from src.document_intelligence.extraction import ( ExtractionSchema, ExtractionValidator, ) from src.document_intelligence.chunks import ExtractionResult, FieldExtraction schema = ExtractionSchema(name="Test") schema.add_string_field("name", required=True) schema.add_number_field("value", required=False, is_integer=True) result = ExtractionResult( data={"name": "Test Name", "value": 42}, fields=[], evidence=[], overall_confidence=0.8, abstained_fields=[], ) validator = ExtractionValidator() validation = validator.validate(result, schema) assert validation.is_valid is True assert validation.error_count == 0 def test_validate_missing_required(self): from src.document_intelligence.extraction import ( ExtractionSchema, ExtractionValidator, ) from src.document_intelligence.chunks import ExtractionResult schema = ExtractionSchema(name="Test") schema.add_string_field("name", required=True) schema.add_string_field("description", required=True) result = ExtractionResult( data={"name": "Test"}, # Missing 'description' fields=[], evidence=[], overall_confidence=0.5, abstained_fields=["description"], ) validator = ExtractionValidator() validation = validator.validate(result, schema) assert validation.is_valid is False assert validation.error_count >= 1 if __name__ == "__main__": pytest.main([__file__, "-v"])