# File: SPARKNET/tests/unit/test_document_intelligence.py
# Commit: d520909 "Initial commit: SPARKNET framework" (MHamdan)
"""
Unit Tests for Document Intelligence Subsystem
Tests core components:
- BoundingBox operations
- Chunk models
- Schema and extraction
- Evidence building
"""
import pytest
from pathlib import Path
class TestBoundingBox:
    """Tests for the BoundingBox model.

    Float-valued results are compared with ``pytest.approx`` so the tests do
    not depend on a floating-point product or quotient being bit-exact.
    """

    def test_create_bbox(self):
        """Constructor keyword arguments are readable back as attributes."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=0.1,
            y_min=0.2,
            x_max=0.5,
            y_max=0.6,
            normalized=True,
        )

        # These values are stored, not computed, so exact equality is fine.
        assert bbox.x_min == 0.1
        assert bbox.y_min == 0.2
        assert bbox.x_max == 0.5
        assert bbox.y_max == 0.6
        assert bbox.normalized is True

    def test_bbox_properties(self):
        """Derived properties follow from the corner coordinates."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=10,
            y_min=20,
            x_max=50,
            y_max=80,
            normalized=False,
        )

        assert bbox.width == 40
        assert bbox.height == 60
        assert bbox.area == 2400
        assert bbox.center == (30, 50)
        assert bbox.xyxy == (10, 20, 50, 80)

    def test_bbox_to_pixel(self):
        """A normalized box scaled by (1000, 800) yields pixel coordinates."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=0.1,
            y_min=0.2,
            x_max=0.5,
            y_max=0.6,
            normalized=True,
        )

        pixel_bbox = bbox.to_pixel(1000, 800)

        # x is scaled by the first argument, y by the second.  The expected
        # values are float products, hence the tolerance-based comparison.
        assert pixel_bbox.x_min == pytest.approx(100)
        assert pixel_bbox.y_min == pytest.approx(160)
        assert pixel_bbox.x_max == pytest.approx(500)
        assert pixel_bbox.y_max == pytest.approx(480)
        assert pixel_bbox.normalized is False

    def test_bbox_to_normalized(self):
        """A pixel box divided by (1000, 800) yields normalized coordinates."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=100,
            y_min=160,
            x_max=500,
            y_max=480,
            normalized=False,
        )

        norm_bbox = bbox.to_normalized(1000, 800)

        assert norm_bbox.x_min == pytest.approx(0.1, abs=1e-3)
        assert norm_bbox.y_min == pytest.approx(0.2, abs=1e-3)
        assert norm_bbox.x_max == pytest.approx(0.5, abs=1e-3)
        assert norm_bbox.y_max == pytest.approx(0.6, abs=1e-3)
        assert norm_bbox.normalized is True

    def test_bbox_iou(self):
        """IoU of two 100x100 boxes overlapping in a 50x50 square is 1/7."""
        from src.document_intelligence.chunks import BoundingBox

        bbox1 = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        bbox2 = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)

        # Intersection: 50x50 = 2500
        # Union: 100x100 + 100x100 - 2500 = 17500
        # IoU = 2500/17500 ≈ 0.143
        iou = bbox1.iou(bbox2)

        # Pin the actual value instead of a wide (0.1, 0.2) window that would
        # also accept an incorrect result such as 0.19.
        assert iou == pytest.approx(2500 / 17500, abs=1e-3)

    def test_bbox_contains(self):
        """Interior and corner points are contained; an outside point is not."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)

        assert bbox.contains((50, 50)) is True
        assert bbox.contains((0, 0)) is True
        assert bbox.contains((100, 100)) is True
        assert bbox.contains((150, 50)) is False
class TestDocumentChunk:
    """Tests for the DocumentChunk model."""

    def test_create_chunk(self):
        """A chunk built from keyword arguments exposes them as attributes."""
        from src.document_intelligence.chunks import (
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        region = BoundingBox(
            x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3, normalized=True
        )
        paragraph = DocumentChunk(
            chunk_id="test_chunk_001",
            doc_id="doc_001",
            chunk_type=ChunkType.PARAGRAPH,
            text="This is a test paragraph.",
            page=1,
            bbox=region,
            confidence=0.95,
            sequence_index=0,
        )

        # Attribute name -> value that must be read back from the chunk.
        expected = {
            "chunk_id": "test_chunk_001",
            "chunk_type": ChunkType.PARAGRAPH,
            "text": "This is a test paragraph.",
            "page": 1,
            "confidence": 0.95,
        }
        for attribute, wanted in expected.items():
            assert getattr(paragraph, attribute) == wanted

    def test_generate_chunk_id(self):
        """Identical inputs give the same 16-character chunk id."""
        from src.document_intelligence.chunks import (
            DocumentChunk,
            BoundingBox,
        )

        region = BoundingBox(
            x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3, normalized=True
        )
        id_inputs = {
            "doc_id": "doc_001",
            "page": 1,
            "bbox": region,
            "chunk_type_str": "paragraph",
        }

        first_id = DocumentChunk.generate_chunk_id(**id_inputs)
        # Should be deterministic
        second_id = DocumentChunk.generate_chunk_id(**id_inputs)

        assert first_id == second_id
        assert len(first_id) == 16  # md5 hex prefix
class TestTableChunk:
    """Tests for the TableChunk model."""

    @staticmethod
    def _make_table(cells, text):
        """Wrap ``cells`` in the 2x2 TableChunk shared by every test here."""
        from src.document_intelligence.chunks import BoundingBox, TableChunk

        return TableChunk(
            chunk_id="table_001",
            doc_id="doc_001",
            text=text,
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8),
            confidence=0.9,
            sequence_index=0,
            cells=cells,
            num_rows=2,
            num_cols=2,
        )

    @staticmethod
    def _unit_cells(grid):
        """Build row-major cells for ``grid``, each with a 0..1 bounding box."""
        from src.document_intelligence.chunks import BoundingBox, TableCell

        return [
            TableCell(
                row=row_index,
                col=col_index,
                text=label,
                bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1),
            )
            for row_index, row in enumerate(grid)
            for col_index, label in enumerate(row)
        ]

    def test_create_table_chunk(self):
        """A table keeps its row/column counts and all supplied cells."""
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        header_row = [
            TableCell(
                row=0, col=0, text="Header 1", is_header=True,
                bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.5, y_max=0.3),
            ),
            TableCell(
                row=0, col=1, text="Header 2", is_header=True,
                bbox=BoundingBox(x_min=0.5, y_min=0.2, x_max=0.9, y_max=0.3),
            ),
        ]
        # Value cells deliberately leave ``is_header`` at its default.
        value_row = [
            TableCell(
                row=1, col=0, text="Value 1",
                bbox=BoundingBox(x_min=0.1, y_min=0.3, x_max=0.5, y_max=0.4),
            ),
            TableCell(
                row=1, col=1, text="Value 2",
                bbox=BoundingBox(x_min=0.5, y_min=0.3, x_max=0.9, y_max=0.4),
            ),
        ]

        table = self._make_table(header_row + value_row, "Table content")

        assert table.num_rows == 2
        assert table.num_cols == 2
        assert len(table.cells) == 4

    def test_table_get_cell(self):
        """get_cell(row, col) returns the cell stored at that position."""
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        layout = [["A", "B"], ["C", "D"]]
        table = self._make_table(self._unit_cells(layout), "Table")

        assert table.get_cell(0, 0).text == "A"
        assert table.get_cell(0, 1).text == "B"
        assert table.get_cell(1, 0).text == "C"
        assert table.get_cell(1, 1).text == "D"

    def test_table_to_markdown(self):
        """to_markdown emits a header row, a separator row, and data rows."""
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        layout = [["Name", "Value"], ["A", "100"]]
        table = self._make_table(self._unit_cells(layout), "Table")

        rendered = table.to_markdown()

        assert "| Name | Value |" in rendered
        assert "| --- | --- |" in rendered
        assert "| A | 100 |" in rendered
class TestExtractionSchema:
    """Tests for ExtractionSchema and the preset schema factories."""

    def test_create_schema(self):
        """Fields added via the add_* helpers are stored and found by name."""
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            FieldType,
        )

        schema = ExtractionSchema(name="TestSchema")
        schema.add_string_field("name", "Person name", required=True)
        schema.add_number_field("age", "Person age", required=False, is_integer=True)
        schema.add_date_field("birth_date", "Date of birth")

        assert schema.name == "TestSchema"
        assert len(schema.fields) == 3
        assert schema.get_field("name").required is True
        # is_integer=True must select the INTEGER field type.
        assert schema.get_field("age").field_type == FieldType.INTEGER

    def test_schema_to_json_schema(self):
        """to_json_schema lists every field and marks required ones."""
        from src.document_intelligence.extraction import ExtractionSchema

        schema = ExtractionSchema(name="Invoice")
        schema.add_string_field("invoice_number", required=True)
        schema.add_currency_field("total_amount", required=True)

        json_schema = schema.to_json_schema()

        assert json_schema["type"] == "object"
        assert "invoice_number" in json_schema["properties"]
        assert "total_amount" in json_schema["properties"]
        # Both fields were declared required above, so both must be listed.
        assert "invoice_number" in json_schema["required"]
        assert "total_amount" in json_schema["required"]

    def test_schema_from_json_schema(self):
        """from_json_schema restores fields and their required flags."""
        from src.document_intelligence.extraction import ExtractionSchema

        json_schema = {
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "Name"},
                "value": {"type": "number", "minimum": 0},
            },
            "required": ["name"],
        }

        schema = ExtractionSchema.from_json_schema(json_schema, name="Test")

        assert len(schema.fields) == 2
        assert schema.get_field("name").required is True
        assert schema.get_field("value").required is False

    def test_preset_schemas(self):
        """Each preset factory returns a schema with its key fields defined."""
        from src.document_intelligence.extraction import (
            create_invoice_schema,
            create_receipt_schema,
            create_contract_schema,
        )

        invoice = create_invoice_schema()
        assert invoice.get_field("invoice_number") is not None
        assert invoice.get_field("total_amount") is not None

        receipt = create_receipt_schema()
        assert receipt.get_field("merchant_name") is not None

        contract = create_contract_schema()
        assert contract.get_field("effective_date") is not None
class TestEvidenceBuilder:
    """Tests for EvidenceBuilder."""

    def test_create_evidence(self):
        """Evidence carries the source chunk's id and page, plus a snippet."""
        from src.document_intelligence.grounding import EvidenceBuilder
        from src.document_intelligence.chunks import (
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        source_chunk = DocumentChunk(
            chunk_id="chunk_001",
            doc_id="doc_001",
            chunk_type=ChunkType.PARAGRAPH,
            text="The total amount is $500.00.",
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3),
            confidence=0.9,
            sequence_index=0,
        )

        result = EvidenceBuilder().create_evidence(
            chunk=source_chunk,
            value="$500.00",
            field_name="total_amount",
        )

        assert result.chunk_id == "chunk_001"
        assert result.page == 1
        # The snippet must mention the amount, with or without formatting.
        accepted_forms = ("$500.00", "500")
        assert any(form in result.snippet for form in accepted_forms)
class TestSemanticChunker:
    """Tests for SemanticChunker."""

    def test_chunk_text(self):
        """Chunking short markdown yields chunks of at least the minimum size."""
        from src.document_intelligence.parsing import SemanticChunker, ChunkingConfig

        settings = ChunkingConfig(
            min_chunk_chars=10,
            max_chunk_chars=100,
            target_chunk_chars=50,
        )
        splitter = SemanticChunker(settings)

        # The literal's continuation lines stay flush-left: any indentation
        # here would become part of the text handed to the chunker.
        document = """# Heading 1
This is the first paragraph with some text content.
This is the second paragraph with more content.
# Heading 2
Another section with different content.
"""

        pieces = splitter.chunk_text(document)

        assert len(pieces) > 0
        for piece in pieces:
            assert "text" in piece
            assert len(piece["text"]) >= settings.min_chunk_chars

    def test_chunk_long_text(self):
        """Long text is split into several chunks near the size limit."""
        from src.document_intelligence.parsing import SemanticChunker, ChunkingConfig

        settings = ChunkingConfig(
            min_chunk_chars=10,
            max_chunk_chars=200,
            target_chunk_chars=100,
        )
        splitter = SemanticChunker(settings)

        # Create a long text
        sentences = map("This is sentence number {}.".format, range(50))
        document = " ".join(sentences)

        pieces = splitter.chunk_text(document)

        assert len(pieces) > 1
        for piece in pieces:
            # Allow some slack
            assert len(piece["text"]) <= settings.max_chunk_chars * 1.1
class TestValidation:
    """Tests for extraction validation."""

    def test_validate_extraction(self):
        """A result that supplies every required field validates cleanly."""
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            ExtractionValidator,
        )
        from src.document_intelligence.chunks import ExtractionResult

        schema = ExtractionSchema(name="Test")
        schema.add_string_field("name", required=True)
        schema.add_number_field("value", required=False, is_integer=True)

        result = ExtractionResult(
            data={"name": "Test Name", "value": 42},
            fields=[],
            evidence=[],
            overall_confidence=0.8,
            abstained_fields=[],
        )

        validator = ExtractionValidator()
        validation = validator.validate(result, schema)

        assert validation.is_valid is True
        assert validation.error_count == 0

    def test_validate_missing_required(self):
        """A result lacking a required field is invalid with an error."""
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            ExtractionValidator,
        )
        from src.document_intelligence.chunks import ExtractionResult

        schema = ExtractionSchema(name="Test")
        schema.add_string_field("name", required=True)
        schema.add_string_field("description", required=True)

        result = ExtractionResult(
            data={"name": "Test"},  # Missing 'description'
            fields=[],
            evidence=[],
            overall_confidence=0.5,
            abstained_fields=["description"],
        )

        validator = ExtractionValidator()
        validation = validator.validate(result, schema)

        assert validation.is_valid is False
        assert validation.error_count >= 1
if __name__ == "__main__":
    # Allow running this file directly.  pytest.main() returns the run's exit
    # code; propagate it so a failing run ends with a non-zero process status
    # instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v"]))