"""
Unit tests for the Document Intelligence subsystem.

Tests core components:
- BoundingBox operations
- Chunk models (document and table chunks)
- Schema and extraction
- Evidence building
- Semantic chunking
- Extraction validation
"""
|
|
|
|
|
import pytest |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
class TestBoundingBox:
    """Tests for BoundingBox model."""

    def test_create_bbox(self):
        """Check that constructor arguments are readable back as attributes."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=0.1,
            y_min=0.2,
            x_max=0.5,
            y_max=0.6,
            normalized=True,
        )

        assert bbox.x_min == 0.1
        assert bbox.y_min == 0.2
        assert bbox.x_max == 0.5
        assert bbox.y_max == 0.6
        assert bbox.normalized is True

    def test_bbox_properties(self):
        """Check the derived width, height, area, center and xyxy values."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=10,
            y_min=20,
            x_max=50,
            y_max=80,
            normalized=False,
        )

        assert bbox.width == 40
        assert bbox.height == 60
        assert bbox.area == 2400
        assert bbox.center == (30, 50)
        assert bbox.xyxy == (10, 20, 50, 80)

    def test_bbox_to_pixel(self):
        """Check conversion of a normalized box to pixels on a 1000x800 page."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=0.1,
            y_min=0.2,
            x_max=0.5,
            y_max=0.6,
            normalized=True,
        )

        pixel_bbox = bbox.to_pixel(1000, 800)

        # x values scale by 1000, y values by 800.
        assert pixel_bbox.x_min == 100
        assert pixel_bbox.y_min == 160
        assert pixel_bbox.x_max == 500
        assert pixel_bbox.y_max == 480
        assert pixel_bbox.normalized is False

    def test_bbox_to_normalized(self):
        """Check conversion of a pixel box back to normalized coordinates."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(
            x_min=100,
            y_min=160,
            x_max=500,
            y_max=480,
            normalized=False,
        )

        norm_bbox = bbox.to_normalized(1000, 800)

        # Compare with a tolerance: the results come from float division.
        assert abs(norm_bbox.x_min - 0.1) < 0.001
        assert abs(norm_bbox.y_min - 0.2) < 0.001
        assert abs(norm_bbox.x_max - 0.5) < 0.001
        assert abs(norm_bbox.y_max - 0.6) < 0.001
        assert norm_bbox.normalized is True

    def test_bbox_iou(self):
        """Check that two partially overlapping boxes give an IoU in (0.1, 0.2)."""
        from src.document_intelligence.chunks import BoundingBox

        bbox1 = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)
        bbox2 = BoundingBox(x_min=50, y_min=50, x_max=150, y_max=150)

        # Under the usual intersection-over-union definition:
        #   intersection = 50 * 50 = 2500
        #   union = 10000 + 10000 - 2500 = 17500
        #   IoU = 2500 / 17500 ~= 0.143
        iou = bbox1.iou(bbox2)
        assert 0.1 < iou < 0.2

    def test_bbox_contains(self):
        """Check point containment, including points on the box corners."""
        from src.document_intelligence.chunks import BoundingBox

        bbox = BoundingBox(x_min=0, y_min=0, x_max=100, y_max=100)

        assert bbox.contains((50, 50)) is True
        # Corner points are expected to count as inside.
        assert bbox.contains((0, 0)) is True
        assert bbox.contains((100, 100)) is True
        assert bbox.contains((150, 50)) is False
|
|
|
|
|
|
|
|
class TestDocumentChunk:
    """Tests for DocumentChunk model."""

    def test_create_chunk(self):
        """Check that a paragraph chunk keeps the values it was built with."""
        from src.document_intelligence.chunks import (
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        bbox = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3, normalized=True)

        chunk = DocumentChunk(
            chunk_id="test_chunk_001",
            doc_id="doc_001",
            chunk_type=ChunkType.PARAGRAPH,
            text="This is a test paragraph.",
            page=1,
            bbox=bbox,
            confidence=0.95,
            sequence_index=0,
        )

        assert chunk.chunk_id == "test_chunk_001"
        assert chunk.chunk_type == ChunkType.PARAGRAPH
        assert chunk.text == "This is a test paragraph."
        assert chunk.page == 1
        assert chunk.confidence == 0.95

    def test_generate_chunk_id(self):
        """Check that generated chunk IDs are repeatable and 16 characters long."""
        from src.document_intelligence.chunks import (
            DocumentChunk,
            BoundingBox,
        )

        bbox = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3, normalized=True)

        chunk_id = DocumentChunk.generate_chunk_id(
            doc_id="doc_001",
            page=1,
            bbox=bbox,
            chunk_type_str="paragraph",
        )

        # Identical inputs must produce the identical ID.
        chunk_id_2 = DocumentChunk.generate_chunk_id(
            doc_id="doc_001",
            page=1,
            bbox=bbox,
            chunk_type_str="paragraph",
        )

        assert chunk_id == chunk_id_2
        assert len(chunk_id) == 16
|
|
|
|
|
|
|
|
class TestTableChunk:
    """Tests for TableChunk model."""

    def test_create_table_chunk(self):
        """Check that a 2x2 table reports its row, column and cell counts."""
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        bbox = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8)

        cells = [
            TableCell(row=0, col=0, text="Header 1", is_header=True,
                      bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.5, y_max=0.3)),
            TableCell(row=0, col=1, text="Header 2", is_header=True,
                      bbox=BoundingBox(x_min=0.5, y_min=0.2, x_max=0.9, y_max=0.3)),
            TableCell(row=1, col=0, text="Value 1",
                      bbox=BoundingBox(x_min=0.1, y_min=0.3, x_max=0.5, y_max=0.4)),
            TableCell(row=1, col=1, text="Value 2",
                      bbox=BoundingBox(x_min=0.5, y_min=0.3, x_max=0.9, y_max=0.4)),
        ]

        table = TableChunk(
            chunk_id="table_001",
            doc_id="doc_001",
            text="Table content",
            page=1,
            bbox=bbox,
            confidence=0.9,
            sequence_index=0,
            cells=cells,
            num_rows=2,
            num_cols=2,
        )

        assert table.num_rows == 2
        assert table.num_cols == 2
        assert len(table.cells) == 4

    def test_table_get_cell(self):
        """Check that get_cell(row, col) returns the cell at that position."""
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        bbox = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8)

        # Every cell shares the same placeholder box; only row/col/text matter.
        cells = [
            TableCell(row=0, col=0, text="A",
                      bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)),
            TableCell(row=0, col=1, text="B",
                      bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)),
            TableCell(row=1, col=0, text="C",
                      bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)),
            TableCell(row=1, col=1, text="D",
                      bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)),
        ]

        table = TableChunk(
            chunk_id="table_001",
            doc_id="doc_001",
            text="Table",
            page=1,
            bbox=bbox,
            confidence=0.9,
            sequence_index=0,
            cells=cells,
            num_rows=2,
            num_cols=2,
        )

        assert table.get_cell(0, 0).text == "A"
        assert table.get_cell(0, 1).text == "B"
        assert table.get_cell(1, 0).text == "C"
        assert table.get_cell(1, 1).text == "D"

    def test_table_to_markdown(self):
        """Check that to_markdown emits header, separator and data rows."""
        from src.document_intelligence.chunks import (
            TableChunk,
            TableCell,
            BoundingBox,
        )

        bbox = BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.8)

        # Every cell shares the same placeholder box; only row/col/text matter.
        cells = [
            TableCell(row=0, col=0, text="Name",
                      bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)),
            TableCell(row=0, col=1, text="Value",
                      bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)),
            TableCell(row=1, col=0, text="A",
                      bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)),
            TableCell(row=1, col=1, text="100",
                      bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1)),
        ]

        table = TableChunk(
            chunk_id="table_001",
            doc_id="doc_001",
            text="Table",
            page=1,
            bbox=bbox,
            confidence=0.9,
            sequence_index=0,
            cells=cells,
            num_rows=2,
            num_cols=2,
        )

        md = table.to_markdown()
        assert "| Name | Value |" in md
        assert "| --- | --- |" in md
        assert "| A | 100 |" in md
|
|
|
|
|
|
|
|
class TestExtractionSchema:
    """Tests for ExtractionSchema."""

    def test_create_schema(self):
        """Check that fields added through the helper methods are registered."""
        # FieldSpec is not referenced below; importing it still fails the test
        # if the name is missing from the extraction package.
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            FieldSpec,
            FieldType,
        )

        schema = ExtractionSchema(name="TestSchema")
        schema.add_string_field("name", "Person name", required=True)
        schema.add_number_field("age", "Person age", required=False, is_integer=True)
        schema.add_date_field("birth_date", "Date of birth")

        assert schema.name == "TestSchema"
        assert len(schema.fields) == 3
        assert schema.get_field("name").required is True
        # is_integer=True is expected to yield an INTEGER field type.
        assert schema.get_field("age").field_type == FieldType.INTEGER

    def test_schema_to_json_schema(self):
        """Check the JSON Schema export: object type, properties and required."""
        from src.document_intelligence.extraction import ExtractionSchema

        schema = ExtractionSchema(name="Invoice")
        schema.add_string_field("invoice_number", required=True)
        schema.add_currency_field("total_amount", required=True)

        json_schema = schema.to_json_schema()

        assert json_schema["type"] == "object"
        assert "invoice_number" in json_schema["properties"]
        assert "total_amount" in json_schema["properties"]
        assert "invoice_number" in json_schema["required"]

    def test_schema_from_json_schema(self):
        """Check that a schema built from JSON Schema keeps required flags."""
        from src.document_intelligence.extraction import ExtractionSchema

        json_schema = {
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "Name"},
                "value": {"type": "number", "minimum": 0},
            },
            "required": ["name"],
        }

        schema = ExtractionSchema.from_json_schema(json_schema, name="Test")

        assert len(schema.fields) == 2
        assert schema.get_field("name").required is True
        # "value" is not listed under "required", so it must be optional.
        assert schema.get_field("value").required is False

    def test_preset_schemas(self):
        """Check that each preset schema factory exposes its key fields."""
        from src.document_intelligence.extraction import (
            create_invoice_schema,
            create_receipt_schema,
            create_contract_schema,
        )

        invoice = create_invoice_schema()
        assert invoice.get_field("invoice_number") is not None
        assert invoice.get_field("total_amount") is not None

        receipt = create_receipt_schema()
        assert receipt.get_field("merchant_name") is not None

        contract = create_contract_schema()
        assert contract.get_field("effective_date") is not None
|
|
|
|
|
|
|
|
class TestEvidenceBuilder:
    """Tests for EvidenceBuilder."""

    def test_create_evidence(self):
        """Check that evidence built from a chunk points back to that chunk."""
        from src.document_intelligence.grounding import EvidenceBuilder
        from src.document_intelligence.chunks import (
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        chunk = DocumentChunk(
            chunk_id="chunk_001",
            doc_id="doc_001",
            chunk_type=ChunkType.PARAGRAPH,
            text="The total amount is $500.00.",
            page=1,
            bbox=BoundingBox(x_min=0.1, y_min=0.2, x_max=0.9, y_max=0.3),
            confidence=0.9,
            sequence_index=0,
        )

        builder = EvidenceBuilder()
        evidence = builder.create_evidence(
            chunk=chunk,
            value="$500.00",
            field_name="total_amount",
        )

        assert evidence.chunk_id == "chunk_001"
        assert evidence.page == 1
        # "500" is a substring of "$500.00", so the second operand alone
        # decides the outcome; the first is kept to show the ideal match.
        assert "$500.00" in evidence.snippet or "500" in evidence.snippet
|
|
|
|
|
|
|
|
class TestSemanticChunker:
    """Tests for SemanticChunker."""

    def test_chunk_text(self):
        """Check that chunking a short markdown text respects the minimum size."""
        from src.document_intelligence.parsing import SemanticChunker, ChunkingConfig

        config = ChunkingConfig(
            min_chunk_chars=10,
            max_chunk_chars=100,
            target_chunk_chars=50,
        )

        chunker = SemanticChunker(config)

        # Two headed sections separated by blank lines. Written as explicit
        # line literals so the text carries no indentation from this file.
        text = (
            "# Heading 1\n"
            "\n"
            "This is the first paragraph with some text content.\n"
            "\n"
            "This is the second paragraph with more content.\n"
            "\n"
            "# Heading 2\n"
            "\n"
            "Another section with different content.\n"
        )

        chunks = chunker.chunk_text(text)

        assert len(chunks) > 0
        for chunk in chunks:
            assert "text" in chunk
            assert len(chunk["text"]) >= config.min_chunk_chars

    def test_chunk_long_text(self):
        """Check that long text is split and chunks stay near the maximum size."""
        from src.document_intelligence.parsing import SemanticChunker, ChunkingConfig

        config = ChunkingConfig(
            min_chunk_chars=10,
            max_chunk_chars=200,
            target_chunk_chars=100,
        )

        chunker = SemanticChunker(config)

        # Fifty short sentences: far longer than max_chunk_chars, so the
        # chunker has to produce more than one chunk.
        text = " ".join(f"This is sentence number {i}." for i in range(50))

        chunks = chunker.chunk_text(text)

        assert len(chunks) > 1
        for chunk in chunks:
            # A 10% overshoot of the configured maximum is tolerated.
            assert len(chunk["text"]) <= config.max_chunk_chars * 1.1
|
|
|
|
|
|
|
|
class TestValidation:
    """Tests for extraction validation."""

    def test_validate_extraction(self):
        """Check that a result satisfying the schema validates without errors."""
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            ExtractionValidator,
        )
        # FieldExtraction is not referenced below; importing it still fails
        # the test if the name is missing from the chunks module.
        from src.document_intelligence.chunks import ExtractionResult, FieldExtraction

        schema = ExtractionSchema(name="Test")
        schema.add_string_field("name", required=True)
        schema.add_number_field("value", required=False, is_integer=True)

        result = ExtractionResult(
            data={"name": "Test Name", "value": 42},
            fields=[],
            evidence=[],
            overall_confidence=0.8,
            abstained_fields=[],
        )

        validator = ExtractionValidator()
        validation = validator.validate(result, schema)

        assert validation.is_valid is True
        assert validation.error_count == 0

    def test_validate_missing_required(self):
        """Check that a missing required field makes the result invalid."""
        from src.document_intelligence.extraction import (
            ExtractionSchema,
            ExtractionValidator,
        )
        from src.document_intelligence.chunks import ExtractionResult

        schema = ExtractionSchema(name="Test")
        schema.add_string_field("name", required=True)
        schema.add_string_field("description", required=True)

        # "description" is required but absent from data and listed as abstained.
        result = ExtractionResult(
            data={"name": "Test"},
            fields=[],
            evidence=[],
            overall_confidence=0.5,
            abstained_fields=["description"],
        )

        validator = ExtractionValidator()
        validation = validator.validate(result, schema)

        assert validation.is_valid is False
        assert validation.error_count >= 1
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Run this file's tests verbosely when executed as a script, and pass
    # pytest's exit code on to the shell so failures yield a non-zero status.
    raise SystemExit(pytest.main([__file__, "-v"]))
|
|
|