#!/usr/bin/env python3
"""
Document Intelligence Demo
Demonstrates the capabilities of the SPARKNET document_intelligence subsystem:
- Document parsing with OCR and layout detection
- Schema-driven field extraction
- Visual grounding with evidence
- Question answering
- Document classification
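
Usage:
    python document_intelligence_demo.py [path/to/document.pdf]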
"""

from pathlib import Path
# Add project root to path
import sys
sys.path.insert(0, str(Path(__file__).parent.parent))


def demo_parse_document(doc_path: str):
"""Demo: Parse a document into semantic chunks."""
print("\n" + "=" * 60)
print("1. DOCUMENT PARSING")
print("=" * 60)
from src.document_intelligence import (
DocumentParser,
ParserConfig,
)
# Configure parser
config = ParserConfig(
render_dpi=200,
max_pages=5, # Limit for demo
include_markdown=True,
)
parser = DocumentParser(config=config)
print(f"\nParsing: {doc_path}")
result = parser.parse(doc_path)
print(f"\nDocument ID: {result.doc_id}")
print(f"Filename: {result.filename}")
print(f"Pages: {result.num_pages}")
print(f"Chunks: {len(result.chunks)}")
print(f"Processing time: {result.processing_time_ms:.0f}ms")
# Show chunk summary by type
print("\nChunk types:")
by_type = {}
for chunk in result.chunks:
t = chunk.chunk_type.value
by_type[t] = by_type.get(t, 0) + 1
for t, count in sorted(by_type.items()):
print(f" - {t}: {count}")
# Show first few chunks
print("\nFirst 3 chunks:")
for i, chunk in enumerate(result.chunks[:3]):
print(f"\n [{i+1}] Type: {chunk.chunk_type.value}, Page: {chunk.page}")
print(f" ID: {chunk.chunk_id}")
print(f" Text: {chunk.text[:100]}...")
print(f" BBox: {chunk.bbox.xyxy}")
print(f" Confidence: {chunk.confidence:.2f}")
return result


def demo_extract_fields(parse_result):
"""Demo: Extract fields using a schema."""
print("\n" + "=" * 60)
print("2. SCHEMA-DRIVEN EXTRACTION")
print("=" * 60)
from src.document_intelligence import (
FieldExtractor,
ExtractionSchema,
FieldType,
ExtractionValidator,
)
# Create a custom schema
schema = ExtractionSchema(
name="DocumentInfo",
description="Basic document information",
)
schema.add_string_field("title", "Document title or heading", required=True)
schema.add_string_field("date", "Document date", required=False)
schema.add_string_field("author", "Author or organization name", required=False)
schema.add_string_field("reference_number", "Reference or ID number", required=False)
print(f"\nExtraction schema: {schema.name}")
print("Fields:")
for field in schema.fields:
req = "required" if field.required else "optional"
print(f" - {field.name} ({field.field_type.value}, {req})")
# Extract fields
extractor = FieldExtractor()
result = extractor.extract(parse_result, schema)
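    # result.data holds the extracted values; any field the extractor declined
    # to answer is listed in result.abstained_fields and flagged below.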
print("\nExtracted data:")
for key, value in result.data.items():
status = " [ABSTAINED]" if key in result.abstained_fields else ""
print(f" {key}: {value}{status}")
print(f"\nOverall confidence: {result.overall_confidence:.2f}")
# Show evidence
if result.evidence:
print("\nEvidence:")
for ev in result.evidence[:3]:
print(f" - Page {ev.page}, Chunk {ev.chunk_id[:12]}...")
print(f" Snippet: {ev.snippet[:80]}...")
# Validate
validator = ExtractionValidator()
validation = validator.validate(result, schema)
print(f"\nValidation: {'PASSED' if validation.is_valid else 'FAILED'}")
if validation.issues:
print("Issues:")
for issue in validation.issues[:3]:
print(f" - [{issue.severity}] {issue.field_name}: {issue.message}")
return result


def demo_search_and_qa(parse_result):
"""Demo: Search and question answering."""
print("\n" + "=" * 60)
print("3. SEARCH AND Q&A")
print("=" * 60)
from src.document_intelligence.tools import get_tool
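    # Tools are looked up by name via get_tool (e.g. "search_chunks", "answer_question").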
# Search demo
print("\nSearching for 'document'...")
search_tool = get_tool("search_chunks")
search_result = search_tool.execute(
parse_result=parse_result,
query="document",
top_k=5,
)
if search_result.success:
matches = search_result.data.get("results", [])
print(f"Found {len(matches)} matches:")
for i, match in enumerate(matches[:3], 1):
print(f" {i}. Page {match['page']}, Type: {match['type']}")
print(f" Score: {match['score']:.2f}")
print(f" Text: {match['text'][:80]}...")
# Q&A demo
print("\nAsking: 'What is this document about?'")
qa_tool = get_tool("answer_question")
qa_result = qa_tool.execute(
parse_result=parse_result,
question="What is this document about?",
)
if qa_result.success:
print(f"Answer: {qa_result.data.get('answer', 'No answer')}")
print(f"Confidence: {qa_result.data.get('confidence', 0):.2f}")


def demo_grounding(parse_result, doc_path: str):
"""Demo: Visual grounding with crops."""
print("\n" + "=" * 60)
print("4. VISUAL GROUNDING")
print("=" * 60)
from src.document_intelligence import (
load_document,
RenderOptions,
)
from src.document_intelligence.grounding import (
EvidenceBuilder,
crop_region,
create_annotated_image,
)
# Load page image
loader, renderer = load_document(doc_path)
page_image = renderer.render_page(1, RenderOptions(dpi=200))
loader.close()
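    # page_image is assumed to be a numpy array (height, width, channels),
    # which is why its .shape is printed below.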
print(f"\nPage 1 image size: {page_image.shape}")
# Get chunks from page 1
page_chunks = [c for c in parse_result.chunks if c.page == 1]
print(f"Page 1 chunks: {len(page_chunks)}")
# Create evidence for first chunk
if page_chunks:
chunk = page_chunks[0]
evidence_builder = EvidenceBuilder()
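        # An evidence record ties a value back to its source: chunk id, page,
        # bounding box, and a text snippet from the chunk.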
evidence = evidence_builder.create_evidence(
chunk=chunk,
value=chunk.text[:50],
field_name="example_field",
)
print(f"\nEvidence created:")
print(f" Chunk ID: {evidence.chunk_id}")
print(f" Page: {evidence.page}")
print(f" BBox: {evidence.bbox.xyxy}")
print(f" Snippet: {evidence.snippet[:80]}...")
# Crop region
crop = crop_region(page_image, chunk.bbox)
print(f" Crop size: {crop.shape}")
# Create annotated image (preview)
print("\nAnnotated image would include bounding boxes for all chunks.")
print("Use the CLI 'sparknet docint visualize' command to generate.")


def demo_classification(parse_result):
"""Demo: Document classification."""
print("\n" + "=" * 60)
print("5. DOCUMENT CLASSIFICATION")
print("=" * 60)
from src.document_intelligence.chunks import DocumentType
# Simple keyword-based classification
first_page = [c for c in parse_result.chunks if c.page == 1][:5]
content = " ".join(c.text for c in first_page).lower()
type_keywords = {
"invoice": ["invoice", "bill", "payment due", "amount due"],
"contract": ["agreement", "contract", "party", "whereas"],
"receipt": ["receipt", "paid", "transaction"],
"patent": ["patent", "claims", "invention"],
"report": ["report", "findings", "summary"],
}
detected_type = "other"
confidence = 0.3
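    # Accept a type once at least two of its keywords appear; confidence scales
    # with the number of matches and is capped at 0.95.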
for doc_type, keywords in type_keywords.items():
matches = sum(1 for k in keywords if k in content)
if matches >= 2:
detected_type = doc_type
confidence = min(0.95, 0.5 + matches * 0.15)
break
print(f"\nDetected type: {detected_type}")
print(f"Confidence: {confidence:.2f}")


def main():
"""Run all demos."""
print("=" * 60)
print("SPARKNET Document Intelligence Demo")
print("=" * 60)
# Check for sample document
sample_paths = [
Path("Dataset/Patent_1.pdf"),
Path("data/sample.pdf"),
Path("tests/fixtures/sample.pdf"),
]
    doc_path = None
    if len(sys.argv) > 1:
        # A path given on the command line takes precedence over the bundled samples.
        doc_path = sys.argv[1]
    else:
        for path in sample_paths:
            if path.exists():
                doc_path = str(path)
                break
    if not doc_path:
        print("\nNo sample document found.")
        print("Please provide a PDF file path as argument.")
        print("\nUsage: python document_intelligence_demo.py [path/to/document.pdf]")
        return
print(f"\nUsing document: {doc_path}")
try:
# Run demos
parse_result = demo_parse_document(doc_path)
demo_extract_fields(parse_result)
demo_search_and_qa(parse_result)
demo_grounding(parse_result, doc_path)
demo_classification(parse_result)
print("\n" + "=" * 60)
print("Demo complete!")
print("=" * 60)
except ImportError as e:
print(f"\nImport error: {e}")
print("Make sure all dependencies are installed:")
print(" pip install pymupdf pillow numpy pydantic")
except Exception as e:
print(f"\nError: {e}")
import traceback
traceback.print_exc()


if __name__ == "__main__":
main()