SPARKNET / examples /document_processing.py
MHamdan's picture
Initial commit: SPARKNET framework
d520909
"""
Example: Document Processing Pipeline
Demonstrates:
1. Processing a PDF document
2. Extracting text with OCR
3. Layout detection
4. Semantic chunking
"""
import asyncio
from pathlib import Path
from loguru import logger
# Import document processing components
from src.document.pipeline import (
PipelineConfig,
DocumentProcessor,
process_document,
)
from src.document.ocr import OCRConfig
def example_basic_processing():
    """Basic document processing example.

    Builds a ``PipelineConfig`` with ``OCRConfig(engine="paddleocr")``,
    ``render_dpi=300`` and ``max_pages=5``, runs ``DocumentProcessor.process``
    on ``./data/sample.pdf`` and prints the result's metadata followed by a
    preview of the first three chunks (type, page, text, bounding box).

    Prints a hint and returns early when the sample PDF does not exist.
    """
    print("=" * 50)
    print("Basic Document Processing")
    print("=" * 50)

    # Configure pipeline
    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        render_dpi=300,
        max_pages=5,  # Limit for demo
    )

    # Create processor
    processor = DocumentProcessor(config)

    # Process a sample document
    # NOTE: Replace with actual document path
    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print(f"Sample document not found: {sample_doc}")
        print("Create a sample PDF at ./data/sample.pdf to run this example")
        return

    # Process
    result = processor.process(sample_doc)

    # Display results
    metadata = result.metadata
    print(f"\nDocument: {metadata.filename}")
    print(f"Pages: {metadata.num_pages}")
    print(f"Chunks: {metadata.total_chunks}")
    print(f"Characters: {metadata.total_characters}")
    print(f"OCR Confidence: {metadata.ocr_confidence_avg:.2%}")

    preview_limit = 200  # Maximum characters of chunk text to show
    print("\n--- Sample Chunks ---")
    for number, chunk in enumerate(result.chunks[:3], start=1):
        # chunk.page is displayed as page+1 (presumably zero-based; verify
        # against the pipeline's chunk model).
        print(f"\n[Chunk {number}] Type: {chunk.chunk_type.value}, Page: {chunk.page+1}")

        # Append the ellipsis only when the text was actually cut off, so
        # short chunks are shown verbatim instead of looking truncated.
        text = chunk.text
        preview = text[:preview_limit] + "..." if len(text) > preview_limit else text
        print(f"Text: {preview}")

        bbox = chunk.bbox
        print(
            f"BBox: ({bbox.x_min:.0f}, {bbox.y_min:.0f}) -> "
            f"({bbox.x_max:.0f}, {bbox.y_max:.0f})"
        )
def example_with_layout():
    """Document processing with layout analysis.

    Configures the pipeline with ``LayoutConfig(method="rule_based")`` and
    ``include_layout_regions=True``, processes ``./data/sample.pdf`` and
    prints how many regions of each layout type were found, sorted by type
    name. If any region has type ``LayoutType.TABLE``, the page, position
    and size of the first two tables are printed as well.

    Prints a message and returns early when the sample PDF does not exist.
    """
    print("\n" + "=" * 50)
    print("Document Processing with Layout Analysis")
    print("=" * 50)

    # Function-scope imports: the layout module is only needed by this
    # example, and Counter is only used for the tally below.
    from collections import Counter

    from src.document.layout import LayoutConfig, LayoutType

    # Configure with layout detection
    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        layout=LayoutConfig(method="rule_based"),
        include_layout_regions=True,
    )
    processor = DocumentProcessor(config)

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    result = processor.process(sample_doc)

    # Count layout types
    layout_counts = Counter(
        region.layout_type.value for region in result.layout_regions
    )
    print("\nLayout Analysis:")
    for layout_type, count in sorted(layout_counts.items()):
        print(f" {layout_type}: {count} regions")

    # Show tables if found
    tables = [r for r in result.layout_regions if r.layout_type == LayoutType.TABLE]
    if tables:
        print(f"\n--- Tables Found ({len(tables)}) ---")
        # Only the first two tables are detailed to keep the output short.
        for number, table in enumerate(tables[:2], start=1):
            print(f"\nTable {number}: Page {table.page+1}")
            print(f" Position: ({table.bbox.x_min:.0f}, {table.bbox.y_min:.0f})")
            print(f" Size: {table.bbox.width:.0f} x {table.bbox.height:.0f}")
def example_convenience_function():
    """Process the sample PDF with the one-call ``process_document`` helper.

    Prints the processed filename, the number of chunks and a preview of the
    full text: the first 500 characters followed by ``...`` when the text is
    longer than that, otherwise the whole text.

    Prints a message and returns early when ``./data/sample.pdf`` is missing.
    """
    separator = "=" * 50
    print("\n" + separator)
    print("Using Convenience Function")
    print(separator)

    pdf_path = Path("./data/sample.pdf")
    if not pdf_path.exists():
        print("Sample document not found")
        return

    # Single call with just the path; no config or processor is built here.
    processed = process_document(pdf_path)

    print(f"Processed: {processed.metadata.filename}")
    print(f"Chunks: {len(processed.chunks)}")
    print("\nFull text preview:")

    preview_limit = 500
    full_text = processed.full_text
    if len(full_text) > preview_limit:
        print(full_text[:preview_limit] + "...")
    else:
        print(full_text)
if __name__ == "__main__":
    # Run the examples in order; each one returns early on its own when the
    # sample PDF is missing.
    for run_example in (
        example_basic_processing,
        example_with_layout,
        example_convenience_function,
    ):
        run_example()