"""
Example: Document Processing Pipeline

Demonstrates:
1. Processing a PDF document
2. Extracting text with OCR
3. Layout detection
4. Semantic chunking
"""

import asyncio
from collections import Counter
from pathlib import Path

from loguru import logger

# Import document processing components
from src.document.pipeline import (
    PipelineConfig,
    DocumentProcessor,
    process_document,
)
from src.document.ocr import OCRConfig

# Document every example below operates on.
# NOTE: Replace with actual document path
SAMPLE_DOC = Path("./data/sample.pdf")


def _preview(text, limit):
    """Return *text* cut to *limit* characters, adding "..." only if it was cut."""
    if len(text) > limit:
        return text[:limit] + "..."
    return text


def example_basic_processing():
    """Run the pipeline on the sample PDF and print metadata plus the first chunks.

    Builds a ``DocumentProcessor`` from an explicit ``PipelineConfig``, processes
    ``SAMPLE_DOC`` and prints the document metadata followed by up to three
    chunks. Prints a hint and returns early when the sample file is missing.
    """
    print("=" * 50)
    print("Basic Document Processing")
    print("=" * 50)

    # Configure pipeline
    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        render_dpi=300,
        max_pages=5,  # Limit for demo
    )

    # Create processor
    processor = DocumentProcessor(config)

    if not SAMPLE_DOC.exists():
        print(f"Sample document not found: {SAMPLE_DOC}")
        print("Create a sample PDF at ./data/sample.pdf to run this example")
        return

    # Process
    result = processor.process(SAMPLE_DOC)

    # Display results
    metadata = result.metadata
    print(f"\nDocument: {metadata.filename}")
    print(f"Pages: {metadata.num_pages}")
    print(f"Chunks: {metadata.total_chunks}")
    print(f"Characters: {metadata.total_characters}")
    print(f"OCR Confidence: {metadata.ocr_confidence_avg:.2%}")

    print("\n--- Sample Chunks ---")
    for number, chunk in enumerate(result.chunks[:3], start=1):
        bbox = chunk.bbox
        # chunk.page is shown +1 (presumably zero-based internally -- confirm).
        print(f"\n[Chunk {number}] Type: {chunk.chunk_type.value}, Page: {chunk.page+1}")
        # Only mark the text as truncated when it really was cut at 200 chars.
        print(f"Text: {_preview(chunk.text, 200)}")
        print(
            f"BBox: ({bbox.x_min:.0f}, {bbox.y_min:.0f}) -> "
            f"({bbox.x_max:.0f}, {bbox.y_max:.0f})"
        )


def example_with_layout():
    """Run the pipeline with layout detection and summarise the detected regions.

    Prints the number of regions per layout type (sorted by type name) and,
    when table regions are present, the page, position and size of the first
    two. Returns early when the sample file is missing.
    """
    print("\n" + "=" * 50)
    print("Document Processing with Layout Analysis")
    print("=" * 50)

    # Imported here, as in the original example, so the layout module is only
    # loaded when this example actually runs.
    from src.document.layout import LayoutConfig, LayoutType

    # Configure with layout detection
    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        layout=LayoutConfig(method="rule_based"),
        include_layout_regions=True,
    )

    processor = DocumentProcessor(config)

    if not SAMPLE_DOC.exists():
        print("Sample document not found")
        return

    result = processor.process(SAMPLE_DOC)

    # Count layout types
    layout_counts = Counter(
        region.layout_type.value for region in result.layout_regions
    )

    print("\nLayout Analysis:")
    for layout_type, count in sorted(layout_counts.items()):
        print(f"  {layout_type}: {count} regions")

    # Show tables if found
    tables = [r for r in result.layout_regions if r.layout_type == LayoutType.TABLE]
    if tables:
        print(f"\n--- Tables Found ({len(tables)}) ---")
        for number, table in enumerate(tables[:2], start=1):
            print(f"\nTable {number}: Page {table.page+1}")
            print(f"  Position: ({table.bbox.x_min:.0f}, {table.bbox.y_min:.0f})")
            print(f"  Size: {table.bbox.width:.0f} x {table.bbox.height:.0f}")


def example_convenience_function():
    """Process the sample PDF through ``process_document`` and print a text preview.

    Prints the filename, the chunk count and at most the first 500 characters
    of the full text. Returns early when the sample file is missing.
    """
    print("\n" + "=" * 50)
    print("Using Convenience Function")
    print("=" * 50)

    if not SAMPLE_DOC.exists():
        print("Sample document not found")
        return

    # Simple one-liner
    result = process_document(SAMPLE_DOC)

    print(f"Processed: {result.metadata.filename}")
    print(f"Chunks: {len(result.chunks)}")
    print("\nFull text preview:")
    print(_preview(result.full_text, 500))


if __name__ == "__main__":
    example_basic_processing()
    example_with_layout()
    example_convenience_function()