"""
Example: Document Processing Pipeline
Demonstrates:
1. Processing a PDF document
2. Extracting text with OCR
3. Layout detection
4. Semantic chunking
"""
import asyncio
from pathlib import Path
from loguru import logger
# Import document processing components
from src.document.pipeline import (
PipelineConfig,
DocumentProcessor,
process_document,
)
from src.document.ocr import OCRConfig
def example_basic_processing():
    """Basic document processing example."""
    print("=" * 50)
    print("Basic Document Processing")
    print("=" * 50)

    # Configure pipeline
    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        render_dpi=300,
        max_pages=5,  # Limit for demo
    )

    # Create processor
    processor = DocumentProcessor(config)

    # Process a sample document
    # NOTE: Replace with actual document path
    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print(f"Sample document not found: {sample_doc}")
        print("Create a sample PDF at ./data/sample.pdf to run this example")
        return

    # Process
    result = processor.process(sample_doc)

    # Display results
    print(f"\nDocument: {result.metadata.filename}")
    print(f"Pages: {result.metadata.num_pages}")
    print(f"Chunks: {result.metadata.total_chunks}")
    print(f"Characters: {result.metadata.total_characters}")
    print(f"OCR Confidence: {result.metadata.ocr_confidence_avg:.2%}")

    print("\n--- Sample Chunks ---")
    for i, chunk in enumerate(result.chunks[:3]):
        print(f"\n[Chunk {i+1}] Type: {chunk.chunk_type.value}, Page: {chunk.page+1}")
        print(f"Text: {chunk.text[:200]}...")
        print(
            f"BBox: ({chunk.bbox.x_min:.0f}, {chunk.bbox.y_min:.0f}) "
            f"-> ({chunk.bbox.x_max:.0f}, {chunk.bbox.y_max:.0f})"
        )
def example_with_layout():
    """Document processing with layout analysis."""
    print("\n" + "=" * 50)
    print("Document Processing with Layout Analysis")
    print("=" * 50)

    from src.document.layout import LayoutConfig, LayoutType

    # Configure with layout detection
    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        layout=LayoutConfig(method="rule_based"),
        include_layout_regions=True,
    )
    processor = DocumentProcessor(config)

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    result = processor.process(sample_doc)

    # Count regions by layout type
    layout_counts = {}
    for region in result.layout_regions:
        layout_type = region.layout_type.value
        layout_counts[layout_type] = layout_counts.get(layout_type, 0) + 1

    print("\nLayout Analysis:")
    for layout_type, count in sorted(layout_counts.items()):
        print(f"  {layout_type}: {count} regions")

    # Show tables if found
    tables = [r for r in result.layout_regions if r.layout_type == LayoutType.TABLE]
    if tables:
        print(f"\n--- Tables Found ({len(tables)}) ---")
        for i, table in enumerate(tables[:2]):
            print(f"\nTable {i+1}: Page {table.page+1}")
            print(f"  Position: ({table.bbox.x_min:.0f}, {table.bbox.y_min:.0f})")
            print(f"  Size: {table.bbox.width:.0f} x {table.bbox.height:.0f}")
def example_convenience_function():
    """Using the convenience function."""
    print("\n" + "=" * 50)
    print("Using Convenience Function")
    print("=" * 50)

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    # Simple one-liner
    result = process_document(sample_doc)

    print(f"Processed: {result.metadata.filename}")
    print(f"Chunks: {len(result.chunks)}")

    print("\nFull text preview:")
    preview = result.full_text[:500] + "..." if len(result.full_text) > 500 else result.full_text
    print(preview)
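
# Sketch, not part of the original demo: if the average OCR confidence comes
# back low, retry at a higher render DPI. It uses only PipelineConfig's
# render_dpi parameter and metadata.ocr_confidence_avg as shown above; the
# 0.80 threshold and 600 DPI retry value are arbitrary assumptions.
def example_low_confidence_retry():
    """Sketch: reprocess at higher DPI when OCR confidence is poor."""
    print("\n" + "=" * 50)
    print("OCR Confidence Retry (sketch)")
    print("=" * 50)

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    result = process_document(sample_doc)
    print(f"First pass confidence: {result.metadata.ocr_confidence_avg:.2%}")

    if result.metadata.ocr_confidence_avg < 0.80:
        # Higher DPI renders sharper page images, which usually helps OCR
        # at the cost of slower processing.
        config = PipelineConfig(
            ocr=OCRConfig(engine="paddleocr"),
            render_dpi=600,
        )
        result = DocumentProcessor(config).process(sample_doc)
        print(f"Retry at 600 DPI: {result.metadata.ocr_confidence_avg:.2%}")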
if __name__ == "__main__":
    example_basic_processing()
    example_with_layout()
    example_convenience_function()
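    # Hypothetical extensions sketched above; see the hedging notes on each.
    example_batch_processing()
    example_export_chunks()
    example_low_confidence_retry()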