|
|
""" |
|
|
Example: Document Processing Pipeline

Demonstrates:
1. Processing a PDF document
2. Extracting text with OCR
3. Layout detection
4. Semantic chunking
|
|
""" |
|
|
|
|
|
import asyncio |
|
|
from pathlib import Path |
|
|
from loguru import logger |
|
|
|
|
|
|
|
|
from src.document.pipeline import ( |
|
|
PipelineConfig, |
|
|
DocumentProcessor, |
|
|
process_document, |
|
|
) |
|
|
from src.document.ocr import OCRConfig |
|
|
|
|
|
|
|
|
def example_basic_processing():
    """Process the sample PDF with an explicit config and print a report.

    Looks for ``./data/sample.pdf``. If it is missing, a hint is printed
    and the function returns before any pipeline object is created.
    Otherwise the file is run through a ``DocumentProcessor`` configured
    with ``OCRConfig(engine="paddleocr")``, ``render_dpi=300`` and
    ``max_pages=5``; the result's metadata and its first three chunks are
    printed.

    Returns:
        None. All output goes to stdout.
    """
    print("=" * 50)
    print("Basic Document Processing")
    print("=" * 50)

    # Check for the input first so a missing sample never pays for (or
    # fails on) constructing the processor.
    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print(f"Sample document not found: {sample_doc}")
        print("Create a sample PDF at ./data/sample.pdf to run this example")
        return

    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        render_dpi=300,
        max_pages=5,
    )
    processor = DocumentProcessor(config)
    result = processor.process(sample_doc)

    metadata = result.metadata
    print(f"\nDocument: {metadata.filename}")
    print(f"Pages: {metadata.num_pages}")
    print(f"Chunks: {metadata.total_chunks}")
    print(f"Characters: {metadata.total_characters}")
    print(f"OCR Confidence: {metadata.ocr_confidence_avg:.2%}")

    print("\n--- Sample Chunks ---")
    for number, chunk in enumerate(result.chunks[:3], start=1):
        # The page index is shown with +1, i.e. as a 1-based page number.
        print(f"\n[Chunk {number}] Type: {chunk.chunk_type.value}, Page: {chunk.page+1}")

        # Only mark the preview with an ellipsis when text was actually cut.
        text = chunk.text
        preview = text[:200]
        if len(text) > 200:
            preview += "..."
        print(f"Text: {preview}")

        bbox = chunk.bbox
        print(f"BBox: ({bbox.x_min:.0f}, {bbox.y_min:.0f}) -> ({bbox.x_max:.0f}, {bbox.y_max:.0f})")
|
|
|
|
|
|
|
|
def example_with_layout():
    """Process the sample PDF with layout analysis and summarize the regions.

    Looks for ``./data/sample.pdf``. If it is missing, a message is
    printed and the function returns before the layout module is imported
    or any pipeline object is created. Otherwise the file is processed
    with ``LayoutConfig(method="rule_based")`` and
    ``include_layout_regions=True``; the number of regions per layout type
    is printed in sorted order, followed by the position and size of up to
    two regions whose type is ``LayoutType.TABLE``.

    Returns:
        None. All output goes to stdout.
    """
    print("\n" + "=" * 50)
    print("Document Processing with Layout Analysis")
    print("=" * 50)

    # Bail out before importing the layout module or building the
    # processor when there is nothing to process.
    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    # Function-scope imports: only needed once the sample is known to exist.
    from collections import Counter

    from src.document.layout import LayoutConfig, LayoutType

    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        layout=LayoutConfig(method="rule_based"),
        include_layout_regions=True,
    )
    processor = DocumentProcessor(config)
    result = processor.process(sample_doc)

    # Tally regions by the string value of their layout type.
    layout_counts = Counter(
        region.layout_type.value for region in result.layout_regions
    )

    print("\nLayout Analysis:")
    for layout_type, count in sorted(layout_counts.items()):
        print(f" {layout_type}: {count} regions")

    tables = [
        region
        for region in result.layout_regions
        if region.layout_type == LayoutType.TABLE
    ]
    if tables:
        print(f"\n--- Tables Found ({len(tables)}) ---")
        # Show details for at most the first two tables.
        for number, table in enumerate(tables[:2], start=1):
            bbox = table.bbox
            print(f"\nTable {number}: Page {table.page+1}")
            print(f" Position: ({bbox.x_min:.0f}, {bbox.y_min:.0f})")
            print(f" Size: {bbox.width:.0f} x {bbox.height:.0f}")
|
|
|
|
|
|
|
|
def example_convenience_function():
    """Process the sample PDF via ``process_document`` and print a preview.

    Looks for ``./data/sample.pdf``. If it is missing, a message is
    printed and the function returns. Otherwise ``process_document`` is
    called with only the path, and the filename, the number of chunks and
    the first 500 characters of the full text are printed. An ellipsis is
    appended only when the text is longer than 500 characters.

    Returns:
        None. All output goes to stdout.
    """
    print("\n" + "=" * 50)
    print("Using Convenience Function")
    print("=" * 50)

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    result = process_document(sample_doc)

    print(f"Processed: {result.metadata.filename}")
    print(f"Chunks: {len(result.chunks)}")
    print("\nFull text preview:")

    # Read the text once and branch explicitly instead of relying on the
    # precedence of a conditional expression inside print().
    full_text = result.full_text
    if len(full_text) > 500:
        print(full_text[:500] + "...")
    else:
        print(full_text)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Run every example in order; each one returns early on its own when
    # the sample PDF is not present.
    for run_example in (
        example_basic_processing,
        example_with_layout,
        example_convenience_function,
    ):
        run_example()
|
|
|