File size: 4,012 Bytes
d520909
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
Example: Document Processing Pipeline

Demonstrates:
1. Processing a PDF document
2. Extracting text with OCR
3. Layout detection
4. Semantic chunking
"""

import asyncio
from pathlib import Path
from loguru import logger

# Import document processing components
from src.document.pipeline import (
    PipelineConfig,
    DocumentProcessor,
    process_document,
)
from src.document.ocr import OCRConfig


def example_basic_processing():
    """Run the basic pipeline on the sample PDF and print a summary.

    Builds a ``PipelineConfig`` (``engine="paddleocr"``, ``render_dpi=300``,
    ``max_pages=5``), processes ``./data/sample.pdf`` with a
    ``DocumentProcessor`` and prints the document metadata followed by a
    preview of the first three chunks (type, 1-based page, text, bounding
    box).

    If the sample file does not exist, a hint is printed and the function
    returns without processing anything.
    """
    preview_chars = 200  # maximum number of chunk-text characters shown

    print("=" * 50)
    print("Basic Document Processing")
    print("=" * 50)

    # Configure pipeline
    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        render_dpi=300,
        max_pages=5,  # Limit for demo
    )

    # Create processor
    processor = DocumentProcessor(config)

    # Process a sample document
    # NOTE: Replace with actual document path
    sample_doc = Path("./data/sample.pdf")

    if not sample_doc.exists():
        print(f"Sample document not found: {sample_doc}")
        print("Create a sample PDF at ./data/sample.pdf to run this example")
        return

    # Process
    result = processor.process(sample_doc)

    # Display results
    meta = result.metadata
    print(f"\nDocument: {meta.filename}")
    print(f"Pages: {meta.num_pages}")
    print(f"Chunks: {meta.total_chunks}")
    print(f"Characters: {meta.total_characters}")
    print(f"OCR Confidence: {meta.ocr_confidence_avg:.2%}")

    print("\n--- Sample Chunks ---")
    for number, chunk in enumerate(result.chunks[:3], start=1):
        # chunk.page is printed +1 so the page is shown 1-based.
        print(
            f"\n[Chunk {number}] Type: {chunk.chunk_type.value}, "
            f"Page: {chunk.page+1}"
        )

        # Only mark the preview with an ellipsis when text was actually cut;
        # a short chunk is shown verbatim so it does not look truncated.
        text = chunk.text
        if len(text) > preview_chars:
            preview = text[:preview_chars] + "..."
        else:
            preview = text
        print(f"Text: {preview}")

        box = chunk.bbox
        print(
            f"BBox: ({box.x_min:.0f}, {box.y_min:.0f})"
            f" -> ({box.x_max:.0f}, {box.y_max:.0f})"
        )


def example_with_layout():
    """Process the sample PDF with layout detection and summarize regions.

    Configures the pipeline with ``LayoutConfig(method="rule_based")`` and
    ``include_layout_regions=True``, processes ``./data/sample.pdf`` and
    prints:

    * the number of layout regions per layout type, sorted by type value;
    * for up to the first two regions whose type is ``LayoutType.TABLE``,
      the 1-based page, top-left position and size.

    If the sample file does not exist, a notice is printed and the function
    returns without processing anything.
    """
    print("\n" + "=" * 50)
    print("Document Processing with Layout Analysis")
    print("=" * 50)

    # Imported here, after the banner, like the layout module below.
    from collections import Counter

    from src.document.layout import LayoutConfig, LayoutType

    # Configure with layout detection
    config = PipelineConfig(
        ocr=OCRConfig(engine="paddleocr"),
        layout=LayoutConfig(method="rule_based"),
        include_layout_regions=True,
    )

    processor = DocumentProcessor(config)

    sample_doc = Path("./data/sample.pdf")
    if not sample_doc.exists():
        print("Sample document not found")
        return

    result = processor.process(sample_doc)
    regions = result.layout_regions

    # Count layout types
    layout_counts = Counter(region.layout_type.value for region in regions)

    print("\nLayout Analysis:")
    for layout_type, count in sorted(layout_counts.items()):
        print(f"  {layout_type}: {count} regions")

    # Show tables if found
    tables = [r for r in regions if r.layout_type == LayoutType.TABLE]
    if tables:
        print(f"\n--- Tables Found ({len(tables)}) ---")
        for number, table in enumerate(tables[:2], start=1):
            box = table.bbox
            # table.page is printed +1 so the page is shown 1-based.
            print(f"\nTable {number}: Page {table.page+1}")
            print(f"  Position: ({box.x_min:.0f}, {box.y_min:.0f})")
            print(f"  Size: {box.width:.0f} x {box.height:.0f}")


def example_convenience_function():
    """Show the one-call ``process_document`` helper on the sample PDF.

    Prints the processed filename, the number of chunks and a preview of the
    full text: at most the first 500 characters, followed by ``"..."`` only
    when the text was longer than that.

    If ``./data/sample.pdf`` does not exist, a notice is printed and the
    function returns without processing anything.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("Using Convenience Function")
    print(banner)

    pdf_path = Path("./data/sample.pdf")
    if not pdf_path.exists():
        print("Sample document not found")
        return

    # Single call; no explicit pipeline configuration is passed.
    outcome = process_document(pdf_path)

    print(f"Processed: {outcome.metadata.filename}")
    print(f"Chunks: {len(outcome.chunks)}")
    print("\nFull text preview:")

    body = outcome.full_text
    if len(body) > 500:
        print(body[:500] + "...")
    else:
        print(body)


if __name__ == "__main__":
    # Run the demos in order; each one prints its own banner.
    for demo in (
        example_basic_processing,
        example_with_layout,
        example_convenience_function,
    ):
        demo()