File size: 861 Bytes
11133c9
54b2662
92c9b4d
 
 
 
 
c429a2d
54b2662
 
11133c9
54b2662
11133c9
92c9b4d
 
11133c9
92c9b4d
11133c9
92c9b4d
 
11133c9
92c9b4d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from docling_processor import DoclingProcessor

# --- Run settings: edit these before launching the script ---

# Path of one PDF to convert. Leave as an empty string to convert
# everything found in SOURCE_DIR instead.
PDF_FILE = ""

# Directory containing the PDFs; used only when PDF_FILE is empty.
SOURCE_DIR = "data/data_raw"

# Directory that receives the Markdown output (passed to DoclingProcessor).
OUTPUT_DIR = "data"

# Set to True to enable OCR, which scanned PDFs need.
USE_OCR = False


if __name__ == "__main__":
    # One processor instance serves both modes; it is given the output
    # directory and the OCR switch from the settings above.
    converter = DoclingProcessor(OUTPUT_DIR, use_ocr=USE_OCR)

    if not PDF_FILE:
        # No single file configured: process the whole source directory and
        # report the counters read from the returned mapping.
        print(f"Processing directory: {SOURCE_DIR}")
        stats = converter.parse_directory(SOURCE_DIR)
        print(f"Total: {stats['total']} | Success: {stats['parsed']} | Skipped: {stats['skipped']} | Errors: {stats['errors']}")
    else:
        # Single-file mode: a truthy return value is reported as success,
        # anything falsy as "Error or skipped".
        print(f"Processing: {PDF_FILE}")
        outcome = converter.parse_document(PDF_FILE)
        if outcome:
            print("Done!")
        else:
            print("Error or skipped")