from docling_processor import DoclingProcessor
# --- Configuration ---

# Path of a single PDF to parse. Leave empty to parse the entire SOURCE_DIR.
PDF_FILE = ""

# Directory containing the PDFs (used when PDF_FILE is empty).
SOURCE_DIR = "data/data_raw"

# Directory where the Markdown output is written.
OUTPUT_DIR = "data"

# Set to True to enable OCR for scanned PDFs.
USE_OCR = False
def main():
    """Run the PDF conversion described by the module-level configuration.

    A ``DoclingProcessor`` is built from ``OUTPUT_DIR`` and ``USE_OCR``.
    If ``PDF_FILE`` is non-empty, only that file is passed to
    ``parse_document``; otherwise ``SOURCE_DIR`` is passed to
    ``parse_directory`` and a one-line summary of the returned counters
    (``total``, ``parsed``, ``skipped``, ``errors``) is printed.
    """
    processor = DoclingProcessor(OUTPUT_DIR, use_ocr=USE_OCR)

    if PDF_FILE:
        # Parse a single file
        print(f"Processing: {PDF_FILE}")
        result = processor.parse_document(PDF_FILE)
        # A falsy result cannot tell a failure apart from a skipped file,
        # hence the combined message.
        print("Done!" if result else "Error or skipped")
        return

    # Parse entire directory
    print(f"Processing directory: {SOURCE_DIR}")
    stats = processor.parse_directory(SOURCE_DIR)
    print(f"Total: {stats['total']} | Success: {stats['parsed']} | Skipped: {stats['skipped']} | Errors: {stats['errors']}")


if __name__ == "__main__":
    main()
|