"""Command-line driver that converts PDFs through ``DoclingProcessor``.

Edit the configuration constants below, then run this file as a script.
When ``PDF_FILE`` is non-empty only that file is parsed; otherwise every
document found by ``DoclingProcessor.parse_directory`` in ``SOURCE_DIR`` is
processed.
"""

from docling_processor import DoclingProcessor

# Configuration
PDF_FILE = ""  # Single file (leave empty to parse entire directory)
SOURCE_DIR = "data/data_raw"  # Directory containing PDFs
OUTPUT_DIR = "data"  # Markdown output directory
USE_OCR = False  # Enable OCR for scanned PDFs


def main() -> None:
    """Run the conversion described by the module-level configuration.

    Prints a one-line status for a single file, or a summary of the counters
    returned by ``parse_directory`` when a whole directory is processed.
    """
    processor = DoclingProcessor(OUTPUT_DIR, use_ocr=USE_OCR)

    if PDF_FILE:
        # Parse a single file
        print(f"Processing: {PDF_FILE}")
        result = processor.parse_document(PDF_FILE)
        # A falsy result covers both failures and skipped files; the
        # processor does not let us tell them apart from here.
        print("Done!" if result else "Error or skipped")
        return

    # Parse entire directory
    print(f"Processing directory: {SOURCE_DIR}")
    summary = processor.parse_directory(SOURCE_DIR)
    # The summary is indexed by the keys 'total', 'parsed', 'skipped', 'errors'.
    print(
        f"Total: {summary['total']} | Success: {summary['parsed']} | "
        f"Skipped: {summary['skipped']} | Errors: {summary['errors']}"
    )


if __name__ == "__main__":
    main()