"""Command-line entry point that drives ``DoclingProcessor``.

Edit the configuration constants below, then run this file as a script.
When ``PDF_FILE`` is non-empty only that file is parsed; otherwise
``SOURCE_DIR`` is handed to ``DoclingProcessor.parse_directory``.
"""

from docling_processor import DoclingProcessor

# Path of a single PDF to parse. Leave empty to run in directory mode.
PDF_FILE = ""
# Directory passed to ``parse_directory`` when PDF_FILE is empty.
SOURCE_DIR = "data/data_raw"
# First positional argument of DoclingProcessor.
# NOTE(review): presumably the directory results are written to -- confirm.
OUTPUT_DIR = "data"
# Forwarded to DoclingProcessor as its ``use_ocr`` keyword argument.
USE_OCR = False


def main() -> None:
    """Parse the configured PDF, or the configured directory, and print a summary.

    In single-file mode a truthy return from ``parse_document`` is reported
    as ``Done!`` and a falsy one as ``Error or skipped``. In directory mode
    the value returned by ``parse_directory`` is indexed with the keys
    ``total``, ``parsed``, ``skipped`` and ``errors`` to build the summary.
    """
    processor = DoclingProcessor(OUTPUT_DIR, use_ocr=USE_OCR)

    # Single-file mode takes precedence whenever a file path is configured.
    if PDF_FILE:
        print(f"Processing: {PDF_FILE}")
        result = processor.parse_document(PDF_FILE)
        print("Done!" if result else "Error or skipped")
        return

    # Directory mode: process SOURCE_DIR and report the returned counters.
    print(f"Processing directory: {SOURCE_DIR}")
    stats = processor.parse_directory(SOURCE_DIR)
    print(
        f"Total: {stats['total']} | Success: {stats['parsed']} | "
        f"Skipped: {stats['skipped']} | Errors: {stats['errors']}"
    )


if __name__ == "__main__":
    main()