DoAn / core /preprocessing /pdf_parser.py
hungnha's picture
Thay đổi promt
92c9b4d
raw
history blame contribute delete
861 Bytes
from docling_processor import DoclingProcessor
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Path of a single PDF to parse. Leave empty to parse every PDF found in
# SOURCE_DIR instead.
PDF_FILE = ""

# Directory containing the input PDFs (used only when PDF_FILE is empty).
SOURCE_DIR = "data/data_raw"

# Directory that receives the Markdown output.
OUTPUT_DIR = "data"

# Set to True to enable OCR for scanned PDFs.
USE_OCR = False
if __name__ == "__main__":
    # Script entry point: build one processor from the module-level settings,
    # then parse either a single PDF or the whole source directory.
    docling = DoclingProcessor(OUTPUT_DIR, use_ocr=USE_OCR)

    if not PDF_FILE:
        # No single file configured: walk the whole source directory.
        print(f"Processing directory: {SOURCE_DIR}")
        stats = docling.parse_directory(SOURCE_DIR)
        # The summary is indexed by 'total', 'parsed', 'skipped' and 'errors'.
        print(f"Total: {stats['total']} | Success: {stats['parsed']} | Skipped: {stats['skipped']} | Errors: {stats['errors']}")
    else:
        # A specific file was configured: parse just that one.
        print(f"Processing: {PDF_FILE}")
        outcome = docling.parse_document(PDF_FILE)
        # A falsy outcome is reported as "Error or skipped"; the two cases
        # cannot be told apart from the value alone.
        if outcome:
            print("Done!")
        else:
            print("Error or skipped")