hungnha
/

DoAn

Model card Files Files and versions

DoAn / core /preprocessing /pdf_parser.py

hungnha's picture

Thay đổi prompt

92c9b4d 7 days ago

history blame contribute delete

861 Bytes

	from docling_processor import DoclingProcessor

	# Configuration: edit these values to choose what the script parses.
	PDF_FILE = ""  # Single file (leave empty to parse entire directory)
	SOURCE_DIR = "data/data_raw"  # Directory containing PDFs
	OUTPUT_DIR = "data"  # Markdown output directory
	USE_OCR = False  # Enable OCR for scanned PDFs


	def format_summary(stats) -> str:
	    """Return the one-line summary printed after a directory run.

	    Args:
	        stats: Object indexable by the string keys ``total``, ``parsed``,
	            ``skipped`` and ``errors`` (the value handed back by
	            ``processor.parse_directory`` in ``main``).

	    Returns:
	        The counters joined with plain ``|`` separators.

	    Raises:
	        KeyError: If ``stats`` is a dict missing one of the four keys.
	    """
	    # Plain "|" separators: "\|" is not a valid Python escape sequence, so it
	    # would emit a literal backslash and trigger an invalid-escape warning.
	    return (
	        f"Total: {stats['total']} | Success: {stats['parsed']} | "
	        f"Skipped: {stats['skipped']} | Errors: {stats['errors']}"
	    )


	def main() -> None:
	    """Parse ``PDF_FILE`` when it is set, otherwise process ``SOURCE_DIR``.

	    Builds one ``DoclingProcessor`` from ``OUTPUT_DIR`` and ``USE_OCR`` and
	    reports progress and the outcome on standard output.
	    """
	    processor = DoclingProcessor(OUTPUT_DIR, use_ocr=USE_OCR)

	    if PDF_FILE:
	        # Parse a single file
	        print(f"Processing: {PDF_FILE}")
	        result = processor.parse_document(PDF_FILE)
	        # A falsy result is reported as "Error or skipped"; the two cases
	        # are not distinguished here.
	        print("Done!" if result else "Error or skipped")
	    else:
	        # Parse entire directory
	        print(f"Processing directory: {SOURCE_DIR}")
	        stats = processor.parse_directory(SOURCE_DIR)
	        print(format_summary(stats))


	if __name__ == "__main__":
	    main()