import os
import re
import gc
import sys
import signal
import logging
from datetime import datetime
from pathlib import Path
from docling.document_converter import DocumentConverter, FormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableStructureOptions, EasyOcrOptions, TableFormerMode
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
# Make the project root importable so the project-local `core.hash_file`
# package (imported right below for HashProcessor) can be resolved.
# `parents[2]` is the directory two levels above this file's own directory;
# the path is resolved first, so the result is absolute.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
    # Prepend (rather than append) so the project's packages win over any
    # same-named packages found later on sys.path.
    sys.path.insert(0, str(PROJECT_ROOT))
from core.hash_file.hash_file import HashProcessor
class DoclingProcessor:
def __init__(self, output_dir: str, use_ocr: bool = True, timeout: int = 300, images_scale: float = 3.0):
self.output_dir = output_dir
self.timeout = timeout
self.logger = logging.getLogger(__name__)
self.hasher = HashProcessor(verbose=False)
os.makedirs(output_dir, exist_ok=True)
# Hash index file
self.hash_index_path = Path(output_dir) / "docling_hash_index.json"
self.hash_index = self.hasher.load_processed_index(str(self.hash_index_path))
# PDF pipeline configuration
opts = PdfPipelineOptions(do_ocr=use_ocr, do_table_structure=True)
opts.table_structure_options = TableStructureOptions(do_cell_matching=True, mode=TableFormerMode.ACCURATE)
opts.images_scale = images_scale
# Vietnamese OCR configuration
if use_ocr:
ocr = EasyOcrOptions()
ocr.lang = ["vi"]
ocr.force_full_page_ocr = False
opts.ocr_options = ocr
self.converter = DocumentConverter(format_options={
InputFormat.PDF: FormatOption(backend=PyPdfiumDocumentBackend, pipeline_cls=StandardPdfPipeline, pipeline_options=opts)
})
self.logger.info(f"Docling | OCR={use_ocr} | Table=accurate | Scale={images_scale} | timeout={timeout}s")
def clean_markdown(self, text: str) -> str:
text = re.sub(r'\n\s*Trang\s+\d+\s*\n', '\n', text)
return re.sub(r'\n{3,}', '\n\n', text).strip()
def _should_process(self, pdf_path: str, output_path: Path) -> bool:
# If output doesn't exist -> needs processing
if not output_path.exists():
return True
# Compute hash of current PDF file
current_hash = self.hasher.get_file_hash(pdf_path)
if not current_hash:
return True
# Compare with saved hash
saved_hash = self.hash_index.get(pdf_path, {}).get("hash")
return current_hash != saved_hash
def _save_hash(self, pdf_path: str, file_hash: str) -> None:
self.hash_index[pdf_path] = {
"hash": file_hash,
"processed_at": self.hasher.get_current_timestamp()
}
def parse_document(self, file_path: str) -> str | None:
if not os.path.exists(file_path):
return None
filename = os.path.basename(file_path)
try:
# Set timeout to prevent hanging
signal.signal(signal.SIGALRM, lambda s, f: (_ for _ in ()).throw(TimeoutError()))
signal.alarm(self.timeout)
result = self.converter.convert(file_path)
md = result.document.export_to_markdown(image_placeholder="")
signal.alarm(0)
md = self.clean_markdown(md)
# Add frontmatter metadata
return f"---\nfilename: {filename}\nfilepath: {file_path}\npage_count: {len(result.document.pages)}\nprocessed_at: {datetime.now().isoformat()}\n---\n\n{md}"
except TimeoutError:
self.logger.warning(f"Timeout: {filename}")
signal.alarm(0)
return None
except Exception as e:
self.logger.error(f"Error: {filename}: {e}")
signal.alarm(0)
return None
def parse_directory(self, source_dir: str) -> dict:
self.logger.info(f"Found {len(pdf_files)} PDF files in {source_dir}")
results = {"total": len(pdf_files), "parsed": 0, "skipped": 0, "errors": 0}
for i, fp in enumerate(pdf_files):
try:
rel = fp.relative_to(source_path)
except ValueError:
rel = Path(fp.name)
out = Path(self.output_dir) / rel.with_suffix(".md")
out.parent.mkdir(parents=True, exist_ok=True)
pdf_path = str(fp)
# Check hash to decide if processing is needed
if not self._should_process(pdf_path, out):
results["skipped"] += 1
continue
# Compute hash before processing
file_hash = self.hasher.get_file_hash(pdf_path)
md = self.parse_document(pdf_path)
if md:
out.write_text(md, encoding="utf-8")
results["parsed"] += 1
# Save hash after successful processing
if file_hash:
self._save_hash(pdf_path, file_hash)
else:
results["errors"] += 1
# Clean up memory every 10 files
if (i + 1) % 10 == 0:
gc.collect()
self.logger.info(f"{i+1}/{len(pdf_files)} (skipped: {results['skipped']})")
# Save hash index after processing
self.hasher.save_processed_index(str(self.hash_index_path), self.hash_index)
self.logger.info(f"Done: {results['parsed']} processed, {results['skipped']} skipped, {results['errors']} errors")
return results
|