|
|
""" |
|
|
PaddleOCR Engine |
|
|
|
|
|
High-accuracy OCR using PaddleOCR. |
|
|
Supports detection, recognition, and angle classification. |
|
|
""" |
|
|
|
|
|
import time |
|
|
from typing import List, Optional, Tuple |
|
|
import numpy as np |
|
|
from loguru import logger |
|
|
|
|
|
from .base import OCREngine, OCRConfig, OCRResult |
|
|
from ..schemas.core import BoundingBox, OCRRegion |
|
|
|
|
|
|
|
|
# PaddleOCR is an optional dependency: importing this module must not fail
# when the package is missing. HAS_PADDLEOCR records the outcome of the import
# so that PaddleOCREngine.initialize() can refuse to run with a clear error.
try:
    from paddleocr import PaddleOCR
except ImportError:
    HAS_PADDLEOCR = False
    logger.warning(
        "PaddleOCR not installed. Install with: "
        "pip install paddleocr paddlepaddle-gpu (or paddlepaddle for CPU)"
    )
else:
    # Reached only when the import above succeeded.
    HAS_PADDLEOCR = True
|
|
|
|
|
|
|
|
class PaddleOCREngine(OCREngine):
    """
    OCR engine using PaddleOCR.

    Features:
    - High accuracy text detection and recognition
    - Multi-language support
    - GPU acceleration
    - Angle classification for rotated text

    The PaddleOCR model is created lazily, either by an explicit call to
    ``initialize()`` or by the first call to ``recognize()``.
    """

    # Maps the language codes accepted in ``config.languages`` to the ``lang``
    # value handed to the PaddleOCR constructor.
    LANGUAGE_MAP = {
        "en": "en",
        "ch": "ch",
        "chinese_cht": "chinese_cht",
        "fr": "french",
        "german": "german",
        "es": "es",
        "it": "it",
        "pt": "pt",
        "ru": "ru",
        "japan": "japan",
        "korean": "korean",
        "ar": "ar",
        "hi": "hi",
        "latin": "latin",
    }

    def __init__(self, config: Optional[OCRConfig] = None):
        """Initialize PaddleOCR engine.

        Delegates to the base-class constructor; the PaddleOCR model itself
        is not created here (see ``initialize()``).

        Args:
            config: Engine configuration, forwarded unchanged to the base class.
        """
        super().__init__(config)
        self._ocr: Optional[PaddleOCR] = None

    def initialize(self):
        """Initialize PaddleOCR model.

        Does nothing when the engine is already initialized. The first entry
        of ``config.languages`` selects the model language; "en" is used when
        the list is empty. A code that is missing from ``LANGUAGE_MAP`` also
        falls back to "en", and a warning is logged so the substitution is
        visible instead of silently producing English-model output.

        Raises:
            RuntimeError: If the ``paddleocr`` package could not be imported.
            Exception: Any error raised while constructing PaddleOCR is
                logged and re-raised unchanged.
        """
        if not HAS_PADDLEOCR:
            raise RuntimeError(
                "PaddleOCR not installed. Install with: "
                "pip install paddleocr paddlepaddle-gpu"
            )

        if self._initialized:
            return

        logger.info("Initializing PaddleOCR engine...")

        # Only the first configured language is used.
        lang = self.config.languages[0] if self.config.languages else "en"
        paddle_lang = self.LANGUAGE_MAP.get(lang)
        if paddle_lang is None:
            logger.warning(
                f"Unsupported OCR language '{lang}'; falling back to 'en'"
            )
            paddle_lang = "en"

        try:
            self._ocr = PaddleOCR(
                use_angle_cls=self.config.use_angle_cls,
                lang=paddle_lang,
                use_gpu=self.config.use_gpu,
                # Hard-coded here rather than read from the config.
                gpu_mem=500,
                det_db_thresh=self.config.det_db_thresh,
                det_db_box_thresh=self.config.det_db_box_thresh,
                rec_batch_num=self.config.rec_batch_num,
                drop_score=self.config.drop_score,
                show_log=False,
            )
            self._initialized = True
            logger.info(f"PaddleOCR initialized (lang={paddle_lang}, gpu={self.config.use_gpu})")

        except Exception as e:
            logger.error(f"Failed to initialize PaddleOCR: {e}")
            raise

    def recognize(
        self,
        image: np.ndarray,
        page_number: int = 0,
    ) -> OCRResult:
        """
        Perform OCR on an image using PaddleOCR.

        The engine is initialized on first use. Errors raised by
        ``initialize()`` propagate to the caller; errors raised during
        recognition itself are logged and reported through a result with
        ``success=False`` and ``error`` set to the exception text.

        Args:
            image: Image as numpy array (RGB, HWC format). The first two
                entries of ``image.shape`` are used as (height, width).
                NOTE(review): the array is handed to PaddleOCR unchanged;
                confirm which channel order it expects for ndarray input.
            page_number: Page number for multi-page documents

        Returns:
            OCRResult with recognized text and regions. Lines whose
            confidence is below ``config.min_confidence`` are dropped;
            ``full_text`` joins the kept lines with newlines and
            ``confidence_avg`` is their mean confidence (0.0 when no line
            was kept).
        """
        if not self._initialized:
            self.initialize()

        start_time = time.time()

        try:
            results = self._ocr.ocr(image, cls=self.config.use_angle_cls)

            regions = []
            all_texts = []
            total_confidence = 0.0

            # Only the first entry of the result list is read; it may be
            # empty/None, in which case no regions are produced.
            if results and results[0]:
                for idx, line in enumerate(results[0]):
                    if line is None:
                        continue

                    box_points = line[0]
                    text, raw_confidence = line[1]

                    # Normalize once so the region, the threshold comparison
                    # and the running total all use a plain Python float.
                    confidence = float(raw_confidence)

                    if confidence < self.config.min_confidence:
                        continue

                    bbox = self._polygon_to_bbox(box_points, image.shape[:2])
                    polygon = [(float(p[0]), float(p[1])) for p in box_points]

                    region = OCRRegion(
                        text=text,
                        confidence=confidence,
                        bbox=bbox,
                        polygon=polygon,
                        page=page_number,
                        # Index within the raw PaddleOCR output, so ids of
                        # skipped lines leave gaps.
                        line_id=idx,
                        engine="paddleocr",
                    )
                    regions.append(region)
                    all_texts.append(text)
                    total_confidence += confidence

            processing_time = (time.time() - start_time) * 1000

            return OCRResult(
                regions=regions,
                full_text="\n".join(all_texts),
                confidence_avg=total_confidence / len(regions) if regions else 0.0,
                processing_time_ms=processing_time,
                engine="paddleocr",
                success=True,
            )

        except Exception as e:
            logger.error(f"PaddleOCR recognition failed: {e}")
            return OCRResult(
                regions=[],
                full_text="",
                confidence_avg=0.0,
                processing_time_ms=(time.time() - start_time) * 1000,
                engine="paddleocr",
                success=False,
                error=str(e),
            )

    def _polygon_to_bbox(
        self,
        points: List[List[float]],
        image_shape: Tuple[int, int],
    ) -> BoundingBox:
        """Convert polygon points to bounding box.

        The box is the axis-aligned extent of ``points``; the minimum corner
        is clamped to be non-negative and the maximum corner is clamped to the
        image size. All four coordinates are returned as plain floats, matching
        the float-cast polygon built in ``recognize()``.

        Args:
            points: Polygon vertices as ``[x, y]`` pairs in pixel coordinates.
            image_shape: ``(height, width)`` of the image.

        Returns:
            A non-normalized BoundingBox carrying the page width and height.

        Raises:
            ValueError: If ``points`` is empty (raised by ``min``/``max``).
        """
        x_coords = [float(p[0]) for p in points]
        y_coords = [float(p[1]) for p in points]

        height, width = image_shape

        return BoundingBox(
            x_min=max(0.0, min(x_coords)),
            y_min=max(0.0, min(y_coords)),
            x_max=min(float(width), max(x_coords)),
            y_max=min(float(height), max(y_coords)),
            normalized=False,
            page_width=width,
            page_height=height,
        )

    def get_supported_languages(self) -> List[str]:
        """Return list of supported language codes (the keys of ``LANGUAGE_MAP``)."""
        return list(self.LANGUAGE_MAP)

    def recognize_with_structure(
        self,
        image: np.ndarray,
        page_number: int = 0,
    ) -> Tuple[OCRResult, Optional[dict]]:
        """
        Perform OCR with structure analysis (tables, layout).

        Structure analysis is not implemented in this method yet: it runs
        plain ``recognize()`` and always returns ``None`` as the structure
        information.

        Args:
            image: Image as numpy array
            page_number: Page number

        Returns:
            Tuple of (OCRResult, structure_info), where structure_info is
            currently always ``None``.
        """
        ocr_result = self.recognize(image, page_number)

        # Placeholder until structure analysis is wired in.
        structure_info = None

        return ocr_result, structure_info
|
|
|