|
|
""" |
|
|
Tesseract OCR Engine |
|
|
|
|
|
Fallback OCR engine using Tesseract. |
|
|
Provides broad language support and is widely available. |
|
|
""" |
|
|
|
|
|
import time |
|
|
from typing import List, Optional, Dict, Any |
|
|
import numpy as np |
|
|
from loguru import logger |
|
|
|
|
|
from .base import OCREngine, OCRConfig, OCRResult |
|
|
from ..schemas.core import BoundingBox, OCRRegion |
|
|
|
|
|
|
|
|
# Optional dependency guard: pytesseract and Pillow are imported lazily so the
# module can still be imported when they are missing. HAS_TESSERACT records the
# outcome; TesseractOCREngine.initialize() checks it before touching either name.
try:
    import pytesseract
    from PIL import Image

    HAS_TESSERACT = True
except ImportError as exc:
    HAS_TESSERACT = False
    # Include the underlying error: this guard also covers the Pillow import,
    # so the message alone cannot tell which package is actually missing.
    logger.warning(
        f"pytesseract not installed ({exc}). Install with: pip install pytesseract. "
        "Also install Tesseract: apt-get install tesseract-ocr"
    )
|
|
|
|
|
|
|
|
class TesseractOCREngine(OCREngine):
    """
    OCR engine using Tesseract.

    Features:
    - Broad language support (100+ languages)
    - Mature and well-tested
    - No GPU required
    - Page segmentation modes for different layouts
    """

    # Maps the language identifiers accepted in ``config.languages`` to the
    # language codes passed to Tesseract via ``lang=``.
    LANGUAGE_MAP = {
        "en": "eng",
        "ch": "chi_sim",
        "chinese_cht": "chi_tra",
        "fr": "fra",
        "german": "deu",
        "es": "spa",
        "it": "ita",
        "pt": "por",
        "ru": "rus",
        "japan": "jpn",
        "korean": "kor",
        "ar": "ara",
        "hi": "hin",
        "latin": "lat",
    }

    # Page segmentation modes (values for Tesseract's ``--psm`` option).
    # Only PSM_AUTO is used by ``_build_config``.
    PSM_AUTO = 3
    PSM_SINGLE_BLOCK = 6
    PSM_SINGLE_LINE = 7
    PSM_SPARSE = 11

    def __init__(self, config: Optional[OCRConfig] = None):
        """Initialize Tesseract OCR engine.

        Args:
            config: OCR configuration, forwarded unchanged to the base
                ``OCREngine`` constructor.
        """
        super().__init__(config)
        # Not read by any method of this class.
        self._tesseract_cmd: Optional[str] = None

    def initialize(self):
        """Initialize Tesseract engine.

        Checks that the Python bindings were imported and that the Tesseract
        binary answers a version query, then marks the engine as initialized.
        Calling it again after a successful initialization is a no-op.

        Raises:
            RuntimeError: If the optional imports failed or the Tesseract
                version query raises.
        """
        if not HAS_TESSERACT:
            raise RuntimeError(
                "pytesseract not installed. Install with: pip install pytesseract. "
                "Also install Tesseract: apt-get install tesseract-ocr"
            )

        if self._initialized:
            return

        logger.info("Initializing Tesseract OCR engine...")

        try:
            version = pytesseract.get_tesseract_version()
        except Exception as e:
            logger.error(f"Tesseract not properly installed: {e}")
            # Chain the original exception so the root cause stays visible.
            raise RuntimeError(
                f"Tesseract not properly installed: {e}. "
                "Install with: apt-get install tesseract-ocr"
            ) from e

        logger.info(f"Tesseract version: {version}")
        self._initialized = True

    def recognize(
        self,
        image: np.ndarray,
        page_number: int = 0,
    ) -> OCRResult:
        """
        Perform OCR on an image using Tesseract.

        Args:
            image: Image as numpy array (RGB, HWC format)
            page_number: Page number for multi-page documents

        Returns:
            OCRResult with recognized text and regions. If anything fails
            during recognition, the result has ``success=False``, no regions,
            empty text and ``error`` set to the exception message.

        Raises:
            RuntimeError: Only from ``initialize()`` when the engine cannot
                be set up; recognition errors are reported in the result.
        """
        if not self._initialized:
            self.initialize()

        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # wall-clock adjustments.
        start_time = time.perf_counter()

        try:
            pil_image = Image.fromarray(image)
            lang = self._get_tesseract_lang()
            custom_config = self._build_config()

            data = pytesseract.image_to_data(
                pil_image,
                lang=lang,
                config=custom_config,
                output_type=pytesseract.Output.DICT,
            )

            height, width = image.shape[:2]
            regions, confidences = self._build_regions(
                data, width, height, page_number
            )

            # Second Tesseract pass: the page text is taken from
            # image_to_string, so it is unaffected by the min_confidence
            # filter applied to the regions.
            full_text = pytesseract.image_to_string(
                pil_image,
                lang=lang,
                config=custom_config,
            )

            confidence_avg = (
                sum(confidences) / len(confidences) if confidences else 0.0
            )

            return OCRResult(
                regions=regions,
                full_text=full_text.strip(),
                confidence_avg=confidence_avg,
                processing_time_ms=(time.perf_counter() - start_time) * 1000,
                engine="tesseract",
                success=True,
            )

        except Exception as e:
            logger.error(f"Tesseract recognition failed: {e}")
            return OCRResult(
                regions=[],
                full_text="",
                confidence_avg=0.0,
                processing_time_ms=(time.perf_counter() - start_time) * 1000,
                engine="tesseract",
                success=False,
                error=str(e),
            )

    def _build_regions(
        self,
        data: Dict[str, Any],
        page_width: int,
        page_height: int,
        page_number: int,
    ) -> tuple:
        """Turn the column lists from ``image_to_data`` into word regions.

        Args:
            data: Dict of equally indexed column lists ('text', 'conf',
                'block_num', 'par_num', 'line_num', 'left', 'top', 'width',
                'height').
            page_width: Image width in pixels, stored on each bounding box.
            page_height: Image height in pixels, stored on each bounding box.
            page_number: Page number stored on each region.

        Returns:
            Tuple ``(regions, confidences)``: the kept ``OCRRegion`` objects
            and their confidences (0.0-1.0) in the same order.
        """
        regions: List[OCRRegion] = []
        confidences: List[float] = []

        texts = data['text']
        # Tolerate a missing paragraph column by treating every row as
        # belonging to paragraph 0.
        par_nums = data.get('par_num') or [0] * len(texts)
        rows = zip(
            texts,
            data['conf'],
            data['block_num'],
            par_nums,
            data['line_num'],
            data['left'],
            data['top'],
            data['width'],
            data['height'],
        )

        min_confidence = self.config.min_confidence
        current_line_id = None
        word_id = 0

        for raw_text, raw_conf, block_num, par_num, line_num, x, y, w, h in rows:
            text = raw_text.strip()
            if not text:
                continue

            # Confidence may arrive as int, float or a numeric string;
            # float() accepts all of them, unlike int() on e.g. "96.06".
            try:
                conf = float(raw_conf)
            except (TypeError, ValueError):
                continue

            # Negative confidence marks rows that carry no recognized word.
            if conf < 0:
                continue

            confidence = conf / 100.0
            if confidence < min_confidence:
                continue

            # line_num restarts in every paragraph, so the paragraph number
            # must be part of the id to keep lines of one block distinct.
            line_id = block_num * 1_000_000 + par_num * 1000 + line_num

            # word_id counts the kept words within the current line.
            if line_id != current_line_id:
                current_line_id = line_id
                word_id = 0
            else:
                word_id += 1

            bbox = BoundingBox(
                x_min=float(x),
                y_min=float(y),
                x_max=float(x + w),
                y_max=float(y + h),
                normalized=False,
                page_width=page_width,
                page_height=page_height,
            )

            regions.append(
                OCRRegion(
                    text=text,
                    confidence=confidence,
                    bbox=bbox,
                    page=page_number,
                    line_id=line_id,
                    word_id=word_id,
                    engine="tesseract",
                )
            )
            confidences.append(confidence)

        return regions, confidences

    def _get_tesseract_lang(self) -> str:
        """Get Tesseract language string from config.

        Identifiers found in ``LANGUAGE_MAP`` are translated; identifiers
        that already are one of the mapped Tesseract codes (e.g. "deu") are
        passed through. Anything else falls back to "eng" with a warning.

        Returns:
            The distinct Tesseract codes joined with "+", in first-seen
            order, or "eng" when no language is configured.
        """
        known_codes = set(self.LANGUAGE_MAP.values())
        langs: List[str] = []

        for lang in self.config.languages or []:
            if lang in self.LANGUAGE_MAP:
                tess_lang = self.LANGUAGE_MAP[lang]
            elif lang in known_codes:
                tess_lang = lang
            else:
                logger.warning(
                    f"Unsupported OCR language '{lang}', falling back to 'eng'"
                )
                tess_lang = "eng"

            if tess_lang not in langs:
                langs.append(tess_lang)

        return "+".join(langs) if langs else "eng"

    def _build_config(self) -> str:
        """Build Tesseract config string.

        Returns:
            Space-separated command-line options: the automatic page
            segmentation mode, ``--oem 3``, and, when
            ``config.return_word_boxes`` is set,
            ``-c preserve_interword_spaces=1``.
        """
        config_parts = [
            f"--psm {self.PSM_AUTO}",
            "--oem 3",
        ]

        if self.config.return_word_boxes:
            config_parts.append("-c preserve_interword_spaces=1")

        return " ".join(config_parts)

    def get_supported_languages(self) -> List[str]:
        """Return list of supported language codes (keys of ``LANGUAGE_MAP``)."""
        return list(self.LANGUAGE_MAP)

    def get_installed_languages(self) -> List[str]:
        """Get list of languages installed in Tesseract.

        Returns:
            The list reported by ``pytesseract.get_languages()``, or
            ``["eng"]`` if that query raises.

        Raises:
            RuntimeError: From ``initialize()`` when the engine cannot be
                set up.
        """
        if not self._initialized:
            self.initialize()

        try:
            return pytesseract.get_languages()
        except Exception as e:
            logger.warning(f"Could not get installed languages: {e}")
            return ["eng"]

    def recognize_with_hocr(
        self,
        image: np.ndarray,
        page_number: int = 0,
    ) -> tuple:
        """
        Perform OCR and return hOCR format for detailed layout.

        Args:
            image: Image as numpy array
            page_number: Page number

        Returns:
            Tuple of (OCRResult, hOCR string). The second element is ``None``
            when hOCR generation fails.

        Raises:
            RuntimeError: From ``initialize()`` when the engine cannot be
                set up.
        """
        if not self._initialized:
            self.initialize()

        ocr_result = self.recognize(image, page_number)

        # Everything needed for the hOCR pass is built inside the try block so
        # a bad image is reported the same way recognize() reports it: through
        # the result, not through an exception.
        try:
            hocr = pytesseract.image_to_pdf_or_hocr(
                Image.fromarray(image),
                lang=self._get_tesseract_lang(),
                config=self._build_config(),
                extension='hocr',
            )
            return ocr_result, hocr.decode('utf-8')
        except Exception as e:
            logger.warning(f"Failed to generate hOCR: {e}")
            return ocr_result, None
|
|
|