Spaces:

phamha
/

engineering-drawing

Sleeping

App Files Files Community

Harry Pham committed on Apr 2

Commit

f69131e

1 Parent(s): 1813932

update OCR

Browse files

Files changed (1) hide show

src/inference.py +614 -275

src/inference.py CHANGED Viewed

@@ -1,315 +1,665 @@
 # src/inference.py
-# ── Patch torch.load — DÒNG ĐẦU TIÊN ──────────────────────
 import torch
 _orig_torch_load = torch.load
 def _patched_load(*args, **kwargs):
     kwargs.setdefault("weights_only", False)
     return _orig_torch_load(*args, **kwargs)
 torch.load = _patched_load
-# ───────────────────────────────────────────────────────────
 import cv2
 import json
 import numpy as np
 from pathlib import Path
-from PIL import Image
 from ultralytics import RTDETR
-# ── Device ──────────────────────────────────────────────────
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"[INFO] Device: {DEVICE}")
-# ── Class config ─────────────────────────────────────────────
 CLASS_NAMES   = ["note", "part-drawing", "table"]
-CLASS_DISPLAY = {
-    "note":         "Note",
-    "part-drawing": "PartDrawing",
-    "table":        "Table",
-}
-COLORS = {
-    "note":         (0,  165, 255),
-    "part-drawing": (0,  200,   0),
-    "table":        (0,   0,  220),
-}
-# ─────────────────────────────────────────────────────────────
-# DETECTION MODEL
-# ─────────────────────────────────────────────────────────────
-_det_model = None
-def get_det_model(checkpoint: str = "best.pt") -> RTDETR:
     global _det_model
     if _det_model is None:
-        print(f"[INFO] Loading RT-DETR: {checkpoint}")
         _det_model = RTDETR(checkpoint)
     return _det_model
-# ─────────────────────────────────────────────────────────────
-# TrOCR  — engine chính cho handwritten text
-# microsoft/trocr-large-handwritten  (tốt nhất, ~1.3GB)
-# microsoft/trocr-base-handwritten   (nhỏ hơn, ~400MB)
-# ─────────────────────────────────────────────────────────────
-_trocr_processor = None
-_trocr_model     = None
-TROCR_MODEL_ID   = "microsoft/trocr-large-handwritten"
-def get_trocr():
-    global _trocr_processor, _trocr_model
-    if _trocr_processor is None:
-        from transformers import TrOCRProcessor, VisionEncoderDecoderModel
-        print(f"[INFO] Loading TrOCR ({TROCR_MODEL_ID})...")
-        _trocr_processor = TrOCRProcessor.from_pretrained(TROCR_MODEL_ID)
-        _trocr_model     = VisionEncoderDecoderModel.from_pretrained(TROCR_MODEL_ID)
-        _trocr_model.to(DEVICE)
-        _trocr_model.eval()
-        print("[INFO] TrOCR ready.")
-    return _trocr_processor, _trocr_model
-def trocr_predict_line(pil_img: Image.Image) -> str:
     """
-    Chạy TrOCR trên 1 dòng ảnh PIL (RGB).
-    TrOCR được train theo từng dòng text — không truyền cả trang.
     """
-    processor, model = get_trocr()
-    pixel_values = processor(images=pil_img, return_tensors="pt").pixel_values
-    pixel_values = pixel_values.to(DEVICE)
-    with torch.no_grad():
-        generated_ids = model.generate(
-            pixel_values,
-            max_new_tokens=128,
         )
-    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    return text.strip()
-# ─────────────────────────────────────────────────────────────
-# EasyOCR  — fallback + text detection (tìm vị trí dòng text)
-# ─────────────────────────────────────────────────────────────
-_easy_reader = None
-def get_easy_reader():
-    global _easy_reader
-    if _easy_reader is None:
         import easyocr
-        print("[INFO] Loading EasyOCR (vi + en)...")
-        _easy_reader = easyocr.Reader(["vi", "en"], gpu=False, verbose=False)
-    return _easy_reader
-# ─────────────────────────────────────────────────────────────
-# PREPROCESSING
-# ─────────────────────────────────────────────────────────────
-def preprocess_for_ocr(img_bgr: np.ndarray) -> np.ndarray:
-    """Tăng chất lượng ảnh scan/bản vẽ. Trả về BGR."""
     h, w = img_bgr.shape[:2]
-    # Upscale nếu nhỏ — TrOCR cần ít nhất 384px chiều cao
-    if w < 1000:
-        scale   = 1000 / w
-        img_bgr = cv2.resize(img_bgr,
-                             (int(w * scale), int(h * scale)),
                              interpolation=cv2.INTER_CUBIC)
-    gray  = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
-    # CLAHE — cân bằng histogram cục bộ
-    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
-    gray  = clahe.apply(gray)
-    # Bilateral filter — giữ cạnh sắc, giảm nhiễu
-    gray = cv2.bilateralFilter(gray, 9, 75, 75)
-    # Sharpen
-    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
-    gray   = cv2.filter2D(gray, -1, kernel)
-    return cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
-def crop_text_lines(img_bgr: np.ndarray) -> list:
     """
-    Dùng EasyOCR để detect vị trí các dòng text,
-    sau đó crop từng dòng để truyền vào TrOCR.
-    Trả về list of (pil_crop, bbox, easy_text, easy_conf).
     """
-    reader  = get_easy_reader()
-    results = reader.readtext(
-        img_bgr,
-        detail=1,
-        paragraph=False,
-        width_ths=0.8,
-        height_ths=0.8,
-    )
-    line_crops = []
-    for (pts, easy_text, easy_conf) in results:
-        if easy_conf < 0.1 or not easy_text.strip():
-            continue
-        # Bounding box của dòng
-        xs = [int(p[0]) for p in pts]
-        ys = [int(p[1]) for p in pts]
-        x1, x2 = max(0, min(xs) - 4), min(img_bgr.shape[1], max(xs) + 4)
-        y1, y2 = max(0, min(ys) - 4), min(img_bgr.shape[0], max(ys) + 4)
-        if x2 <= x1 or y2 <= y1:
-            continue
-        crop_bgr = img_bgr[y1:y2, x1:x2]
-        crop_rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
-        pil_crop = Image.fromarray(crop_rgb)
-        cx = (x1 + x2) / 2
-        cy = (y1 + y2) / 2
-        line_crops.append({
-            "pil":       pil_crop,
-            "x":         cx,
-            "y":         cy,
-            "easy_text": easy_text,
-            "easy_conf": easy_conf,
-        })
-    return line_crops
-# ─────────────────────────────────────────────────────────────
-# OCR PIPELINE: kết hợp EasyOCR detect + TrOCR recognize
-# ─────────────────────────────────────────────────────────────
-def hybrid_ocr_lines(img_bgr: np.ndarray, conf_threshold: float = 0.3) -> list:
     """
-    1. EasyOCR detect vị trí từng dòng text
-    2. Crop từng dòng → TrOCR nhận dạng
-    3. Nếu TrOCR output quá ngắn/rỗng → giữ EasyOCR text
-    Trả về list of {"text", "x", "y"}
     """
-    img_proc   = preprocess_for_ocr(img_bgr)
-    line_crops = crop_text_lines(img_proc)
-    items = []
-    for lc in line_crops:
-        trocr_text = ""
         try:
-            trocr_text = trocr_predict_line(lc["pil"])
         except Exception as e:
-            print(f"[WARN] TrOCR error: {e}")
-        # Chọn text tốt hơn giữa TrOCR và EasyOCR
-        # TrOCR ưu tiên nếu có output đủ dài
-        if len(trocr_text) >= max(2, len(lc["easy_text"]) * 0.4):
-            final_text = trocr_text
-        else:
-            final_text = lc["easy_text"]
-        if final_text.strip():
             items.append({
-                "text": final_text.strip(),
-                "x":    lc["x"],
-                "y":    lc["y"],
             })
-    return items
-# ─────────────────────────────────────────────────────────────
-# GROUP ROWS (dùng cho Table)
-# ─────────────────────────────────────────────────────────────
-def group_into_rows(items: list) -> list:
-    """Group các text item theo hàng dựa vào y_center."""
     if not items:
         return []
-    items = sorted(items, key=lambda x: x["y"])
-    y_vals = [it["y"] for it in items]
     if len(y_vals) > 1:
-        gaps   = [y_vals[i+1] - y_vals[i] for i in range(len(y_vals) - 1)]
-        thresh = max(8, np.median(gaps) * 0.6)
     else:
         thresh = 12
-    rows, cur = [], [items[0]]
-    for item in items[1:]:
-        if item["y"] - cur[-1]["y"] < thresh:
-            cur.append(item)
         else:
-            cur.sort(key=lambda x: x["x"])
-            rows.append([i["text"] for i in cur])
-            cur = [item]
-    cur.sort(key=lambda x: x["x"])
-    rows.append([i["text"] for i in cur])
-    return rows
-# ─────────────────────────────────────────────────────────────
-# PUBLIC OCR FUNCTIONS
-# ─────────────────────────────────────────────────────────────
-def ocr_note(img_path: str) -> str:
-    """OCR vùng Note → plain text, preserve line order."""
-    img = cv2.imread(img_path)
-    if img is None:
-        return ""
-    items = hybrid_ocr_lines(img)
-    # Sắp xếp theo y (trên xuống dưới), x (trái sang phải)
-    items.sort(key=lambda x: (round(x["y"] / 15), x["x"]))
-    return "\n".join(it["text"] for it in items)
-def ocr_table(img_path: str) -> dict:
-    """OCR vùng Table → giữ cấu trúc rows × cols."""
-    img = cv2.imread(img_path)
-    if img is None:
-        return {"rows": [], "text": ""}
-    items = hybrid_ocr_lines(img)
-    if not items:
-        return {"rows": [], "text": ""}
-    rows = group_into_rows(items)
-    text = "\n".join(" | ".join(r) for r in rows)
-    return {"rows": rows, "text": text}
-# ─────────────────────────────────────────────────────────────
 # MAIN PIPELINE
-# ─────────────────────────────────────────────────────────────
-def run_pipeline(
-    image_path:  str,
-    output_dir:  str   = "outputs",
-    checkpoint:  str   = "best.pt",
-    conf_thresh: float = 0.3,
-) -> tuple:
-    """
-    Full pipeline: detect → crop → OCR → JSON + visualize.
-    Returns (result_dict, vis_image_path).
-    """
     image_path = str(image_path)
     img_name   = Path(image_path).name
     stem       = Path(image_path).stem
     crop_dir   = Path(output_dir) / stem / "crops"
     crop_dir.mkdir(parents=True, exist_ok=True)
-    # 1. Detect
     model   = get_det_model(checkpoint)
-    results = model(
-        image_path,
-        imgsz=1024,
-        conf=conf_thresh,
-        iou=0.5,
-        device=DEVICE,
-        verbose=False,
-    )
     img_bgr = cv2.imread(image_path)
     if img_bgr is None:
-        raise ValueError(f"Cannot read image: {image_path}")
     objects = []
     for i, box in enumerate(results[0].boxes):
         x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
         cls_idx  = int(box.cls[0])
@@ -317,66 +667,55 @@ def run_pipeline(
         cls_raw  = CLASS_NAMES[cls_idx]
         cls_show = CLASS_DISPLAY[cls_raw]
-        # 2. Crop
-        pad  = 6
-        crop = img_bgr[
-            max(0, y1 - pad): min(img_bgr.shape[0], y2 + pad),
-            max(0, x1 - pad): min(img_bgr.shape[1], x2 + pad),
-        ]
         crop_path = str(crop_dir / f"{cls_show}_{i+1}.jpg")
-        cv2.imwrite(crop_path, crop, [cv2.IMWRITE_JPEG_QUALITY, 95])
-        # 3. OCR
         ocr_content = None
         if cls_raw == "note":
-            print(f"[OCR] Note #{i+1}...")
-            ocr_content = ocr_note(crop_path)
-            print(f"      → {repr(ocr_content[:100]) if ocr_content else 'EMPTY'}")
         elif cls_raw == "table":
-            print(f"[OCR] Table #{i+1}...")
-            ocr_content = ocr_table(crop_path)
-            preview = ocr_content.get("text", "")[:100]
             print(f"      → {repr(preview) if preview else 'EMPTY'}")
         objects.append({
-            "id":          i + 1,
-            "class":       cls_show,
-            "confidence":  conf_val,
-            "bbox":        {"x1": x1, "y1": y1, "x2": x2, "y2": y2},
-            "crop_path":   crop_path,
             "ocr_content": ocr_content,
         })
-        # 4. Vẽ bbox
         color = COLORS[cls_raw]
         cv2.rectangle(img_bgr, (x1, y1), (x2, y2), color, 2)
         label = f"{cls_show} {conf_val:.2f}"
-        (tw, th), _ = cv2.getTextSize(
-            label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
-        cv2.rectangle(img_bgr,
-                      (x1, y1 - th - 10), (x1 + tw + 8, y1),
-                      color, -1)
-        cv2.putText(img_bgr, label, (x1 + 4, y1 - 4),
                     cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
-    # 5. Lưu visualize
     vis_path = str(Path(output_dir) / stem / "result_vis.jpg")
     cv2.imwrite(vis_path, img_bgr)
-    # 6. Lưu JSON
     result    = {"image": img_name, "objects": objects}
     json_path = str(Path(output_dir) / stem / "result.json")
     with open(json_path, "w", encoding="utf-8") as f:
         json.dump(result, f, ensure_ascii=False, indent=2)
-    print(f"\n[✓] {len(objects)} objects | vis→{vis_path} | json→{json_path}")
     return result, vis_path
-# ── CLI ──────────────────────────────────────────────────────
 if __name__ == "__main__":
     import sys
     img = sys.argv[1] if len(sys.argv) > 1 else "test.jpg"
-    result, _ = run_pipeline(img)
     print(json.dumps(result, ensure_ascii=False, indent=2))

 # src/inference.py
# Monkeypatch ``torch.load`` so that calls which do not pass ``weights_only``
# get ``weights_only=False``; an explicitly supplied value is left untouched
# because ``dict.setdefault`` only fills in a missing key.
# NOTE(review): with ``weights_only=False`` ``torch.load`` performs full pickle
# deserialization, which can execute arbitrary code. Only load checkpoints
# that come from a trusted source.
# NOTE(review): presumably needed so the detector checkpoint can be loaded on
# newer PyTorch releases — confirm against the pinned torch/ultralytics versions.
import torch
_orig_torch_load = torch.load
def _patched_load(*args, **kwargs):
    """Forward to the original ``torch.load`` with ``weights_only`` defaulting to False."""
    kwargs.setdefault("weights_only", False)
    return _orig_torch_load(*args, **kwargs)
torch.load = _patched_load
 import cv2
 import json
 import numpy as np
 from pathlib import Path
 from ultralytics import RTDETR
+import re
# Inference device: "cuda" when torch reports CUDA as available, otherwise "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[INFO] Device: {DEVICE}")
# Raw detector class names; looked up by the integer class index of a detection.
CLASS_NAMES   = ["note", "part-drawing", "table"]
# Display label for each raw class name (used in crop file names and JSON output).
CLASS_DISPLAY = {"note": "Note", "part-drawing": "PartDrawing", "table": "Table"}
# Per-class drawing colour.
# NOTE(review): presumably BGR, since the tuples are handed to OpenCV drawing
# calls — confirm.
COLORS        = {"note": (0,165,255), "part-drawing": (0,200,0), "table": (0,0,220)}
# Lazily initialised singletons; each stays None until its loader function
# has successfully created the corresponding object.
_det_model  = None
_ocr_paddle = None
_ocr_paddle_en = None
_ocr_easyocr = None
+# ============================================================
+# MODEL LOADERS
+# ============================================================
# Checkpoint path that the cached ``_det_model`` was loaded from.
_det_model_checkpoint = None


def get_det_model(checkpoint="best.pt"):
    """Return the RT-DETR detection model for ``checkpoint``, loading it lazily.

    The most recently loaded model is cached at module level. The cache is
    keyed by the checkpoint path, so asking for a different checkpoint loads
    that checkpoint instead of silently returning the previously cached model.

    Args:
        checkpoint: Path (or name) of the weights file passed to ``RTDETR``.

    Returns:
        The loaded ``RTDETR`` model instance.
    """
    global _det_model, _det_model_checkpoint
    requested = str(checkpoint)
    if _det_model is None or _det_model_checkpoint != requested:
        print(f"[INFO] Loading detection model: {checkpoint}")
        _det_model = RTDETR(checkpoint)
        _det_model_checkpoint = requested
    return _det_model
# One PaddleOCR reader per language code, created on first use.
_paddle_readers = {}


def get_paddle_reader(lang='vi'):
    """Return a cached PaddleOCR (PP-OCRv4) reader for ``lang``, or None on failure.

    Readers are cached per language code, so requesting a third language no
    longer returns whichever non-English reader happened to be created first.
    The legacy module globals ``_ocr_paddle`` ('vi') and ``_ocr_paddle_en``
    ('en') are kept in sync for code that inspects them.

    Tuning choices (the original author's rationale):
      - ``ocr_version='PP-OCRv4'``: newer recognition/detection models.
      - lower ``det_db_thresh`` / ``det_db_box_thresh``: pick up small or faint text.
      - larger ``det_db_unclip_ratio``: wider text boxes so Vietnamese
        diacritics are not cut off.
      - ``use_dilation=True``: join broken strokes.
      - ``det_db_score_mode='slow'``: more accurate box scoring than 'fast'.

    Args:
        lang: PaddleOCR language code.

    Returns:
        A ``PaddleOCR`` instance, or None when the import or the
        initialisation fails (the failure is printed as a warning and is not
        cached, so a later call tries again).
    """
    global _ocr_paddle, _ocr_paddle_en
    cached = _paddle_readers.get(lang)
    if cached is not None:
        return cached
    try:
        from paddleocr import PaddleOCR
        print(f"[INFO] Initializing PaddleOCR PP-OCRv4 (lang={lang})...")
        reader = PaddleOCR(
            lang=lang,
            use_angle_cls=True,
            use_gpu=(DEVICE == "cuda"),
            show_log=False,
            ocr_version='PP-OCRv4',
            det_db_thresh=0.15,
            det_db_box_thresh=0.2,
            det_db_unclip_ratio=2.0,
            use_dilation=True,
            det_db_score_mode='slow',
            rec_image_shape="3,48,320",
            max_text_length=80,
            rec_batch_num=6,
        )
    except Exception as e:
        # Best-effort loader: callers treat None as "engine unavailable" and
        # fall back to another OCR engine.
        print(f"[WARN] PaddleOCR init failed: {e}")
        return None
    _paddle_readers[lang] = reader
    # Keep the legacy single-slot globals up to date.
    if lang == 'en':
        _ocr_paddle_en = reader
    elif lang == 'vi':
        _ocr_paddle = reader
    return reader
def get_easyocr_reader():
    """Return the shared EasyOCR reader (Vietnamese + English), creating it once.

    The reader runs on the GPU only when ``DEVICE`` is "cuda". Import or
    construction errors propagate to the caller.
    """
    global _ocr_easyocr
    if _ocr_easyocr is not None:
        return _ocr_easyocr
    # Imported lazily so the module can be loaded without EasyOCR installed.
    import easyocr
    languages = ["vi", "en"]
    wants_gpu = (DEVICE == "cuda")
    _ocr_easyocr = easyocr.Reader(languages, gpu=wants_gpu, verbose=False)
    return _ocr_easyocr
+# ============================================================
+# PREPROCESSING — Nguyên tắc: UPSCALE MẠNH, XỬ LÝ NHẸ
+# ============================================================
def _clahe_on_lightness(img_bgr, clip_limit, tile_grid):
    """Apply CLAHE to the L channel in LAB space and return the result as BGR."""
    lab = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2LAB)
    l_chan, a_chan, b_chan = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid)
    l_chan = clahe.apply(l_chan)
    return cv2.cvtColor(cv2.merge([l_chan, a_chan, b_chan]), cv2.COLOR_LAB2BGR)


def preprocess_for_ocr(img_bgr, min_width=1500, mode="note"):
    """Prepare a BGR crop of an engineering drawing for OCR.

    Principle: upscale strongly, process gently. The image stays in colour
    (no grayscale thresholding) so that thin strokes such as Vietnamese
    diacritics survive.

    Steps:
      1. Upscale with cubic interpolation so the width is at least
         ``min_width`` (aspect ratio preserved).
      2. Bilateral filter for edge-preserving denoising.
      3. CLAHE on the L channel of LAB.
      4. ``mode == "note"`` only: a mild sharpening kernel (weights sum to 1).

    Args:
        img_bgr: Image as a BGR ``numpy`` array.
        min_width: Minimum width in pixels after upscaling.
        mode: "note" for the gentle pipeline; any other value selects the
            stronger-contrast table pipeline.

    Returns:
        The processed BGR image. An empty array is returned unchanged instead
        of failing with a division by zero while computing the scale factor.
    """
    if img_bgr.size == 0:
        return img_bgr

    width = img_bgr.shape[1]
    if width < min_width:
        scale = min_width / width
        img_bgr = cv2.resize(img_bgr, None, fx=scale, fy=scale,
                             interpolation=cv2.INTER_CUBIC)

    if mode == "note":
        img_proc = cv2.bilateralFilter(img_bgr, 9, 75, 75)
        img_proc = _clahe_on_lightness(img_proc, 2.0, (8, 8))
        # Mild sharpening; a stronger 3x3 kernel produced artifacts
        # according to the original author.
        kernel = np.array([[0, -0.5, 0],
                           [-0.5, 3, -0.5],
                           [0, -0.5, 0]])
        return cv2.filter2D(img_proc, -1, kernel)

    # Table mode: stronger smoothing and contrast, still BGR, no sharpening.
    img_proc = cv2.bilateralFilter(img_bgr, 11, 80, 80)
    return _clahe_on_lightness(img_proc, 3.0, (4, 4))
def preprocess_grayscale_variant(img_bgr, min_width=1500):
    """Build the binarised variant used as one pass of the multi-pass OCR.

    The image is upscaled to at least ``min_width`` pixels wide, converted to
    grayscale, contrast-equalised with CLAHE and binarised with Otsu's
    automatically chosen global threshold.

    Args:
        img_bgr: Image as a BGR ``numpy`` array.
        min_width: Minimum width in pixels after upscaling.

    Returns:
        The binarised image converted back to 3-channel BGR. An empty array
        is returned unchanged instead of failing with a division by zero
        while computing the scale factor.
    """
    if img_bgr.size == 0:
        return img_bgr

    width = img_bgr.shape[1]
    if width < min_width:
        scale = min_width / width
        img_bgr = cv2.resize(img_bgr, None, fx=scale, fy=scale,
                             interpolation=cv2.INTER_CUBIC)
    gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    gray = clahe.apply(gray)
    # With THRESH_OTSU the threshold argument (0) is ignored and computed
    # from the histogram.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    return cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)
+# ============================================================
+# MULTI-PASS OCR — Thử nhiều cách, chọn kết quả tốt nhất
+# ============================================================
def ocr_single_pass(reader, img_bgr):
    """Run one OCR pass and return ``(texts, average_confidence)``.

    The engine is chosen by duck typing: an object with an ``ocr`` attribute
    is driven as PaddleOCR, anything else as EasyOCR (``readtext``).
    Detections whose text is blank or whose confidence is below the
    engine-specific floor (0.2 for PaddleOCR, 0.15 for EasyOCR) are dropped.
    The confidence is 0.0 when nothing is kept.
    """
    if hasattr(reader, 'ocr'):  # PaddleOCR
        raw = reader.ocr(img_bgr, cls=True)
        if not raw or not raw[0]:
            return [], 0.0
        candidates = [(text, conf) for _box, (text, conf) in raw[0]]
        floor = 0.2
    else:  # EasyOCR
        raw = reader.readtext(img_bgr, detail=1, paragraph=False)
        candidates = [(text, conf) for (_pts, text, conf) in raw]
        floor = 0.15

    kept = [(text.strip(), conf) for text, conf in candidates
            if conf >= floor and text.strip()]
    texts = [text for text, _ in kept]
    confs = [conf for _, conf in kept]
    avg_conf = np.mean(confs) if confs else 0.0
    return texts, avg_conf
def multi_pass_ocr(img_bgr, reader, ocr_type="note"):
    """Run OCR on several preprocessed variants and keep the most confident one.

    Passes:
      1. Gentle colour preprocessing, width >= 1500 px.
      2. Grayscale + Otsu binarisation, width >= 1500 px.
      3. Same as pass 1 but upscaled to width >= 2500 px, for very small text.

    When the input is already at least 2500 px wide, pass 3 would receive
    exactly the same image as pass 1 (neither upscales), so its result is
    reused instead of running the OCR engine a third time.

    Selection is by average confidence only; a pass that keeps a single very
    confident token can win over a pass that read more text.

    Args:
        img_bgr: Image as a BGR ``numpy`` array.
        reader: OCR engine accepted by ``ocr_single_pass``.
        ocr_type: Preprocessing mode forwarded to ``preprocess_for_ocr``.

    Returns:
        ``(texts, confidence)`` of the best pass; ``([], 0.0)`` when no pass
        produced a confidence above zero.
    """
    # Pass 1: gentle colour preprocessing.
    img_v1 = preprocess_for_ocr(img_bgr, min_width=1500, mode=ocr_type)
    texts1, conf1 = ocr_single_pass(reader, img_v1)

    # Pass 2: binarised grayscale variant.
    img_v2 = preprocess_grayscale_variant(img_bgr, min_width=1500)
    texts2, conf2 = ocr_single_pass(reader, img_v2)

    # Pass 3: extra upscale — skipped when it cannot differ from pass 1.
    if img_bgr.shape[1] >= 2500:
        texts3, conf3 = texts1, conf1
    else:
        img_v3 = preprocess_for_ocr(img_bgr, min_width=2500, mode=ocr_type)
        texts3, conf3 = ocr_single_pass(reader, img_v3)

    best_texts = []
    best_conf = 0.0
    # Strict ">" keeps the earliest pass on ties, as before.
    for texts, conf in ((texts1, conf1), (texts2, conf2), (texts3, conf3)):
        if conf > best_conf:
            best_conf = conf
            best_texts = texts

    print(f"      Multi-pass confidences: {conf1:.3f}, {conf2:.3f}, {conf3:.3f} → best={best_conf:.3f}")
    return best_texts, best_conf
+# ============================================================
+# DUAL-ENGINE OCR — PaddleOCR (vi) + PaddleOCR (en), chọn tốt hơn
+# ============================================================
def dual_engine_ocr(img_bgr, ocr_type="note"):
    """OCR with the Vietnamese and English PaddleOCR readers and keep the better one.

    Each available reader is run through ``multi_pass_ocr``; the result with
    the higher confidence wins (Vietnamese is tried first and wins ties).
    EasyOCR is used only when neither PaddleOCR reader could be created.

    Returns:
        ``(texts, confidence)`` of the winning run.
    """
    paddle_vi = get_paddle_reader('vi')
    paddle_en = get_paddle_reader('en')

    if paddle_vi is None and paddle_en is None:
        # Neither PaddleOCR reader is available: fall back to EasyOCR.
        fallback_texts, fallback_conf = multi_pass_ocr(
            img_bgr, get_easyocr_reader(), ocr_type)
        return fallback_texts, fallback_conf

    best_texts, best_conf, best_lang = [], 0.0, ""
    for lang_code, engine in (("vi", paddle_vi), ("en", paddle_en)):
        if not engine:
            continue
        cand_texts, cand_conf = multi_pass_ocr(img_bgr, engine, ocr_type)
        if cand_conf > best_conf:
            best_texts, best_conf, best_lang = cand_texts, cand_conf, lang_code

    print(f"      Best language: {best_lang} (conf={best_conf:.3f})")
    return best_texts, best_conf
+# ============================================================
+# POST-PROCESSING — Sửa lỗi OCR phổ biến
+# ============================================================
def post_process_ocr_text(text):
    """Fix common OCR confusions in engineering-drawing text.

    Corrections, applied in order:
      1. Letter ``O`` between two digits becomes ``0`` (``1O5`` -> ``105``).
      2. Letter ``O`` inside a thread/diameter designation — ``M`` or ``Ø``
         followed by digits and ``O``s with at least one real digit — becomes
         ``0`` (``M1O`` -> ``M10``, ``Ø2O`` -> ``Ø20``). Ordinary words that
         merely start with "MO" (``MOTOR``) are left alone.
      3. ``l`` / ``I`` between two digits becomes ``1`` (``2l5`` -> ``215``).
      4. ``x`` / ``X`` between digits becomes ``×``; every separator in a
         chain is converted (``10x20x30`` -> ``10×20×30``).
      5. Spelling variants of "thép" are normalised to ``Thép`` followed by a
         single space.
      6. Whitespace runs collapse to one space; the result is stripped.

    Args:
        text: Raw OCR text; ``None`` and ``""`` are returned unchanged.

    Returns:
        The corrected text.
    """
    if not text:
        return text

    # O -> 0 between digits.
    text = re.sub(r'(?<=[0-9])O(?=[0-9])', '0', text)

    # O -> 0 inside an M…/Ø… designation. The token must not be glued to a
    # preceding letter, must contain at least one digit, and may only be
    # followed by a non-letter or by the x/X of a pitch such as "M10x1.5".
    text = re.sub(
        r'(?<![^\W\d_])[MØ][0-9O]*[0-9][0-9O]*(?![^\W\d_xX])',
        lambda m: m.group(0).replace('O', '0'),
        text,
    )

    # l / I -> 1 between digits.
    text = re.sub(r'(?<=[0-9])[lI](?=[0-9])', '1', text)

    # x -> × between digits. Lookarounds leave the digits unconsumed, so a
    # digit can sit between two separators and still match both.
    text = re.sub(r'(?<=\d)\s*[xX]\s*(?=\d)', '×', text)

    # Normalise "thep"/"thép" in any letter case.
    text = re.sub(r'[Tt]h[eé]p\s*', 'Thép ', text, flags=re.IGNORECASE)

    # Collapse whitespace.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
+# ============================================================
+# OCR NOTE — Cải thiện
+# ============================================================
def ocr_note(img_path, backend="paddle"):
    """OCR a cropped Note region and return its text, one detection per line.

    The image is read from ``img_path``, recognised with ``dual_engine_ocr``
    and every recognised string is cleaned with ``post_process_ocr_text``;
    strings that end up empty are dropped. ``backend`` is accepted for
    interface compatibility and is not used here.

    Returns:
        The cleaned lines joined with newlines, or "" when the image cannot
        be read.
    """
    image = cv2.imread(img_path)
    if image is None:
        return ""

    raw_lines, _confidence = dual_engine_ocr(image, ocr_type="note")

    cleaned_lines = []
    for raw in raw_lines:
        fixed = post_process_ocr_text(raw)
        if fixed:
            cleaned_lines.append(fixed)
    return "\n".join(cleaned_lines)
+# ============================================================
+# OCR TABLE — Cải thiện với PPStructure
+# ============================================================
# Lazily created PPStructure engine; stays None until a load succeeds.
_pp_structure = None


def get_pp_structure():
    """Return the shared PPStructure table-recognition engine, or None.

    The engine is created on first use. If the import or the construction
    fails, a warning is printed, None is returned and nothing is cached, so
    the next call tries again.
    """
    global _pp_structure
    if _pp_structure is None:
        try:
            from paddleocr import PPStructure
            print("[INFO] Initializing PPStructure for table recognition...")
            engine_options = dict(
                table=True,
                ocr=True,
                lang='vi',
                show_log=False,
                use_gpu=(DEVICE == "cuda"),
                table_char_type='vi',
            )
            _pp_structure = PPStructure(**engine_options)
        except Exception as e:
            print(f"[WARN] PPStructure init failed: {e}")
            return None
    return _pp_structure
def parse_html_table(html_str):
    """Parse an HTML table string into a list of rows of cell texts.

    Both ``<td>`` and ``<th>`` cells are collected, tags may carry attributes
    and may be written in any letter case. Markup nested inside a cell is
    removed, HTML entities are decoded and surrounding whitespace is stripped.
    Rows that contain no cells are skipped.

    Args:
        html_str: HTML fragment containing ``<tr>`` rows; ``None`` or ``""``
            yields an empty list.

    Returns:
        ``list[list[str]]`` — one inner list per table row.
    """
    if not html_str:
        return []

    # Local import: keeps this function self-contained without touching the
    # module's import block.
    import html

    flags = re.DOTALL | re.IGNORECASE
    rows = []
    for tr_body in re.findall(r'<tr\b[^>]*>(.*?)</tr>', html_str, flags):
        # \b stops "<th" from matching "<thead>".
        cells = re.findall(r'<t[dh]\b[^>]*>(.*?)</t[dh]>', tr_body, flags)
        # Strip nested tags first, then decode entities, so an escaped
        # "&lt;b&gt;" is kept as literal text rather than removed as a tag.
        clean_cells = [
            html.unescape(re.sub(r'<[^>]+>', '', cell)).strip()
            for cell in cells
        ]
        if clean_cells:
            rows.append(clean_cells)
    return rows
def _pp_table_rows(result):
    """Return ``(rows, html)`` for the first PPStructure table item that parses, else None."""
    for item in result:
        if item.get('type') != 'table':
            continue
        html = item.get('res', {}).get('html', '')
        rows = parse_html_table(html) if html else []
        if rows:
            rows = [[post_process_ocr_text(cell) for cell in row]
                    for row in rows]
            return rows, html
    return None


def _pp_text_lines(result):
    """Collect the plain text lines of the list-shaped PPStructure results."""
    lines = []
    for item in result:
        res = item.get('res', [])
        if not isinstance(res, list):
            continue
        for line in res:
            if isinstance(line, dict) and 'text' in line:
                lines.append(str(line['text']))
            elif isinstance(line, (list, tuple)) and len(line) >= 2:
                # (box, (text, conf)) or (box, text) style entries.
                text_info = line[1]
                if isinstance(text_info, (list, tuple)):
                    lines.append(str(text_info[0]))
                else:
                    lines.append(str(text_info))
    return lines


def ocr_table(img_path, backend="paddle"):
    """OCR a cropped Table region into rows of cell texts.

    Strategy:
      1. PPStructure table recognition on an image upscaled to at least
         1200 px wide; its HTML output is parsed into rows.
      2. If PPStructure ran but found no table, the plain text lines it
         recognised are returned, one row per line.
      3. Otherwise (engine unavailable, nothing recognised, or an error)
         fall back to ``ocr_table_manual``.

    Every returned cell goes through ``post_process_ocr_text``, and ``text``
    is always the rows joined with newlines (cells joined with " | ").

    Args:
        img_path: Path of the cropped table image.
        backend: Forwarded to the manual fallback.

    Returns:
        ``{"rows": list[list[str]], "text": str}``; the PPStructure table
        path additionally includes ``"html"``. Empty rows/text when the
        image cannot be read.
    """
    img = cv2.imread(img_path)
    if img is None:
        return {"rows": [], "text": ""}

    pp_engine = get_pp_structure()
    if pp_engine is not None:
        try:
            # Upscale small crops before handing them to PPStructure.
            w = img.shape[1]
            if w < 1200:
                scale = 1200 / w
                img_scaled = cv2.resize(img, None, fx=scale, fy=scale,
                                        interpolation=cv2.INTER_CUBIC)
            else:
                img_scaled = img
            result = pp_engine(img_scaled)

            table = _pp_table_rows(result)
            if table is not None:
                rows, html = table
                text = "\n".join(" | ".join(r) for r in rows)
                print(f"      PPStructure: {len(rows)} rows detected")
                return {"rows": rows, "text": text, "html": html}

            # PPStructure ran but found no table: keep its text lines,
            # cleaned like every other path and shaped as one row per line
            # so that "rows" and "text" describe the same layout.
            lines = [post_process_ocr_text(t) for t in _pp_text_lines(result)]
            lines = [t for t in lines if t]
            if lines:
                return {"rows": [[t] for t in lines], "text": "\n".join(lines)}
        except Exception as e:
            # Best effort: any PPStructure failure falls through to the
            # manual strategy.
            print(f"      PPStructure error: {e}, falling back to manual")

    return ocr_table_manual(img, img_path, backend)
def ocr_table_manual(img, img_path, backend="paddle"):
    """Fallback table OCR: find cells from the ruling lines and OCR each cell.

    Cells covering more than 90% of both image dimensions (the table outline)
    and cells narrower or shorter than 15 px are ignored. Each remaining cell
    is cropped with 3 px of padding, recognised with ``ocr_cell_improved``
    and cleaned. If no cells are found, or none yields text, the whole image
    is recognised with ``ocr_table_fullimage``. ``img_path`` is not used here.

    Returns:
        ``{"rows": ..., "text": ...}``.
    """
    cells = detect_table_structure(img)
    if not cells:
        return ocr_table_fullimage(img, backend)

    reader = get_paddle_reader('vi') or get_easyocr_reader()
    img_h, img_w = img.shape[:2]
    pad = 3
    ocr_results = []
    for (x1, y1, x2, y2) in cells:
        cell_w = x2 - x1
        cell_h = y2 - y1
        covers_whole_table = cell_w > img_w * 0.9 and cell_h > img_h * 0.9
        too_small = cell_w < 15 or cell_h < 15
        if covers_whole_table or too_small:
            continue

        top, left = max(0, y1 - pad), max(0, x1 - pad)
        bottom, right = min(img_h, y2 + pad), min(img_w, x2 + pad)
        text = ocr_cell_improved(img[top:bottom, left:right], reader)
        if not text:
            continue
        ocr_results.append({
            "text": post_process_ocr_text(text),
            "x": (x1 + x2) // 2,
            "y": (y1 + y2) // 2,
            "box": (x1, y1, x2, y2)
        })

    if not ocr_results:
        # Cells were found but none produced text: OCR the full image.
        return ocr_table_fullimage(img, backend)

    rows = group_rows(ocr_results, vertical_thresh_ratio=0.5)
    return {
        "rows": rows,
        "text": "\n".join(" | ".join(r) for r in rows)
    }
def ocr_cell_improved(img_cell, reader):
    """OCR one table cell and return the best text found ("" if none).

    Cells narrower than 300 px are upscaled to 300 px wide. Two variants are
    then recognised in turn — a gently enhanced colour image and an Otsu
    binarised image — and the text of the variant with the higher average
    confidence is returned.
    """
    if img_cell.size == 0:
        return ""

    cell_width = img_cell.shape[1]
    if cell_width < 300:
        factor = 300 / cell_width
        img_cell = cv2.resize(img_cell, None, fx=factor, fy=factor,
                              interpolation=cv2.INTER_CUBIC)

    def _color_variant():
        # Edge-preserving denoise, then CLAHE on the LAB lightness channel.
        smoothed = cv2.bilateralFilter(img_cell, 5, 50, 50)
        lab = cv2.cvtColor(smoothed, cv2.COLOR_BGR2LAB)
        l_chan, a_chan, b_chan = cv2.split(lab)
        l_chan = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(4, 4)).apply(l_chan)
        return cv2.cvtColor(cv2.merge([l_chan, a_chan, b_chan]),
                            cv2.COLOR_LAB2BGR)

    def _binary_variant():
        # Global Otsu threshold on the grayscale cell.
        gray = cv2.cvtColor(img_cell, cv2.COLOR_BGR2GRAY)
        _, binary = cv2.threshold(gray, 0, 255,
                                  cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)

    best_text, best_conf = "", 0
    for build_variant in (_color_variant, _binary_variant):
        texts, conf = ocr_single_pass(reader, build_variant())
        combined = " ".join(texts)
        if conf > best_conf and combined.strip():
            best_text, best_conf = combined, conf
    return best_text
def ocr_table_fullimage(img, backend="paddle"):
    """OCR the whole table image without cell splitting and group text into rows.

    The Vietnamese PaddleOCR reader is used when available, EasyOCR otherwise.
    Detections with blank text or confidence below the engine floor (0.2 for
    PaddleOCR, 0.15 for EasyOCR) are skipped. Each kept detection is cleaned
    and positioned at the centre of its box, then ``group_rows`` arranges the
    detections into rows. ``backend`` is not used here.

    Returns:
        ``{"rows": ..., "text": ...}``; both empty when nothing is recognised.
    """
    reader = get_paddle_reader('vi') or get_easyocr_reader()
    img_proc = preprocess_for_ocr(img, min_width=1500, mode="table")

    items = []
    if hasattr(reader, 'ocr'):
        # PaddleOCR: result[0] is a list of [box, (text, conf)].
        result = reader.ocr(img_proc, cls=True)
        detections = result[0] if result and result[0] else []
        for line in detections:
            box, (text, conf) = line
            if conf < 0.2 or not text.strip():
                continue
            items.append({
                "text": post_process_ocr_text(text.strip()),
                "conf": conf,
                "x": np.mean([pt[0] for pt in box]),
                "y": np.mean([pt[1] for pt in box]),
                "box": box
            })
    else:
        # EasyOCR: each entry is (corner_points, text, conf).
        for (pts, text, conf) in reader.readtext(img_proc, detail=1, paragraph=False):
            if conf < 0.15 or not text.strip():
                continue
            items.append({
                "text": post_process_ocr_text(text.strip()),
                "conf": conf,
                "x": sum(p[0] for p in pts) / 4,
                "y": sum(p[1] for p in pts) / 4,
                "box": pts
            })

    if not items:
        return {"rows": [], "text": ""}

    rows = group_rows(items, vertical_thresh_ratio=0.6)
    return {"rows": rows, "text": "\n".join(" | ".join(r) for r in rows)}
+# ============================================================
+# TABLE STRUCTURE DETECTION (giữ nguyên, có cải thiện nhỏ)
+# ============================================================
def detect_table_structure(img_bgr):
    """Find table cells from the ruling lines of a table image.

    The image is inverse-thresholded at 150, long horizontal and vertical
    strokes are isolated with morphological opening, and the bounding
    boxes of the contours of their union are taken as cell candidates.

    Args:
        img_bgr: Table image; converted with ``COLOR_BGR2GRAY``, so a
            BGR image is expected.

    Returns:
        A de-duplicated list of ``(x1, y1, x2, y2)`` tuples sorted by
        top edge, then left edge. A box is kept only if its area lies
        strictly between 0.1% and 85% of the image area and both sides
        exceed 15 px.
    """
    img_h, img_w = img_bgr.shape[:2]
    total_area = img_w * img_h

    grayscale = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    _, ink = cv2.threshold(grayscale, 150, 255, cv2.THRESH_BINARY_INV)

    # Kernel lengths scale with the image so only long strokes survive.
    row_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (max(40, img_w // 15), 1))
    row_lines = cv2.morphologyEx(ink, cv2.MORPH_OPEN, row_kernel,
                                 iterations=2)
    col_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, max(40, img_h // 15)))
    col_lines = cv2.morphologyEx(ink, cv2.MORPH_OPEN, col_kernel,
                                 iterations=2)

    grid = cv2.add(row_lines, col_lines)
    contours, _ = cv2.findContours(grid, cv2.RETR_TREE,
                                   cv2.CHAIN_APPROX_SIMPLE)

    area_floor = total_area * 0.001  # at least 0.1% of the image
    area_ceiling = total_area * 0.85  # below 85%, to skip the whole table
    candidates = [
        (left, top, left + box_w, top + box_h)
        for left, top, box_w, box_h in map(cv2.boundingRect, contours)
        if area_floor < box_w * box_h < area_ceiling
        and box_w > 15 and box_h > 15
    ]
    return sorted(set(candidates), key=lambda rect: (rect[1], rect[0]))
+# ============================================================
+# GROUP ROWS (giữ nguyên)
+# ============================================================
def group_rows(items, vertical_thresh_ratio=0.6):
    """Cluster OCR items into text rows by their vertical position.

    Items are ordered by ``"y"``. An item joins the current row when its
    ``"y"`` is less than the row threshold away from the previously
    placed item; otherwise it starts a new row. The threshold is
    ``max(8, median_gap * vertical_thresh_ratio)`` where ``median_gap``
    is the median difference between consecutive sorted ``"y"`` values,
    or 12 when there is only one item.

    Args:
        items: Dicts with at least ``"text"``, ``"x"`` and ``"y"`` keys.
        vertical_thresh_ratio: Fraction of the median gap used as the
            row threshold.

    Returns:
        A list of rows, top to bottom; each row is the list of ``"text"``
        values ordered by ``"x"``. Empty input yields ``[]``.
    """
    if not items:
        return []

    ordered = sorted(items, key=lambda entry: entry["y"])
    centers = [entry["y"] for entry in ordered]

    if len(centers) <= 1:
        row_gap = 12
    else:
        deltas = [below - above
                  for above, below in zip(centers, centers[1:])]
        row_gap = max(8, np.median(deltas) * vertical_thresh_ratio)

    # Each bucket keeps insertion (y) order until the final x-sort, so the
    # distance check always looks at the most recently placed item.
    buckets = [[ordered[0]]]
    for entry in ordered[1:]:
        if entry["y"] - buckets[-1][-1]["y"] < row_gap:
            buckets[-1].append(entry)
        else:
            buckets.append([entry])

    return [
        [cell["text"] for cell in sorted(bucket, key=lambda c: c["x"])]
        for bucket in buckets
    ]
+# ============================================================
 # MAIN PIPELINE
+# ============================================================
def run_pipeline(image_path, output_dir="outputs",
                 checkpoint="best.pt", conf_thresh=0.3,
                 ocr_backend="paddle"):
    """Detect regions in a drawing, OCR notes/tables, and save the results.

    For every detected box the padded crop is written to
    ``<output_dir>/<stem>/crops``; crops classified as ``"note"`` or
    ``"table"`` are additionally passed to ``ocr_note`` / ``ocr_table``.
    An annotated copy of the image and a JSON summary are written next to
    the crops directory.

    Args:
        image_path: Path to the input image (str or path-like).
        output_dir: Root directory for all generated files.
        checkpoint: Detector weights, forwarded to ``get_det_model``.
        conf_thresh: Minimum detection confidence passed to the model.
        ocr_backend: Backend name forwarded to ``ocr_note``/``ocr_table``.

    Returns:
        ``(result, vis_path)`` where ``result`` is
        ``{"image": <file name>, "objects": [...]}`` and ``vis_path`` is
        the path of the annotated image.

    Raises:
        ValueError: If ``cv2.imread`` cannot read ``image_path``.
    """
    image_path = str(image_path)
    img_name   = Path(image_path).name
    stem       = Path(image_path).stem
    crop_dir   = Path(output_dir) / stem / "crops"
    crop_dir.mkdir(parents=True, exist_ok=True)

    model   = get_det_model(checkpoint)
    results = model(image_path, imgsz=1024, conf=conf_thresh,
                    iou=0.5, device=DEVICE, verbose=False)

    img_bgr = cv2.imread(image_path)
    if img_bgr is None:
        raise ValueError(f"Cannot read: {image_path}")

    objects = []
    for i, box in enumerate(results[0].boxes):
        x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
        cls_idx  = int(box.cls[0])
        cls_raw  = CLASS_NAMES[cls_idx]
        cls_show = CLASS_DISPLAY[cls_raw]
        # Detection score as a plain float: it is used below in the JSON
        # record and in the drawn label, and was previously never assigned.
        conf_val = round(float(box.conf[0]), 4)

        # Pad the crop so the OCR step sees some surrounding context.
        pad = 10
        crop = img_bgr[max(0, y1-pad):min(img_bgr.shape[0], y2+pad),
                       max(0, x1-pad):min(img_bgr.shape[1], x2+pad)]
        crop_path = str(crop_dir / f"{cls_show}_{i+1}.jpg")
        # Near-lossless JPEG so compression artefacts do not hurt OCR.
        cv2.imwrite(crop_path, crop, [cv2.IMWRITE_JPEG_QUALITY, 98])

        ocr_content = None
        if cls_raw == "note":
            print(f"[OCR] Note #{i+1} ({x2-x1}x{y2-y1}px)...")
            ocr_content = ocr_note(crop_path, backend=ocr_backend)
            print(f"      → {repr(ocr_content[:120]) if ocr_content else 'EMPTY'}")
        elif cls_raw == "table":
            print(f"[OCR] Table #{i+1} ({x2-x1}x{y2-y1}px)...")
            ocr_content = ocr_table(crop_path, backend=ocr_backend)
            preview = ocr_content.get("text", "")[:120]
            print(f"      → {repr(preview) if preview else 'EMPTY'}")

        objects.append({
            "id": i+1, "class": cls_show,
            "confidence": conf_val,
            "bbox": {"x1": x1, "y1": y1, "x2": x2, "y2": y2},
            "crop_path": crop_path,
            "ocr_content": ocr_content,
        })

        # Draw the box after cropping so crops stay free of annotations
        # from this box.
        color = COLORS[cls_raw]
        cv2.rectangle(img_bgr, (x1, y1), (x2, y2), color, 2)
        label = f"{cls_show} {conf_val:.2f}"
        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)
        cv2.rectangle(img_bgr, (x1, y1-th-10), (x1+tw+8, y1), color, -1)
        cv2.putText(img_bgr, label, (x1+4, y1-4),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

    vis_path = str(Path(output_dir) / stem / "result_vis.jpg")
    cv2.imwrite(vis_path, img_bgr)

    result    = {"image": img_name, "objects": objects}
    json_path = str(Path(output_dir) / stem / "result.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"[✓] {len(objects)} objects | {vis_path} | {json_path}")
    return result, vis_path
 if __name__ == "__main__":
     import sys
     img = sys.argv[1] if len(sys.argv) > 1 else "test.jpg"
+    result, _ = run_pipeline(img, ocr_backend="paddle")
     print(json.dumps(result, ensure_ascii=False, indent=2))