upload

Browse files

Files changed (15) hide show

tableformer_accurate/.DS_Store +0 -0
tableformer_accurate/bbox_decoder.onnx +3 -0
tableformer_accurate/decoder_step.onnx +3 -0
tableformer_accurate/encoder.onnx +3 -0
tableformer_accurate/inference.py +163 -0
tableformer_fast/.DS_Store +0 -0
tableformer_fast/bbox_decoder.onnx +3 -0
tableformer_fast/decoder_step.onnx +3 -0
tableformer_fast/encoder.onnx +3 -0
tableformer_fast/inference.py +163 -0
tableformer_v2/.DS_Store +0 -0
tableformer_v2/bbox_head.onnx +3 -0
tableformer_v2/decoder.onnx +3 -0
tableformer_v2/encoder.onnx +3 -0
tableformer_v2/inference.py +99 -0

tableformer_accurate/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

tableformer_accurate/bbox_decoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e0f89346a71eb7ebd429ce6fcb398732598c3fbec95438adcb719409f98d125
+size 39633512

tableformer_accurate/decoder_step.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ab858a0c9207649bf8ad35877c7eb710917c4f3b9877e0ce05a8cf239c8c127
+size 77940061

tableformer_accurate/encoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09e51bac97a75e45b6ab264b5207b365a7553ba6a7e1ac67a79ed78ae91b4a3b
+size 95248337

tableformer_accurate/inference.py ADDED Viewed

	@@ -0,0 +1,163 @@

+#!/usr/bin/env python3
+"""TableFormer v1 ONNX inference — table structure recognition.
+Self-contained: requires only numpy, onnxruntime and Pillow.
+The same script works for both the fast and accurate variants
+(the decoder layer count is read from the cache input shape).
+    python inference.py <table_image.png>
+Pipeline: encoder.onnx -> greedy loop over decoder_step.onnx -> bbox_decoder.onnx.
+The decoder computes one token per call; `cache` carries each decoder layer's
+per-position outputs across steps (shape (num_layers, L, 1, 512); pass length 0
+on the first step).
+Replicates TableModel04_rs.predict from docling-ibm-models, including its
+structure-error corrections and horizontal-span bbox merging.
+Outputs OTSL structure tokens and one bbox per cell, cxcywh normalized to
+[0, 1] relative to the input table crop.
+"""
+import sys
+from pathlib import Path
+import numpy as np
+import onnxruntime as ort
+from PIL import Image
# Directory containing this script; used as the default model directory.
HERE = Path(__file__).parent

# OTSL tag vocabulary in id order: a token's position in this tuple is its id.
_VOCAB = (
    "<pad>",
    "<unk>",
    "<start>",
    "<end>",
    "ecel",
    "fcel",
    "lcel",
    "ucel",
    "xcel",
    "nl",
    "ched",
    "rhed",
    "srow",
)
WM = {token: index for index, token in enumerate(_VOCAB)}  # token -> id
ID2TOKEN = dict(enumerate(_VOCAB))  # id -> token

# Upper bound on the number of tags generated per table.
MAX_STEPS = 1024
# Input side length; resized without keeping aspect ratio (per tm_config.json).
IMAGE_SIZE = 448
# Per-channel normalization statistics applied to pixels scaled to [0, 1].
MEAN = np.array((0.94247851, 0.94254675, 0.94292611), dtype=np.float32)
STD = np.array((0.17910956, 0.17940403, 0.17931663), dtype=np.float32)
def preprocess(image: Image.Image) -> np.ndarray:
    """Turn a table crop into the encoder input tensor.

    The image is converted to RGB, resized to IMAGE_SIZE x IMAGE_SIZE with
    bilinear resampling (aspect ratio is not preserved), scaled to [0, 1] and
    normalized per channel with MEAN / STD.

    The result has shape (1, 3, 448, 448), float32. The spatial axes are
    deliberately TRANSPOSED — layout is (batch, channels, width, height) —
    to match docling's TFPredictor._prepare_image, which is how the model
    was trained.

    NOTE(review): predict() uses the bbox decoder's coordinates as-is; no x/y
    swap is applied there. Confirm the output axis convention against docling.
    """
    rgb = image.convert("RGB")
    resized = rgb.resize((IMAGE_SIZE, IMAGE_SIZE), Image.BILINEAR)
    pixels = np.asarray(resized, dtype=np.float32) / 255.0
    normalized = (pixels - MEAN) / STD
    # Reverse the axis order, then prepend the batch axis -> (1, C, W, H).
    channels_first = normalized.transpose(2, 1, 0)
    return channels_first[np.newaxis]
class TableFormerOnnx:
    """TableFormer v1 on ONNX Runtime: encoder -> decoder_step loop -> bbox_decoder.

    Follows TableModel04_rs.predict from docling-ibm-models, including its
    structure-error corrections and horizontal-span bbox merging.
    """

    def __init__(self, model_dir=HERE, providers=("CPUExecutionProvider",)):
        """Load the three ONNX graphs found in ``model_dir``.

        Args:
            model_dir: directory holding encoder.onnx, decoder_step.onnx and
                bbox_decoder.onnx (defaults to this script's directory).
            providers: ONNX Runtime execution providers handed to every session.
        """
        model_dir = Path(model_dir)
        p = list(providers)
        self.encoder = ort.InferenceSession(str(model_dir / "encoder.onnx"), providers=p)
        self.decoder = ort.InferenceSession(str(model_dir / "decoder_step.onnx"), providers=p)
        self.bbox = ort.InferenceSession(str(model_dir / "bbox_decoder.onnx"), providers=p)
        # The decoder's "cache" input declares the layer count on axis 0 and the
        # hidden size on axis 3; reading them here lets one script serve
        # decoders of different depth.
        cache_shape = next(
            i.shape for i in self.decoder.get_inputs() if i.name == "cache"
        )
        self.num_layers, self.hidden_dim = cache_shape[0], cache_shape[3]

    def predict(self, image: Image.Image):
        """Run structure recognition on one table crop.

        Returns:
            (otsl_tokens, classes, bboxes) where ``otsl_tokens`` is the list of
            generated tag strings without ``<end>``, ``classes`` is
            (num_cells, 3) logits and ``bboxes`` is (num_cells, 4) cxcywh in
            [0, 1]. When no cell hidden state was collected, both arrays are
            empty with shapes (0, 3) and (0, 4).
        """
        x = preprocess(image)
        enc_out, memory = self.encoder.run(None, {"image": x})

        decoded = np.array([[WM["<start>"]]], dtype=np.int64)  # (L, 1)
        # Zero-length cache on the first step; the decoder returns the grown cache.
        cache = np.zeros((self.num_layers, 0, 1, self.hidden_dim), dtype=np.float32)
        output_tags, tag_H_buf = [], []
        skip_next_tag, prev_tag_ucel = True, False
        first_lcel, bboxes_to_merge, cur_bbox_ind, bbox_ind = True, {}, -1, 0
        line_num = 0

        while len(output_tags) < MAX_STEPS:
            logits, tag_H, cache = self.decoder.run(
                None, {"decoded_tags": decoded, "memory": memory, "cache": cache}
            )
            new_tag = int(logits[0].argmax())  # greedy choice

            # Structure error corrections (as in TableModel04_rs.predict)
            if line_num == 0 and new_tag == WM["xcel"]:
                new_tag = WM["lcel"]
            if prev_tag_ucel and new_tag == WM["lcel"]:
                new_tag = WM["fcel"]

            if new_tag == WM["<end>"]:
                output_tags.append(new_tag)
                break
            output_tags.append(new_tag)
            if new_tag == WM["nl"]:
                line_num += 1

            # Keep one hidden state per cell for the bbox decoder
            if not skip_next_tag and new_tag in (
                WM["fcel"], WM["ecel"], WM["ched"], WM["rhed"],
                WM["srow"], WM["nl"], WM["ucel"],
            ):
                tag_H_buf.append(tag_H)
                if not first_lcel:
                    # This box closes the horizontal span opened at cur_bbox_ind.
                    bboxes_to_merge[cur_bbox_ind] = bbox_ind
                bbox_ind += 1

            if new_tag != WM["lcel"]:
                first_lcel = True
            elif first_lcel:  # start of a horizontal span
                tag_H_buf.append(tag_H)
                first_lcel = False
                cur_bbox_ind = bbox_ind
                bboxes_to_merge[cur_bbox_ind] = -1  # -1 == span not closed yet
                bbox_ind += 1

            skip_next_tag = new_tag in (WM["nl"], WM["ucel"], WM["xcel"])
            prev_tag_ucel = new_tag == WM["ucel"]
            decoded = np.concatenate([decoded, [[new_tag]]], axis=0)

        tokens = [ID2TOKEN[t] for t in output_tags if t != WM["<end>"]]
        if not tag_H_buf:
            return tokens, np.zeros((0, 3), np.float32), np.zeros((0, 4), np.float32)

        classes, coords = self.bbox.run(
            None, {"enc_out": enc_out, "tag_H": np.concatenate(tag_H_buf, axis=0)}
        )
        out_cls, out_coord = self._merge_span_bboxes(classes, coords, bboxes_to_merge)
        return tokens, out_cls, out_coord

    @staticmethod
    def _merge_span_bboxes(classes, coords, bboxes_to_merge):
        """Merge the first/last bbox of each horizontal span (cxcywh).

        ``bboxes_to_merge`` maps the index of a span's first box to the index
        of its last box, or to -1 when the span was opened but never closed
        (e.g. generation stopped right after the ``lcel``). An unclosed span
        keeps its own box: indexing ``coords[-1]`` would otherwise merge it
        with the unrelated last box of the table.

        Returns:
            (classes, bboxes) as arrays; the class row of a merged box is the
            one belonging to the span's first box.
        """
        out_cls, out_coord, skip = [], [], set()
        for i, box in enumerate(coords):
            end = bboxes_to_merge.get(i, -1)
            if end < 0:
                # Ordinary cell or unclosed span; span-end boxes were already
                # absorbed into their span and are dropped.
                if i not in skip:
                    out_coord.append(box.tolist())
                    out_cls.append(classes[i])
                continue
            b1, b2 = box, coords[end]
            skip.add(end)
            w = (b2[0] + b2[2] / 2) - (b1[0] - b1[2] / 2)
            h = (b2[1] + b2[3] / 2) - (b1[1] - b1[3] / 2)
            left = b1[0] - b1[2] / 2
            top = min(b2[1] - b2[3] / 2, b1[1] - b1[3] / 2)
            out_coord.append([left + w / 2, top + h / 2, w, h])
            out_cls.append(classes[i])
        return np.array(out_cls), np.array(out_coord, dtype=np.float32)
if __name__ == "__main__":
    # CLI entry point: python inference.py <table_image>
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <table_image>")
    # The context manager closes the underlying image file deterministically;
    # width/height are captured while the image is still open.
    with Image.open(sys.argv[1]) as img:
        model = TableFormerOnnx()
        tokens, classes, bboxes = model.predict(img)
        img_w, img_h = img.width, img.height
    print("OTSL:", " ".join(tokens))
    print(f"{len(bboxes)} cells (xyxy in original image pixels):")
    # Normalized cxcywh -> pixel xyxy in the input image.
    for cx, cy, w, h in bboxes:
        x1, y1 = (cx - w / 2) * img_w, (cy - h / 2) * img_h
        x2, y2 = (cx + w / 2) * img_w, (cy + h / 2) * img_h
        print(f"  ({x1:7.1f}, {y1:7.1f}, {x2:7.1f}, {y2:7.1f})")

tableformer_fast/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

tableformer_fast/bbox_decoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:35300de20645552bed0967c2dc79c0944cfed12f04600116a1c92920d1ff8329
+size 39633512

tableformer_fast/decoder_step.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:696b51d1b86fed94dbdb46691465cf887fdd420f1efa9681b96fe37090601990
+size 27414748

tableformer_fast/encoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a749e9b517c1a1e8debf4e91776453a3858efb910d75bd9961f43e4d489292a1
+size 78402523

tableformer_fast/inference.py ADDED Viewed

	@@ -0,0 +1,163 @@

+#!/usr/bin/env python3
+"""TableFormer v1 ONNX inference — table structure recognition.
+Self-contained: requires only numpy, onnxruntime and Pillow.
+The same script works for both the fast and accurate variants
+(the decoder layer count is read from the cache input shape).
+    python inference.py <table_image.png>
+Pipeline: encoder.onnx -> greedy loop over decoder_step.onnx -> bbox_decoder.onnx.
+The decoder computes one token per call; `cache` carries each decoder layer's
+per-position outputs across steps (shape (num_layers, L, 1, 512); pass length 0
+on the first step).
+Replicates TableModel04_rs.predict from docling-ibm-models, including its
+structure-error corrections and horizontal-span bbox merging.
+Outputs OTSL structure tokens and one bbox per cell, cxcywh normalized to
+[0, 1] relative to the input table crop.
+"""
+import sys
+from pathlib import Path
+import numpy as np
+import onnxruntime as ort
+from PIL import Image
# Directory containing this script; used as the default model directory.
HERE = Path(__file__).parent

# OTSL tag vocabulary in id order: a token's position in this tuple is its id.
_VOCAB = (
    "<pad>",
    "<unk>",
    "<start>",
    "<end>",
    "ecel",
    "fcel",
    "lcel",
    "ucel",
    "xcel",
    "nl",
    "ched",
    "rhed",
    "srow",
)
WM = {token: index for index, token in enumerate(_VOCAB)}  # token -> id
ID2TOKEN = dict(enumerate(_VOCAB))  # id -> token

# Upper bound on the number of tags generated per table.
MAX_STEPS = 1024
# Input side length; resized without keeping aspect ratio (per tm_config.json).
IMAGE_SIZE = 448
# Per-channel normalization statistics applied to pixels scaled to [0, 1].
MEAN = np.array((0.94247851, 0.94254675, 0.94292611), dtype=np.float32)
STD = np.array((0.17910956, 0.17940403, 0.17931663), dtype=np.float32)
def preprocess(image: Image.Image) -> np.ndarray:
    """Turn a table crop into the encoder input tensor.

    The image is converted to RGB, resized to IMAGE_SIZE x IMAGE_SIZE with
    bilinear resampling (aspect ratio is not preserved), scaled to [0, 1] and
    normalized per channel with MEAN / STD.

    The result has shape (1, 3, 448, 448), float32. The spatial axes are
    deliberately TRANSPOSED — layout is (batch, channels, width, height) —
    to match docling's TFPredictor._prepare_image, which is how the model
    was trained.

    NOTE(review): predict() uses the bbox decoder's coordinates as-is; no x/y
    swap is applied there. Confirm the output axis convention against docling.
    """
    rgb = image.convert("RGB")
    resized = rgb.resize((IMAGE_SIZE, IMAGE_SIZE), Image.BILINEAR)
    pixels = np.asarray(resized, dtype=np.float32) / 255.0
    normalized = (pixels - MEAN) / STD
    # Reverse the axis order, then prepend the batch axis -> (1, C, W, H).
    channels_first = normalized.transpose(2, 1, 0)
    return channels_first[np.newaxis]
class TableFormerOnnx:
    """TableFormer v1 on ONNX Runtime: encoder -> decoder_step loop -> bbox_decoder.

    Follows TableModel04_rs.predict from docling-ibm-models, including its
    structure-error corrections and horizontal-span bbox merging.
    """

    def __init__(self, model_dir=HERE, providers=("CPUExecutionProvider",)):
        """Load the three ONNX graphs found in ``model_dir``.

        Args:
            model_dir: directory holding encoder.onnx, decoder_step.onnx and
                bbox_decoder.onnx (defaults to this script's directory).
            providers: ONNX Runtime execution providers handed to every session.
        """
        model_dir = Path(model_dir)
        p = list(providers)
        self.encoder = ort.InferenceSession(str(model_dir / "encoder.onnx"), providers=p)
        self.decoder = ort.InferenceSession(str(model_dir / "decoder_step.onnx"), providers=p)
        self.bbox = ort.InferenceSession(str(model_dir / "bbox_decoder.onnx"), providers=p)
        # The decoder's "cache" input declares the layer count on axis 0 and the
        # hidden size on axis 3; reading them here lets one script serve
        # decoders of different depth.
        cache_shape = next(
            i.shape for i in self.decoder.get_inputs() if i.name == "cache"
        )
        self.num_layers, self.hidden_dim = cache_shape[0], cache_shape[3]

    def predict(self, image: Image.Image):
        """Run structure recognition on one table crop.

        Returns:
            (otsl_tokens, classes, bboxes) where ``otsl_tokens`` is the list of
            generated tag strings without ``<end>``, ``classes`` is
            (num_cells, 3) logits and ``bboxes`` is (num_cells, 4) cxcywh in
            [0, 1]. When no cell hidden state was collected, both arrays are
            empty with shapes (0, 3) and (0, 4).
        """
        x = preprocess(image)
        enc_out, memory = self.encoder.run(None, {"image": x})

        decoded = np.array([[WM["<start>"]]], dtype=np.int64)  # (L, 1)
        # Zero-length cache on the first step; the decoder returns the grown cache.
        cache = np.zeros((self.num_layers, 0, 1, self.hidden_dim), dtype=np.float32)
        output_tags, tag_H_buf = [], []
        skip_next_tag, prev_tag_ucel = True, False
        first_lcel, bboxes_to_merge, cur_bbox_ind, bbox_ind = True, {}, -1, 0
        line_num = 0

        while len(output_tags) < MAX_STEPS:
            logits, tag_H, cache = self.decoder.run(
                None, {"decoded_tags": decoded, "memory": memory, "cache": cache}
            )
            new_tag = int(logits[0].argmax())  # greedy choice

            # Structure error corrections (as in TableModel04_rs.predict)
            if line_num == 0 and new_tag == WM["xcel"]:
                new_tag = WM["lcel"]
            if prev_tag_ucel and new_tag == WM["lcel"]:
                new_tag = WM["fcel"]

            if new_tag == WM["<end>"]:
                output_tags.append(new_tag)
                break
            output_tags.append(new_tag)
            if new_tag == WM["nl"]:
                line_num += 1

            # Keep one hidden state per cell for the bbox decoder
            if not skip_next_tag and new_tag in (
                WM["fcel"], WM["ecel"], WM["ched"], WM["rhed"],
                WM["srow"], WM["nl"], WM["ucel"],
            ):
                tag_H_buf.append(tag_H)
                if not first_lcel:
                    # This box closes the horizontal span opened at cur_bbox_ind.
                    bboxes_to_merge[cur_bbox_ind] = bbox_ind
                bbox_ind += 1

            if new_tag != WM["lcel"]:
                first_lcel = True
            elif first_lcel:  # start of a horizontal span
                tag_H_buf.append(tag_H)
                first_lcel = False
                cur_bbox_ind = bbox_ind
                bboxes_to_merge[cur_bbox_ind] = -1  # -1 == span not closed yet
                bbox_ind += 1

            skip_next_tag = new_tag in (WM["nl"], WM["ucel"], WM["xcel"])
            prev_tag_ucel = new_tag == WM["ucel"]
            decoded = np.concatenate([decoded, [[new_tag]]], axis=0)

        tokens = [ID2TOKEN[t] for t in output_tags if t != WM["<end>"]]
        if not tag_H_buf:
            return tokens, np.zeros((0, 3), np.float32), np.zeros((0, 4), np.float32)

        classes, coords = self.bbox.run(
            None, {"enc_out": enc_out, "tag_H": np.concatenate(tag_H_buf, axis=0)}
        )
        out_cls, out_coord = self._merge_span_bboxes(classes, coords, bboxes_to_merge)
        return tokens, out_cls, out_coord

    @staticmethod
    def _merge_span_bboxes(classes, coords, bboxes_to_merge):
        """Merge the first/last bbox of each horizontal span (cxcywh).

        ``bboxes_to_merge`` maps the index of a span's first box to the index
        of its last box, or to -1 when the span was opened but never closed
        (e.g. generation stopped right after the ``lcel``). An unclosed span
        keeps its own box: indexing ``coords[-1]`` would otherwise merge it
        with the unrelated last box of the table.

        Returns:
            (classes, bboxes) as arrays; the class row of a merged box is the
            one belonging to the span's first box.
        """
        out_cls, out_coord, skip = [], [], set()
        for i, box in enumerate(coords):
            end = bboxes_to_merge.get(i, -1)
            if end < 0:
                # Ordinary cell or unclosed span; span-end boxes were already
                # absorbed into their span and are dropped.
                if i not in skip:
                    out_coord.append(box.tolist())
                    out_cls.append(classes[i])
                continue
            b1, b2 = box, coords[end]
            skip.add(end)
            w = (b2[0] + b2[2] / 2) - (b1[0] - b1[2] / 2)
            h = (b2[1] + b2[3] / 2) - (b1[1] - b1[3] / 2)
            left = b1[0] - b1[2] / 2
            top = min(b2[1] - b2[3] / 2, b1[1] - b1[3] / 2)
            out_coord.append([left + w / 2, top + h / 2, w, h])
            out_cls.append(classes[i])
        return np.array(out_cls), np.array(out_coord, dtype=np.float32)
if __name__ == "__main__":
    # CLI entry point: python inference.py <table_image>
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <table_image>")
    # The context manager closes the underlying image file deterministically;
    # width/height are captured while the image is still open.
    with Image.open(sys.argv[1]) as img:
        model = TableFormerOnnx()
        tokens, classes, bboxes = model.predict(img)
        img_w, img_h = img.width, img.height
    print("OTSL:", " ".join(tokens))
    print(f"{len(bboxes)} cells (xyxy in original image pixels):")
    # Normalized cxcywh -> pixel xyxy in the input image.
    for cx, cy, w, h in bboxes:
        x1, y1 = (cx - w / 2) * img_w, (cy - h / 2) * img_h
        x2, y2 = (cx + w / 2) * img_w, (cy + h / 2) * img_h
        print(f"  ({x1:7.1f}, {y1:7.1f}, {x2:7.1f}, {y2:7.1f})")

tableformer_v2/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

tableformer_v2/bbox_head.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:945d677fbe54f922e5027ea70a6e584113ff8e2d92f5d7317d2b762bc796f922
+size 37353806

tableformer_v2/decoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:744f2922fca1583f2152e196e05fe989250681f2139ab386c2dfc3e7d2b6d733
+size 68420864

tableformer_v2/encoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6634b70eb00b8377992ecd8554964a18bfc2feae2b4c3d3286433f330b4c9591
+size 98118237

tableformer_v2/inference.py ADDED Viewed

	@@ -0,0 +1,99 @@

+#!/usr/bin/env python3
+"""TableFormerV2 ONNX inference — table structure recognition.
+Self-contained: requires only numpy, onnxruntime and Pillow.
+    python inference.py <table_image.png>
+Pipeline: encoder.onnx -> greedy loop over decoder.onnx -> bbox_head.onnx.
+The decoder graph is cache-free and re-runs the whole prefix each step
+(vocab is 13 tokens, sequences are short, so this is cheap).
+Outputs OTSL structure tokens and one bbox per data cell, xyxy normalized
+to [0, 1] relative to the input table crop.
+"""
+import sys
+from pathlib import Path
+import numpy as np
+import onnxruntime as ort
+from PIL import Image
# Directory containing this script; used as the default model directory.
HERE = Path(__file__).parent

# OTSL vocabulary; a token's list index is its id.
ID2TOKEN = [
    "<pad>",
    "[UNK]",
    "<start>",
    "<end>",
    "<ecel>",
    "<fcel>",
    "<lcel>",
    "<ucel>",
    "<xcel>",
    "<nl>",
    "<ched>",
    "<rhed>",
    "<srow>",
]
BOS_ID = ID2TOKEN.index("<start>")
EOS_ID = ID2TOKEN.index("<end>")
# Ids of the tags that receive a bounding box: ecel, fcel, ched, rhed, srow.
DATA_CELL_IDS = {
    ID2TOKEN.index(token)
    for token in ("<ecel>", "<fcel>", "<ched>", "<rhed>", "<srow>")
}

IMAGE_SIZE = 448
# ImageNet per-channel statistics, applied to pixels scaled to [0, 1].
MEAN = np.array((0.485, 0.456, 0.406), dtype=np.float32)
STD = np.array((0.229, 0.224, 0.225), dtype=np.float32)
def preprocess(image: Image.Image) -> np.ndarray:
    """Turn a table crop into the encoder input tensor.

    The image is converted to RGB, resized to IMAGE_SIZE x IMAGE_SIZE with
    bilinear resampling, scaled to [0, 1] and ImageNet-normalized with
    MEAN / STD. Returns a (1, 3, 448, 448) float32 array.
    """
    rgb = image.convert("RGB")
    resized = rgb.resize((IMAGE_SIZE, IMAGE_SIZE), Image.BILINEAR)
    pixels = np.asarray(resized, dtype=np.float32) / 255.0
    normalized = (pixels - MEAN) / STD
    # Move channels to the front, then prepend the batch axis.
    channels_first = normalized.transpose(2, 0, 1)
    return channels_first[np.newaxis]
class TableFormerV2Onnx:
    """TableFormerV2 on ONNX Runtime: encoder -> greedy decoder loop -> bbox head."""

    def __init__(self, model_dir=HERE, providers=("CPUExecutionProvider",)):
        """Open encoder.onnx, decoder.onnx and bbox_head.onnx from ``model_dir``.

        ``providers`` is the list of ONNX Runtime execution providers handed to
        every session.
        """
        root = Path(model_dir)
        provider_list = list(providers)

        def open_session(filename):
            return ort.InferenceSession(str(root / filename), providers=provider_list)

        self.encoder = open_session("encoder.onnx")
        self.decoder = open_session("decoder.onnx")
        self.bbox_head = open_session("bbox_head.onnx")

    def predict(self, image: Image.Image, max_length: int = 512):
        """Returns (otsl_tokens, bboxes). bboxes: (num_cells, 4) xyxy in [0, 1].

        At most ``max_length`` tokens are generated after ``<start>``;
        generation stops early once ``<end>`` is produced.
        """
        pixels = preprocess(image)
        (enc_hidden,) = self.encoder.run(None, {"images": pixels})

        def run_decoder(token_ids):
            # The decoder takes the whole prefix on every call (no cache).
            return self.decoder.run(
                None, {"input_ids": token_ids, "encoder_hidden": enc_hidden}
            )

        # Greedy generation: append the argmax of the last position each step.
        ids = np.array([[BOS_ID]], dtype=np.int64)
        for _step in range(max_length):
            logits, _unused = run_decoder(ids)
            next_id = int(logits[0, -1].argmax())
            ids = np.concatenate([ids, [[next_id]]], axis=1)
            if next_id == EOS_ID:
                break

        # One more pass over the finished sequence to get its hidden states.
        _logits, hidden = run_decoder(ids)

        # Bboxes are predicted from the hidden states at data-cell positions.
        cell_pos = [
            position
            for position, token_id in enumerate(ids[0].tolist())
            if token_id in DATA_CELL_IDS
        ]
        if not cell_pos:
            bboxes = np.zeros((0, 4), dtype=np.float32)
        else:
            bbox_inputs = {
                "cell_embeddings": hidden[0, cell_pos],
                "encoder_hidden": enc_hidden,
            }
            (bboxes,) = self.bbox_head.run(None, bbox_inputs)

        # Drop <start>, <end> and <pad> (id 0) from the reported structure.
        hidden_ids = (BOS_ID, EOS_ID, 0)
        tokens = [
            ID2TOKEN[token_id]
            for token_id in ids[0].tolist()
            if token_id not in hidden_ids
        ]
        return tokens, bboxes
if __name__ == "__main__":
    # CLI entry point: python inference.py <table_image>
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <table_image>")
    # The context manager closes the underlying image file deterministically;
    # the pixel scale is captured while the image is still open.
    with Image.open(sys.argv[1]) as img:
        model = TableFormerV2Onnx()
        tokens, bboxes = model.predict(img)
        scale = np.array([img.width, img.height, img.width, img.height])
    print("OTSL:", " ".join(tokens))
    print(f"{len(bboxes)} cells (xyxy in original image pixels):")
    # Normalized xyxy -> pixel xyxy in the input image.
    for b in bboxes * scale:
        print(f"  ({b[0]:7.1f}, {b[1]:7.1f}, {b[2]:7.1f}, {b[3]:7.1f})")