upload
Browse files- tableformer_accurate/.DS_Store +0 -0
- tableformer_accurate/bbox_decoder.onnx +3 -0
- tableformer_accurate/decoder_step.onnx +3 -0
- tableformer_accurate/encoder.onnx +3 -0
- tableformer_accurate/inference.py +163 -0
- tableformer_fast/.DS_Store +0 -0
- tableformer_fast/bbox_decoder.onnx +3 -0
- tableformer_fast/decoder_step.onnx +3 -0
- tableformer_fast/encoder.onnx +3 -0
- tableformer_fast/inference.py +163 -0
- tableformer_v2/.DS_Store +0 -0
- tableformer_v2/bbox_head.onnx +3 -0
- tableformer_v2/decoder.onnx +3 -0
- tableformer_v2/encoder.onnx +3 -0
- tableformer_v2/inference.py +99 -0
tableformer_accurate/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
tableformer_accurate/bbox_decoder.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e0f89346a71eb7ebd429ce6fcb398732598c3fbec95438adcb719409f98d125
|
| 3 |
+
size 39633512
|
tableformer_accurate/decoder_step.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ab858a0c9207649bf8ad35877c7eb710917c4f3b9877e0ce05a8cf239c8c127
|
| 3 |
+
size 77940061
|
tableformer_accurate/encoder.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:09e51bac97a75e45b6ab264b5207b365a7553ba6a7e1ac67a79ed78ae91b4a3b
|
| 3 |
+
size 95248337
|
tableformer_accurate/inference.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""TableFormer v1 ONNX inference — table structure recognition.
|
| 3 |
+
|
| 4 |
+
Self-contained: requires only numpy, onnxruntime and Pillow.
|
| 5 |
+
The same script works for both the fast and accurate variants
|
| 6 |
+
(the decoder layer count is read from the cache input shape).
|
| 7 |
+
|
| 8 |
+
python inference.py <table_image.png>
|
| 9 |
+
|
| 10 |
+
Pipeline: encoder.onnx -> greedy loop over decoder_step.onnx -> bbox_decoder.onnx.
|
| 11 |
+
The decoder computes one token per call; `cache` carries each decoder layer's
|
| 12 |
+
per-position outputs across steps (shape (num_layers, L, 1, 512); pass length 0
|
| 13 |
+
on the first step).
|
| 14 |
+
|
| 15 |
+
Replicates TableModel04_rs.predict from docling-ibm-models, including its
|
| 16 |
+
structure-error corrections and horizontal-span bbox merging.
|
| 17 |
+
|
| 18 |
+
Outputs OTSL structure tokens and one bbox per cell, cxcywh normalized to
|
| 19 |
+
[0, 1] relative to the input table crop.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import sys
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
|
| 25 |
+
import numpy as np
|
| 26 |
+
import onnxruntime as ort
|
| 27 |
+
from PIL import Image
|
| 28 |
+
|
| 29 |
+
# Directory containing this script; the model class uses it as its default model_dir.
HERE = Path(__file__).parent

# OTSL structure vocabulary in token-id order (position in the tuple == token id).
_VOCAB = (
    "<pad>", "<unk>", "<start>", "<end>", "ecel", "fcel", "lcel",
    "ucel", "xcel", "nl", "ched", "rhed", "srow",
)
WM = {token: index for index, token in enumerate(_VOCAB)}  # token -> id
ID2TOKEN = dict(enumerate(_VOCAB))  # id -> token
MAX_STEPS = 1024  # upper bound on the number of tags decoded per image

IMAGE_SIZE = 448  # resized without keeping aspect ratio (per tm_config.json)
# Per-channel statistics applied after pixel values are scaled to [0, 1].
MEAN = np.array((0.94247851, 0.94254675, 0.94292611), dtype=np.float32)
STD = np.array((0.17910956, 0.17940403, 0.17931663), dtype=np.float32)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def preprocess(image: Image.Image) -> np.ndarray:
    """Convert a table crop into the encoder input tensor.

    The image is converted to RGB, resized to IMAGE_SIZE x IMAGE_SIZE with
    bilinear resampling (aspect ratio is not preserved), scaled to [0, 1] and
    normalized per channel with MEAN / STD.

    Args:
        image: PIL image of the table crop; it is converted to RGB here.

    Returns:
        float32 array of shape (1, 3, IMAGE_SIZE, IMAGE_SIZE) laid out as
        (batch, channels, width, height).

    Note:
        The spatial axes are deliberately emitted as (width, height) instead
        of (height, width). According to the original author's note this
        mirrors docling's TFPredictor._prepare_image, which is how the model
        was trained. predict() in this file applies no x/y swap to the boxes
        it returns, and the command-line entry point scales them directly by
        the image width and height.
    """
    rgb = image.convert("RGB").resize((IMAGE_SIZE, IMAGE_SIZE), Image.BILINEAR)
    pixels = np.asarray(rgb, dtype=np.float32) / 255.0
    # MEAN / STD have three entries and broadcast over the trailing channel axis.
    normalized = (pixels - MEAN) / STD
    # (H, W, C) -> (C, W, H), then prepend the batch axis.
    return normalized.transpose(2, 1, 0)[None]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class TableFormerOnnx:
    """ONNX Runtime wrapper around the three TableFormer v1 graphs.

    Holds one session each for encoder.onnx, decoder_step.onnx and
    bbox_decoder.onnx and drives them from predict().
    """

    def __init__(self, model_dir=HERE, providers=("CPUExecutionProvider",)):
        """Open the three sessions and read the decoder cache geometry.

        Args:
            model_dir: directory that contains the three .onnx files.
            providers: ONNX Runtime execution providers passed to every session.
        """
        root = Path(model_dir)
        provider_list = list(providers)

        def open_session(filename):
            return ort.InferenceSession(str(root / filename), providers=provider_list)

        self.encoder = open_session("encoder.onnx")
        self.decoder = open_session("decoder_step.onnx")
        self.bbox = open_session("bbox_decoder.onnx")

        # Layer count and hidden width are taken from the declared shape of the
        # decoder's "cache" input instead of being hard-coded per variant.
        cache_input = next(
            node for node in self.decoder.get_inputs() if node.name == "cache"
        )
        dims = cache_input.shape
        self.num_layers, self.hidden_dim = dims[0], dims[3]

    def predict(self, image: Image.Image):
        """Decode the table structure and regress one box per cell.

        Args:
            image: PIL image of the table crop.

        Returns:
            (otsl_tokens, classes, bboxes): the decoded OTSL token strings
            (without "<end>"), per-cell class logits of shape (num_cells, 3)
            and per-cell boxes of shape (num_cells, 4) as cxcywh in [0, 1].
            Both arrays are empty when no cell hidden state was collected.
        """
        end_id = WM["<end>"]
        lcel_id, ucel_id, xcel_id, nl_id = WM["lcel"], WM["ucel"], WM["xcel"], WM["nl"]
        # Tags whose hidden state is forwarded to the bbox decoder.
        cell_ids = (
            WM["fcel"], WM["ecel"], WM["ched"], WM["rhed"],
            WM["srow"], nl_id, ucel_id,
        )
        # Tags after which the following tag does not get a box of its own.
        skip_after_ids = (nl_id, ucel_id, xcel_id)

        enc_out, memory = self.encoder.run(None, {"image": preprocess(image)})

        decoded = np.array([[WM["<start>"]]], dtype=np.int64)  # (L, 1), one row per step
        # Zero-length cache on the first step; the decoder returns the grown cache.
        cache = np.zeros((self.num_layers, 0, 1, self.hidden_dim), dtype=np.float32)

        output_tags = []
        cell_states = []  # hidden states handed to the bbox decoder, in box order
        span_end_of = {}  # first box index of a span -> last box index (-1 while open)
        span_start = -1
        next_box = 0
        span_open = False
        skip_next_tag = True
        prev_tag_ucel = False
        line_num = 0

        while len(output_tags) < MAX_STEPS:
            logits, tag_H, cache = self.decoder.run(
                None, {"decoded_tags": decoded, "memory": memory, "cache": cache}
            )
            tag = int(logits[0].argmax())

            # Structure error corrections (as in TableModel04_rs.predict).
            # Order matters: an xcel on the first line may become lcel and then fcel.
            if tag == xcel_id and line_num == 0:
                tag = lcel_id
            if tag == lcel_id and prev_tag_ucel:
                tag = WM["fcel"]

            output_tags.append(tag)
            if tag == end_id:
                break
            if tag == nl_id:
                line_num += 1

            # Keep one hidden state per cell for the bbox decoder.
            if tag in cell_ids and not skip_next_tag:
                cell_states.append(tag_H)
                if span_open:
                    # This box closes the horizontal span opened earlier.
                    span_end_of[span_start] = next_box
                next_box += 1

            if tag != lcel_id:
                span_open = False
            elif not span_open:  # start of a horizontal span
                cell_states.append(tag_H)
                span_open = True
                span_start = next_box
                span_end_of[span_start] = -1
                next_box += 1

            skip_next_tag = tag in skip_after_ids
            prev_tag_ucel = tag == ucel_id
            decoded = np.concatenate([decoded, [[tag]]], axis=0)

        tokens = [ID2TOKEN[tag] for tag in output_tags if tag != end_id]
        if not cell_states:
            return tokens, np.zeros((0, 3), np.float32), np.zeros((0, 4), np.float32)

        classes, coords = self.bbox.run(
            None, {"enc_out": enc_out, "tag_H": np.concatenate(cell_states, axis=0)}
        )

        # Merge first/last bbox of each horizontal span (cxcywh).
        # NOTE(review): a span that is never closed keeps the -1 placeholder, so
        # coords[-1] (the last box) is used as its end and the last box is then
        # suppressed -- confirm this is the intended reference behaviour.
        merged_cls, merged_box, consumed = [], [], set()
        for idx, first in enumerate(coords):
            if idx in span_end_of:
                last_idx = span_end_of[idx]
                last = coords[last_idx]
                consumed.add(last_idx)
                left = first[0] - first[2] / 2
                first_top = first[1] - first[3] / 2
                width = (last[0] + last[2] / 2) - left
                height = (last[1] + last[3] / 2) - first_top
                top = min(last[1] - last[3] / 2, first_top)
                merged_box.append([left + width / 2, top + height / 2, width, height])
                merged_cls.append(classes[idx])
            elif idx not in consumed:
                merged_box.append(coords[idx].tolist())
                merged_cls.append(classes[idx])
        return tokens, np.array(merged_cls), np.array(merged_box, dtype=np.float32)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
if __name__ == "__main__":
    # Command-line entry point: run the model on one image and print the
    # OTSL tokens followed by one pixel-space box per cell.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <table_image>")

    # The context manager closes the image file handle once prediction is done;
    # the size is captured first so it can be used after the file is closed.
    with Image.open(sys.argv[1]) as img:
        img_width, img_height = img.width, img.height
        model = TableFormerOnnx()
        tokens, classes, bboxes = model.predict(img)

    print("OTSL:", " ".join(tokens))
    print(f"{len(bboxes)} cells (xyxy in original image pixels):")
    for cx, cy, w, h in bboxes:
        # Normalized cxcywh -> xyxy corners scaled to the original image size.
        x1, y1 = (cx - w / 2) * img_width, (cy - h / 2) * img_height
        x2, y2 = (cx + w / 2) * img_width, (cy + h / 2) * img_height
        print(f"  ({x1:7.1f}, {y1:7.1f}, {x2:7.1f}, {y2:7.1f})")
|
tableformer_fast/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
tableformer_fast/bbox_decoder.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:35300de20645552bed0967c2dc79c0944cfed12f04600116a1c92920d1ff8329
|
| 3 |
+
size 39633512
|
tableformer_fast/decoder_step.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:696b51d1b86fed94dbdb46691465cf887fdd420f1efa9681b96fe37090601990
|
| 3 |
+
size 27414748
|
tableformer_fast/encoder.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a749e9b517c1a1e8debf4e91776453a3858efb910d75bd9961f43e4d489292a1
|
| 3 |
+
size 78402523
|
tableformer_fast/inference.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""TableFormer v1 ONNX inference — table structure recognition.
|
| 3 |
+
|
| 4 |
+
Self-contained: requires only numpy, onnxruntime and Pillow.
|
| 5 |
+
The same script works for both the fast and accurate variants
|
| 6 |
+
(the decoder layer count is read from the cache input shape).
|
| 7 |
+
|
| 8 |
+
python inference.py <table_image.png>
|
| 9 |
+
|
| 10 |
+
Pipeline: encoder.onnx -> greedy loop over decoder_step.onnx -> bbox_decoder.onnx.
|
| 11 |
+
The decoder computes one token per call; `cache` carries each decoder layer's
|
| 12 |
+
per-position outputs across steps (shape (num_layers, L, 1, 512); pass length 0
|
| 13 |
+
on the first step).
|
| 14 |
+
|
| 15 |
+
Replicates TableModel04_rs.predict from docling-ibm-models, including its
|
| 16 |
+
structure-error corrections and horizontal-span bbox merging.
|
| 17 |
+
|
| 18 |
+
Outputs OTSL structure tokens and one bbox per cell, cxcywh normalized to
|
| 19 |
+
[0, 1] relative to the input table crop.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import sys
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
|
| 25 |
+
import numpy as np
|
| 26 |
+
import onnxruntime as ort
|
| 27 |
+
from PIL import Image
|
| 28 |
+
|
| 29 |
+
# Directory containing this script; the model class uses it as its default model_dir.
HERE = Path(__file__).parent

# OTSL structure vocabulary in token-id order (position in the tuple == token id).
_VOCAB = (
    "<pad>", "<unk>", "<start>", "<end>", "ecel", "fcel", "lcel",
    "ucel", "xcel", "nl", "ched", "rhed", "srow",
)
WM = {token: index for index, token in enumerate(_VOCAB)}  # token -> id
ID2TOKEN = dict(enumerate(_VOCAB))  # id -> token
MAX_STEPS = 1024  # upper bound on the number of tags decoded per image

IMAGE_SIZE = 448  # resized without keeping aspect ratio (per tm_config.json)
# Per-channel statistics applied after pixel values are scaled to [0, 1].
MEAN = np.array((0.94247851, 0.94254675, 0.94292611), dtype=np.float32)
STD = np.array((0.17910956, 0.17940403, 0.17931663), dtype=np.float32)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def preprocess(image: Image.Image) -> np.ndarray:
    """Convert a table crop into the encoder input tensor.

    The image is converted to RGB, resized to IMAGE_SIZE x IMAGE_SIZE with
    bilinear resampling (aspect ratio is not preserved), scaled to [0, 1] and
    normalized per channel with MEAN / STD.

    Args:
        image: PIL image of the table crop; it is converted to RGB here.

    Returns:
        float32 array of shape (1, 3, IMAGE_SIZE, IMAGE_SIZE) laid out as
        (batch, channels, width, height).

    Note:
        The spatial axes are deliberately emitted as (width, height) instead
        of (height, width). According to the original author's note this
        mirrors docling's TFPredictor._prepare_image, which is how the model
        was trained. predict() in this file applies no x/y swap to the boxes
        it returns, and the command-line entry point scales them directly by
        the image width and height.
    """
    rgb = image.convert("RGB").resize((IMAGE_SIZE, IMAGE_SIZE), Image.BILINEAR)
    pixels = np.asarray(rgb, dtype=np.float32) / 255.0
    # MEAN / STD have three entries and broadcast over the trailing channel axis.
    normalized = (pixels - MEAN) / STD
    # (H, W, C) -> (C, W, H), then prepend the batch axis.
    return normalized.transpose(2, 1, 0)[None]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class TableFormerOnnx:
    """ONNX Runtime wrapper around the three TableFormer v1 graphs.

    Holds one session each for encoder.onnx, decoder_step.onnx and
    bbox_decoder.onnx and drives them from predict().
    """

    def __init__(self, model_dir=HERE, providers=("CPUExecutionProvider",)):
        """Open the three sessions and read the decoder cache geometry.

        Args:
            model_dir: directory that contains the three .onnx files.
            providers: ONNX Runtime execution providers passed to every session.
        """
        root = Path(model_dir)
        provider_list = list(providers)

        def open_session(filename):
            return ort.InferenceSession(str(root / filename), providers=provider_list)

        self.encoder = open_session("encoder.onnx")
        self.decoder = open_session("decoder_step.onnx")
        self.bbox = open_session("bbox_decoder.onnx")

        # Layer count and hidden width are taken from the declared shape of the
        # decoder's "cache" input instead of being hard-coded per variant.
        cache_input = next(
            node for node in self.decoder.get_inputs() if node.name == "cache"
        )
        dims = cache_input.shape
        self.num_layers, self.hidden_dim = dims[0], dims[3]

    def predict(self, image: Image.Image):
        """Decode the table structure and regress one box per cell.

        Args:
            image: PIL image of the table crop.

        Returns:
            (otsl_tokens, classes, bboxes): the decoded OTSL token strings
            (without "<end>"), per-cell class logits of shape (num_cells, 3)
            and per-cell boxes of shape (num_cells, 4) as cxcywh in [0, 1].
            Both arrays are empty when no cell hidden state was collected.
        """
        end_id = WM["<end>"]
        lcel_id, ucel_id, xcel_id, nl_id = WM["lcel"], WM["ucel"], WM["xcel"], WM["nl"]
        # Tags whose hidden state is forwarded to the bbox decoder.
        cell_ids = (
            WM["fcel"], WM["ecel"], WM["ched"], WM["rhed"],
            WM["srow"], nl_id, ucel_id,
        )
        # Tags after which the following tag does not get a box of its own.
        skip_after_ids = (nl_id, ucel_id, xcel_id)

        enc_out, memory = self.encoder.run(None, {"image": preprocess(image)})

        decoded = np.array([[WM["<start>"]]], dtype=np.int64)  # (L, 1), one row per step
        # Zero-length cache on the first step; the decoder returns the grown cache.
        cache = np.zeros((self.num_layers, 0, 1, self.hidden_dim), dtype=np.float32)

        output_tags = []
        cell_states = []  # hidden states handed to the bbox decoder, in box order
        span_end_of = {}  # first box index of a span -> last box index (-1 while open)
        span_start = -1
        next_box = 0
        span_open = False
        skip_next_tag = True
        prev_tag_ucel = False
        line_num = 0

        while len(output_tags) < MAX_STEPS:
            logits, tag_H, cache = self.decoder.run(
                None, {"decoded_tags": decoded, "memory": memory, "cache": cache}
            )
            tag = int(logits[0].argmax())

            # Structure error corrections (as in TableModel04_rs.predict).
            # Order matters: an xcel on the first line may become lcel and then fcel.
            if tag == xcel_id and line_num == 0:
                tag = lcel_id
            if tag == lcel_id and prev_tag_ucel:
                tag = WM["fcel"]

            output_tags.append(tag)
            if tag == end_id:
                break
            if tag == nl_id:
                line_num += 1

            # Keep one hidden state per cell for the bbox decoder.
            if tag in cell_ids and not skip_next_tag:
                cell_states.append(tag_H)
                if span_open:
                    # This box closes the horizontal span opened earlier.
                    span_end_of[span_start] = next_box
                next_box += 1

            if tag != lcel_id:
                span_open = False
            elif not span_open:  # start of a horizontal span
                cell_states.append(tag_H)
                span_open = True
                span_start = next_box
                span_end_of[span_start] = -1
                next_box += 1

            skip_next_tag = tag in skip_after_ids
            prev_tag_ucel = tag == ucel_id
            decoded = np.concatenate([decoded, [[tag]]], axis=0)

        tokens = [ID2TOKEN[tag] for tag in output_tags if tag != end_id]
        if not cell_states:
            return tokens, np.zeros((0, 3), np.float32), np.zeros((0, 4), np.float32)

        classes, coords = self.bbox.run(
            None, {"enc_out": enc_out, "tag_H": np.concatenate(cell_states, axis=0)}
        )

        # Merge first/last bbox of each horizontal span (cxcywh).
        # NOTE(review): a span that is never closed keeps the -1 placeholder, so
        # coords[-1] (the last box) is used as its end and the last box is then
        # suppressed -- confirm this is the intended reference behaviour.
        merged_cls, merged_box, consumed = [], [], set()
        for idx, first in enumerate(coords):
            if idx in span_end_of:
                last_idx = span_end_of[idx]
                last = coords[last_idx]
                consumed.add(last_idx)
                left = first[0] - first[2] / 2
                first_top = first[1] - first[3] / 2
                width = (last[0] + last[2] / 2) - left
                height = (last[1] + last[3] / 2) - first_top
                top = min(last[1] - last[3] / 2, first_top)
                merged_box.append([left + width / 2, top + height / 2, width, height])
                merged_cls.append(classes[idx])
            elif idx not in consumed:
                merged_box.append(coords[idx].tolist())
                merged_cls.append(classes[idx])
        return tokens, np.array(merged_cls), np.array(merged_box, dtype=np.float32)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
if __name__ == "__main__":
    # Command-line entry point: run the model on one image and print the
    # OTSL tokens followed by one pixel-space box per cell.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <table_image>")

    # The context manager closes the image file handle once prediction is done;
    # the size is captured first so it can be used after the file is closed.
    with Image.open(sys.argv[1]) as img:
        img_width, img_height = img.width, img.height
        model = TableFormerOnnx()
        tokens, classes, bboxes = model.predict(img)

    print("OTSL:", " ".join(tokens))
    print(f"{len(bboxes)} cells (xyxy in original image pixels):")
    for cx, cy, w, h in bboxes:
        # Normalized cxcywh -> xyxy corners scaled to the original image size.
        x1, y1 = (cx - w / 2) * img_width, (cy - h / 2) * img_height
        x2, y2 = (cx + w / 2) * img_width, (cy + h / 2) * img_height
        print(f"  ({x1:7.1f}, {y1:7.1f}, {x2:7.1f}, {y2:7.1f})")
|
tableformer_v2/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
tableformer_v2/bbox_head.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:945d677fbe54f922e5027ea70a6e584113ff8e2d92f5d7317d2b762bc796f922
|
| 3 |
+
size 37353806
|
tableformer_v2/decoder.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:744f2922fca1583f2152e196e05fe989250681f2139ab386c2dfc3e7d2b6d733
|
| 3 |
+
size 68420864
|
tableformer_v2/encoder.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6634b70eb00b8377992ecd8554964a18bfc2feae2b4c3d3286433f330b4c9591
|
| 3 |
+
size 98118237
|
tableformer_v2/inference.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""TableFormerV2 ONNX inference — table structure recognition.
|
| 3 |
+
|
| 4 |
+
Self-contained: requires only numpy, onnxruntime and Pillow.
|
| 5 |
+
|
| 6 |
+
python inference.py <table_image.png>
|
| 7 |
+
|
| 8 |
+
Pipeline: encoder.onnx -> greedy loop over decoder.onnx -> bbox_head.onnx.
|
| 9 |
+
The decoder graph is cache-free and re-runs the whole prefix each step
|
| 10 |
+
(vocab is 13 tokens, sequences are short, so this is cheap).
|
| 11 |
+
|
| 12 |
+
Outputs OTSL structure tokens and one bbox per data cell, xyxy normalized
|
| 13 |
+
to [0, 1] relative to the input table crop.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import sys
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
import numpy as np
|
| 20 |
+
import onnxruntime as ort
|
| 21 |
+
from PIL import Image
|
| 22 |
+
|
| 23 |
+
# Directory containing this script; the model class uses it as its default model_dir.
HERE = Path(__file__).parent

# Structure vocabulary in token-id order (list index == token id).
_CELL_NAMES = ("ecel", "fcel", "lcel", "ucel", "xcel", "nl", "ched", "rhed", "srow")
ID2TOKEN = ["<pad>", "[UNK]", "<start>", "<end>"] + [f"<{name}>" for name in _CELL_NAMES]
BOS_ID = ID2TOKEN.index("<start>")
EOS_ID = ID2TOKEN.index("<end>")
# Ids of the tokens that receive a bounding box.
DATA_CELL_IDS = {
    ID2TOKEN.index(f"<{name}>") for name in ("ecel", "fcel", "ched", "rhed", "srow")
}

IMAGE_SIZE = 448
# ImageNet per-channel statistics, applied after pixel values are scaled to [0, 1].
MEAN = np.array((0.485, 0.456, 0.406), dtype=np.float32)
STD = np.array((0.229, 0.224, 0.225), dtype=np.float32)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def preprocess(image: Image.Image) -> np.ndarray:
    """Turn a table crop into the encoder input.

    Converts to RGB, resizes to IMAGE_SIZE x IMAGE_SIZE (bilinear, aspect
    ratio not preserved), scales pixels to [0, 1] and normalizes each channel
    with MEAN / STD.

    Returns:
        float32 array of shape (1, 3, IMAGE_SIZE, IMAGE_SIZE) in
        (batch, channels, height, width) order.
    """
    resized = image.convert("RGB").resize((IMAGE_SIZE, IMAGE_SIZE), Image.BILINEAR)
    scaled = np.asarray(resized, dtype=np.float32) / 255.0
    # MEAN / STD broadcast over the trailing channel axis of the HWC array.
    normalized = (scaled - MEAN) / STD
    channels_first = normalized.transpose(2, 0, 1)  # HWC -> CHW
    return channels_first[np.newaxis]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class TableFormerV2Onnx:
    """ONNX Runtime wrapper around the three TableFormerV2 graphs.

    Holds one session each for encoder.onnx, decoder.onnx and bbox_head.onnx
    and drives them from predict().
    """

    def __init__(self, model_dir=HERE, providers=("CPUExecutionProvider",)):
        """Open the three inference sessions.

        Args:
            model_dir: directory that contains the three .onnx files.
            providers: ONNX Runtime execution providers passed to every session.
        """
        root = Path(model_dir)
        provider_list = list(providers)

        def open_session(filename):
            return ort.InferenceSession(str(root / filename), providers=provider_list)

        self.encoder = open_session("encoder.onnx")
        self.decoder = open_session("decoder.onnx")
        self.bbox_head = open_session("bbox_head.onnx")

    def _run_decoder(self, ids, enc_hidden):
        """Run the decoder on the whole prefix; returns its two outputs as a list."""
        return self.decoder.run(None, {"input_ids": ids, "encoder_hidden": enc_hidden})

    def predict(self, image: Image.Image, max_length: int = 512):
        """Greedily decode the structure tokens, then regress the cell boxes.

        Args:
            image: PIL image of the table crop.
            max_length: maximum number of decoding steps.

        Returns:
            (otsl_tokens, bboxes): token strings with "<start>", "<end>" and
            "<pad>" removed, and a (num_cells, 4) array of xyxy boxes in
            [0, 1]; the array is empty when no data-cell token was produced.
        """
        (enc_hidden,) = self.encoder.run(None, {"images": preprocess(image)})

        # Greedy generation: the decoder has no cache, so each step re-runs the
        # full prefix and only the logits at the last position are used.
        ids = np.array([[BOS_ID]], dtype=np.int64)
        for _step in range(max_length):
            step_logits, _unused = self._run_decoder(ids, enc_hidden)
            next_id = int(np.argmax(step_logits[0, -1]))
            ids = np.concatenate((ids, [[next_id]]), axis=1)
            if next_id == EOS_ID:
                break

        # One more pass over the finished sequence to obtain the hidden states
        # for every position, including the token appended last.
        _unused, hidden = self._run_decoder(ids, enc_hidden)
        sequence = ids[0].tolist()
        cell_pos = [pos for pos, tok in enumerate(sequence) if tok in DATA_CELL_IDS]
        if not cell_pos:
            bboxes = np.zeros((0, 4), dtype=np.float32)
        else:
            (bboxes,) = self.bbox_head.run(
                None,
                {"cell_embeddings": hidden[0, cell_pos], "encoder_hidden": enc_hidden},
            )

        dropped = (BOS_ID, EOS_ID, 0)  # start, end and pad never reach the caller
        tokens = [ID2TOKEN[tok] for tok in sequence if tok not in dropped]
        return tokens, bboxes
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
if __name__ == "__main__":
    # Command-line entry point: run the model on one image and print the
    # OTSL tokens followed by one pixel-space box per data cell.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <table_image>")

    # The context manager closes the image file handle once prediction is done;
    # the scale vector is built first so it can be used after the file is closed.
    with Image.open(sys.argv[1]) as img:
        scale = np.array([img.width, img.height, img.width, img.height])
        model = TableFormerV2Onnx()
        tokens, bboxes = model.predict(img)

    print("OTSL:", " ".join(tokens))
    print(f"{len(bboxes)} cells (xyxy in original image pixels):")
    # Normalized xyxy -> pixel coordinates of the original image.
    for b in bboxes * scale:
        print(f"  ({b[0]:7.1f}, {b[1]:7.1f}, {b[2]:7.1f}, {b[3]:7.1f})")
|