19x3 committed on
Commit
3c19d8e
·
1 Parent(s): 4ea0f93
tableformer_accurate/.DS_Store ADDED
Binary file (6.15 kB). View file
 
tableformer_accurate/bbox_decoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e0f89346a71eb7ebd429ce6fcb398732598c3fbec95438adcb719409f98d125
3
+ size 39633512
tableformer_accurate/decoder_step.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ab858a0c9207649bf8ad35877c7eb710917c4f3b9877e0ce05a8cf239c8c127
3
+ size 77940061
tableformer_accurate/encoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09e51bac97a75e45b6ab264b5207b365a7553ba6a7e1ac67a79ed78ae91b4a3b
3
+ size 95248337
tableformer_accurate/inference.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """TableFormer v1 ONNX inference — table structure recognition.
3
+
4
+ Self-contained: requires only numpy, onnxruntime and Pillow.
5
+ The same script works for both the fast and accurate variants
6
+ (the decoder layer count is read from the cache input shape).
7
+
8
+ python inference.py <table_image.png>
9
+
10
+ Pipeline: encoder.onnx -> greedy loop over decoder_step.onnx -> bbox_decoder.onnx.
11
+ The decoder computes one token per call; `cache` carries each decoder layer's
12
+ per-position outputs across steps (shape (num_layers, L, 1, 512); pass length 0
13
+ on the first step).
14
+
15
+ Replicates TableModel04_rs.predict from docling-ibm-models, including its
16
+ structure-error corrections and horizontal-span bbox merging.
17
+
18
+ Outputs OTSL structure tokens and one bbox per cell, cxcywh normalized to
19
+ [0, 1] relative to the input table crop.
20
+ """
21
+
22
+ import sys
23
+ from pathlib import Path
24
+
25
+ import numpy as np
26
+ import onnxruntime as ort
27
+ from PIL import Image
28
+
29
# Default model directory: the folder this script lives in.
HERE = Path(__file__).parent

# OTSL vocabulary in id order; a token's position in the tuple is its id.
_VOCAB = (
    "<pad>", "<unk>", "<start>", "<end>", "ecel", "fcel", "lcel",
    "ucel", "xcel", "nl", "ched", "rhed", "srow",
)
WM = {token: index for index, token in enumerate(_VOCAB)}  # token -> id
ID2TOKEN = dict(enumerate(_VOCAB))  # id -> token
MAX_STEPS = 1024  # upper bound on the number of tags generated per table

IMAGE_SIZE = 448  # resized without keeping aspect ratio (per tm_config.json)
# Per-channel statistics used to normalize pixels after scaling to [0, 1].
MEAN = np.asarray((0.94247851, 0.94254675, 0.94292611), dtype=np.float32)
STD = np.asarray((0.17910956, 0.17940403, 0.17931663), dtype=np.float32)
41
+
42
+
43
def preprocess(image: Image.Image) -> np.ndarray:
    """Convert a table crop into the encoder's input tensor.

    The image is converted to RGB, resized to IMAGE_SIZE x IMAGE_SIZE with
    bilinear resampling (aspect ratio is not preserved), scaled to [0, 1] and
    normalized per channel with MEAN / STD.

    Note: the spatial axes of the result are TRANSPOSED with respect to the
    usual (C, H, W) layout: the tensor is (1, channels, width, height). Per
    the original author's note this mirrors docling's
    TFPredictor._prepare_image, which is how the model was trained.
    predict() in this file consumes the decoder's boxes as returned; it does
    not apply any x/y swap.

    Args:
        image: Table crop as a PIL image (any mode convertible to RGB).

    Returns:
        C-contiguous float32 array of shape (1, 3, IMAGE_SIZE, IMAGE_SIZE).
    """
    resized = image.convert("RGB").resize((IMAGE_SIZE, IMAGE_SIZE), Image.BILINEAR)
    x = np.asarray(resized, dtype=np.float32) / 255.0  # (H, W, C) in [0, 1]
    x = (x - MEAN) / STD  # MEAN / STD broadcast over the trailing channel axis
    # transpose() only yields a strided view; return a dense buffer instead.
    return np.ascontiguousarray(x.transpose(2, 1, 0)[None])  # (1, C, W, H)
54
+
55
+
56
class TableFormerOnnx:
    """TableFormer v1 table-structure recognizer backed by three ONNX graphs.

    Pipeline: encoder.onnx -> greedy loop over decoder_step.onnx ->
    bbox_decoder.onnx.
    """

    def __init__(self, model_dir=HERE, providers=("CPUExecutionProvider",)):
        """Load the three ONNX graphs found in model_dir.

        Args:
            model_dir: Directory holding encoder.onnx, decoder_step.onnx and
                bbox_decoder.onnx. Defaults to this script's directory.
            providers: ONNX Runtime execution providers, in priority order.

        Raises:
            ValueError: If decoder_step.onnx has no rank-4 "cache" input, or
                if that input's first / last dimensions are not static ints.
        """
        model_dir = Path(model_dir)
        p = list(providers)
        self.encoder = ort.InferenceSession(str(model_dir / "encoder.onnx"), providers=p)
        self.decoder = ort.InferenceSession(str(model_dir / "decoder_step.onnx"), providers=p)
        self.bbox = ort.InferenceSession(str(model_dir / "bbox_decoder.onnx"), providers=p)

        # The "cache" input is (num_layers, L, 1, hidden_dim); reading the
        # sizes from the graph lets one script serve every model variant.
        cache_shape = next(
            (i.shape for i in self.decoder.get_inputs() if i.name == "cache"), None
        )
        if cache_shape is None or len(cache_shape) != 4:
            raise ValueError(
                "decoder_step.onnx must expose a rank-4 'cache' input, "
                f"got shape {cache_shape!r}"
            )
        num_layers, hidden_dim = cache_shape[0], cache_shape[3]
        # Dynamic axes are reported as names or None, which cannot be used to
        # allocate the initial empty cache in predict().
        if not (isinstance(num_layers, int) and isinstance(hidden_dim, int)):
            raise ValueError(
                "'cache' input needs static layer and hidden dimensions, "
                f"got shape {cache_shape!r}"
            )
        self.num_layers, self.hidden_dim = num_layers, hidden_dim

    @staticmethod
    def _merge_span(b1, b2):
        """Return the cxcywh box reaching from span start b1 to span end b2."""
        w = (b2[0] + b2[2] / 2) - (b1[0] - b1[2] / 2)
        h = (b2[1] + b2[3] / 2) - (b1[1] - b1[3] / 2)
        left = b1[0] - b1[2] / 2
        top = min(b2[1] - b2[3] / 2, b1[1] - b1[3] / 2)
        return [left + w / 2, top + h / 2, w, h]

    def predict(self, image: Image.Image):
        """Run structure recognition on one table crop.

        Tags are decoded greedily, one per decoder call, until "<end>" is
        produced or MAX_STEPS tags have been generated. The hidden states kept
        for cell tags are then passed to the bbox decoder, and the first and
        last box of every horizontal span are merged into a single box.

        A span whose end was never recorded is emitted unmerged. A literal
        port would index coords[-1] for it and merge the span start with the
        last box of the table.

        Args:
            image: Table crop as a PIL image.

        Returns:
            (otsl_tokens, classes, bboxes) where otsl_tokens is a list of tag
            strings without "<end>", classes is (num_cells, 3) logits and
            bboxes is (num_cells, 4) cxcywh in [0, 1]. Both arrays have zero
            rows when no cell tag was produced.
        """
        x = preprocess(image)
        enc_out, memory = self.encoder.run(None, {"image": x})

        decoded = np.array([[WM["<start>"]]], dtype=np.int64)  # (L, 1)
        # Zero-length cache on the first step; the decoder returns the grown one.
        cache = np.zeros((self.num_layers, 0, 1, self.hidden_dim), dtype=np.float32)
        output_tags, tag_H_buf = [], []
        # skip_next_tag: the upcoming tag gets no bbox hidden state (set after
        # nl / ucel / xcel, and for the very first tag).
        skip_next_tag, prev_tag_ucel = True, False
        # first_lcel: the next lcel opens a new horizontal span.
        # bboxes_to_merge: span start index -> span end index (-1 while open).
        first_lcel, bboxes_to_merge, cur_bbox_ind, bbox_ind = True, {}, -1, 0
        line_num = 0  # number of "nl" tags seen so far

        while len(output_tags) < MAX_STEPS:
            logits, tag_H, cache = self.decoder.run(
                None, {"decoded_tags": decoded, "memory": memory, "cache": cache}
            )
            new_tag = int(logits[0].argmax())

            # Structure error corrections (as in TableModel04_rs.predict)
            if line_num == 0 and new_tag == WM["xcel"]:
                new_tag = WM["lcel"]
            if prev_tag_ucel and new_tag == WM["lcel"]:
                new_tag = WM["fcel"]

            output_tags.append(new_tag)
            if new_tag == WM["<end>"]:
                break
            if new_tag == WM["nl"]:
                line_num += 1

            # Keep one hidden state per cell for the bbox decoder
            if not skip_next_tag and new_tag in (
                WM["fcel"], WM["ecel"], WM["ched"], WM["rhed"],
                WM["srow"], WM["nl"], WM["ucel"],
            ):
                tag_H_buf.append(tag_H)
                if not first_lcel:
                    # This tag closes the currently open horizontal span.
                    bboxes_to_merge[cur_bbox_ind] = bbox_ind
                bbox_ind += 1

            if new_tag != WM["lcel"]:
                first_lcel = True
            elif first_lcel:  # start of a horizontal span
                tag_H_buf.append(tag_H)
                first_lcel = False
                cur_bbox_ind = bbox_ind
                bboxes_to_merge[cur_bbox_ind] = -1
                bbox_ind += 1

            skip_next_tag = new_tag in (WM["nl"], WM["ucel"], WM["xcel"])
            prev_tag_ucel = new_tag == WM["ucel"]
            decoded = np.concatenate([decoded, [[new_tag]]], axis=0)

        tokens = [ID2TOKEN[t] for t in output_tags if t != WM["<end>"]]
        if not tag_H_buf:
            return tokens, np.zeros((0, 3), np.float32), np.zeros((0, 4), np.float32)

        classes, coords = self.bbox.run(
            None, {"enc_out": enc_out, "tag_H": np.concatenate(tag_H_buf, axis=0)}
        )

        # Merge first/last bbox of each horizontal span (cxcywh)
        out_cls, out_coord, skip = [], [], set()
        for i, box in enumerate(coords):
            end = bboxes_to_merge.get(i, -1)
            if end >= 0:
                skip.add(end)  # the span end is absorbed into the merged box
                out_coord.append(self._merge_span(box, coords[end]))
                out_cls.append(classes[i])
            elif i not in skip:
                # Ordinary cell, or a span start that was never closed.
                out_coord.append(box.tolist())
                out_cls.append(classes[i])
        return tokens, np.array(out_cls), np.array(out_coord, dtype=np.float32)
149
+
150
+
151
if __name__ == "__main__":
    # Command-line entry point: print the OTSL tags and per-cell pixel boxes.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <table_image>")
    # Keep the file open until predict() has read the pixels, then close it
    # deterministically; the size is captured while the image is still open.
    with Image.open(sys.argv[1]) as img:
        width, height = img.size
        model = TableFormerOnnx()
        tokens, classes, bboxes = model.predict(img)

    print("OTSL:", " ".join(tokens))
    print(f"{len(bboxes)} cells (xyxy in original image pixels):")
    for cx, cy, w, h in bboxes:
        # Normalized cxcywh -> pixel xyxy in the original image.
        x1, y1 = (cx - w / 2) * width, (cy - h / 2) * height
        x2, y2 = (cx + w / 2) * width, (cy + h / 2) * height
        print(f" ({x1:7.1f}, {y1:7.1f}, {x2:7.1f}, {y2:7.1f})")
tableformer_fast/.DS_Store ADDED
Binary file (6.15 kB). View file
 
tableformer_fast/bbox_decoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35300de20645552bed0967c2dc79c0944cfed12f04600116a1c92920d1ff8329
3
+ size 39633512
tableformer_fast/decoder_step.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:696b51d1b86fed94dbdb46691465cf887fdd420f1efa9681b96fe37090601990
3
+ size 27414748
tableformer_fast/encoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a749e9b517c1a1e8debf4e91776453a3858efb910d75bd9961f43e4d489292a1
3
+ size 78402523
tableformer_fast/inference.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """TableFormer v1 ONNX inference — table structure recognition.
3
+
4
+ Self-contained: requires only numpy, onnxruntime and Pillow.
5
+ The same script works for both the fast and accurate variants
6
+ (the decoder layer count is read from the cache input shape).
7
+
8
+ python inference.py <table_image.png>
9
+
10
+ Pipeline: encoder.onnx -> greedy loop over decoder_step.onnx -> bbox_decoder.onnx.
11
+ The decoder computes one token per call; `cache` carries each decoder layer's
12
+ per-position outputs across steps (shape (num_layers, L, 1, 512); pass length 0
13
+ on the first step).
14
+
15
+ Replicates TableModel04_rs.predict from docling-ibm-models, including its
16
+ structure-error corrections and horizontal-span bbox merging.
17
+
18
+ Outputs OTSL structure tokens and one bbox per cell, cxcywh normalized to
19
+ [0, 1] relative to the input table crop.
20
+ """
21
+
22
+ import sys
23
+ from pathlib import Path
24
+
25
+ import numpy as np
26
+ import onnxruntime as ort
27
+ from PIL import Image
28
+
29
# Default model directory: the folder this script lives in.
HERE = Path(__file__).parent

# OTSL vocabulary in id order; a token's position in the tuple is its id.
_VOCAB = (
    "<pad>", "<unk>", "<start>", "<end>", "ecel", "fcel", "lcel",
    "ucel", "xcel", "nl", "ched", "rhed", "srow",
)
WM = {token: index for index, token in enumerate(_VOCAB)}  # token -> id
ID2TOKEN = dict(enumerate(_VOCAB))  # id -> token
MAX_STEPS = 1024  # upper bound on the number of tags generated per table

IMAGE_SIZE = 448  # resized without keeping aspect ratio (per tm_config.json)
# Per-channel statistics used to normalize pixels after scaling to [0, 1].
MEAN = np.asarray((0.94247851, 0.94254675, 0.94292611), dtype=np.float32)
STD = np.asarray((0.17910956, 0.17940403, 0.17931663), dtype=np.float32)
41
+
42
+
43
def preprocess(image: Image.Image) -> np.ndarray:
    """Convert a table crop into the encoder's input tensor.

    The image is converted to RGB, resized to IMAGE_SIZE x IMAGE_SIZE with
    bilinear resampling (aspect ratio is not preserved), scaled to [0, 1] and
    normalized per channel with MEAN / STD.

    Note: the spatial axes of the result are TRANSPOSED with respect to the
    usual (C, H, W) layout: the tensor is (1, channels, width, height). Per
    the original author's note this mirrors docling's
    TFPredictor._prepare_image, which is how the model was trained.
    predict() in this file consumes the decoder's boxes as returned; it does
    not apply any x/y swap.

    Args:
        image: Table crop as a PIL image (any mode convertible to RGB).

    Returns:
        C-contiguous float32 array of shape (1, 3, IMAGE_SIZE, IMAGE_SIZE).
    """
    resized = image.convert("RGB").resize((IMAGE_SIZE, IMAGE_SIZE), Image.BILINEAR)
    x = np.asarray(resized, dtype=np.float32) / 255.0  # (H, W, C) in [0, 1]
    x = (x - MEAN) / STD  # MEAN / STD broadcast over the trailing channel axis
    # transpose() only yields a strided view; return a dense buffer instead.
    return np.ascontiguousarray(x.transpose(2, 1, 0)[None])  # (1, C, W, H)
54
+
55
+
56
class TableFormerOnnx:
    """TableFormer v1 table-structure recognizer backed by three ONNX graphs.

    Pipeline: encoder.onnx -> greedy loop over decoder_step.onnx ->
    bbox_decoder.onnx.
    """

    def __init__(self, model_dir=HERE, providers=("CPUExecutionProvider",)):
        """Load the three ONNX graphs found in model_dir.

        Args:
            model_dir: Directory holding encoder.onnx, decoder_step.onnx and
                bbox_decoder.onnx. Defaults to this script's directory.
            providers: ONNX Runtime execution providers, in priority order.

        Raises:
            ValueError: If decoder_step.onnx has no rank-4 "cache" input, or
                if that input's first / last dimensions are not static ints.
        """
        model_dir = Path(model_dir)
        p = list(providers)
        self.encoder = ort.InferenceSession(str(model_dir / "encoder.onnx"), providers=p)
        self.decoder = ort.InferenceSession(str(model_dir / "decoder_step.onnx"), providers=p)
        self.bbox = ort.InferenceSession(str(model_dir / "bbox_decoder.onnx"), providers=p)

        # The "cache" input is (num_layers, L, 1, hidden_dim); reading the
        # sizes from the graph lets one script serve every model variant.
        cache_shape = next(
            (i.shape for i in self.decoder.get_inputs() if i.name == "cache"), None
        )
        if cache_shape is None or len(cache_shape) != 4:
            raise ValueError(
                "decoder_step.onnx must expose a rank-4 'cache' input, "
                f"got shape {cache_shape!r}"
            )
        num_layers, hidden_dim = cache_shape[0], cache_shape[3]
        # Dynamic axes are reported as names or None, which cannot be used to
        # allocate the initial empty cache in predict().
        if not (isinstance(num_layers, int) and isinstance(hidden_dim, int)):
            raise ValueError(
                "'cache' input needs static layer and hidden dimensions, "
                f"got shape {cache_shape!r}"
            )
        self.num_layers, self.hidden_dim = num_layers, hidden_dim

    @staticmethod
    def _merge_span(b1, b2):
        """Return the cxcywh box reaching from span start b1 to span end b2."""
        w = (b2[0] + b2[2] / 2) - (b1[0] - b1[2] / 2)
        h = (b2[1] + b2[3] / 2) - (b1[1] - b1[3] / 2)
        left = b1[0] - b1[2] / 2
        top = min(b2[1] - b2[3] / 2, b1[1] - b1[3] / 2)
        return [left + w / 2, top + h / 2, w, h]

    def predict(self, image: Image.Image):
        """Run structure recognition on one table crop.

        Tags are decoded greedily, one per decoder call, until "<end>" is
        produced or MAX_STEPS tags have been generated. The hidden states kept
        for cell tags are then passed to the bbox decoder, and the first and
        last box of every horizontal span are merged into a single box.

        A span whose end was never recorded is emitted unmerged. A literal
        port would index coords[-1] for it and merge the span start with the
        last box of the table.

        Args:
            image: Table crop as a PIL image.

        Returns:
            (otsl_tokens, classes, bboxes) where otsl_tokens is a list of tag
            strings without "<end>", classes is (num_cells, 3) logits and
            bboxes is (num_cells, 4) cxcywh in [0, 1]. Both arrays have zero
            rows when no cell tag was produced.
        """
        x = preprocess(image)
        enc_out, memory = self.encoder.run(None, {"image": x})

        decoded = np.array([[WM["<start>"]]], dtype=np.int64)  # (L, 1)
        # Zero-length cache on the first step; the decoder returns the grown one.
        cache = np.zeros((self.num_layers, 0, 1, self.hidden_dim), dtype=np.float32)
        output_tags, tag_H_buf = [], []
        # skip_next_tag: the upcoming tag gets no bbox hidden state (set after
        # nl / ucel / xcel, and for the very first tag).
        skip_next_tag, prev_tag_ucel = True, False
        # first_lcel: the next lcel opens a new horizontal span.
        # bboxes_to_merge: span start index -> span end index (-1 while open).
        first_lcel, bboxes_to_merge, cur_bbox_ind, bbox_ind = True, {}, -1, 0
        line_num = 0  # number of "nl" tags seen so far

        while len(output_tags) < MAX_STEPS:
            logits, tag_H, cache = self.decoder.run(
                None, {"decoded_tags": decoded, "memory": memory, "cache": cache}
            )
            new_tag = int(logits[0].argmax())

            # Structure error corrections (as in TableModel04_rs.predict)
            if line_num == 0 and new_tag == WM["xcel"]:
                new_tag = WM["lcel"]
            if prev_tag_ucel and new_tag == WM["lcel"]:
                new_tag = WM["fcel"]

            output_tags.append(new_tag)
            if new_tag == WM["<end>"]:
                break
            if new_tag == WM["nl"]:
                line_num += 1

            # Keep one hidden state per cell for the bbox decoder
            if not skip_next_tag and new_tag in (
                WM["fcel"], WM["ecel"], WM["ched"], WM["rhed"],
                WM["srow"], WM["nl"], WM["ucel"],
            ):
                tag_H_buf.append(tag_H)
                if not first_lcel:
                    # This tag closes the currently open horizontal span.
                    bboxes_to_merge[cur_bbox_ind] = bbox_ind
                bbox_ind += 1

            if new_tag != WM["lcel"]:
                first_lcel = True
            elif first_lcel:  # start of a horizontal span
                tag_H_buf.append(tag_H)
                first_lcel = False
                cur_bbox_ind = bbox_ind
                bboxes_to_merge[cur_bbox_ind] = -1
                bbox_ind += 1

            skip_next_tag = new_tag in (WM["nl"], WM["ucel"], WM["xcel"])
            prev_tag_ucel = new_tag == WM["ucel"]
            decoded = np.concatenate([decoded, [[new_tag]]], axis=0)

        tokens = [ID2TOKEN[t] for t in output_tags if t != WM["<end>"]]
        if not tag_H_buf:
            return tokens, np.zeros((0, 3), np.float32), np.zeros((0, 4), np.float32)

        classes, coords = self.bbox.run(
            None, {"enc_out": enc_out, "tag_H": np.concatenate(tag_H_buf, axis=0)}
        )

        # Merge first/last bbox of each horizontal span (cxcywh)
        out_cls, out_coord, skip = [], [], set()
        for i, box in enumerate(coords):
            end = bboxes_to_merge.get(i, -1)
            if end >= 0:
                skip.add(end)  # the span end is absorbed into the merged box
                out_coord.append(self._merge_span(box, coords[end]))
                out_cls.append(classes[i])
            elif i not in skip:
                # Ordinary cell, or a span start that was never closed.
                out_coord.append(box.tolist())
                out_cls.append(classes[i])
        return tokens, np.array(out_cls), np.array(out_coord, dtype=np.float32)
149
+
150
+
151
if __name__ == "__main__":
    # Command-line entry point: print the OTSL tags and per-cell pixel boxes.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <table_image>")
    # Keep the file open until predict() has read the pixels, then close it
    # deterministically; the size is captured while the image is still open.
    with Image.open(sys.argv[1]) as img:
        width, height = img.size
        model = TableFormerOnnx()
        tokens, classes, bboxes = model.predict(img)

    print("OTSL:", " ".join(tokens))
    print(f"{len(bboxes)} cells (xyxy in original image pixels):")
    for cx, cy, w, h in bboxes:
        # Normalized cxcywh -> pixel xyxy in the original image.
        x1, y1 = (cx - w / 2) * width, (cy - h / 2) * height
        x2, y2 = (cx + w / 2) * width, (cy + h / 2) * height
        print(f" ({x1:7.1f}, {y1:7.1f}, {x2:7.1f}, {y2:7.1f})")
tableformer_v2/.DS_Store ADDED
Binary file (6.15 kB). View file
 
tableformer_v2/bbox_head.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:945d677fbe54f922e5027ea70a6e584113ff8e2d92f5d7317d2b762bc796f922
3
+ size 37353806
tableformer_v2/decoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:744f2922fca1583f2152e196e05fe989250681f2139ab386c2dfc3e7d2b6d733
3
+ size 68420864
tableformer_v2/encoder.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6634b70eb00b8377992ecd8554964a18bfc2feae2b4c3d3286433f330b4c9591
3
+ size 98118237
tableformer_v2/inference.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """TableFormerV2 ONNX inference — table structure recognition.
3
+
4
+ Self-contained: requires only numpy, onnxruntime and Pillow.
5
+
6
+ python inference.py <table_image.png>
7
+
8
+ Pipeline: encoder.onnx -> greedy loop over decoder.onnx -> bbox_head.onnx.
9
+ The decoder graph is cache-free and re-runs the whole prefix each step
10
+ (vocab is 13 tokens, sequences are short, so this is cheap).
11
+
12
+ Outputs OTSL structure tokens and one bbox per data cell, xyxy normalized
13
+ to [0, 1] relative to the input table crop.
14
+ """
15
+
16
+ import sys
17
+ from pathlib import Path
18
+
19
+ import numpy as np
20
+ import onnxruntime as ort
21
+ from PIL import Image
22
+
23
# Default model directory: the folder this script lives in.
HERE = Path(__file__).parent

# Vocabulary in id order; a token's list index is its id.
ID2TOKEN = [
    "<pad>",
    "[UNK]",
    "<start>",
    "<end>",
    "<ecel>",
    "<fcel>",
    "<lcel>",
    "<ucel>",
    "<xcel>",
    "<nl>",
    "<ched>",
    "<rhed>",
    "<srow>",
]
BOS_ID = ID2TOKEN.index("<start>")
EOS_ID = ID2TOKEN.index("<end>")
# Ids of the tags that receive a bounding box: ecel, fcel, ched, rhed, srow.
DATA_CELL_IDS = {
    ID2TOKEN.index(name)
    for name in ("<ecel>", "<fcel>", "<ched>", "<rhed>", "<srow>")
}

IMAGE_SIZE = 448
MEAN = np.asarray((0.485, 0.456, 0.406), dtype=np.float32)  # ImageNet
STD = np.asarray((0.229, 0.224, 0.225), dtype=np.float32)
35
+
36
+
37
def preprocess(image: Image.Image) -> np.ndarray:
    """Turn an RGB table crop into a (1, 3, 448, 448) float32 batch.

    The crop is converted to RGB, resized to IMAGE_SIZE x IMAGE_SIZE with
    bilinear resampling, scaled to [0, 1], normalized with the ImageNet
    MEAN / STD and rearranged from (H, W, C) to (C, H, W).
    """
    rgb = image.convert("RGB")
    resized = rgb.resize((IMAGE_SIZE, IMAGE_SIZE), Image.BILINEAR)
    pixels = np.asarray(resized, dtype=np.float32) / 255.0
    normalized = (pixels - MEAN) / STD
    channels_first = normalized.transpose(2, 0, 1)
    return channels_first[None]  # prepend the batch axis
43
+
44
+
45
class TableFormerV2Onnx:
    """TableFormerV2 recognizer built from three ONNX Runtime sessions.

    Pipeline: encoder.onnx -> greedy loop over decoder.onnx -> bbox_head.onnx.
    """

    def __init__(self, model_dir=HERE, providers=("CPUExecutionProvider",)):
        """Open encoder.onnx, decoder.onnx and bbox_head.onnx from model_dir.

        Args:
            model_dir: Directory containing the three graphs. Defaults to
                this script's directory.
            providers: ONNX Runtime execution providers, in priority order.
        """
        root = Path(model_dir)
        provider_list = list(providers)

        def open_session(filename):
            return ort.InferenceSession(str(root / filename), providers=provider_list)

        self.encoder = open_session("encoder.onnx")
        self.decoder = open_session("decoder.onnx")
        self.bbox_head = open_session("bbox_head.onnx")

    def predict(self, image: Image.Image, max_length: int = 512):
        """Return (otsl_tokens, bboxes) for one table crop.

        Args:
            image: Table crop as a PIL image.
            max_length: Maximum number of tokens to generate after <start>.

        Returns:
            otsl_tokens: generated tag strings, with <start>, <end> and <pad>
                removed.
            bboxes: (num_cells, 4) xyxy in [0, 1], one row per data-cell tag;
                zero rows when no data-cell tag was generated.
        """
        pixel_batch = preprocess(image)
        (enc_hidden,) = self.encoder.run(None, {"images": pixel_batch})

        # Greedy generation: the cache-free decoder re-reads the whole prefix
        # on every step and only the logits at the last position are used.
        ids = np.array([[BOS_ID]], dtype=np.int64)
        for _step in range(max_length):
            step_logits, _step_hidden = self.decoder.run(
                None, {"input_ids": ids, "encoder_hidden": enc_hidden}
            )
            token_id = int(np.argmax(step_logits[0, -1]))
            ids = np.concatenate([ids, [[token_id]]], axis=1)
            if token_id == EOS_ID:
                break

        # One more pass over the finished sequence to obtain a hidden state
        # for every position, including the token appended last.
        _final_logits, hidden = self.decoder.run(
            None, {"input_ids": ids, "encoder_hidden": enc_hidden}
        )
        sequence = ids[0].tolist()

        cell_pos = [pos for pos, tok in enumerate(sequence) if tok in DATA_CELL_IDS]
        bboxes = np.zeros((0, 4), dtype=np.float32)
        if cell_pos:
            head_inputs = {
                "cell_embeddings": hidden[0, cell_pos],
                "encoder_hidden": enc_hidden,
            }
            (bboxes,) = self.bbox_head.run(None, head_inputs)

        hidden_ids = (BOS_ID, EOS_ID, 0)  # <start>, <end>, <pad>
        tokens = [ID2TOKEN[tok] for tok in sequence if tok not in hidden_ids]
        return tokens, bboxes
86
+
87
+
88
if __name__ == "__main__":
    # Command-line entry point: print the OTSL tags and per-cell pixel boxes.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <table_image>")
    # Keep the file open until predict() has read the pixels, then close it
    # deterministically; the size is captured while the image is still open.
    with Image.open(sys.argv[1]) as img:
        width, height = img.size
        model = TableFormerV2Onnx()
        tokens, bboxes = model.predict(img)

    print("OTSL:", " ".join(tokens))
    print(f"{len(bboxes)} cells (xyxy in original image pixels):")
    # Normalized xyxy -> pixel xyxy in the original image.
    scale = np.array([width, height, width, height])
    for b in bboxes * scale:
        print(f" ({b[0]:7.1f}, {b[1]:7.1f}, {b[2]:7.1f}, {b[3]:7.1f})")