{ "image_size": 320, "patch_size": 16, "num_channels": 3, "color_order": "RGB", "resize_mode": "letterbox", "pad_color_rgb": [114, 114, 114], "normalize_mean": [0.5, 0.5, 0.5], "normalize_std": [0.5, 0.5, 0.5], "input_dtype": "float32", "input_layout": "BCHW", "onnx_inputs": { "pixel_values": { "shape": "(batch_size, 3, 320, 320)", "dtype": "float32", "description": "Letterboxed and normalized image tensor. Preprocessing is NOT in the graph; do it externally." }, "padding_mask": { "shape": "(batch_size, 320, 320)", "dtype": "bool", "description": "True = padded pixel, False = valid pixel. Pass an all-False mask if your image fills the frame." } }, "onnx_outputs": { "probabilities": { "shape": "(batch_size, 19294)", "dtype": "float32", "activation": "sigmoid (already applied inside the graph)" } }, "opset_version": 21, "dynamic_batch": true, "embedded_metadata": { "vocabulary": "Embedded as gzip+base64 in the ONNX metadata_props (key: vocab_b64_gzip).", "tags_csv": "selected_tags.csv mirrors index_to_tag for SmilingWolf-style tagger UIs." }, "notes": [ "Letterbox resize keeps aspect ratio; pad with pad_color_rgb (114, 114, 114) to reach 320x320.", "Normalize per-channel after letterboxing: (x/255 - normalize_mean) / normalize_std, which maps 0-255 pixel values to [-1, 1].", "Recommended thresholds are in pr_thresholds.json (per-tag and global)." ] }