{
  "image_size": 320,
  "patch_size": 16,
  "num_channels": 3,
  "color_order": "RGB",
  "resize_mode": "letterbox",
  "pad_color_rgb": [114, 114, 114],
  "normalize_mean": [0.5, 0.5, 0.5],
  "normalize_std": [0.5, 0.5, 0.5],
  "input_dtype": "float32",
  "input_layout": "BCHW",
  "onnx_inputs": {
    "pixel_values": {
      "shape": "(batch_size, 3, 320, 320)",
      "dtype": "float32",
      "description": "Letterboxed and normalized image tensor. Preprocessing is NOT in the graph; do it externally."
    },
    "padding_mask": {
      "shape": "(batch_size, 320, 320)",
      "dtype": "bool",
      "description": "True = padded pixel, False = valid pixel. Pass an all-False mask if your image fills the frame."
    }
  },
  "onnx_outputs": {
    "probabilities": {
      "shape": "(batch_size, 19294)",
      "dtype": "float32",
      "activation": "sigmoid (already applied inside the graph)"
    }
  },
  "opset_version": 21,
  "dynamic_batch": true,
  "embedded_metadata": {
    "vocabulary": "Embedded as gzip+base64 in the ONNX metadata_props (key: vocab_b64_gzip).",
    "tags_csv": "selected_tags.csv mirrors index_to_tag for SmilingWolf-style tagger UIs."
  },
  "notes": [
    "Letterbox resize keeps aspect ratio; pad with the RGB color above to reach 320x320.",
    "Normalize per-channel: (x/255 - mean) / std after letterboxing.",
    "Recommended thresholds are in pr_thresholds.json (per-tag and global)."
  ]
}