{
  "title": "Edge Inference Benchmarks",
  "subtitle": "Compare throughput and latency across devices and model variants.",
  "model_column": "model",
  "model_family_column": "model_family",
  "model_link_prefix": "https://huggingface.co/",
  "optimized_org": "embedl",
  "filters": [
    {
      "column": "type",
      "label": "Modality"
    },
    {
      "column": "batch",
      "label": "Batch Size",
      "type": "number"
    },
    {
      "column": "device",
      "label": "Device",
      "value_labels": {
        "orin_nano": "Jetson Orin Nano Super",
        "orin_nano_super": "Jetson Orin Nano Super",
        "agx_orin": "Jetson AGX Orin",
        "agx_thor": "Jetson AGX Thor"
      }
    }
  ],
  "metrics": [
    {
      "column": "tps",
      "label": "Tokens / sec",
      "short": "TPS ↑",
      "higher_is_better": true,
      "description": "Tokens per second (higher is better). Number of output tokens generated per second during the decoding phase."
    },
    {
      "column": "tpot",
      "label": "Time per Output Token (ms)",
      "short": "TPOT ↓",
      "higher_is_better": false,
      "description": "Time per output token in ms (lower is better). Average time (in milliseconds) required to generate one output token during decoding. Computed as TPOT = (last_token_ts - first_token_ts) / total_output_tokens."
    },
    {
      "column": "ttft",
      "label": "Time to First Token (ms)",
      "short": "TTFT ↓",
      "higher_is_better": false,
      "description": "Time to first token in ms (lower is better). Time from request submission to generation of the first output token. This includes vision encoding, prompt prefill, and KV cache initialization."
    },
    {
      "column": "e2e",
      "label": "End-to-End Latency (sec)",
      "short": "E2E ↓",
      "higher_is_better": false,
      "description": "End-to-end latency in seconds (lower is better). Total time from request submission to completion of the full generated response. This reflects real user-perceived latency."
    }
  ],
  "display_columns": [
    {
      "column": "res",
      "label": "Resolution",
      "visible_when": {
        "type": [
          "video",
          "image"
        ]
      }
    },
    {
      "column": "fps",
      "label": "FPS",
      "type": "number",
      "visible_when": {
        "type": [
          "video"
        ]
      }
    },
    {
      "column": "frames",
      "label": "Frames",
      "type": "number",
      "visible_when": {
        "type": [
          "video"
        ]
      }
    }
  ],
  "chart": {
    "default_metric": "tps",
    "group_by": "device",
    "scenarios": [
      {
        "label": "Text",
        "match": {
          "type": "text"
        }
      },
      {
        "label": "Image · 1280×720",
        "match": {
          "type": "image",
          "res": "1280x720"
        }
      },
      {
        "label": "Video · 1280×720 · 4 FPS",
        "match": {
          "type": "video",
          "res": "1280x720",
          "fps": 4
        }
      }
    ]
  },
  "table_sort": [
    {
      "column": "res",
      "direction": "asc"
    },
    {
      "column": "fps",
      "direction": "desc"
    }
  ],
  "table_group_by": "model",
  "model_families": {
    "Cosmos-Reason2-2B": {
      "data_file": "data/cosmos-reason2.csv",
      "table_group_by": ["res", "fps"],
      "experiment_setup": {
        "agx_thor": "Measurement setup: NVIDIA vLLM 26.01, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.",
        "agx_orin": "Measurement setup: NVIDIA vLLM 0.14.0 for Jetson, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.",
        "orin_nano": "Measurement setup: NVIDIA vLLM 0.14.0 for Jetson, 256 tokens generated, 10 warm-up runs, averaged over 25 runs."
      }
    }
  }
}