{
"title": "Edge Inference Benchmarks",
"subtitle": "Compare throughput and latency across devices and model variants.",
"model_column": "model",
"model_family_column": "model_family",
"model_link_prefix": "https://huggingface.co/",
"optimized_org": "embedl",
"filters": [
{
"column": "type",
"label": "Modality"
},
{
"column": "batch",
"label": "Batch Size",
"type": "number"
},
{
"column": "device",
"label": "Device",
"value_labels": {
"orin_nano": "Jetson Orin Nano Super",
"orin_nano_super": "Jetson Orin Nano Super",
"agx_orin": "Jetson AGX Orin",
"agx_thor": "Jetson AGX Thor"
}
}
],
"metrics": [
{
"column": "tps",
"label": "Tokens / sec",
"short": "TPS ↑",
"higher_is_better": true,
"description": "Tokens per second (higher is better). Number of output tokens generated per second during the decoding phase. "
},
{
"column": "tpot",
"label": "Time per Output Token (ms)",
"short": "TPOT ↓",
"higher_is_better": false,
"description": "Time per output token in ms (lower is better). Average time (in milliseconds) required to generate one output token during decoding. Computed as TPOT = (last_token_ts - first_token_ts) / total_output_tokens."
},
{
"column": "ttft",
"label": "Time to First Token (ms)",
"short": "TTFT ↓",
"higher_is_better": false,
"description": "Time to first token in ms (lower is better). Time from request submission to generation of the first output token. This includes vision encoding, prompt prefill, KV cache initialization."
},
{
"column": "e2e",
"label": "End-to-End Latency (sec)",
"short": "E2E ↓",
"higher_is_better": false,
"description": "End-to-end latency in seconds (lower is better). Total time from request submission to completion of the full generated response. This reflects real user-perceived latency."
}
],
"display_columns": [
{
"column": "res",
"label": "Resolution",
"visible_when": {
"type": [
"video",
"image"
]
}
},
{
"column": "fps",
"label": "FPS",
"type": "number",
"visible_when": {
"type": [
"video"
]
}
},
{
"column": "frames",
"label": "Frames",
"type": "number",
"visible_when": {
"type": [
"video"
]
}
}
],
"chart": {
"default_metric": "tps",
"group_by": "device",
"scenarios": [
{
"label": "Text",
"match": {
"type": "text"
}
},
{
"label": "Image · 1280×720",
"match": {
"type": "image",
"res": "1280x720"
}
},
{
"label": "Video · 1280×720 · 4 FPS",
"match": {
"type": "video",
"res": "1280x720",
"fps": 4
}
}
]
},
"table_sort": [
{
"column": "res",
"direction": "asc"
},
{
"column": "fps",
"direction": "desc"
}
],
"table_group_by": "model",
"model_families": {
"Cosmos-Reason2-2B": {
"data_file": "data/cosmos-reason2.csv",
"table_group_by": ["res", "fps"],
"experiment_setup": {
"agx_thor": "Measurement setup: NVIDIA vLLM 26.01, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.",
"agx_orin": "Measurement setup: NVIDIA vLLM 0.14.0 for Jetson, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.",
"orin_nano": "Measurement setup: NVIDIA vLLM 0.14.0 for Jetson, 256 tokens generated, 10 warm-up runs, averaged over 25 runs."
}
}
}
}