{
"title": "Edge Inference Benchmarks",
"subtitle": "Compare throughput and latency across devices and model variants.",
"model_column": "model",
"model_family_column": "model_family",
"model_link_prefix": "https://huggingface.co/",
"optimized_org": "embedl",
"filters": [
{
"column": "type",
"label": "Modality"
},
{
"column": "batch",
"label": "Batch Size",
"type": "number"
},
{
"column": "device",
"label": "Device",
"value_labels": {
"orin_nano": "Jetson Orin Nano Super",
"orin_nano_super": "Jetson Orin Nano Super",
"agx_orin": "Jetson AGX Orin",
"agx_thor": "Jetson AGX Thor"
}
}
],
"metrics": [
{
"column": "tps",
"label": "Tokens / sec",
"short": "TPS ↑",
"higher_is_better": true,
            "description": "Tokens per second (higher is better). Number of output tokens generated per second during the decoding phase."
},
{
"column": "tpot",
"label": "Time per Output Token (ms)",
"short": "TPOT ↓",
"higher_is_better": false,
"description": "Time per output token in ms (lower is better). Average time (in milliseconds) required to generate one output token during decoding. Computed as TPOT = (last_token_ts - first_token_ts) / total_output_tokens."
},
{
"column": "ttft",
"label": "Time to First Token (ms)",
"short": "TTFT ↓",
"higher_is_better": false,
"description": "Time to first token in ms (lower is better). Time from request submission to generation of the first output token. This includes vision encoding, prompt prefill, KV cache initialization."
},
{
"column": "e2e",
"label": "End-to-End Latency (sec)",
"short": "E2E ↓",
"higher_is_better": false,
"description": "End-to-end latency in seconds (lower is better). Total time from request submission to completion of the full generated response. This reflects real user-perceived latency."
}
],
"display_columns": [
{
"column": "res",
"label": "Resolution",
"visible_when": {
"type": [
"video",
"image"
]
}
},
{
"column": "fps",
"label": "FPS",
"type": "number",
"visible_when": {
"type": [
"video"
]
}
},
{
"column": "frames",
"label": "Frames",
"type": "number",
"visible_when": {
"type": [
"video"
]
}
}
],
"chart": {
"default_metric": "tps",
"group_by": "device",
"scenarios": [
{
"label": "Text",
"match": {
"type": "text"
}
},
{
                "label": "Image · 1280×720",
"match": {
"type": "image",
"res": "1280x720"
}
},
{
                "label": "Video · 1280×720 · 4 FPS",
"match": {
"type": "video",
"res": "1280x720",
"fps": 4
}
}
]
},
"table_sort": [
{
"column": "res",
"direction": "asc"
},
{
"column": "fps",
"direction": "desc"
}
],
"table_group_by": "model",
"model_families": {
"Cosmos-Reason2-2B": {
"data_file": "data/cosmos-reason2.csv",
"table_group_by": ["res", "fps"],
"experiment_setup": {
"agx_thor": "Measurement setup: NVIDIA vLLM 26.01, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.",
"agx_orin": "Measurement setup: NVIDIA vLLM 0.14.0 for Jetson, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.",
"orin_nano": "Measurement setup: NVIDIA vLLM 0.14.0 for Jetson, 256 tokens generated, 10 warm-up runs, averaged over 25 runs."
}
}
}
}