{ "title": "Edge Inference Benchmarks", "subtitle": "Compare throughput and latency across devices and model variants.", "model_column": "model", "model_family_column": "model_family", "model_link_prefix": "https://huggingface.co/", "optimized_org": "embedl", "filters": [ { "column": "type", "label": "Modality" }, { "column": "batch", "label": "Batch Size", "type": "number" }, { "column": "device", "label": "Device", "value_labels": { "orin_nano": "Jetson Orin Nano Super", "orin_nano_super": "Jetson Orin Nano Super", "agx_orin": "Jetson AGX Orin", "agx_thor": "Jetson AGX Thor" } } ], "metrics": [ { "column": "tps", "label": "Tokens / sec", "short": "TPS ↑", "higher_is_better": true, "description": "Tokens per second (higher is better). Number of output tokens generated per second during the decoding phase. " }, { "column": "tpot", "label": "Time per Output Token (ms)", "short": "TPOT(ms) ↓", "higher_is_better": false, "description": "Time per output token in ms (lower is better). Average time (in milliseconds) required to generate one output token during decoding. Computed as TPOT = (last_token_ts - first_token_ts) / total_output_tokens." }, { "column": "ttft", "label": "Time to First Token (ms)", "short": "TTFT(ms) ↓", "higher_is_better": false, "description": "Time to first token in ms (lower is better). Time from request submission to generation of the first output token. This includes vision encoding, prompt prefill, KV cache initialization." }, { "column": "e2e", "label": "End-to-End Latency (sec)", "short": "E2E(s) ↓", "higher_is_better": false, "description": "End-to-end latency in seconds (lower is better). Total time from request submission to completion of the full generated response. This reflects real user-perceived latency." } ], "display_columns": [ { "column": "res", "label": "RESOLUTION", "visible_when": { "type": [ "video", "image" ] } }, { "column": "fps", "label": "FPS", "type": "number", "visible_when": { "type": [ "video" ] } }, { "column": "frames", "label": "Frames", "type": "number", "visible_when": { "type": [ "video" ] } } ], "chart": { "default_metric": "tps", "group_by": "device", "scenarios": [ { "label": "Text", "match": { "type": "text" } }, { "label": "Image · 1280×720", "match": { "type": "image", "res": "1280x720" } }, { "label": "Video · 1280×720 · 4 FPS", "match": { "type": "video", "res": "1280x720", "fps": 4 } } ] }, "table_sort": [ { "column": "res", "direction": "asc" }, { "column": "fps", "direction": "desc" } ], "table_group_by": "model", "model_families": { "Cosmos-Reason2-2B": { "data_file": "data/Cosmos-Reason2.csv", "table_group_by": ["res", "fps"], "experiment_setup": { "agx_thor": "Measurement setup: NVIDIA vLLM 26.01, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.", "agx_orin": "Measurement setup: NVIDIA AI IoT vLLM 0.14.0 tegra, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.", "orin_nano": "Measurement setup: NVIDIA AI IoT vLLM 0.14.0 tegra, 256 tokens generated, 10 warm-up runs, averaged over 25 runs." } } } }