{
  "title": "Edge Inference Benchmarks",
  "subtitle": "Compare throughput and latency across devices and model variants.",
  "model_column": "model",
  "model_family_column": "model_family",
  "model_link_prefix": "https://huggingface.co/",
  "optimized_org": "embedl",
  "filters": [
    {
      "column": "type",
      "label": "Modality"
    },
    {
      "column": "batch",
      "label": "Batch Size",
      "type": "number"
    },
    {
      "column": "device",
      "label": "Device",
      "value_labels": {
        "orin_nano": "Jetson Orin Nano",
        "orin_nano_super": "Jetson Orin Nano Super",
        "agx_orin": "Jetson AGX Orin",
        "agx_thor": "Jetson AGX Thor"
      }
    }
  ],
  "metrics": [
    {
      "column": "tps",
      "label": "Tokens / sec",
      "short": "TPS ↑",
      "higher_is_better": true,
      "description": "Tokens per second (higher is better). Number of output tokens generated per second during the decoding phase."
    },
    {
      "column": "tpot",
      "label": "Time per Output Token (ms)",
      "short": "TPOT ↓",
      "higher_is_better": false,
      "description": "Time per output token in ms (lower is better). Average time (in milliseconds) required to generate one output token during decoding. Computed as TPOT = (last_token_ts - first_token_ts) / total_output_tokens."
    },
    {
      "column": "ttft",
      "label": "Time to First Token (ms)",
      "short": "TTFT ↓",
      "higher_is_better": false,
      "description": "Time to first token in ms (lower is better). Time from request submission to generation of the first output token. This includes vision encoding, prompt prefill, KV cache initialization."
    },
    {
      "column": "e2e",
      "label": "End-to-End Latency (sec)",
      "short": "E2E ↓",
      "higher_is_better": false,
      "description": "End-to-end latency in seconds (lower is better). Total time from request submission to completion of the full generated response. This reflects real user-perceived latency."
    }
  ],
  "display_columns": [
    {
      "column": "res",
      "label": "Resolution",
      "visible_when": {
        "type": [
          "video",
          "image"
        ]
      }
    },
    {
      "column": "fps",
      "label": "FPS",
      "type": "number",
      "visible_when": {
        "type": [
          "video"
        ]
      }
    },
    {
      "column": "frames",
      "label": "Frames",
      "type": "number",
      "visible_when": {
        "type": [
          "video"
        ]
      }
    }
  ],
  "chart": {
    "default_metric": "tps",
    "group_by": "device",
    "scenarios": [
      {
        "label": "Text",
        "match": {
          "type": "text"
        }
      },
      {
        "label": "Image · 1280×720",
        "match": {
          "type": "image",
          "res": "1280x720"
        }
      },
      {
        "label": "Video · 1280×720 · 4 FPS",
        "match": {
          "type": "video",
          "res": "1280x720",
          "fps": 4
        }
      }
    ]
  },
  "table_sort": [
    {
      "column": "res",
      "direction": "asc"
    },
    {
      "column": "fps",
      "direction": "desc"
    }
  ],
  "table_group_by": "model",
  "model_families": {
    "Cosmos-Reason2-2B": {
      "data_file": "data/cosmos-reason2.csv",
      "table_group_by": ["res", "fps"],
      "experiment_setup": {
        "agx_thor": "Measurement setup: NVIDIA vLLM 26.01, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.",
        "agx_orin": "Measurement setup: NVIDIA vLLM 0.14.0 for Jetson, 256 tokens generated, 10 warm-up runs, averaged over 25 runs.",
        "orin_nano": "Measurement setup: NVIDIA vLLM 0.14.0 for Jetson, 256 tokens generated, 10 warm-up runs, averaged over 25 runs."
      }
    }
  }
}