#!/usr/bin/env bash
#
# Launch an SGLang server with DFLASH speculative decoding.
#
# Usage:
#   bash start_server_dflash.sh        # tensor-parallel size defaults to 2
#   bash start_server_dflash.sh 4      # use tp=4
#
# Arguments:
#   $1 - value passed to --tp-size (positive integer, default 2)
#
# The interpreter path, model paths, bind address and port below are
# hard-coded for one specific machine; edit them in place when moving hosts.

set -euo pipefail

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

readonly TP=${1:-2}
readonly PYTHON_BIN=/workspace/miniconda3/envs/dflash/bin/python
readonly BASE_MODEL=/workspace/models/Qwen3-8B
readonly DRAFT_MODEL=/workspace/models/Qwen3-8B-DFlash-b16
readonly INTRANET_IP=10.1.1.22
readonly PORT=30000

# Reject a malformed tp value here instead of after Python has started.
# 10# forces base 10 so a value such as "08" is not parsed as invalid octal.
if ! [[ "$TP" =~ ^[0-9]+$ ]] || (( 10#$TP < 1 )); then
  die "tp size must be a positive integer, got: '$TP'"
fi

# Fail fast with a readable message when the local installation is incomplete.
[[ -x "$PYTHON_BIN" ]] || die "python interpreter not executable: $PYTHON_BIN"
[[ -e "$BASE_MODEL" ]] || die "target model path does not exist: $BASE_MODEL"
[[ -e "$DRAFT_MODEL" ]] || die "draft model path does not exist: $DRAFT_MODEL"

printf '%s\n' \
  "============================================" \
  " SGLang DFLASH Speculative Decoding" \
  " target : $BASE_MODEL" \
  " draft : $DRAFT_MODEL" \
  " host : $INTRANET_IP:$PORT" \
  " tp : $TP" \
  "============================================"

# NOTE(review): flag consumed by SGLang; presumably allows a context length
# longer than the one declared in the model config - confirm in SGLang docs.
export SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1

# Build the argument list as an array so every value stays a single word.
server_args=(
  --model-path "$BASE_MODEL"
  --speculative-algorithm DFLASH
  --speculative-draft-model-path "$DRAFT_MODEL"
  --tp-size "$TP"
  --dtype bfloat16
  --attention-backend fa3
  --mem-fraction-static 0.30
  --trust-remote-code
  --host "$INTRANET_IP"
  --port "$PORT"
)

# exec replaces this shell with the server process, so signals sent to the
# script's PID (Ctrl-C, a supervisor's SIGTERM) reach the server directly and
# the server's exit status becomes the script's exit status.
exec "$PYTHON_BIN" -m sglang.launch_server "${server_args[@]}"