# Step 2: Launch SGLang server with STANDALONE speculative decoding.
#
# Serves BASE_MODEL as the target model and passes the merged checkpoint
# (MERGED) as the speculative draft model.
#
# Usage:
#   bash start_server.sh      # tensor-parallel size defaults to 2
#   bash start_server.sh 8    # use tp=8
#
# Arguments:
#   $1 - tensor-parallel size (positive integer, optional, default 2)
#
# Exit status:
#   1 if the TP argument is not a positive integer or the merged model
#   directory is missing; otherwise the status of the server process.
set -euo pipefail

TP=${1:-2}
BASE_MODEL=/workspace/models/Qwen3-8B
MERGED=/workspace/hanrui/syxin_old/Specforge/outputs/qwen3-8b-sft-32gpu-v2-merged
INTRANET_IP=10.1.1.72
PORT=30000
# Interpreter from the "sglang" conda env, called by absolute path so the
# script does not depend on the env being activated in the calling shell.
PYTHON_BIN=/workspace/miniconda3/envs/sglang/bin/python3

# Reject a malformed TP early instead of failing later inside the server.
if [[ ! "$TP" =~ ^[1-9][0-9]*$ ]]; then
  echo "[ERROR] Invalid tp size: '$TP' (expected a positive integer)" >&2
  exit 1
fi

# The draft model directory must already exist before the server can start.
if [[ ! -d "$MERGED" ]]; then
  echo "[ERROR] Merged model not found: $MERGED" >&2
  echo " Run: conda activate sglang && python3 merge_lora.py" >&2
  exit 1
fi

echo "============================================"
echo " SGLang STANDALONE Speculative Decoding"
echo " target : $BASE_MODEL"
echo " draft : $MERGED"
echo " host : $INTRANET_IP:$PORT"
echo " tp : $TP"
echo "============================================"

# Build the command line as an array so every value is passed as exactly one
# argument, regardless of spaces or glob characters in paths.
server_args=(
  --model-path "$BASE_MODEL"
  --speculative-algorithm STANDALONE
  --speculative-draft-model-path "$MERGED"
  --speculative-num-steps 4
  --speculative-eagle-topk 1
  --speculative-num-draft-tokens 4
  --tp-size "$TP"
  --mem-fraction-static 0.30
  --trust-remote-code
  --host "$INTRANET_IP"
  --port "$PORT"
  --dtype bfloat16
)

"$PYTHON_BIN" -m sglang.launch_server "${server_args[@]}"