#!/usr/bin/env bash
# tp_launch.sh — launcher for TP>1 multi-process qwen3-moe-aclnn.
#
# Usage: ./tp_launch.sh <tp_size> <bin> [args...]
#   e.g. ./tp_launch.sh 16 ./build/qwen3-moe-aclnn --model-dir ... --prompt "..." --n-predict 20
#
# Each rank runs as a separate process with:
#   ASCEND_RT_VISIBLE_DEVICES=<rank>
#   TP_RANK=<rank> TP_SIZE=<tp_size>
#   HCCL_WHITELIST_DISABLE=1
# rank 0 creates /tmp/hccl_root_info.bin; other ranks wait for it.
#
# Ranks 1..N-1 run in the background and log to /tmp/tp_rank_<rank>.log;
# rank 0 runs in the foreground and owns the terminal.
#
# Exit status: rank 0's exit status
#   (2 if <tp_size> is not a positive integer, 127 if <bin> cannot be found).
set -euo pipefail

TP_SIZE="${1:?tp_size required}"; shift
BIN="${1:?binary required}"; shift

# Reject anything that is not a positive decimal integer up front; otherwise
# the arithmetic below fails with an obscure error (or treats "08" as octal).
if [[ ! "${TP_SIZE}" =~ ^[1-9][0-9]*$ ]]; then
  printf '[tp_launch] error: tp_size must be a positive integer, got "%s"\n' "${TP_SIZE}" >&2
  exit 2
fi

# Fail fast instead of spawning N-1 background processes that all die into
# their log files because the binary does not exist.
if ! command -v "${BIN}" >/dev/null 2>&1; then
  printf '[tp_launch] error: binary not found or not executable: %s\n' "${BIN}" >&2
  exit 127
fi

# Clean any stale HCCL coordination file
rm -f -- /tmp/hccl_root_info.bin

export HCCL_WHITELIST_DISABLE=1

# Benchmark-tuned defaults (bench_hccl_adv.sh 2026-04-21):
# ring:200 + OP_EXPANSION_MODE=AIV + OP_BASE_FFTS_MODE_ENABLE=1 → ~18.8 t/s median
# vs baseline (auto) ~12 t/s. +54% from HCCL env knobs alone.
# Each knob can still be overridden from the caller's environment.
export HCCL_ALGO="${HCCL_ALGO:-level0:ring}"
export HCCL_BUFFSIZE="${HCCL_BUFFSIZE:-200}"
export HCCL_OP_EXPANSION_MODE="${HCCL_OP_EXPANSION_MODE:-AIV}"
export HCCL_OP_BASE_FFTS_MODE_ENABLE="${HCCL_OP_BASE_FFTS_MODE_ENABLE:-1}"
# TASK_QUEUE_ENABLE=2: aggressive async task queueing (marginal gain on top of AIV+FFTS)
export TASK_QUEUE_ENABLE="${TASK_QUEUE_ENABLE:-2}"

# PIDs of the background ranks; pids[i] belongs to rank i+1.
pids=()

# Terminate background ranks that are still tracked in pids. Installed as the
# EXIT trap so an early exit of the launcher (error while launching, or the
# launcher being interrupted) does not leave ranks 1..N-1 running.
# After a normal wait the array is emptied, which makes this a no-op.
cleanup() {
  local pid
  # ${arr[@]+...} keeps `set -u` happy on older bash when the array is empty.
  for pid in ${pids[@]+"${pids[@]}"}; do
    kill "${pid}" 2>/dev/null || true
  done
}
trap cleanup EXIT

# Launch ranks 1..N-1 in background with stdin/stdout redirected to /dev/null / logfile.
# Launch rank 0 LAST in foreground, inheriting the terminal stdin/stdout — so --interactive works.
for ((rank = 1; rank < TP_SIZE; rank++)); do
  logfile="/tmp/tp_rank_${rank}.log"
  env ASCEND_RT_VISIBLE_DEVICES="${rank}" \
    TP_RANK="${rank}" \
    TP_SIZE="${TP_SIZE}" \
    "${BIN}" "$@" </dev/null >"${logfile}" 2>&1 &
  pids+=("$!")
  echo "[tp_launch] rank ${rank} pid=$! log=${logfile}"
done

# Give ranks 1..N-1 a moment to reach HcclCommInitRootInfo's file-wait before rank 0 writes it.
# Nothing to wait for when there are no background ranks (tp_size == 1).
if ((${#pids[@]} > 0)); then
  sleep 1
fi

# Rank 0 in foreground — terminal stdin/stdout passthrough for REPL.
# `|| ec=$?` captures a non-zero status; without it `set -e` would abort the
# script right here and the background ranks would never be waited for.
ec=0
env ASCEND_RT_VISIBLE_DEVICES=0 \
  TP_RANK=0 \
  TP_SIZE="${TP_SIZE}" \
  "${BIN}" "$@" || ec=$?

# Wait for background ranks to finish (rank 0 exit signals end-of-work, but they may take a bit).
# Their failures are reported but deliberately do not change the exit status.
for ((i = 0; i < ${#pids[@]}; i++)); do
  rc=0
  wait "${pids[i]}" || rc=$?
  if ((rc != 0)); then
    printf '[tp_launch] warning: rank %d exited with status %d (see /tmp/tp_rank_%d.log)\n' \
      "$((i + 1))" "${rc}" "$((i + 1))" >&2
  fi
done

# All background ranks have been reaped; make the EXIT trap a no-op.
pids=()

exit "${ec}"