llm_mutil_npu / scripts /tp_launch.sh
xianglarry's picture
Initial C++ aclnn EAGER inference for Qwen3-235B-A22B MoE on Ascend 910 × 16 NPU
4b9fefd
#!/usr/bin/env bash
# tp_launch.sh — launcher for TP>1 multi-process qwen3-moe-aclnn.
#
# Usage: ./tp_launch.sh <tp_size> <bin> [args...]
# e.g. ./tp_launch.sh 16 ./build/qwen3-moe-aclnn --model-dir ... --prompt "..." --n-predict 20
#
# Each rank runs as a separate process with:
# ASCEND_RT_VISIBLE_DEVICES=<rank>
# TP_RANK=<rank> TP_SIZE=<tp_size>
# HCCL_WHITELIST_DISABLE=1
# rank 0 creates /tmp/hccl_root_info.bin; other ranks wait for it.
set -euo pipefail

TP_SIZE="${1:?tp_size required}"; shift
BIN="${1:?binary required}"; shift

# tp_size feeds shell arithmetic below, so reject anything that is not a plain
# positive decimal integer up front (a leading zero would be read as octal, and
# a non-numeric value would otherwise fail with a cryptic arithmetic error).
if [[ ! "${TP_SIZE}" =~ ^[1-9][0-9]*$ ]]; then
  echo "[tp_launch] tp_size must be a positive integer, got '${TP_SIZE}'" >&2
  exit 2
fi

# Clean any stale HCCL coordination file
rm -f /tmp/hccl_root_info.bin

export HCCL_WHITELIST_DISABLE=1

# Benchmark-tuned defaults (bench_hccl_adv.sh 2026-04-21):
# ring:200 + OP_EXPANSION_MODE=AIV + OP_BASE_FFTS_MODE_ENABLE=1 → ~18.8 t/s median
# vs baseline (auto) ~12 t/s. +54% from HCCL env knobs alone.
# Each knob keeps a value already present in the caller's environment.
export HCCL_ALGO="${HCCL_ALGO:-level0:ring}"
export HCCL_BUFFSIZE="${HCCL_BUFFSIZE:-200}"
export HCCL_OP_EXPANSION_MODE="${HCCL_OP_EXPANSION_MODE:-AIV}"
export HCCL_OP_BASE_FFTS_MODE_ENABLE="${HCCL_OP_BASE_FFTS_MODE_ENABLE:-1}"
# TASK_QUEUE_ENABLE=2: aggressive async task queueing (marginal gain on top of AIV+FFTS)
export TASK_QUEUE_ENABLE="${TASK_QUEUE_ENABLE:-2}"

# PIDs of background ranks; index i holds rank i+1.
pids=()

# Send SIGTERM to every background rank that has not been reaped yet.
# Installed as an EXIT trap so that an interrupted or aborted launcher does not
# leave ranks 1..N-1 running. After the normal wait loop the list is emptied,
# which makes this a no-op on the regular exit path.
terminate_ranks() {
  local pid
  # The ${arr[@]+...} form keeps an empty array safe under `set -u` on older bash.
  for pid in ${pids[@]+"${pids[@]}"}; do
    kill "${pid}" 2>/dev/null || true
  done
}
trap terminate_ranks EXIT

# Launch ranks 1..N-1 in background with stdin/stdout redirected to /dev/null / logfile.
# Launch rank 0 LAST in foreground, inheriting the terminal stdin/stdout — so --interactive works.
for ((rank = 1; rank < TP_SIZE; rank++)); do
  logfile="/tmp/tp_rank_${rank}.log"
  env ASCEND_RT_VISIBLE_DEVICES="${rank}" \
    TP_RANK="${rank}" \
    TP_SIZE="${TP_SIZE}" \
    "${BIN}" "$@" < /dev/null > "${logfile}" 2>&1 &
  pids+=("$!")
  echo "[tp_launch] rank ${rank} pid=$! log=${logfile}"
done

# Give ranks 1..N-1 a moment to reach HcclCommInitRootInfo's file-wait before rank 0 writes it.
sleep 1

# Rank 0 in foreground — terminal stdin/stdout passthrough for REPL.
# The status is captured with `|| ec=$?`: under `set -e` a bare failing command
# would abort the script right here, before the background ranks are reaped.
ec=0
env ASCEND_RT_VISIBLE_DEVICES=0 \
  TP_RANK=0 \
  TP_SIZE="${TP_SIZE}" \
  "${BIN}" "$@" || ec=$?

# Wait for background ranks to finish (rank 0 exit signals end-of-work, but they may take a bit).
# Their failures are reported but deliberately do not change the exit code,
# which stays that of rank 0.
for i in "${!pids[@]}"; do
  rc=0
  wait "${pids[$i]}" || rc=$?
  if ((rc != 0)); then
    echo "[tp_launch] rank $((i + 1)) exited with status ${rc} (see /tmp/tp_rank_$((i + 1)).log)" >&2
  fi
done

# Everything has been reaped; make sure the EXIT trap has nothing left to signal.
pids=()

exit "${ec}"