#!/usr/bin/env bash
# tp_launch.sh — launcher for TP>1 multi-process qwen3-moe-aclnn.
#
# Usage: ./tp_launch.sh <tp_size> <binary> [args...]
#   e.g. ./tp_launch.sh 16 ./build/qwen3-moe-aclnn --model-dir ... --prompt "..." --n-predict 20
#
# Each rank runs as a separate process with:
#   ASCEND_RT_VISIBLE_DEVICES=<rank>
#   TP_RANK=<rank>  TP_SIZE=<tp_size>
#   HCCL_WHITELIST_DISABLE=1
# rank 0 creates /tmp/hccl_root_info.bin; other ranks wait for it.
#
# Ranks 1..N-1 log to /tmp/tp_rank_<rank>.log; rank 0 uses the terminal.
# Exit status: rank 0's exit status (2 if <tp_size> is not a positive integer).
set -euo pipefail

TP_SIZE="${1:?tp_size required}"; shift
BIN="${1:?binary required}"; shift

# Reject anything that is not a positive decimal integer up front; otherwise
# the arithmetic below fails with an obscure error (or silently launches nothing).
if ! [[ "${TP_SIZE}" =~ ^[1-9][0-9]*$ ]]; then
  printf '[tp_launch] tp_size must be a positive integer, got: %s\n' "${TP_SIZE}" >&2
  exit 2
fi

# Clean any stale HCCL coordination file
rm -f /tmp/hccl_root_info.bin

export HCCL_WHITELIST_DISABLE=1

# Benchmark-tuned defaults (bench_hccl_adv.sh 2026-04-21):
#   ring:200 + OP_EXPANSION_MODE=AIV + OP_BASE_FFTS_MODE_ENABLE=1 → ~18.8 t/s median
#   vs baseline (auto) ~12 t/s. +54% from HCCL env knobs alone.
# Each knob can still be overridden from the caller's environment.
export HCCL_ALGO="${HCCL_ALGO:-level0:ring}"
export HCCL_BUFFSIZE="${HCCL_BUFFSIZE:-200}"
export HCCL_OP_EXPANSION_MODE="${HCCL_OP_EXPANSION_MODE:-AIV}"
export HCCL_OP_BASE_FFTS_MODE_ENABLE="${HCCL_OP_BASE_FFTS_MODE_ENABLE:-1}"
# TASK_QUEUE_ENABLE=2: aggressive async task queueing (marginal gain on top of AIV+FFTS)
export TASK_QUEUE_ENABLE="${TASK_QUEUE_ENABLE:-2}"

# PIDs of background ranks that have been started but not yet reaped.
pids=()

# Send TERM to every background rank still listed in pids.
# Installed as the EXIT trap so an early exit of this launcher does not leave
# ranks 1..N-1 running. The list is emptied once all ranks have been reaped,
# so a normal run never signals a finished (possibly recycled) PID.
terminate_ranks() {
  local pid
  # ${arr[@]+...} keeps an empty array from tripping `set -u` on older bash.
  for pid in ${pids[@]+"${pids[@]}"}; do
    kill "${pid}" 2>/dev/null || true
  done
}
trap terminate_ranks EXIT

# Launch ranks 1..N-1 in background with stdin/stdout redirected to /dev/null / logfile.
# Launch rank 0 LAST in foreground, inheriting the terminal stdin/stdout — so --interactive works.
for ((rank = 1; rank < TP_SIZE; rank++)); do
  logfile="/tmp/tp_rank_${rank}.log"
  env ASCEND_RT_VISIBLE_DEVICES="${rank}" \
      TP_RANK="${rank}" \
      TP_SIZE="${TP_SIZE}" \
      "${BIN}" "$@" < /dev/null > "${logfile}" 2>&1 &
  pids+=("$!")
  echo "[tp_launch] rank ${rank} pid=$! log=${logfile}"
done

# Give ranks 1..N-1 a moment to reach HcclCommInitRootInfo's file-wait before rank 0 writes it.
sleep 1

# Rank 0 in foreground — terminal stdin/stdout passthrough for REPL.
# `|| ec=$?` is required: under `set -e` a bare failing command would abort the
# script right here, skipping both the status capture and the waits below.
ec=0
env ASCEND_RT_VISIBLE_DEVICES=0 \
    TP_RANK=0 \
    TP_SIZE="${TP_SIZE}" \
    "${BIN}" "$@" || ec=$?

# Wait for background ranks to finish (rank 0 exit signals end-of-work, but they may take a bit).
# Their individual exit statuses are deliberately ignored; rank 0 decides the result.
for pid in ${pids[@]+"${pids[@]}"}; do
  wait "${pid}" || true
done
pids=()  # everything reaped: nothing left for the EXIT trap to signal

exit "${ec}"