File size: 2,195 Bytes
4b9fefd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env bash
# tp_launch.sh — launcher for TP>1 multi-process qwen3-moe-aclnn.
#
# Usage: ./tp_launch.sh <tp_size> <bin> [args...]
#   e.g. ./tp_launch.sh 16 ./build/qwen3-moe-aclnn --model-dir ... --prompt "..." --n-predict 20
#
# Each rank runs as a separate process with:
#   ASCEND_RT_VISIBLE_DEVICES=<rank>
#   TP_RANK=<rank>  TP_SIZE=<tp_size>
#   HCCL_WHITELIST_DISABLE=1
# rank 0 creates /tmp/hccl_root_info.bin; other ranks wait for it.
#
# Output: ranks 1..N-1 write stdout+stderr to /tmp/tp_rank_<rank>.log;
#         rank 0 inherits this script's stdin/stdout/stderr.
# Exit status: rank 0's exit status (non-zero on a usage error). Failures of
#         ranks 1..N-1 are reported on stderr but do not change the status.
set -euo pipefail

TP_SIZE="${1:?tp_size required}"; shift
BIN="${1:?binary required}"; shift

# Reject anything that is not a positive decimal integer up front, instead of
# failing later inside arithmetic expansion with a less helpful message.
if ! [[ "${TP_SIZE}" =~ ^[1-9][0-9]*$ ]]; then
    printf '[tp_launch] tp_size must be a positive integer, got: %s\n' "${TP_SIZE}" >&2
    exit 2
fi

# Clean any stale HCCL coordination file
rm -f -- /tmp/hccl_root_info.bin

export HCCL_WHITELIST_DISABLE=1
# Benchmark-tuned defaults (bench_hccl_adv.sh 2026-04-21):
#   ring:200 + OP_EXPANSION_MODE=AIV + OP_BASE_FFTS_MODE_ENABLE=1  →  ~18.8 t/s median
#   vs baseline (auto) ~12 t/s.  +54% from HCCL env knobs alone.
# Each knob keeps a value already present in the caller's environment.
export HCCL_ALGO="${HCCL_ALGO:-level0:ring}"
export HCCL_BUFFSIZE="${HCCL_BUFFSIZE:-200}"
export HCCL_OP_EXPANSION_MODE="${HCCL_OP_EXPANSION_MODE:-AIV}"
export HCCL_OP_BASE_FFTS_MODE_ENABLE="${HCCL_OP_BASE_FFTS_MODE_ENABLE:-1}"
# TASK_QUEUE_ENABLE=2: aggressive async task queueing (marginal gain on top of AIV+FFTS)
export TASK_QUEUE_ENABLE="${TASK_QUEUE_ENABLE:-2}"

# Launch ranks 1..N-1 in background with stdin/stdout redirected to /dev/null / logfile.
# Launch rank 0 LAST in foreground, inheriting the terminal stdin/stdout — so --interactive works.
# pids is indexed by rank so the reaping loop below can name the rank that failed.
# An arithmetic loop is used rather than `seq`, so tp_size=1 launches no
# background ranks regardless of which `seq` implementation is installed.
pids=()
for ((rank = 1; rank < TP_SIZE; rank++)); do
    logfile="/tmp/tp_rank_${rank}.log"
    env ASCEND_RT_VISIBLE_DEVICES="${rank}" \
        TP_RANK="${rank}" \
        TP_SIZE="${TP_SIZE}" \
        "${BIN}" "$@" < /dev/null > "${logfile}" 2>&1 &
    pids[rank]=$!
    echo "[tp_launch] rank ${rank} pid=$! log=${logfile}"
done

# Give ranks 1..N-1 a moment to reach HcclCommInitRootInfo's file-wait before rank 0 writes it.
sleep 1

# Rank 0 in foreground — terminal stdin/stdout passthrough for REPL.
# `|| ec=$?` is required: with `set -e`, a bare failing command would abort the
# script right here, skipping the wait loop and leaving the other ranks un-reaped.
ec=0
env ASCEND_RT_VISIBLE_DEVICES=0 \
    TP_RANK=0 \
    TP_SIZE="${TP_SIZE}" \
    "${BIN}" "$@" || ec=$?

# Wait for background ranks to finish (rank 0 exit signals end-of-work, but they may take a bit).
# Their failures are best-effort: reported, but the script still exits with rank 0's status.
# NOTE(review): if rank 0 died mid-run the peers may block until their own
# timeouts fire; consider terminating them here — confirm the desired policy.
for ((rank = 1; rank < TP_SIZE; rank++)); do
    rank_ec=0
    wait "${pids[rank]}" || rank_ec=$?
    if ((rank_ec != 0)); then
        printf '[tp_launch] rank %d exited with status %d (see /tmp/tp_rank_%d.log)\n' \
            "${rank}" "${rank_ec}" "${rank}" >&2
    fi
done
exit "${ec}"