#!/usr/bin/env bash
# Distributed training launcher: wraps torchrun around train.py.
# Usage: train.sh <config> <num_gpus> [resume_checkpoint]

# Fail fast on errors; pipefail makes the final `torchrun | tee` pipeline
# propagate torchrun's exit status instead of tee's.
set -eo pipefail

# Timestamp (MMDDhhmm) used to name this run's log file.
T=$(date +%m%d%H%M)
|
|
| |
| |
# Positional arguments:
#   $1 - training config path; also determines the work dir below
#   $2 - total GPU count across all nodes (defaults to 1 so the
#        arithmetic below never sees an empty operand)
#   $3 - optional checkpoint to resume from; "None" means start fresh
CFG=$1
GPUS=${2:-1}
RESUME_FROM=${3:-None}

# Cap GPUs per node at 8; derive the node count from the total unless
# the scheduler already exported WORLD_SIZE.
GPUS_PER_NODE=$(( GPUS < 8 ? GPUS : 8 ))
NNODES=${WORLD_SIZE:-$(( GPUS / GPUS_PER_NODE ))}
|
|
# Rendezvous defaults for single-node runs; values inherited from the
# scheduler environment take precedence when already set.
: "${MASTER_PORT:=28567}"
: "${MASTER_ADDR:=127.0.0.1}"
export MASTER_ADDR MASTER_PORT

# Rank of this node within the job (0 for single-node training).
RANK=${RANK:-0}
|
|
# Derive the work dir from the config path (configs/... -> work_dirs/...),
# dropping the config file extension; trailing slash kept so paths below
# can be built by plain concatenation.
WORK_DIR=$(echo "${CFG%.*}" | sed -e "s/configs/work_dirs/g")/

# Make sure the log directory exists before tee writes into it.
if [ ! -d "${WORK_DIR}logs" ]; then
    mkdir -p "${WORK_DIR}logs"
fi

# Put the repo root and the bundled navsim package on PYTHONPATH so
# train.py can import both without installation.
export PYTHONPATH="$(realpath "$(dirname "$0")/..")":"$(realpath "$(dirname "$0")/../navsim")":$PYTHONPATH
# Avoid CPU oversubscription from per-process OpenMP thread pools.
export OMP_NUM_THREADS=1
|
|
# Print the resolved launch configuration. printf with quoted expansions
# avoids the glob expansion / word splitting unquoted echo args allow.
printf 'WORK_DIR:  %s\n' "${WORK_DIR}"
printf 'GPUS_PER_NODE:  %s\n' "${GPUS_PER_NODE}"
printf 'NNODES:  %s\n' "${NNODES}"
printf 'RANK:  %s\n' "${RANK}"
printf 'PYTHONPATH:  %s\n' "${PYTHONPATH}"
|
|
# Translate the optional checkpoint into a torchrun argument pair.
# "None" (the default sentinel) and empty both mean "start fresh".
# NOTE: kept as a single string because the launch command expands it
# unquoted to get word splitting.
case "$RESUME_FROM" in
    ""|None) RESUME_ARG="" ;;
    *)       RESUME_ARG="--resume-from $RESUME_FROM" ;;
esac
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# Launch distributed training; log to both the console and a timestamped
# file under the work dir. $RESUME_ARG is intentionally unquoted so an
# empty value vanishes and a set value splits into "--resume-from <ckpt>".
torchrun \
    --nnodes="${NNODES}" \
    --nproc_per_node="${GPUS_PER_NODE}" \
    --node_rank="${RANK}" \
    --master_addr="${MASTER_ADDR}" \
    --master_port="${MASTER_PORT}" \
    "$(dirname "$0")/train.py" \
    "$CFG" \
    --launcher pytorch $RESUME_ARG \
    --work-dir "${WORK_DIR}" \
    2>&1 | tee "${WORK_DIR}logs/train.$T"
|
|