| #### Prerequisite | |
| ```bash | |
| # Shared environment for every run below: tool lookup, NCCL logging, Hugging Face caches, and W&B settings. | |
| ENV_PATH=/export/share/ruimeng/env/anaconda/envs/llm/bin/ninja | |
| # PATH entries must be directories, so add the directory that contains ninja rather than the binary itself. | |
| export PATH="${ENV_PATH%/*}:$PATH" | |
| export NCCL_DEBUG=WARN | |
| export HF_DATASETS_CACHE=/export/xgen-embedding/data/.hfdata_cache | |
| export TRANSFORMERS_CACHE=/export/xgen-embedding/data/.hfmodel_cache/ | |
| export TOKENIZERS_PARALLELISM=true | |
| export WANDB_DISABLED=false | |
| export WANDB_PROJECT=mini-gradcache | |
| # Never commit credentials: take the key from the caller's environment and fail loudly when it is missing. | |
| export WANDB_API_KEY="${WANDB_API_KEY:?set WANDB_API_KEY to your W&B API key first}" | |
| export WANDB_BASE_URL=https://salesforceairesearch.wandb.io | |
| ``` | |
| ```bash | |
| # gpu0-3, DDP4-bs4096-accum16, 29922MB, hang at epoch34 | |
| # (accum16 presumably = batch_size 4096 / chunk_sizes 256; it matches EXP_NAME) | |
| export EXP_NAME=GC-4gpu-bs4096-accum16-step10k | |
| export EXP_DIR=/export/xgen-embedding/runs/ruimeng/minimal_gc/$EXP_NAME | |
| export WANDB_DIR=$EXP_DIR/wandb | |
| export WANDB_NAME=$EXP_NAME | |
| export WORLD_SIZE=4 | |
| # Clear leftovers from a previous run FIRST, then create the wandb directory; the reverse order deletes the directory that was just created. | |
| # ${EXP_DIR:?} aborts the command instead of expanding to "/*" when EXP_DIR is unset or empty. | |
| rm -rf -- "${EXP_DIR:?}"/* | |
| mkdir -p "$WANDB_DIR" | |
| cd /export/home/project/search/xgen-embedding/ | |
| # 4 worker processes on GPUs 0-3; stdout and stderr are shown on the console and copied to train.log. | |
| CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 --master_port=4403 --max_restarts=0 mini_gc.py --model_name_or_path bert-base-uncased --output_dir "$EXP_DIR" --q_len 128 --d_len 256 --batch_size 4096 --chunk_sizes 256 2>&1 | tee "$EXP_DIR/train.log" | |
| # gpu0-3, DDP4-bs256-accum4, 11818MB | |
| export EXP_NAME=GC-4gpu-bs256-accum4-step10k | |
| export EXP_DIR=/export/xgen-embedding/runs/ruimeng/minimal_gc/$EXP_NAME | |
| export WANDB_DIR=$EXP_DIR/wandb | |
| export WANDB_NAME=$EXP_NAME | |
| export WORLD_SIZE=4 | |
| # Clear leftovers from a previous run FIRST, then create the wandb directory; the reverse order deletes the directory that was just created. | |
| # ${EXP_DIR:?} aborts the command instead of expanding to "/*" when EXP_DIR is unset or empty. | |
| rm -rf -- "${EXP_DIR:?}"/* | |
| mkdir -p "$WANDB_DIR" | |
| cd /export/home/project/search/xgen-embedding/ | |
| # 4 worker processes on GPUs 0-3; stdout and stderr are shown on the console and copied to train.log. | |
| CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node=4 --master_port=4403 --max_restarts=0 mini_gc.py --model_name_or_path bert-base-uncased --output_dir "$EXP_DIR" --q_len 128 --d_len 256 --batch_size 64 --chunk_sizes 16 2>&1 | tee "$EXP_DIR/train.log" | |
| # gpu4-5, DDP2-bs256-accum2, 15742MB | |
| export EXP_NAME=GC-2gpu-bs256-accum2-step10k | |
| export EXP_DIR=/export/xgen-embedding/runs/ruimeng/minimal_gc/$EXP_NAME | |
| export WANDB_DIR=$EXP_DIR/wandb | |
| export WANDB_NAME=$EXP_NAME | |
| # Two worker processes are launched below (--nproc_per_node=2), so the world size is 2, not 1. | |
| export WORLD_SIZE=2 | |
| # Clear leftovers from a previous run FIRST, then create the wandb directory; the reverse order deletes the directory that was just created. | |
| # ${EXP_DIR:?} aborts the command instead of expanding to "/*" when EXP_DIR is unset or empty. | |
| rm -rf -- "${EXP_DIR:?}"/* | |
| mkdir -p "$WANDB_DIR" | |
| cd /export/home/project/search/xgen-embedding/ | |
| # 2 worker processes on GPUs 4 and 5; stdout and stderr are shown on the console and copied to train.log. | |
| CUDA_VISIBLE_DEVICES=4,5 torchrun --nproc_per_node=2 --master_port=2245 --max_restarts=0 mini_gc.py --model_name_or_path bert-base-uncased --output_dir "$EXP_DIR" --q_len 128 --d_len 256 --batch_size 128 --chunk_sizes 64 2>&1 | tee "$EXP_DIR/train.log" | |
| # gpu6, bs256-accum4, 9GB | |
| export EXP_NAME=GC-1gpu-bs256-accum4-step10k | |
| export EXP_DIR=/export/xgen-embedding/runs/ruimeng/minimal_gc/$EXP_NAME | |
| export WANDB_DIR=$EXP_DIR/wandb | |
| export WANDB_NAME=$EXP_NAME | |
| export WORLD_SIZE=1 | |
| # Clear leftovers from a previous run FIRST, then create the wandb directory; the reverse order deletes the directory that was just created. | |
| # ${EXP_DIR:?} aborts the command instead of expanding to "/*" when EXP_DIR is unset or empty. | |
| rm -rf -- "${EXP_DIR:?}"/* | |
| mkdir -p "$WANDB_DIR" | |
| cd /export/home/project/search/xgen-embedding/ | |
| # Single process on GPU 6; stdout and stderr are shown on the console and copied to train.log. | |
| CUDA_VISIBLE_DEVICES=6 python -m mini_gc --model_name_or_path bert-base-uncased --output_dir "$EXP_DIR" --q_len 128 --d_len 256 --batch_size 256 --chunk_sizes 64 2>&1 | tee "$EXP_DIR/train.log" | |
| # gpu6, bs256-accum2, 18GB | |
| export EXP_NAME=GC-1gpu-bs256-accum2-step10k | |
| export EXP_DIR=/export/xgen-embedding/runs/ruimeng/minimal_gc/$EXP_NAME | |
| export WANDB_DIR=$EXP_DIR/wandb | |
| export WANDB_NAME=$EXP_NAME | |
| export WORLD_SIZE=1 | |
| # Clear leftovers from a previous run FIRST, then create the wandb directory; the reverse order deletes the directory that was just created. | |
| # ${EXP_DIR:?} aborts the command instead of expanding to "/*" when EXP_DIR is unset or empty. | |
| rm -rf -- "${EXP_DIR:?}"/* | |
| mkdir -p "$WANDB_DIR" | |
| cd /export/home/project/search/xgen-embedding/ | |
| # Single process on GPU 6; stdout and stderr are shown on the console and copied to train.log. | |
| CUDA_VISIBLE_DEVICES=6 python -m mini_gc --model_name_or_path bert-base-uncased --output_dir "$EXP_DIR" --q_len 128 --d_len 256 --batch_size 256 --chunk_sizes 128 2>&1 | tee "$EXP_DIR/train.log" | |
| # gpu7, bs256-accum1, 38012MB | |
| export EXP_NAME=GC-1gpu-bs256-accum1-step10k-baseline | |
| export EXP_DIR=/export/xgen-embedding/runs/ruimeng/minimal_gc/$EXP_NAME | |
| export WANDB_DIR=$EXP_DIR/wandb | |
| export WANDB_NAME=$EXP_NAME | |
| export WORLD_SIZE=1 | |
| # Clear leftovers from a previous run FIRST, then create the wandb directory; the reverse order deletes the directory that was just created. | |
| # ${EXP_DIR:?} aborts the command instead of expanding to "/*" when EXP_DIR is unset or empty. | |
| rm -rf -- "${EXP_DIR:?}"/* | |
| mkdir -p "$WANDB_DIR" | |
| cd /export/home/project/search/xgen-embedding/ | |
| # Single process on GPU 7 with --chunk_sizes -1 (presumably disables chunking for the baseline; confirm in mini_gc). Output is copied to train.log. | |
| CUDA_VISIBLE_DEVICES=7 python -m mini_gc --model_name_or_path bert-base-uncased --output_dir "$EXP_DIR" --q_len 128 --d_len 256 --batch_size 256 --chunk_sizes -1 2>&1 | tee "$EXP_DIR/train.log" | |
| ``` | |