# anonymous-penguin's picture
# Initial code release
# 9c60174 verified
# --- run configuration -------------------------------------------------------
# Usage: <script> <maindir>
#   <maindir> is the project root expected to contain data/, code/ and model/.
export GLOO_SOCKET_IFNAME=eth0   # NIC used by Gloo for distributed comms
export WANDB_MODE=disabled       # disable Weights & Biases logging

# Normalize the root path: tolerate a missing trailing slash so that the
# concatenations below (${maindir}data, ${maindir}code, ...) always resolve
# to real paths instead of e.g. "/rootdata".
maindir=${1:-}
if [ -n "${maindir}" ]; then
  maindir=${maindir%/}/
fi
datadir=${maindir}data
codedir=${maindir}code

MAXLEN=2048   # model_max_length for both preprocessing and training
EPOCH=3       # number of fine-tuning epochs

test_data=${datadir}/memochat_instructions/test.jsonl

# Training-set sizes and model variants swept by the main loop below.
settings=("1k" "10k")
models=("t5-3b" "vicuna-7b" "vicuna-13b" "vicuna-33b")
# Sweep over all models: run zero-shot inference once per model, then
# fine-tune on each data setting and evaluate the tuned checkpoint.
# NOTE(review): these env vars are expected from the launcher and must be
# non-empty: GPU_NUM_PER_NODE, NODE_NUM, INDEX, MASTER_ADDR, MASTER_PORT.
# All expansions are quoted so paths with spaces survive word-splitting.
for model in "${models[@]}"
do
  raw_model_path=${maindir}model/fastchat-${model}/
  # GPUs Ray assigns to each inference worker: the 33B model needs 2, rest 1.
  case ${model} in
    "vicuna-33b")
      RAYGPUS=2
      ;;
    "t5-3b"|"vicuna-7b"|"vicuna-13b")
      RAYGPUS=1
      ;;
  esac
  # zeroshot inference on one node
  python3 "${codedir}/codes/eval/get_model_infer_simple.py" \
    --model-id "${model}_zeroshot" \
    --model-path "${raw_model_path}" \
    --question-file "${test_data}" \
    --answer-file "${datadir}/instruction_testing/instruction_testing_${model}_zeroshot.jsonl" \
    --num-gpus "$GPU_NUM_PER_NODE" \
    --ray-num-gpus "${RAYGPUS}"
  # tuning
  for setting in "${settings[@]}"
  do
    data_path=${datadir}/memochat_instructions/train_${setting}.json
    # ${model%-*} strips the size suffix ("t5-3b" -> "t5");
    # ${model#*-} keeps only the size  ("t5-3b" -> "3b").
    preprocessed_data_dir=${datadir}/memochat_instructions/processed_${setting}_${model%-*}.pt
    model_output_path=${maindir}model/${model}_${setting}/
    deepspeed_config_path=${codedir}/configs/ds_config_${model#*-}.json
    # Per-GPU batch and gradient accumulation chosen per model size so the
    # effective per-GPU batch (PER_GPU_BATCH * GRA_ACC) is 16 for every model.
    case ${model} in
      "t5-3b")
        PER_GPU_BATCH=8
        GRA_ACC=2
        ;;
      "vicuna-7b")
        PER_GPU_BATCH=16
        GRA_ACC=1
        ;;
      "vicuna-13b")
        PER_GPU_BATCH=8
        GRA_ACC=2
        ;;
      "vicuna-33b")
        PER_GPU_BATCH=4
        GRA_ACC=4
        ;;
    esac
    # train data preprocess
    python3 "${codedir}/codes/train/data_preprocess.py" \
      --model_name_or_path "${raw_model_path}" \
      --data_path "${data_path}" \
      --preprocessing_num_workers=1 \
      --model_max_length "${MAXLEN}" \
      --preprocessed_path "${preprocessed_data_dir}"
    # training: available for multi nodes
    torchrun --nnodes="$NODE_NUM" \
      --node_rank="$INDEX" \
      --nproc_per_node "$GPU_NUM_PER_NODE" \
      --master_addr "$MASTER_ADDR" \
      --master_port "$MASTER_PORT" \
      "${codedir}/codes/train/train.py" \
      --model_name_or_path "${raw_model_path}" \
      --bf16 True \
      --output_dir "${model_output_path}" \
      --num_train_epochs "${EPOCH}" \
      --per_device_train_batch_size "${PER_GPU_BATCH}" \
      --gradient_accumulation_steps "${GRA_ACC}" \
      --save_strategy "steps" \
      --save_steps 1500 \
      --save_total_limit 1 \
      --learning_rate 2e-5 \
      --log_level "info" \
      --logging_strategy "steps" \
      --logging_steps 1 \
      --weight_decay 0. \
      --warmup_ratio 0.04 \
      --lr_scheduler_type "cosine" \
      --deepspeed "${deepspeed_config_path}" \
      --tf32 True \
      --model_max_length "${MAXLEN}" \
      --preprocessed_path "${preprocessed_data_dir}" \
      --gradient_checkpointing True
    # tuning inference on the freshly fine-tuned checkpoint
    python3 "${codedir}/codes/eval/get_model_infer_simple.py" \
      --model-id "${model}_${setting}" \
      --model-path "${model_output_path}" \
      --question-file "${test_data}" \
      --answer-file "${datadir}/instruction_testing/instruction_testing_${model}_${setting}.jsonl" \
      --num-gpus "$GPU_NUM_PER_NODE" \
      --ray-num-gpus "${RAYGPUS}"
  done
done