#!/usr/bin/env bash
#
# For every base model: run zero-shot inference on the test set, then for
# every training-set size preprocess the data, fine-tune with torchrun +
# DeepSpeed, and run inference again with the fine-tuned checkpoint.
#
# Usage: <this script> MAINDIR
#   MAINDIR  Root directory. The script uses MAINDIR/data, MAINDIR/code and
#            MAINDIR/model. A trailing "/" is optional.
#
# Required environment:
#   GPU_NUM_PER_NODE  forwarded as --num-gpus (inference) and
#                     --nproc_per_node (torchrun)
#   NODE_NUM          forwarded to torchrun as --nnodes
#   INDEX             forwarded to torchrun as --node_rank
#   MASTER_ADDR       forwarded to torchrun as --master_addr
#   MASTER_PORT       forwarded to torchrun as --master_port

# Fail fast: preprocessing -> training -> inference form a dependency chain,
# so continuing after a failed step would only evaluate a missing or stale
# checkpoint.
set -euo pipefail

export GLOO_SOCKET_IFNAME=eth0
export WANDB_MODE=disabled

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if (( $# < 1 )) || [[ -z "$1" ]]; then
  printf 'usage: %s MAINDIR\n' "${0##*/}" >&2
  exit 2
fi

# Every path below is built by appending directly to maindir, so make sure
# it ends in exactly one "/" regardless of how the caller spelled it.
maindir="${1%/}/"
datadir="${maindir}data"
codedir="${maindir}code"

# Validate the launcher-provided variables up front instead of letting
# torchrun / the inference script fail halfway through the sweep.
for required_var in GPU_NUM_PER_NODE NODE_NUM INDEX MASTER_ADDR MASTER_PORT; do
  [[ -n "${!required_var:-}" ]] \
    || die "environment variable ${required_var} must be set"
done

readonly MAXLEN=2048
readonly EPOCH=3
readonly test_data="${datadir}/memochat_instructions/test.jsonl"

readonly settings=("1k" "10k")
readonly models=("t5-3b" "vicuna-7b" "vicuna-13b" "vicuna-33b")

#######################################
# Run the inference script on the shared test set.
# Globals:   codedir, datadir, test_data, GPU_NUM_PER_NODE, RAYGPUS (read)
# Arguments: $1 - model id; also used as the answer-file name suffix
#            $2 - path of the model/checkpoint directory to load
# Returns:   exit status of the python3 process
#######################################
run_inference() {
  local model_id=$1
  local model_path=$2

  python3 "${codedir}/codes/eval/get_model_infer_simple.py" \
    --model-id "${model_id}" \
    --model-path "${model_path}" \
    --question-file "${test_data}" \
    --answer-file "${datadir}/instruction_testing/instruction_testing_${model_id}.jsonl" \
    --num-gpus "${GPU_NUM_PER_NODE}" \
    --ray-num-gpus "${RAYGPUS}"
}

for model in "${models[@]}"; do
  raw_model_path="${maindir}model/fastchat-${model}/"

  # Per-model resources. Every row keeps PER_GPU_BATCH * GRA_ACC == 16.
  case "${model}" in
    "t5-3b")
      RAYGPUS=1
      PER_GPU_BATCH=8
      GRA_ACC=2
      ;;
    "vicuna-7b")
      RAYGPUS=1
      PER_GPU_BATCH=16
      GRA_ACC=1
      ;;
    "vicuna-13b")
      RAYGPUS=1
      PER_GPU_BATCH=8
      GRA_ACC=2
      ;;
    "vicuna-33b")
      RAYGPUS=2
      PER_GPU_BATCH=4
      GRA_ACC=4
      ;;
    *)
      # Without this arm an unknown model would silently inherit the
      # previous iteration's settings.
      die "no resource settings defined for model '${model}'"
      ;;
  esac

  # ${model#*-} keeps only the size suffix ("t5-3b" -> "3b"), which selects
  # the DeepSpeed config file.
  deepspeed_config_path="${codedir}/configs/ds_config_${model#*-}.json"

  # Baseline: evaluate the untouched base model.
  run_inference "${model}_zeroshot" "${raw_model_path}"

  for setting in "${settings[@]}"; do
    data_path="${datadir}/memochat_instructions/train_${setting}.json"
    # ${model%-*} drops the size suffix ("vicuna-13b" -> "vicuna"), so all
    # sizes of one model family use the same preprocessed file path.
    preprocessed_data_dir="${datadir}/memochat_instructions/processed_${setting}_${model%-*}.pt"
    model_output_path="${maindir}model/${model}_${setting}/"

    python3 "${codedir}/codes/train/data_preprocess.py" \
      --model_name_or_path "${raw_model_path}" \
      --data_path "${data_path}" \
      --preprocessing_num_workers=1 \
      --model_max_length "${MAXLEN}" \
      --preprocessed_path "${preprocessed_data_dir}"

    torchrun --nnodes="${NODE_NUM}" \
      --node_rank="${INDEX}" \
      --nproc_per_node "${GPU_NUM_PER_NODE}" \
      --master_addr "${MASTER_ADDR}" \
      --master_port "${MASTER_PORT}" \
      "${codedir}/codes/train/train.py" \
      --model_name_or_path "${raw_model_path}" \
      --bf16 True \
      --output_dir "${model_output_path}" \
      --num_train_epochs "${EPOCH}" \
      --per_device_train_batch_size "${PER_GPU_BATCH}" \
      --gradient_accumulation_steps "${GRA_ACC}" \
      --save_strategy "steps" \
      --save_steps 1500 \
      --save_total_limit 1 \
      --learning_rate 2e-5 \
      --log_level "info" \
      --logging_strategy "steps" \
      --logging_steps 1 \
      --weight_decay 0. \
      --warmup_ratio 0.04 \
      --lr_scheduler_type "cosine" \
      --deepspeed "${deepspeed_config_path}" \
      --tf32 True \
      --model_max_length "${MAXLEN}" \
      --preprocessed_path "${preprocessed_data_dir}" \
      --gradient_checkpointing True

    # Evaluate the checkpoint that was just written to model_output_path.
    run_inference "${model}_${setting}" "${model_output_path}"
  done
done