#!/usr/bin/env bash
# Experiment configuration.
#
# Usage: <this script> [maindir]
#   maindir  directory prefix from which the data and code directories (and
#            the model paths built later in the script) are derived. When it
#            is omitted, every path is relative to the current directory.

# Environment exported to every child process started by this script.
export GLOO_SOCKET_IFNAME=eth0
export WANDB_MODE=disabled

# Print the directory prefix $1 so that it can be concatenated directly with
# a sub-directory name: a non-empty prefix always ends in "/", an empty
# prefix stays empty (paths then resolve relative to the current directory).
with_trailing_slash() {
    local dir=$1
    if [[ -n "${dir}" && "${dir}" != */ ]]; then
        dir+=/
    fi
    printf '%s' "${dir}"
}

# The paths below are built as "${maindir}<name>" without a separator, so the
# prefix is normalised first; otherwise "/root" would yield "/rootdata".
maindir=$(with_trailing_slash "${1:-}")
datadir="${maindir}data"
codedir="${maindir}code"

# Passed as --model_max_length to preprocessing and training.
MAXLEN=2048
# Passed as --num_train_epochs to training.
EPOCH=3
# Question file used by every inference run.
test_data="${datadir}/memochat_instructions/test.jsonl"

# Training-set variants (train_<setting>.json) and the models to evaluate.
settings=("1k" "10k")
models=("t5-3b" "vicuna-7b" "vicuna-13b" "vicuna-33b")

# Run the whole experiment for each model: zero-shot inference with the raw
# checkpoint, then, for every training-set size, preprocess the data,
# fine-tune, and run inference with the fine-tuned checkpoint.
#
# Besides the variables set earlier in the script, this reads
# GPU_NUM_PER_NODE, NODE_NUM, INDEX, MASTER_ADDR and MASTER_PORT from the
# environment.
for model in "${models[@]}"; do
    raw_model_path="${maindir}model/fastchat-${model}/"

    # Per-model settings, resolved once per model because none of them
    # depends on the training-set size. Every batch/accumulation pair
    # multiplies to 16 samples per GPU per optimizer step.
    case "${model}" in
        "t5-3b")
            RAYGPUS=1
            PER_GPU_BATCH=8
            GRA_ACC=2
            ;;
        "vicuna-7b")
            RAYGPUS=1
            PER_GPU_BATCH=16
            GRA_ACC=1
            ;;
        "vicuna-13b")
            RAYGPUS=1
            PER_GPU_BATCH=8
            GRA_ACC=2
            ;;
        "vicuna-33b")
            RAYGPUS=2
            PER_GPU_BATCH=4
            GRA_ACC=4
            ;;
        *)
            # Without this arm an unlisted model would silently reuse the
            # values left over from the previous iteration.
            echo "no settings defined for model '${model}', skipping" >&2
            continue
            ;;
    esac

    # zeroshot inference on one node
    python3 "${codedir}/codes/eval/get_model_infer_simple.py" \
        --model-id "${model}_zeroshot" \
        --model-path "${raw_model_path}" \
        --question-file "${test_data}" \
        --answer-file "${datadir}/instruction_testing/instruction_testing_${model}_zeroshot.jsonl" \
        --num-gpus "${GPU_NUM_PER_NODE}" \
        --ray-num-gpus "${RAYGPUS}"

    # tuning
    for setting in "${settings[@]}"; do
        data_path="${datadir}/memochat_instructions/train_${setting}.json"
        # ${model%-*} drops the size suffix ("vicuna-13b" -> "vicuna"), so the
        # preprocessed file is keyed by model family and training-set size.
        preprocessed_data_dir="${datadir}/memochat_instructions/processed_${setting}_${model%-*}.pt"
        model_output_path="${maindir}model/${model}_${setting}/"
        # ${model#*-} keeps only the size suffix ("vicuna-13b" -> "13b"), so
        # the DeepSpeed config is selected by model size.
        deepspeed_config_path="${codedir}/configs/ds_config_${model#*-}.json"

        # train data preprocess
        python3 "${codedir}/codes/train/data_preprocess.py" \
            --model_name_or_path "${raw_model_path}" \
            --data_path "${data_path}" \
            --preprocessing_num_workers=1 \
            --model_max_length "${MAXLEN}" \
            --preprocessed_path "${preprocessed_data_dir}"

        # training: available for multi nodes
        torchrun --nnodes="${NODE_NUM}" \
            --node_rank="${INDEX}" \
            --nproc_per_node "${GPU_NUM_PER_NODE}" \
            --master_addr "${MASTER_ADDR}" \
            --master_port "${MASTER_PORT}" \
            "${codedir}/codes/train/train.py" \
            --model_name_or_path "${raw_model_path}" \
            --bf16 True \
            --output_dir "${model_output_path}" \
            --num_train_epochs "${EPOCH}" \
            --per_device_train_batch_size "${PER_GPU_BATCH}" \
            --gradient_accumulation_steps "${GRA_ACC}" \
            --save_strategy "steps" \
            --save_steps 1500 \
            --save_total_limit 1 \
            --learning_rate 2e-5 \
            --log_level "info" \
            --logging_strategy "steps" \
            --logging_steps 1 \
            --weight_decay 0. \
            --warmup_ratio 0.04 \
            --lr_scheduler_type "cosine" \
            --deepspeed "${deepspeed_config_path}" \
            --tf32 True \
            --model_max_length "${MAXLEN}" \
            --preprocessed_path "${preprocessed_data_dir}" \
            --gradient_checkpointing True

        # tuning inference
        python3 "${codedir}/codes/eval/get_model_infer_simple.py" \
            --model-id "${model}_${setting}" \
            --model-path "${model_output_path}" \
            --question-file "${test_data}" \
            --answer-file "${datadir}/instruction_testing/instruction_testing_${model}_${setting}.jsonl" \
            --num-gpus "${GPU_NUM_PER_NODE}" \
            --ray-num-gpus "${RAYGPUS}"
    done
done