#!/bin/bash
# GLEN-model / scripts/test_small_training.sh
# Provenance: QuanTH02, revision 15-06-v2, commit 08894ba
#
# Small-scale end-to-end smoke test of GLEN on The Vault dataset:
# preprocessing -> Phase 1 training -> Phase 2 training ->
# document ID generation -> query inference.
#
# NOTE: the original file had web-scrape residue above the shebang
# (including an unmatched apostrophe that made the script unparseable);
# it is preserved here as comments and the shebang moved to line 1.

echo "==========================================="
echo "Testing GLEN on The Vault dataset (Small)"
echo "==========================================="

# Resource-protection parameters forwarded to every training/inference step.
GPU_MEMORY_THRESHOLD=0.85   # abort/throttle when GPU memory use exceeds 85%
GPU_CHECK_INTERVAL=50       # check GPU memory every N training steps

echo "Resource Protection enabled:"
echo "- Memory threshold: ${GPU_MEMORY_THRESHOLD} (85%)"
echo "- Check interval: ${GPU_CHECK_INTERVAL} steps"
echo ""
# Ensure data preprocessing is done: run the preprocessor only if either
# the training docs or the dev queries TSV is missing.
echo "Checking data preprocessing..."
if [ ! -f "data/the_vault/DOC_VAULT_train.tsv" ] || [ ! -f "data/the_vault/GTQ_VAULT_dev.tsv" ]; then
  echo "Running data preprocessing..."
  # Check the command directly instead of testing $? afterwards (SC2181).
  if ! python scripts/preprocess_vault_dataset.py \
      --input_dir the_vault_dataset/ \
      --output_dir data/the_vault/ \
      --sample_size 1000 \
      --create_test_set; then
    echo "Error: Data preprocessing failed!"
    exit 1
  fi
else
  echo "Data already preprocessed."
fi
# Phase 1 Training
echo ""
echo "=== Phase 1 Training (Document ID Assignment) ==="
# Select batch-size settings for the current hardware. The effective
# training batch (per-device batch * accumulation steps) is 16 either way.
if ! command -v nvidia-smi &> /dev/null; then
  echo "No GPU detected, using CPU with reduced batch sizes"
  BATCH_SIZE=2
  EVAL_BATCH_SIZE=1
  ACCUM_STEPS=8
else
  export CUDA_VISIBLE_DEVICES="0"
  echo "Using GPU for training"
  BATCH_SIZE=8
  EVAL_BATCH_SIZE=4
  ACCUM_STEPS=2
fi
# Phase 1: train the document-ID assignment model (T5-base backbone).
# The command is tested directly with `if !` instead of inspecting $?
# afterwards (SC2181), and all variable expansions are quoted (SC2086).
if ! python examples/glen_phase1/train_glen.py \
    --output_dir logs/test_glen_vault/GLEN_P1_test \
    --model_name_or_path t5-base \
    --query_type gtq_doc \
    --per_device_train_batch_size "$BATCH_SIZE" \
    --per_device_eval_batch_size "$EVAL_BATCH_SIZE" \
    --gradient_accumulation_steps "$ACCUM_STEPS" \
    --dropout_rate 0.1 \
    --Rdrop 0.15 \
    --aug_query True \
    --aug_query_type corrupted_query \
    --input_dropout 1 \
    --id_class t5_bm25_truncate_3 \
    --dataset_name the_vault \
    --test100 1 \
    --tree 1 \
    --pretrain_decoder True \
    --max_input_length 128 \
    --val_check_interval 1.0 \
    --tie_word_embeddings True \
    --decoder_input doc_rep \
    --max_output_length 5 \
    --num_return_sequences 5 \
    --logging_steps 100 \
    --overwrite_output_dir \
    --wandb_tag glen_vault_test_p1 \
    --do_eval True \
    --num_train_epochs 1 \
    --save_steps 1000 \
    --save_strategy steps \
    --evaluation_strategy steps \
    --eval_steps 1000 \
    --seed 42 \
    --gpu_memory_threshold "$GPU_MEMORY_THRESHOLD" \
    --gpu_check_interval "$GPU_CHECK_INTERVAL" \
    --fp16 True \
    --dataloader_num_workers 0 \
    --dataloader_pin_memory False; then
  echo "Error: Phase 1 training failed!"
  exit 1
fi
echo "βœ… Phase 1 training completed successfully!"
# Make sure Phase 1 actually produced a checkpoint directory with weights.
PHASE1_CKPT="logs/test_glen_vault/GLEN_P1_test"
if [ ! -d "$PHASE1_CKPT" ]; then
  echo "Error: Phase 1 checkpoint not found at $PHASE1_CKPT"
  exit 1
fi

# Accept either serialization format; stop at the first one present.
model_files=("pytorch_model.bin" "model.safetensors")
found_model=false
for candidate in "${model_files[@]}"; do
  [ -f "$PHASE1_CKPT/$candidate" ] || continue
  found_model=true
  echo "πŸ“ Found Phase 1 model: $candidate"
  break
done
if [ "$found_model" = false ]; then
  echo "Error: No model files found in Phase 1 checkpoint"
  exit 1
fi
echo ""
echo "=== Phase 2 Training (Ranking-based Refinement) ==="
# Phase 2 is heavier per sample: halve the per-device batch and double the
# accumulation steps (effective training batch stays 16 on either hardware).
gpu_present=false
command -v nvidia-smi > /dev/null 2>&1 && gpu_present=true
if [ "$gpu_present" = true ]; then
  BATCH_SIZE=4
  EVAL_BATCH_SIZE=2
  ACCUM_STEPS=4
else
  BATCH_SIZE=1
  EVAL_BATCH_SIZE=1
  ACCUM_STEPS=16
fi
# Phase 2: ranking-based refinement, warm-started from the Phase 1 checkpoint.
# Command checked directly with `if !` (SC2181); variable expansions quoted.
if ! python examples/glen_phase2/train_glen.py \
    --output_dir logs/test_glen_vault/GLEN_P2_test \
    --model_name_or_path "$PHASE1_CKPT" \
    --per_device_train_batch_size "$BATCH_SIZE" \
    --per_device_eval_batch_size "$EVAL_BATCH_SIZE" \
    --gradient_accumulation_steps "$ACCUM_STEPS" \
    --dropout_rate 0.1 \
    --warmup_ratio 0.1 \
    --id_class t5_bm25_truncate_3 \
    --dataset_name the_vault \
    --tree 1 \
    --q_max_len 32 \
    --p_max_len 128 \
    --negative_passage_type self \
    --positive_passage_no_shuffle True \
    --tie_word_embeddings True \
    --num_return_sequences 5 \
    --logging_steps 100 \
    --overwrite_output_dir \
    --wandb_tag glen_vault_test_p2 \
    --do_eval True \
    --num_train_epochs 1 \
    --save_steps 1000 \
    --save_strategy steps \
    --evaluation_strategy steps \
    --eval_steps 1000 \
    --seed 42 \
    --gpu_memory_threshold "$GPU_MEMORY_THRESHOLD" \
    --gpu_check_interval "$GPU_CHECK_INTERVAL" \
    --fp16 True \
    --dataloader_num_workers 0 \
    --dataloader_pin_memory False; then
  echo "Error: Phase 2 training failed!"
  exit 1
fi
echo "βœ… Phase 2 training completed successfully!"
# Validate Phase 2 checkpoint
PHASE2_CKPT="logs/test_glen_vault/GLEN_P2_test"
if [ ! -d "$PHASE2_CKPT" ]; then
  echo "Error: Phase 2 checkpoint not found at $PHASE2_CKPT"
  exit 1
fi

# Prefer the newest checkpoint-* subdirectory (version-sorted); fall back
# to model files in the output root when the trainer saved no step snapshots.
checkpoint_dir=$(find "$PHASE2_CKPT" -maxdepth 1 -type d -name "checkpoint-*" | sort -V | tail -n 1)
if [ -n "$checkpoint_dir" ]; then
  # Quote the expansion passed to basename (SC2086 fix).
  echo "πŸ“ Found Phase 2 checkpoint: $(basename "$checkpoint_dir")"
  if [ ! -f "$checkpoint_dir/model.safetensors" ] && [ ! -f "$checkpoint_dir/pytorch_model.bin" ]; then
    echo "Error: No model files in checkpoint directory"
    exit 1
  fi
else
  # Check for model files in root
  found_model=false
  for file in "${model_files[@]}"; do
    if [ -f "$PHASE2_CKPT/$file" ]; then
      found_model=true
      echo "πŸ“ Found Phase 2 model: $file"
      break
    fi
  done
  if [ "$found_model" = false ]; then
    echo "Error: No model files found in Phase 2 checkpoint"
    exit 1
  fi
fi
echo ""
echo "=== Document ID Generation ==="
# Generate lexical document IDs from the Phase 2 model.
# Command checked directly with `if !` (SC2181); checkpoint path quoted.
if ! python examples/glen_phase2/makeid_glen.py \
    --model_name_or_path "$PHASE2_CKPT" \
    --infer_dir "$PHASE2_CKPT" \
    --dataset_name the_vault \
    --docid_file_name GLEN_P2_test_docids \
    --per_device_eval_batch_size 1 \
    --max_input_length 128 \
    --num_return_sequences 10 \
    --dataloader_num_workers 0 \
    --dataloader_pin_memory False; then
  echo "Error: Document ID generation failed!"
  exit 1
fi

# Validate docid file was created
docid_file="logs/test_glen_vault/GLEN_P2_test_docids.tsv"
if [ ! -f "$docid_file" ]; then
  echo "Error: Document ID file not created: $docid_file"
  exit 1
fi
line_count=$(wc -l < "$docid_file")
echo "βœ… Document ID generation completed! Generated $line_count document IDs"
echo ""
echo "=== Query Inference ==="
# First, ensure we have test queries
if [ ! -f "data/the_vault/GTQ_VAULT_dev.tsv" ]; then
  echo "Error: Test queries file not found. Please run preprocessing with --create_test_set flag"
  exit 1
fi
# Retrieval evaluation over the generated docids.
# Command checked directly with `if !` (SC2181); checkpoint path quoted.
if ! python examples/glen_phase2/evaluate_glen.py \
    --model_name_or_path "$PHASE2_CKPT" \
    --infer_dir "$PHASE2_CKPT" \
    --dataset_name the_vault \
    --docid_file_name GLEN_P2_test_docids \
    --per_device_eval_batch_size 1 \
    --q_max_len 32 \
    --num_return_sequences 5 \
    --logs_dir logs/test_glen_vault \
    --test100 1 \
    --dataloader_num_workers 0 \
    --dataloader_pin_memory False; then
  echo "Error: Query inference failed!"
  exit 1
fi
echo "βœ… Query inference completed successfully!"
# Final summary banner (informational output only; variables expand inside
# the unquoted here-doc exactly as they did in the individual echo lines).
cat <<EOF

===========================================
πŸŽ‰ TESTING COMPLETED SUCCESSFULLY! πŸŽ‰
===========================================

πŸ“Š Summary:
 βœ… Phase 1 Training (Document ID Assignment)
 βœ… Phase 2 Training (Ranking-based Refinement)
 βœ… Document ID Generation ($line_count IDs)
 βœ… Query Inference & Evaluation

πŸ“ Results saved in: logs/test_glen_vault/
πŸ“ Document IDs: $docid_file

πŸ›‘οΈ Resource Protection Summary:
 - Memory threshold: ${GPU_MEMORY_THRESHOLD} (85%)
 - Check interval: ${GPU_CHECK_INTERVAL} steps
 - FP16 training enabled
 - Optimized batch sizes for current hardware

πŸš€ Testing completed! The model is ready for full training.
EOF