#!/bin/bash echo "===========================================" echo "Testing GLEN on The Vault dataset (Small)" echo "===========================================" # Set memory monitoring parameters GPU_MEMORY_THRESHOLD=0.85 GPU_CHECK_INTERVAL=50 echo "Resource Protection enabled:" echo "- Memory threshold: ${GPU_MEMORY_THRESHOLD} (85%)" echo "- Check interval: ${GPU_CHECK_INTERVAL} steps" echo "" # Ensure data preprocessing is done echo "Checking data preprocessing..." if [ ! -f "data/the_vault/DOC_VAULT_train.tsv" ] || [ ! -f "data/the_vault/GTQ_VAULT_dev.tsv" ]; then echo "Running data preprocessing..." python scripts/preprocess_vault_dataset.py --input_dir the_vault_dataset/ --output_dir data/the_vault/ --sample_size 1000 --create_test_set if [ $? -ne 0 ]; then echo "Error: Data preprocessing failed!" exit 1 fi else echo "Data already preprocessed." fi # Phase 1 Training echo "" echo "=== Phase 1 Training (Document ID Assignment) ===" # Check if CUDA is available if command -v nvidia-smi &> /dev/null; then export CUDA_VISIBLE_DEVICES="0" echo "Using GPU for training" BATCH_SIZE=8 EVAL_BATCH_SIZE=4 ACCUM_STEPS=2 else echo "No GPU detected, using CPU with reduced batch sizes" BATCH_SIZE=2 EVAL_BATCH_SIZE=1 ACCUM_STEPS=8 fi python examples/glen_phase1/train_glen.py \ --output_dir logs/test_glen_vault/GLEN_P1_test \ --model_name_or_path t5-base \ --query_type gtq_doc \ --per_device_train_batch_size $BATCH_SIZE \ --per_device_eval_batch_size $EVAL_BATCH_SIZE \ --gradient_accumulation_steps $ACCUM_STEPS \ --dropout_rate 0.1 \ --Rdrop 0.15 \ --aug_query True \ --aug_query_type corrupted_query \ --input_dropout 1 \ --id_class t5_bm25_truncate_3 \ --dataset_name the_vault \ --test100 1 \ --tree 1 \ --pretrain_decoder True \ --max_input_length 128 \ --val_check_interval 1.0 \ --tie_word_embeddings True \ --decoder_input doc_rep \ --max_output_length 5 \ --num_return_sequences 5 \ --logging_steps 100 \ --overwrite_output_dir \ --wandb_tag glen_vault_test_p1 \ --do_eval True \ --num_train_epochs 1 \ --save_steps 1000 \ --save_strategy steps \ --evaluation_strategy steps \ --eval_steps 1000 \ --seed 42 \ --gpu_memory_threshold $GPU_MEMORY_THRESHOLD \ --gpu_check_interval $GPU_CHECK_INTERVAL \ --fp16 True \ --dataloader_num_workers 0 \ --dataloader_pin_memory False if [ $? -ne 0 ]; then echo "Error: Phase 1 training failed!" exit 1 fi echo "✅ Phase 1 training completed successfully!" # Check if Phase 1 checkpoint exists PHASE1_CKPT="logs/test_glen_vault/GLEN_P1_test" if [ ! -d "$PHASE1_CKPT" ]; then echo "Error: Phase 1 checkpoint not found at $PHASE1_CKPT" exit 1 fi # Check for model files model_files=("pytorch_model.bin" "model.safetensors") found_model=false for file in "${model_files[@]}"; do if [ -f "$PHASE1_CKPT/$file" ]; then found_model=true echo "📁 Found Phase 1 model: $file" break fi done if [ "$found_model" = false ]; then echo "Error: No model files found in Phase 1 checkpoint" exit 1 fi echo "" echo "=== Phase 2 Training (Ranking-based Refinement) ===" # Adjust batch sizes for Phase 2 if command -v nvidia-smi &> /dev/null; then BATCH_SIZE=4 EVAL_BATCH_SIZE=2 ACCUM_STEPS=4 else BATCH_SIZE=1 EVAL_BATCH_SIZE=1 ACCUM_STEPS=16 fi python examples/glen_phase2/train_glen.py \ --output_dir logs/test_glen_vault/GLEN_P2_test \ --model_name_or_path $PHASE1_CKPT \ --per_device_train_batch_size $BATCH_SIZE \ --per_device_eval_batch_size $EVAL_BATCH_SIZE \ --gradient_accumulation_steps $ACCUM_STEPS \ --dropout_rate 0.1 \ --warmup_ratio 0.1 \ --id_class t5_bm25_truncate_3 \ --dataset_name the_vault \ --tree 1 \ --q_max_len 32 \ --p_max_len 128 \ --negative_passage_type self \ --positive_passage_no_shuffle True \ --tie_word_embeddings True \ --num_return_sequences 5 \ --logging_steps 100 \ --overwrite_output_dir \ --wandb_tag glen_vault_test_p2 \ --do_eval True \ --num_train_epochs 1 \ --save_steps 1000 \ --save_strategy steps \ --evaluation_strategy steps \ --eval_steps 1000 \ --seed 42 \ --gpu_memory_threshold $GPU_MEMORY_THRESHOLD \ --gpu_check_interval $GPU_CHECK_INTERVAL \ --fp16 True \ --dataloader_num_workers 0 \ --dataloader_pin_memory False if [ $? -ne 0 ]; then echo "Error: Phase 2 training failed!" exit 1 fi echo "✅ Phase 2 training completed successfully!" # Validate Phase 2 checkpoint PHASE2_CKPT="logs/test_glen_vault/GLEN_P2_test" if [ ! -d "$PHASE2_CKPT" ]; then echo "Error: Phase 2 checkpoint not found at $PHASE2_CKPT" exit 1 fi # Check for checkpoint subdirectories or model files checkpoint_dir=$(find "$PHASE2_CKPT" -maxdepth 1 -type d -name "checkpoint-*" | sort -V | tail -n 1) if [ -n "$checkpoint_dir" ]; then echo "📁 Found Phase 2 checkpoint: $(basename $checkpoint_dir)" if [ ! -f "$checkpoint_dir/model.safetensors" ] && [ ! -f "$checkpoint_dir/pytorch_model.bin" ]; then echo "Error: No model files in checkpoint directory" exit 1 fi else # Check for model files in root found_model=false for file in "${model_files[@]}"; do if [ -f "$PHASE2_CKPT/$file" ]; then found_model=true echo "📁 Found Phase 2 model: $file" break fi done if [ "$found_model" = false ]; then echo "Error: No model files found in Phase 2 checkpoint" exit 1 fi fi echo "" echo "=== Document ID Generation ===" python examples/glen_phase2/makeid_glen.py \ --model_name_or_path $PHASE2_CKPT \ --infer_dir $PHASE2_CKPT \ --dataset_name the_vault \ --docid_file_name GLEN_P2_test_docids \ --per_device_eval_batch_size 1 \ --max_input_length 128 \ --num_return_sequences 10 \ --dataloader_num_workers 0 \ --dataloader_pin_memory False if [ $? -ne 0 ]; then echo "Error: Document ID generation failed!" exit 1 fi # Validate docid file was created docid_file="logs/test_glen_vault/GLEN_P2_test_docids.tsv" if [ ! -f "$docid_file" ]; then echo "Error: Document ID file not created: $docid_file" exit 1 fi line_count=$(wc -l < "$docid_file") echo "✅ Document ID generation completed! Generated $line_count document IDs" echo "" echo "=== Query Inference ===" # First, ensure we have test queries if [ ! -f "data/the_vault/GTQ_VAULT_dev.tsv" ]; then echo "Error: Test queries file not found. Please run preprocessing with --create_test_set flag" exit 1 fi python examples/glen_phase2/evaluate_glen.py \ --model_name_or_path $PHASE2_CKPT \ --infer_dir $PHASE2_CKPT \ --dataset_name the_vault \ --docid_file_name GLEN_P2_test_docids \ --per_device_eval_batch_size 1 \ --q_max_len 32 \ --num_return_sequences 5 \ --logs_dir logs/test_glen_vault \ --test100 1 \ --dataloader_num_workers 0 \ --dataloader_pin_memory False if [ $? -ne 0 ]; then echo "Error: Query inference failed!" exit 1 fi echo "✅ Query inference completed successfully!" echo "" echo "===========================================" echo "🎉 TESTING COMPLETED SUCCESSFULLY! 🎉" echo "===========================================" echo "" echo "📊 Summary:" echo " ✅ Phase 1 Training (Document ID Assignment)" echo " ✅ Phase 2 Training (Ranking-based Refinement)" echo " ✅ Document ID Generation ($line_count IDs)" echo " ✅ Query Inference & Evaluation" echo "" echo "📁 Results saved in: logs/test_glen_vault/" echo "📁 Document IDs: $docid_file" echo "" echo "🛡️ Resource Protection Summary:" echo " - Memory threshold: ${GPU_MEMORY_THRESHOLD} (85%)" echo " - Check interval: ${GPU_CHECK_INTERVAL} steps" echo " - FP16 training enabled" echo " - Optimized batch sizes for current hardware" echo "" echo "🚀 Testing completed! The model is ready for full training."