#!/bin/bash
#
# Smoke-test the GLEN pipeline on a small sample of The Vault dataset.
# Stages, in order: data preprocessing, Phase 1 training, Phase 2 training,
# document ID generation, and query inference. Any failing stage aborts the
# script with exit status 1.

echo "==========================================="
echo "Testing GLEN on The Vault dataset (Small)"
echo "==========================================="

# Resource-protection settings. Both training commands receive these as
# --gpu_memory_threshold and --gpu_check_interval.
GPU_MEMORY_THRESHOLD=0.85
GPU_CHECK_INTERVAL=50

echo "Resource Protection enabled:"
echo "- Memory threshold: ${GPU_MEMORY_THRESHOLD} (85%)"
echo "- Check interval: ${GPU_CHECK_INTERVAL} steps"
echo ""

# Run preprocessing only when at least one of the two required TSV files is
# missing; otherwise reuse what is already on disk.
echo "Checking data preprocessing..."
if [[ ! -f "data/the_vault/DOC_VAULT_train.tsv" || ! -f "data/the_vault/GTQ_VAULT_dev.tsv" ]]; then
  echo "Running data preprocessing..."
  # Test the command directly rather than inspecting $? afterwards, so no
  # statement can slip in between and clobber the status.
  if ! python scripts/preprocess_vault_dataset.py \
    --input_dir the_vault_dataset/ \
    --output_dir data/the_vault/ \
    --sample_size 1000 \
    --create_test_set; then
    echo "Error: Data preprocessing failed!" >&2
    exit 1
  fi
else
  echo "Data already preprocessed."
fi

echo ""
echo "=== Phase 1 Training (Document ID Assignment) ==="

# Choose batch sizes by whether nvidia-smi is on PATH. Both branches keep
# per-device batch size x accumulation steps at 16 (8 x 2 and 2 x 8).
if command -v nvidia-smi &> /dev/null; then
  export CUDA_VISIBLE_DEVICES="0"
  echo "Using GPU for training"
  BATCH_SIZE=8
  EVAL_BATCH_SIZE=4
  ACCUM_STEPS=2
else
  echo "No GPU detected, using CPU with reduced batch sizes"
  BATCH_SIZE=2
  EVAL_BATCH_SIZE=1
  ACCUM_STEPS=8
fi

# NOTE(review): --fp16 True is passed on the CPU path as well; confirm the
# trainer accepts mixed precision without CUDA.
if ! python examples/glen_phase1/train_glen.py \
  --output_dir logs/test_glen_vault/GLEN_P1_test \
  --model_name_or_path t5-base \
  --query_type gtq_doc \
  --per_device_train_batch_size "$BATCH_SIZE" \
  --per_device_eval_batch_size "$EVAL_BATCH_SIZE" \
  --gradient_accumulation_steps "$ACCUM_STEPS" \
  --dropout_rate 0.1 \
  --Rdrop 0.15 \
  --aug_query True \
  --aug_query_type corrupted_query \
  --input_dropout 1 \
  --id_class t5_bm25_truncate_3 \
  --dataset_name the_vault \
  --test100 1 \
  --tree 1 \
  --pretrain_decoder True \
  --max_input_length 128 \
  --val_check_interval 1.0 \
  --tie_word_embeddings True \
  --decoder_input doc_rep \
  --max_output_length 5 \
  --num_return_sequences 5 \
  --logging_steps 100 \
  --overwrite_output_dir \
  --wandb_tag glen_vault_test_p1 \
  --do_eval True \
  --num_train_epochs 1 \
  --save_steps 1000 \
  --save_strategy steps \
  --evaluation_strategy steps \
  --eval_steps 1000 \
  --seed 42 \
  --gpu_memory_threshold "$GPU_MEMORY_THRESHOLD" \
  --gpu_check_interval "$GPU_CHECK_INTERVAL" \
  --fp16 True \
  --dataloader_num_workers 0 \
  --dataloader_pin_memory False; then
  echo "Error: Phase 1 training failed!" >&2
  exit 1
fi

echo "✅ Phase 1 training completed successfully!"

# Same path as the Phase 1 --output_dir; Phase 2 loads it through
# --model_name_or_path.
PHASE1_CKPT="logs/test_glen_vault/GLEN_P1_test"
if [[ ! -d "$PHASE1_CKPT" ]]; then
  echo "Error: Phase 1 checkpoint not found at $PHASE1_CKPT" >&2
  exit 1
fi

# Weight file names accepted as evidence that a model was saved. The Phase 2
# checkpoint check iterates over this same array.
model_files=("pytorch_model.bin" "model.safetensors")
found_model=false
for file in "${model_files[@]}"; do
  if [[ -f "$PHASE1_CKPT/$file" ]]; then
    found_model=true
    echo "Found Phase 1 model: $file"
    break
  fi
done

if [[ "$found_model" == false ]]; then
  echo "Error: No model files found in Phase 1 checkpoint" >&2
  exit 1
fi

echo ""
echo "=== Phase 2 Training (Ranking-based Refinement) ==="

# Phase 2 uses smaller per-device batches than Phase 1. Both branches keep
# per-device batch size x accumulation steps at 16 (4 x 4 and 1 x 16).
if command -v nvidia-smi &> /dev/null; then
  BATCH_SIZE=4
  EVAL_BATCH_SIZE=2
  ACCUM_STEPS=4
else
  BATCH_SIZE=1
  EVAL_BATCH_SIZE=1
  ACCUM_STEPS=16
fi

# Training starts from the Phase 1 output directory.
# NOTE(review): --fp16 True is passed on the CPU path as well; confirm the
# trainer accepts mixed precision without CUDA.
if ! python examples/glen_phase2/train_glen.py \
  --output_dir logs/test_glen_vault/GLEN_P2_test \
  --model_name_or_path "$PHASE1_CKPT" \
  --per_device_train_batch_size "$BATCH_SIZE" \
  --per_device_eval_batch_size "$EVAL_BATCH_SIZE" \
  --gradient_accumulation_steps "$ACCUM_STEPS" \
  --dropout_rate 0.1 \
  --warmup_ratio 0.1 \
  --id_class t5_bm25_truncate_3 \
  --dataset_name the_vault \
  --tree 1 \
  --q_max_len 32 \
  --p_max_len 128 \
  --negative_passage_type self \
  --positive_passage_no_shuffle True \
  --tie_word_embeddings True \
  --num_return_sequences 5 \
  --logging_steps 100 \
  --overwrite_output_dir \
  --wandb_tag glen_vault_test_p2 \
  --do_eval True \
  --num_train_epochs 1 \
  --save_steps 1000 \
  --save_strategy steps \
  --evaluation_strategy steps \
  --eval_steps 1000 \
  --seed 42 \
  --gpu_memory_threshold "$GPU_MEMORY_THRESHOLD" \
  --gpu_check_interval "$GPU_CHECK_INTERVAL" \
  --fp16 True \
  --dataloader_num_workers 0 \
  --dataloader_pin_memory False; then
  echo "Error: Phase 2 training failed!" >&2
  exit 1
fi

echo "✅ Phase 2 training completed successfully!"

# Same path as the Phase 2 --output_dir; the later stages load from it.
PHASE2_CKPT="logs/test_glen_vault/GLEN_P2_test"
if [[ ! -d "$PHASE2_CKPT" ]]; then
  echo "Error: Phase 2 checkpoint not found at $PHASE2_CKPT" >&2
  exit 1
fi

# Pick the last checkpoint-* subdirectory in version-sort order, so that
# checkpoint-1000 sorts after checkpoint-200. The result is used only for
# this sanity check; later stages are still pointed at $PHASE2_CKPT itself.
checkpoint_dir=$(find "$PHASE2_CKPT" -maxdepth 1 -type d -name "checkpoint-*" | sort -V | tail -n 1)
if [[ -n "$checkpoint_dir" ]]; then
  echo "Found Phase 2 checkpoint: $(basename -- "$checkpoint_dir")"
  if [[ ! -f "$checkpoint_dir/model.safetensors" && ! -f "$checkpoint_dir/pytorch_model.bin" ]]; then
    echo "Error: No model files in checkpoint directory" >&2
    exit 1
  fi
else
  # No checkpoint-* subdirectory: look for weights directly in the output
  # directory, using the model_files array set by the Phase 1 check.
  found_model=false
  for file in "${model_files[@]}"; do
    if [[ -f "$PHASE2_CKPT/$file" ]]; then
      found_model=true
      echo "Found Phase 2 model: $file"
      break
    fi
  done
  if [[ "$found_model" == false ]]; then
    echo "Error: No model files found in Phase 2 checkpoint" >&2
    exit 1
  fi
fi

echo ""
echo "=== Document ID Generation ==="

if ! python examples/glen_phase2/makeid_glen.py \
  --model_name_or_path "$PHASE2_CKPT" \
  --infer_dir "$PHASE2_CKPT" \
  --dataset_name the_vault \
  --docid_file_name GLEN_P2_test_docids \
  --per_device_eval_batch_size 1 \
  --max_input_length 128 \
  --num_return_sequences 10 \
  --dataloader_num_workers 0 \
  --dataloader_pin_memory False; then
  echo "Error: Document ID generation failed!" >&2
  exit 1
fi

# NOTE(review): assumes makeid_glen.py writes <docid_file_name>.tsv under
# logs/test_glen_vault/ — confirm against the script, since no --logs_dir is
# passed to it above.
docid_file="logs/test_glen_vault/GLEN_P2_test_docids.tsv"
if [[ ! -f "$docid_file" ]]; then
  echo "Error: Document ID file not created: $docid_file" >&2
  exit 1
fi

# Line count of the generated file; also shown in the final summary.
line_count=$(wc -l < "$docid_file")
echo "✅ Document ID generation completed! Generated $line_count document IDs"

echo ""
echo "=== Query Inference ==="

# Check for the dev query file again before inference, in case it has gone
# missing since the preprocessing check at the start of the script.
if [[ ! -f "data/the_vault/GTQ_VAULT_dev.tsv" ]]; then
  echo "Error: Test queries file not found. Please run preprocessing with --create_test_set flag" >&2
  exit 1
fi

if ! python examples/glen_phase2/evaluate_glen.py \
  --model_name_or_path "$PHASE2_CKPT" \
  --infer_dir "$PHASE2_CKPT" \
  --dataset_name the_vault \
  --docid_file_name GLEN_P2_test_docids \
  --per_device_eval_batch_size 1 \
  --q_max_len 32 \
  --num_return_sequences 5 \
  --logs_dir logs/test_glen_vault \
  --test100 1 \
  --dataloader_num_workers 0 \
  --dataloader_pin_memory False; then
  echo "Error: Query inference failed!" >&2
  exit 1
fi

echo "✅ Query inference completed successfully!"

# Final report. Reached only when every earlier stage succeeded, because each
# stage exits with status 1 on failure.
echo ""
echo "==========================================="
echo "TESTING COMPLETED SUCCESSFULLY!"
echo "==========================================="
echo ""
echo "Summary:"
echo "  ✅ Phase 1 Training (Document ID Assignment)"
echo "  ✅ Phase 2 Training (Ranking-based Refinement)"
echo "  ✅ Document ID Generation ($line_count IDs)"
echo "  ✅ Query Inference & Evaluation"
echo ""
echo "Results saved in: logs/test_glen_vault/"
echo "Document IDs: $docid_file"
echo ""
echo "🛡️ Resource Protection Summary:"
echo "   - Memory threshold: ${GPU_MEMORY_THRESHOLD} (85%)"
echo "   - Check interval: ${GPU_CHECK_INTERVAL} steps"
echo "   - FP16 training enabled"
echo "   - Optimized batch sizes for current hardware"
echo ""
echo "Testing completed! The model is ready for full training."