dreilly committed on
Commit ea75dd1 · verified · 1 Parent(s): 7ea44f8

Upload folder using huggingface_hub
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/README.md ADDED
@@ -0,0 +1,9 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+ ### Framework versions
+
+
+ - PEFT 0.4.0
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "auto_mapping": null,
+   "base_model_name_or_path": "/vast/temp/dreilly1/videollama3-image_7b_local",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "lora_alpha": 16,
+   "lora_dropout": 0.05,
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 64,
+   "revision": null,
+   "target_modules": ".*model\\.layers\\..*\\.(v_proj|o_proj|q_proj|down_proj|k_proj|up_proj|gate_proj)$",
+   "task_type": "CAUSAL_LM"
+ }
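The adapter config above describes a LoRA setup with rank 64, alpha 16, and adapters attached to every attention and MLP projection of the decoder (the `target_modules` regex matches q/k/v/o plus gate/up/down in each layer). A minimal Python sketch of loading it with PEFT follows; the local adapter directory name is a hypothetical shorthand for this folder, and the base checkpoint path is the one recorded in `base_model_name_or_path`.

```python
# Minimal sketch of loading this LoRA adapter with PEFT.
# ADAPTER_DIR is a hypothetical local copy of this repo folder;
# BASE_MODEL is the path recorded in adapter_config.json.
from transformers import AutoModelForCausalLM
from peft import PeftModel

BASE_MODEL = "/vast/temp/dreilly1/videollama3-image_7b_local"
ADAPTER_DIR = "./viscop_adapter"  # hypothetical

# The base model is a custom architecture (Videollama3Qwen2ForCausalLM),
# so remote code must be trusted when instantiating it.
base = AutoModelForCausalLM.from_pretrained(BASE_MODEL, trust_remote_code=True)

# Wraps the base model with the r=64, alpha=16 LoRA weights from this folder.
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
model.eval()
```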
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1ea0d3ac9694cbc47276699b9f351e6bccc34e7ff4761e5229d4fc4d9c7b3bb0
+ size 323097578
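adapter_model.bin is stored as a Git LFS pointer: the repository tracks only the object's SHA-256 and byte size, and `git lfs pull` fetches the actual weights. A sketch of verifying a downloaded copy against the pointer's metadata, assuming a hypothetical local path:

```python
# Verify a downloaded LFS object against the pointer file's metadata.
# The oid and size are copied from the pointer above; PATH is hypothetical.
import hashlib
import os

PATH = "adapter_model.bin"  # local path after `git lfs pull`
EXPECTED_OID = "1ea0d3ac9694cbc47276699b9f351e6bccc34e7ff4761e5229d4fc4d9c7b3bb0"
EXPECTED_SIZE = 323097578  # bytes

assert os.path.getsize(PATH) == EXPECTED_SIZE, "size mismatch"

sha = hashlib.sha256()
with open(PATH, "rb") as f:
    # Hash in 1 MiB chunks to keep memory flat for large files.
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

assert sha.hexdigest() == EXPECTED_OID, "checksum mismatch"
print("adapter_model.bin verified")
```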
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/config.json ADDED
@@ -0,0 +1,140 @@
+ {
+   "_attn_implementation_autoset": true,
+   "_name_or_path": "/vast/temp/dreilly1/videollama3-image_7b_local",
+   "architectures": [
+     "Videollama3Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--configuration_videollama3.Videollama3Qwen2Config",
+     "AutoModelForCausalLM": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--modeling_videollama3.Videollama3Qwen2ForCausalLM"
+   },
+   "bos_token_id": 151643,
+   "domain_names": [
+     "egocentric",
+     "depth",
+     "exocentric"
+   ],
+   "enable_probe_diversity_loss": true,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 3584,
+   "image_aspect_ratio": "square",
+   "image_size": -1,
+   "image_token_index": 151665,
+   "image_token_length": 1,
+   "include_general_domain": true,
+   "include_visual_probes": true,
+   "include_visual_tokens": true,
+   "initializer_range": 0.02,
+   "interaction_module": "cross_attention",
+   "interaction_module_layers": null,
+   "intermediate_size": 18944,
+   "is_alignment": false,
+   "llm_lr": 1e-05,
+   "max_frames": 180,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "mm_hidden_size": 1152,
+   "mm_projector_lr": 1e-05,
+   "mm_projector_type": "mlp2x_gelu",
+   "mm_vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
+   "mm_vision_select_feature": "patch",
+   "mm_vision_select_layer": -1,
+   "model_type": "viscop_qwen2",
+   "num_attention_heads": 28,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "num_visual_probes": 8,
+   "probe_token_index": 151668,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "tokenizer_model_max_length": 16384,
+   "tokenizer_padding_side": "right",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.46.3",
+   "use_cache": true,
+   "use_mm_proj": true,
+   "use_sliding_window": false,
+   "use_token_compression": false,
+   "viscop_type": "multi-viscop",
+   "vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
+   "vision_encoder_config": {
+     "_attn_implementation_autoset": false,
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu_pytorch_tanh",
+     "hidden_size": 1152,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "intermediate_size": 4304,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-06,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "videollama3_vision_encoder",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 27,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   },
+   "vision_encoder_lr": null,
+   "vocab_size": 152064
+ }
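config.json records the multi-domain ViSCoP setup: three domain experts (egocentric, depth, exocentric) plus a general domain, with 8 visual probes each and the probe-diversity loss enabled. A sketch of reading those fields back, assuming a local copy of the file and that the per-domain probe count applies uniformly:

```python
# Inspect the ViSCoP-specific fields in config.json (hypothetical local path).
import json

with open("config.json") as f:
    cfg = json.load(f)

domains = list(cfg["domain_names"])  # ["egocentric", "depth", "exocentric"]
if cfg.get("include_general_domain"):
    domains.append("general")        # the "withGeneral" in the run name

probes_per_domain = cfg["num_visual_probes"]  # 8, per "8probeeach"
total_probes = len(domains) * probes_per_domain

# 4 domains x 8 probes = 32 probe tokens alongside the visual tokens,
# assuming the count applies uniformly across domains.
print(domains, total_probes)
```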
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/non_lora_trainables.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8ee9d69eb43f474d609bb2ab579bba1882801a4422d418252f322f16114a9de4
+ size 1215957382
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/train_viscop_multi-viscop.sh ADDED
@@ -0,0 +1,163 @@
+ #!/bin/bash
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ REPO_ROOT="$SCRIPT_DIR"
+ while [ "$REPO_ROOT" != "/" ] && [ ! -d "$REPO_ROOT/.git" ] && [ ! -d "$REPO_ROOT/training_jsons" ]; do
+     REPO_ROOT="$(cd "$REPO_ROOT/.." && pwd)"
+ done
+
+ CONFIG_FILE="${CONFIG_FILE:-$REPO_ROOT/local.env}"
+ if [ -f "$CONFIG_FILE" ]; then
+     set -a
+     . "$CONFIG_FILE"
+     set +a
+ else
+     echo "Missing config: $CONFIG_FILE"
+     echo "Create $REPO_ROOT/local.env based on values for this server."
+     exit 1
+ fi
+
+ # Environment Variables
+ ARG_WORLD_SIZE=${1:-1}
+ ARG_NPROC_PER_NODE=${2:-8}
+
+ if [[ -v MASTER_ADDR_PASSED ]]; then
+     ARG_MASTER_ADDR=$MASTER_ADDR_PASSED # passed via slurm submission script
+ else
+     ARG_MASTER_ADDR=127.0.0.1 # for dev environments
+ fi
+ ARG_MASTER_PORT=12356
+ # ARG_RANK=$SLURM_NODEID
+ ARG_RANK=0
+
+ # Multiple conditions
+ if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+     WORLD_SIZE=$ARG_WORLD_SIZE
+     NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+ fi
+
+ if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+     MASTER_ADDR=$ARG_MASTER_ADDR
+     MASTER_PORT=$ARG_MASTER_PORT
+     RANK=$ARG_RANK
+ fi
+
+ echo "MASTER_ADDR: $MASTER_ADDR. MASTER_PORT: $MASTER_PORT. RANK: $RANK"
+ echo "WORLD_SIZE: $WORLD_SIZE"
+ echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+ # Training Arguments
+ GLOBAL_BATCH_SIZE=128 # aka effective batch size
+ LOCAL_BATCH_SIZE=8 # batch size per GPU
+ GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+ echo $GRADIENT_ACCUMULATION_STEPS
+
+ INIT_MODEL=$VISCOP_INIT_MODEL # path to base VLM (for ViSCoP we use VideoLLaMA3 as the base VLM)
+
+ NUM_DATA_WORKERS=8
+ # NUM_TRAIN_EPOCHS=3 # !
+ NUM_TRAIN_EPOCHS=1
+ LORA_TRAINING=True
+
+ # ViSCoP Arguments
+ NUM_VISUAL_PROBES=8 # > usually 16 but changed for domain experiment
+ INTERACTION_MODULE_NAME=cross_attention
+ INTERACTION_MODULE_POS=all
+ PASS_PROBES_TO_LLM=True
+ PASS_VIS_FEATURES_TO_LLM=True
+
+ # Logging Arguments
+ export WANDB_PROJECT=sony26_mm_viscop
+ REPORT_TO=wandb
+ OUTP_DIR=work_dirs/egoexo
+ RUN_NAME=viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight
+
+ # Data Arguments
+ DATA_DIR=$VISCOP_DATA_DIR
+ SHUFFLE_DATA=True # >
+ DOMAIN_NAMES=("egocentric" "depth" "exocentric") # ("egocentric" "depth" "robotics" "exocentric") # >
+ INCLUDE_GENERAL_DOMAIN=True # >
+
+ DATA_JSONS=(
+     "$REPO_ROOT/training_jsons/train-instr_viscop_egoview.json"
+     "$REPO_ROOT/training_jsons/train-instr_viscop_depthmodality.json"
+     "$REPO_ROOT/training_jsons/train_instr_viscop_exoviews.json"
+     # "$REPO_ROOT/training_jsons/D-inBC-text-multi-train-8k-front.json"
+ )
+
+ TRAINING_JSON=""
+ for json_file in "${DATA_JSONS[@]}"; do
+     TRAINING_JSON+="${json_file} "
+ done
+
+ # ! Debug: remove setting of ego max frames to 40
+ # if [[ $TRAINING_JSON == *"egoview"* ]]; then
+ #     MAX_FRAMES=40 # use 40 frames for training on ego
+ # else
+ #     MAX_FRAMES=180
+ # fi
+ MAX_FRAMES=180
+
+ # Optional Arguments. Set TESTING to 1 to quickly test the training script without logging or data workers, useful for debugging
+ TESTING=0
+ if [ $TESTING -eq 1 ]; then
+     NUM_DATA_WORKERS=0
+     REPORT_TO=none
+     RUN_NAME=TESTING
+ fi
+
+ mkdir -p "${OUTP_DIR}/${RUN_NAME}/"
+ cp "$0" "${OUTP_DIR}/${RUN_NAME}/"
+
+ torchrun --nnodes $WORLD_SIZE \
+     --nproc_per_node $NPROC_PER_NODE \
+     --master_addr=$MASTER_ADDR \
+     --master_port=$MASTER_PORT \
+     --node_rank $RANK \
+     viscop/train_multiviscop.py \
+     --interaction_module_layers $INTERACTION_MODULE_POS \
+     --interaction_module_name $INTERACTION_MODULE_NAME \
+     --viscop_type multi-viscop \
+     --enable_probe_diversity_loss True \
+     --lora_enable $LORA_TRAINING \
+     --num_train_epochs $NUM_TRAIN_EPOCHS \
+     --deepspeed scripts/zero2.json \
+     --model_type viscop_qwen2 \
+     --model_path $INIT_MODEL \
+     --vision_encoder DAMO-NLP-SG/SigLIP-NaViT \
+     --mm_projector_type mlp2x_gelu \
+     --data_path $TRAINING_JSON \
+     --shuffle_data $SHUFFLE_DATA \
+     --domain_names ${DOMAIN_NAMES[@]} \
+     --include_general_domain $INCLUDE_GENERAL_DOMAIN \
+     --data_folder $DATA_DIR \
+     --image_merge_size 2 \
+     --video_merge_size 2 \
+     --fps 1 \
+     --max_frames $MAX_FRAMES \
+     --model_max_length 16384 \
+     --mm_max_length 10240 \
+     --bf16 True \
+     --tf32 True \
+     --fp16 False \
+     --output_dir ${OUTP_DIR}/${RUN_NAME} \
+     --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+     --per_device_eval_batch_size 2 \
+     --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+     --evaluation_strategy "no" \
+     --save_strategy "no" \
+     --save_steps 5000 \
+     --save_total_limit 1 \
+     --mm_projector_lr 1e-5 \
+     --llm_lr 1e-5 \
+     --weight_decay 0. \
+     --warmup_ratio 0.03 \
+     --lr_scheduler_type "cosine" \
+     --logging_steps 1 \
+     --gradient_checkpointing True \
+     --dataloader_num_workers $NUM_DATA_WORKERS \
+     --report_to $REPORT_TO \
+     --run_name $RUN_NAME \
+     --dataset_cache_dir $VISCOP_DATASET_CACHE_DIR \
+     --include_visual_tokens $PASS_VIS_FEATURES_TO_LLM \
+     --include_visual_probes $PASS_PROBES_TO_LLM \
+     --num_visual_probes $NUM_VISUAL_PROBES
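One detail worth checking in the script above is the batch arithmetic: with the default 1 node and 8 GPUs, the `$[...]` expression resolves to 128 / (1 * 8 * 8) = 2 gradient-accumulation steps. A short Python sketch mirroring that computation with the script's default values:

```python
# Mirrors GRADIENT_ACCUMULATION_STEPS=$[GLOBAL/(WORLD_SIZE*NPROC*LOCAL)]
# from the script above, using its defaults (1 node, 8 GPUs).
GLOBAL_BATCH_SIZE = 128  # effective batch size per optimizer step
LOCAL_BATCH_SIZE = 8     # per-GPU micro-batch
WORLD_SIZE = 1           # nodes
NPROC_PER_NODE = 8       # GPUs per node

accum = GLOBAL_BATCH_SIZE // (WORLD_SIZE * NPROC_PER_NODE * LOCAL_BATCH_SIZE)

# 2 accumulation steps reproduce the 128-sample global batch exactly.
assert accum * WORLD_SIZE * NPROC_PER_NODE * LOCAL_BATCH_SIZE == GLOBAL_BATCH_SIZE
print(accum)  # -> 2
```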
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff