dreilly committed on
Commit ea75dd1 · verified · 1 Parent(s): 7ea44f8

Upload folder using huggingface_hub
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/README.md ADDED
@@ -0,0 +1,9 @@
+ ---
+ library_name: peft
+ ---
+ ## Training procedure
+
+ ### Framework versions
+
+
+ - PEFT 0.4.0
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "auto_mapping": null,
+   "base_model_name_or_path": "/vast/temp/dreilly1/videollama3-image_7b_local",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "lora_alpha": 16,
+   "lora_dropout": 0.05,
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 64,
+   "revision": null,
+   "target_modules": ".*model\\.layers\\..*\\.(v_proj|o_proj|q_proj|down_proj|k_proj|up_proj|gate_proj)$",
+   "task_type": "CAUSAL_LM"
+ }
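The adapter config above describes a LoRA setup with rank 64, alpha 16, and adapters attached to every attention and MLP projection of the decoder (the `target_modules` regex matches q/k/v/o plus gate/up/down in each layer). A minimal Python sketch of loading it with PEFT follows; the local adapter directory name is a hypothetical shorthand for this folder, and the base checkpoint path is the one recorded in `base_model_name_or_path`.

```python
# Minimal sketch of loading this LoRA adapter with PEFT.
# ADAPTER_DIR is a hypothetical local copy of this repo folder;
# BASE_MODEL is the path recorded in adapter_config.json.
from transformers import AutoModelForCausalLM
from peft import PeftModel

BASE_MODEL = "/vast/temp/dreilly1/videollama3-image_7b_local"
ADAPTER_DIR = "./viscop_adapter"  # hypothetical

# The base model is a custom architecture (Videollama3Qwen2ForCausalLM),
# so remote code must be trusted when instantiating it.
base = AutoModelForCausalLM.from_pretrained(BASE_MODEL, trust_remote_code=True)

# Wraps the base model with the r=64, alpha=16 LoRA weights from this folder.
model = PeftModel.from_pretrained(base, ADAPTER_DIR)
model.eval()
```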
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1ea0d3ac9694cbc47276699b9f351e6bccc34e7ff4761e5229d4fc4d9c7b3bb0
+ size 323097578
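adapter_model.bin is stored as a Git LFS pointer: the repository tracks only the object's SHA-256 and byte size, and `git lfs pull` fetches the actual weights. A sketch of verifying a downloaded copy against the pointer's metadata, assuming a hypothetical local path:

```python
# Verify a downloaded LFS object against the pointer file's metadata.
# The oid and size are copied from the pointer above; PATH is hypothetical.
import hashlib
import os

PATH = "adapter_model.bin"  # local path after `git lfs pull`
EXPECTED_OID = "1ea0d3ac9694cbc47276699b9f351e6bccc34e7ff4761e5229d4fc4d9c7b3bb0"
EXPECTED_SIZE = 323097578  # bytes

assert os.path.getsize(PATH) == EXPECTED_SIZE, "size mismatch"

sha = hashlib.sha256()
with open(PATH, "rb") as f:
    # Hash in 1 MiB chunks to keep memory flat for large files.
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

assert sha.hexdigest() == EXPECTED_OID, "checksum mismatch"
print("adapter_model.bin verified")
```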
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/config.json ADDED
@@ -0,0 +1,140 @@
+ {
+   "_attn_implementation_autoset": true,
+   "_name_or_path": "/vast/temp/dreilly1/videollama3-image_7b_local",
+   "architectures": [
+     "Videollama3Qwen2ForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--configuration_videollama3.Videollama3Qwen2Config",
+     "AutoModelForCausalLM": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--modeling_videollama3.Videollama3Qwen2ForCausalLM"
+   },
+   "bos_token_id": 151643,
+   "domain_names": [
+     "egocentric",
+     "depth",
+     "exocentric"
+   ],
+   "enable_probe_diversity_loss": true,
+   "eos_token_id": 151645,
+   "hidden_act": "silu",
+   "hidden_size": 3584,
+   "image_aspect_ratio": "square",
+   "image_size": -1,
+   "image_token_index": 151665,
+   "image_token_length": 1,
+   "include_general_domain": true,
+   "include_visual_probes": true,
+   "include_visual_tokens": true,
+   "initializer_range": 0.02,
+   "interaction_module": "cross_attention",
+   "interaction_module_layers": null,
+   "intermediate_size": 18944,
+   "is_alignment": false,
+   "llm_lr": 1e-05,
+   "max_frames": 180,
+   "max_position_embeddings": 32768,
+   "max_window_layers": 28,
+   "mm_hidden_size": 1152,
+   "mm_projector_lr": 1e-05,
+   "mm_projector_type": "mlp2x_gelu",
+   "mm_vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
+   "mm_vision_select_feature": "patch",
+   "mm_vision_select_layer": -1,
+   "model_type": "viscop_qwen2",
+   "num_attention_heads": 28,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "num_visual_probes": 8,
+   "probe_token_index": 151668,
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000.0,
+   "sliding_window": null,
+   "tie_word_embeddings": false,
+   "tokenizer_model_max_length": 16384,
+   "tokenizer_padding_side": "right",
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.46.3",
+   "use_cache": true,
+   "use_mm_proj": true,
+   "use_sliding_window": false,
+   "use_token_compression": false,
+   "viscop_type": "multi-viscop",
+   "vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
+   "vision_encoder_config": {
+     "_attn_implementation_autoset": false,
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "gelu_pytorch_tanh",
+     "hidden_size": 1152,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "intermediate_size": 4304,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-06,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "videollama3_vision_encoder",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 16,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 27,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 14,
+     "prefix": null,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   },
+   "vision_encoder_lr": null,
+   "vocab_size": 152064
+ }
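config.json records the multi-domain ViSCoP setup: three domain experts (egocentric, depth, exocentric) plus a general domain, with 8 visual probes each and the probe-diversity loss enabled. A sketch of reading those fields back, assuming a local copy of the file and that the per-domain probe count applies uniformly:

```python
# Inspect the ViSCoP-specific fields in config.json (hypothetical local path).
import json

with open("config.json") as f:
    cfg = json.load(f)

domains = list(cfg["domain_names"])  # ["egocentric", "depth", "exocentric"]
if cfg.get("include_general_domain"):
    domains.append("general")        # the "withGeneral" in the run name

probes_per_domain = cfg["num_visual_probes"]  # 8, per "8probeeach"
total_probes = len(domains) * probes_per_domain

# 4 domains x 8 probes = 32 probe tokens alongside the visual tokens,
# assuming the count applies uniformly across domains.
print(domains, total_probes)
```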
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/non_lora_trainables.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8ee9d69eb43f474d609bb2ab579bba1882801a4422d418252f322f16114a9de4
+ size 1215957382
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/train_viscop_multi-viscop.sh ADDED
@@ -0,0 +1,163 @@
+ #!/bin/bash
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ REPO_ROOT="$SCRIPT_DIR"
+ while [ "$REPO_ROOT" != "/" ] && [ ! -d "$REPO_ROOT/.git" ] && [ ! -d "$REPO_ROOT/training_jsons" ]; do
+     REPO_ROOT="$(cd "$REPO_ROOT/.." && pwd)"
+ done
+
+ CONFIG_FILE="${CONFIG_FILE:-$REPO_ROOT/local.env}"
+ if [ -f "$CONFIG_FILE" ]; then
+     set -a
+     . "$CONFIG_FILE"
+     set +a
+ else
+     echo "Missing config: $CONFIG_FILE"
+     echo "Create $REPO_ROOT/local.env based on values for this server."
+     exit 1
+ fi
+
+ # Environment Variables
+ ARG_WORLD_SIZE=${1:-1}
+ ARG_NPROC_PER_NODE=${2:-8}
+
+ if [[ -v MASTER_ADDR_PASSED ]]; then
+     ARG_MASTER_ADDR=$MASTER_ADDR_PASSED # passed via slurm submission script
+ else
+     ARG_MASTER_ADDR=127.0.0.1 # for dev environments
+ fi
+ ARG_MASTER_PORT=12356
+ # ARG_RANK=$SLURM_NODEID
+ ARG_RANK=0
+
+ # Multiple conditions
+ if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+     WORLD_SIZE=$ARG_WORLD_SIZE
+     NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+ fi
+
+ if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+     MASTER_ADDR=$ARG_MASTER_ADDR
+     MASTER_PORT=$ARG_MASTER_PORT
+     RANK=$ARG_RANK
+ fi
+
+ echo "MASTER_ADDR: $MASTER_ADDR. MASTER_PORT: $MASTER_PORT. RANK: $RANK"
+ echo "WORLD_SIZE: $WORLD_SIZE"
+ echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+
+ # Training Arguments
+ GLOBAL_BATCH_SIZE=128 # aka effective batch size
+ LOCAL_BATCH_SIZE=8 # batch size per GPU
+ GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+ echo $GRADIENT_ACCUMULATION_STEPS
+
+ INIT_MODEL=$VISCOP_INIT_MODEL # path to base VLM (for ViSCoP we use VideoLLaMA3 as the base VLM)
+
+ NUM_DATA_WORKERS=8
+ # NUM_TRAIN_EPOCHS=3 # !
+ NUM_TRAIN_EPOCHS=1
+ LORA_TRAINING=True
+
+ # ViSCoP Arguments
+ NUM_VISUAL_PROBES=8 # > usually 16 but changed for domain experiment
+ INTERACTION_MODULE_NAME=cross_attention
+ INTERACTION_MODULE_POS=all
+ PASS_PROBES_TO_LLM=True
+ PASS_VIS_FEATURES_TO_LLM=True
+
+ # Logging Arguments
+ export WANDB_PROJECT=sony26_mm_viscop
+ REPORT_TO=wandb
+ OUTP_DIR=work_dirs/egoexo
+ RUN_NAME=viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight
+
+ # Data Arguments
+ DATA_DIR=$VISCOP_DATA_DIR
+ SHUFFLE_DATA=True # >
+ DOMAIN_NAMES=("egocentric" "depth" "exocentric") # ("egocentric" "depth" "robotics" "exocentric") # >
+ INCLUDE_GENERAL_DOMAIN=True # >
+
+ DATA_JSONS=(
+     "$REPO_ROOT/training_jsons/train-instr_viscop_egoview.json"
+     "$REPO_ROOT/training_jsons/train-instr_viscop_depthmodality.json"
+     "$REPO_ROOT/training_jsons/train_instr_viscop_exoviews.json"
+     # "$REPO_ROOT/training_jsons/D-inBC-text-multi-train-8k-front.json"
+ )
+
+ TRAINING_JSON=""
+ for json_file in "${DATA_JSONS[@]}"; do
+     TRAINING_JSON+="${json_file} "
+ done
+
+ # ! Debug: remove setting of ego max frames to 40
+ # if [[ $TRAINING_JSON == *"egoview"* ]]; then
+ #     MAX_FRAMES=40 # use 40 frames for training on ego
+ # else
+ #     MAX_FRAMES=180
+ # fi
+ MAX_FRAMES=180
+
+ # Optional Arguments. Set TESTING to 1 to quickly test the training script without logging or data workers, useful for debugging
+ TESTING=0
+ if [ $TESTING -eq 1 ]; then
+     NUM_DATA_WORKERS=0
+     REPORT_TO=none
+     RUN_NAME=TESTING
+ fi
+
+ mkdir -p "${OUTP_DIR}/${RUN_NAME}/"
+ cp "$0" "${OUTP_DIR}/${RUN_NAME}/"
+
+ torchrun --nnodes $WORLD_SIZE \
+     --nproc_per_node $NPROC_PER_NODE \
+     --master_addr=$MASTER_ADDR \
+     --master_port=$MASTER_PORT \
+     --node_rank $RANK \
+     viscop/train_multiviscop.py \
+     --interaction_module_layers $INTERACTION_MODULE_POS \
+     --interaction_module_name $INTERACTION_MODULE_NAME \
+     --viscop_type multi-viscop \
+     --enable_probe_diversity_loss True \
+     --lora_enable $LORA_TRAINING \
+     --num_train_epochs $NUM_TRAIN_EPOCHS \
+     --deepspeed scripts/zero2.json \
+     --model_type viscop_qwen2 \
+     --model_path $INIT_MODEL \
+     --vision_encoder DAMO-NLP-SG/SigLIP-NaViT \
+     --mm_projector_type mlp2x_gelu \
+     --data_path $TRAINING_JSON \
+     --shuffle_data $SHUFFLE_DATA \
+     --domain_names ${DOMAIN_NAMES[@]} \
+     --include_general_domain $INCLUDE_GENERAL_DOMAIN \
+     --data_folder $DATA_DIR \
+     --image_merge_size 2 \
+     --video_merge_size 2 \
+     --fps 1 \
+     --max_frames $MAX_FRAMES \
+     --model_max_length 16384 \
+     --mm_max_length 10240 \
+     --bf16 True \
+     --tf32 True \
+     --fp16 False \
+     --output_dir ${OUTP_DIR}/${RUN_NAME} \
+     --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+     --per_device_eval_batch_size 2 \
+     --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+     --evaluation_strategy "no" \
+     --save_strategy "no" \
+     --save_steps 5000 \
+     --save_total_limit 1 \
+     --mm_projector_lr 1e-5 \
+     --llm_lr 1e-5 \
+     --weight_decay 0. \
+     --warmup_ratio 0.03 \
+     --lr_scheduler_type "cosine" \
+     --logging_steps 1 \
+     --gradient_checkpointing True \
+     --dataloader_num_workers $NUM_DATA_WORKERS \
+     --report_to $REPORT_TO \
+     --run_name $RUN_NAME \
+     --dataset_cache_dir $VISCOP_DATASET_CACHE_DIR \
+     --include_visual_tokens $PASS_VIS_FEATURES_TO_LLM \
+     --include_visual_probes $PASS_PROBES_TO_LLM \
+     --num_visual_probes $NUM_VISUAL_PROBES
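One detail worth checking in the script above is the batch arithmetic: with the default 1 node and 8 GPUs, the `$[...]` expression resolves to 128 / (1 * 8 * 8) = 2 gradient-accumulation steps. A short Python sketch mirroring that computation with the script's default values:

```python
# Mirrors GRADIENT_ACCUMULATION_STEPS=$[GLOBAL/(WORLD_SIZE*NPROC*LOCAL)]
# from the script above, using its defaults (1 node, 8 GPUs).
GLOBAL_BATCH_SIZE = 128  # effective batch size per optimizer step
LOCAL_BATCH_SIZE = 8     # per-GPU micro-batch
WORLD_SIZE = 1           # nodes
NPROC_PER_NODE = 8       # GPUs per node

accum = GLOBAL_BATCH_SIZE // (WORLD_SIZE * NPROC_PER_NODE * LOCAL_BATCH_SIZE)

# 2 accumulation steps reproduce the 128-sample global batch exactly.
assert accum * WORLD_SIZE * NPROC_PER_NODE * LOCAL_BATCH_SIZE == GLOBAL_BATCH_SIZE
print(accum)  # -> 2
```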
viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff