diff --git a/.gitattributes b/.gitattributes index f727163fd0228d789251b0e06e7d2f777407a512..83582caa201206f144067a38f0ad840c6f502eff 100644 --- a/.gitattributes +++ b/.gitattributes @@ -42,3 +42,5 @@ video_mllm_swift/s1_declip_siglip2_qwen3_1.7b/v0-20260314-141147/checkpoint-2181 video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text video_mllm_swift/s1_siglip2_qwen3_1.7b/v11-20260314-090153/checkpoint-2181/tokenizer.json filter=lfs diff=lfs merge=lfs -text video_mllm_swift/s2_declip_siglip2_qwen3_1.7b_10pct/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/video_mllm_swift/s2_image_only_10pct/v0-20260316-082051/args.json b/video_mllm_swift/s2_image_only_10pct/v0-20260316-082051/args.json new file mode 100644 index 0000000000000000000000000000000000000000..8469c3c6b959b43aa78e4a06f6ed937a079adbf8 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v0-20260316-082051/args.json @@ -0,0 +1,376 @@ +{ + "output_dir": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v0-20260316-082051", + "per_device_train_batch_size": 1, + "num_train_epochs": 3.0, + "max_steps": 500, + "learning_rate": 1e-05, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_steps": 0, + "optim": "adamw_torch_fused", + "optim_args": null, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_epsilon": 1e-08, + "optim_target_modules": null, + "gradient_accumulation_steps": 8, + "average_tokens_across_devices": true, + "max_grad_norm": 1.0, + "label_smoothing_factor": 0.0, + "bf16": true, + "fp16": false, + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": "{\"use_reentrant\": false}", + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "use_liger_kernel": false, + "liger_kernel_config": null, + "use_cache": false, + "neftune_noise_alpha": null, + "torch_empty_cache_steps": null, + "auto_find_batch_size": false, + "logging_strategy": "steps", + "logging_steps": 1, + "logging_first_step": true, + "log_on_each_node": true, + "logging_nan_inf_filter": true, + "include_num_input_tokens_seen": false, + "log_level": "passive", + "log_level_replica": "warning", + "disable_tqdm": null, + "report_to": [ + "none" + ], + "run_name": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v0-20260316-082051", + "project": "huggingface", + "trackio_space_id": "trackio", + "eval_strategy": "no", + "eval_steps": 100.0, + "eval_delay": 0, + "per_device_eval_batch_size": 1, + "prediction_loss_only": false, + "eval_on_start": false, + "eval_do_concat_batches": true, + "eval_use_gather_object": false, + "eval_accumulation_steps": null, + "include_for_metrics": [], + "batch_eval_metrics": false, + "save_only_model": false, + "save_strategy": "steps", + "save_steps": 100.0, + "save_on_each_node": false, + "save_total_limit": 2, + "enable_jit_checkpoint": false, + "push_to_hub": false, + "hub_token": null, + "hub_private_repo": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_always_push": false, + "hub_revision": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "restore_callback_states_from_checkpoint": false, + "full_determinism": false, + "seed": 42, + "data_seed": 42, + "use_cpu": false, + "accelerator_config": "{\"dispatch_batches\": false}", + "parallelism_config": null, + "dataloader_drop_last": false, + "dataloader_num_workers": 4, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "dataloader_prefetch_factor": 4, + "remove_unused_columns": true, + "label_names": null, + "train_sampling_strategy": "random", + "length_column_name": "length", + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "ddp_backend": null, + "ddp_timeout": 36000, + "fsdp": [], + "fsdp_config": null, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": false, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "debug": null, + "skip_memory_metrics": true, + "do_train": false, + "do_eval": false, + "do_predict": false, + "resume_from_checkpoint": null, + "warmup_ratio": 0.05, + "logging_dir": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v0-20260316-082051/runs", + "local_rank": 0, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "tuner_backend": "peft", + "vit_gradient_checkpointing": null, + "router_aux_loss_coef": 0.0, + "enable_dft_loss": false, + "enable_channel_loss": false, + "safe_serialization": true, + "max_shard_size": "5GB", + "check_model": true, + "acc_strategy": "token", + "train_dataloader_shuffle": true, + "group_by_length": false, + "max_epochs": null, + "aligner_lr": null, + "vit_lr": 1e-06, + "use_logits_to_keep": null, + "ds3_gather_for_generation": true, + "resume_only_model": false, + "optimizer": null, + "loss_type": null, + "eval_metric": null, + "callbacks": [], + "early_stop_interval": null, + "eval_use_evalscope": false, + "eval_dataset": [], + "eval_dataset_args": null, + "eval_limit": null, + "eval_generation_config": null, + "extra_eval_args": null, + "tuner_type": "full", + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "use_flash_ckpt": false, + "use_ray": false, + "ray_exp_name": null, + "device_groups": null, + "model": "/opt/tiger/model_cache/checkpoint-2181", + "model_type": "llava_siglip2_qwen3", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": "flash_attn", + "experts_impl": null, + "new_special_tokens": [], + "num_labels": null, + "problem_type": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "max_model_len": null, + "local_repo_path": null, + "init_strategy": null, + "template": "llava_siglip2_qwen3", + "system": null, + "max_length": 16384, + "truncation_strategy": "delete", + "max_pixels": null, + "agent_template": null, + "norm_bbox": null, + "use_chat_template": true, + "padding_side": "right", + "padding_free": true, + "loss_scale": "default", + "sequence_parallel_size": 1, + "template_backend": "swift", + "response_prefix": null, + "enable_thinking": null, + "add_non_thinking_prefix": true, + "dataset": [ + "vmllm_s2_image_10pct" + ], + "val_dataset": [], + "cached_dataset": [], + "cached_val_dataset": [], + "split_dataset_ratio": 0.0, + "dataset_num_proc": 64, + "load_from_cache_file": true, + "dataset_shuffle": true, + "val_dataset_shuffle": false, + "streaming": false, + "interleave_prob": null, + "stopping_strategy": "first_exhausted", + "shuffle_buffer_size": 1000, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": null, + "model_author": null, + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "structured_outputs_regex": null, + "train_type": null, + "adapters": [], + "external_plugins": [ + "video_mllm/model_plugin.py", + "video_mllm/dataset_plugin.py" + ], + "custom_register_path": [], + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "packing": true, + "packing_length": 16384, + "packing_num_proc": 1, + "lazy_tokenize": false, + "use_hf": true, + "ignore_args_error": false, + "use_swift_lora": false, + "freeze_parameters": [], + "freeze_parameters_regex": null, + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [ + "model.multi_modal_projector" + ], + "trainable_parameters_regex": null, + "freeze_llm": false, + "freeze_vit": false, + "freeze_aligner": false, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "target_parameters": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "swanlab_token": null, + "swanlab_project": "ms-swift", + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_notification_method": null, + "swanlab_webhook_url": null, + "swanlab_secret": null, + "swanlab_sender_email": null, + "swanlab_receiver_email": null, + "swanlab_smtp_server": null, + "swanlab_smtp_port": null, + "swanlab_email_language": "zh", + "swanlab_mode": "cloud", + "add_version": true, + "create_checkpoint_symlink": false, + "zero_hpz_partition_size": null, + "deepspeed_autotp_size": null, + "swift_version": "4.1.0.dev0", + "ckpt_dir": "/opt/tiger/model_cache/checkpoint-2181", + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "checkpoint-2181", + "model_info": "ModelInfo(model_type='llava_siglip2_qwen3', model_dir='/opt/tiger/model_cache/checkpoint-2181', torch_dtype=torch.bfloat16, max_model_len=40960, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='llava_siglip2_qwen3', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-0.6B', hf_model_id='Qwen/Qwen3-0.6B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B', hf_model_id='Qwen/Qwen3-1.7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B', hf_model_id='Qwen/Qwen3-4B', model_path=None, ms_revision=None, hf_revision=None)], template=None, ignore_patterns=None, requires=None, tags=[])], loader=, template='llava_siglip2_qwen3', model_arch=MultiModelKeys(arch_name='llava_hf', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.multi_modal_projector'], vision_tower=['model.vision_tower'], generator=[]), architectures=['LlavaOnevisionForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=[], tags=['vision', 'video'])", + "model_dir": "/opt/tiger/model_cache/checkpoint-2181", + "template_meta": "QwenTemplateMeta(template_type='llava_siglip2_qwen3', prefix=[], prompt=['<|im_start|>user\\n{{QUERY}}<|im_end|>\\n<|im_start|>assistant\\n'], chat_sep=['<|im_end|>\\n'], suffix=['<|im_end|>\\n'], template_cls=, system_prefix=['<|im_start|>system\\n{{SYSTEM}}<|im_end|>\\n'], default_system=None, auto_add_bos=False, stop_words=['<|endoftext|>'], agent_template='hermes', is_thinking=False, thinking_prefix='', non_thinking_prefix='', history_thinking_prefix='')", + "_val_dataset_exists": false, + "hub": "", + "evaluation_strategy": "steps", + "training_args": "Seq2SeqTrainingArguments(output_dir='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v0-20260316-082051', per_device_train_batch_size=1, num_train_epochs=3.0, max_steps=500, learning_rate=1e-05, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_steps=0.05, optim=, optim_args=None, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, optim_target_modules=None, gradient_accumulation_steps=8, average_tokens_across_devices=None, max_grad_norm=1.0, label_smoothing_factor=0.0, bf16=True, fp16=False, bf16_full_eval=False, fp16_full_eval=False, tf32=None, gradient_checkpointing=True, gradient_checkpointing_kwargs={'use_reentrant': False}, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, use_liger_kernel=False, liger_kernel_config=None, use_cache=False, neftune_noise_alpha=None, torch_empty_cache_steps=None, auto_find_batch_size=False, logging_strategy=, logging_steps=1, logging_first_step=True, log_on_each_node=True, logging_nan_inf_filter=True, include_num_input_tokens_seen=None, log_level='passive', log_level_replica='warning', disable_tqdm=False, report_to=[], run_name='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v0-20260316-082051', project='huggingface', trackio_space_id='trackio', eval_strategy=, eval_steps=100.0, eval_delay=0, per_device_eval_batch_size=1, prediction_loss_only=False, eval_on_start=False, eval_do_concat_batches=True, eval_use_gather_object=False, eval_accumulation_steps=None, include_for_metrics=[], batch_eval_metrics=False, save_only_model=False, save_strategy=, save_steps=100, save_on_each_node=False, save_total_limit=2, enable_jit_checkpoint=False, push_to_hub=False, hub_token=None, hub_private_repo=None, hub_model_id=None, hub_strategy=, hub_always_push=False, hub_revision=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, restore_callback_states_from_checkpoint=False, full_determinism=False, seed=42, data_seed=42, use_cpu=False, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_pin_memory=True, dataloader_persistent_workers=False, dataloader_prefetch_factor=4, remove_unused_columns=False, label_names=None, train_sampling_strategy='random', length_column_name='length', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, ddp_backend=None, ddp_timeout=36000, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, debug=[], skip_memory_metrics=True, do_train=False, do_eval=False, do_predict=False, resume_from_checkpoint=None, warmup_ratio=0.05, logging_dir='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v0-20260316-082051/runs', local_rank=0, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, safe_serialization=True, max_shard_size='5GB', check_model=True, acc_strategy='token', train_dataloader_shuffle=True, group_by_length=False, max_epochs=None, aligner_lr=None, vit_lr=1e-06, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer='multimodal', loss_type=None, eval_metric=None, callbacks=[], early_stop_interval=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, tuner_type='full', use_galore=False, galore_target_modules=None, galore_rank=128, galore_update_proj_gap=50, galore_scale=1.0, galore_proj_type='std', galore_optim_per_parameter=False, galore_with_embedding=False, galore_quantization=False, galore_proj_quant=False, galore_proj_bits=4, galore_proj_group_size=256, galore_cos_threshold=0.4, galore_gamma_proj=2, galore_queue_size=5, lisa_activated_layers=0, lisa_step_interval=20, use_flash_ckpt=False)" +} \ No newline at end of file diff --git a/video_mllm_swift/s2_image_only_10pct/v0-20260316-082051/logging.jsonl b/video_mllm_swift/s2_image_only_10pct/v0-20260316-082051/logging.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0d6dd8346b899250bb6d356652d94f656a25cf20 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v0-20260316-082051/logging.jsonl @@ -0,0 +1 @@ +{"train_dataset": "15834.342530±556.264794, min=14612.000000, max=16384.000000, size=14860", "model_parameter_info": "LlavaOnevisionForConditionalGeneration: 2436.0474M Params (2436.0474M Trainable [100.0000%]), 0.0009M Buffers.", "last_model_checkpoint": null, "best_model_checkpoint": null, "best_metric": null, "global_step": 0, "log_history": [], "memory": null} diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/args.json b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/args.json new file mode 100644 index 0000000000000000000000000000000000000000..00f89bd96dbafe45ae235624add21f86eaa0e032 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/args.json @@ -0,0 +1,376 @@ +{ + "output_dir": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215", + "per_device_train_batch_size": 1, + "num_train_epochs": 3.0, + "max_steps": 500, + "learning_rate": 1e-05, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_steps": 0, + "optim": "adamw_torch_fused", + "optim_args": null, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_epsilon": 1e-08, + "optim_target_modules": null, + "gradient_accumulation_steps": 8, + "average_tokens_across_devices": true, + "max_grad_norm": 1.0, + "label_smoothing_factor": 0.0, + "bf16": true, + "fp16": false, + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": "{\"use_reentrant\": false}", + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "use_liger_kernel": false, + "liger_kernel_config": null, + "use_cache": false, + "neftune_noise_alpha": null, + "torch_empty_cache_steps": null, + "auto_find_batch_size": false, + "logging_strategy": "steps", + "logging_steps": 1, + "logging_first_step": true, + "log_on_each_node": true, + "logging_nan_inf_filter": true, + "include_num_input_tokens_seen": false, + "log_level": "passive", + "log_level_replica": "warning", + "disable_tqdm": null, + "report_to": [ + "none" + ], + "run_name": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215", + "project": "huggingface", + "trackio_space_id": "trackio", + "eval_strategy": "no", + "eval_steps": 100.0, + "eval_delay": 0, + "per_device_eval_batch_size": 1, + "prediction_loss_only": false, + "eval_on_start": false, + "eval_do_concat_batches": true, + "eval_use_gather_object": false, + "eval_accumulation_steps": null, + "include_for_metrics": [], + "batch_eval_metrics": false, + "save_only_model": false, + "save_strategy": "steps", + "save_steps": 100.0, + "save_on_each_node": false, + "save_total_limit": 2, + "enable_jit_checkpoint": false, + "push_to_hub": false, + "hub_token": null, + "hub_private_repo": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_always_push": false, + "hub_revision": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "restore_callback_states_from_checkpoint": false, + "full_determinism": false, + "seed": 42, + "data_seed": 42, + "use_cpu": false, + "accelerator_config": "{\"dispatch_batches\": false}", + "parallelism_config": null, + "dataloader_drop_last": false, + "dataloader_num_workers": 4, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "dataloader_prefetch_factor": 4, + "remove_unused_columns": true, + "label_names": null, + "train_sampling_strategy": "random", + "length_column_name": "length", + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "ddp_backend": null, + "ddp_timeout": 7200, + "fsdp": [], + "fsdp_config": null, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": false, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "debug": null, + "skip_memory_metrics": true, + "do_train": false, + "do_eval": false, + "do_predict": false, + "resume_from_checkpoint": null, + "warmup_ratio": 0.05, + "logging_dir": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/runs", + "local_rank": 0, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "tuner_backend": "peft", + "vit_gradient_checkpointing": null, + "router_aux_loss_coef": 0.0, + "enable_dft_loss": false, + "enable_channel_loss": false, + "safe_serialization": true, + "max_shard_size": "5GB", + "check_model": true, + "acc_strategy": "token", + "train_dataloader_shuffle": true, + "group_by_length": false, + "max_epochs": null, + "aligner_lr": null, + "vit_lr": 1e-06, + "use_logits_to_keep": null, + "ds3_gather_for_generation": true, + "resume_only_model": false, + "optimizer": null, + "loss_type": null, + "eval_metric": null, + "callbacks": [], + "early_stop_interval": null, + "eval_use_evalscope": false, + "eval_dataset": [], + "eval_dataset_args": null, + "eval_limit": null, + "eval_generation_config": null, + "extra_eval_args": null, + "tuner_type": "full", + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "use_flash_ckpt": false, + "use_ray": false, + "ray_exp_name": null, + "device_groups": null, + "model": "/opt/tiger/model_cache/checkpoint-2181", + "model_type": "llava_siglip2_qwen3", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": "flash_attn", + "experts_impl": null, + "new_special_tokens": [], + "num_labels": null, + "problem_type": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "max_model_len": null, + "local_repo_path": null, + "init_strategy": null, + "template": "llava_siglip2_qwen3", + "system": null, + "max_length": 16384, + "truncation_strategy": "delete", + "max_pixels": null, + "agent_template": null, + "norm_bbox": null, + "use_chat_template": true, + "padding_side": "right", + "padding_free": true, + "loss_scale": "default", + "sequence_parallel_size": 1, + "template_backend": "swift", + "response_prefix": null, + "enable_thinking": null, + "add_non_thinking_prefix": true, + "dataset": [], + "val_dataset": [], + "cached_dataset": [ + "/mnt/bn/strategy-mllm-train/common/datasets/vmllm_cached/image_10pct/train" + ], + "cached_val_dataset": [], + "split_dataset_ratio": 0.0, + "dataset_num_proc": 1, + "load_from_cache_file": false, + "dataset_shuffle": true, + "val_dataset_shuffle": false, + "streaming": false, + "interleave_prob": null, + "stopping_strategy": "first_exhausted", + "shuffle_buffer_size": 1000, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": null, + "model_author": null, + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "structured_outputs_regex": null, + "train_type": null, + "adapters": [], + "external_plugins": [ + "video_mllm/model_plugin.py", + "video_mllm/dataset_plugin.py" + ], + "custom_register_path": [], + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "packing": true, + "packing_length": 16384, + "packing_num_proc": 1, + "lazy_tokenize": false, + "use_hf": true, + "ignore_args_error": false, + "use_swift_lora": false, + "freeze_parameters": [], + "freeze_parameters_regex": null, + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [ + "model.multi_modal_projector" + ], + "trainable_parameters_regex": null, + "freeze_llm": false, + "freeze_vit": false, + "freeze_aligner": false, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "target_parameters": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "swanlab_token": null, + "swanlab_project": "ms-swift", + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_notification_method": null, + "swanlab_webhook_url": null, + "swanlab_secret": null, + "swanlab_sender_email": null, + "swanlab_receiver_email": null, + "swanlab_smtp_server": null, + "swanlab_smtp_port": null, + "swanlab_email_language": "zh", + "swanlab_mode": "cloud", + "add_version": true, + "create_checkpoint_symlink": false, + "zero_hpz_partition_size": null, + "deepspeed_autotp_size": null, + "swift_version": "4.1.0.dev0", + "ckpt_dir": "/opt/tiger/model_cache/checkpoint-2181", + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "checkpoint-2181", + "model_info": "ModelInfo(model_type='llava_siglip2_qwen3', model_dir='/opt/tiger/model_cache/checkpoint-2181', torch_dtype=torch.bfloat16, max_model_len=40960, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='llava_siglip2_qwen3', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-0.6B', hf_model_id='Qwen/Qwen3-0.6B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B', hf_model_id='Qwen/Qwen3-1.7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B', hf_model_id='Qwen/Qwen3-4B', model_path=None, ms_revision=None, hf_revision=None)], template=None, ignore_patterns=None, requires=None, tags=[])], loader=, template='llava_siglip2_qwen3', model_arch=MultiModelKeys(arch_name='llava_hf', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.multi_modal_projector'], vision_tower=['model.vision_tower'], generator=[]), architectures=['LlavaOnevisionForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=[], tags=['vision', 'video'])", + "model_dir": "/opt/tiger/model_cache/checkpoint-2181", + "template_meta": "QwenTemplateMeta(template_type='llava_siglip2_qwen3', prefix=[], prompt=['<|im_start|>user\\n{{QUERY}}<|im_end|>\\n<|im_start|>assistant\\n'], chat_sep=['<|im_end|>\\n'], suffix=['<|im_end|>\\n'], template_cls=, system_prefix=['<|im_start|>system\\n{{SYSTEM}}<|im_end|>\\n'], default_system=None, auto_add_bos=False, stop_words=['<|endoftext|>'], agent_template='hermes', is_thinking=False, thinking_prefix='', non_thinking_prefix='', history_thinking_prefix='')", + "_val_dataset_exists": false, + "hub": "", + "evaluation_strategy": "steps", + "training_args": "Seq2SeqTrainingArguments(output_dir='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215', per_device_train_batch_size=1, num_train_epochs=3.0, max_steps=500, learning_rate=1e-05, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_steps=0.05, optim=, optim_args=None, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, optim_target_modules=None, gradient_accumulation_steps=8, average_tokens_across_devices=None, max_grad_norm=1.0, label_smoothing_factor=0.0, bf16=True, fp16=False, bf16_full_eval=False, fp16_full_eval=False, tf32=None, gradient_checkpointing=True, gradient_checkpointing_kwargs={'use_reentrant': False}, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, use_liger_kernel=False, liger_kernel_config=None, use_cache=False, neftune_noise_alpha=None, torch_empty_cache_steps=None, auto_find_batch_size=False, logging_strategy=, logging_steps=1, logging_first_step=True, log_on_each_node=True, logging_nan_inf_filter=True, include_num_input_tokens_seen=None, log_level='passive', log_level_replica='warning', disable_tqdm=False, report_to=[], run_name='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215', project='huggingface', trackio_space_id='trackio', eval_strategy=, eval_steps=100.0, eval_delay=0, per_device_eval_batch_size=1, prediction_loss_only=False, eval_on_start=False, eval_do_concat_batches=True, eval_use_gather_object=False, eval_accumulation_steps=None, include_for_metrics=[], batch_eval_metrics=False, save_only_model=False, save_strategy=, save_steps=100, save_on_each_node=False, save_total_limit=2, enable_jit_checkpoint=False, push_to_hub=False, hub_token=None, hub_private_repo=None, hub_model_id=None, hub_strategy=, hub_always_push=False, hub_revision=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, restore_callback_states_from_checkpoint=False, full_determinism=False, seed=42, data_seed=42, use_cpu=False, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_pin_memory=True, dataloader_persistent_workers=False, dataloader_prefetch_factor=4, remove_unused_columns=False, label_names=None, train_sampling_strategy='random', length_column_name='length', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, ddp_backend=None, ddp_timeout=7200, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, debug=[], skip_memory_metrics=True, do_train=False, do_eval=False, do_predict=False, resume_from_checkpoint=None, warmup_ratio=0.05, logging_dir='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/runs', local_rank=0, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, safe_serialization=True, max_shard_size='5GB', check_model=True, acc_strategy='token', train_dataloader_shuffle=True, group_by_length=False, max_epochs=None, aligner_lr=None, vit_lr=1e-06, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer='multimodal', loss_type=None, eval_metric=None, callbacks=[], early_stop_interval=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, tuner_type='full', use_galore=False, galore_target_modules=None, galore_rank=128, galore_update_proj_gap=50, galore_scale=1.0, galore_proj_type='std', galore_optim_per_parameter=False, galore_with_embedding=False, galore_quantization=False, galore_proj_quant=False, galore_proj_bits=4, galore_proj_group_size=256, galore_cos_threshold=0.4, galore_gamma_proj=2, galore_queue_size=5, lisa_activated_layers=0, lisa_step_interval=20, use_flash_ckpt=False)" +} \ No newline at end of file diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/args.json b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/args.json new file mode 100644 index 0000000000000000000000000000000000000000..00f89bd96dbafe45ae235624add21f86eaa0e032 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/args.json @@ -0,0 +1,376 @@ +{ + "output_dir": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215", + "per_device_train_batch_size": 1, + "num_train_epochs": 3.0, + "max_steps": 500, + "learning_rate": 1e-05, + "lr_scheduler_type": "cosine", + "lr_scheduler_kwargs": null, + "warmup_steps": 0, + "optim": "adamw_torch_fused", + "optim_args": null, + "weight_decay": 0.1, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_epsilon": 1e-08, + "optim_target_modules": null, + "gradient_accumulation_steps": 8, + "average_tokens_across_devices": true, + "max_grad_norm": 1.0, + "label_smoothing_factor": 0.0, + "bf16": true, + "fp16": false, + "bf16_full_eval": false, + "fp16_full_eval": false, + "tf32": null, + "gradient_checkpointing": true, + "gradient_checkpointing_kwargs": "{\"use_reentrant\": false}", + "torch_compile": false, + "torch_compile_backend": null, + "torch_compile_mode": null, + "use_liger_kernel": false, + "liger_kernel_config": null, + "use_cache": false, + "neftune_noise_alpha": null, + "torch_empty_cache_steps": null, + "auto_find_batch_size": false, + "logging_strategy": "steps", + "logging_steps": 1, + "logging_first_step": true, + "log_on_each_node": true, + "logging_nan_inf_filter": true, + "include_num_input_tokens_seen": false, + "log_level": "passive", + "log_level_replica": "warning", + "disable_tqdm": null, + "report_to": [ + "none" + ], + "run_name": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215", + "project": "huggingface", + "trackio_space_id": "trackio", + "eval_strategy": "no", + "eval_steps": 100.0, + "eval_delay": 0, + "per_device_eval_batch_size": 1, + "prediction_loss_only": false, + "eval_on_start": false, + "eval_do_concat_batches": true, + "eval_use_gather_object": false, + "eval_accumulation_steps": null, + "include_for_metrics": [], + "batch_eval_metrics": false, + "save_only_model": false, + "save_strategy": "steps", + "save_steps": 100.0, + "save_on_each_node": false, + "save_total_limit": 2, + "enable_jit_checkpoint": false, + "push_to_hub": false, + "hub_token": null, + "hub_private_repo": null, + "hub_model_id": null, + "hub_strategy": "every_save", + "hub_always_push": false, + "hub_revision": null, + "load_best_model_at_end": false, + "metric_for_best_model": "loss", + "greater_is_better": false, + "ignore_data_skip": false, + "restore_callback_states_from_checkpoint": false, + "full_determinism": false, + "seed": 42, + "data_seed": 42, + "use_cpu": false, + "accelerator_config": "{\"dispatch_batches\": false}", + "parallelism_config": null, + "dataloader_drop_last": false, + "dataloader_num_workers": 4, + "dataloader_pin_memory": true, + "dataloader_persistent_workers": false, + "dataloader_prefetch_factor": 4, + "remove_unused_columns": true, + "label_names": null, + "train_sampling_strategy": "random", + "length_column_name": "length", + "ddp_find_unused_parameters": null, + "ddp_bucket_cap_mb": null, + "ddp_broadcast_buffers": null, + "ddp_backend": null, + "ddp_timeout": 7200, + "fsdp": [], + "fsdp_config": null, + "deepspeed": { + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 200000000.0, + "overlap_comm": false, + "reduce_scatter": true, + "reduce_bucket_size": 200000000.0, + "contiguous_gradients": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false + }, + "debug": null, + "skip_memory_metrics": true, + "do_train": false, + "do_eval": false, + "do_predict": false, + "resume_from_checkpoint": null, + "warmup_ratio": 0.05, + "logging_dir": "/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/runs", + "local_rank": 0, + "sortish_sampler": false, + "predict_with_generate": false, + "generation_max_length": null, + "generation_num_beams": null, + "generation_config": null, + "tuner_backend": "peft", + "vit_gradient_checkpointing": null, + "router_aux_loss_coef": 0.0, + "enable_dft_loss": false, + "enable_channel_loss": false, + "safe_serialization": true, + "max_shard_size": "5GB", + "check_model": true, + "acc_strategy": "token", + "train_dataloader_shuffle": true, + "group_by_length": false, + "max_epochs": null, + "aligner_lr": null, + "vit_lr": 1e-06, + "use_logits_to_keep": null, + "ds3_gather_for_generation": true, + "resume_only_model": false, + "optimizer": null, + "loss_type": null, + "eval_metric": null, + "callbacks": [], + "early_stop_interval": null, + "eval_use_evalscope": false, + "eval_dataset": [], + "eval_dataset_args": null, + "eval_limit": null, + "eval_generation_config": null, + "extra_eval_args": null, + "tuner_type": "full", + "use_galore": false, + "galore_target_modules": null, + "galore_rank": 128, + "galore_update_proj_gap": 50, + "galore_scale": 1.0, + "galore_proj_type": "std", + "galore_optim_per_parameter": false, + "galore_with_embedding": false, + "galore_quantization": false, + "galore_proj_quant": false, + "galore_proj_bits": 4, + "galore_proj_group_size": 256, + "galore_cos_threshold": 0.4, + "galore_gamma_proj": 2, + "galore_queue_size": 5, + "lisa_activated_layers": 0, + "lisa_step_interval": 20, + "use_flash_ckpt": false, + "use_ray": false, + "ray_exp_name": null, + "device_groups": null, + "model": "/opt/tiger/model_cache/checkpoint-2181", + "model_type": "llava_siglip2_qwen3", + "model_revision": null, + "task_type": "causal_lm", + "torch_dtype": "bfloat16", + "attn_impl": "flash_attn", + "experts_impl": null, + "new_special_tokens": [], + "num_labels": null, + "problem_type": null, + "rope_scaling": null, + "device_map": null, + "max_memory": {}, + "max_model_len": null, + "local_repo_path": null, + "init_strategy": null, + "template": "llava_siglip2_qwen3", + "system": null, + "max_length": 16384, + "truncation_strategy": "delete", + "max_pixels": null, + "agent_template": null, + "norm_bbox": null, + "use_chat_template": true, + "padding_side": "right", + "padding_free": true, + "loss_scale": "default", + "sequence_parallel_size": 1, + "template_backend": "swift", + "response_prefix": null, + "enable_thinking": null, + "add_non_thinking_prefix": true, + "dataset": [], + "val_dataset": [], + "cached_dataset": [ + "/mnt/bn/strategy-mllm-train/common/datasets/vmllm_cached/image_10pct/train" + ], + "cached_val_dataset": [], + "split_dataset_ratio": 0.0, + "dataset_num_proc": 1, + "load_from_cache_file": false, + "dataset_shuffle": true, + "val_dataset_shuffle": false, + "streaming": false, + "interleave_prob": null, + "stopping_strategy": "first_exhausted", + "shuffle_buffer_size": 1000, + "download_mode": "reuse_dataset_if_exists", + "columns": {}, + "strict": false, + "model_name": null, + "model_author": null, + "custom_dataset_info": [], + "quant_method": null, + "quant_bits": null, + "hqq_axis": null, + "bnb_4bit_compute_dtype": "bfloat16", + "bnb_4bit_quant_type": "nf4", + "bnb_4bit_use_double_quant": true, + "bnb_4bit_quant_storage": null, + "max_new_tokens": 64, + "temperature": 0.0, + "top_k": null, + "top_p": null, + "repetition_penalty": null, + "num_beams": 1, + "stream": false, + "stop_words": [], + "logprobs": false, + "top_logprobs": null, + "structured_outputs_regex": null, + "train_type": null, + "adapters": [], + "external_plugins": [ + "video_mllm/model_plugin.py", + "video_mllm/dataset_plugin.py" + ], + "custom_register_path": [], + "model_kwargs": {}, + "load_args": false, + "load_data_args": false, + "packing": true, + "packing_length": 16384, + "packing_num_proc": 1, + "lazy_tokenize": false, + "use_hf": true, + "ignore_args_error": false, + "use_swift_lora": false, + "freeze_parameters": [], + "freeze_parameters_regex": null, + "freeze_parameters_ratio": 0.0, + "trainable_parameters": [ + "model.multi_modal_projector" + ], + "trainable_parameters_regex": null, + "freeze_llm": false, + "freeze_vit": false, + "freeze_aligner": false, + "target_modules": [ + "all-linear" + ], + "target_regex": null, + "target_parameters": null, + "modules_to_save": [], + "lora_rank": 8, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_bias": "none", + "lora_dtype": null, + "lorap_lr_ratio": null, + "use_rslora": false, + "use_dora": false, + "lora_ga_batch_size": 2, + "lora_ga_iters": 2, + "lora_ga_max_length": 1024, + "lora_ga_direction": "ArB2r", + "lora_ga_scale": "stable", + "lora_ga_stable_gamma": 16, + "init_weights": true, + "fourier_n_frequency": 2000, + "fourier_scaling": 300.0, + "boft_block_size": 4, + "boft_block_num": 0, + "boft_n_butterfly_factor": 1, + "boft_dropout": 0.0, + "vera_rank": 256, + "vera_projection_prng_key": 0, + "vera_dropout": 0.0, + "vera_d_initial": 0.1, + "adapter_act": "gelu", + "adapter_length": 128, + "adalora_target_r": 8, + "adalora_init_r": 12, + "adalora_tinit": 0, + "adalora_tfinal": 0, + "adalora_deltaT": 1, + "adalora_beta1": 0.85, + "adalora_beta2": 0.85, + "adalora_orth_reg_weight": 0.5, + "llamapro_num_new_blocks": 4, + "llamapro_num_groups": null, + "reft_layer_key": null, + "reft_layers": null, + "reft_rank": 4, + "reft_intervention_type": "LoreftIntervention", + "reft_args": null, + "swanlab_token": null, + "swanlab_project": "ms-swift", + "swanlab_workspace": null, + "swanlab_exp_name": null, + "swanlab_notification_method": null, + "swanlab_webhook_url": null, + "swanlab_secret": null, + "swanlab_sender_email": null, + "swanlab_receiver_email": null, + "swanlab_smtp_server": null, + "swanlab_smtp_port": null, + "swanlab_email_language": "zh", + "swanlab_mode": "cloud", + "add_version": true, + "create_checkpoint_symlink": false, + "zero_hpz_partition_size": null, + "deepspeed_autotp_size": null, + "swift_version": "4.1.0.dev0", + "ckpt_dir": "/opt/tiger/model_cache/checkpoint-2181", + "rank": 0, + "global_world_size": 8, + "local_world_size": 8, + "model_suffix": "checkpoint-2181", + "model_info": "ModelInfo(model_type='llava_siglip2_qwen3', model_dir='/opt/tiger/model_cache/checkpoint-2181', torch_dtype=torch.bfloat16, max_model_len=40960, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)", + "model_meta": "ModelMeta(model_type='llava_siglip2_qwen3', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-0.6B', hf_model_id='Qwen/Qwen3-0.6B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-1.7B', hf_model_id='Qwen/Qwen3-1.7B', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-4B', hf_model_id='Qwen/Qwen3-4B', model_path=None, ms_revision=None, hf_revision=None)], template=None, ignore_patterns=None, requires=None, tags=[])], loader=, template='llava_siglip2_qwen3', model_arch=MultiModelKeys(arch_name='llava_hf', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.multi_modal_projector'], vision_tower=['model.vision_tower'], generator=[]), architectures=['LlavaOnevisionForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, task_type=None, ignore_patterns=None, requires=[], tags=['vision', 'video'])", + "model_dir": "/opt/tiger/model_cache/checkpoint-2181", + "template_meta": "QwenTemplateMeta(template_type='llava_siglip2_qwen3', prefix=[], prompt=['<|im_start|>user\\n{{QUERY}}<|im_end|>\\n<|im_start|>assistant\\n'], chat_sep=['<|im_end|>\\n'], suffix=['<|im_end|>\\n'], template_cls=, system_prefix=['<|im_start|>system\\n{{SYSTEM}}<|im_end|>\\n'], default_system=None, auto_add_bos=False, stop_words=['<|endoftext|>'], agent_template='hermes', is_thinking=False, thinking_prefix='', non_thinking_prefix='', history_thinking_prefix='')", + "_val_dataset_exists": false, + "hub": "", + "evaluation_strategy": "steps", + "training_args": "Seq2SeqTrainingArguments(output_dir='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215', per_device_train_batch_size=1, num_train_epochs=3.0, max_steps=500, learning_rate=1e-05, lr_scheduler_type=, lr_scheduler_kwargs=None, warmup_steps=0.05, optim=, optim_args=None, weight_decay=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, optim_target_modules=None, gradient_accumulation_steps=8, average_tokens_across_devices=None, max_grad_norm=1.0, label_smoothing_factor=0.0, bf16=True, fp16=False, bf16_full_eval=False, fp16_full_eval=False, tf32=None, gradient_checkpointing=True, gradient_checkpointing_kwargs={'use_reentrant': False}, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, use_liger_kernel=False, liger_kernel_config=None, use_cache=False, neftune_noise_alpha=None, torch_empty_cache_steps=None, auto_find_batch_size=False, logging_strategy=, logging_steps=1, logging_first_step=True, log_on_each_node=True, logging_nan_inf_filter=True, include_num_input_tokens_seen=None, log_level='passive', log_level_replica='warning', disable_tqdm=False, report_to=[], run_name='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215', project='huggingface', trackio_space_id='trackio', eval_strategy=, eval_steps=100.0, eval_delay=0, per_device_eval_batch_size=1, prediction_loss_only=False, eval_on_start=False, eval_do_concat_batches=True, eval_use_gather_object=False, eval_accumulation_steps=None, include_for_metrics=[], batch_eval_metrics=False, save_only_model=False, save_strategy=, save_steps=100, save_on_each_node=False, save_total_limit=2, enable_jit_checkpoint=False, push_to_hub=False, hub_token=None, hub_private_repo=None, hub_model_id=None, hub_strategy=, hub_always_push=False, hub_revision=None, load_best_model_at_end=False, metric_for_best_model='loss', greater_is_better=False, ignore_data_skip=False, restore_callback_states_from_checkpoint=False, full_determinism=False, seed=42, data_seed=42, use_cpu=False, accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=False, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), parallelism_config=None, dataloader_drop_last=False, dataloader_num_workers=4, dataloader_pin_memory=True, dataloader_persistent_workers=False, dataloader_prefetch_factor=4, remove_unused_columns=False, label_names=None, train_sampling_strategy='random', length_column_name='length', ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, ddp_backend=None, ddp_timeout=7200, fsdp=[], fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, deepspeed={'fp16': {'enabled': 'auto', 'loss_scale': 0, 'loss_scale_window': 1000, 'initial_scale_power': 16, 'hysteresis': 2, 'min_loss_scale': 1}, 'bf16': {'enabled': 'auto'}, 'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'none', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'overlap_comm': False, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'contiguous_gradients': True}, 'gradient_accumulation_steps': 'auto', 'gradient_clipping': 'auto', 'steps_per_print': 2000, 'train_batch_size': 'auto', 'train_micro_batch_size_per_gpu': 'auto', 'wall_clock_breakdown': False}, debug=[], skip_memory_metrics=True, do_train=False, do_eval=False, do_predict=False, resume_from_checkpoint=None, warmup_ratio=0.05, logging_dir='/mnt/bn/strategy-mllm-train/user/wangjunjie/code/xiaomoguhzz/exps/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/runs', local_rank=0, sortish_sampler=False, predict_with_generate=False, generation_max_length=None, generation_num_beams=None, generation_config=None, tuner_backend='peft', vit_gradient_checkpointing=True, router_aux_loss_coef=0.0, enable_dft_loss=False, enable_channel_loss=False, safe_serialization=True, max_shard_size='5GB', check_model=True, acc_strategy='token', train_dataloader_shuffle=True, group_by_length=False, max_epochs=None, aligner_lr=None, vit_lr=1e-06, use_logits_to_keep=None, ds3_gather_for_generation=True, resume_only_model=False, optimizer='multimodal', loss_type=None, eval_metric=None, callbacks=[], early_stop_interval=None, eval_use_evalscope=False, eval_dataset=[], eval_dataset_args=None, eval_limit=None, eval_generation_config=None, extra_eval_args=None, tuner_type='full', use_galore=False, galore_target_modules=None, galore_rank=128, galore_update_proj_gap=50, galore_scale=1.0, galore_proj_type='std', galore_optim_per_parameter=False, galore_with_embedding=False, galore_quantization=False, galore_proj_quant=False, galore_proj_bits=4, galore_proj_group_size=256, galore_cos_threshold=0.4, galore_gamma_proj=2, galore_queue_size=5, lisa_activated_layers=0, lisa_step_interval=20, use_flash_ckpt=False)" +} \ No newline at end of file diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/chat_template.jinja b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..01be9b307daa2d425f7c168c9fb145a286e0afb4 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/chat_template.jinja @@ -0,0 +1,89 @@ +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/config.json b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/config.json new file mode 100644 index 0000000000000000000000000000000000000000..91fec50984b1ce69db1f04f83bf57934419cc5ac --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/config.json @@ -0,0 +1,248 @@ +{ + "architectures": [ + "LlavaOnevisionForConditionalGeneration" + ], + "bos_token_id": null, + "dtype": "bfloat16", + "eos_token_id": 151645, + "hidden_size": 2048, + "image_grid_pinpoints": [ + [ + 384, + 384 + ], + [ + 384, + 768 + ], + [ + 384, + 1152 + ], + [ + 384, + 1536 + ], + [ + 384, + 1920 + ], + [ + 384, + 2304 + ], + [ + 768, + 384 + ], + [ + 768, + 768 + ], + [ + 768, + 1152 + ], + [ + 768, + 1536 + ], + [ + 768, + 1920 + ], + [ + 768, + 2304 + ], + [ + 1152, + 384 + ], + [ + 1152, + 768 + ], + [ + 1152, + 1152 + ], + [ + 1152, + 1536 + ], + [ + 1152, + 1920 + ], + [ + 1152, + 2304 + ], + [ + 1536, + 384 + ], + [ + 1536, + 768 + ], + [ + 1536, + 1152 + ], + [ + 1536, + 1536 + ], + [ + 1536, + 1920 + ], + [ + 1536, + 2304 + ], + [ + 1920, + 384 + ], + [ + 1920, + 768 + ], + [ + 1920, + 1152 + ], + [ + 1920, + 1536 + ], + [ + 1920, + 1920 + ], + [ + 1920, + 2304 + ], + [ + 2304, + 384 + ], + [ + 2304, + 768 + ], + [ + 2304, + 1152 + ], + [ + 2304, + 1536 + ], + [ + 2304, + 1920 + ], + [ + 2304, + 2304 + ] + ], + "image_token_index": 151669, + "keys_to_ignore_at_inference": [ + "past_key_values" + ], + "model_type": "llava_onevision", + "multimodal_projector_bias": true, + "pad_token_id": 151643, + "projector_hidden_act": "gelu", + "text_config": { + "_name_or_path": "/home/tiger/.cache/huggingface/hub/models--Qwen--Qwen3-1.7B/snapshots/70d244cc86ccca08cf5af4e1e306ecf908b1ad5e", + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 151643, + "dtype": "bfloat16", + "eos_token_id": 151645, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 6144, + "layer_types": [ + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention", + "full_attention" + ], + "max_position_embeddings": 40960, + "max_window_layers": 28, + "model_type": "qwen3", + "num_attention_heads": 16, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "pad_token_id": 151643, + "rms_norm_eps": 1e-06, + "rope_parameters": { + "rope_theta": 1000000, + "rope_type": "default" + }, + "sliding_window": null, + "tie_word_embeddings": true, + "use_cache": false, + "use_sliding_window": false, + "vocab_size": 151936 + }, + "tie_word_embeddings": true, + "transformers_version": "5.2.0", + "use_cache": false, + "video_token_index": 151670, + "vision_aspect_ratio": "anyres_max_9", + "vision_config": { + "attention_dropout": 0.0, + "dtype": "bfloat16", + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "image_size": 384, + "intermediate_size": 4304, + "layer_norm_eps": 1e-06, + "model_type": "siglip_vision_model", + "num_attention_heads": 16, + "num_channels": 3, + "num_hidden_layers": 26, + "patch_size": 14, + "vision_use_head": false + }, + "vision_feature_layer": -1, + "vision_feature_select_strategy": "full" +} diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/generation_config.json b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..caf77791d2c04f34887781e78a159cf8968d3fe6 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 151643, + "eos_token_id": [ + 151645, + 151643 + ], + "output_attentions": false, + "output_hidden_states": false, + "transformers_version": "5.2.0", + "use_cache": true +} diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..485056a290e398a35e7abeb5cc7cf9eda94bbe99 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c691944f9d68071f5078527c05c2c9a342c9eae53ed3aca4960cf54d0270aea +size 3654094853 diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..15d9dcd06f07acc5ee1f35b1de6a5b19e5706a81 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6b400665444084f9a470fd55a4dd4665f88a6195396d13675b85f059618c5d3 +size 3654085701 diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..d21e9a13161453c7787e7a705bcaadc9b0b85e88 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f4cb0e6fa0c778897380621cc1b4eca3ee2e0656be8456a912d0f53f47570a3 +size 3654083653 diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b2b1c1abb1ff068cc01f81fa6cc3e17d756b02e --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:178511313175542200cb5f7ef503cb00a4554cace8c5ebf003a303b32f73a38a +size 3654085317 diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..9793c57f47c347b02d4a969802598a03b3bfc3db --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19335ebba0a2ac8db8ae180d897a98170d173867e8bc60e174f6cca18d75aa2a +size 3654085253 diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..942008096d1edae82163d84e43dc29ff2875424a --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cea43eea1c10652a2dc1e1490a1baad5f205b6ceaa19c77b4f217fa6bd191e7 +size 3654085253 diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99af4ecba1bf45fe7fbfbd3ed0f734643572a310 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:479b2d00d5e545d488adaaa9be7d1c80fb487b29efe4392141ab56b3079c88f3 +size 3654085829 diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..83c819ede8c4391d7f187bda96cfaefe51f31d3b --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26833aa47b40d64d4c895a305befff1741f31a12a8f127197da1100a93e6acb3 +size 3654081925 diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/mp_rank_00_model_states.pt b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4d8b6e9eb657052b3c6f835b994460c959cb018 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/global_step300/mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeec3962da3dd0f0f61fba2019d51c3dbc764c665843ff710eff7e312f0b3146 +size 4872342181 diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/latest b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/latest new file mode 100644 index 0000000000000000000000000000000000000000..6761b575fffac7f1984044dcb6446b3a51da04c8 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/latest @@ -0,0 +1 @@ +global_step300 \ No newline at end of file diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/model.safetensors b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b51cfc03ff2d8e71582041a74531f4ab98907215 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f24efcae4e9726d5380f1a8d964e9d7dee73d00929c92dd46b6cf7c5c5120e8 +size 4872193968 diff --git a/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/processor_config.json b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e71d78590b67986dba84d34b69c764d2b5a55947 --- /dev/null +++ b/video_mllm_swift/s2_image_only_10pct/v1-20260316-135215/checkpoint-300/processor_config.json @@ -0,0 +1,206 @@ +{ + "image_processor": { + "data_format": "channels_first", + "do_convert_rgb": true, + "do_normalize": true, + "do_pad": true, + "do_rescale": true, + "do_resize": true, + "image_grid_pinpoints": [ + [ + 384, + 384 + ], + [ + 384, + 768 + ], + [ + 384, + 1152 + ], + [ + 384, + 1536 + ], + [ + 384, + 1920 + ], + [ + 384, + 2304 + ], + [ + 768, + 384 + ], + [ + 768, + 768 + ], + [ + 768, + 1152 + ], + [ + 768, + 1536 + ], + [ + 768, + 1920 + ], + [ + 768, + 2304 + ], + [ + 1152, + 384 + ], + [ + 1152, + 768 + ], + [ + 1152, + 1152 + ], + [ + 1152, + 1536 + ], + [ + 1152, + 1920 + ], + [ + 1152, + 2304 + ], + [ + 1536, + 384 + ], + [ + 1536, + 768 + ], + [ + 1536, + 1152 + ], + [ + 1536, + 1536 + ], + [ + 1536, + 1920 + ], + [ + 1536, + 2304 + ], + [ + 1920, + 384 + ], + [ + 1920, + 768 + ], + [ + 1920, + 1152 + ], + [ + 1920, + 1536 + ], + [ + 1920, + 1920 + ], + [ + 1920, + 2304 + ], + [ + 2304, + 384 + ], + [ + 2304, + 768 + ], + [ + 2304, + 1152 + ], + [ + 2304, + 1536 + ], + [ + 2304, + 1920 + ], + [ + 2304, + 2304 + ] + ], + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_processor_type": "LlavaOnevisionImageProcessorFast", + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 3, + "rescale_factor": 0.00392156862745098, + "size": { + "height": 384, + "width": 384 + } + }, + "image_token": "", + "num_image_tokens": 729, + "processor_class": "LlavaOnevisionProcessor", + "video_processor": { + "data_format": "channels_first", + "default_to_square": false, + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": false, + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "size": { + "height": 384, + "width": 384 + }, + "video_processor_type": "LlavaOnevisionVideoProcessor" + }, + "video_token": "