mingyuliutw's picture
Super-squash branch 'main' using huggingface_hub
8889131
{
"allow_patterns_overrides": [
"*/*.safetensors"
],
"architectures": [
"Cosmos3ForConditionalGeneration"
],
"image_token_id": 151655,
"model": {
"_recursive_": false,
"_target_": "projects.cosmos3.vfm.models.omni_mot_model.OmniMoTModel",
"config": {
"_type": "projects.cosmos3.vfm.configs.base.defaults.model_config.OmniMoTModelConfig",
"action_gen": false,
"activation_checkpointing": {
"_type": "projects.cosmos3.vfm.configs.base.defaults.activation_checkpointing.ActivationCheckpointingConfig",
"determinism_check": "default",
"mode": "full",
"preserve_rng_state": true,
"save_ops_regex": [
"fmha"
]
},
"causal_training_strategy": "none",
"compile": {
"_type": "projects.cosmos3.vfm.configs.base.defaults.compile.CompileConfig",
"compile_dynamic": true,
"compiled_region": "language",
"coordinate_descent_tuning": false,
"enabled": true,
"max_autotune_pointwise": false,
"use_cuda_graphs": false
},
"diffusion_expert_config": {
"_type": "projects.cosmos3.vfm.configs.base.defaults.model_config.DiffusionExpertConfig",
"base_fps": 16,
"enable_fps_modulation": true,
"load_weights_from_pretrained": true,
"max_vae_latent_side_after_patchify": 20,
"patch_spatial": 2,
"position_embedding_type": "unified_3d_mrope",
"rope_h_extrapolation_ratio": 1.0,
"rope_t_extrapolation_ratio": 1.0,
"rope_w_extrapolation_ratio": 1.0,
"timestep_range": 1.0,
"unified_3d_mrope_reset_spatial_ids": true,
"unified_3d_mrope_temporal_modality_margin": 15000
},
"ema": {
"_type": "projects.cosmos3.vfm.configs.base.defaults.ema.EMAConfig",
"enabled": false,
"iteration_shift": 0,
"rate": 0.1
},
"fixed_step_sampler_config": null,
"input_caption_key": "ai_caption",
"input_image_key": "images",
"input_video_key": "video",
"joint_attn_implementation": "two_way",
"latent_downsample_factor": 16,
"lbl": {
"_type": "projects.cosmos3.vfm.configs.base.defaults.model_config.LBLConfig",
"coeff_gen": null,
"coeff_und": null,
"method": "local"
},
"log_enc_time_every_n": 100,
"lora_alpha": 32,
"lora_enabled": false,
"lora_rank": 16,
"lora_target_modules": "q_proj_moe_gen,k_proj_moe_gen,v_proj_moe_gen,o_proj_moe_gen",
"max_action_dim": 32,
"max_num_tokens_after_packing": 45056,
"natten_parameter_list": null,
"net": null,
"num_embodiment_domains": 32,
"parallelism": {
"_type": "projects.cosmos3.vfm.configs.base.defaults.parallelism.ParallelismConfig",
"cfg_parallel_shard_degree": 1,
"context_parallel_shard_degree": 1,
"data_parallel_replicate_degree": 1,
"data_parallel_shard_degree": 16,
"enable_inference_mode": false,
"fsdp_master_dtype": "float32"
},
"precision": "bfloat16",
"rectified_flow_inference_config": {
"_type": "projects.cosmos3.vfm.configs.base.defaults.model_config.RectifiedFlowInferenceConfig",
"num_train_timesteps": 1000,
"scheduler_type": "unipc",
"shift": 1,
"use_dynamic_shifting": false
},
"rectified_flow_training_config": {
"_type": "projects.cosmos3.vfm.configs.base.defaults.model_config.RectifiedFlowTrainingConfig",
"action_loss_weight": 10.0,
"high_sigma_ratio": 0.05,
"high_sigma_timesteps_max": 1000,
"high_sigma_timesteps_min": 995,
"image_loss_scale": null,
"independent_action_schedule": false,
"independent_sound_schedule": false,
"loss_scale": 1.0,
"normalize_loss_by_active": false,
"shift": {
"256": 1,
"480": 3,
"704": 5,
"720": 5
},
"shift_action": null,
"shift_sound": null,
"sound_loss_scale": null,
"train_time_action_distribution": "logitnormal",
"train_time_image_distribution": "logitnormal",
"train_time_sound_distribution": "logitnormal",
"train_time_video_distribution": "ltx2",
"train_time_weight": "uniform",
"use_discrete_rf": false,
"use_dynamic_shift": false,
"use_high_sigma_strategy": false,
"use_high_sigma_strategy_action": false,
"use_high_sigma_strategy_sound": false
},
"resolution": "480",
"sound_dim": null,
"sound_gen": false,
"sound_latent_fps": 25,
"sound_tokenizer": null,
"state_ch": 48,
"state_t": 300,
"tokenizer": {
"_target_": "projects.cosmos3.vfm.tokenizers.wan2pt2_vae_4x16x16.Wan2pt2VAEInterface",
"bucket_name": "bucket",
"causal": true,
"chunk_duration": 93,
"encode_bucket_multiple": null,
"encode_chunk_frames": {
"256": 68,
"480": 24,
"720": 12
},
"encode_exact_durations": null,
"keep_decoder_cache": false,
"object_store_credential_path_pretrained": "credentials/gcp_training.secret",
"spatial_compression_factor": 16,
"temporal_compression_factor": 4,
"temporal_window": null,
"use_streaming_encode": false,
"vae_path": "pretrained/tokenizers/video/wan2pt2/Wan2.2_VAE.pth"
},
"video_temporal_causal": false,
"vision_gen": true,
"vlm_config": {
"_type": "projects.cosmos3.vfm.configs.base.defaults.vlm.VLMConfig",
"layer_module": null,
"model_instance": {
"_target_": "projects.cosmos3.vfm.models.mot.unified_mot.Qwen3VLTextForCausalLM",
"config": {
"_target_": "projects.cosmos3.vfm.configs.base.defaults.vlm.create_vlm_config",
"base_config": {
"_target_": "projects.cosmos3.vfm.models.mot.unified_mot.Qwen3VLMoTConfig.from_json_file",
"json_file": "projects/cosmos3/vfm/models/vlm/qwen3_vl/configs/Qwen3-VL-32B-Instruct.json"
},
"qk_norm_for_text": true
}
},
"model_name": "nvidia/Cosmos3-Super-Reasoner",
"pretrained_weights": {
"_type": "projects.cosmos3.vfm.configs.base.defaults.vlm.PretrainedWeightsConfig",
"backbone_path": "s3://bucket/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Super-Reasoner-b6df0d1/",
"checkpoint_format": null,
"credentials_path": "credentials/gcp_checkpoint.secret",
"enable_gcs_patch_in_boto3": true,
"enabled": true
},
"qk_norm": false,
"tie_word_embeddings": false,
"tokenizer": {
"_target_": "projects.cosmos3.vfm.configs.base.defaults.vlm.create_qwen2_tokenizer_with_download",
"config_variant": "gcp",
"pretrained_model_name": "Qwen/Qwen3-VL-32B-Instruct"
},
"use_system_prompt": false
}
}
},
"model_type": "cosmos3_omni",
"text_config": {
"attention_bias": false,
"attention_dropout": 0.0,
"bos_token_id": 151643,
"dtype": "bfloat16",
"eos_token_id": 151645,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 5120,
"initializer_range": 0.02,
"intermediate_size": 25600,
"max_position_embeddings": 262144,
"model_type": "qwen3_vl_text",
"num_attention_heads": 64,
"num_hidden_layers": 64,
"num_key_value_heads": 8,
"rms_norm_eps": 1e-06,
"rope_scaling": {
"mrope_interleaved": true,
"mrope_section": [
24,
20,
20
],
"rope_type": "default"
},
"rope_theta": 5000000,
"use_cache": true,
"vocab_size": 151936
},
"tie_word_embeddings": false,
"transformers_version": "4.57.0.dev0",
"video_token_id": 151656,
"vision_config": {
"deepstack_visual_indexes": [
8,
16,
24
],
"depth": 27,
"hidden_act": "gelu_pytorch_tanh",
"hidden_size": 1152,
"in_channels": 3,
"initializer_range": 0.02,
"intermediate_size": 4304,
"model_type": "qwen3_vl",
"num_heads": 16,
"num_position_embeddings": 2304,
"out_hidden_size": 5120,
"patch_size": 16,
"spatial_merge_size": 2,
"temporal_patch_size": 2
},
"vision_end_token_id": 151653,
"vision_start_token_id": 151652
}