| output_dir: ./runs/train_libero_v2_512hdim_variant_decouple_2node/decouple_2node_b8_20260605_184739 |
| batch_size: 8 |
| num_workers: 4 |
| prefetch_factor: 6 |
| lr_scheduler_type: constant |
| learning_rate: 0.0001 |
| num_epochs: 10 |
| max_steps: 80000 |
| log_every: 10 |
| save_every: 5000 |
| state_keep_last_n: 1 |
| weights_keep_last_n: 1000 |
| long_term_save_every: 10000 |
| long_term_save_start: 0 |
| eval_every: 0 |
| eval_num_inference_steps: 10 |
| gradient_accumulation_steps: 1 |
| mixed_precision: bf16 |
| seed: 42 |
| max_grad_norm: 1.0 |
| weight_decay: 0.01 |
| resume: null |
| compile_mot: false |
| optimizer_type: adamw8bit |
| wandb: |
| enabled: true |
| workspace: null |
| project: fastwam_ltx_decouple |
| name: decouple_2node_b8_20260605_184739 |
| group: null |
| mode: online |
| data: |
| train: |
| _target_: fastwam.datasets.lerobot.robot_video_dataset.RobotVideoDataset |
| dataset_dirs: |
| - ./data/LIBERO-fastwam/libero_spatial_no_noops_lerobot |
| - ./data/LIBERO-fastwam/libero_object_no_noops_lerobot |
| - ./data/LIBERO-fastwam/libero_goal_no_noops_lerobot |
| - ./data/LIBERO-fastwam/libero_10_no_noops_lerobot |
| shape_meta: |
| images: |
| - key: image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| - key: wrist_image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| action: |
| - key: default |
| raw_shape: 7 |
| shape: 7 |
| state: |
| - key: default |
| raw_shape: 8 |
| shape: 8 |
| num_frames: 33 |
| global_sample_stride: 1 |
| action_video_freq_ratio: 4 |
| video_size: |
| - 224 |
| - 448 |
| camera_key: null |
| val_set_proportion: 0 |
| is_training_set: true |
| skip_padding_as_possible: false |
| concat_multi_camera: horizontal |
| processor: |
| _target_: fastwam.datasets.lerobot.processors.fastwam_processor.FastWAMProcessor |
| shape_meta: |
| images: |
| - key: image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| - key: wrist_image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| action: |
| - key: default |
| raw_shape: 7 |
| shape: 7 |
| state: |
| - key: default |
| raw_shape: 8 |
| shape: 8 |
| num_obs_steps: 33 |
| num_output_cameras: 2 |
| action_output_dim: 7 |
| proprio_output_dim: 8 |
| delta_action_dim_mask: |
| default: |
| - true |
| - true |
| - true |
| - true |
| - true |
| - true |
| - false |
| action_state_transforms: null |
| use_stepwise_action_norm: false |
| norm_default_mode: min/max |
| norm_exception_mode: null |
| action_state_merger: |
| _target_: fastwam.datasets.lerobot.transforms.action_state_merger.ConcatLeftAlign |
| train_transforms: |
| - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor |
| - _target_: torchvision.transforms.Resize |
| size: |
| - 224 |
| - 224 |
| val_transforms: |
| - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor |
| - _target_: torchvision.transforms.Resize |
| size: |
| - 224 |
| - 224 |
| text_embedding_cache_dir: ./data/text_embeds_cache/libero |
| text_cache_slug: ltx23_gemma3_12b_v2connector |
| context_len: 128 |
| joint_latent_cache_dir: ./data/joint_latents/libero_ratio4_nf33 |
| val: |
| _target_: fastwam.datasets.lerobot.robot_video_dataset.RobotVideoDataset |
| dataset_dirs: |
| - ./data/LIBERO-fastwam/libero_spatial_no_noops_lerobot |
| - ./data/LIBERO-fastwam/libero_object_no_noops_lerobot |
| - ./data/LIBERO-fastwam/libero_goal_no_noops_lerobot |
| - ./data/LIBERO-fastwam/libero_10_no_noops_lerobot |
| shape_meta: |
| images: |
| - key: image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| - key: wrist_image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| action: |
| - key: default |
| raw_shape: 7 |
| shape: 7 |
| state: |
| - key: default |
| raw_shape: 8 |
| shape: 8 |
| num_frames: 33 |
| global_sample_stride: 1 |
| action_video_freq_ratio: 4 |
| video_size: |
| - 224 |
| - 448 |
| camera_key: null |
| val_set_proportion: 0 |
| is_training_set: false |
| skip_padding_as_possible: false |
| concat_multi_camera: horizontal |
| processor: |
| _target_: fastwam.datasets.lerobot.processors.fastwam_processor.FastWAMProcessor |
| shape_meta: |
| images: |
| - key: image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| - key: wrist_image |
| raw_shape: |
| - 3 |
| - 512 |
| - 512 |
| shape: |
| - 3 |
| - 224 |
| - 224 |
| action: |
| - key: default |
| raw_shape: 7 |
| shape: 7 |
| state: |
| - key: default |
| raw_shape: 8 |
| shape: 8 |
| num_obs_steps: 33 |
| num_output_cameras: 2 |
| action_output_dim: 7 |
| proprio_output_dim: 8 |
| delta_action_dim_mask: |
| default: |
| - true |
| - true |
| - true |
| - true |
| - true |
| - true |
| - false |
| action_state_transforms: null |
| use_stepwise_action_norm: false |
| norm_default_mode: min/max |
| norm_exception_mode: null |
| action_state_merger: |
| _target_: fastwam.datasets.lerobot.transforms.action_state_merger.ConcatLeftAlign |
| train_transforms: |
| - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor |
| - _target_: torchvision.transforms.Resize |
| size: |
| - 224 |
| - 224 |
| val_transforms: |
| - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor |
| - _target_: torchvision.transforms.Resize |
| size: |
| - 224 |
| - 224 |
| text_embedding_cache_dir: ./data/text_embeds_cache/libero |
| text_cache_slug: ltx23_gemma3_12b_v2connector |
| context_len: 128 |
| model: |
| _target_: fastwam.runtime.create_fastwam |
| ckpt_path: checkpoints/Lightricks/LTX-2.3/ltx-2.3-22b-dev.safetensors |
| gemma_path: checkpoints/google/gemma-3-12b-it-qat-q4_0-unquantized |
| load_text_encoder: false |
| attach_gemma_to_text_encoder: false |
| proprio_dim: 8 |
| mot_checkpoint_mixed_attn: false |
| action_dit_pretrained_path: checkpoints/preprocessed/ltx_action_dit_backbone.pt |
| skip_dit_load_from_pretrain: false |
| video_dit_config: |
| text_dim: 4096 |
| use_gradient_checkpointing: false |
| action_dim: 7 |
| action_dit_config: |
| action_dim: 7 |
| hidden_dim: 512 |
| num_heads: 32 |
| attn_head_dim: 128 |
| num_layers: 48 |
| text_dim: 4096 |
| eps: 1.0e-06 |
| cross_attention_adaln: false |
| use_gradient_checkpointing: false |
| video_scheduler: |
| type: ltx2 |
| min_shift: 0.95 |
| max_shift: 2.05 |
| min_tokens: 1024 |
| max_tokens: 4096 |
| infer_shift: 2.05 |
| num_train_timesteps: 1000 |
| train_shift: 5.0 |
| sigma_floor: 0.0 |
| action_scheduler: |
| type: wan |
| train_shift: 5.0 |
| infer_shift: 5.0 |
| num_train_timesteps: 1000 |
| sigma_floor: 0.0 |
| loss: |
| lambda_video: 0.1 |
| lambda_action: 1.0 |
| action_only_train: false |
| mot_attn_decouple_frac: 0.25 |
| video_expert_lr: 1.0e-05 |
| action_expert_lr: 0.0001 |
|
|