# Run output location.
output_dir: './runs/train_libero_v2_512hdim_variant_decouple_2node/decouple_2node_b8_20260605_184739'

# Batching and loader settings.
batch_size: 8
num_workers: 4
prefetch_factor: 6

# Learning-rate schedule.
# NOTE(review): video_expert_lr / action_expert_lr are also set at the end of
# this file; how they interact with learning_rate is not visible here.
lr_scheduler_type: constant
learning_rate: 1.0e-04

# Training length.
num_epochs: 10
max_steps: 80000

# Logging and checkpoint cadence.
log_every: 10
save_every: 5000
state_keep_last_n: 1
weights_keep_last_n: 1000
long_term_save_every: 10000
long_term_save_start: 0

# Evaluation (eval_every: 0 presumably turns periodic eval off -- confirm).
eval_every: 0
eval_num_inference_steps: 10

# Numerics, seeding and regularisation.
gradient_accumulation_steps: 1
mixed_precision: bf16
seed: 42
max_grad_norm: 1.0
weight_decay: 0.01

# Resume, compilation and optimizer selection.
resume: null
compile_mot: false
optimizer_type: adamw8bit
# Experiment-tracking (wandb) settings.
wandb:
  enabled: true
  workspace: null
  project: 'fastwam_ltx_decouple'
  # Same run tag as the last path component of output_dir.
  name: 'decouple_2node_b8_20260605_184739'
  group: null
  mode: 'online'
data:
  # Training split: four LIBERO suites read through RobotVideoDataset.
  train:
    _target_: fastwam.datasets.lerobot.robot_video_dataset.RobotVideoDataset
    dataset_dirs:
    - './data/LIBERO-fastwam/libero_spatial_no_noops_lerobot'
    - './data/LIBERO-fastwam/libero_object_no_noops_lerobot'
    - './data/LIBERO-fastwam/libero_goal_no_noops_lerobot'
    - './data/LIBERO-fastwam/libero_10_no_noops_lerobot'

    # Per-modality shapes; the same mapping is repeated under processor below.
    shape_meta:
      images:
      - key: image
        raw_shape: [3, 512, 512]
        shape: [3, 224, 224]
      - key: wrist_image
        raw_shape: [3, 512, 512]
        shape: [3, 224, 224]
      action:
      - key: default
        raw_shape: 7
        shape: 7
      state:
      - key: default
        raw_shape: 8
        shape: 8

    # Temporal sampling.
    num_frames: 33
    global_sample_stride: 1
    action_video_freq_ratio: 4

    # Presumably (height, width): two 224-wide views placed side by side,
    # matching concat_multi_camera: horizontal -- TODO confirm axis order.
    video_size: [224, 448]
    camera_key: null
    val_set_proportion: 0
    is_training_set: true
    skip_padding_as_possible: false
    concat_multi_camera: horizontal

    processor:
      _target_: fastwam.datasets.lerobot.processors.fastwam_processor.FastWAMProcessor
      shape_meta:
        images:
        - key: image
          raw_shape: [3, 512, 512]
          shape: [3, 224, 224]
        - key: wrist_image
          raw_shape: [3, 512, 512]
          shape: [3, 224, 224]
        action:
        - key: default
          raw_shape: 7
          shape: 7
        state:
        - key: default
          raw_shape: 8
          shape: 8
      num_obs_steps: 33
      num_output_cameras: 2
      action_output_dim: 7
      proprio_output_dim: 8
      # One flag per action dimension (7 entries); only the last one is false.
      delta_action_dim_mask:
        default: [true, true, true, true, true, true, false]
      action_state_transforms: null
      use_stepwise_action_norm: false
      norm_default_mode: 'min/max'
      norm_exception_mode: null
      action_state_merger:
        _target_: fastwam.datasets.lerobot.transforms.action_state_merger.ConcatLeftAlign
      # Train and val image pipelines are configured identically here.
      train_transforms:
      - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      val_transforms:
      - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
      - _target_: torchvision.transforms.Resize
        size: [224, 224]

    # Precomputed caches.
    text_embedding_cache_dir: './data/text_embeds_cache/libero'
    text_cache_slug: ltx23_gemma3_12b_v2connector
    context_len: 128
    joint_latent_cache_dir: './data/joint_latents/libero_ratio4_nf33'
  # Validation split: same dataset_dirs and processor settings as data.train,
  # but with is_training_set: false and no joint_latent_cache_dir entry.
  val:
    _target_: fastwam.datasets.lerobot.robot_video_dataset.RobotVideoDataset
    dataset_dirs:
    - './data/LIBERO-fastwam/libero_spatial_no_noops_lerobot'
    - './data/LIBERO-fastwam/libero_object_no_noops_lerobot'
    - './data/LIBERO-fastwam/libero_goal_no_noops_lerobot'
    - './data/LIBERO-fastwam/libero_10_no_noops_lerobot'

    # Per-modality shapes; the same mapping is repeated under processor below.
    shape_meta:
      images:
      - key: image
        raw_shape: [3, 512, 512]
        shape: [3, 224, 224]
      - key: wrist_image
        raw_shape: [3, 512, 512]
        shape: [3, 224, 224]
      action:
      - key: default
        raw_shape: 7
        shape: 7
      state:
      - key: default
        raw_shape: 8
        shape: 8

    # Temporal sampling.
    num_frames: 33
    global_sample_stride: 1
    action_video_freq_ratio: 4

    # Presumably (height, width) -- TODO confirm axis order.
    video_size: [224, 448]
    camera_key: null
    # NOTE(review): proportion is 0 for this split as well; confirm that the
    # validation set is not empty when evaluation is enabled.
    val_set_proportion: 0
    is_training_set: false
    skip_padding_as_possible: false
    concat_multi_camera: horizontal

    processor:
      _target_: fastwam.datasets.lerobot.processors.fastwam_processor.FastWAMProcessor
      shape_meta:
        images:
        - key: image
          raw_shape: [3, 512, 512]
          shape: [3, 224, 224]
        - key: wrist_image
          raw_shape: [3, 512, 512]
          shape: [3, 224, 224]
        action:
        - key: default
          raw_shape: 7
          shape: 7
        state:
        - key: default
          raw_shape: 8
          shape: 8
      num_obs_steps: 33
      num_output_cameras: 2
      action_output_dim: 7
      proprio_output_dim: 8
      # One flag per action dimension (7 entries); only the last one is false.
      delta_action_dim_mask:
        default: [true, true, true, true, true, true, false]
      action_state_transforms: null
      use_stepwise_action_norm: false
      norm_default_mode: 'min/max'
      norm_exception_mode: null
      action_state_merger:
        _target_: fastwam.datasets.lerobot.transforms.action_state_merger.ConcatLeftAlign
      train_transforms:
      - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
      - _target_: torchvision.transforms.Resize
        size: [224, 224]
      val_transforms:
      - _target_: fastwam.datasets.lerobot.transforms.image.ToTensor
      - _target_: torchvision.transforms.Resize
        size: [224, 224]

    # Precomputed text-embedding cache.
    text_embedding_cache_dir: './data/text_embeds_cache/libero'
    text_cache_slug: ltx23_gemma3_12b_v2connector
    context_len: 128
# Model settings; presumably handed to the `_target_` callable -- confirm.
model:
  _target_: fastwam.runtime.create_fastwam

  # Checkpoint locations and text-encoder loading switches.
  ckpt_path: 'checkpoints/Lightricks/LTX-2.3/ltx-2.3-22b-dev.safetensors'
  gemma_path: 'checkpoints/google/gemma-3-12b-it-qat-q4_0-unquantized'
  load_text_encoder: false
  attach_gemma_to_text_encoder: false

  # Matches the state shape (8) declared under data.*.shape_meta.
  proprio_dim: 8
  mot_checkpoint_mixed_attn: false
  action_dit_pretrained_path: 'checkpoints/preprocessed/ltx_action_dit_backbone.pt'
  skip_dit_load_from_pretrain: false

  video_dit_config:
    text_dim: 4096
    use_gradient_checkpointing: false
    action_dim: 7

  # NOTE(review): num_heads * attn_head_dim = 4096, which differs from
  # hidden_dim (512). Presumably intentional -- confirm.
  action_dit_config:
    action_dim: 7
    hidden_dim: 512
    num_heads: 32
    attn_head_dim: 128
    num_layers: 48
    text_dim: 4096
    eps: 1.0e-06
    cross_attention_adaln: false
    use_gradient_checkpointing: false

  # Noise schedulers: the video and action branches use different types.
  video_scheduler:
    type: ltx2
    min_shift: 0.95
    max_shift: 2.05
    min_tokens: 1024
    max_tokens: 4096
    infer_shift: 2.05
    num_train_timesteps: 1000
    train_shift: 5.0
    sigma_floor: 0.0
  action_scheduler:
    type: wan
    train_shift: 5.0
    infer_shift: 5.0
    num_train_timesteps: 1000
    sigma_floor: 0.0

  # Loss weights for the two branches.
  loss:
    lambda_video: 0.1
    lambda_action: 1.0

  action_only_train: false
  mot_attn_decouple_frac: 0.25
# Per-expert learning rates. action_expert_lr has the same value as the
# top-level learning_rate; presumably these override it -- confirm.
video_expert_lr: 1.0e-05
action_expert_lr: 1.0e-04