| batch_size: 4 | |
| context_size: 4 | |
| datasets: | |
| avw_4k: | |
| data_folder: /path/to/dataset/avw_4k | |
| goals_per_obs: 4 | |
| test: /path/to/data_splits/avw_4k/val | |
| train: /path/to/data_splits/avw_4k/train | |
| distance: | |
| max_dist_cat: 16 | |
| min_dist_cat: -16 | |
| from_checkpoint: /path/to/pretrained/experts_merged.pth | |
| sample_rate: 16000 | |
| input_sr: 48000 | |
| tokenizer_a_path: /path/to/pretrained/soundstream.pt | |
| grad_clip_val: 10.0 | |
| image_size: 224 | |
| len_traj_pred: 16 | |
| lr: 16.0e-05 | |
| model: AVCDiT-B/2 | |
| normalize: true | |
| num_workers: 12 | |
| results_dir: logs | |
| run_name: training_stage3 | |
| train: true |