# BEST-RQ-2 / config.yaml
# ltuncay's picture
# Submission to the Interspeech 2026 Audio Encoder Capability Challenge
# eca55dc verified
# Run-level settings that sit outside any component group.
task_name: train
# Free-form labels attached to this run.
tags:
  - audioset
  - best-rq-2
  - cluster GPU
# Stage switches; both stages are enabled. How they are consumed is decided
# by the entry script, which is not part of this file.
train: true
test: true
# No checkpoint path is given.
ckpt_path: null
# Presumably the global RNG seed; confirm against the entry script.
seed: 21072023
# Data module settings. `_target_` is the dotted path of the class to build;
# the sibling keys look like its constructor arguments (class not shown here).
data:
  _target_: src.data.audioset_datamodule.AudioSetDataModule
  # Interpolated from the `paths` group.
  data_dir: ${paths.data_dir}/AudioSet
  batch_size: 256
  # Read from the SLURM_CPUS_PER_TASK environment variable and decoded from
  # its string form. No fallback is supplied, so the variable must be set
  # when this value is resolved.
  num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
  pin_memory: true
  # File names only; presumably resolved relative to data_dir (confirm in the
  # data module).
  train_h5: full_unbal_bal_train_wav.h5
  train_csv: silent_files_full_unbal_bal_train_wav.csv
  val_h5: eval_soxrhq.h5
  val_csv: silent_files_eval_soxrhq.csv
  # 10.0 s x 16000 Hz = 160000 samples.
  max_audio_length_sec: 10.0
  # Also referenced by model.net.spectrogram.sample_rate.
  target_sample_rate: 16000
  collate_mode: pad
# Model settings.
# NOTE(review): the nesting in this group is reconstructed from the key names.
# warmup_pct, spectrogram_adjustment_mode, codebook_dim and vocab_size are
# placed at module level because they do not look like optimizer or loss
# arguments, and spectrogram/patch_embed/masking/encoder/predictor are placed
# under `net`. Confirm against BestRQ2Module.
model:
  _target_: src.models.best_rq2_module.BestRQ2Module
  optimizer:
    _target_: torch.optim.AdamW
    # Partial instantiation: the remaining arguments are supplied later by
    # whoever calls the resulting factory.
    _partial_: true
    lr: 0.0001
    weight_decay: 0.05
  warmup_pct: 0.05
  spectrogram_adjustment_mode: truncate
  criterion:
    _target_: torch.nn.CrossEntropyLoss
    _partial_: true
    reduction: mean
  codebook_dim: 16
  vocab_size: 8192
  net:
    spectrogram:
      # Follows data.target_sample_rate (16000).
      sample_rate: ${data.target_sample_rate}
      n_fft: 2048
      # 128 ms at 16 kHz is 2048 samples, the same as n_fft.
      win_length_ms: 128
      # 39.0625 ms at 16 kHz is 625 samples; 10 s / 39.0625 ms = 256 hops.
      hop_length_ms: 39.0625
      n_mels: 128
      f_min: 0
      # Half of the 16 kHz sample rate.
      f_max: 8000
      power: 2.0
    patch_embed:
      # 128 equals n_mels; 256 equals the hop count worked out above.
      img_size: [128, 256]
      patch_size: [16, 16]
      in_chans: 1
      embed_dim: 768
    masking:
      # Same grid as patch_embed: (128 / 16) x (256 / 16) = 128 patches.
      input_size: [128, 256]
      patch_size: [16, 16]
      # Two values; presumably a lower/upper bound (confirm in the masking
      # code).
      mask_ratio: [0.4, 0.6]
    encoder:
      embed_dim: 768
      depth: 12
      num_heads: 12
      mlp_ratio: 4.0
      qkv_bias: true
      drop_rate: 0.0
      attn_drop_rate: 0.0
      drop_path_rate: 0.1
      # Equals the 8 x 16 patch grid implied by img_size and patch_size.
      num_patches: 128
      pos_embed_type: sincos
    # Same keys as the encoder; only depth (4 vs 12) and drop_path_rate
    # (0.0 vs 0.1) differ.
    predictor:
      embed_dim: 768
      depth: 4
      num_heads: 12
      mlp_ratio: 4.0
      qkv_bias: true
      drop_rate: 0.0
      attn_drop_rate: 0.0
      drop_path_rate: 0.0
      num_patches: 128
      pos_embed_type: sincos
# Callback entries, keyed by a local name. Entries that carry a `_target_`
# name the class to build; entries set to null carry no configuration here.
callbacks:
  model_checkpoint: null
  model_summary:
    _target_: lightning.pytorch.callbacks.RichModelSummary
    max_depth: 1
  rich_progress_bar: null
  # Project-local callback; behaviour is defined in src.callbacks.
  safetensors:
    _target_: src.callbacks.safetensors_callback.SafetensorsCallback
  device_stats:
    _target_: lightning.pytorch.callbacks.DeviceStatsMonitor
  # Project-local callback.
  visualization:
    _target_: src.callbacks.visualization_callback.VisualizationCallback
    num_samples: 4
  # Project-local callback; logger.wandb.offline is true in this file.
  wandb_offline_checkpoint:
    _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
# Logger entries, keyed by a local name. Only a Weights & Biases logger is
# configured.
logger:
  wandb:
    _target_: lightning.pytorch.loggers.wandb.WandbLogger
    # Same interpolation as trainer.default_root_dir.
    save_dir: ${paths.output_dir}
    offline: true
    id: null
    anonymous: null
    project: audio embeddings
    log_model: false
    # Empty strings and an empty list are written explicitly, not left bare.
    prefix: ''
    group: ''
    tags: []
    job_type: ''
    # Appears to encode max_steps (200k) and batch size x devices (256 x 1).
    name: best_rq2-audioset-200k-256x1bs
# Trainer settings; `_target_` names the Lightning Trainer class and the
# sibling keys are passed to it as keyword arguments.
trainer:
  _target_: lightning.pytorch.trainer.Trainer
  # Same interpolation as logger.wandb.save_dir.
  default_root_dir: ${paths.output_dir}
  accelerator: gpu
  devices: 1
  check_val_every_n_epoch: 1
  deterministic: false
  max_steps: 200000
  strategy: auto
  # Quoted so that every YAML loader keeps this as a string: a plain scalar
  # made only of digits and colons is an implicit-typing hazard. Lightning
  # documents the string form as DD:HH:MM:SS, so this is 19 h 50 min.
  max_time: '00:19:50:00'
# Filesystem locations referenced from other groups through ${paths.*}.
paths:
  # Taken from the PROJECT_ROOT environment variable; no fallback is given.
  root_dir: ${oc.env:PROJECT_ROOT}
  # Both derived paths end with a trailing slash.
  data_dir: ${paths.root_dir}/data/
  log_dir: ${paths.root_dir}/logs/
  # Filled in from Hydra's runtime information when resolved.
  output_dir: ${hydra:runtime.output_dir}
  work_dir: ${hydra:runtime.cwd}
# Miscellaneous switches; their handling lives in project utilities that are
# not part of this file.
extras:
  ignore_warnings: false
  # A non-empty top-level `tags` list is present in this file.
  enforce_tags: true
  print_config: true