task_name: train
tags:
  - audioset
  - best-rq-2
  - cluster GPU
train: true
test: true
ckpt_path: null
seed: 21072023
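# Data: AudioSet waveforms packed into HDF5 files, with companion CSVs that
# appear to list silent files to exclude (the exact handling lives in
# AudioSetDataModule). num_workers is resolved at launch from the SLURM
# allocation via ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}.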
data:
  _target_: src.data.audioset_datamodule.AudioSetDataModule
  data_dir: ${paths.data_dir}/AudioSet
  batch_size: 256
  num_workers: ${oc.decode:${oc.env:SLURM_CPUS_PER_TASK}}
  pin_memory: true
  train_h5: full_unbal_bal_train_wav.h5
  train_csv: silent_files_full_unbal_bal_train_wav.csv
  val_h5: eval_soxrhq.h5
  val_csv: silent_files_eval_soxrhq.csv
  max_audio_length_sec: 10.0
  target_sample_rate: 16000
  collate_mode: pad
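# Model: BEST-RQ-style self-supervised pretraining. Targets are presumably
# produced by a frozen random projection into a 16-dim space followed by
# nearest-neighbour lookup in an 8192-entry codebook; CrossEntropyLoss then
# scores the predictor's logits against those code indices.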
model:
  _target_: src.models.best_rq2_module.BestRQ2Module
  optimizer:
    _target_: torch.optim.AdamW
    _partial_: true
    lr: 0.0001
    weight_decay: 0.05
  warmup_pct: 0.05
  spectrogram_adjustment_mode: truncate
  criterion:
    _target_: torch.nn.CrossEntropyLoss
    _partial_: true
    reduction: mean
  codebook_dim: 16
  vocab_size: 8192
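  # Frontend geometry, derived from the spectrogram settings below: 10 s of
  # 16 kHz audio with a 39.0625 ms hop (625 samples) gives 160000 / 625 = 256
  # frames, and n_mels = 128 makes a 128x256 input. The 128 ms window is
  # 2048 samples, matching n_fft (assuming a torchaudio-style MelSpectrogram).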
  net:
    spectrogram:
      sample_rate: ${data.target_sample_rate}
      n_fft: 2048
      win_length_ms: 128
      hop_length_ms: 39.0625
      n_mels: 128
      f_min: 0
      f_max: 8000
      power: 2.0
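    # Patchifying the 128x256 spectrogram into 16x16 tiles yields
    # (128/16) * (256/16) = 8 * 16 = 128 tokens, which is why num_patches
    # is 128 in both the encoder and the predictor below.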
    patch_embed:
      img_size:
        - 128
        - 256
      patch_size:
        - 16
        - 16
      in_chans: 1
      embed_dim: 768
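    # mask_ratio is given as a [0.4, 0.6] range; the masking module
    # presumably samples the actual ratio from this interval (e.g. per
    # batch), but the exact policy is defined in the code, not here.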
    masking:
      input_size:
        - 128
        - 256
      patch_size:
        - 16
        - 16
      mask_ratio:
        - 0.4
        - 0.6
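    # ViT-Base-shaped encoder (768-dim, 12 layers, 12 heads) with stochastic
    # depth at 0.1, paired with a shallower 4-layer predictor of the same
    # width. Both use fixed sine-cosine positional embeddings.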
    encoder:
      embed_dim: 768
      depth: 12
      num_heads: 12
      mlp_ratio: 4.0
      qkv_bias: true
      drop_rate: 0.0
      attn_drop_rate: 0.0
      drop_path_rate: 0.1
      num_patches: 128
      pos_embed_type: sincos
    predictor:
      embed_dim: 768
      depth: 4
      num_heads: 12
      mlp_ratio: 4.0
      qkv_bias: true
      drop_rate: 0.0
      attn_drop_rate: 0.0
      drop_path_rate: 0.0
      num_patches: 128
      pos_embed_type: sincos
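# Callbacks: the stock ModelCheckpoint is disabled in favour of the custom
# SafetensorsCallback; VisualizationCallback presumably logs spectrograms or
# mask overlays for 4 samples, and WandbOfflineCheckpointCallback appears to
# stage checkpoints for a later offline-to-online W&B sync.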
callbacks:
  model_checkpoint: null
  model_summary:
    _target_: lightning.pytorch.callbacks.RichModelSummary
    max_depth: 1
  rich_progress_bar: null
  safetensors:
    _target_: src.callbacks.safetensors_callback.SafetensorsCallback
  device_stats:
    _target_: lightning.pytorch.callbacks.DeviceStatsMonitor
  visualization:
    _target_: src.callbacks.visualization_callback.VisualizationCallback
    num_samples: 4
  wandb_offline_checkpoint:
    _target_: src.callbacks.wandb_callbacks.WandbOfflineCheckpointCallback
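# Logging: W&B runs offline (e.g. on compute nodes without internet access)
# and is synced afterwards with `wandb sync`; log_model stays false because
# checkpoints are handled by the callbacks above.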
logger:
  wandb:
    _target_: lightning.pytorch.loggers.wandb.WandbLogger
    save_dir: ${paths.output_dir}
    offline: true
    id: null
    anonymous: null
    project: audio embeddings
    log_model: false
    prefix: ""
    group: ""
    tags: []
    job_type: ""
    name: best_rq2-audioset-200k-256x1bs
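# Trainer: single GPU, capped at 200k optimizer steps or 19 h 50 min of wall
# time (Lightning's max_time format is DD:HH:MM:SS), presumably leaving
# shutdown headroom inside a 20 h SLURM allocation.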
trainer:
  _target_: lightning.pytorch.trainer.Trainer
  default_root_dir: ${paths.output_dir}
  accelerator: gpu
  devices: 1
  check_val_every_n_epoch: 1
  deterministic: false
  max_steps: 200000
  strategy: auto
  max_time: 00:19:50:00
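# Paths: the root comes from the PROJECT_ROOT environment variable;
# output_dir and work_dir are filled in by Hydra at runtime.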
paths:
  root_dir: ${oc.env:PROJECT_ROOT}
  data_dir: ${paths.root_dir}/data/
  log_dir: ${paths.root_dir}/logs/
  output_dir: ${hydra:runtime.output_dir}
  work_dir: ${hydra:runtime.cwd}
extras:
  ignore_warnings: false
  enforce_tags: true
  print_config: true