JackIsNotInTheBox commited on
Commit
95ab06d
·
verified ·
1 Parent(s): 36a0a7b

Mirror meldataset.py from nvidia/bigvgan_v2_44khz_128band_512x@95a9d1dc

Browse files
encoders/nvidia/bigvgan_v2_44khz_128band_512x/meldataset.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2024 NVIDIA CORPORATION.
2
+ # Licensed under the MIT license.
3
+
4
+ # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
5
+ # LICENSE is in incl_licenses directory.
6
+
7
+ import math
8
+ import os
9
+ import random
10
+ import torch
11
+ import torch.utils.data
12
+ import numpy as np
13
+ from librosa.util import normalize
14
+ from scipy.io.wavfile import read
15
+ from librosa.filters import mel as librosa_mel_fn
16
+ import pathlib
17
+ from tqdm import tqdm
18
+
19
+ MAX_WAV_VALUE = 32767.0 # NOTE: 32768.0 -1 to prevent int16 overflow (results in popping sound in corner cases)
20
+
21
+
22
+ def load_wav(full_path, sr_target):
23
+ sampling_rate, data = read(full_path)
24
+ if sampling_rate != sr_target:
25
+ raise RuntimeError(
26
+ f"Sampling rate of the file {full_path} is {sampling_rate} Hz, but the model requires {sr_target} Hz"
27
+ )
28
+ return data, sampling_rate
29
+
30
+
31
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
32
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
33
+
34
+
35
+ def dynamic_range_decompression(x, C=1):
36
+ return np.exp(x) / C
37
+
38
+
39
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
40
+ return torch.log(torch.clamp(x, min=clip_val) * C)
41
+
42
+
43
+ def dynamic_range_decompression_torch(x, C=1):
44
+ return torch.exp(x) / C
45
+
46
+
47
+ def spectral_normalize_torch(magnitudes):
48
+ return dynamic_range_compression_torch(magnitudes)
49
+
50
+
51
+ def spectral_de_normalize_torch(magnitudes):
52
+ return dynamic_range_decompression_torch(magnitudes)
53
+
54
+
55
+ mel_basis_cache = {}
56
+ hann_window_cache = {}
57
+
58
+
59
+ def mel_spectrogram(
60
+ y: torch.Tensor,
61
+ n_fft: int,
62
+ num_mels: int,
63
+ sampling_rate: int,
64
+ hop_size: int,
65
+ win_size: int,
66
+ fmin: int,
67
+ fmax: int = None,
68
+ center: bool = False,
69
+ ) -> torch.Tensor:
70
+ """
71
+ Calculate the mel spectrogram of an input signal.
72
+ This function uses slaney norm for the librosa mel filterbank (using librosa.filters.mel) and uses Hann window for STFT (using torch.stft).
73
+
74
+ Args:
75
+ y (torch.Tensor): Input signal.
76
+ n_fft (int): FFT size.
77
+ num_mels (int): Number of mel bins.
78
+ sampling_rate (int): Sampling rate of the input signal.
79
+ hop_size (int): Hop size for STFT.
80
+ win_size (int): Window size for STFT.
81
+ fmin (int): Minimum frequency for mel filterbank.
82
+ fmax (int): Maximum frequency for mel filterbank. If None, defaults to half the sampling rate (fmax = sr / 2.0) inside librosa_mel_fn
83
+ center (bool): Whether to pad the input to center the frames. Default is False.
84
+
85
+ Returns:
86
+ torch.Tensor: Mel spectrogram.
87
+ """
88
+ if torch.min(y) < -1.0:
89
+ print(f"[WARNING] Min value of input waveform signal is {torch.min(y)}")
90
+ if torch.max(y) > 1.0:
91
+ print(f"[WARNING] Max value of input waveform signal is {torch.max(y)}")
92
+
93
+ device = y.device
94
+ key = f"{n_fft}_{num_mels}_{sampling_rate}_{hop_size}_{win_size}_{fmin}_{fmax}_{device}"
95
+
96
+ if key not in mel_basis_cache:
97
+ mel = librosa_mel_fn(
98
+ sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
99
+ )
100
+ mel_basis_cache[key] = torch.from_numpy(mel).float().to(device)
101
+ hann_window_cache[key] = torch.hann_window(win_size).to(device)
102
+
103
+ mel_basis = mel_basis_cache[key]
104
+ hann_window = hann_window_cache[key]
105
+
106
+ padding = (n_fft - hop_size) // 2
107
+ y = torch.nn.functional.pad(
108
+ y.unsqueeze(1), (padding, padding), mode="reflect"
109
+ ).squeeze(1)
110
+
111
+ spec = torch.stft(
112
+ y,
113
+ n_fft,
114
+ hop_length=hop_size,
115
+ win_length=win_size,
116
+ window=hann_window,
117
+ center=center,
118
+ pad_mode="reflect",
119
+ normalized=False,
120
+ onesided=True,
121
+ return_complex=True,
122
+ )
123
+ spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)
124
+
125
+ mel_spec = torch.matmul(mel_basis, spec)
126
+ mel_spec = spectral_normalize_torch(mel_spec)
127
+
128
+ return mel_spec
129
+
130
+
131
+ def get_mel_spectrogram(wav, h):
132
+ """
133
+ Generate mel spectrogram from a waveform using given hyperparameters.
134
+
135
+ Args:
136
+ wav (torch.Tensor): Input waveform.
137
+ h: Hyperparameters object with attributes n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax.
138
+
139
+ Returns:
140
+ torch.Tensor: Mel spectrogram.
141
+ """
142
+ return mel_spectrogram(
143
+ wav,
144
+ h.n_fft,
145
+ h.num_mels,
146
+ h.sampling_rate,
147
+ h.hop_size,
148
+ h.win_size,
149
+ h.fmin,
150
+ h.fmax,
151
+ )
152
+
153
+
154
+ def get_dataset_filelist(a):
155
+ training_files = []
156
+ validation_files = []
157
+ list_unseen_validation_files = []
158
+
159
+ with open(a.input_training_file, "r", encoding="utf-8") as fi:
160
+ training_files = [
161
+ os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav")
162
+ for x in fi.read().split("\n")
163
+ if len(x) > 0
164
+ ]
165
+ print(f"first training file: {training_files[0]}")
166
+
167
+ with open(a.input_validation_file, "r", encoding="utf-8") as fi:
168
+ validation_files = [
169
+ os.path.join(a.input_wavs_dir, x.split("|")[0] + ".wav")
170
+ for x in fi.read().split("\n")
171
+ if len(x) > 0
172
+ ]
173
+ print(f"first validation file: {validation_files[0]}")
174
+
175
+ for i in range(len(a.list_input_unseen_validation_file)):
176
+ with open(a.list_input_unseen_validation_file[i], "r", encoding="utf-8") as fi:
177
+ unseen_validation_files = [
178
+ os.path.join(a.list_input_unseen_wavs_dir[i], x.split("|")[0] + ".wav")
179
+ for x in fi.read().split("\n")
180
+ if len(x) > 0
181
+ ]
182
+ print(
183
+ f"first unseen {i}th validation fileset: {unseen_validation_files[0]}"
184
+ )
185
+ list_unseen_validation_files.append(unseen_validation_files)
186
+
187
+ return training_files, validation_files, list_unseen_validation_files
188
+
189
+
190
+ class MelDataset(torch.utils.data.Dataset):
191
+ def __init__(
192
+ self,
193
+ training_files,
194
+ hparams,
195
+ segment_size,
196
+ n_fft,
197
+ num_mels,
198
+ hop_size,
199
+ win_size,
200
+ sampling_rate,
201
+ fmin,
202
+ fmax,
203
+ split=True,
204
+ shuffle=True,
205
+ n_cache_reuse=1,
206
+ device=None,
207
+ fmax_loss=None,
208
+ fine_tuning=False,
209
+ base_mels_path=None,
210
+ is_seen=True,
211
+ ):
212
+ self.audio_files = training_files
213
+ random.seed(1234)
214
+ if shuffle:
215
+ random.shuffle(self.audio_files)
216
+ self.hparams = hparams
217
+ self.is_seen = is_seen
218
+ if self.is_seen:
219
+ self.name = pathlib.Path(self.audio_files[0]).parts[0]
220
+ else:
221
+ self.name = "-".join(pathlib.Path(self.audio_files[0]).parts[:2]).strip("/")
222
+
223
+ self.segment_size = segment_size
224
+ self.sampling_rate = sampling_rate
225
+ self.split = split
226
+ self.n_fft = n_fft
227
+ self.num_mels = num_mels
228
+ self.hop_size = hop_size
229
+ self.win_size = win_size
230
+ self.fmin = fmin
231
+ self.fmax = fmax
232
+ self.fmax_loss = fmax_loss
233
+ self.cached_wav = None
234
+ self.n_cache_reuse = n_cache_reuse
235
+ self._cache_ref_count = 0
236
+ self.device = device
237
+ self.fine_tuning = fine_tuning
238
+ self.base_mels_path = base_mels_path
239
+
240
+ print("[INFO] checking dataset integrity...")
241
+ for i in tqdm(range(len(self.audio_files))):
242
+ assert os.path.exists(
243
+ self.audio_files[i]
244
+ ), f"{self.audio_files[i]} not found"
245
+
246
+ def __getitem__(self, index):
247
+ filename = self.audio_files[index]
248
+ if self._cache_ref_count == 0:
249
+ audio, sampling_rate = load_wav(filename, self.sampling_rate)
250
+ audio = audio / MAX_WAV_VALUE
251
+ if not self.fine_tuning:
252
+ audio = normalize(audio) * 0.95
253
+ self.cached_wav = audio
254
+ if sampling_rate != self.sampling_rate:
255
+ raise ValueError(
256
+ f"{sampling_rate} SR doesn't match target {self.sampling_rate} SR"
257
+ )
258
+ self._cache_ref_count = self.n_cache_reuse
259
+ else:
260
+ audio = self.cached_wav
261
+ self._cache_ref_count -= 1
262
+
263
+ audio = torch.FloatTensor(audio)
264
+ audio = audio.unsqueeze(0)
265
+
266
+ if not self.fine_tuning:
267
+ if self.split:
268
+ if audio.size(1) >= self.segment_size:
269
+ max_audio_start = audio.size(1) - self.segment_size
270
+ audio_start = random.randint(0, max_audio_start)
271
+ audio = audio[:, audio_start : audio_start + self.segment_size]
272
+ else:
273
+ audio = torch.nn.functional.pad(
274
+ audio, (0, self.segment_size - audio.size(1)), "constant"
275
+ )
276
+
277
+ mel = mel_spectrogram(
278
+ audio,
279
+ self.n_fft,
280
+ self.num_mels,
281
+ self.sampling_rate,
282
+ self.hop_size,
283
+ self.win_size,
284
+ self.fmin,
285
+ self.fmax,
286
+ center=False,
287
+ )
288
+ else: # Validation step
289
+ # Match audio length to self.hop_size * n for evaluation
290
+ if (audio.size(1) % self.hop_size) != 0:
291
+ audio = audio[:, : -(audio.size(1) % self.hop_size)]
292
+ mel = mel_spectrogram(
293
+ audio,
294
+ self.n_fft,
295
+ self.num_mels,
296
+ self.sampling_rate,
297
+ self.hop_size,
298
+ self.win_size,
299
+ self.fmin,
300
+ self.fmax,
301
+ center=False,
302
+ )
303
+ assert (
304
+ audio.shape[1] == mel.shape[2] * self.hop_size
305
+ ), f"audio shape {audio.shape} mel shape {mel.shape}"
306
+
307
+ else:
308
+ mel = np.load(
309
+ os.path.join(
310
+ self.base_mels_path,
311
+ os.path.splitext(os.path.split(filename)[-1])[0] + ".npy",
312
+ )
313
+ )
314
+ mel = torch.from_numpy(mel)
315
+
316
+ if len(mel.shape) < 3:
317
+ mel = mel.unsqueeze(0)
318
+
319
+ if self.split:
320
+ frames_per_seg = math.ceil(self.segment_size / self.hop_size)
321
+
322
+ if audio.size(1) >= self.segment_size:
323
+ mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
324
+ mel = mel[:, :, mel_start : mel_start + frames_per_seg]
325
+ audio = audio[
326
+ :,
327
+ mel_start
328
+ * self.hop_size : (mel_start + frames_per_seg)
329
+ * self.hop_size,
330
+ ]
331
+ else:
332
+ mel = torch.nn.functional.pad(
333
+ mel, (0, frames_per_seg - mel.size(2)), "constant"
334
+ )
335
+ audio = torch.nn.functional.pad(
336
+ audio, (0, self.segment_size - audio.size(1)), "constant"
337
+ )
338
+
339
+ mel_loss = mel_spectrogram(
340
+ audio,
341
+ self.n_fft,
342
+ self.num_mels,
343
+ self.sampling_rate,
344
+ self.hop_size,
345
+ self.win_size,
346
+ self.fmin,
347
+ self.fmax_loss,
348
+ center=False,
349
+ )
350
+
351
+ return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
352
+
353
+ def __len__(self):
354
+ return len(self.audio_files)