| | import os |
| | import random |
| | import numpy as np |
| | import torch |
| | import tgt |
| | import pandas as pd |
| |
|
| | from torch.utils.data import Dataset |
| | import librosa |
| |
|
| |
|
def f0_to_coarse(f0, hparams):
    """Quantize an F0 (pitch) contour into coarse integer bins on the mel scale.

    Works on either a ``torch.Tensor`` or a ``numpy.ndarray``. Frames with
    ``f0 == 0`` are treated as unvoiced and mapped to bin 0; voiced frames are
    mapped linearly (in mel space) onto bins ``1 .. f0_bin - 1``.

    Args:
        f0: pitch values in Hz; 0 marks unvoiced frames.
        hparams: dict with keys ``'f0_bin'`` (number of bins, incl. the
            unvoiced bin), ``'f0_min'`` and ``'f0_max'`` (Hz range of the
            voiced bins).

    Returns:
        Integer bin indices in ``[0, f0_bin - 1]``, same backend as the input
        (``torch.long`` tensor or numpy int array).
    """
    f0_bin = hparams['f0_bin']
    f0_max = hparams['f0_max']
    f0_min = hparams['f0_min']
    is_torch = isinstance(f0, torch.Tensor)

    # Mel-scale endpoints of the voiced range (HTK mel formula).
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)
    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)

    # f0 == 0 gives mel == 0 (log(1) == 0): remember unvoiced frames so they
    # survive the clamping below.
    unvoiced = (f0_mel == 0)

    # Linear map of voiced frames onto bins [1, f0_bin - 1].
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1

    # Clamp out-of-range pitches into the voiced bin range.
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1

    # Restore unvoiced frames to the dedicated bin 0.
    f0_mel[unvoiced] = 0

    # NOTE: the two branches round slightly differently (torch truncates after
    # +0.5; np.rint rounds half to even) — kept as-is to preserve behavior.
    f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)
    # Fix: the upper bound was hard-coded to 255; use f0_bin - 1 so the
    # function is correct for any bin count (identical for f0_bin == 256).
    assert f0_coarse.max() <= f0_bin - 1 and f0_coarse.min() >= 0, (f0_coarse.max(), f0_coarse.min())
    return f0_coarse
| |
|
| |
|
| | |
class VCDecLPCDataset(Dataset):
    """Voice-conversion training dataset backed by precomputed .npy features.

    Expects ``data_dir + 'meta_fix.csv'`` with columns ``subset``, ``folder``,
    ``subfolder`` and ``file_name``. Feature directories are derived from the
    audio path by substituting 'vocal' with the feature name ('mel', 'embed',
    'f0', and the configurable content directory).
    """

    def __init__(self, data_dir, subset, content_dir='lpc_mel_512', extract_emb=False):
        self.path = data_dir
        # Keep only the rows belonging to the requested subset.
        full_meta = pd.read_csv(data_dir + 'meta_fix.csv')
        self.meta = full_meta[full_meta['subset'] == subset]
        self.content_dir = content_dir
        self.extract_emb = extract_emb

    def get_vc_data(self, audio_path, mel_id):
        """Load (mel, embed, f0-bins, content) tensors for one utterance."""
        fname = mel_id + '.npy'
        mel = np.load(os.path.join(audio_path.replace('vocal', 'mel'), fname))
        if self.extract_emb:
            embed = np.load(os.path.join(audio_path.replace('vocal', 'embed'), fname))
        else:
            # Placeholder so downstream code always receives a tensor.
            embed = np.zeros(1)
        pitch = np.load(os.path.join(audio_path.replace('vocal', 'f0'), fname))
        content = np.load(os.path.join(audio_path.replace('vocal', self.content_dir), fname))

        # NaNs in the F0 track mark unvoiced frames; zero them, then quantize
        # into 256 coarse bins spanning C2..C6.
        pitch = np.nan_to_num(pitch)
        pitch = f0_to_coarse(pitch, {'f0_bin': 256,
                                     'f0_min': librosa.note_to_hz('C2'),
                                     'f0_max': librosa.note_to_hz('C6')})

        return (torch.from_numpy(mel).float(),
                torch.from_numpy(embed).float(),
                torch.from_numpy(pitch).float(),
                torch.from_numpy(content).float())

    def __getitem__(self, index):
        row = self.meta.iloc[index]
        audio_path = self.path + row['folder'] + row['subfolder']
        mel, embed, pitch, content = self.get_vc_data(audio_path, row['file_name'])
        return {'mel': mel, 'embed': embed, 'f0': pitch, 'content': content}

    def __len__(self):
        return len(self.meta)
| |
|
| |
|
class VCDecLPCBatchCollate(object):
    """Collate function that crops/pads each item to ``train_frames`` frames.

    Produces two independently-cropped mel views of every utterance (mel1 and
    mel2) for contrastive-style training; f0 and content are aligned with the
    first crop. Padding uses ``eps`` (log-mel floor) for mels, ``content_eps``
    for content, and 0 for f0.
    """

    def __init__(self, train_frames, eps=np.log(1e-5), content_eps=np.log(1e-12)):
        self.train_frames = train_frames
        self.eps = eps                  # pad value for log-mel frames
        self.content_eps = content_eps  # pad value for content frames

    def __call__(self, batch):
        T = self.train_frames
        B = len(batch)
        embed = torch.stack([item['embed'] for item in batch], 0)

        n_mels = batch[0]['mel'].shape[0]
        content_dim = batch[0]['content'].shape[0]

        # Pre-filled padded buffers; shorter clips only overwrite a prefix.
        mels1 = torch.full((B, n_mels, T), float(self.eps), dtype=torch.float32)
        mels2 = torch.full((B, n_mels, T), float(self.eps), dtype=torch.float32)
        contents1 = torch.full((B, content_dim, T), float(self.content_eps), dtype=torch.float32)
        f0s1 = torch.zeros((B, T), dtype=torch.float32)

        # Random crop offsets; clips shorter than T always start at 0.
        max_starts = [max(item['mel'].shape[-1] - T, 0) for item in batch]
        starts1 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts]
        starts2 = [random.choice(range(m)) if m > 0 else 0 for m in max_starts]

        mel_lengths = []
        for i, item in enumerate(batch):
            mel = item['mel']
            length = min(mel.shape[-1], T)
            s1, s2 = starts1[i], starts2[i]

            # First crop carries mel, f0 and content, all aligned.
            mels1[i, :, :length] = mel[:, s1:s1 + length]
            f0s1[i, :length] = item['f0'][s1:s1 + length]
            contents1[i, :, :length] = item['content'][:, s1:s1 + length]

            # Second crop is mel-only.
            mels2[i, :, :length] = mel[:, s2:s2 + length]
            mel_lengths.append(length)

        return {'mel1': mels1, 'mel2': mels2,
                'mel_lengths': torch.LongTensor(mel_lengths),
                'embed': embed,
                'f0_1': f0s1,
                'content1': contents1}
| |
|
| |
|
class VCDecLPCTest(Dataset):
    """Evaluation dataset pairing a content utterance with a timbre utterance.

    Reads ``data_dir + 'meta_test.csv'``, which names a content source
    (``content_*`` columns), a timbre source (``timbre_*`` columns) and a
    ``pitch_shift`` factor. Each item yields fixed-length (``test_frames``)
    padded tensors: the content utterance's mel/f0/content and the timbre
    utterance's mel and speaker embedding.
    """

    def __init__(self, data_dir, subset='test', eps=np.log(1e-5), content_eps=np.log(1e-12), test_frames=256, content_dir='lpc_mel_512', extract_emb=False):
        self.path = data_dir
        full_meta = pd.read_csv(data_dir + 'meta_test.csv')
        self.meta = full_meta[full_meta['subset'] == subset]
        self.content_dir = content_dir
        self.extract_emb = extract_emb
        self.eps = eps                  # pad value for log-mel frames
        self.content_eps = content_eps  # pad value for content frames
        self.test_frames = test_frames

    def get_vc_data(self, audio_path, mel_id, pitch_shift):
        """Load (mel, embed, f0-bins, content) for one utterance, scaling F0 by ``pitch_shift``."""
        fname = mel_id + '.npy'
        mel = np.load(os.path.join(audio_path.replace('vocal', 'mel'), fname))
        if self.extract_emb:
            embed = np.load(os.path.join(audio_path.replace('vocal', 'embed'), fname))
        else:
            # Placeholder so downstream code always receives a tensor.
            embed = np.zeros(1)
        pitch = np.load(os.path.join(audio_path.replace('vocal', 'f0'), fname))
        content = np.load(os.path.join(audio_path.replace('vocal', self.content_dir), fname))

        # Zero NaNs (unvoiced), apply the requested pitch shift in Hz, then
        # quantize into 256 coarse bins spanning C2..C6.
        pitch = np.nan_to_num(pitch)
        pitch = pitch * pitch_shift
        pitch = f0_to_coarse(pitch, {'f0_bin': 256,
                                     'f0_min': librosa.note_to_hz('C2'),
                                     'f0_max': librosa.note_to_hz('C6')})

        return (torch.from_numpy(mel).float(),
                torch.from_numpy(embed).float(),
                torch.from_numpy(pitch).float(),
                torch.from_numpy(content).float())

    def __getitem__(self, index):
        row = self.meta.iloc[index]
        T = self.test_frames
        pitch_shift = row['pitch_shift']

        # Content side: supplies mel1, f0 and content features.
        content_path = self.path + row['content_folder'] + row['content_subfolder']
        mel1, _, f0, content = self.get_vc_data(content_path, row['content_file_name'], pitch_shift)

        # Timbre side: supplies mel2 and the speaker embedding.
        timbre_path = self.path + row['timbre_folder'] + row['timbre_subfolder']
        mel2, embed, _, _ = self.get_vc_data(timbre_path, row['timbre_file_name'], pitch_shift)

        n_mels = mel1.shape[0]
        content_dim = content.shape[0]

        # Pre-filled padded buffers, truncating anything longer than T.
        mels1 = torch.full((n_mels, T), float(self.eps), dtype=torch.float32)
        mels2 = torch.full((n_mels, T), float(self.eps), dtype=torch.float32)
        lpcs1 = torch.full((content_dim, T), float(self.content_eps), dtype=torch.float32)
        f0s1 = torch.zeros(T, dtype=torch.float32)

        length1 = min(mel1.shape[-1], T)
        mels1[:, :length1] = mel1[:, :length1]
        f0s1[:length1] = f0[:length1]
        lpcs1[:, :length1] = content[:, :length1]

        length2 = min(mel2.shape[-1], T)
        mels2[:, :length2] = mel2[:, :length2]

        return {'mel1': mels1, 'mel2': mels2, 'embed': embed, 'f0_1': f0s1, 'content1': lpcs1}

    def __len__(self):
        return len(self.meta)
| |
|
| |
|
| |
|
| |
|