| | import os |
| | import os.path as osp |
| | import sys |
| | import time |
| | from collections import defaultdict |
| |
|
| | import matplotlib |
| | import numpy as np |
| | import soundfile as sf |
| | import torch |
| | from torch import nn |
| | import jiwer |
| |
|
| | import matplotlib.pylab as plt |
| | import functools |
| | import os |
| | import random |
| | import traceback |
| | from pathlib import Path |
| | from typing import Any, Dict, List, Optional, Tuple |
| |
|
| | import librosa |
| | import numpy as np |
| | import torch |
| | from einops import rearrange |
| | from scipy import ndimage |
| | from torch.special import gammaln |
| |
|
| |
|
def calc_wer(target, pred, ignore_indexes=(0,)):
    """Compute word error rate between two index sequences via jiwer.

    Each sequence is stringified token-by-token, tokens matching any entry in
    ``ignore_indexes`` are dropped, consecutive duplicates are collapsed
    (CTC-style), and the remainders are compared as space-joined "words".

    Args:
        target: Iterable of reference token indexes.
        pred: Iterable of predicted token indexes.
        ignore_indexes: Token indexes to exclude (e.g. blank/pad). Default (0,).

    Returns:
        float: jiwer word error rate between the two filtered sequences.
    """
    # Compare on string form for both sides: the original filtered AFTER
    # str() conversion against int indexes, so nothing was ever removed.
    ignore = {str(i) for i in ignore_indexes}
    target_chars = drop_duplicated([t for t in map(str, target) if t not in ignore])
    pred_chars = drop_duplicated([p for p in map(str, pred) if p not in ignore])
    target_str = ' '.join(target_chars)
    pred_str = ' '.join(pred_chars)
    return jiwer.wer(target_str, pred_str)
| |
|
def drop_duplicated(chars):
    """Collapse runs of consecutive equal elements into a single element.

    Args:
        chars: Sequence of comparable elements.

    Returns:
        list: Input with consecutive duplicates removed; [] for empty input
        (the original raised IndexError on an empty sequence).
    """
    if not chars:
        return []
    ret_chars = [chars[0]]
    for prev, curr in zip(chars, chars[1:]):
        if prev != curr:
            ret_chars.append(curr)
    return ret_chars
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
def build_criterion(critic_params=None):
    """Build the loss dictionary used for ASR-style training.

    Args:
        critic_params: Optional dict of per-criterion kwargs; only the 'ctc'
            key is consulted and forwarded to ``torch.nn.CTCLoss``.
            Default None (was a mutable ``{}`` default — fixed).

    Returns:
        dict: {'ce': CrossEntropyLoss(ignore_index=-1), 'ctc': CTCLoss(...)}.
    """
    if critic_params is None:
        critic_params = {}
    criterion = {
        "ce": nn.CrossEntropyLoss(ignore_index=-1),
        "ctc": torch.nn.CTCLoss(**critic_params.get('ctc', {})),
    }
    return criterion
| |
|
| |
|
| |
|
def get_data_path_list(train_path=None, val_path=None):
    """Read the train/val file-list text files.

    Args:
        train_path: Path to the training list; defaults to
            "Data/train_list.txt" when None.
        val_path: Path to the validation list; defaults to
            "Data/val_list.txt" when None.

    Returns:
        tuple[list[str], list[str]]: (train lines, val lines), each line
        keeping its trailing newline as returned by readlines().

    Raises:
        OSError: If either file cannot be opened.
    """
    if train_path is None:
        train_path = "Data/train_list.txt"
    if val_path is None:
        val_path = "Data/val_list.txt"

    # Explicit encoding: list files are plain text and should not depend on
    # the platform's default locale encoding.
    with open(train_path, 'r', encoding='utf-8') as f:
        train_list = f.readlines()
    with open(val_path, 'r', encoding='utf-8') as f:
        val_list = f.readlines()

    return train_list, val_list
| |
|
| |
|
def plot_image(image):
    """Render a 2D array as a spectrogram-style matplotlib figure.

    The figure is drawn and closed before being returned, so it can be
    handed to a logger (e.g. TensorBoard) without leaking open figures.
    """
    figure, axis = plt.subplots(figsize=(10, 2))
    axis.imshow(image, aspect="auto", origin="lower", interpolation='none')

    figure.canvas.draw()
    plt.close()

    return figure
| |
|
| |
|
| |
|
class PartialConv1d(torch.nn.Conv1d):
    """
    Zero padding creates a unique identifier for where the edge of the data is, such that the model can almost always identify
    exactly where it is relative to either edge given a sufficient receptive field. Partial padding goes to some lengths to remove
    this effect.
    """

    __constants__ = ['slide_winsize']
    slide_winsize: float

    def __init__(self, *args, **kwargs):
        super(PartialConv1d, self).__init__(*args, **kwargs)
        # Kernel of ones used to count valid (unmasked) positions per window.
        weight_maskUpdater = torch.ones(1, 1, self.kernel_size[0])
        self.register_buffer("weight_maskUpdater", weight_maskUpdater, persistent=False)
        # Full window size; used to rescale outputs of partially-masked windows.
        self.slide_winsize = self.weight_maskUpdater.shape[1] * self.weight_maskUpdater.shape[2]

    def forward(self, input, mask_in):
        """Masked convolution.

        Args:
            input: Tensor of shape (batch, in_channels, time).
            mask_in: Binary mask of shape (batch_or_1, 1, time), or None to
                treat every position as valid.

        Returns:
            Tensor of shape (batch, out_channels, time'): convolution output
            rescaled per-window by valid-coverage ratio and zeroed where no
            valid input was covered.
        """
        if mask_in is None:
            mask = torch.ones(1, 1, input.shape[2], dtype=input.dtype, device=input.device)
        else:
            mask = mask_in
        input = torch.mul(input, mask)
        with torch.no_grad():
            # BUGFIX: original referenced `F.conv1d`, but `F`
            # (torch.nn.functional) was never imported -> NameError at runtime.
            update_mask = torch.nn.functional.conv1d(
                mask,
                self.weight_maskUpdater,
                bias=None,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                groups=1,
            )
            # Avoid division by zero for windows with no valid samples.
            update_mask_filled = torch.masked_fill(update_mask, update_mask == 0, self.slide_winsize)
            mask_ratio = self.slide_winsize / update_mask_filled
            update_mask = torch.clamp(update_mask, 0, 1)
            mask_ratio = torch.mul(mask_ratio, update_mask)

        raw_out = self._conv_forward(input, self.weight, self.bias)

        if self.bias is not None:
            # Rescale only the weight contribution, then re-add the bias,
            # and zero fully-masked windows.
            bias_view = self.bias.view(1, self.out_channels, 1)
            output = torch.mul(raw_out - bias_view, mask_ratio) + bias_view
            output = torch.mul(output, update_mask)
        else:
            output = torch.mul(raw_out, mask_ratio)

        return output
| |
|
| |
|
class LinearNorm(torch.nn.Module):
    """Linear layer whose weight is Xavier-uniform initialized with a gain
    derived from the given nonlinearity name."""

    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super().__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
        gain = torch.nn.init.calculate_gain(w_init_gain)
        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=gain)

    def forward(self, x):
        """Apply the wrapped linear transform."""
        return self.linear_layer(x)
| |
|
| |
|
class ConvNorm(torch.nn.Module):
    """1D convolution with Xavier-uniform init and optional partial padding,
    weight normalization, and a trailing normalization layer."""

    __constants__ = ['use_partial_padding']
    use_partial_padding: bool

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=1,
        stride=1,
        padding=None,
        dilation=1,
        bias=True,
        w_init_gain='linear',
        use_partial_padding=False,
        use_weight_norm=False,
        norm_fn=None,
    ):
        super().__init__()
        # Default to "same"-style padding; requires an odd kernel.
        if padding is None:
            assert kernel_size % 2 == 1
            padding = int(dilation * (kernel_size - 1) / 2)
        self.use_partial_padding = use_partial_padding
        # Only touch PartialConv1d when actually requested.
        conv_cls = torch.nn.Conv1d
        if use_partial_padding:
            conv_cls = PartialConv1d
        self.conv = conv_cls(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        gain = torch.nn.init.calculate_gain(w_init_gain)
        torch.nn.init.xavier_uniform_(self.conv.weight, gain=gain)
        if use_weight_norm:
            self.conv = torch.nn.utils.weight_norm(self.conv)
        self.norm = norm_fn(out_channels, affine=True) if norm_fn is not None else None

    def forward(self, signal, mask=None):
        """Convolve `signal`; with partial padding, the mask is forwarded to
        both the conv and the norm layer, otherwise it pre-multiplies input."""
        if self.use_partial_padding:
            out = self.conv(signal, mask)
            if self.norm is not None:
                out = self.norm(out, mask)
            return out

        if mask is not None:
            signal = signal.mul(mask)
        out = self.conv(signal)
        if self.norm is not None:
            out = self.norm(out)
        return out
| |
|
| |
|
| |
|
class BetaBinomialInterpolator:
    """
    This module calculates alignment prior matrices (based on beta-binomial distribution) using cached popular sizes and image interpolation.
    The implementation is taken from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechSynthesis/FastPitch/fastpitch/data_function.py
    """

    def __init__(self, round_mel_len_to=50, round_text_len_to=10, cache_size=500, scaling_factor: float = 1.0):
        self.round_mel_len_to = round_mel_len_to
        self.round_text_len_to = round_text_len_to
        # Cache priors at rounded sizes; exact sizes are recovered by zoom.
        prior_fn = functools.partial(beta_binomial_prior_distribution, scaling_factor=scaling_factor)
        self.bank = functools.lru_cache(maxsize=cache_size)(prior_fn)

    @staticmethod
    def round(val, to):
        # Round (val + 1) / to to the nearest multiple of `to`, at least `to`.
        return to * max(1, int(np.round((val + 1) / to)))

    def __call__(self, w, h):
        """Return a (w, h) prior matrix interpolated from a cached rounded-size prior."""
        bw = self.round(w, to=self.round_mel_len_to)
        bh = self.round(h, to=self.round_text_len_to)
        ret = ndimage.zoom(self.bank(bw, bh).T, zoom=(w / bw, h / bh), order=1)
        assert ret.shape[0] == w, ret.shape
        assert ret.shape[1] == h, ret.shape
        return ret
| |
|
| |
|
def general_padding(item, item_len, max_len, pad_value=0):
    """Right-pad the last axis of `item` from `item_len` up to `max_len`
    with `pad_value`; returned unchanged when already long enough."""
    deficit = max_len - item_len
    if deficit > 0:
        item = torch.nn.functional.pad(item, (0, deficit), value=pad_value)
    return item
| |
|
| |
|
def stack_tensors(tensors: List[torch.Tensor], max_lens: List[int], pad_value: float = 0.0) -> torch.Tensor:
    """
    Create batch by stacking input tensor list along the time axes.

    Args:
        tensors: List of tensors to pad and stack
        max_lens: List of lengths to pad each axis to, starting with the last axis
        pad_value: Value for padding

    Returns:
        Padded and stacked tensor.
    """
    padded = []
    for tensor in tensors:
        # Build F.pad's spec: pairs of (left, right) amounts, last axis first.
        pad_spec = []
        for axis, target_len in enumerate(max_lens, start=1):
            pad_spec.extend([0, target_len - tensor.shape[-axis]])
        padded.append(torch.nn.functional.pad(tensor, pad=pad_spec, value=pad_value))

    return torch.stack(padded)
| |
|
| |
|
def logbeta(x, y):
    """Log of the Beta function, via log-gamma: ln Γ(x) + ln Γ(y) - ln Γ(x+y)."""
    numerator = gammaln(x) + gammaln(y)
    return numerator - gammaln(x + y)
| |
|
| |
|
def logcombinations(n, k):
    """Log of the binomial coefficient C(n, k), via log-gamma."""
    total = gammaln(n + 1)
    return total - gammaln(k + 1) - gammaln(n - k + 1)
| |
|
| |
|
def logbetabinom(n, a, b, x):
    """Log-pmf of the beta-binomial distribution with n trials and shape (a, b) at x."""
    log_numer = logcombinations(n, x) + logbeta(x + a, n - x + b)
    return log_numer - logbeta(a, b)
| |
|
| |
|
def beta_binomial_prior_distribution(phoneme_count: int, mel_count: int, scaling_factor: float = 1.0) -> np.ndarray:
    """Compute a beta-binomial alignment prior over phonemes for each mel frame.

    Args:
        phoneme_count: Number of text tokens (support of the distribution).
        mel_count: Number of mel frames (one beta-binomial row per frame).
        scaling_factor: Sharpness of the prior; larger concentrates mass
            along the diagonal.

    Returns:
        np.ndarray of shape (mel_count, phoneme_count) with probabilities.
        (Return annotation fixed: ``np.array`` is a function, not a type.)
    """
    # Row vector of phoneme positions, column vector of 1-based frame indices
    # (unsqueeze replaces the previous einops.rearrange, same shapes).
    x = torch.arange(0, phoneme_count).unsqueeze(0)
    y = torch.arange(1, mel_count + 1).unsqueeze(1)
    a = scaling_factor * y
    b = scaling_factor * (mel_count + 1 - y)
    n = torch.FloatTensor([phoneme_count - 1])

    return logbetabinom(n, a, b, x).exp().numpy()
| |
|
| |
|
| | |
| |
|