File size: 2,765 Bytes
619d482
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from transformers import PretrainedConfig


class Usad2Config(PretrainedConfig):
    """Hyperparameter container registered under the ``"usad2"`` model type.

    Every named constructor argument is stored unchanged on the instance
    under an attribute of the same name. Any extra keyword arguments are
    passed straight through to ``PretrainedConfig.__init__``.

    NOTE(review): the meaning of each option is defined by the model code
    that consumes this config, which is not visible from this file; the
    grouping comments inside ``__init__`` are based on parameter names
    only and should be confirmed against that code.
    """

    model_type = "usad2"

    def __init__(
        self,
        input_dim: int = 128,
        use_framewise_subsample: bool = True,
        conv_subsample_channels: int = 64,
        conv_subsample_rate: int = 2,
        use_patchwise_subsample: bool = False,
        patch_size_time: int = 16,
        patch_size_freq: int = 16,
        subsample_normalization: bool = True,
        conv_pos: bool = True,
        conv_pos_depth: int = 5,
        conv_pos_width: int = 95,
        conv_pos_groups: int = 16,
        encoder_dim: int = 384,
        num_layers: int = 12,
        attention_type="mhsa",
        num_attention_heads: int = 8,
        feed_forward_expansion_factor: int = 4,
        conv_expansion_factor: int = 2,
        input_dropout_p: float = 0.0,
        feed_forward_dropout_p: float = 0.0,
        attention_dropout_p: float = 0.0,
        conv_dropout_p: float = 0.0,
        conv_kernel_size: int = 31,
        half_step_residual: bool = True,
        transformer_style: bool = True,
        layerdrop_p: float = 0.0,
        usad_v2: bool = True,
        pre_norm: bool = False,
        rms_norm: bool = False,
        sample_rate: int = 16000,
        **kwargs,
    ):
        """Store the given hyperparameters as same-named instance attributes.

        All named arguments are kept as passed (no validation or
        conversion happens here). ``**kwargs`` is forwarded to the base
        class initializer before any attribute of this class is set.
        """
        # The base class handles the remaining keyword arguments first, so
        # the attributes assigned below are written after it has run.
        super().__init__(**kwargs)

        # Insertion order mirrors the signature so attributes are created in
        # the same sequence as the parameters are declared.
        hyperparams = {
            # Input / subsampling options (grouped by name only).
            "input_dim": input_dim,
            "use_framewise_subsample": use_framewise_subsample,
            "conv_subsample_channels": conv_subsample_channels,
            "conv_subsample_rate": conv_subsample_rate,
            "use_patchwise_subsample": use_patchwise_subsample,
            "patch_size_time": patch_size_time,
            "patch_size_freq": patch_size_freq,
            "subsample_normalization": subsample_normalization,
            # ``conv_pos*`` options.
            "conv_pos": conv_pos,
            "conv_pos_depth": conv_pos_depth,
            "conv_pos_width": conv_pos_width,
            "conv_pos_groups": conv_pos_groups,
            # Encoder size and layer options.
            "encoder_dim": encoder_dim,
            "num_layers": num_layers,
            "attention_type": attention_type,
            "num_attention_heads": num_attention_heads,
            "feed_forward_expansion_factor": feed_forward_expansion_factor,
            "conv_expansion_factor": conv_expansion_factor,
            # Dropout probabilities.
            "input_dropout_p": input_dropout_p,
            "feed_forward_dropout_p": feed_forward_dropout_p,
            "attention_dropout_p": attention_dropout_p,
            "conv_dropout_p": conv_dropout_p,
            # Remaining block / variant switches.
            "conv_kernel_size": conv_kernel_size,
            "half_step_residual": half_step_residual,
            "transformer_style": transformer_style,
            "layerdrop_p": layerdrop_p,
            "usad_v2": usad_v2,
            "pre_norm": pre_norm,
            "rms_norm": rms_norm,
            "sample_rate": sample_rate,
        }
        for attr_name, attr_value in hyperparams.items():
            setattr(self, attr_name, attr_value)