Feature Extraction
Transformers
Safetensors
English
usad2
automatic-speech-recognition
audio-classification
audio
speech
music
custom_code
Instructions to use MIT-SLS/USAD2-Large-Plus with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use MIT-SLS/USAD2-Large-Plus with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("feature-extraction", model="MIT-SLS/USAD2-Large-Plus", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("MIT-SLS/USAD2-Large-Plus", trust_remote_code=True, dtype="auto")
```

- Notebooks
- Google Colab
- Kaggle
File size: 2,765 Bytes
619d482 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | from transformers import PretrainedConfig
class Usad2Config(PretrainedConfig):
    """Configuration container for models whose ``model_type`` is ``"usad2"``.

    Each named constructor argument is stored unchanged on the instance under
    an attribute of the same name. No validation, conversion, or derived
    values are computed here. Any extra keyword arguments are passed through
    to ``PretrainedConfig.__init__``, which runs before the fields below are
    assigned.
    """

    model_type = "usad2"

    def __init__(
        self,
        input_dim: int = 128,
        use_framewise_subsample: bool = True,
        conv_subsample_channels: int = 64,
        conv_subsample_rate: int = 2,
        use_patchwise_subsample: bool = False,
        patch_size_time: int = 16,
        patch_size_freq: int = 16,
        subsample_normalization: bool = True,
        conv_pos: bool = True,
        conv_pos_depth: int = 5,
        conv_pos_width: int = 95,
        conv_pos_groups: int = 16,
        encoder_dim: int = 384,
        num_layers: int = 12,
        attention_type="mhsa",
        num_attention_heads: int = 8,
        feed_forward_expansion_factor: int = 4,
        conv_expansion_factor: int = 2,
        input_dropout_p: float = 0.0,
        feed_forward_dropout_p: float = 0.0,
        attention_dropout_p: float = 0.0,
        conv_dropout_p: float = 0.0,
        conv_kernel_size: int = 31,
        half_step_residual: bool = True,
        transformer_style: bool = True,
        layerdrop_p: float = 0.0,
        usad_v2: bool = True,
        pre_norm: bool = False,
        rms_norm: bool = False,
        sample_rate: int = 16000,
        **kwargs,
    ):
        # The base class only ever sees the leftover keyword arguments; it is
        # initialised before any of this class's own fields are set.
        super().__init__(**kwargs)

        # Attribute name -> value, listed in signature order so the instance
        # attributes are created in the same sequence as the parameters.
        # The blank-line grouping follows name prefixes and is cosmetic only.
        hyperparameters = {
            "input_dim": input_dim,

            "use_framewise_subsample": use_framewise_subsample,
            "conv_subsample_channels": conv_subsample_channels,
            "conv_subsample_rate": conv_subsample_rate,
            "use_patchwise_subsample": use_patchwise_subsample,
            "patch_size_time": patch_size_time,
            "patch_size_freq": patch_size_freq,
            "subsample_normalization": subsample_normalization,

            "conv_pos": conv_pos,
            "conv_pos_depth": conv_pos_depth,
            "conv_pos_width": conv_pos_width,
            "conv_pos_groups": conv_pos_groups,

            "encoder_dim": encoder_dim,
            "num_layers": num_layers,
            "attention_type": attention_type,
            "num_attention_heads": num_attention_heads,
            "feed_forward_expansion_factor": feed_forward_expansion_factor,
            "conv_expansion_factor": conv_expansion_factor,

            "input_dropout_p": input_dropout_p,
            "feed_forward_dropout_p": feed_forward_dropout_p,
            "attention_dropout_p": attention_dropout_p,
            "conv_dropout_p": conv_dropout_p,

            "conv_kernel_size": conv_kernel_size,
            "half_step_residual": half_step_residual,
            "transformer_style": transformer_style,
            "layerdrop_p": layerdrop_p,
            "usad_v2": usad_v2,
            "pre_norm": pre_norm,
            "rms_norm": rms_norm,
            "sample_rate": sample_rate,
        }
        for field_name, field_value in hyperparameters.items():
            setattr(self, field_name, field_value)
|