Feature Extraction
Transformers
Safetensors
English
usad2
automatic-speech-recognition
audio-classification
audio
speech
music
custom_code
Instructions to use MIT-SLS/USAD2-Large with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use MIT-SLS/USAD2-Large with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("feature-extraction", model="MIT-SLS/USAD2-Large", trust_remote_code=True)

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("MIT-SLS/USAD2-Large", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
from transformers import PretrainedConfig


class Usad2Config(PretrainedConfig):
    """Configuration container for the ``usad2`` model type.

    The constructor performs no validation or derivation: after forwarding
    ``**kwargs`` to ``PretrainedConfig.__init__``, every named argument is
    stored unchanged on the instance under an attribute of the same name.

    NOTE(review): the groupings and one-line descriptions below are inferred
    from the parameter names only; this class never interprets the values.
    Confirm the exact semantics against the modeling code that consumes
    this configuration.

    Args:
        input_dim: Presumably the per-frame input feature dimension.
        use_framewise_subsample: Toggle for a frame-wise subsampling
            front-end (see ``conv_subsample_*``).
        conv_subsample_channels: Channel count for that front-end.
        conv_subsample_rate: Subsampling rate for that front-end.
        use_patchwise_subsample: Toggle for a patch-wise subsampling
            front-end (see ``patch_size_*``).
        patch_size_time: Patch extent along the time axis.
        patch_size_freq: Patch extent along the frequency axis.
        subsample_normalization: Toggle for normalization in the
            subsampling stage.
        conv_pos: Toggle for a convolutional positional component.
        conv_pos_depth: Depth setting of that component.
        conv_pos_width: Width setting of that component.
        conv_pos_groups: Groups setting of that component.
        encoder_dim: Encoder hidden size.
        num_layers: Number of encoder layers.
        attention_type: Name of the attention variant; defaults to
            ``"mhsa"``.
        num_attention_heads: Number of attention heads.
        feed_forward_expansion_factor: Expansion factor of the
            feed-forward module.
        conv_expansion_factor: Expansion factor of the convolution module.
        input_dropout_p: Dropout probability on the input.
        feed_forward_dropout_p: Dropout probability in the feed-forward
            module.
        attention_dropout_p: Dropout probability in the attention module.
        conv_dropout_p: Dropout probability in the convolution module.
        conv_kernel_size: Kernel size of the convolution module.
        half_step_residual: Architecture flag, stored as given.
        transformer_style: Architecture flag, stored as given.
        layerdrop_p: Presumably the layer-drop probability.
        usad_v2: Architecture flag, stored as given.
        pre_norm: Architecture flag, stored as given.
        rms_norm: Architecture flag, stored as given.
        sample_rate: Presumably the expected audio sample rate in Hz.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "usad2"

    def __init__(
        self,
        input_dim: int = 128,
        use_framewise_subsample: bool = True,
        conv_subsample_channels: int = 64,
        conv_subsample_rate: int = 2,
        use_patchwise_subsample: bool = False,
        patch_size_time: int = 16,
        patch_size_freq: int = 16,
        subsample_normalization: bool = True,
        conv_pos: bool = True,
        conv_pos_depth: int = 5,
        conv_pos_width: int = 95,
        conv_pos_groups: int = 16,
        encoder_dim: int = 384,
        num_layers: int = 12,
        attention_type: str = "mhsa",
        num_attention_heads: int = 8,
        feed_forward_expansion_factor: int = 4,
        conv_expansion_factor: int = 2,
        input_dropout_p: float = 0.0,
        feed_forward_dropout_p: float = 0.0,
        attention_dropout_p: float = 0.0,
        conv_dropout_p: float = 0.0,
        conv_kernel_size: int = 31,
        half_step_residual: bool = True,
        transformer_style: bool = True,
        layerdrop_p: float = 0.0,
        usad_v2: bool = True,
        pre_norm: bool = False,
        rms_norm: bool = False,
        sample_rate: int = 16000,
        **kwargs,
    ):
        # The base initializer runs first, exactly as in the original, so the
        # explicit assignments below are applied after whatever it sets up.
        super().__init__(**kwargs)

        # Input features and subsampling options.
        self.input_dim = input_dim
        self.use_framewise_subsample = use_framewise_subsample
        self.conv_subsample_channels = conv_subsample_channels
        self.conv_subsample_rate = conv_subsample_rate
        self.use_patchwise_subsample = use_patchwise_subsample
        self.patch_size_time = patch_size_time
        self.patch_size_freq = patch_size_freq
        self.subsample_normalization = subsample_normalization

        # Convolutional positional options.
        self.conv_pos = conv_pos
        self.conv_pos_depth = conv_pos_depth
        self.conv_pos_width = conv_pos_width
        self.conv_pos_groups = conv_pos_groups

        # Encoder size and module hyperparameters.
        self.encoder_dim = encoder_dim
        self.num_layers = num_layers
        self.attention_type = attention_type
        self.num_attention_heads = num_attention_heads
        self.feed_forward_expansion_factor = feed_forward_expansion_factor
        self.conv_expansion_factor = conv_expansion_factor

        # Dropout probabilities.
        self.input_dropout_p = input_dropout_p
        self.feed_forward_dropout_p = feed_forward_dropout_p
        self.attention_dropout_p = attention_dropout_p
        self.conv_dropout_p = conv_dropout_p

        # Remaining architecture switches.
        self.conv_kernel_size = conv_kernel_size
        self.half_step_residual = half_step_residual
        self.transformer_style = transformer_style
        self.layerdrop_p = layerdrop_p
        self.usad_v2 = usad_v2
        self.pre_norm = pre_norm
        self.rms_norm = rms_norm

        # Audio front-end setting.
        self.sample_rate = sample_rate