grider-transwithai committed
Commit 17e678b · verified · 1 Parent(s): 0082832

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
The diff for this file is too large to render. See raw diff
 
chat_template.jinja ADDED
@@ -0,0 +1,74 @@
+ {%- if tools %}
+ {{- '<|BOT|>system
+ ' }}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- messages[0]['content'] + '<|EOT|>' }}
+ {%- else %}
+ {{- 'You are a helpful assistant. Please think step by step and provide your reasoning process within <think> </think> tags, followed by your final answer. Format: <think>your reasoning here</think>your final answer<|EOT|>' }}
+ {%- endif %}
+ {{- '<|BOT|>' }}
+ {{- "tool_json_schemas
+ " }}
+ {{- tools | tojson }}
+ {{- '<|EOT|>' }}
+ {%- else %}
+ {%- if messages[0]['role'] == 'system' %}
+ {{- '<|BOT|>system
+ ' + messages[0]['content'] + '<|EOT|>' }}
+ {%- else %}
+ {{- '<|BOT|>system
+ You are a helpful assistant. Please think step by step and provide your reasoning process within <think> </think> tags, followed by your final answer. Format: <think>your reasoning here</think>your final answer<|EOT|>' }}
+ {%- endif %}
+ {%- endif %}
+ {%- for message in messages %}
+ {%- if message["role"] == "user" %}
+ {{- '<|BOT|>human
+ ' + message["content"] + '<|EOT|>' }}
+ {%- elif (message["role"] == "system" and not loop.first) or (message["role"] == "assistant" and not message["tool_calls"]) %}
+ {{- '<|BOT|>' + message["role"] + '
+ ' + message["content"] + '<|EOT|>' }}
+ {%- elif message["role"] == "assistant" %}
+ {{- '<|BOT|>' + message["role"] + '
+ ' }}
+ {%- if message["content"] %}
+ {{- message["content"] }}
+ {%- endif %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if tool_call["function"] is defined %}
+ {%- set tool_call = tool_call["function"] %}
+ {%- endif %}
+ {{- '<|CALL_START|>' + 'function
+ ' + tool_call["name"] + '
+ ' }}
+ {{- tool_call["arguments"] | tojson }}
+ {{- '<|CALL_END|>' }}
+ {%- endfor %}
+ {{- '<|EOT|>' }}
+ {%- elif message["role"] == "tool" %}
+ {{- '<|BOT|>' }}
+ {%- set ns = namespace(function_name="tool") %}
+ {%- if message["tool_call_id"] %}
+ {%- for prev_msg in messages %}
+ {%- if prev_msg["role"] == "assistant" and prev_msg["tool_calls"] %}
+ {%- for tool_call in prev_msg["tool_calls"] %}
+ {%- if tool_call["id"] == message["tool_call_id"] %}
+ {%- if tool_call["function"] is defined %}
+ {%- set ns.function_name = tool_call["function"]["name"] %}
+ {%- endif %}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {{- 'function_output
+ ' + ns.function_name + '
+ ' }}
+ {{- message["content"] }}
+ {{- '<|EOT|>' }}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|BOT|>assistant
+ <think>
+ ' }}
+ {%- endif %}
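
For context, a minimal sketch of how a template like this is typically rendered through transformers' apply_chat_template; the repository path, tool schema, and messages below are illustrative placeholders, not part of this commit.

# Hedged usage sketch; "path/to/this/repo" and the tool are placeholders.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this/repo")

tools = [{
    "type": "function",
    "function": {  # hypothetical tool, only to exercise the tools branch above
        "name": "get_weather",
        "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
    },
}]
messages = [{"role": "user", "content": "What's the weather in Berlin?"}]

# With add_generation_prompt=True the rendered string ends in '<|BOT|>assistant\n<think>\n',
# matching the generation prompt at the end of the template.
prompt = tokenizer.apply_chat_template(
    messages, tools=tools, tokenize=False, add_generation_prompt=True
)
print(prompt)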
config.json ADDED
@@ -0,0 +1,361 @@
+ {
+   "architectures": [
+     "StepAudio2ForCausalLM"
+   ],
+   "audio_encoder_config": {
+     "adapter_stride": 2,
+     "kernel_size": 3,
+     "llm_dim": 5120,
+     "model_type": "step_audio_2_encoder",
+     "n_audio_ctx": 1500,
+     "n_audio_head": 20,
+     "n_audio_layer": 32,
+     "n_audio_state": 1280,
+     "n_codebook_size": 4096,
+     "n_mels": 128
+   },
+   "auto_map": {
+     "AutoConfig": "configuration_step_audio_2.StepAudio2Config",
+     "AutoModelForCausalLM": "modeling_step_audio_2.StepAudio2ForCausalLM"
+   },
+   "dtype": "bfloat16",
+   "max_window_layers": null,
+   "model_type": "step_audio_2",
+   "quantization_config": {
+     "config_groups": {
+       "group_0": {
+         "format": "nvfp4-pack-quantized",
+         "input_activations": {
+           "actorder": null,
+           "block_structure": null,
+           "dynamic": "local",
+           "group_size": 16,
+           "num_bits": 4,
+           "observer": "static_minmax",
+           "observer_kwargs": {},
+           "scale_dtype": "torch.float8_e4m3fn",
+           "strategy": "tensor_group",
+           "symmetric": true,
+           "type": "float",
+           "zp_dtype": null
+         },
+         "output_activations": null,
+         "targets": [
+           "Linear"
+         ],
+         "weights": {
+           "actorder": null,
+           "block_structure": null,
+           "dynamic": false,
+           "group_size": 16,
+           "num_bits": 4,
+           "observer": "static_minmax",
+           "observer_kwargs": {},
+           "scale_dtype": "torch.float8_e4m3fn",
+           "strategy": "tensor_group",
+           "symmetric": true,
+           "type": "float",
+           "zp_dtype": null
+         }
+       }
+     },
+     "format": "nvfp4-pack-quantized",
+     "global_compression_ratio": null,
+     "ignore": [
+       "encoder.blocks.0.attn.query",
+       "encoder.blocks.0.attn.key",
+       "encoder.blocks.0.attn.value",
+       "encoder.blocks.0.attn.out",
+       "encoder.blocks.0.mlp.0",
+       "encoder.blocks.0.mlp.2",
+       "encoder.blocks.1.attn.query",
+       "encoder.blocks.1.attn.key",
+       "encoder.blocks.1.attn.value",
+       "encoder.blocks.1.attn.out",
+       "encoder.blocks.1.mlp.0",
+       "encoder.blocks.1.mlp.2",
+       "encoder.blocks.2.attn.query",
+       "encoder.blocks.2.attn.key",
+       "encoder.blocks.2.attn.value",
+       "encoder.blocks.2.attn.out",
+       "encoder.blocks.2.mlp.0",
+       "encoder.blocks.2.mlp.2",
+       "encoder.blocks.3.attn.query",
+       "encoder.blocks.3.attn.key",
+       "encoder.blocks.3.attn.value",
+       "encoder.blocks.3.attn.out",
+       "encoder.blocks.3.mlp.0",
+       "encoder.blocks.3.mlp.2",
+       "encoder.blocks.4.attn.query",
+       "encoder.blocks.4.attn.key",
+       "encoder.blocks.4.attn.value",
+       "encoder.blocks.4.attn.out",
+       "encoder.blocks.4.mlp.0",
+       "encoder.blocks.4.mlp.2",
+       "encoder.blocks.5.attn.query",
+       "encoder.blocks.5.attn.key",
+       "encoder.blocks.5.attn.value",
+       "encoder.blocks.5.attn.out",
+       "encoder.blocks.5.mlp.0",
+       "encoder.blocks.5.mlp.2",
+       "encoder.blocks.6.attn.query",
+       "encoder.blocks.6.attn.key",
+       "encoder.blocks.6.attn.value",
+       "encoder.blocks.6.attn.out",
+       "encoder.blocks.6.mlp.0",
+       "encoder.blocks.6.mlp.2",
+       "encoder.blocks.7.attn.query",
+       "encoder.blocks.7.attn.key",
+       "encoder.blocks.7.attn.value",
+       "encoder.blocks.7.attn.out",
+       "encoder.blocks.7.mlp.0",
+       "encoder.blocks.7.mlp.2",
+       "encoder.blocks.8.attn.query",
+       "encoder.blocks.8.attn.key",
+       "encoder.blocks.8.attn.value",
+       "encoder.blocks.8.attn.out",
+       "encoder.blocks.8.mlp.0",
+       "encoder.blocks.8.mlp.2",
+       "encoder.blocks.9.attn.query",
+       "encoder.blocks.9.attn.key",
+       "encoder.blocks.9.attn.value",
+       "encoder.blocks.9.attn.out",
+       "encoder.blocks.9.mlp.0",
+       "encoder.blocks.9.mlp.2",
+       "encoder.blocks.10.attn.query",
+       "encoder.blocks.10.attn.key",
+       "encoder.blocks.10.attn.value",
+       "encoder.blocks.10.attn.out",
+       "encoder.blocks.10.mlp.0",
+       "encoder.blocks.10.mlp.2",
+       "encoder.blocks.11.attn.query",
+       "encoder.blocks.11.attn.key",
+       "encoder.blocks.11.attn.value",
+       "encoder.blocks.11.attn.out",
+       "encoder.blocks.11.mlp.0",
+       "encoder.blocks.11.mlp.2",
+       "encoder.blocks.12.attn.query",
+       "encoder.blocks.12.attn.key",
+       "encoder.blocks.12.attn.value",
+       "encoder.blocks.12.attn.out",
+       "encoder.blocks.12.mlp.0",
+       "encoder.blocks.12.mlp.2",
+       "encoder.blocks.13.attn.query",
+       "encoder.blocks.13.attn.key",
+       "encoder.blocks.13.attn.value",
+       "encoder.blocks.13.attn.out",
+       "encoder.blocks.13.mlp.0",
+       "encoder.blocks.13.mlp.2",
+       "encoder.blocks.14.attn.query",
+       "encoder.blocks.14.attn.key",
+       "encoder.blocks.14.attn.value",
+       "encoder.blocks.14.attn.out",
+       "encoder.blocks.14.mlp.0",
+       "encoder.blocks.14.mlp.2",
+       "encoder.blocks.15.attn.query",
+       "encoder.blocks.15.attn.key",
+       "encoder.blocks.15.attn.value",
+       "encoder.blocks.15.attn.out",
+       "encoder.blocks.15.mlp.0",
+       "encoder.blocks.15.mlp.2",
+       "encoder.blocks.16.attn.query",
+       "encoder.blocks.16.attn.key",
+       "encoder.blocks.16.attn.value",
+       "encoder.blocks.16.attn.out",
+       "encoder.blocks.16.mlp.0",
+       "encoder.blocks.16.mlp.2",
+       "encoder.blocks.17.attn.query",
+       "encoder.blocks.17.attn.key",
+       "encoder.blocks.17.attn.value",
+       "encoder.blocks.17.attn.out",
+       "encoder.blocks.17.mlp.0",
+       "encoder.blocks.17.mlp.2",
+       "encoder.blocks.18.attn.query",
+       "encoder.blocks.18.attn.key",
+       "encoder.blocks.18.attn.value",
+       "encoder.blocks.18.attn.out",
+       "encoder.blocks.18.mlp.0",
+       "encoder.blocks.18.mlp.2",
+       "encoder.blocks.19.attn.query",
+       "encoder.blocks.19.attn.key",
+       "encoder.blocks.19.attn.value",
+       "encoder.blocks.19.attn.out",
+       "encoder.blocks.19.mlp.0",
+       "encoder.blocks.19.mlp.2",
+       "encoder.blocks.20.attn.query",
+       "encoder.blocks.20.attn.key",
+       "encoder.blocks.20.attn.value",
+       "encoder.blocks.20.attn.out",
+       "encoder.blocks.20.mlp.0",
+       "encoder.blocks.20.mlp.2",
+       "encoder.blocks.21.attn.query",
+       "encoder.blocks.21.attn.key",
+       "encoder.blocks.21.attn.value",
+       "encoder.blocks.21.attn.out",
+       "encoder.blocks.21.mlp.0",
+       "encoder.blocks.21.mlp.2",
+       "encoder.blocks.22.attn.query",
+       "encoder.blocks.22.attn.key",
+       "encoder.blocks.22.attn.value",
+       "encoder.blocks.22.attn.out",
+       "encoder.blocks.22.mlp.0",
+       "encoder.blocks.22.mlp.2",
+       "encoder.blocks.23.attn.query",
+       "encoder.blocks.23.attn.key",
+       "encoder.blocks.23.attn.value",
+       "encoder.blocks.23.attn.out",
+       "encoder.blocks.23.mlp.0",
+       "encoder.blocks.23.mlp.2",
+       "encoder.blocks.24.attn.query",
+       "encoder.blocks.24.attn.key",
+       "encoder.blocks.24.attn.value",
+       "encoder.blocks.24.attn.out",
+       "encoder.blocks.24.mlp.0",
+       "encoder.blocks.24.mlp.2",
+       "encoder.blocks.25.attn.query",
+       "encoder.blocks.25.attn.key",
+       "encoder.blocks.25.attn.value",
+       "encoder.blocks.25.attn.out",
+       "encoder.blocks.25.mlp.0",
+       "encoder.blocks.25.mlp.2",
+       "encoder.blocks.26.attn.query",
+       "encoder.blocks.26.attn.key",
+       "encoder.blocks.26.attn.value",
+       "encoder.blocks.26.attn.out",
+       "encoder.blocks.26.mlp.0",
+       "encoder.blocks.26.mlp.2",
+       "encoder.blocks.27.attn.query",
+       "encoder.blocks.27.attn.key",
+       "encoder.blocks.27.attn.value",
+       "encoder.blocks.27.attn.out",
+       "encoder.blocks.27.mlp.0",
+       "encoder.blocks.27.mlp.2",
+       "encoder.blocks.28.attn.query",
+       "encoder.blocks.28.attn.key",
+       "encoder.blocks.28.attn.value",
+       "encoder.blocks.28.attn.out",
+       "encoder.blocks.28.mlp.0",
+       "encoder.blocks.28.mlp.2",
+       "encoder.blocks.29.attn.query",
+       "encoder.blocks.29.attn.key",
+       "encoder.blocks.29.attn.value",
+       "encoder.blocks.29.attn.out",
+       "encoder.blocks.29.mlp.0",
+       "encoder.blocks.29.mlp.2",
+       "encoder.blocks.30.attn.query",
+       "encoder.blocks.30.attn.key",
+       "encoder.blocks.30.attn.value",
+       "encoder.blocks.30.attn.out",
+       "encoder.blocks.30.mlp.0",
+       "encoder.blocks.30.mlp.2",
+       "encoder.blocks.31.attn.query",
+       "encoder.blocks.31.attn.key",
+       "encoder.blocks.31.attn.value",
+       "encoder.blocks.31.attn.out",
+       "encoder.blocks.31.mlp.0",
+       "encoder.blocks.31.mlp.2",
+       "adapter.linear1",
+       "adapter.linear2",
+       "lm_head"
+     ],
+     "kv_cache_scheme": null,
+     "quant_method": "compressed-tensors",
+     "quantization_status": "compressed",
+     "sparsity_config": {},
+     "transform_config": {},
+     "version": "0.13.0"
+   },
+   "sliding_window": 2048,
+   "text_config": {
+     "architectures": [
+       "Qwen2ForCausalLM"
+     ],
+     "attention_dropout": 0.0,
+     "dtype": "bfloat16",
+     "hidden_act": "silu",
+     "hidden_size": 5120,
+     "initializer_range": 0.02,
+     "intermediate_size": 27648,
+     "layer_types": [
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention",
+       "full_attention"
+     ],
+     "max_position_embeddings": 65536,
+     "max_window_layers": 28,
+     "model_type": "qwen2",
+     "num_attention_heads": 40,
+     "num_hidden_layers": 64,
+     "num_key_value_heads": 8,
+     "rms_norm_eps": 1e-05,
+     "rope_scaling": null,
+     "rope_theta": 1000000.0,
+     "sliding_window": null,
+     "use_cache": true,
+     "use_sliding_window": false,
+     "vocab_size": 158720
+   },
+   "transformers_version": "4.57.3",
+   "use_sliding_window": false
+ }
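
Because the config wires the custom classes through auto_map and records a compressed-tensors NVFP4 quantization_config, loading goes through trust_remote_code. A hedged loading sketch follows; the local path is a placeholder, and decompression assumes the compressed-tensors package is installed.

# Hedged sketch; "path/to/this/repo" is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this/repo")
model = AutoModelForCausalLM.from_pretrained(
    "path/to/this/repo",
    trust_remote_code=True,   # resolves StepAudio2ForCausalLM via the auto_map above
    torch_dtype=torch.bfloat16,  # matches "dtype": "bfloat16"
    device_map="auto",
)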
configuration_step_audio_2.py ADDED
@@ -0,0 +1,175 @@
+ from typing import Optional, Union
+
+ from transformers import Qwen2Config
+ from transformers.configuration_utils import PretrainedConfig
+
+
+ class StepAudio2EncoderConfig(PretrainedConfig):
+     model_type = "step_audio_2_encoder"
+
+     def __init__(
+         self,
+         n_mels=128,
+         n_audio_ctx=1500,
+         n_audio_state=512,
+         n_audio_head=8,
+         n_audio_layer=6,
+         llm_dim=4096,
+         kernel_size=3,
+         adapter_stride=2,
+         **kwargs,
+     ):
+         self.n_mels = n_mels
+         self.n_audio_ctx = n_audio_ctx
+         self.n_audio_state = n_audio_state
+         self.n_audio_head = n_audio_head
+         self.n_audio_layer = n_audio_layer
+         self.llm_dim = llm_dim
+         self.kernel_size = kernel_size
+         self.adapter_stride = adapter_stride
+         super().__init__(**kwargs)
+
+ class StepAudio2TextConfig(PretrainedConfig):
+     model_type = "step_audio_2_text"
+
+     def __init__(
+         self,
+         vocab_size=64012,
+         hidden_size=4096,
+         intermediate_size=11008,
+         num_hidden_layers=48,
+         num_attention_heads=32,
+         num_attention_groups=4,
+         num_key_value_heads=4,
+         hidden_act="silu",
+         max_position_embeddings=8192,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         rope_theta=1000000.0,
+         rope_scaling=None,
+         eos_token_id=None,
+         **kwargs
+     ):
+
+         if eos_token_id is not None:
+             if isinstance(eos_token_id, list):
+                 eos_token_id = list(set([151643, 151645, 151665] + eos_token_id))
+             else:
+                 eos_token_id = [151643, 151645, 151665, eos_token_id]
+         else:
+             eos_token_id = [151643, 151645, 151665]
+
+         super().__init__(
+             eos_token_id=eos_token_id,
+             **kwargs)
+
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_attention_groups = num_attention_groups
+         self.num_key_value_heads = num_key_value_heads
+         assert self.num_attention_groups == self.num_key_value_heads, "num_attention_groups must be equal to num_key_value_heads"
+         self.hidden_act = hidden_act
+         self.max_position_embeddings = max_position_embeddings
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+
+         # Get torch_dtype from kwargs if provided
+         torch_dtype = kwargs.get("torch_dtype", getattr(self, "torch_dtype", "bfloat16"))
+
+         self.text_config = Qwen2Config(
+             vocab_size=vocab_size,
+             hidden_size=hidden_size,
+             intermediate_size=intermediate_size,
+             num_hidden_layers=num_hidden_layers,
+             num_attention_heads=num_attention_heads,
+             num_key_value_heads=num_key_value_heads,
+             hidden_act=hidden_act,
+             max_position_embeddings=max_position_embeddings,
+             initializer_range=initializer_range,
+             rms_norm_eps=rms_norm_eps,
+             rope_theta=rope_theta,
+             rope_scaling=rope_scaling,
+             architectures=["Qwen2ForCausalLM"],
+             torch_dtype=torch_dtype,
+         )
+
+ class StepAudio2Config(PretrainedConfig):
+     model_type = "step_audio_2"
+     architectures = ["StepAudio2ForCausalLM"]
+
+     # Support alternative model types and architectures for the 32B model
+     # This allows the config to work with both "step_audio_2" and "step_audio_qwen2" model types
+
+     def __init__(
+         self,
+         audio_encoder_config: Optional[Union[dict, StepAudio2EncoderConfig]] = None,
+         text_config: Optional[Union[dict, StepAudio2TextConfig]] = None,
+         use_sliding_window: bool = False,
+         sliding_window: Optional[int] = 2048,
+         max_window_layers: Optional[int] = None,
+         **kwargs
+     ):
+         kwargs.setdefault("use_sliding_window", use_sliding_window)
+         kwargs.setdefault("sliding_window", sliding_window)
+         if max_window_layers is None:
+             max_window_layers = kwargs.get("num_hidden_layers", None)
+         kwargs.setdefault("max_window_layers", max_window_layers)
+
+         # Save torch_dtype if provided (for the 32B model flat config)
+         if 'torch_dtype' in kwargs:
+             self.torch_dtype = kwargs['torch_dtype']
+
+         super().__init__(**kwargs)
+
+         # Support for the flat config structure (32B model format)
+         # If text_config is None and we have flat config parameters, extract them
+         if text_config is None:
+             # Check if we have flat config parameters (32B model format)
+             flat_text_params = {}
+             text_param_names = [
+                 'vocab_size', 'hidden_size', 'intermediate_size', 'num_hidden_layers',
+                 'num_attention_heads', 'num_attention_groups', 'num_key_value_heads',
+                 'hidden_act', 'max_position_embeddings', 'initializer_range',
+                 'rms_norm_eps', 'rope_theta', 'rope_scaling', 'eos_token_id', 'pad_token_id'
+             ]
+
+             for param_name in text_param_names:
+                 if param_name in kwargs:
+                     flat_text_params[param_name] = kwargs[param_name]
+
+             # Set default hidden_act if not present (the 32B model config doesn't have it)
+             if 'hidden_act' not in flat_text_params:
+                 flat_text_params['hidden_act'] = 'silu'
+
+             # Set default initializer_range if not present
+             if 'initializer_range' not in flat_text_params:
+                 flat_text_params['initializer_range'] = 0.02
+
+             # Also check for torch_dtype
+             if 'torch_dtype' in kwargs:
+                 flat_text_params['torch_dtype'] = kwargs['torch_dtype']
+
+             if flat_text_params:
+                 # We have a flat config, use it to build text_config
+                 text_config = StepAudio2TextConfig(**flat_text_params).text_config
+             else:
+                 # No flat config, use defaults
+                 text_config = StepAudio2TextConfig().text_config
+         elif isinstance(text_config, dict):
+             text_config = StepAudio2TextConfig(**text_config).text_config
+
+         self.text_config = text_config
+
+         if audio_encoder_config is None:
+             # Check if we have a flat audio_encoder_config in kwargs
+             if 'audio_encoder_config' in kwargs and isinstance(kwargs['audio_encoder_config'], dict):
+                 self.audio_encoder_config = StepAudio2EncoderConfig(**kwargs['audio_encoder_config'])
+             else:
+                 self.audio_encoder_config = StepAudio2EncoderConfig()
+         elif isinstance(audio_encoder_config, dict):
+             self.audio_encoder_config = StepAudio2EncoderConfig(**audio_encoder_config)
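
The class above accepts either nested sub-configs or a flat, 32B-style parameter set. A small hedged sketch of both construction paths, with illustrative values (note the assert tying num_attention_groups to num_key_value_heads); it assumes configuration_step_audio_2.py is importable from the working directory.

# Hedged sketch; values are illustrative, not taken from this checkpoint's defaults.
from configuration_step_audio_2 import StepAudio2Config

# Nested form: sub-configs passed explicitly as dicts.
cfg_nested = StepAudio2Config(
    audio_encoder_config={"n_audio_state": 1280, "n_audio_head": 20, "n_audio_layer": 32, "llm_dim": 5120},
    text_config={"vocab_size": 158720, "hidden_size": 5120, "num_hidden_layers": 64},
)

# Flat form (32B-style): text parameters passed directly as kwargs.
cfg_flat = StepAudio2Config(
    vocab_size=158720, hidden_size=5120, intermediate_size=27648,
    num_hidden_layers=64, num_attention_heads=40,
    num_attention_groups=8, num_key_value_heads=8,  # must match, per the assert above
    torch_dtype="bfloat16",
)
assert cfg_flat.text_config.hidden_size == 5120  # text_config ends up as a Qwen2Config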
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "_from_model_config": true,
+   "transformers_version": "4.57.3"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f3b29e25b478f82675d441c4a5aacacfee7be4725b545c2ad5b4701a726d40c
+ size 4952380248
model-00002-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d970d841e5e890ded3cdd46c70a0d2cdd946cb89280ac4631e7d191698c18760
+ size 4937521480
model-00003-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bba8fbf97f165d187a282711e56b8e4856001b2aa1d42950cb1d2b5b9dd52c61
+ size 4937521480
model-00004-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2e727ab22a1a25117641dc3df8936d0baca6c98319463f492f6ed86ba159f17
+ size 4997834160
model-00005-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d709ee0b879644343deb6973984b3261b251b9d26fdfc3656aebd2a6914e4b74
+ size 2291022848
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
modeling_step_audio_2.py ADDED
@@ -0,0 +1,430 @@
+ from typing import Iterable, Optional, Tuple
+
+ import librosa
+ import torch
+ import torch.nn.functional as F
+ import torchaudio
+ from torch import Tensor, nn
+ from transformers import PreTrainedModel, Qwen2Model
+ from transformers.generation.utils import GenerationMixin
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+ from .configuration_step_audio_2 import StepAudio2Config
+
+
+ def _mel_filters(n_mels: int) -> torch.Tensor:
+     """Load the mel filterbank matrix for projecting STFT into a Mel spectrogram."""
+     assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}"
+     if n_mels == 128:
+         return torch.from_numpy(librosa.filters.mel(sr=16000, n_fft=400, n_mels=128))
+     else:
+         return torch.from_numpy(librosa.filters.mel(sr=16000, n_fft=400, n_mels=80))
+
+
+ def load_audio(file_path, target_rate=16000, max_length=None):
+     """
+     Open an audio file and read it as a mono waveform, resampling as necessary.
+     If max_length is provided, truncate the audio to that length.
+     """
+     waveform, sample_rate = torchaudio.load(file_path)
+     if sample_rate != target_rate:
+         waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)(waveform)
+     audio = waveform[0]  # get the first channel
+
+     # Truncate audio if it exceeds max_length
+     if max_length is not None and audio.shape[0] > max_length:
+         audio = audio[:max_length]
+
+     return audio
+
+ def log_mel_spectrogram(audio, n_mels=128, padding=479, device=None):
+     """
+     Compute the log-Mel spectrogram with specific padding for StepAudio.
+     """
+     if not torch.is_tensor(audio):
+         if isinstance(audio, str):
+             audio = load_audio(audio)  # load_audio already returns a tensor
+         else:
+             audio = torch.from_numpy(audio)
+     if device is not None:
+         audio = audio.to(device)
+     if padding > 0:
+         audio = F.pad(audio, (0, padding))
+     window = torch.hann_window(400).to(audio.device)
+     stft = torch.stft(audio, 400, 160, window=window, return_complex=True)
+     magnitudes = stft[..., :-1].abs() ** 2
+     filters = _mel_filters(n_mels)
+     mel_spec = filters @ magnitudes
+
+     log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+     log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
+     log_spec = (log_spec + 4.0) / 4.0
+     return log_spec
+
+ def compute_token_num(max_feature_len):
+     # First, audio goes through the encoder:
+     # 1. conv1: kernel=3, stride=1, padding=1 -> size unchanged
+     # 2. conv2: kernel=3, stride=2, padding=1 -> size/2
+     # 3. avg_pooler: kernel=2, stride=2 -> size/2
+     max_feature_len = max_feature_len - 2  # remove padding
+     encoder_output_dim = (max_feature_len + 1) // 2 // 2  # after conv2 and avg_pooler
+
+     # Then through the adaptor (parameters from the config file):
+     padding = 1
+     kernel_size = 3  # from config: audio_encoder_config.kernel_size
+     stride = 2  # from config: audio_encoder_config.adapter_stride
+     adapter_output_dim = (encoder_output_dim + 2 * padding - kernel_size) // stride + 1
+     return adapter_output_dim
+
+ def make_non_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
+     """Make a mask tensor marking the non-padded part.
+
+     The sequences in a batch may have different lengths. To enable
+     batched computation, padding is needed to bring all sequences to
+     the same size. To keep the padded part from feeding values into
+     context-dependent blocks such as attention or convolution, that
+     padded part is masked.
+
+     1 for non-padded part and 0 for padded part.
+
+     Parameters
+     ----------
+     lengths (torch.Tensor): Batch of lengths (B,).
+
+     Returns
+     -------
+     torch.Tensor: Mask tensor marking the non-padded part (B, max_T).
+
+     Examples:
+         >>> import torch
+         >>> import s3tokenizer
+         >>> lengths = torch.tensor([5, 3, 2])
+         >>> masks = s3tokenizer.make_non_pad_mask(lengths)
+         masks = [[1, 1, 1, 1, 1],
+                  [1, 1, 1, 0, 0],
+                  [1, 1, 0, 0, 0]]
+     """
+     batch_size = lengths.size(0)
+     max_len = max_len if max_len > 0 else lengths.max().item()
+     seq_range = torch.arange(0,
+                              max_len,
+                              dtype=torch.int64,
+                              device=lengths.device)
+     seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
+     seq_length_expand = lengths.unsqueeze(-1)
+     mask = seq_range_expand >= seq_length_expand
+     return ~mask
+
+ def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+     """Convert a bool mask to a float bias tensor for flash attention.
+
+     Parameters
+     ----------
+     mask (torch.Tensor): Boolean mask (B, ?).
+
+     Returns
+     -------
+     torch.Tensor: Bias tensor with large negative values at padded positions (B, ?).
+
+     Examples:
+         >>> import torch
+         >>> import s3tokenizer
+         >>> lengths = torch.tensor([5, 3, 2])
+         >>> masks = s3tokenizer.make_non_pad_mask(lengths)
+         masks = [[1, 1, 1, 1, 1],
+                  [1, 1, 1, 0, 0],
+                  [1, 1, 0, 0, 0]]
+         >>> new_masks = s3tokenizer.mask_to_bias(masks, torch.float32)
+         new_masks = [[-0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00, -0.0000e+00],
+                      [-0.0000e+00, -0.0000e+00, -0.0000e+00, -1.0000e+10, -1.0000e+10],
+                      [-0.0000e+00, -0.0000e+00, -1.0000e+10, -1.0000e+10, -1.0000e+10]]
+     """
+     assert mask.dtype == torch.bool
+     assert dtype in [torch.float32, torch.bfloat16, torch.float16]
+     mask = mask.to(dtype)
+     # attention mask bias
+     # NOTE(Mddct): torch.finfo jit issues
+     # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
+     mask = (1.0 - mask) * -1.0e+10
+     return mask
+
+ class LayerNorm(nn.LayerNorm):
+     def forward(self, input: Tensor) -> Tensor:
+         return super().forward(input).type(input.dtype)
+
+ class Linear(nn.Linear):
+     def forward(self, input: Tensor) -> Tensor:
+         return F.linear(
+             input,
+             self.weight.to(input.dtype),
+             None if self.bias is None else self.bias.to(input.dtype),
+         )
+
+ class Conv1d(nn.Conv1d):
+     def _conv_forward(
+         self, input: Tensor, weight: Tensor, bias: Optional[Tensor]
+     ) -> Tensor:
+         return super()._conv_forward(
+             input, weight.to(input.dtype), None if bias is None else bias.to(input.dtype)
+         )
+
+ class MultiHeadAttention(nn.Module):
+     def __init__(self, n_state: int, n_head: int):
+         super().__init__()
+         self.n_head = n_head
+         self.query = Linear(n_state, n_state)
+         self.key = Linear(n_state, n_state, bias=False)
+         self.value = Linear(n_state, n_state)
+         self.out = Linear(n_state, n_state)
+
+     def forward(
+         self,
+         x: Tensor,
+         mask: Optional[Tensor] = None,
+     ):
+         q = self.query(x)
+         k = self.key(x)
+         v = self.value(x)
+
+         wv, qk = self.qkv_attention(q, k, v, mask)
+         return self.out(wv), qk
+
+     def qkv_attention(
+         self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None
+     ):
+         _, T, D = q.shape
+         scale = (D // self.n_head) ** -0.25
+         q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
+         k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
+         v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+
+         qk = q @ k  # (B, n_head, T, T)
+         if mask is not None:
+             qk = qk + mask
+         qk = qk.float()
+
+         w = F.softmax(qk, dim=-1).to(q.dtype)
+         return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach()
+
+ class ResidualAttentionBlock(nn.Module):
+     def __init__(self, n_state: int, n_head: int):
+         super().__init__()
+
+         self.attn = MultiHeadAttention(n_state, n_head)
+         self.attn_ln = LayerNorm(n_state)
+
+         n_mlp = n_state * 4
+         self.mlp = nn.Sequential(
+             Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)
+         )
+         self.mlp_ln = LayerNorm(n_state)
+
+     def forward(
+         self,
+         x: Tensor,
+         mask: Optional[Tensor] = None,
+     ):
+         x = x + self.attn(self.attn_ln(x.contiguous()), mask=mask)[0]
+         x = x + self.mlp(self.mlp_ln(x.contiguous()))
+         return x
+
+ class AudioEncoder(nn.Module):
+     def __init__(
+         self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
+     ):
+         super().__init__()
+         self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1)
+         self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1)
+         self.positional_embedding = nn.Embedding(n_ctx, n_state)
+         self.positional_embedding.requires_grad_(False)
+         self.blocks: Iterable[ResidualAttentionBlock] = nn.ModuleList(
+             [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]
+         )
+         self.avg_pooler = nn.AvgPool1d(2, stride=2)
+         self.after_norm = LayerNorm(n_state)
+         self.gradient_checkpointing = False
+
+     def forward(self, x: Tensor, x_len: Tensor) -> Tuple[Tensor, Tensor]:
+         T = x.size(-1)
+         x = F.gelu(self.conv1(x))
+         x = F.gelu(self.conv2(x))
+         x = x.permute(0, 2, 1)  # (B, T // 2, n_state)
+         mask = make_non_pad_mask(x_len, T).unsqueeze(1)  # (B, 1, T)
+         mask = mask_to_bias(mask[:, :, (T + 1) % 2::2], x.dtype)  # (B, 1, T // 2)
+         x = (x + self.positional_embedding.weight[:x.shape[1], :]).to(x.dtype)
+         for block in self.blocks:
+             if self.gradient_checkpointing and self.training:
+                 x = torch.utils.checkpoint.checkpoint(block, x, mask.unsqueeze(1))
+             else:
+                 x = block(x, mask.unsqueeze(1))
+         x = x.permute(0, 2, 1)
+         x = self.avg_pooler(x)
+         x = x.permute(0, 2, 1)
+         x_len = (x_len + 1) // 2 // 2
+         x = self.after_norm(x.contiguous())
+         return x, x_len
+
+ class Adaptor(nn.Module):
+     def __init__(
+         self,
+         n_state: int = 1280,
+         n_hidden: int = 3072,
+         kernel_size: int = 7,
+         stride: int = 4
+     ):
+         super().__init__()
+         self.stride = stride
+         if self.stride != -1:
+             # print("self.stride: {}".format(self.stride))
+             self.conv = Conv1d(n_state, n_state, kernel_size, stride, padding=1)
+         self.linear1 = nn.Linear(n_state, 2048)
+         self.relu = nn.ReLU()
+         self.linear2 = nn.Linear(2048, n_hidden)
+         self.gradient_checkpointing = False
+
+     def forward(self, x: Tensor) -> Tuple[Tensor]:
+         T = x.size(-1)
+         if self.stride != -1:
+             if self.gradient_checkpointing and self.training:
+                 x = torch.utils.checkpoint.checkpoint(self.conv, x.permute(0, 2, 1))
+                 x = x.permute(0, 2, 1)
+             else:
+                 x = x.permute(0, 2, 1)
+                 x = F.gelu(self.conv(x))
+                 x = x.permute(0, 2, 1)
+         if self.gradient_checkpointing and self.training:
+             x = torch.utils.checkpoint.checkpoint(self.linear1, x)
+             x = torch.utils.checkpoint.checkpoint(self.relu, x)
+             x = torch.utils.checkpoint.checkpoint(self.linear2, x)
+         else:
+             x = self.linear1(x)
+             x = self.relu(x)
+             x = self.linear2(x)
+         return x
+
+ class StepAudio2ForCausalLM(PreTrainedModel, GenerationMixin):
+     config_class = StepAudio2Config
+     main_input_name = "input_ids"
+     # Important: Add this attribute to make HF recognize it as a model with generation capability
+     # _keys_to_ignore_on_load_missing = ["lm_head.weight"]
+     supports_gradient_checkpointing = True  # newly added: declares support for gradient checkpointing
+
+     def __init__(self, config: StepAudio2Config):
+         super().__init__(config)
+         # Handle torch_dtype with default fallback
+         if hasattr(config, 'torch_dtype') and config.torch_dtype is not None:
+             if isinstance(config.torch_dtype, str):
+                 dtype = getattr(torch, config.torch_dtype)
+             else:
+                 dtype = config.torch_dtype
+         else:
+             # Default to bfloat16 if not specified
+             dtype = torch.bfloat16
+         self.model = Qwen2Model(config.text_config)
+         self.bf16 = dtype == torch.bfloat16
+         self.encoder = AudioEncoder(
+             config.audio_encoder_config.n_mels, config.audio_encoder_config.n_audio_ctx, config.audio_encoder_config.n_audio_state,
+             config.audio_encoder_config.n_audio_head, config.audio_encoder_config.n_audio_layer
+         )
+         self.adapter = Adaptor(
+             config.audio_encoder_config.n_audio_state, config.audio_encoder_config.llm_dim,
+             config.audio_encoder_config.kernel_size, config.audio_encoder_config.adapter_stride
+         )
+         if self.bf16:
+             self.encoder = self.encoder.bfloat16()
+             self.adapter = self.adapter.bfloat16()
+         self.lm_head = torch.nn.Linear(
+             config.text_config.hidden_size,
+             config.text_config.vocab_size,
+             bias=False,
+             dtype=dtype
+         )
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids=None,
+         wavs=None,
+         wav_lens=None,
+         attention_mask=None,
+         **kwargs
+     ):
+         hidden_states = self.model.embed_tokens(input_ids)
+         if wavs is not None:
+             if self.bf16:
+                 wavs = wavs.bfloat16()
+             out, feat_lens = self.encoder(wavs, wav_lens)
+             out = self.adapter(out)
+             feat_lens = (feat_lens - 1) // 2 + 1
+             insert_location = torch.nonzero(input_ids == 151688)
+             insert_location[:, 1] += 1
+             for idx in range(len(insert_location)):
+                 i, s = insert_location[idx]
+                 hidden_states[i][s : s + feat_lens[idx]] = out[idx][:feat_lens[idx]]
+
+         x = self.model(inputs_embeds=hidden_states, attention_mask=attention_mask)[0]
+         logits = self.lm_head(x)
+         return CausalLMOutputWithPast(
+             logits=logits,
+             past_key_values=None,
+             hidden_states=None,
+             attentions=None
+         )
+
+     def get_input_embeddings(self):
+         """Return the model's input embeddings - required for GenerationMixin"""
+         return self.model.embed_tokens
+
+     def get_output_embeddings(self):
+         """Return the model's output embeddings (LM head) - required for GenerationMixin"""
+         return self.lm_head
+
+     def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **kwargs):
+         """Prepare inputs for generation - required for GenerationMixin"""
+         # Keep the wavs and wav_lens from the initial call
+         wavs = kwargs.get("wavs", None)
+         wav_lens = kwargs.get("wav_lens", None)
+
+         # For generation steps after the first, we don't need to process audio again
+         # because the audio tokens have already been replaced in the input sequence
+         if "past_key_values" in kwargs and kwargs["past_key_values"] is not None:
+             # We're in a generation step, no need to process audio again
+             return {
+                 "input_ids": input_ids,
+                 "attention_mask": attention_mask,
+                 "past_key_values": kwargs.get("past_key_values")
+             }
+
+         # First generation step, include audio processing
+         return {
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+             "wavs": wavs,
+             "wav_lens": wav_lens
+         }
+
+     def _reorder_cache(self, past_key_values, beam_idx):
+         """Reorder the cache for beam search - required for GenerationMixin if using beam search"""
+         # If you're not using past_key_values or beam search, this can be a simple pass-through
+         # Otherwise implement according to your model's cache structure
+         return past_key_values
+
+     def _set_gradient_checkpointing(self, module, value=False):
+         # For Qwen2Model
+         if hasattr(self.model, 'gradient_checkpointing'):
+             self.model.gradient_checkpointing = value
+
+         # Add the missing _gradient_checkpointing_func method to Qwen2Model
+         # This is what Qwen2Model tries to use when gradient_checkpointing=True
+         if value and not hasattr(self.model, '_gradient_checkpointing_func'):
+             def _gradient_checkpointing_func(module_to_run, *args, **kwargs):
+                 # This function wraps torch.utils.checkpoint.checkpoint
+                 # and is used by Qwen2Model to perform checkpointing
+                 return torch.utils.checkpoint.checkpoint(module_to_run, *args, **kwargs)
+
+             self.model._gradient_checkpointing_func = _gradient_checkpointing_func
+
+         # For custom encoder and adapter
+         if hasattr(self.encoder, 'gradient_checkpointing'):
+             self.encoder.gradient_checkpointing = value
+         if hasattr(self.adapter, 'gradient_checkpointing'):
+             self.adapter.gradient_checkpointing = value
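
The forward pass above splices the adapter output into the token embeddings starting right after each occurrence of token id 151688, so the prompt must reserve enough positions for the audio features. A hedged sketch of the front-end helpers defined in this file; the wav path is a placeholder and shapes are illustrative.

# Hedged sketch; "speech.wav" is a placeholder file, and the module is imported locally.
import torch
from modeling_step_audio_2 import load_audio, log_mel_spectrogram, compute_token_num

wav = load_audio("speech.wav", target_rate=16000)        # mono waveform at 16 kHz
mel = log_mel_spectrogram(wav, n_mels=128, padding=479)  # (128, T) log-Mel features
n_audio_tokens = compute_token_num(mel.shape[-1])        # positions to reserve after the 151688 marker

wavs = mel.unsqueeze(0)                                  # (1, 128, T): the "wavs" argument fed to forward()
wav_lens = torch.tensor([mel.shape[-1]])
print(n_audio_tokens, wavs.shape, wav_lens)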
recipe.yaml ADDED
@@ -0,0 +1,6 @@
+ default_stage:
+   default_modifiers:
+     QuantizationModifier:
+       targets: [Linear]
+       ignore: [lm_head, 're:encoder.*', 're:adapter.*']
+       scheme: NVFP4
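
This recipe mirrors the quantization_config recorded in config.json (NVFP4 on Linear modules, with lm_head, the audio encoder, and the adapter excluded). A hedged sketch of how such a recipe is typically applied with llmcompressor's one-shot flow; import paths and arguments can differ across llmcompressor versions, and the model path is a placeholder.

# Hedged sketch; not part of this commit, API details may vary by llmcompressor release.
from llmcompressor import oneshot
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "path/to/step-audio-2-bf16", trust_remote_code=True, torch_dtype="bfloat16"
)
# Quantizes every Linear except lm_head, encoder.* and adapter.* to NVFP4 and
# writes a compressed-tensors checkpoint like the one in this commit.
oneshot(model=model, recipe="recipe.yaml", output_dir="step-audio-2-nvfp4")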
special_tokens_map.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "additional_special_tokens": [
+     "<|EOT|>",
+     "<|BOT|>",
+     "<|CALL_START|>",
+     "<|CALL_END|>",
+     "<|THINK_START|>",
+     "<|THINK_END|>",
+     "<|IMG_START|>",
+     "<|IMG_END|>",
+     "<|META_START|>",
+     "<|META_END|>",
+     "<im_patch>",
+     "<im_start>",
+     "<im_end>",
+     "<dream>",
+     "<dream_start>",
+     "<dream_end>",
+     "<|MASK_1e69f|>",
+     "<|UNMASK_1e69f|>",
+     "<video_start>",
+     "<video_end>",
+     "<patch_start>",
+     "<patch_end>",
+     "<patch_newline>",
+     "<audio_start>",
+     "<audio_end>",
+     "<audio_patch>",
+     "<audio_patch_pad>",
+     "<|SC|>",
+     "<tts_start>",
+     "<tts_end>",
+     "<tts_pad>"
+   ],
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40bd4f52eb1efcdac503b40bdf65a4f7ffd92bd0c798f0d7bd84f4b5e008c151
+ size 12684896
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff