---
library_name: transformers
datasets:
- Blinorot/ALARM-Corpora
base_model:
- Qwen/Qwen3-4B-Thinking-2507
---

# Model Card for AL-MuQ-R

This is a checkpoint for AL-MuQ-R, an audio-understanding reasoning language model proposed in [ALARM: Audio–Language Alignment for Reasoning Models](https://arxiv.org/abs/2603.09556).

For more details regarding the model and its usage, please refer to our [GitHub](https://github.com/Blinorot/ALARM).

## Inference

We provide [vLLM](https://github.com/vllm-project/vllm) support using the [vLLM Prompt Embedding API](https://docs.vllm.ai/en/stable/features/prompt_embeds/).
Since ALARM uses the frozen Qwen3 model as the backbone, `vllm` just runs the original Qwen3 checkpoint, and the ALARM checkpoint is used for extracting LLM input embeddings.
After you have cloned the repo and installed the dependencies, you can run the pretrained model as follows:

```python
# Import libraries
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # optional: pin which GPU vLLM loads the model onto

# run before importing torch because generate_vllm sets the multiprocessing method
from generate_vllm import get_response
from src.model.wrapped_llms.qwen3 import Qwen3AudioWrappedFeatureExtractor

from omegaconf import OmegaConf
from torchaudio.utils import _download_asset
from torchcodec.decoders import AudioDecoder
from transformers import AutoTokenizer
from vllm import LLM


# The model configuration.
# Handles vllm-related configuration and defines feature extractors,
# i.e., audio -> encoder input embedding conversion.
# All other configuration, including model architecture, will be
# loaded from the checkpoint.
default_model_config_name = "src/configs/model/default_inference.yaml"
model_config = OmegaConf.load(default_model_config_name)

# checkpoint_name = which model to run
# Single model version (no inference-time ensemble):
# checkpoint_name='Blinorot/AL-Whisper-Instruct-R'
# ALARM-E embedding fusion-type version (inference-time ensemble):
# checkpoint_name=["Blinorot/ALARM-CA","Blinorot/AL-Whisper-Instruct-R"]
checkpoint_name = "Blinorot/AL-MuQ-R"

device = "cuda"

# Load Tokenizer for Text Processing
tokenizer = AutoTokenizer.from_pretrained(model_config.llm)

# Load ALARM/AL-*-R checkpoints for extraction of LLM input embeddings
if isinstance(checkpoint_name, list):  # ALARM-E-style embedding fusion (inference-time ensemble)
    feature_extractor_list = []
    for name in checkpoint_name:
        # Load weights into the (audio,text)->LLM embeddings converter
        feature_extractor = Qwen3AudioWrappedFeatureExtractor(
            model_config=model_config,
            checkpoint_name=name,
            tokenizer=tokenizer,
        )
        feature_extractor.to(device)
        feature_extractor_list.append(feature_extractor)
    # hand the whole list to get_response for inference-time ensembling
    feature_extractor = feature_extractor_list
else:  # Single Model version (no inference-time ensemble)
    # Load weights into the (audio,text)->LLM embeddings converter
    feature_extractor = Qwen3AudioWrappedFeatureExtractor(
        model_config=model_config,
        checkpoint_name=checkpoint_name,
        tokenizer=tokenizer,
    )
    feature_extractor.to(device)

# Start the offline vLLM instance of original Qwen3 RLM
# Model will be loaded to CUDA_VISIBLE_DEVICES id
llm = LLM(
    model_config.llm,
    enable_prefix_caching=True,
    max_model_len=model_config.max_model_len,
    max_num_seqs=model_config.max_num_seq,
    max_num_batched_tokens=model_config.max_num_batched_tokens,
    gpu_memory_utilization=model_config.gpu_memory_utilization,
    enable_prompt_embeds=True,  # required so llm accepts the embeddings produced above
)

# Set sampling arguments for the RLM
sample = llm.get_default_sampling_params()
sample.seed = model_config.seed
sample.max_tokens = model_config.max_tokens

# Define audio and prompt
# Audio must come from torchcodec.AudioDecoder
audio_example_path = _download_asset("tutorial-assets/ctc-decoding/1688-142285-0007.wav")
audio = AudioDecoder(audio_example_path)
prompt = "Describe the audio content."

# Define a system prompt
system_prompt = "You are an audio-understanding model."

# Obtain response from Audio RLM
response = get_response(
    prompts=[prompt],  # list of all the prompts
    audio_list=[audio],  # list of corresponding audio
    llm=llm,
    feature_extractor=feature_extractor,
    sample=sample,
    tokenizer=tokenizer,
    system_prompt=system_prompt,
    max_thinking_tokens=model_config.max_thinking_tokens,  # controls thinking budget for the RLM
    debug=False,
)

# Response is a list of responses, one per each (prompt, audio) input pair
# We have only one input pair, so the final response is at index 0
response = response[0]

print(f"Model response:\n\n{response}")
```

## Citation

If you use this work, please cite:

```bibtex
@article{grinberg2026alarm,
    title={ALARM: Audio-Language Alignment for Reasoning Models},
    author={Grinberg, Petr and Shahmohammadi, Hassan},
    journal={arXiv preprint arXiv:2603.09556},
    year={2026}
}
```

## License

The model checkpoint is licensed under the Creative Commons Attribution-NonCommercial 4.0 (CC BY-NC 4.0) license.
It may only be used for non-commercial research purposes.