|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Processing utilities for MossTTSRealtime.""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
from typing import Iterable, Optional |
|
|
|
|
|
import numpy as np |
|
|
|
|
|
from transformers.processing_utils import ProcessorMixin |
|
|
|
|
|
|
|
|
class MossTTSRealtimeProcessor(ProcessorMixin):
    """Builds MossTTSRealtime prompt inputs with text and audio codebooks.

    This processor prepares the mixed text/audio token layout expected by
    MossTTSRealtime. It does not perform audio encoding/decoding by itself.

    Every array produced here has shape ``(sequence_length, channels + 1)``:
    column 0 carries text-tokenizer ids, columns ``1..channels`` carry audio
    codebook ids, and positions that hold no audio are filled with
    ``audio_channel_pad``.
    """

    attributes = ["tokenizer"]
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        tokenizer,
        audio_pad_token: str = "<|audio_pad|>",
        text_pad_token: str = "<|text_pad|>",
        tts_system_prompt: Optional[str] = None,
        channels: int = 16,
        audio_channel_pad: int = 1024,
        audio_bos_token: int = 1025,
        audio_eos_token: int = 1026,
        delay_tokens_len: int = 12,
    ):
        """Initialize the processor.

        Args:
            tokenizer: Text tokenizer used for the first (text) channel.
            audio_pad_token: Placeholder text token marking audio positions in
                the system prompt.
            text_pad_token: Text token used to pad the text channel while audio
                codes continue.
            tts_system_prompt: Optional override of the default TTS system
                prompt; when ``None`` a built-in English prompt is used.
            channels: Number of audio codebook channels.
            audio_channel_pad: Fill id for audio channels with no audio.
            audio_bos_token: Id written into channel 1 immediately before an
                audio span.
            audio_eos_token: Id written into channel 1 immediately after an
                audio span.
            delay_tokens_len: Offset (in positions) between the start of the
                user text and the start of its audio codes in the layout.

        Raises:
            ValueError: If a pad token cannot be resolved to a single id.
        """
        super().__init__(tokenizer=tokenizer)
        self.audio_pad_token = audio_pad_token
        self.text_pad_token = text_pad_token
        self.channels = channels
        self.audio_channel_pad = audio_channel_pad
        self.audio_bos_token = audio_bos_token
        self.audio_eos_token = audio_eos_token
        self.delay_tokens_len = delay_tokens_len

        self.audio_pad_token_id = self._convert_token_to_id(audio_pad_token)
        self.text_pad_token_id = self._convert_token_to_id(text_pad_token)

        if tts_system_prompt is None:
            tts_system_prompt = (
                "<|im_start|>system\n"
                "You are a highly expressive text-to-speech (TTS) engine developed by Mosi Intelligence. \n"
                "You possess natural language understanding, emotional modeling, and multi-style speech generation "
                "capabilities, allowing you to generate the corresponding speech based on the text given in the assistant."
                "<|im_end|>\n"
            )
        self.tts_system_prompt = tts_system_prompt

    def _convert_token_to_id(self, token: str) -> int:
        """Resolve *token* to a single tokenizer id.

        Tries the fast vocabulary lookup first; falls back to encoding the
        token text (rejecting multi-id results, which would mean the token is
        not a single entry in the vocabulary).

        Raises:
            ValueError: If the token yields zero or more than one id.
        """
        if hasattr(self.tokenizer, "convert_tokens_to_ids"):
            token_id = self.tokenizer.convert_tokens_to_ids(token)
            # An unk result means the token is not in the vocab as-is; fall
            # through to the encode() path instead of returning unk.
            if token_id is not None and token_id != self.tokenizer.unk_token_id:
                return int(token_id)
        token_ids = self.tokenizer.encode(token, add_special_tokens=False)
        if not token_ids:
            raise ValueError(f"Token '{token}' could not be converted to an id.")
        if len(token_ids) != 1:
            raise ValueError(f"Token '{token}' maps to multiple ids: {token_ids}")
        return int(token_ids[0])

    def _make_padded_rows(self, channel0_ids) -> np.ndarray:
        """Return a ``(len(channel0_ids), channels + 1)`` int64 array.

        Audio channels are filled with ``audio_channel_pad``; column 0 is set
        to *channel0_ids* (text-tokenizer ids).
        """
        rows = np.full(
            shape=(len(channel0_ids), self.channels + 1),
            fill_value=self.audio_channel_pad,
            dtype=np.int64,
        )
        rows[:, 0] = channel0_ids
        return rows

    def make_voice_clone_prompt(self, prompt_audio_tokens_len: int) -> str:
        """Return the voice-clone context block.

        Contains one ``audio_pad_token`` per reference-audio frame so that
        :meth:`make_ensemble` can later locate and overwrite those positions
        with the actual audio codes.
        """
        padded_audio_prompt = self.audio_pad_token * prompt_audio_tokens_len
        return (
            "<|im_start|>context\n"
            "The assistant section should be synthesized using the following voice timbre:"
            f"{padded_audio_prompt}"
        )

    def _normalize_audio_tokens(self, audio_tokens: np.ndarray | Iterable) -> np.ndarray:
        """Coerce audio tokens into ``(num_frames, channels)`` layout.

        Accepts either channel-major ``(channels, T)`` or frame-major
        ``(T, channels)`` input. When neither axis matches ``channels``
        exactly, the oversized axis is truncated to ``channels``.

        Raises:
            ValueError: If the input is not 2D or cannot be shaped to
                ``channels`` columns.
        """
        tokens = np.array(audio_tokens)
        if tokens.ndim != 2:
            raise ValueError(f"Expected 2D audio tokens, got shape {tokens.shape}")

        if tokens.shape[0] == self.channels:
            # Channel-major input; note a square (channels, channels) array is
            # treated as channel-major because this branch is checked first.
            tokens = tokens.T
        elif tokens.shape[0] > self.channels and tokens.shape[1] != self.channels:
            # Too many rows and columns don't match: assume axis 0 is the
            # channel axis with extra channels; truncate, then transpose.
            tokens = tokens[: self.channels, :].T
        elif tokens.shape[1] > self.channels and tokens.shape[0] != self.channels:
            # Frame-major with extra channels: truncate columns.
            tokens = tokens[:, : self.channels]
        # Frame-major (T, channels) input falls through unchanged.
        if tokens.shape[1] != self.channels:
            raise ValueError(f"Expected {self.channels} channels, got shape {tokens.shape}")
        return tokens

    def make_ensemble(self, prompt_audio_tokens: Optional[np.ndarray] = None) -> np.ndarray:
        """Build the system-prompt portion of the model input.

        Args:
            prompt_audio_tokens: Optional reference-voice codes for cloning;
                any layout accepted by :meth:`_normalize_audio_tokens`.

        Returns:
            ``(len(system_prompt_tokens), channels + 1)`` int64 array; when
            reference audio is given, its codes overwrite the audio channels
            of the ``audio_pad_token`` span inside the prompt.

        Raises:
            ValueError: If reference audio is given but the tokenized system
                prompt contains no ``audio_pad_token``.
        """
        if prompt_audio_tokens is not None:
            prompt_audio_tokens = self._normalize_audio_tokens(prompt_audio_tokens)
            prompt_audio_tokens = prompt_audio_tokens[:, : self.channels]
            system_prompt_text = self.tts_system_prompt + self.make_voice_clone_prompt(
                prompt_audio_tokens.shape[0]
            )
        else:
            system_prompt_text = self.tts_system_prompt

        system_prompt_tokens = self.tokenizer(system_prompt_text)["input_ids"]
        system_prompt_tokens_full = self._make_padded_rows(system_prompt_tokens)

        if prompt_audio_tokens is not None:
            system_prompt_tokens = np.array(system_prompt_tokens)
            indices = np.where(system_prompt_tokens == self.audio_pad_token_id)[0]
            if indices.size == 0:
                raise ValueError("No <|audio_pad|> tokens found in the system prompt.")
            # Assumes the pad span is contiguous; first..last pad positions
            # must match the number of reference frames.
            prompt_audio_start_pos, prompt_audio_end_pos = indices[0], indices[-1]
            system_prompt_tokens_full[prompt_audio_start_pos : prompt_audio_end_pos + 1, 1:] = prompt_audio_tokens

        return system_prompt_tokens_full

    def make_user_prompt(self, text: str, audio_tokens: np.ndarray) -> np.ndarray:
        """Build the user turn plus assistant header of the model input.

        The text channel carries the user text padded with ``text_pad_token``;
        the audio channels carry *audio_tokens* starting ``delay_tokens_len``
        positions after the text start, bracketed by ``audio_bos_token`` and
        ``audio_eos_token`` in channel 1.

        Args:
            text: User text to synthesize.
            audio_tokens: Audio codes for the turn; any layout accepted by
                :meth:`_normalize_audio_tokens`.

        Returns:
            ``(sequence_length, channels + 1)`` int64 array ending with the
            tokens of the assistant response header.

        Raises:
            ValueError: If the text is too long for the audio span (the
                computed pad length would be negative).
        """
        prefill_temp = "<|im_end|>\n<|im_start|>user\n"
        text_tokens = self.tokenizer(text)["input_ids"]
        text_start_pos = len(self.tokenizer.encode(prefill_temp))
        token = self._normalize_audio_tokens(audio_tokens)

        text_len = len(text_tokens)
        audio_len = token.shape[0]

        if text_len >= self.delay_tokens_len:
            padded_text_len = audio_len + self.delay_tokens_len - text_len + 1
            if padded_text_len < 0:
                # Without this guard the slice assignments below fail with a
                # cryptic numpy broadcast/index error.
                raise ValueError(
                    f"Text of {text_len} tokens is too long for {audio_len} audio frames "
                    f"with delay_tokens_len={self.delay_tokens_len}."
                )
            cur_input_id_ch1 = prefill_temp + text + self.text_pad_token * padded_text_len
            assistant_tokens_ch1 = self.tokenizer(cur_input_id_ch1)["input_ids"]
            cur_input_id = self._make_padded_rows(assistant_tokens_ch1)
            audio_start = text_start_pos + self.delay_tokens_len
            cur_input_id[audio_start : audio_start + audio_len, 1:] = token
            cur_input_id[audio_start - 1, 1] = self.audio_bos_token
            cur_input_id[audio_start + audio_len, 1] = self.audio_eos_token
        else:
            # Short text: align the audio span to the end of the sequence
            # instead of a fixed delay from the text start.
            padded_text_len = audio_len + 1
            cur_input_id_ch1 = prefill_temp + text + self.text_pad_token * padded_text_len
            assistant_tokens_ch1 = self.tokenizer(cur_input_id_ch1)["input_ids"]
            cur_input_id = self._make_padded_rows(assistant_tokens_ch1)
            cur_input_id[-(audio_len + 1) : -1, 1:] = token
            cur_input_id[-(audio_len + 2), 1] = self.audio_bos_token
            cur_input_id[-1, 1] = self.audio_eos_token

        begin_of_response = self.tokenizer.encode("<|im_end|>\n<|im_start|>assistant\n")
        begin_of_response_full = self._make_padded_rows(begin_of_response)

        return np.concatenate([cur_input_id, begin_of_response_full], axis=0)
|
|
|
|
|
|
|
|
# Explicit public API of this module.
__all__ = ["MossTTSRealtimeProcessor"]
|
|
|