from __future__ import annotations from dataclasses import dataclass from typing import Any, Dict, List from .constants import DEFAULT_ESPEAK_VOICE, EMOTION_TO_SYMBOL, INTENSITY_SYMBOLS @dataclass(frozen=True) class PreparedInput: text: str phonemes: List[str] token_ids: List[int] emotion: str intensity: float emotion_symbol: str intensity_symbol: str def clamp_unit(value: float) -> float: if value != value: return 0.0 if value < 0.0: return 0.0 if value > 1.0: return 1.0 return float(value) def load_token_map(config: dict[str, Any]) -> Dict[str, int]: phoneme_id_map = config.get("phoneme_id_map") if not isinstance(phoneme_id_map, dict): raise KeyError("config.json is missing phoneme_id_map") token_map: Dict[str, int] = {} for symbol, raw_value in phoneme_id_map.items(): if isinstance(raw_value, int): token_map[symbol] = raw_value continue if isinstance(raw_value, list) and len(raw_value) == 1: token_map[symbol] = int(raw_value[0]) continue raise ValueError( f"Unsupported token mapping for symbol {symbol!r}: expected int or single-item list" ) return token_map def intensity_to_symbol(intensity: float) -> str: value = clamp_unit(intensity) idx = int(value * len(INTENSITY_SYMBOLS)) idx = max(0, min(idx, len(INTENSITY_SYMBOLS) - 1)) return INTENSITY_SYMBOLS[idx] def normalize_emotion(emotion: str | None) -> str: value = (emotion or "neutral").strip().lower() if value not in EMOTION_TO_SYMBOL: raise ValueError( f"Unsupported emotion {emotion!r}. Expected one of: {', '.join(EMOTION_TO_SYMBOL)}" ) return value def phonemize_full_utterance(text: str, espeak_voice: str = DEFAULT_ESPEAK_VOICE) -> List[str]: try: from piper_phonemize import phonemize_espeak except ImportError as exc: raise ImportError( "wfloat-tts requires piper-phonemize for phonemization. " "Install it with: pip install \"piper-phonemize==1.3.0\" " "-f https://k2-fsa.github.io/icefall/piper_phonemize" ) from exc sentence_groups = phonemize_espeak(text, espeak_voice) phonemes: List[str] = [] for group in sentence_groups: if not group: continue if phonemes: phonemes.append(" ") phonemes.extend(group) return phonemes def prepare_input( text: str, config: dict[str, Any], emotion: str = "neutral", intensity: float = 0.5, espeak_voice: str = DEFAULT_ESPEAK_VOICE, ) -> PreparedInput: normalized_emotion = normalize_emotion(emotion) normalized_intensity = clamp_unit(intensity) phonemes = phonemize_full_utterance(text, espeak_voice=espeak_voice) emotion_symbol = EMOTION_TO_SYMBOL[normalized_emotion] intensity_symbol = intensity_to_symbol(normalized_intensity) phonemes.extend([emotion_symbol, intensity_symbol]) token_map = load_token_map(config) missing = [symbol for symbol in phonemes if symbol not in token_map] if missing: joined = ", ".join(sorted(set(missing))) raise KeyError(f"Missing symbol(s) in config.json phoneme_id_map: {joined}") token_ids = [token_map[symbol] for symbol in phonemes] return PreparedInput( text=text, phonemes=phonemes, token_ids=token_ids, emotion=normalized_emotion, intensity=normalized_intensity, emotion_symbol=emotion_symbol, intensity_symbol=intensity_symbol, )