| | |
| |
|
| | use crate::{ |
| | audio::{load_audio, save_audio, AudioConfig, AudioData}, |
| | config::Config, |
| | model::{EmotionEncoder, SamplingStrategy, SemanticEncoder, SpeakerEncoder}, |
| | text::{TextNormalizer, TextTokenizer, TokenizerConfig}, |
| | vocoder::{BigVGAN, BigVGANConfig, Vocoder}, Result, |
| | }; |
| | use ndarray::Array1; |
| | use std::path::{Path, PathBuf}; |
| | use std::time::Instant; |
| |
|
| | |
| | #[derive(Debug, Clone)] |
| | pub struct SynthesisOptions { |
| | |
| | pub emotion_vector: Option<Vec<f32>>, |
| | |
| | pub emotion_audio: Option<PathBuf>, |
| | |
| | pub emotion_alpha: f32, |
| | |
| | pub sampling: SamplingStrategy, |
| | |
| | pub repetition_penalty: f32, |
| | |
| | pub max_length: usize, |
| | |
| | pub segment_silence_ms: u32, |
| | } |
| |
|
| | impl Default for SynthesisOptions { |
| | fn default() -> Self { |
| | Self { |
| | emotion_vector: None, |
| | emotion_audio: None, |
| | emotion_alpha: 1.0, |
| | sampling: SamplingStrategy::TopKP { k: 50, p: 0.95 }, |
| | repetition_penalty: 1.1, |
| | max_length: 250, |
| | segment_silence_ms: 200, |
| | } |
| | } |
| | } |
| |
|
| | |
/// Output of a synthesis call: the generated samples plus timing statistics.
#[derive(Debug)]
pub struct SynthesisResult {
    /// Generated audio samples.
    pub audio: Vec<f32>,

    /// Sample rate of `audio`, as reported by the vocoder.
    pub sample_rate: u32,

    /// Length of `audio` in seconds (sample count divided by the sample rate).
    pub duration: f32,

    /// Wall-clock time spent producing the audio, in seconds.
    pub processing_time: f32,

    /// Real-time factor: processing time relative to the audio duration.
    pub rtf: f32,
}
| |
|
| | impl SynthesisResult { |
| | |
| | pub fn save<P: AsRef<Path>>(&self, path: P) -> Result<()> { |
| | let audio_data = AudioData::new(self.audio.clone(), self.sample_rate); |
| | save_audio(path, &audio_data) |
| | } |
| |
|
| | |
| | pub fn duration_formatted(&self) -> String { |
| | let minutes = (self.duration / 60.0) as u32; |
| | let seconds = (self.duration % 60.0) as u32; |
| | format!("{:02}:{:02}", minutes, seconds) |
| | } |
| | } |
| |
|
| | |
| | pub struct IndexTTS { |
| | |
| | normalizer: TextNormalizer, |
| | |
| | tokenizer: TextTokenizer, |
| | |
| | speaker_encoder: SpeakerEncoder, |
| | |
| | emotion_encoder: EmotionEncoder, |
| | |
| | semantic_encoder: SemanticEncoder, |
| | |
| | vocoder: BigVGAN, |
| | |
| | audio_config: AudioConfig, |
| | |
| | config: Config, |
| | } |
| |
|
| | impl IndexTTS { |
| | |
| | pub fn new(config: Config) -> Result<Self> { |
| | config.validate()?; |
| |
|
| | log::info!("Initializing IndexTTS..."); |
| |
|
| | |
| | let normalizer = TextNormalizer::new(); |
| | let tokenizer = TextTokenizer::new(TokenizerConfig { |
| | model_path: config.dataset.bpe_model.display().to_string(), |
| | vocab_size: config.dataset.vocab_size, |
| | ..Default::default() |
| | })?; |
| |
|
| | |
| | let speaker_encoder = SpeakerEncoder::new_placeholder(192); |
| | let emotion_encoder = EmotionEncoder::new( |
| | config.emotions.num_dims, |
| | config.emotions.num.clone(), |
| | 256, |
| | ); |
| | let semantic_encoder = SemanticEncoder::new_placeholder(); |
| |
|
| | |
| | let vocoder_config = BigVGANConfig { |
| | sample_rate: config.s2mel.preprocess.sr, |
| | num_mels: config.s2mel.preprocess.n_mels, |
| | ..Default::default() |
| | }; |
| | let vocoder = BigVGAN::new_fallback(vocoder_config); |
| |
|
| | |
| | let audio_config = AudioConfig { |
| | sample_rate: config.s2mel.preprocess.sr, |
| | n_fft: config.s2mel.preprocess.n_fft, |
| | hop_length: config.s2mel.preprocess.hop_length, |
| | win_length: config.s2mel.preprocess.win_length, |
| | n_mels: config.s2mel.preprocess.n_mels, |
| | fmin: config.s2mel.preprocess.fmin, |
| | fmax: config.s2mel.preprocess.fmax, |
| | }; |
| |
|
| | log::info!("IndexTTS initialized successfully"); |
| |
|
| | Ok(Self { |
| | normalizer, |
| | tokenizer, |
| | speaker_encoder, |
| | emotion_encoder, |
| | semantic_encoder, |
| | vocoder, |
| | audio_config, |
| | config, |
| | }) |
| | } |
| |
|
| | |
| | pub fn load<P: AsRef<Path>>(config_path: P) -> Result<Self> { |
| | let config = Config::load(config_path)?; |
| | Self::new(config) |
| | } |
| |
|
| | |
| | pub fn synthesize( |
| | &self, |
| | text: &str, |
| | speaker_audio_path: &str, |
| | options: &SynthesisOptions, |
| | ) -> Result<SynthesisResult> { |
| | let start_time = Instant::now(); |
| |
|
| | log::info!("Starting synthesis for: {}", &text[..text.len().min(50)]); |
| |
|
| | |
| | log::debug!("Normalizing text..."); |
| | let normalized_text = self.normalizer.normalize(text)?; |
| |
|
| | |
| | log::debug!("Tokenizing text..."); |
| | let tokens = self.tokenizer.encode(&normalized_text)?; |
| | log::debug!("Generated {} tokens", tokens.len()); |
| |
|
| | |
| | log::debug!("Loading speaker audio..."); |
| | let speaker_audio = load_audio(speaker_audio_path, Some(self.audio_config.sample_rate))?; |
| |
|
| | |
| | log::debug!("Extracting speaker embedding..."); |
| | let mel_spec = crate::audio::mel_spectrogram(&speaker_audio.samples, &self.audio_config)?; |
| | let speaker_embedding = self.speaker_encoder.encode(&mel_spec)?; |
| |
|
| | |
| | log::debug!("Extracting semantic codes..."); |
| | let semantic_codes = self |
| | .semantic_encoder |
| | .encode(&speaker_audio.samples, self.audio_config.sample_rate)?; |
| |
|
| | |
| | log::debug!("Preparing emotion conditioning..."); |
| | let emotion_embedding = if let Some(ref emo_vec) = options.emotion_vector { |
| | let emo = self.emotion_encoder.apply_strength(emo_vec, options.emotion_alpha); |
| | self.emotion_encoder.encode(&emo)? |
| | } else { |
| | let neutral = self.emotion_encoder.neutral(); |
| | self.emotion_encoder.encode(&neutral)? |
| | }; |
| |
|
| | |
| | log::debug!("Generating mel spectrogram..."); |
| | let mel_length = (tokens.len() as f32 * 2.5) as usize; |
| | let mel_spec = self.generate_mel_spectrogram( |
| | &tokens, |
| | &semantic_codes, |
| | &speaker_embedding, |
| | &emotion_embedding, |
| | mel_length, |
| | )?; |
| |
|
| | |
| | log::debug!("Running vocoder..."); |
| | let audio = self.vocoder.synthesize(&mel_spec)?; |
| |
|
| | |
| | log::debug!("Post-processing..."); |
| | let audio = self.post_process(&audio); |
| |
|
| | let processing_time = start_time.elapsed().as_secs_f32(); |
| | let duration = audio.len() as f32 / self.vocoder.sample_rate() as f32; |
| | let rtf = processing_time / duration; |
| |
|
| | log::info!( |
| | "Synthesis complete: {:.2}s audio in {:.2}s (RTF: {:.3})", |
| | duration, |
| | processing_time, |
| | rtf |
| | ); |
| |
|
| | Ok(SynthesisResult { |
| | audio, |
| | sample_rate: self.vocoder.sample_rate(), |
| | duration, |
| | processing_time, |
| | rtf, |
| | }) |
| | } |
| |
|
| | |
| | pub fn synthesize_to_file( |
| | &self, |
| | text: &str, |
| | speaker_audio_path: &str, |
| | output_path: &str, |
| | options: &SynthesisOptions, |
| | ) -> Result<SynthesisResult> { |
| | let result = self.synthesize(text, speaker_audio_path, options)?; |
| | result.save(output_path)?; |
| | log::info!("Saved audio to: {}", output_path); |
| | Ok(result) |
| | } |
| |
|
| | |
| | fn generate_mel_spectrogram( |
| | &self, |
| | _tokens: &[i64], |
| | _semantic_codes: &[i64], |
| | _speaker_embedding: &Array1<f32>, |
| | _emotion_embedding: &Array1<f32>, |
| | mel_length: usize, |
| | ) -> Result<ndarray::Array2<f32>> { |
| | |
| | |
| |
|
| | use rand::Rng; |
| | let mut rng = rand::thread_rng(); |
| |
|
| | let n_mels = self.audio_config.n_mels; |
| | let mut mel = ndarray::Array2::zeros((n_mels, mel_length)); |
| |
|
| | |
| | for t in 0..mel_length { |
| | for freq in 0..n_mels { |
| | |
| | let base_value = -4.0 + (freq as f32 / n_mels as f32) * 2.0; |
| | let time_mod = ((t as f32 * 0.1).sin() + 1.0) * 0.5; |
| | let noise = rng.gen_range(-0.5..0.5); |
| | mel[[freq, t]] = base_value + time_mod + noise; |
| | } |
| | } |
| |
|
| | Ok(mel) |
| | } |
| |
|
| | |
| | fn post_process(&self, audio: &[f32]) -> Vec<f32> { |
| | use crate::audio::{normalize_audio_peak, apply_fade}; |
| |
|
| | |
| | let normalized = normalize_audio_peak(audio, 0.89); |
| |
|
| | |
| | let fade_samples = (self.audio_config.sample_rate as f32 * 0.005) as usize; |
| | apply_fade(&normalized, fade_samples, fade_samples) |
| | } |
| |
|
| | |
| | pub fn synthesize_long( |
| | &self, |
| | text: &str, |
| | speaker_audio_path: &str, |
| | options: &SynthesisOptions, |
| | ) -> Result<SynthesisResult> { |
| | let start_time = Instant::now(); |
| |
|
| | |
| | let segments = super::segment_text(text, 100); |
| | log::info!("Split text into {} segments", segments.len()); |
| |
|
| | |
| | let mut audio_segments = Vec::new(); |
| | for (i, segment) in segments.iter().enumerate() { |
| | log::info!("Synthesizing segment {}/{}", i + 1, segments.len()); |
| | let result = self.synthesize(segment, speaker_audio_path, options)?; |
| | audio_segments.push(result.audio); |
| | } |
| |
|
| | |
| | let audio = super::concatenate_audio( |
| | &audio_segments, |
| | options.segment_silence_ms, |
| | self.vocoder.sample_rate(), |
| | ); |
| |
|
| | let processing_time = start_time.elapsed().as_secs_f32(); |
| | let duration = audio.len() as f32 / self.vocoder.sample_rate() as f32; |
| | let rtf = processing_time / duration; |
| |
|
| | Ok(SynthesisResult { |
| | audio, |
| | sample_rate: self.vocoder.sample_rate(), |
| | duration, |
| | processing_time, |
| | rtf, |
| | }) |
| | } |
| |
|
| | |
| | pub fn sample_rate(&self) -> u32 { |
| | self.vocoder.sample_rate() |
| | } |
| |
|
| | |
| | pub fn config(&self) -> &Config { |
| | &self.config |
| | } |
| | } |
| |
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Default options use full emotion strength and top-k/top-p sampling.
    #[test]
    fn test_synthesis_options_default() {
        let defaults = SynthesisOptions::default();

        assert_eq!(defaults.emotion_alpha, 1.0);
        assert!(matches!(defaults.sampling, SamplingStrategy::TopKP { .. }));
    }

    /// 125 seconds is rendered as two minutes and five seconds.
    #[test]
    fn test_synthesis_result_duration() {
        let two_minutes_five = SynthesisResult {
            audio: vec![0.0; 22050 * 125],
            sample_rate: 22050,
            duration: 125.0,
            processing_time: 10.0,
            rtf: 0.08,
        };

        assert_eq!(two_minutes_five.duration_formatted(), "02:05");
    }

    /// Three sentences with a limit of 50 must be split into at least two segments.
    #[test]
    fn test_segment_text() {
        let text = "This is sentence one. This is sentence two. This is sentence three.";
        let pieces = super::super::segment_text(text, 50);

        assert!(pieces.len() >= 2);
    }

    /// Two 100-sample clips joined with 10 ms of silence at 1000 Hz:
    /// 100 + 10 + 100 samples.
    #[test]
    fn test_concatenate_audio() {
        let clips = [vec![1.0f32; 100], vec![2.0f32; 100]];
        let joined = super::super::concatenate_audio(&clips, 10, 1000);

        assert_eq!(joined.len(), 210);
    }
}
| |
|