|
|
| """
|
| 音频后处理模块 - 齿音和呼吸音处理
|
| 基于研究文献的最佳实践
|
| """
|
| import numpy as np
|
| from scipy import signal
|
| from typing import Optional
|
|
|
|
|
def detect_sibilance_frames(audio: np.ndarray, sr: int, threshold_db: float = -20.0) -> np.ndarray:
    """
    Detect sibilant frames (high-frequency consonants such as s, sh, ch, z).

    Reference: "Managing Sibilance" - Sound on Sound.
    Sibilance energy is concentrated mainly in the 4-10 kHz band.

    Args:
        audio: Audio samples (1-D float array).
        sr: Sample rate in Hz.
        threshold_db: High-frequency energy threshold in dB.

    Returns:
        Boolean array with one entry per 20 ms frame (10 ms hop); True marks
        a sibilant frame. Empty array if the audio is shorter than one frame.
    """
    nyquist = sr / 2
    low_freq = 4000 / nyquist
    high_freq = min(10000 / nyquist, 0.99)  # clamp below Nyquist for low sample rates

    # Isolate the sibilance band with a 4th-order Butterworth band-pass.
    sos = signal.butter(4, [low_freq, high_freq], btype='band', output='sos')
    high_freq_audio = signal.sosfilt(sos, audio)

    frame_length = int(0.02 * sr)  # 20 ms analysis window
    hop_length = int(0.01 * sr)    # 10 ms hop (50% overlap)

    # Guard: audio shorter than one frame would yield a non-positive frame
    # count and make np.zeros() raise; report "no frames" instead.
    if len(audio) < frame_length:
        return np.zeros(0, dtype=bool)

    n_frames = 1 + (len(audio) - frame_length) // hop_length
    high_energy = np.zeros(n_frames)
    total_energy = np.zeros(n_frames)

    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break
        high_energy[i] = np.sum(high_freq_audio[start:end] ** 2)
        total_energy[i] = np.sum(audio[start:end] ** 2)

    # Fraction of each frame's energy that lies in the sibilance band
    # (guard against division by zero on silent frames).
    high_ratio = np.zeros_like(high_energy)
    mask = total_energy > 1e-10
    high_ratio[mask] = high_energy[mask] / total_energy[mask]

    high_energy_db = 10 * np.log10(high_energy + 1e-10)

    # A frame is sibilant when it is both loud enough in-band AND the band
    # dominates the frame's total energy.
    is_sibilance = (high_energy_db > threshold_db) & (high_ratio > 0.3)

    return is_sibilance
|
|
|
|
|
def reduce_sibilance(audio: np.ndarray, sr: int, reduction_db: float = 6.0) -> np.ndarray:
    """
    Reduce sibilance (de-essing).

    Reference: "Advanced Sibilance Control" - Mike's Mix Master.
    Applies a frame-wise gain dip in the style of multi-band dynamic
    compression: each detected sibilant frame gets a fade-in / sustain /
    fade-out attenuation envelope.

    Args:
        audio: Audio samples (1-D float array).
        sr: Sample rate in Hz.
        reduction_db: Attenuation applied to sibilant frames, in dB.

    Returns:
        Processed audio. The input is returned unchanged when no sibilance
        is detected.
    """
    sibilance_frames = detect_sibilance_frames(audio, sr)

    if not np.any(sibilance_frames):
        return audio

    frame_length = int(0.02 * sr)
    hop_length = int(0.01 * sr)

    gain_curve = np.ones(len(audio))
    reduction_factor = 10 ** (-reduction_db / 20)

    # The attenuation envelope is identical for every frame, so build it
    # once outside the loop instead of re-allocating it per sibilant frame.
    fade_in = np.linspace(1.0, reduction_factor, frame_length // 4)
    sustain = np.full(frame_length // 2, reduction_factor)
    fade_out = np.linspace(reduction_factor, 1.0, frame_length // 4)
    envelope = np.concatenate([fade_in, sustain, fade_out])

    for i, is_sib in enumerate(sibilance_frames):
        if is_sib:
            start = i * hop_length
            end = start + frame_length
            if end > len(audio):
                break
            # Overlapping frames keep the strongest (smallest) gain.
            gain_curve[start:start + len(envelope)] = np.minimum(
                gain_curve[start:start + len(envelope)],
                envelope
            )

    return audio * gain_curve
|
|
|
|
|
def detect_breath_frames(audio: np.ndarray, sr: int, threshold_db: float = -40.0) -> np.ndarray:
    """
    Detect breath frames.

    Breath characteristics:
      - low energy
      - broadband (noise-like) spectrum
      - usually located between phrases

    Args:
        audio: Audio samples (1-D float array).
        sr: Sample rate in Hz.
        threshold_db: Energy threshold in dB.

    Returns:
        Boolean array with one entry per 20 ms frame (10 ms hop); True marks
        a breath frame. Empty array if the audio is shorter than one frame.
    """
    frame_length = int(0.02 * sr)  # 20 ms analysis window
    hop_length = int(0.01 * sr)    # 10 ms hop

    # Guard: audio shorter than one frame would yield a non-positive frame
    # count and make np.zeros() raise; report "no frames" instead.
    if len(audio) < frame_length:
        return np.zeros(0, dtype=bool)

    n_frames = 1 + (len(audio) - frame_length) // hop_length
    is_breath = np.zeros(n_frames, dtype=bool)

    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break
        frame = audio[start:end]

        energy = np.sum(frame ** 2)
        energy_db = 10 * np.log10(energy + 1e-10)

        # Spectral flatness: geometric / arithmetic mean of the magnitude
        # spectrum — close to 1 for noise, close to 0 for tonal content.
        fft = np.abs(np.fft.rfft(frame))
        geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
        arithmetic_mean = np.mean(fft)
        spectral_flatness = geometric_mean / (arithmetic_mean + 1e-10)

        # Breath = quiet AND noise-like.
        is_breath[i] = (energy_db < threshold_db) and (spectral_flatness > 0.5)

    return is_breath
|
|
|
|
|
def reduce_breath_noise(audio: np.ndarray, sr: int, reduction_db: float = 12.0) -> np.ndarray:
    """
    Reduce breath noise.

    Reference: "How to REALLY Clean Vocals" - Waves.
    Each detected breath frame gets a fade-in / sustain / fade-out
    attenuation envelope to avoid audible gain clicks.

    Args:
        audio: Audio samples (1-D float array).
        sr: Sample rate in Hz.
        reduction_db: Attenuation applied to breath frames, in dB.

    Returns:
        Processed audio. The input is returned unchanged when no breath
        frames are detected.
    """
    breath_frames = detect_breath_frames(audio, sr)

    if not np.any(breath_frames):
        return audio

    frame_length = int(0.02 * sr)
    hop_length = int(0.01 * sr)

    gain_curve = np.ones(len(audio))
    reduction_factor = 10 ** (-reduction_db / 20)

    # The attenuation envelope is identical for every frame, so build it
    # once outside the loop instead of re-allocating it per breath frame.
    fade_length = frame_length // 4
    fade_in = np.linspace(1.0, reduction_factor, fade_length)
    sustain = np.full(frame_length - 2 * fade_length, reduction_factor)
    fade_out = np.linspace(reduction_factor, 1.0, fade_length)
    envelope = np.concatenate([fade_in, sustain, fade_out])

    for i, is_breath in enumerate(breath_frames):
        if is_breath:
            start = i * hop_length
            end = start + frame_length
            if end > len(audio):
                break
            # Overlapping frames keep the strongest (smallest) gain.
            gain_curve[start:start + len(envelope)] = np.minimum(
                gain_curve[start:start + len(envelope)],
                envelope
            )

    return audio * gain_curve
|
|
|
|
|
def apply_vocal_cleanup(
    audio: np.ndarray,
    sr: int,
    reduce_sibilance_enabled: bool = True,
    reduce_breath_enabled: bool = True,
    sibilance_reduction_db: float = 4.0,
    breath_reduction_db: float = 8.0
) -> np.ndarray:
    """
    Run the full vocal cleanup chain.

    Breath-noise reduction is applied first, then de-essing; each stage is
    controlled by its own enable flag.

    Args:
        audio: Audio samples (1-D float array).
        sr: Sample rate in Hz.
        reduce_sibilance_enabled: Whether to attenuate sibilance.
        reduce_breath_enabled: Whether to attenuate breath noise.
        sibilance_reduction_db: De-essing attenuation in dB.
        breath_reduction_db: Breath-noise attenuation in dB.

    Returns:
        The cleaned-up audio.
    """
    cleaned = audio.copy()

    if reduce_breath_enabled:
        cleaned = reduce_breath_noise(cleaned, sr, breath_reduction_db)

    if reduce_sibilance_enabled:
        cleaned = reduce_sibilance(cleaned, sr, sibilance_reduction_db)

    return cleaned
|
|
|