|
|
| """
|
| 音频后处理模块 - 齿音和呼吸音处理
|
| 基于研究文献的最佳实践
|
| """
|
| import numpy as np
|
| from scipy import signal
|
| from typing import Optional
|
|
|
|
|
def detect_sibilance_frames(audio: np.ndarray, sr: int, threshold_db: float = -20.0) -> np.ndarray:
    """
    Detect sibilant frames (high-frequency consonants such as s, sh, ch, z).

    Reference: "Managing Sibilance" - Sound on Sound.
    Sibilance energy is concentrated mainly in the 4-10 kHz band.

    Args:
        audio: Audio samples (1-D float array).
        sr: Sample rate in Hz.
        threshold_db: High-frequency energy threshold in dB.

    Returns:
        Boolean array with one entry per 20 ms frame (10 ms hop); True marks
        a sibilant frame. Empty array if the audio is shorter than one frame.
    """
    nyquist = sr / 2
    low_freq = 4000 / nyquist
    high_freq = min(10000 / nyquist, 0.99)  # clamp below Nyquist for low sample rates

    # Isolate the sibilance band with a 4th-order Butterworth band-pass.
    sos = signal.butter(4, [low_freq, high_freq], btype='band', output='sos')
    high_freq_audio = signal.sosfilt(sos, audio)

    frame_length = int(0.02 * sr)  # 20 ms analysis window
    hop_length = int(0.01 * sr)    # 10 ms hop (50% overlap)

    # Guard: audio shorter than one frame would yield a non-positive frame
    # count and make np.zeros() raise; report "no frames" instead.
    if len(audio) < frame_length:
        return np.zeros(0, dtype=bool)

    n_frames = 1 + (len(audio) - frame_length) // hop_length
    high_energy = np.zeros(n_frames)
    total_energy = np.zeros(n_frames)

    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break
        high_energy[i] = np.sum(high_freq_audio[start:end] ** 2)
        total_energy[i] = np.sum(audio[start:end] ** 2)

    # Fraction of each frame's energy that lies in the sibilance band
    # (guard against division by zero on silent frames).
    high_ratio = np.zeros_like(high_energy)
    mask = total_energy > 1e-10
    high_ratio[mask] = high_energy[mask] / total_energy[mask]

    high_energy_db = 10 * np.log10(high_energy + 1e-10)

    # A frame is sibilant when it is both loud enough in-band AND the band
    # dominates the frame's total energy.
    is_sibilance = (high_energy_db > threshold_db) & (high_ratio > 0.3)

    return is_sibilance
|
|
|
|
|
def reduce_sibilance(audio: np.ndarray, sr: int, reduction_db: float = 6.0) -> np.ndarray:
    """
    Reduce sibilance (de-essing).

    Reference: "Advanced Sibilance Control" - Mike's Mix Master.
    Applies a frame-wise gain dip in the style of multi-band dynamic
    compression: each detected sibilant frame gets a fade-in / sustain /
    fade-out attenuation envelope.

    Args:
        audio: Audio samples (1-D float array).
        sr: Sample rate in Hz.
        reduction_db: Attenuation applied to sibilant frames, in dB.

    Returns:
        Processed audio. The input is returned unchanged when no sibilance
        is detected.
    """
    sibilance_frames = detect_sibilance_frames(audio, sr)

    if not np.any(sibilance_frames):
        return audio

    frame_length = int(0.02 * sr)
    hop_length = int(0.01 * sr)

    gain_curve = np.ones(len(audio))
    reduction_factor = 10 ** (-reduction_db / 20)

    # The attenuation envelope is identical for every frame, so build it
    # once outside the loop instead of re-allocating it per sibilant frame.
    fade_in = np.linspace(1.0, reduction_factor, frame_length // 4)
    sustain = np.full(frame_length // 2, reduction_factor)
    fade_out = np.linspace(reduction_factor, 1.0, frame_length // 4)
    envelope = np.concatenate([fade_in, sustain, fade_out])

    for i, is_sib in enumerate(sibilance_frames):
        if is_sib:
            start = i * hop_length
            end = start + frame_length
            if end > len(audio):
                break
            # Overlapping frames keep the strongest (smallest) gain.
            gain_curve[start:start + len(envelope)] = np.minimum(
                gain_curve[start:start + len(envelope)],
                envelope
            )

    return audio * gain_curve
|
|
|
|
|
def detect_breath_frames(audio: np.ndarray, sr: int, threshold_db: float = -40.0) -> np.ndarray:
    """
    Detect breath frames.

    Breath characteristics:
      - low energy
      - broadband (noise-like) spectrum
      - usually located between phrases

    Args:
        audio: Audio samples (1-D float array).
        sr: Sample rate in Hz.
        threshold_db: Energy threshold in dB.

    Returns:
        Boolean array with one entry per 20 ms frame (10 ms hop); True marks
        a breath frame. Empty array if the audio is shorter than one frame.
    """
    frame_length = int(0.02 * sr)  # 20 ms analysis window
    hop_length = int(0.01 * sr)    # 10 ms hop

    # Guard: audio shorter than one frame would yield a non-positive frame
    # count and make np.zeros() raise; report "no frames" instead.
    if len(audio) < frame_length:
        return np.zeros(0, dtype=bool)

    n_frames = 1 + (len(audio) - frame_length) // hop_length
    is_breath = np.zeros(n_frames, dtype=bool)

    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break
        frame = audio[start:end]

        energy = np.sum(frame ** 2)
        energy_db = 10 * np.log10(energy + 1e-10)

        # Spectral flatness: geometric / arithmetic mean of the magnitude
        # spectrum — close to 1 for noise, close to 0 for tonal content.
        fft = np.abs(np.fft.rfft(frame))
        geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
        arithmetic_mean = np.mean(fft)
        spectral_flatness = geometric_mean / (arithmetic_mean + 1e-10)

        # Breath = quiet AND noise-like.
        is_breath[i] = (energy_db < threshold_db) and (spectral_flatness > 0.5)

    return is_breath
|
|
|
|
|
def reduce_breath_noise(audio: np.ndarray, sr: int, reduction_db: float = 12.0) -> np.ndarray:
    """
    Reduce breath noise.

    Reference: "How to REALLY Clean Vocals" - Waves.
    Each detected breath frame gets a fade-in / sustain / fade-out
    attenuation envelope to avoid audible gain clicks.

    Args:
        audio: Audio samples (1-D float array).
        sr: Sample rate in Hz.
        reduction_db: Attenuation applied to breath frames, in dB.

    Returns:
        Processed audio. The input is returned unchanged when no breath
        frames are detected.
    """
    breath_frames = detect_breath_frames(audio, sr)

    if not np.any(breath_frames):
        return audio

    frame_length = int(0.02 * sr)
    hop_length = int(0.01 * sr)

    gain_curve = np.ones(len(audio))
    reduction_factor = 10 ** (-reduction_db / 20)

    # The attenuation envelope is identical for every frame, so build it
    # once outside the loop instead of re-allocating it per breath frame.
    fade_length = frame_length // 4
    fade_in = np.linspace(1.0, reduction_factor, fade_length)
    sustain = np.full(frame_length - 2 * fade_length, reduction_factor)
    fade_out = np.linspace(reduction_factor, 1.0, fade_length)
    envelope = np.concatenate([fade_in, sustain, fade_out])

    for i, is_breath in enumerate(breath_frames):
        if is_breath:
            start = i * hop_length
            end = start + frame_length
            if end > len(audio):
                break
            # Overlapping frames keep the strongest (smallest) gain.
            gain_curve[start:start + len(envelope)] = np.minimum(
                gain_curve[start:start + len(envelope)],
                envelope
            )

    return audio * gain_curve
|
|
|
|
|
def apply_vocal_cleanup(
    audio: np.ndarray,
    sr: int,
    reduce_sibilance_enabled: bool = True,
    reduce_breath_enabled: bool = True,
    sibilance_reduction_db: float = 4.0,
    breath_reduction_db: float = 8.0
) -> np.ndarray:
    """
    Run the full vocal cleanup chain.

    Breath-noise reduction is applied first, then de-essing; each stage is
    controlled by its own enable flag.

    Args:
        audio: Audio samples (1-D float array).
        sr: Sample rate in Hz.
        reduce_sibilance_enabled: Whether to attenuate sibilance.
        reduce_breath_enabled: Whether to attenuate breath noise.
        sibilance_reduction_db: De-essing attenuation in dB.
        breath_reduction_db: Breath-noise attenuation in dB.

    Returns:
        The cleaned-up audio.
    """
    cleaned = audio.copy()

    if reduce_breath_enabled:
        cleaned = reduce_breath_noise(cleaned, sr, breath_reduction_db)

    if reduce_sibilance_enabled:
        cleaned = reduce_sibilance(cleaned, sr, sibilance_reduction_db)

    return cleaned
|
|
|