liumaolin committed on
Commit
8178cb9
·
1 Parent(s): c8b2614

Refactor config classes and inference pipeline to improve path handling, weight management, and modularity

Browse files
training_pipeline/configs.py CHANGED
@@ -3,9 +3,11 @@
3
  """
4
 
5
  import os
 
6
  import sys
7
  from dataclasses import dataclass
8
 
 
9
  from .enums import ModelVersion
10
 
11
 
@@ -38,7 +40,7 @@ class AudioSliceConfig(BaseConfig):
38
 
39
  @property
40
  def output_dir(self):
41
- return os.path.join(self.exp_root, self.exp_name, "slicer_opt")
42
 
43
 
44
  @dataclass
@@ -51,11 +53,11 @@ class ASRConfig(BaseConfig):
51
 
52
  @property
53
  def input_dir(self):
54
- return os.path.join(self.exp_root, self.exp_name, "slicer_opt")
55
 
56
  @property
57
  def output_dir(self):
58
- return os.path.join(self.exp_root, self.exp_name, "asr_opt")
59
 
60
 
61
  @dataclass
@@ -76,12 +78,13 @@ class FeatureExtractionConfig(BaseConfig):
76
  @property
77
  def inp_text(self):
78
  """标注文件路径"""
79
- return os.path.join(self.exp_root, self.exp_name, "slicer_opt.list")
80
 
81
  @property
82
  def inp_wav_dir(self):
83
  """音频目录"""
84
- return os.path.join(self.exp_root, self.exp_name, "slicer_opt")
 
85
 
86
 
87
  @dataclass
@@ -99,7 +102,13 @@ class SoVITSTrainConfig(BaseConfig):
99
  if_grad_ckpt: bool = False
100
  lora_rank: int = 32
101
 
 
 
 
 
 
102
 
 
103
  @dataclass
104
  class GPTTrainConfig(BaseConfig):
105
  """GPT训练配置"""
@@ -112,12 +121,43 @@ class GPTTrainConfig(BaseConfig):
112
  if_dpo: bool = False
113
  pretrained_s1: str = 'GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt'
114
 
 
 
 
 
 
 
115
 
116
  @dataclass
117
  class InferenceConfig(BaseConfig):
118
  """推理配置"""
 
 
119
  gpt_path: str = ""
120
  sovits_path: str = ""
121
  bert_path: str = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
122
  cnhubert_base_path: str = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
123
  batched_infer_enabled: bool = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  """
4
 
5
  import os
6
+ import pathlib
7
  import sys
8
  from dataclasses import dataclass
9
 
10
+ from config import SoVITS_weight_version2root, GPT_weight_version2root
11
  from .enums import ModelVersion
12
 
13
 
 
40
 
41
  @property
42
  def output_dir(self):
43
+ return os.path.join(self.exp_dir, "slicer_opt")
44
 
45
 
46
  @dataclass
 
53
 
54
  @property
55
  def input_dir(self):
56
+ return os.path.join(self.exp_dir, "slicer_opt")
57
 
58
  @property
59
  def output_dir(self):
60
+ return os.path.join(self.exp_dir, "asr_opt")
61
 
62
 
63
  @dataclass
 
78
  @property
79
  def inp_text(self):
80
  """标注文件路径"""
81
+ return os.path.join(self.exp_dir, 'asr_opt', "slicer_opt.list")
82
 
83
  @property
84
  def inp_wav_dir(self):
85
  """音频目录"""
86
+ return os.path.join(self.exp_dir, "slicer_opt")
87
+
88
 
89
 
90
  @dataclass
 
102
  if_grad_ckpt: bool = False
103
  lora_rank: int = 32
104
 
105
+ @property
106
+ def output_dir(self):
107
+ _output_dir = os.path.join(self.exp_dir, SoVITS_weight_version2root[self.version.value])
108
+ os.makedirs(_output_dir, exist_ok=True)
109
+ return _output_dir
110
 
111
+
112
  @dataclass
113
  class GPTTrainConfig(BaseConfig):
114
  """GPT训练配置"""
 
121
  if_dpo: bool = False
122
  pretrained_s1: str = 'GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt'
123
 
124
+ @property
125
+ def output_dir(self):
126
+ _output_dir = os.path.join(self.exp_dir, GPT_weight_version2root[self.version.value])
127
+ os.makedirs(_output_dir, exist_ok=True)
128
+ return _output_dir
129
+
130
 
131
  @dataclass
132
  class InferenceConfig(BaseConfig):
133
  """推理配置"""
134
+ version: ModelVersion = ModelVersion.V2_PRO
135
+
136
  gpt_path: str = ""
137
  sovits_path: str = ""
138
  bert_path: str = "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large"
139
  cnhubert_base_path: str = "GPT_SoVITS/pretrained_models/chinese-hubert-base"
140
  batched_infer_enabled: bool = False
141
+
142
+ ref_text: str = ""
143
+ ref_audio_path: str = ""
144
+ target_text: str = ""
145
+ text_split_method: str = "cut1"
146
+
147
+ @property
148
+ def output_dir(self):
149
+ return os.path.join(self.exp_dir, 'inference')
150
+
151
+ def gpt_paths(self) -> list:
152
+ """获取所有 GPT 权重路径"""
153
+ base = pathlib.Path(self.exp_dir) / GPT_weight_version2root[self.version.value]
154
+ if not base.exists():
155
+ return []
156
+ return sorted([item.as_posix() for item in base.iterdir() if item.is_file()])
157
+
158
+ def sovits_paths(self) -> list:
159
+ """获取所有 SoVITS 权重路径"""
160
+ base = pathlib.Path(self.exp_dir) / SoVITS_weight_version2root[self.version.value]
161
+ if not base.exists():
162
+ return []
163
+ return sorted([item.as_posix() for item in base.iterdir() if item.is_file()])
training_pipeline/stages/inference.py CHANGED
@@ -4,50 +4,233 @@
4
  包含:
5
  - InferenceStage: TTS推理
6
  """
7
-
8
  import os
 
 
 
9
  from typing import Dict, Any, Generator
10
 
 
 
 
 
11
  from ..base import BaseStage
12
- from ..enums import StageStatus
13
  from ..configs import InferenceConfig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
 
16
  class InferenceStage(BaseStage):
17
  """1C - TTS推理"""
18
-
19
  def __init__(self, config: InferenceConfig):
20
  super().__init__(config)
21
  self.config: InferenceConfig = config
22
-
23
  @property
24
  def name(self) -> str:
25
  return "TTS推理"
26
-
27
  def validate(self) -> bool:
28
- return self.config.gpt_path != "" and self.config.sovits_path != ""
29
-
 
 
 
 
 
 
30
  def run(self) -> Generator[Dict[str, Any], None, None]:
31
  self._status = StageStatus.RUNNING
32
  cfg = self.config
33
-
34
- # 设置环境变量
35
- os.environ["gpt_path"] = cfg.gpt_path
36
- os.environ["sovits_path"] = cfg.sovits_path
37
- os.environ["cnhubert_base_path"] = cfg.cnhubert_base_path
38
- os.environ["bert_path"] = cfg.bert_path
39
- os.environ["_CUDA_VISIBLE_DEVICES"] = cfg.gpu_numbers
40
- os.environ["is_half"] = str(cfg.is_half)
41
-
42
- if cfg.batched_infer_enabled:
43
- cmd = f'"{cfg.python_exec}" -s GPT_SoVITS/inference_webui_fast.py'
44
  else:
45
- cmd = f'"{cfg.python_exec}" -s GPT_SoVITS/inference_webui.py'
46
-
47
- yield self._make_progress("推理WebUI启动中...", 0.5)
48
-
49
- self._process = self._run_command(cmd, wait=False)
50
-
51
- self._status = StageStatus.COMPLETED
52
- yield self._make_progress("推理WebUI已启动", 1.0)
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  包含:
5
  - InferenceStage: TTS推理
6
  """
 
7
  import os
8
+ import sys
9
+ from itertools import product
10
+ from pathlib import Path
11
  from typing import Dict, Any, Generator
12
 
13
+ import soundfile as sf
14
+
15
+ from GPT_SoVITS.TTS_infer_pack.TTS import TTS_Config, TTS
16
+ from GPT_SoVITS.utils import HParams
17
  from ..base import BaseStage
 
18
  from ..configs import InferenceConfig
19
+ from ..enums import StageStatus
20
+
21
+ if "utils" not in sys.modules:
22
+ class GPTSoVITSFixedUtilsModule:
23
+ HParams = HParams
24
+
25
+
26
+ sys.modules['utils'] = GPTSoVITSFixedUtilsModule
27
+
28
+
29
+ def create_tts_module(cfg: InferenceConfig):
30
+ """创建 TTS 模块
31
+
32
+ Args:
33
+ cfg: 推理配置
34
+
35
+ Returns:
36
+ TTS 模块实例
37
+ """
38
+
39
+ tts_config = TTS_Config({
40
+ "v2": {
41
+ "device": "cpu",
42
+ "is_half": False,
43
+ "version": "v2",
44
+ "t2s_weights_path": cfg.gpt_path,
45
+ "vits_weights_path": cfg.sovits_path,
46
+ "cnhuhbert_base_path": cfg.cnhubert_base_path,
47
+ "bert_base_path": cfg.bert_path,
48
+ },
49
+ })
50
+ return TTS(tts_config)
51
+
52
+
53
+ def create_inference_config(
54
+ text: str,
55
+ ref_audio_path: str,
56
+ prompt_text: str = "",
57
+ text_lang: str = "zh",
58
+ prompt_lang: str = "zh",
59
+ aux_ref_audio_paths: list = None,
60
+ top_k: int = 15,
61
+ top_p: float = 1.0,
62
+ temperature: float = 1.0,
63
+ text_split_method: str = "cut1",
64
+ batch_size: int = 1,
65
+ batch_threshold: float = 0.75,
66
+ split_bucket: bool = True,
67
+ speed_factor: float = 1.0,
68
+ fragment_interval: float = 0.3,
69
+ seed: int = -1,
70
+ parallel_infer: bool = False,
71
+ repetition_penalty: float = 1.35,
72
+ sample_steps: int = 32,
73
+ super_sampling: bool = False,
74
+ return_fragment: bool = False,
75
+ streaming_mode: bool = False,
76
+ overlap_length: int = 2,
77
+ min_chunk_length: int = 16,
78
+ fixed_length_chunk: bool = False,
79
+ ) -> Dict[str, Any]:
80
+ """创建推理配置
81
+
82
+ Args:
83
+ text: 要合成的文本
84
+ ref_audio_path: 参考音频路径
85
+ prompt_text: 参考音频的提示文本
86
+ text_lang: 文本语言
87
+ prompt_lang: 提示文本语言
88
+ aux_ref_audio_paths: 辅助参考音频路径列表,用于多说话人音色融合
89
+ top_k: top k 采样
90
+ top_p: top p 采样
91
+ temperature: 采样温度
92
+ text_split_method: 文本分割方法,详见 text_segmentation_method.py
93
+ batch_size: 推理批次大小
94
+ batch_threshold: 批次分割阈值
95
+ split_bucket: 是否将批次分割成多个桶
96
+ speed_factor: 控制合成音频的速度
97
+ fragment_interval: 控制音频片段的间隔
98
+ seed: 随机种子,用于可复现性
99
+ parallel_infer: 是否使用并行推理
100
+ repetition_penalty: T2S 模型的重复惩罚
101
+ sample_steps: VITS V3 模型的采样步数
102
+ super_sampling: VITS V3 模型是否使用超采样
103
+ return_fragment: 是否逐步返回音频片段(最佳质量,最慢响应)
104
+ streaming_mode: 是否按块返回音频(中等质量,较慢响应)
105
+ overlap_length: 流式模式下语义 token 的重叠长度
106
+ min_chunk_length: 流式模式下语义 token 的最小块长度
107
+ fixed_length_chunk: 是否使用固定长度块(较低质量,更快响应)
108
+
109
+ Returns:
110
+ 推理配置字典
111
+ """
112
+ if aux_ref_audio_paths is None:
113
+ aux_ref_audio_paths = []
114
+
115
+ return {
116
+ "text": text, # str.(required) text to be synthesized
117
+ "text_lang": text_lang, # str.(required) language of the text to be synthesized
118
+ "ref_audio_path": ref_audio_path, # str.(required) reference audio path
119
+ "aux_ref_audio_paths": aux_ref_audio_paths,
120
+ # list.(optional) auxiliary reference audio paths for multi-speaker tone fusion
121
+ "prompt_text": prompt_text, # str.(optional) prompt text for the reference audio
122
+ "prompt_lang": prompt_lang, # str.(required) language of the prompt text for the reference audio
123
+ "top_k": top_k, # int. top k sampling
124
+ "top_p": top_p, # float. top p sampling
125
+ "temperature": temperature, # float. temperature for sampling
126
+ "text_split_method": text_split_method, # str. text split method, see text_segmentation_method.py for details.
127
+ "batch_size": batch_size, # int. batch size for inference
128
+ "batch_threshold": batch_threshold, # float. threshold for batch splitting.
129
+ "split_bucket": split_bucket, # bool. whether to split the batch into multiple buckets.
130
+ "speed_factor": speed_factor, # float. control the speed of the synthesized audio.
131
+ "fragment_interval": fragment_interval, # float. to control the interval of the audio fragment.
132
+ "seed": seed, # int. random seed for reproducibility.
133
+ "parallel_infer": parallel_infer, # bool. whether to use parallel inference.
134
+ "repetition_penalty": repetition_penalty, # float. repetition penalty for T2S model.
135
+ "sample_steps": sample_steps, # int. number of sampling steps for VITS model V3.
136
+ "super_sampling": super_sampling, # bool. whether to use super-sampling for audio when using VITS model V3.
137
+ "return_fragment": return_fragment,
138
+ # bool. step by step return the audio fragment. (Best Quality, Slowest response speed. old version of streaming mode)
139
+ "streaming_mode": streaming_mode, # bool. return audio chunk by chunk. (Medium quality, Slow response speed)
140
+ "overlap_length": overlap_length, # int. overlap length of semantic tokens for streaming mode.
141
+ "min_chunk_length": min_chunk_length,
142
+ # int. The minimum chunk length of semantic tokens for streaming mode. (affects audio chunk size)
143
+ "fixed_length_chunk": fixed_length_chunk,
144
+ # bool. When turned on, it can achieve faster streaming response, but with lower quality. (lower quality, faster response speed)
145
+ }
146
 
147
 
148
  class InferenceStage(BaseStage):
149
  """1C - TTS推理"""
150
+
151
  def __init__(self, config: InferenceConfig):
152
  super().__init__(config)
153
  self.config: InferenceConfig = config
154
+
155
  @property
156
  def name(self) -> str:
157
  return "TTS推理"
158
+
159
  def validate(self) -> bool:
160
+ # 如果指定了具体路径,使用指定的路径
161
+ if self.config.gpt_path and self.config.sovits_path:
162
+ return True
163
+ # 否则检查是否能从实验目录获取路径
164
+ gpt_paths = self.config.gpt_paths()
165
+ sovits_paths = self.config.sovits_paths()
166
+ return len(gpt_paths) > 0 and len(sovits_paths) > 0
167
+
168
  def run(self) -> Generator[Dict[str, Any], None, None]:
169
  self._status = StageStatus.RUNNING
170
  cfg = self.config
171
+
172
+ # 确保输出目录存在
173
+ os.makedirs(cfg.output_dir, exist_ok=True)
174
+
175
+ # 获取所有权重路径
176
+ if cfg.gpt_path and cfg.sovits_path:
177
+ # 使用指定的单一路径
178
+ combinations = [(cfg.gpt_path, cfg.sovits_path)]
 
 
 
179
  else:
180
+ # 获取所有路径进行排列组合
181
+ gpt_paths = cfg.gpt_paths()
182
+ sovits_paths = cfg.sovits_paths()
183
+ combinations = list(product(gpt_paths, sovits_paths))
184
+
185
+ total_combinations = len(combinations)
186
+ yield self._make_progress(f"共 {total_combinations} 个组合待推理", 0.0)
 
187
 
188
+ for idx, (gpt_path, sovits_path) in enumerate(combinations):
189
+ # 提取权重文件名(不含扩展名)
190
+ gpt_name = Path(gpt_path).stem
191
+ sovits_name = Path(sovits_path).stem
192
+
193
+ # 生成独立的输出文件名
194
+ output_filename = f"{cfg.exp_name}_gpt-{gpt_name}_sovits-{sovits_name}.wav"
195
+ output_path = os.path.join(cfg.output_dir, output_filename)
196
+
197
+ progress = (idx / total_combinations)
198
+ yield self._make_progress(
199
+ f"[{idx + 1}/{total_combinations}] GPT: {gpt_name}, SoVITS: {sovits_name}",
200
+ progress
201
+ )
202
+
203
+ # 创建临时配置用于当前组合
204
+ temp_cfg = InferenceConfig(
205
+ exp_name=cfg.exp_name,
206
+ exp_root=cfg.exp_root,
207
+ gpt_path=gpt_path,
208
+ sovits_path=sovits_path,
209
+ bert_path=cfg.bert_path,
210
+ cnhubert_base_path=cfg.cnhubert_base_path,
211
+ ref_text=cfg.ref_text,
212
+ ref_audio_path=cfg.ref_audio_path,
213
+ target_text=cfg.target_text,
214
+ )
215
+
216
+ # 创建 TTS 模块并推理
217
+ module = create_tts_module(temp_cfg)
218
+ inference_config = create_inference_config(
219
+ text=cfg.target_text,
220
+ ref_audio_path=cfg.ref_audio_path,
221
+ prompt_text=cfg.ref_text,
222
+ )
223
+
224
+ for item in module.run(inference_config):
225
+ sample_rate, audio_data = item[0], item[1]
226
+ # 保存到独立的输出文件
227
+ sf.write(output_path, audio_data, sample_rate, subtype='PCM_16')
228
+ break
229
+
230
+ yield self._make_progress(
231
+ f"[{idx + 1}/{total_combinations}] 已保存: {output_filename}",
232
+ (idx + 1) / total_combinations
233
+ )
234
+
235
+ self._status = StageStatus.COMPLETED
236
+ yield self._make_progress(f"推理完成,共生成 {total_combinations} 个音频文件", 1.0)
training_pipeline/stages/training.py CHANGED
@@ -6,16 +6,16 @@
6
  - GPTTrainStage: GPT模型训练
7
  """
8
 
9
- import os
10
  import json
 
11
  from typing import Dict, Any, Generator
12
 
13
  import yaml
14
 
15
  from ..base import BaseStage
16
- from ..enums import StageStatus, ModelVersion
17
  from ..configs import SoVITSTrainConfig, GPTTrainConfig
18
- from config import SoVITS_weight_version2root, GPT_weight_version2root
 
19
 
20
  class SoVITSTrainStage(BaseStage):
21
  """1Ba - SoVITS模型训练"""
@@ -69,8 +69,7 @@ class SoVITSTrainStage(BaseStage):
69
  data["train"]["lora_rank"] = cfg.lora_rank
70
  data["model"]["version"] = version_str
71
  data["data"]["exp_dir"] = data["s2_ckpt_dir"] = s2_dir
72
- data["save_weight_dir"] = SoVITS_weight_version2root[version_str]
73
- os.makedirs(SoVITS_weight_version2root[version_str], exist_ok=True)
74
  data["name"] = cfg.exp_name
75
  data["version"] = version_str
76
 
@@ -136,8 +135,7 @@ class GPTTrainStage(BaseStage):
136
  data["train"]["if_save_every_weights"] = cfg.if_save_every_weights
137
  data["train"]["if_save_latest"] = cfg.if_save_latest
138
  data["train"]["if_dpo"] = cfg.if_dpo
139
- data["train"]["half_weights_save_dir"] = GPT_weight_version2root[cfg.version.value]
140
- os.makedirs(GPT_weight_version2root[cfg.version.value], exist_ok=True)
141
  data["train"]["exp_name"] = cfg.exp_name
142
  data["train_semantic_path"] = f"{s1_dir}/6-name2semantic.tsv"
143
  data["train_phoneme_path"] = f"{s1_dir}/2-name2text.txt"
 
6
  - GPTTrainStage: GPT模型训练
7
  """
8
 
 
9
  import json
10
+ import os
11
  from typing import Dict, Any, Generator
12
 
13
  import yaml
14
 
15
  from ..base import BaseStage
 
16
  from ..configs import SoVITSTrainConfig, GPTTrainConfig
17
+ from ..enums import StageStatus, ModelVersion
18
+
19
 
20
  class SoVITSTrainStage(BaseStage):
21
  """1Ba - SoVITS模型训练"""
 
69
  data["train"]["lora_rank"] = cfg.lora_rank
70
  data["model"]["version"] = version_str
71
  data["data"]["exp_dir"] = data["s2_ckpt_dir"] = s2_dir
72
+ data["save_weight_dir"] = cfg.output_dir
 
73
  data["name"] = cfg.exp_name
74
  data["version"] = version_str
75
 
 
135
  data["train"]["if_save_every_weights"] = cfg.if_save_every_weights
136
  data["train"]["if_save_latest"] = cfg.if_save_latest
137
  data["train"]["if_dpo"] = cfg.if_dpo
138
+ data["train"]["half_weights_save_dir"] = cfg.output_dir
 
139
  data["train"]["exp_name"] = cfg.exp_name
140
  data["train_semantic_path"] = f"{s1_dir}/6-name2semantic.tsv"
141
  data["train_phoneme_path"] = f"{s1_dir}/2-name2text.txt"