liumaolin
feat(api/tts-voice-app): add Cantonese (yue) support and improve language-specific defaults
9845a3d
"""
Quick Mode task schemas.

Request/response models for the one-click training mode aimed at novice users.
Reference: development.md 4.6.1 + 4.6.3
"""
| from datetime import datetime | |
| from typing import List, Literal, Optional | |
| from pydantic import BaseModel, Field | |
class InferenceOptions(BaseModel):
    """
    Optional inference settings for a Quick Mode task.

    Configures the inference test that runs once training has finished. When
    the caller does not supply this object, inference is performed
    automatically with the default values.

    Attributes:
        enabled: Whether the inference stage is enabled.
        ref_audio_path: Path of the reference audio; when omitted, a slice of
            the training audio is used.
        ref_text: Transcript of the reference audio; when omitted, it is
            taken from the ASR result.
        target_text: Text to synthesize.
    """

    enabled: bool = Field(True, description="是否启用推理阶段,默认启用")
    ref_audio_path: Optional[str] = Field(
        None,
        description="参考音频路径,不提供则自动使用训练音频的切片",
    )
    ref_text: Optional[str] = Field(
        None,
        description="参考音频的文本,不提供则从 ASR 结果自动获取",
    )
    target_text: str = Field(
        "这是一段测试语音合成的文本。",
        description="要合成的目标文本",
    )

    # Example payload surfaced in the generated JSON schema.
    model_config = dict(
        json_schema_extra=dict(
            examples=[
                {
                    "enabled": True,
                    "ref_audio_path": None,
                    "ref_text": None,
                    "target_text": "这是一段测试语音合成的文本。",
                },
            ],
        ),
    )
class QuickModeOptions(BaseModel):
    """
    Training options for Quick Mode.

    A simplified set of parameters for one-click training.

    Attributes:
        version: Model version.
        language: Training language.
        quality: Training quality preset.
        inference: Optional inference settings; defaults to ``None``.

    Quality presets:
        - fast: SoVITS 4 epochs, GPT 8 epochs, ~10 minutes
        - standard: SoVITS 8 epochs, GPT 15 epochs, ~20 minutes
        - high: SoVITS 16 epochs, GPT 30 epochs, ~40 minutes
    """

    version: Literal["v1", "v2", "v2Pro", "v3", "v4"] = Field(
        "v2",
        description="模型版本",
    )
    language: Literal["zh", "en", "ja", "ko", "yue"] = Field(
        "zh",
        description="训练语言:zh(中文)、en(英语)、ja(日语)、ko(韩语)、yue(粤语)",
    )
    quality: Literal["fast", "standard", "high"] = Field(
        "standard",
        description="训练质量预设:fast(快速)、standard(标准)、high(高质量)",
    )
    inference: Optional[InferenceOptions] = Field(
        None,
        description="推理配置,不提供则使用默认配置自动推理",
    )

    # Example payload surfaced in the generated JSON schema.
    model_config = dict(
        json_schema_extra=dict(
            examples=[
                {
                    "version": "v2",
                    "language": "zh",
                    "quality": "standard",
                    "inference": {
                        "enabled": True,
                        "target_text": "这是一段测试语音合成的文本。",
                    },
                },
            ],
        ),
    )
class QuickModeRequest(BaseModel):
    """
    One-click training request for novice users.

    Creates a one-click training task; the system configures every parameter
    automatically and runs the full pipeline:
    audio_slice -> asr -> text_feature -> hubert_feature -> semantic_token
    -> sovits_train -> gpt_train -> inference

    Attributes:
        exp_name: Experiment name identifying the training task.
        audio_file_id: ID of an already uploaded audio file.
        options: Training options (including the inference settings).
    """

    exp_name: str = Field(
        ...,
        description="实验名称,用于标识训练任务和生成的模型",
        min_length=1,
        max_length=100,
    )
    audio_file_id: str = Field(..., description="已上传音频文件的 ID")
    options: QuickModeOptions = Field(
        description="训练选项",
        default_factory=QuickModeOptions,
    )

    # Example payload surfaced in the generated JSON schema.
    model_config = dict(
        json_schema_extra=dict(
            examples=[
                {
                    "exp_name": "my_voice",
                    "audio_file_id": "550e8400-e29b-41d4-a716-446655440000",
                    "options": {
                        "version": "v2",
                        "language": "zh",
                        "quality": "standard",
                    },
                },
            ],
        ),
    )
class TaskResponse(BaseModel):
    """
    Task response (Quick Mode).

    Carries the full status of a task, including its progress and the stage
    currently being executed.

    Attributes:
        id: Unique task identifier.
        exp_name: Experiment name.
        status: Task status.
        current_stage: Stage currently being executed.
        progress: Progress of the current stage (0.0-1.0).
        overall_progress: Overall progress (0.0-1.0).
        message: Latest status message.
        error_message: Error message (on failure).
        created_at: Time the task was created.
        started_at: Time the task started executing.
        completed_at: Time the task completed.
    """

    id: str = Field(
        ...,
        description="任务唯一标识",
    )
    exp_name: str = Field(
        ...,
        description="实验名称",
    )
    status: Literal[
        "queued",
        "running",
        "completed",
        "failed",
        "cancelled",
        "interrupted",
    ] = Field(..., description="任务状态")
    current_stage: Optional[str] = Field(
        None,
        description="当前执行的阶段,如 'audio_slice', 'sovits_train' 等",
    )
    progress: float = Field(
        0.0,
        description="当前阶段进度 (0.0-1.0)",
        ge=0.0,
        le=1.0,
    )
    overall_progress: float = Field(
        0.0,
        description="总体进度 (0.0-1.0)",
        ge=0.0,
        le=1.0,
    )
    message: Optional[str] = Field(None, description="最新状态消息")
    error_message: Optional[str] = Field(None, description="错误消息(失败时)")
    created_at: Optional[datetime] = Field(None, description="任务创建时间")
    started_at: Optional[datetime] = Field(None, description="任务开始执行时间")
    completed_at: Optional[datetime] = Field(None, description="任务完成时间")

    # from_attributes is enabled in addition to the schema example below.
    model_config = dict(
        from_attributes=True,
        json_schema_extra=dict(
            examples=[
                {
                    "id": "task-550e8400-e29b-41d4-a716-446655440000",
                    "exp_name": "my_voice",
                    "status": "running",
                    "current_stage": "sovits_train",
                    "progress": 0.45,
                    "overall_progress": 0.72,
                    "message": "SoVITS 训练中 Epoch 8/16",
                    "error_message": None,
                    "created_at": "2024-01-01T10:00:00Z",
                    "started_at": "2024-01-01T10:00:05Z",
                    "completed_at": None,
                },
            ],
        ),
    )
class TaskListResponse(BaseModel):
    """
    Paginated task list response.

    Attributes:
        items: Tasks on this page.
        total: Total number of tasks.
        limit: Page size.
        offset: Offset of the page.
    """

    items: List[TaskResponse] = Field(description="任务列表", default_factory=list)
    total: int = Field(0, description="总数量", ge=0)
    limit: int = Field(50, description="每页数量", ge=1, le=100)
    offset: int = Field(0, description="偏移量", ge=0)

    # Example payload surfaced in the generated JSON schema.
    model_config = dict(
        json_schema_extra=dict(
            examples=[
                {
                    "items": [
                        {
                            "id": "task-123",
                            "exp_name": "voice_1",
                            "status": "completed",
                            "current_stage": None,
                            "progress": 1.0,
                            "overall_progress": 1.0,
                            "message": "训练完成",
                        },
                    ],
                    "total": 1,
                    "limit": 50,
                    "offset": 0,
                },
            ],
        ),
    )
class InferenceOutputItem(BaseModel):
    """
    A single inference output.

    Metadata describing one audio file produced by inference.

    Attributes:
        filename: File name.
        gpt_model: Name of the GPT model used.
        sovits_model: Name of the SoVITS model used.
        gpt_path: Full path of the GPT model.
        sovits_path: Full path of the SoVITS model.
        file_path: Relative path of the file.
        size_bytes: File size in bytes.
        created_at: Creation time.
    """

    filename: str = Field(
        ...,
        description="文件名",
    )
    gpt_model: str = Field(
        ...,
        description="使用的 GPT 模型名称",
    )
    sovits_model: str = Field(
        ...,
        description="使用的 SoVITS 模型名称",
    )
    gpt_path: str = Field(
        ...,
        description="GPT 模型完整路径",
    )
    sovits_path: str = Field(
        ...,
        description="SoVITS 模型完整路径",
    )
    file_path: str = Field(
        ...,
        description="文件相对路径",
    )
    size_bytes: int = Field(
        ...,
        description="文件大小(字节)",
        ge=0,
    )
    created_at: Optional[datetime] = Field(
        None,
        description="创建时间",
    )

    # Example payload surfaced in the generated JSON schema.
    model_config = dict(
        json_schema_extra=dict(
            examples=[
                {
                    "filename": "my_voice_gpt-my_voice_e15_s150-sovits_e8_s200.wav",
                    "gpt_model": "my_voice_e15_s150",
                    "sovits_model": "my_voice_e8_s200",
                    "gpt_path": "logs/my_voice/GPT_weights_v2/my_voice_e15_s150.ckpt",
                    "sovits_path": "logs/my_voice/SoVITS_weights_v2/my_voice_e8_s200.pth",
                    "file_path": "logs/my_voice/inference/my_voice_gpt-my_voice_e15_s150-sovits_e8_s200.wav",
                    "size_bytes": 102400,
                    "created_at": "2024-01-01T12:00:00Z",
                },
            ],
        ),
    )
class InferenceOutputsResponse(BaseModel):
    """
    Inference output list response.

    Lists all inference output files belonging to a task.

    Attributes:
        task_id: Task ID.
        exp_name: Experiment name.
        ref_text: Transcript of the reference audio.
        ref_audio_path: Path of the reference audio.
        target_text: Target text that was synthesized.
        outputs: Inference output files.
        total: Total number of outputs.
    """

    task_id: str = Field(
        ...,
        description="任务 ID",
    )
    exp_name: str = Field(
        ...,
        description="实验名称",
    )
    ref_text: str = Field(
        "",
        description="参考音频文本",
    )
    ref_audio_path: str = Field(
        "",
        description="参考音频路径",
    )
    target_text: str = Field(
        "",
        description="合成的目标文本",
    )
    outputs: List[InferenceOutputItem] = Field(
        description="推理输出文件列表",
        default_factory=list,
    )
    total: int = Field(
        0,
        description="总数量",
        ge=0,
    )

    # Example payload surfaced in the generated JSON schema.
    model_config = dict(
        json_schema_extra=dict(
            examples=[
                {
                    "task_id": "task-123",
                    "exp_name": "my_voice",
                    "ref_text": "大家好,又到了复盘的时间,今天即使。",
                    "ref_audio_path": "logs/my_voice/slicer_opt/audio_0000012160_0000152320.wav",
                    "target_text": "这是一段测试语音合成的文本。",
                    "outputs": [
                        {
                            "filename": "my_voice_gpt-my_voice_e15_s150-sovits_e8_s200.wav",
                            "gpt_model": "my_voice_e15_s150",
                            "sovits_model": "my_voice_e8_s200",
                            "gpt_path": "logs/my_voice/GPT_weights_v2/my_voice_e15_s150.ckpt",
                            "sovits_path": "logs/my_voice/SoVITS_weights_v2/my_voice_e8_s200.pth",
                            "file_path": "logs/my_voice/inference/my_voice_gpt-my_voice_e15_s150-sovits_e8_s200.wav",
                            "size_bytes": 102400,
                            "created_at": "2024-01-01T12:00:00Z",
                        },
                    ],
                    "total": 1,
                },
            ],
        ),
    )