liumaolin
feat(api/tts-voice-app): add Cantonese (yue) support and improve language-specific defaults
9845a3d
"""
Quick Mode 任务 Schema
小白用户一键训练模式的请求/响应模型
参考文档: development.md 4.6.1 + 4.6.3
"""
from datetime import datetime
from typing import List, Literal, Optional
from pydantic import BaseModel, Field
class InferenceOptions(BaseModel):
    """
    Optional settings for the inference test that follows training.

    If the caller does not supply this object, inference is performed
    automatically with the default values declared here.

    Attributes:
        enabled: Whether the inference stage is enabled.
        ref_audio_path: Path of the reference audio; when omitted, a slice
            of the training audio is used.
        ref_text: Transcript of the reference audio; when omitted, it is
            taken from the ASR result.
        target_text: Text to synthesize.
    """

    # Inference is opt-out: enabled unless explicitly switched off.
    enabled: bool = Field(description="是否启用推理阶段,默认启用", default=True)

    # Both reference fields are optional; None means "pick automatically".
    ref_audio_path: Optional[str] = Field(
        description="参考音频路径,不提供则自动使用训练音频的切片",
        default=None,
    )
    ref_text: Optional[str] = Field(
        description="参考音频的文本,不提供则从 ASR 结果自动获取",
        default=None,
    )

    # The default sentence is Chinese regardless of the training language.
    target_text: str = Field(
        description="要合成的目标文本",
        default="这是一段测试语音合成的文本。",
    )

    # Example payload published in the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "enabled": True,
                    "ref_audio_path": None,
                    "ref_text": None,
                    "target_text": "这是一段测试语音合成的文本。",
                },
            ],
        },
    }
class QuickModeOptions(BaseModel):
    """
    Training options for Quick Mode.

    A simplified parameter set used by the one-click training flow.

    Attributes:
        version: Model version.
        language: Training language.
        quality: Training quality preset.
        inference: Optional inference settings; None leaves inference to
            run automatically with its default configuration.

    Quality presets (informational; the numbers below are not enforced by
    this schema):
        - fast: SoVITS 4 epochs, GPT 8 epochs, ~10 minutes
        - standard: SoVITS 8 epochs, GPT 15 epochs, ~20 minutes
        - high: SoVITS 16 epochs, GPT 30 epochs, ~40 minutes
    """

    version: Literal["v1", "v2", "v2Pro", "v3", "v4"] = Field(
        description="模型版本",
        default="v2",
    )

    # Supported language codes: zh, en, ja, ko and yue (Cantonese).
    language: Literal["zh", "en", "ja", "ko", "yue"] = Field(
        description="训练语言:zh(中文)、en(英语)、ja(日语)、ko(韩语)、yue(粤语)",
        default="zh",
    )

    quality: Literal["fast", "standard", "high"] = Field(
        description="训练质量预设:fast(快速)、standard(标准)、high(高质量)",
        default="standard",
    )

    inference: Optional[InferenceOptions] = Field(
        description="推理配置,不提供则使用默认配置自动推理",
        default=None,
    )

    # Example payload published in the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "version": "v2",
                    "language": "zh",
                    "quality": "standard",
                    "inference": {
                        "enabled": True,
                        "target_text": "这是一段测试语音合成的文本。",
                    },
                },
            ],
        },
    }
class QuickModeRequest(BaseModel):
    """
    Request body for creating a one-click (Quick Mode) training task.

    The system configures all parameters automatically and runs the full
    pipeline:
    audio_slice -> asr -> text_feature -> hubert_feature -> semantic_token
    -> sovits_train -> gpt_train -> inference

    Attributes:
        exp_name: Experiment name identifying the training task
            (1 to 100 characters).
        audio_file_id: ID of an already uploaded audio file.
        options: Training options, including the inference settings.
    """

    # Required; length is constrained to 1..100 characters.
    exp_name: str = Field(
        ...,
        description="实验名称,用于标识训练任务和生成的模型",
        max_length=100,
        min_length=1,
    )

    # Required; no format constraint is applied to the ID here.
    audio_file_id: str = Field(..., description="已上传音频文件的 ID")

    # A fresh QuickModeOptions with all defaults is built when omitted.
    options: QuickModeOptions = Field(
        description="训练选项",
        default_factory=QuickModeOptions,
    )

    # Example payload published in the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "exp_name": "my_voice",
                    "audio_file_id": "550e8400-e29b-41d4-a716-446655440000",
                    "options": {
                        "version": "v2",
                        "language": "zh",
                        "quality": "standard",
                    },
                },
            ],
        },
    }
class TaskResponse(BaseModel):
    """
    Task response for Quick Mode.

    Carries the full status of a task, including its progress and the
    stage currently being executed.

    Attributes:
        id: Unique task identifier.
        exp_name: Experiment name.
        status: Task status.
        current_stage: Stage currently executing.
        progress: Progress of the current stage (0.0-1.0).
        overall_progress: Overall progress (0.0-1.0).
        message: Latest status message.
        error_message: Error message (on failure).
        created_at: Time the task was created.
        started_at: Time the task started executing.
        completed_at: Time the task completed.
    """

    id: str = Field(
        ...,
        description="任务唯一标识",
    )
    exp_name: str = Field(
        ...,
        description="实验名称",
    )
    status: Literal[
        "queued",
        "running",
        "completed",
        "failed",
        "cancelled",
        "interrupted",
    ] = Field(..., description="任务状态")

    current_stage: Optional[str] = Field(
        description="当前执行的阶段,如 'audio_slice', 'sovits_train' 等",
        default=None,
    )

    # Both progress values are validated to lie within [0.0, 1.0].
    progress: float = Field(
        description="当前阶段进度 (0.0-1.0)",
        default=0.0,
        le=1.0,
        ge=0.0,
    )
    overall_progress: float = Field(
        description="总体进度 (0.0-1.0)",
        default=0.0,
        le=1.0,
        ge=0.0,
    )

    message: Optional[str] = Field(description="最新状态消息", default=None)
    error_message: Optional[str] = Field(description="错误消息(失败时)", default=None)

    # Timestamps default to None when not provided.
    created_at: Optional[datetime] = Field(description="任务创建时间", default=None)
    started_at: Optional[datetime] = Field(description="任务开始执行时间", default=None)
    completed_at: Optional[datetime] = Field(description="任务完成时间", default=None)

    # from_attributes lets the model be populated from an object's
    # attributes rather than only from a mapping.
    model_config = {
        "from_attributes": True,
        "json_schema_extra": {
            "examples": [
                {
                    "id": "task-550e8400-e29b-41d4-a716-446655440000",
                    "exp_name": "my_voice",
                    "status": "running",
                    "current_stage": "sovits_train",
                    "progress": 0.45,
                    "overall_progress": 0.72,
                    "message": "SoVITS 训练中 Epoch 8/16",
                    "error_message": None,
                    "created_at": "2024-01-01T10:00:00Z",
                    "started_at": "2024-01-01T10:00:05Z",
                    "completed_at": None,
                },
            ],
        },
    }
class TaskListResponse(BaseModel):
    """
    Paginated task list response.

    Attributes:
        items: Tasks on this page.
        total: Total number of tasks (non-negative).
        limit: Page size (1 to 100, default 50).
        offset: Offset of this page (non-negative).
    """

    # default_factory gives every instance its own empty list.
    items: List[TaskResponse] = Field(
        description="任务列表",
        default_factory=list,
    )
    total: int = Field(description="总数量", default=0, ge=0)
    limit: int = Field(description="每页数量", default=50, le=100, ge=1)
    offset: int = Field(description="偏移量", default=0, ge=0)

    # Example payload published in the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "items": [
                        {
                            "id": "task-123",
                            "exp_name": "voice_1",
                            "status": "completed",
                            "current_stage": None,
                            "progress": 1.0,
                            "overall_progress": 1.0,
                            "message": "训练完成",
                        },
                    ],
                    "total": 1,
                    "limit": 50,
                    "offset": 0,
                },
            ],
        },
    }
class InferenceOutputItem(BaseModel):
    """
    A single inference output entry.

    Metadata describing one audio file produced by inference.

    Attributes:
        filename: File name.
        gpt_model: Name of the GPT model used.
        sovits_model: Name of the SoVITS model used.
        gpt_path: Full path of the GPT model.
        sovits_path: Full path of the SoVITS model.
        file_path: Relative path of the file.
        size_bytes: File size in bytes (non-negative).
        created_at: Creation time, if known.
    """

    filename: str = Field(
        ...,
        description="文件名",
    )

    # Model names and the paths of the corresponding weight files.
    gpt_model: str = Field(
        ...,
        description="使用的 GPT 模型名称",
    )
    sovits_model: str = Field(
        ...,
        description="使用的 SoVITS 模型名称",
    )
    gpt_path: str = Field(
        ...,
        description="GPT 模型完整路径",
    )
    sovits_path: str = Field(
        ...,
        description="SoVITS 模型完整路径",
    )

    file_path: str = Field(
        ...,
        description="文件相对路径",
    )
    size_bytes: int = Field(
        ...,
        description="文件大小(字节)",
        ge=0,
    )
    created_at: Optional[datetime] = Field(
        description="创建时间",
        default=None,
    )

    # Example payload published in the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "filename": "my_voice_gpt-my_voice_e15_s150-sovits_e8_s200.wav",
                    "gpt_model": "my_voice_e15_s150",
                    "sovits_model": "my_voice_e8_s200",
                    "gpt_path": "logs/my_voice/GPT_weights_v2/my_voice_e15_s150.ckpt",
                    "sovits_path": "logs/my_voice/SoVITS_weights_v2/my_voice_e8_s200.pth",
                    "file_path": "logs/my_voice/inference/my_voice_gpt-my_voice_e15_s150-sovits_e8_s200.wav",
                    "size_bytes": 102400,
                    "created_at": "2024-01-01T12:00:00Z",
                },
            ],
        },
    }
class InferenceOutputsResponse(BaseModel):
    """
    Response listing the inference outputs of a task.

    Returns every inference output file recorded for the task.

    Attributes:
        task_id: Task ID.
        exp_name: Experiment name.
        ref_text: Transcript of the reference audio.
        ref_audio_path: Path of the reference audio.
        target_text: Target text that was synthesized.
        outputs: Inference output files.
        total: Total number of outputs (non-negative).
    """

    task_id: str = Field(
        ...,
        description="任务 ID",
    )
    exp_name: str = Field(
        ...,
        description="实验名称",
    )

    # The three text/path fields fall back to an empty string, not None.
    ref_text: str = Field(
        description="参考音频文本",
        default="",
    )
    ref_audio_path: str = Field(
        description="参考音频路径",
        default="",
    )
    target_text: str = Field(
        description="合成的目标文本",
        default="",
    )

    # default_factory gives every instance its own empty list.
    outputs: List[InferenceOutputItem] = Field(
        description="推理输出文件列表",
        default_factory=list,
    )
    total: int = Field(
        description="总数量",
        default=0,
        ge=0,
    )

    # Example payload published in the generated JSON schema.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "task_id": "task-123",
                    "exp_name": "my_voice",
                    "ref_text": "大家好,又到了复盘的时间,今天即使。",
                    "ref_audio_path": "logs/my_voice/slicer_opt/audio_0000012160_0000152320.wav",
                    "target_text": "这是一段测试语音合成的文本。",
                    "outputs": [
                        {
                            "filename": "my_voice_gpt-my_voice_e15_s150-sovits_e8_s200.wav",
                            "gpt_model": "my_voice_e15_s150",
                            "sovits_model": "my_voice_e8_s200",
                            "gpt_path": "logs/my_voice/GPT_weights_v2/my_voice_e15_s150.ckpt",
                            "sovits_path": "logs/my_voice/SoVITS_weights_v2/my_voice_e8_s200.pth",
                            "file_path": "logs/my_voice/inference/my_voice_gpt-my_voice_e15_s150-sovits_e8_s200.wav",
                            "size_bytes": 102400,
                            "created_at": "2024-01-01T12:00:00Z",
                        },
                    ],
                    "total": 1,
                },
            ],
        },
    }