Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- README.md +39 -5
- app.py +365 -0
- config/environments/development.yaml +41 -0
- config/environments/production.yaml +41 -0
- config/logging.yaml +65 -0
- pyproject.toml +148 -0
- requirements.txt +43 -0
- src/__init__.py +44 -0
- src/__pycache__/__init__.cpython-310.pyc +0 -0
- src/api/__init__.py +13 -0
- src/api/__pycache__/__init__.cpython-310.pyc +0 -0
- src/api/__pycache__/gradio_interface.cpython-310.pyc +0 -0
- src/api/gradio_interface.py +574 -0
- src/core/__init__.py +19 -0
- src/core/__pycache__/__init__.cpython-310.pyc +0 -0
- src/core/__pycache__/config.cpython-310.pyc +0 -0
- src/core/__pycache__/task_manager.cpython-310.pyc +0 -0
- src/core/config.py +171 -0
- src/core/task_manager.py +462 -0
- src/services/__init__.py +20 -0
- src/services/__pycache__/__init__.cpython-310.pyc +0 -0
- src/services/__pycache__/file_validator.cpython-310.pyc +0 -0
- src/services/__pycache__/oss_service.cpython-310.pyc +0 -0
- src/services/__pycache__/paraformer_service.cpython-310.pyc +0 -0
- src/services/file_validator.py +277 -0
- src/services/oss_service.py +293 -0
- src/services/paraformer_service.py +407 -0
- src/utils/__init__.py +34 -0
- src/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- src/utils/__pycache__/error_handler.cpython-310.pyc +0 -0
- src/utils/__pycache__/logger.cpython-310.pyc +0 -0
- src/utils/error_handler.py +380 -0
- src/utils/logger.py +260 -0
README.md
CHANGED
|
@@ -1,12 +1,46 @@
|
|
| 1 |
---
|
| 2 |
title: Transcript Service
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Transcript Service
|
| 3 |
+
emoji: 🎙️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.9.1
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: apache-2.0
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# 🎙️ 音频转文字服务
|
| 14 |
+
|
| 15 |
+
基于 Gradio 的智能音频转文字 Web 服务。
|
| 16 |
+
|
| 17 |
+
## ✨ 功能特点
|
| 18 |
+
|
| 19 |
+
- 🎤 支持多种音频格式(MP3, WAV, M4A 等)
|
| 20 |
+
- 📝 自动语音识别转文字
|
| 21 |
+
- ☁️ 阿里云 OSS 云存储
|
| 22 |
+
- 🤖 阿里云 DashScope API 支持
|
| 23 |
+
- 🌐 简洁易用的 Web 界面
|
| 24 |
+
|
| 25 |
+
## 🚀 使用方法
|
| 26 |
+
|
| 27 |
+
1. 上传音频文件
|
| 28 |
+
2. 选择语言(自动检测或手动指定)
|
| 29 |
+
3. 点击"转换"按钮
|
| 30 |
+
4. 等待处理完成
|
| 31 |
+
5. 查看或下载转换结果
|
| 32 |
+
|
| 33 |
+
## 🛠️ 技术栈
|
| 34 |
+
|
| 35 |
+
- **前端**: Gradio 5.9.1
|
| 36 |
+
- **后端**: Python 3.10
|
| 37 |
+
- **存储**: 阿里云 OSS
|
| 38 |
+
- **AI 服务**: 阿里云 DashScope
|
| 39 |
+
|
| 40 |
+
## 📝 许可证
|
| 41 |
+
|
| 42 |
+
Apache License 2.0
|
| 43 |
+
|
| 44 |
+
---
|
| 45 |
+
|
| 46 |
+
**部署在 Hugging Face Spaces** 🤗
|
app.py
ADDED
|
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""音频转文字服务主应用程序
|
| 2 |
+
|
| 3 |
+
基于Gradio的音频转文字Web服务应用程序入口。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import sys
|
| 8 |
+
import signal
|
| 9 |
+
import time
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Optional
|
| 12 |
+
|
| 13 |
+
# 添加项目根目录到Python路径
|
| 14 |
+
project_root = Path(__file__).parent
|
| 15 |
+
sys.path.insert(0, str(project_root))
|
| 16 |
+
|
| 17 |
+
# 加载环境变量
|
| 18 |
+
from dotenv import load_dotenv
|
| 19 |
+
load_dotenv(project_root / ".env")
|
| 20 |
+
|
| 21 |
+
from src.core.config import get_config, reload_config
|
| 22 |
+
from src.utils.logger import get_logger
|
| 23 |
+
from src.api.gradio_interface import get_gradio_interface
|
| 24 |
+
from src.core.task_manager import get_task_manager, TaskStatus
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class TranscriptServiceApp:
    """Application object for the audio transcription service.

    Loads the configuration, creates the logger and the Gradio interface,
    registers a ``/health`` endpoint when the interface exposes an ``app``
    attribute, and drives start-up (``run``) and shutdown (``shutdown``).
    """

    # Endpoint used by the OSS health probe when the configuration does not
    # provide ``oss.endpoint``.
    _DEFAULT_OSS_ENDPOINT = "https://oss-cn-beijing.aliyuncs.com"

    def __init__(self, environment: Optional[str] = None):
        """Initialise the application.

        Args:
            environment: Runtime environment (development/production). When
                given, the configuration is reloaded for that environment;
                otherwise the current configuration is used.
        """
        if environment:
            self.config = reload_config(environment)
        else:
            self.config = get_config()

        self.logger = get_logger("transcript_service.app")
        self.gradio_interface = get_gradio_interface()

        # Register the health endpoint before the app is marked as stopped;
        # the handler only reads state lazily, at request time.
        self._setup_health_endpoint()

        self.is_running = False

        self.logger.info(f"应用程序初始化完成 - 环境: {self.config.environment}")

    def _build_health_payload(self) -> dict:
        """Return the JSON-serialisable payload served by ``/health``.

        ``uptime`` is measured from the moment ``run`` recorded
        ``_start_time`` and is ``0.0`` while the application has not been
        started yet.
        """
        started_at = getattr(self, "_start_time", None)
        uptime = time.time() - started_at if started_at is not None else 0.0
        return {
            "status": "healthy",
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "environment": self.config.environment,
            "version": self.config.app.version,
            "uptime": uptime,
            "services": {
                "oss": self._check_oss_connection(),
                "dashscope": self._check_dashscope_connection(),
            },
        }

    def _setup_health_endpoint(self):
        """Register ``GET /health`` on the interface's web application.

        Nothing is registered when the Gradio interface has no ``app``
        attribute. Any failure is logged as a warning and otherwise ignored,
        so a missing health endpoint never prevents start-up.
        """
        try:
            if not hasattr(self.gradio_interface, "app"):
                return

            from fastapi.responses import JSONResponse

            # NOTE(review): assumes ``gradio_interface.app`` is a FastAPI
            # application. The handler is a plain ``def`` on purpose: the OSS
            # probe performs blocking network I/O, and FastAPI runs
            # synchronous handlers in a worker thread instead of on the event
            # loop.
            @self.gradio_interface.app.get("/health")
            def health_endpoint():
                return JSONResponse(content=self._build_health_payload())

        except Exception as e:
            self.logger.warning(f"设置健康检查端点失败: {e}")

    def _oss_endpoint_url(self) -> str:
        """Return the OSS endpoint URL used by the health probe.

        Uses ``config.oss.endpoint`` when present (adding ``https://`` if the
        value carries no scheme) and falls back to the default endpoint.
        """
        endpoint = (
            getattr(self.config.oss, "endpoint", None)
            or self._DEFAULT_OSS_ENDPOINT
        )
        if "://" not in endpoint:
            endpoint = f"https://{endpoint}"
        return endpoint

    def _check_oss_connection(self) -> bool:
        """Probe the OSS service.

        Returns:
            ``False`` when credentials are missing or the probe raises,
            ``True`` when the ``list_buckets`` request completes.
        """
        try:
            oss_config = self.config.oss
            if not (oss_config.access_key_id and oss_config.access_key_secret):
                return False

            import oss2

            auth = oss2.Auth(
                oss_config.access_key_id, oss_config.access_key_secret
            )
            service = oss2.Service(auth, self._oss_endpoint_url())

            # A completed request is the whole test. The result is deliberately
            # not wrapped in ``list()``, so the probe does not depend on the
            # result object being iterable.
            service.list_buckets(max_keys=1)
            return True
        except Exception:
            return False

    def _check_dashscope_connection(self) -> bool:
        """Check the DashScope API key.

        This is a format check only (non-empty and starting with ``sk-``);
        no request is sent.
        """
        try:
            api_key = self.config.dashscope.api_key
            return bool(api_key) and api_key.startswith("sk-")
        except Exception:
            return False

    def setup_signal_handlers(self):
        """Install signal handlers (intentionally a no-op).

        Graceful shutdown on signals was removed so the process can be
        terminated directly.
        """
        pass

    def validate_environment(self) -> bool:
        """Validate credentials and working directories.

        Checks that the OSS and DashScope credentials are configured, creates
        the logs and temp directories if needed and verifies that both are
        writable.

        Returns:
            ``True`` when the environment is usable, ``False`` otherwise
            (the reason is logged).
        """
        try:
            required = (
                ("OSS_ACCESS_KEY_ID", self.config.oss.access_key_id),
                ("OSS_ACCESS_KEY_SECRET", self.config.oss.access_key_secret),
                ("DASHSCOPE_API_KEY", self.config.dashscope.api_key),
            )
            missing_vars = [name for name, value in required if not value]
            if missing_vars:
                self.logger.error(f"缺少必要的环境变量: {missing_vars}")
                return False

            directories = (
                self.config.get_logs_dir(),
                self.config.get_temp_dir(),
            )
            for directory in directories:
                directory.mkdir(parents=True, exist_ok=True)

                # Write and remove a marker file to prove write permission.
                test_file = directory / ".write_test"
                try:
                    test_file.write_text("test")
                    test_file.unlink()
                except Exception as e:
                    self.logger.error(f"目录权限检查失败 {directory}: {str(e)}")
                    return False

            self.logger.info("环境验证通过")
            return True

        except Exception as e:
            self.logger.exception(f"环境验证失败: {str(e)}")
            return False

    def run(self, **launch_kwargs):
        """Start the application.

        Validates the environment, then launches the Gradio interface. If the
        requested port is already in use, the launch is retried once with
        ``server_port=None``. Exits the process with status 1 when validation
        or start-up fails.

        Args:
            **launch_kwargs: Keyword arguments forwarded to the interface's
                ``launch`` method.
        """
        try:
            self.setup_signal_handlers()

            if not self.validate_environment():
                self.logger.error("环境验证失败,应用程序无法启动")
                sys.exit(1)

            self.is_running = True
            self._start_time = time.time()  # reference point for uptime
            self.logger.info("正在启动音频转文字服务...")

            self.gradio_interface.launch(**launch_kwargs)

        except OSError as e:
            if "address already in use" not in str(e).lower():
                self.logger.exception(f"启动时发生未处理的网络错误: {str(e)}")
                sys.exit(1)

            port = launch_kwargs.get('server_port', self.config.app.port)
            self.logger.warning(f"端口 {port} 已被占用。正在尝试使用一个可用的随机端口...")

            # Explicitly pass server_port=None so the port is chosen
            # automatically on the retry.
            launch_kwargs['server_port'] = None

            try:
                self.gradio_interface.launch(**launch_kwargs)
            except Exception as final_e:
                self.logger.exception(f"尝试使用随机端口后,应用程序启动仍然失败: {str(final_e)}")
                sys.exit(1)
        except KeyboardInterrupt:
            self.logger.info("接收到键盘中断信号")
            self.shutdown()
        except Exception as e:
            self.logger.exception(f"应用程序启动失败: {str(e)}")
            sys.exit(1)

    def _cancel_pending_tasks(self, task_manager, pending_tasks):
        """Cancel every task in ``pending_tasks`` through ``task_manager``.

        ``task_manager.cancel_task`` is used as a coroutine function. Without
        a running event loop the cancellations are awaited one after another
        inside a single ``asyncio.run``. Inside a running loop they can only
        be scheduled; the created tasks are stored on the instance so they
        are not garbage-collected before they finish.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:  # no running loop: run to completion here

            async def _cancel_all():
                for task in pending_tasks:
                    await task_manager.cancel_task(task.id)

            asyncio.run(_cancel_all())
            return

        self._cancellation_tasks = [
            asyncio.create_task(task_manager.cancel_task(task.id))
            for task in pending_tasks
        ]

    def shutdown(self):
        """Shut the application down.

        Does nothing when the application is not running. Otherwise cancels
        pending tasks, reports tasks that are still active and removes
        temporary files. Errors are logged, never raised.
        """
        if not self.is_running:
            return

        self.logger.info("开始关闭应用程序...")
        self.is_running = False

        try:
            task_manager = get_task_manager()

            pending_tasks = task_manager.get_tasks_by_status(TaskStatus.PENDING)
            if pending_tasks:
                self._cancel_pending_tasks(task_manager, pending_tasks)

            active_tasks = (
                task_manager.get_tasks_by_status(TaskStatus.VALIDATING)
                + task_manager.get_tasks_by_status(TaskStatus.UPLOADING)
                + task_manager.get_tasks_by_status(TaskStatus.TRANSCRIBING)
            )

            if active_tasks:
                # Only reported: there is currently no wait for active tasks.
                self.logger.info(f"等待 {len(active_tasks)} 个活跃任务完成...")

            self.cleanup_temp_files()

            self.logger.info("应用程序已安全关闭")

        except Exception as e:
            self.logger.exception(f"关闭应用程序时发生错误: {str(e)}")

    def cleanup_temp_files(self):
        """Delete the regular files directly inside the temp directory.

        Sub-directories are left untouched. Failures are logged as a warning.
        """
        try:
            temp_dir = self.config.get_temp_dir()
            if temp_dir.exists():
                for file_path in temp_dir.glob("*"):
                    if file_path.is_file():
                        file_path.unlink()
            self.logger.info("临时文件清理完成")
        except Exception as e:
            self.logger.warning(f"清理临时文件失败: {str(e)}")

    def get_app_info(self) -> dict:
        """Return basic information about the application.

        Returns:
            A dictionary with the configured name, version, environment,
            debug flag, host and port, plus the current running state.
        """
        app_config = self.config.app
        return {
            "name": app_config.name,
            "version": app_config.version,
            "environment": self.config.environment,
            "debug": app_config.debug,
            "host": app_config.host,
            "port": app_config.port,
            "is_running": self.is_running
        }
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
def create_app(environment: Optional[str] = None) -> TranscriptServiceApp:
    """Build a ``TranscriptServiceApp`` for the given environment.

    Args:
        environment: Runtime environment name, forwarded unchanged to the
            ``TranscriptServiceApp`` constructor.

    Returns:
        The newly created application instance.
    """
    application = TranscriptServiceApp(environment)
    return application
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
def main():
    """Command-line entry point.

    Parses the command-line options, creates the application for the selected
    environment and starts it. Only options given explicitly on the command
    line are forwarded as launch arguments.
    """
    import argparse

    parser = argparse.ArgumentParser(description="音频转文字服务")
    parser.add_argument(
        "--env",
        choices=["development", "production"],
        default="development",
        help="运行环境"
    )
    parser.add_argument(
        "--host",
        default=None,
        help="服务主机地址"
    )
    parser.add_argument(
        "--port",
        type=int,
        default=None,
        help="服务端口"
    )
    parser.add_argument(
        "--share",
        action="store_true",
        help="启用Gradio分享链接"
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="启用调试模式"
    )

    args = parser.parse_args()

    app = create_app(args.env)

    # Sharing stays disabled unless --share is passed explicitly.
    launch_kwargs = {'share': args.share}

    if args.host:
        launch_kwargs['server_name'] = args.host
    # Compare against None so that an explicit "--port 0" is not silently
    # dropped by a truthiness test.
    if args.port is not None:
        launch_kwargs['server_port'] = args.port
    if args.debug:
        launch_kwargs['debug'] = True

    app.run(**launch_kwargs)


if __name__ == "__main__":
    main()
|
| 365 |
+
|
config/environments/development.yaml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 开发环境配置
|
| 2 |
+
app:
|
| 3 |
+
name: "音频转文字服务"
|
| 4 |
+
version: "1.0.0"
|
| 5 |
+
debug: true
|
| 6 |
+
host: "127.0.0.1"
|
| 7 |
+
port: 7860
|
| 8 |
+
max_file_size: 2147483648 # 2GB
|
| 9 |
+
max_files_count: 100
|
| 10 |
+
concurrent_tasks: 5
|
| 11 |
+
|
| 12 |
+
# OSS配置
|
| 13 |
+
oss:
|
| 14 |
+
endpoint: "oss-cn-beijing.aliyuncs.com"
|
| 15 |
+
bucket_name: "audio-transcript-dev"
|
| 16 |
+
upload_timeout: 300
|
| 17 |
+
url_expire_hours: 24
|
| 18 |
+
temp_prefix: "temp/audio"
|
| 19 |
+
auto_cleanup_days: 7
|
| 20 |
+
|
| 21 |
+
# 阿里云百炼API配置
|
| 22 |
+
dashscope:
|
| 23 |
+
base_url: "https://dashscope.aliyuncs.com/api/v1"
|
| 24 |
+
model: "paraformer-v2"
|
| 25 |
+
timeout: 300
|
| 26 |
+
max_retries: 3
|
| 27 |
+
retry_delay: 5
|
| 28 |
+
language_hints: ["zh", "en"]
|
| 29 |
+
|
| 30 |
+
# 任务配置
|
| 31 |
+
task:
|
| 32 |
+
status_check_interval: 2
|
| 33 |
+
max_processing_time: 3600 # 1小时
|
| 34 |
+
queue_size: 1000
|
| 35 |
+
|
| 36 |
+
# 日志配置
|
| 37 |
+
logging:
|
| 38 |
+
level: "DEBUG"
|
| 39 |
+
format: "detailed"
|
| 40 |
+
file_max_size: "10MB"
|
| 41 |
+
backup_count: 5
|
config/environments/production.yaml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 生产环境配置
|
| 2 |
+
app:
|
| 3 |
+
name: "音频转文字服务"
|
| 4 |
+
version: "1.0.0"
|
| 5 |
+
debug: false
|
| 6 |
+
host: "0.0.0.0"
|
| 7 |
+
port: 8080
|
| 8 |
+
max_file_size: 2147483648 # 2GB
|
| 9 |
+
max_files_count: 100
|
| 10 |
+
concurrent_tasks: 10
|
| 11 |
+
|
| 12 |
+
# OSS配置
|
| 13 |
+
oss:
|
| 14 |
+
endpoint: "oss-cn-beijing.aliyuncs.com"
|
| 15 |
+
bucket_name: "audio-transcript-prod"
|
| 16 |
+
upload_timeout: 300
|
| 17 |
+
url_expire_hours: 24
|
| 18 |
+
temp_prefix: "temp/audio"
|
| 19 |
+
auto_cleanup_days: 7
|
| 20 |
+
|
| 21 |
+
# 阿里云百炼API配置
|
| 22 |
+
dashscope:
|
| 23 |
+
base_url: "https://dashscope.aliyuncs.com/api/v1"
|
| 24 |
+
model: "paraformer-v2"
|
| 25 |
+
timeout: 300
|
| 26 |
+
max_retries: 5
|
| 27 |
+
retry_delay: 10
|
| 28 |
+
language_hints: ["zh", "en"]
|
| 29 |
+
|
| 30 |
+
# 任务配置
|
| 31 |
+
task:
|
| 32 |
+
status_check_interval: 5
|
| 33 |
+
max_processing_time: 3600 # 1小时
|
| 34 |
+
queue_size: 2000
|
| 35 |
+
|
| 36 |
+
# 日志配置
|
| 37 |
+
logging:
|
| 38 |
+
level: "INFO"
|
| 39 |
+
format: "structured"
|
| 40 |
+
file_max_size: "50MB"
|
| 41 |
+
backup_count: 10
|
config/logging.yaml
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
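# 日志系统配置
# NOTE: the `detailed` and `structured` formatters reference %(task_id)s.
# Every record that reaches a handler using them (including records from the
# root logger and third-party libraries) must carry a `task_id` attribute,
# e.g. injected by a logging.Filter/LoggerAdapter or passed via `extra=`;
# otherwise formatting the record fails.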
|
| 2 |
+
version: 1
|
| 3 |
+
disable_existing_loggers: false
|
| 4 |
+
|
| 5 |
+
formatters:
|
| 6 |
+
detailed:
|
| 7 |
+
format: '[%(asctime)s] [%(levelname)s] [%(name)s] [%(task_id)s] %(message)s'
|
| 8 |
+
datefmt: '%Y-%m-%d %H:%M:%S'
|
| 9 |
+
|
| 10 |
+
structured:
|
| 11 |
+
format: '{"timestamp": "%(asctime)s", "level": "%(levelname)s", "module": "%(name)s", "task_id": "%(task_id)s", "message": "%(message)s"}'
|
| 12 |
+
datefmt: '%Y-%m-%d %H:%M:%S'
|
| 13 |
+
|
| 14 |
+
simple:
|
| 15 |
+
format: '[%(levelname)s] %(message)s'
|
| 16 |
+
|
| 17 |
+
handlers:
|
| 18 |
+
console:
|
| 19 |
+
class: logging.StreamHandler
|
| 20 |
+
level: DEBUG
|
| 21 |
+
formatter: detailed
|
| 22 |
+
stream: ext://sys.stdout
|
| 23 |
+
|
| 24 |
+
file_handler:
|
| 25 |
+
class: logging.handlers.RotatingFileHandler
|
| 26 |
+
level: INFO
|
| 27 |
+
formatter: structured
|
| 28 |
+
filename: logs/app.log
|
| 29 |
+
maxBytes: 10485760 # 10MB
|
| 30 |
+
backupCount: 5
|
| 31 |
+
encoding: utf8
|
| 32 |
+
|
| 33 |
+
error_file_handler:
|
| 34 |
+
class: logging.handlers.RotatingFileHandler
|
| 35 |
+
level: ERROR
|
| 36 |
+
formatter: detailed
|
| 37 |
+
filename: logs/error.log
|
| 38 |
+
maxBytes: 10485760 # 10MB
|
| 39 |
+
backupCount: 5
|
| 40 |
+
encoding: utf8
|
| 41 |
+
|
| 42 |
+
loggers:
|
| 43 |
+
transcript_service:
|
| 44 |
+
level: DEBUG
|
| 45 |
+
handlers: [console, file_handler, error_file_handler]
|
| 46 |
+
propagate: false
|
| 47 |
+
|
| 48 |
+
transcript_service.oss:
|
| 49 |
+
level: INFO
|
| 50 |
+
handlers: [console, file_handler]
|
| 51 |
+
propagate: false
|
| 52 |
+
|
| 53 |
+
transcript_service.api:
|
| 54 |
+
level: INFO
|
| 55 |
+
handlers: [console, file_handler]
|
| 56 |
+
propagate: false
|
| 57 |
+
|
| 58 |
+
transcript_service.task:
|
| 59 |
+
level: DEBUG
|
| 60 |
+
handlers: [console, file_handler]
|
| 61 |
+
propagate: false
|
| 62 |
+
|
| 63 |
+
root:
|
| 64 |
+
level: WARNING
|
| 65 |
+
handlers: [console, file_handler]
|
pyproject.toml
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "transcript-service"
|
| 3 |
+
version = "1.0.0"
|
| 4 |
+
description = "智能音频转文字Web服务"
|
| 5 |
+
authors = [{name = "Your Name", email = "your.email@example.com"}]
|
| 6 |
+
license = {text = "Apache-2.0"}
|
| 7 |
+
readme = "README.md"
|
| 8 |
+
requires-python = ">=3.9"
|
| 9 |
+
|
| 10 |
+
dependencies = [
|
| 11 |
+
# 核心Web框架依赖
|
| 12 |
+
"gradio>=4.44.0",
|
| 13 |
+
"fastapi>=0.104.0",
|
| 14 |
+
"uvicorn>=0.24.0",
|
| 15 |
+
|
| 16 |
+
# 云服务集成依赖
|
| 17 |
+
"oss2>=2.18.0",
|
| 18 |
+
"dashscope>=1.14.0",
|
| 19 |
+
|
| 20 |
+
# 数据处理核心依赖
|
| 21 |
+
"pydantic>=2.5.0",
|
| 22 |
+
"pydantic-settings>=2.1.0",
|
| 23 |
+
|
| 24 |
+
# 文件处理工具依赖
|
| 25 |
+
"python-multipart>=0.0.6",
|
| 26 |
+
"python-magic>=0.4.27",
|
| 27 |
+
|
| 28 |
+
# 配置管理依赖
|
| 29 |
+
"PyYAML>=6.0.1",
|
| 30 |
+
"python-dotenv>=1.0.0",
|
| 31 |
+
|
| 32 |
+
# 日志和监控依赖
|
| 33 |
+
"structlog>=23.2.0",
|
| 34 |
+
"rich>=13.7.0",
|
| 35 |
+
|
| 36 |
+
# HTTP客户端依赖
|
| 37 |
+
"httpx>=0.25.2",
|
| 38 |
+
"aiohttp>=3.9.0",
|
| 39 |
+
|
| 40 |
+
# 命令行工具依赖
|
| 41 |
+
"click>=8.1.7",
|
| 42 |
+
"typer>=0.9.0",
|
| 43 |
+
|
| 44 |
+
# 性能优化可选依赖
|
| 45 |
+
"orjson>=3.9.0",
|
| 46 |
+
"ujson>=5.8.0"
|
| 47 |
+
]
|
| 48 |
+
|
| 49 |
+
[project.optional-dependencies]
|
| 50 |
+
dev = [
|
| 51 |
+
"pytest>=7.4.0",
|
| 52 |
+
"pytest-asyncio>=0.21.0",
|
| 53 |
+
"pytest-cov>=4.1.0",
|
| 54 |
+
"black>=23.11.0",
|
| 55 |
+
"flake8>=7.0.0",
|
| 56 |
+
"isort>=5.12.0",
|
| 57 |
+
"mypy>=1.7.0",
|
| 58 |
+
"pre-commit>=3.5.0"
|
| 59 |
+
]
|
| 60 |
+
|
| 61 |
+
[project.scripts]
|
| 62 |
+
transcript-service = "app:main"
|
| 63 |
+
|
| 64 |
+
[build-system]
|
| 65 |
+
requires = ["hatchling"]
|
| 66 |
+
build-backend = "hatchling.build"
|
| 67 |
+
|
| 68 |
+
[tool.hatch.build.targets.wheel]
|
| 69 |
+
packages = ["src"]
|
| 70 |
+
|
| 71 |
+
[dependency-groups]
|
| 72 |
+
dev = [
|
| 73 |
+
"pytest>=7.4.0",
|
| 74 |
+
"pytest-asyncio>=0.21.0",
|
| 75 |
+
"pytest-cov>=4.1.0",
|
| 76 |
+
"black>=23.11.0",
|
| 77 |
+
"flake8>=7.0.0",
|
| 78 |
+
"isort>=5.12.0",
|
| 79 |
+
"mypy>=1.7.0",
|
| 80 |
+
"pre-commit>=3.5.0"
|
| 81 |
+
]
|
| 82 |
+
|
| 83 |
+
[tool.black]
|
| 84 |
+
line-length = 88
|
| 85 |
+
target-version = ['py39']
|
| 86 |
+
include = '\.pyi?$'
|
| 87 |
+
extend-exclude = '''
|
| 88 |
+
/(
|
| 89 |
+
# directories
|
| 90 |
+
\.eggs
|
| 91 |
+
| \.git
|
| 92 |
+
| \.hg
|
| 93 |
+
| \.mypy_cache
|
| 94 |
+
| \.tox
|
| 95 |
+
| \.venv
|
| 96 |
+
| _build
|
| 97 |
+
| buck-out
|
| 98 |
+
| build
|
| 99 |
+
| dist
|
| 100 |
+
)/
|
| 101 |
+
'''
|
| 102 |
+
|
| 103 |
+
[tool.isort]
|
| 104 |
+
profile = "black"
|
| 105 |
+
multi_line_output = 3
|
| 106 |
+
line_length = 88
|
| 107 |
+
known_first_party = ["src"]
|
| 108 |
+
|
| 109 |
+
[tool.flake8]
|
| 110 |
+
max-line-length = 88
|
| 111 |
+
extend-ignore = ["E203", "W503"]
|
| 112 |
+
exclude = [
|
| 113 |
+
".git",
|
| 114 |
+
"__pycache__",
|
| 115 |
+
".venv",
|
| 116 |
+
"build",
|
| 117 |
+
"dist",
|
| 118 |
+
"*.egg-info"
|
| 119 |
+
]
|
| 120 |
+
|
| 121 |
+
[tool.mypy]
|
| 122 |
+
python_version = "3.9"
|
| 123 |
+
warn_return_any = true
|
| 124 |
+
warn_unused_configs = true
|
| 125 |
+
disallow_untyped_defs = true
|
| 126 |
+
disallow_incomplete_defs = true
|
| 127 |
+
check_untyped_defs = true
|
| 128 |
+
disallow_untyped_decorators = true
|
| 129 |
+
no_implicit_optional = true
|
| 130 |
+
warn_redundant_casts = true
|
| 131 |
+
warn_unused_ignores = true
|
| 132 |
+
warn_no_return = true
|
| 133 |
+
warn_unreachable = true
|
| 134 |
+
strict_equality = true
|
| 135 |
+
|
| 136 |
+
[[tool.mypy.overrides]]
|
| 137 |
+
module = ["gradio.*", "oss2.*", "dashscope.*"]
|
| 138 |
+
ignore_missing_imports = true
|
| 139 |
+
|
| 140 |
+
[tool.pytest.ini_options]
|
| 141 |
+
minversion = "7.0"
|
| 142 |
+
addopts = "-ra -q --strict-markers"
|
| 143 |
+
testpaths = ["tests"]
|
| 144 |
+
markers = [
|
| 145 |
+
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
|
| 146 |
+
"integration: marks tests as integration tests",
|
| 147 |
+
"unit: marks tests as unit tests"
|
| 148 |
+
]
|
requirements.txt
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 核心依赖
|
| 2 |
+
gradio>=4.44.0
|
| 3 |
+
fastapi>=0.104.0
|
| 4 |
+
uvicorn>=0.24.0
|
| 5 |
+
|
| 6 |
+
# 云服务依赖
|
| 7 |
+
oss2>=2.18.0
|
| 8 |
+
dashscope>=1.14.0
|
| 9 |
+
|
| 10 |
+
# 数据处理依赖
|
| 11 |
+
pydantic>=2.5.0
|
| 12 |
+
pydantic-settings>=2.1.0
|
| 13 |
+
|
| 14 |
+
# 文件处理依赖
|
| 15 |
+
python-multipart>=0.0.6
|
| 16 |
+
python-magic>=0.4.27
|
| 17 |
+
|
| 18 |
+
# 配置管理
|
| 19 |
+
PyYAML>=6.0.1
|
| 20 |
+
python-dotenv>=1.0.0
|
| 21 |
+
|
| 22 |
+
# 日志和监控
|
| 23 |
+
structlog>=23.2.0
|
| 24 |
+
rich>=13.7.0
|
| 25 |
+
|
| 26 |
+
# HTTP客户端
|
| 27 |
+
httpx>=0.25.2
|
| 28 |
+
aiohttp>=3.9.0
|
| 29 |
+
|
| 30 |
+
# 工具依赖
|
| 31 |
+
click>=8.1.7
|
| 32 |
+
typer>=0.9.0
|
| 33 |
+
|
| 34 |
+
# 开发依赖
|
| 35 |
+
pytest>=7.4.0
|
| 36 |
+
pytest-asyncio>=0.21.0
|
| 37 |
+
black>=23.11.0
|
| 38 |
+
flake8>=7.0.0
|
| 39 |
+
isort>=5.12.0
|
| 40 |
+
|
| 41 |
+
# 可选依赖(用于性能优化)
|
| 42 |
+
orjson>=3.9.0
|
| 43 |
+
ujson>=5.8.0
|
src/__init__.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""源代码主模块
|
| 2 |
+
|
| 3 |
+
应用程序源代码的根模块,集成所有功能组件。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
# 导入核心模块
|
| 7 |
+
from .core import (
|
| 8 |
+
Config, get_config, reload_config,
|
| 9 |
+
TaskManager, TaskStatus, TaskPriority, Task, get_task_manager, task_manager
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
# 导入服务模块
|
| 13 |
+
from .services import (
|
| 14 |
+
FileValidator, get_file_validator, file_validator,
|
| 15 |
+
OSSService, get_oss_service, oss_service,
|
| 16 |
+
ParaformerService, get_paraformer_service, paraformer_service
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
# 导入工具模块
|
| 20 |
+
from .utils import (
|
| 21 |
+
Logger, TaskLogger, get_logger, get_task_logger, logger
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
# 导入API模块
|
| 25 |
+
from .api import (
|
| 26 |
+
GradioInterface, get_gradio_interface, create_demo_interface, gradio_interface
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
__all__ = [
|
| 30 |
+
# 核心模块
|
| 31 |
+
"Config", "get_config", "reload_config",
|
| 32 |
+
"TaskManager", "TaskStatus", "TaskPriority", "Task", "get_task_manager", "task_manager",
|
| 33 |
+
|
| 34 |
+
# 服务模块
|
| 35 |
+
"FileValidator", "get_file_validator", "file_validator",
|
| 36 |
+
"OSSService", "get_oss_service", "oss_service",
|
| 37 |
+
"ParaformerService", "get_paraformer_service", "paraformer_service",
|
| 38 |
+
|
| 39 |
+
# 工具模块
|
| 40 |
+
"Logger", "TaskLogger", "get_logger", "get_task_logger", "logger",
|
| 41 |
+
|
| 42 |
+
# API模块
|
| 43 |
+
"GradioInterface", "get_gradio_interface", "create_demo_interface", "gradio_interface"
|
| 44 |
+
]
|
src/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (1.14 kB). View file
|
|
|
src/api/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""API模块
|
| 2 |
+
|
| 3 |
+
包含应用程序的API接口和用户界面。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .gradio_interface import GradioInterface, get_gradio_interface, create_demo_interface, gradio_interface
|
| 7 |
+
|
| 8 |
+
__all__ = [
|
| 9 |
+
"GradioInterface",
|
| 10 |
+
"get_gradio_interface",
|
| 11 |
+
"create_demo_interface",
|
| 12 |
+
"gradio_interface"
|
| 13 |
+
]
|
src/api/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (385 Bytes). View file
|
|
|
src/api/__pycache__/gradio_interface.cpython-310.pyc
ADDED
|
Binary file (15.5 kB). View file
|
|
|
src/api/gradio_interface.py
ADDED
|
@@ -0,0 +1,574 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Gradio用户界面模块
|
| 2 |
+
|
| 3 |
+
提供基于Gradio的Web界面,支持文件上传、进度显示和结果展示。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import json
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Dict, List, Optional, Tuple, Any
|
| 10 |
+
import gradio as gr
|
| 11 |
+
import pandas as pd
|
| 12 |
+
|
| 13 |
+
from ..core.config import get_config
|
| 14 |
+
from ..core.task_manager import get_task_manager, TaskStatus, TaskPriority
|
| 15 |
+
from ..utils.logger import get_task_logger
|
| 16 |
+
from ..services.file_validator import get_file_validator
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class GradioInterface:
|
| 20 |
+
"""Gradio界面管理器"""
|
| 21 |
+
|
| 22 |
+
def __init__(self) -> None:
    """Initialize the Gradio interface manager.

    Resolves the shared config, task manager, file validator and logger,
    builds the Blocks UI, and registers a task-status callback.
    """
    self.config = get_config()
    self.task_manager = get_task_manager()
    self.file_validator = get_file_validator()
    self.logger = get_task_logger(logger_name="transcript_service.gradio")

    # ID of the task most recently created through this UI (None when idle).
    # NOTE(review): this lives on the instance, and the module exposes one
    # global instance, so it looks shared by every browser session — confirm.
    self.current_task_id = None

    # Build the UI. This reads self.file_validator and self.task_manager,
    # so it must run after the assignments above.
    self.interface = self._create_interface()

    # Register for task status change notifications.
    self.task_manager.add_status_callback(self._on_task_status_change)
|
| 37 |
+
|
| 38 |
+
def _create_interface(self) -> gr.Blocks:
|
| 39 |
+
"""创建Gradio界面"""
|
| 40 |
+
# 获取支持的格式信息
|
| 41 |
+
supported_formats = self.file_validator.get_supported_formats()
|
| 42 |
+
|
| 43 |
+
with gr.Blocks(
|
| 44 |
+
title="音频转文字服务",
|
| 45 |
+
theme=gr.themes.Soft(),
|
| 46 |
+
css="""
|
| 47 |
+
.main-container { max-width: 1000px; margin: 0 auto; }
|
| 48 |
+
.upload-area { border: 2px dashed #ccc; border-radius: 10px; padding: 20px; text-align: center; }
|
| 49 |
+
.result-area { margin-top: 20px; }
|
| 50 |
+
.status-simple { font-size: 16px; font-weight: bold; }
|
| 51 |
+
"""
|
| 52 |
+
) as interface:
|
| 53 |
+
# 简洁标题
|
| 54 |
+
gr.Markdown("# 🎵 音频转文字服务")
|
| 55 |
+
|
| 56 |
+
with gr.Row():
|
| 57 |
+
with gr.Column(scale=3):
|
| 58 |
+
# 文件上传区
|
| 59 |
+
file_upload = gr.File(
|
| 60 |
+
label="📁 选择音频文件(支持多文件)",
|
| 61 |
+
file_count="multiple",
|
| 62 |
+
file_types=list(supported_formats['extensions']),
|
| 63 |
+
height=120
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
# 简化的配置区
|
| 67 |
+
with gr.Row():
|
| 68 |
+
# 任务优先级
|
| 69 |
+
priority_select = gr.Radio(
|
| 70 |
+
label="优先级",
|
| 71 |
+
choices=[("普通", "NORMAL"), ("高优先级", "HIGH")],
|
| 72 |
+
value="NORMAL"
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
# 参数设置区(默认隐藏)
|
| 76 |
+
with gr.Accordion("⚙️ 转录参数设置", open=False) as params_section:
|
| 77 |
+
# 语言选择
|
| 78 |
+
language_select = gr.CheckboxGroup(
|
| 79 |
+
label="识别语言",
|
| 80 |
+
choices=[
|
| 81 |
+
("中文", "zh"), ("英文", "en"), ("日语", "ja"),
|
| 82 |
+
("粤语", "yue"), ("韩语", "ko"), ("德语", "de"),
|
| 83 |
+
("法语", "fr"), ("俄语", "ru")
|
| 84 |
+
],
|
| 85 |
+
value=["zh", "en"]
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
with gr.Row():
|
| 89 |
+
# 基础选项
|
| 90 |
+
disfluency_removal = gr.Checkbox(
|
| 91 |
+
label="过滤语气词",
|
| 92 |
+
value=True
|
| 93 |
+
)
|
| 94 |
+
timestamp_alignment = gr.Checkbox(
|
| 95 |
+
label="时间戳校准",
|
| 96 |
+
value=True
|
| 97 |
+
)
|
| 98 |
+
diarization_enabled = gr.Checkbox(
|
| 99 |
+
label="说话人分离",
|
| 100 |
+
value=True
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
with gr.Row():
|
| 104 |
+
speaker_count = gr.Number(
|
| 105 |
+
label="说话人数量(可选)",
|
| 106 |
+
value=None,
|
| 107 |
+
minimum=None,
|
| 108 |
+
maximum=100,
|
| 109 |
+
step=1,
|
| 110 |
+
info="留空则自动判断,如需指定请输入2-100之间的数值"
|
| 111 |
+
)
|
| 112 |
+
channel_select = gr.Textbox(
|
| 113 |
+
label="音轨索引",
|
| 114 |
+
value="0",
|
| 115 |
+
info="多音轨文件的音轨索引,用逗号分隔"
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
# 高级选项(更深层折叠)
|
| 119 |
+
with gr.Accordion("高级选项", open=False):
|
| 120 |
+
vocabulary_id = gr.Textbox(
|
| 121 |
+
label="热词ID v2",
|
| 122 |
+
value="",
|
| 123 |
+
info="v2模型的热词ID"
|
| 124 |
+
)
|
| 125 |
+
phrase_id = gr.Textbox(
|
| 126 |
+
label="热词ID v1",
|
| 127 |
+
value="",
|
| 128 |
+
info="v1模型的热词ID"
|
| 129 |
+
)
|
| 130 |
+
special_word_filter = gr.Textbox(
|
| 131 |
+
label="敏感词过滤配置",
|
| 132 |
+
value="",
|
| 133 |
+
lines=2,
|
| 134 |
+
placeholder='JSON格式配置',
|
| 135 |
+
info="敏感词过滤的JSON配置"
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
# 控制按钮
|
| 139 |
+
with gr.Row():
|
| 140 |
+
start_btn = gr.Button("🚀 开始转录", variant="primary", size="lg")
|
| 141 |
+
cancel_btn = gr.Button("❌ 取消", variant="secondary")
|
| 142 |
+
clear_btn = gr.Button("🗑️ 清空", variant="secondary")
|
| 143 |
+
|
| 144 |
+
with gr.Column(scale=2):
|
| 145 |
+
# 简化的状态显示
|
| 146 |
+
status_text = gr.Textbox(
|
| 147 |
+
label="📊 当前状态",
|
| 148 |
+
value="等待上传文件...",
|
| 149 |
+
interactive=False,
|
| 150 |
+
elem_classes=["status-simple"]
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
# 转录结果
|
| 154 |
+
result_text = gr.Textbox(
|
| 155 |
+
label="📝 转录结果",
|
| 156 |
+
placeholder="转录结果将在这里显示...",
|
| 157 |
+
lines=12,
|
| 158 |
+
max_lines=20,
|
| 159 |
+
show_copy_button=True,
|
| 160 |
+
elem_classes=["result-area"]
|
| 161 |
+
)
|
| 162 |
+
|
| 163 |
+
# 文件统计表格
|
| 164 |
+
stats_df = gr.Dataframe(
|
| 165 |
+
headers=["文件名", "时长", "文本长度", "置信度"],
|
| 166 |
+
datatype=["str", "str", "number", "number"],
|
| 167 |
+
label="📈 处理统计",
|
| 168 |
+
visible=False
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
# 折叠的详细信息区域
|
| 172 |
+
with gr.Accordion("📋 详细信息", open=False) as detail_section:
|
| 173 |
+
with gr.Tabs():
|
| 174 |
+
with gr.Tab("系统信息"):
|
| 175 |
+
system_info = gr.JSON(
|
| 176 |
+
label="服务状态",
|
| 177 |
+
value=self._get_system_info()
|
| 178 |
+
)
|
| 179 |
+
format_info = gr.JSON(
|
| 180 |
+
label="支持格式",
|
| 181 |
+
value=supported_formats
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
with gr.Tab("任务信息"):
|
| 185 |
+
task_info = gr.JSON(
|
| 186 |
+
label="当前任务",
|
| 187 |
+
value={}
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
with gr.Tab("完整结果"):
|
| 191 |
+
result_json = gr.JSON(
|
| 192 |
+
label="JSON结果",
|
| 193 |
+
value={}
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
with gr.Tab("处理日志"):
|
| 197 |
+
log_text = gr.Textbox(
|
| 198 |
+
label="详细日志",
|
| 199 |
+
lines=8,
|
| 200 |
+
max_lines=12,
|
| 201 |
+
interactive=False,
|
| 202 |
+
show_copy_button=True
|
| 203 |
+
)
|
| 204 |
+
log_download = gr.File(
|
| 205 |
+
label="下载日志文件",
|
| 206 |
+
visible=False
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
# 添加手动刷新按钮
|
| 212 |
+
with gr.Row():
|
| 213 |
+
refresh_btn = gr.Button("🔄 刷新状态", variant="secondary", size="sm")
|
| 214 |
+
refresh_btn.click(
|
| 215 |
+
fn=self._update_interface,
|
| 216 |
+
outputs=[status_text, task_info, result_text, result_json, stats_df, system_info, log_text]
|
| 217 |
+
)
|
| 218 |
+
|
| 219 |
+
# 事件处理
|
| 220 |
+
start_btn.click(
|
| 221 |
+
fn=self._process_files,
|
| 222 |
+
inputs=[
|
| 223 |
+
file_upload, priority_select, language_select,
|
| 224 |
+
disfluency_removal, timestamp_alignment, diarization_enabled,
|
| 225 |
+
speaker_count, channel_select, vocabulary_id,
|
| 226 |
+
phrase_id, special_word_filter
|
| 227 |
+
],
|
| 228 |
+
outputs=[status_text, task_info, log_text]
|
| 229 |
+
)
|
| 230 |
+
|
| 231 |
+
cancel_btn.click(
|
| 232 |
+
fn=self._cancel_current_task,
|
| 233 |
+
outputs=[status_text, task_info]
|
| 234 |
+
)
|
| 235 |
+
|
| 236 |
+
clear_btn.click(
|
| 237 |
+
fn=self._clear_interface,
|
| 238 |
+
outputs=[file_upload, result_text, result_json, stats_df, log_text, status_text, task_info]
|
| 239 |
+
)
|
| 240 |
+
|
| 241 |
+
# Initial refresh: runs once when the page loads (this is not a periodic timer)
|
| 242 |
+
interface.load(
|
| 243 |
+
fn=self._update_interface,
|
| 244 |
+
outputs=[status_text, task_info, result_text, result_json, stats_df, system_info, log_text]
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
return interface
|
| 248 |
+
|
| 249 |
+
def _get_custom_css(self) -> str:
    """Return an alternative custom CSS stylesheet as a string.

    NOTE(review): the visible `_create_interface` passes its own inline
    ``css`` to ``gr.Blocks`` and does not call this method — confirm
    whether it is still needed.
    """
    return """
    .gradio-container {
        max-width: 1200px !important;
    }
    .gr-button-primary {
        background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
        border: none !important;
    }
    .gr-button-primary:hover {
        transform: translateY(-2px) !important;
        box-shadow: 0 4px 12px rgba(0,0,0,0.15) !important;
    }
    .progress-bar {
        background: linear-gradient(90deg, #FF6B6B, #4ECDC4) !important;
    }
    """
|
| 267 |
+
|
| 268 |
+
def _get_system_info(self) -> Dict:
|
| 269 |
+
"""获取系统信息"""
|
| 270 |
+
stats = self.task_manager.get_statistics()
|
| 271 |
+
return {
|
| 272 |
+
"服务状态": "运行中",
|
| 273 |
+
"当前任务数": stats['total_tasks'],
|
| 274 |
+
"待处理": stats['pending'],
|
| 275 |
+
"处理中": stats['validating'] + stats['uploading'] + stats['transcribing'],
|
| 276 |
+
"已完成": stats['completed'],
|
| 277 |
+
"失败": stats['failed'],
|
| 278 |
+
"队列大小": stats['queue_size']
|
| 279 |
+
}
|
| 280 |
+
|
| 281 |
+
def _get_timestamp(self) -> str:
|
| 282 |
+
"""获取当前时间戳"""
|
| 283 |
+
from datetime import datetime
|
| 284 |
+
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
| 285 |
+
|
| 286 |
+
async def _process_files(
|
| 287 |
+
self,
|
| 288 |
+
files: List,
|
| 289 |
+
priority: str,
|
| 290 |
+
languages: List[str],
|
| 291 |
+
disfluency_removal: bool,
|
| 292 |
+
timestamp_alignment: bool,
|
| 293 |
+
diarization_enabled: bool,
|
| 294 |
+
speaker_count: Optional[int] | None,
|
| 295 |
+
channel_id: str,
|
| 296 |
+
vocabulary_id: str,
|
| 297 |
+
phrase_id: str,
|
| 298 |
+
special_word_filter: str
|
| 299 |
+
) -> Tuple[str, Dict, str]:
|
| 300 |
+
"""处理上传的文件
|
| 301 |
+
|
| 302 |
+
Args:
|
| 303 |
+
files: 上传的文件列表
|
| 304 |
+
languages: 选择的语言
|
| 305 |
+
priority: 任务优先级
|
| 306 |
+
channel_id: 音轨索引
|
| 307 |
+
disfluency_removal: 是否过滤语气词
|
| 308 |
+
timestamp_alignment: 是否启用时间戳校准
|
| 309 |
+
diarization_enabled: 是否启用说话人分离
|
| 310 |
+
speaker_count: 说话人数量参考值
|
| 311 |
+
vocabulary_id: 热词ID v2
|
| 312 |
+
phrase_id: 热词ID v1
|
| 313 |
+
special_word_filter: 敏感词过滤配置
|
| 314 |
+
|
| 315 |
+
Returns:
|
| 316 |
+
(状态信息, 任务信息, 日志信息)
|
| 317 |
+
"""
|
| 318 |
+
try:
|
| 319 |
+
if not files:
|
| 320 |
+
return "请先上传音频文件", {}, "错误: 未选择任何文件"
|
| 321 |
+
|
| 322 |
+
# 记录详细日志
|
| 323 |
+
log_messages = []
|
| 324 |
+
log_messages.append(f"[{self._get_timestamp()}] 开始处理文件上传请求")
|
| 325 |
+
log_messages.append(f"[{self._get_timestamp()}] 接收到 {len(files)} 个文件")
|
| 326 |
+
|
| 327 |
+
# 转换文件路径
|
| 328 |
+
file_paths = [Path(f.name) for f in files]
|
| 329 |
+
log_messages.append(f"[{self._get_timestamp()}] 转换文件路径完成")
|
| 330 |
+
|
| 331 |
+
# 显示文件信息
|
| 332 |
+
for i, file_path in enumerate(file_paths):
|
| 333 |
+
try:
|
| 334 |
+
file_size = file_path.stat().st_size
|
| 335 |
+
log_messages.append(f"[{self._get_timestamp()}] 文件 {i+1}: {file_path.name} (大小: {file_size} 字节)")
|
| 336 |
+
except Exception as e:
|
| 337 |
+
log_messages.append(f"[{self._get_timestamp()}] 文件 {i+1}: {file_path.name} (无法获取文件信息: {str(e)})")
|
| 338 |
+
|
| 339 |
+
# 解析音轨参数
|
| 340 |
+
try:
|
| 341 |
+
channel_list = [int(x.strip()) for x in channel_id.split(',') if x.strip()]
|
| 342 |
+
except ValueError:
|
| 343 |
+
channel_list = [0] # 默认为第一条音轨
|
| 344 |
+
|
| 345 |
+
# 验证说话人数量参数
|
| 346 |
+
validated_speaker_count = None
|
| 347 |
+
if speaker_count is not None:
|
| 348 |
+
if isinstance(speaker_count, (int, float)) and speaker_count >= 2 and speaker_count <= 100:
|
| 349 |
+
validated_speaker_count = int(speaker_count)
|
| 350 |
+
else:
|
| 351 |
+
log_messages.append(f"[{self._get_timestamp()}] 警告: 说话人数量无效({speaker_count}),将使用自动判断")
|
| 352 |
+
|
| 353 |
+
# 解析敏感词过滤参数
|
| 354 |
+
special_filter = None
|
| 355 |
+
if special_word_filter.strip():
|
| 356 |
+
try:
|
| 357 |
+
special_filter = json.loads(special_word_filter)
|
| 358 |
+
except json.JSONDecodeError as e:
|
| 359 |
+
log_messages.append(f"[{self._get_timestamp()}] 警告: 敏感词过滤配置格式错误,将使���默认设置")
|
| 360 |
+
|
| 361 |
+
# 创建任务
|
| 362 |
+
task_priority = TaskPriority.HIGH if priority == "HIGH" else TaskPriority.NORMAL
|
| 363 |
+
|
| 364 |
+
# 准备元数据,包含所有Paraformer参数
|
| 365 |
+
metadata = {
|
| 366 |
+
"languages": languages,
|
| 367 |
+
"file_count": len(file_paths),
|
| 368 |
+
"paraformer_params": {
|
| 369 |
+
"language_hints": languages,
|
| 370 |
+
"channel_id": channel_list,
|
| 371 |
+
"disfluency_removal_enabled": disfluency_removal,
|
| 372 |
+
"timestamp_alignment_enabled": timestamp_alignment,
|
| 373 |
+
"diarization_enabled": diarization_enabled,
|
| 374 |
+
"speaker_count": validated_speaker_count,
|
| 375 |
+
"vocabulary_id": vocabulary_id.strip() if vocabulary_id.strip() else None,
|
| 376 |
+
"phrase_id": phrase_id.strip() if phrase_id.strip() else None,
|
| 377 |
+
"special_word_filter": json.dumps(special_filter) if special_filter else None
|
| 378 |
+
}
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
log_messages.append(f"[{self._get_timestamp()}] 创建任务,优先级: {task_priority.value}")
|
| 382 |
+
log_messages.append(f"[{self._get_timestamp()}] 选择语言: {', '.join(languages) if languages else '自动识别'}")
|
| 383 |
+
|
| 384 |
+
self.current_task_id = await self.task_manager.create_task(
|
| 385 |
+
file_paths=file_paths,
|
| 386 |
+
priority=task_priority,
|
| 387 |
+
metadata=metadata
|
| 388 |
+
)
|
| 389 |
+
|
| 390 |
+
task = self.task_manager.get_task(self.current_task_id)
|
| 391 |
+
|
| 392 |
+
log_messages.append(f"[{self._get_timestamp()}] 任务创建成功,任务ID: {self.current_task_id}")
|
| 393 |
+
|
| 394 |
+
return (
|
| 395 |
+
f"任务已创建: {self.current_task_id}",
|
| 396 |
+
task.to_dict() if task else {},
|
| 397 |
+
"\n".join(log_messages) + f"\n开始处理 {len(file_paths)} 个文件...\n"
|
| 398 |
+
)
|
| 399 |
+
|
| 400 |
+
except Exception as e:
|
| 401 |
+
error_msg = f"创建任务失败: {str(e)}"
|
| 402 |
+
self.logger.exception(error_msg)
|
| 403 |
+
return error_msg, {}, f"错误: {error_msg}\n"
|
| 404 |
+
|
| 405 |
+
def _cancel_current_task(self) -> Tuple[str, Dict]:
|
| 406 |
+
"""取消当前任务"""
|
| 407 |
+
if not self.current_task_id:
|
| 408 |
+
return "没有正在执行的任务", {}
|
| 409 |
+
|
| 410 |
+
success = asyncio.create_task(
|
| 411 |
+
self.task_manager.cancel_task(self.current_task_id)
|
| 412 |
+
)
|
| 413 |
+
|
| 414 |
+
if success:
|
| 415 |
+
return f"任务 {self.current_task_id} 已取消", {}
|
| 416 |
+
else:
|
| 417 |
+
return "取消任务失败", {}
|
| 418 |
+
|
| 419 |
+
def _clear_interface(self) -> Tuple[None, str, Dict, List, str, str, Dict]:
|
| 420 |
+
"""清空界面"""
|
| 421 |
+
self.current_task_id = None
|
| 422 |
+
return (
|
| 423 |
+
None, # file_upload
|
| 424 |
+
"", # result_text
|
| 425 |
+
{}, # result_json
|
| 426 |
+
[], # stats_df
|
| 427 |
+
"", # log_text
|
| 428 |
+
"界面已清空,等待上传文件...", # status_text
|
| 429 |
+
{} # task_info
|
| 430 |
+
)
|
| 431 |
+
|
| 432 |
+
def _update_interface(self) -> Tuple[str, Dict, str, Dict, List, Dict, str]:
|
| 433 |
+
"""更新界面状态"""
|
| 434 |
+
# 更新当前任务状态
|
| 435 |
+
status_text = "等待上传文件..."
|
| 436 |
+
task_info = {}
|
| 437 |
+
result_text = ""
|
| 438 |
+
result_json = {}
|
| 439 |
+
stats_data = []
|
| 440 |
+
log_text = ""
|
| 441 |
+
|
| 442 |
+
if self.current_task_id:
|
| 443 |
+
task = self.task_manager.get_task(self.current_task_id)
|
| 444 |
+
if task:
|
| 445 |
+
task_info = task.to_dict()
|
| 446 |
+
status_text = f"[{task.status.value}] {task.progress.message}"
|
| 447 |
+
|
| 448 |
+
# 收集详细日志
|
| 449 |
+
log_text = self._collect_task_logs(task)
|
| 450 |
+
|
| 451 |
+
# 如果任务完成,显示结果
|
| 452 |
+
if task.status == TaskStatus.COMPLETED:
|
| 453 |
+
self.logger.debug(f"任务已完成,检查转录结果: {task.result.transcription_results}")
|
| 454 |
+
if task.result.transcription_results:
|
| 455 |
+
result_json = task.result.transcription_results
|
| 456 |
+
|
| 457 |
+
# 提取转录文本
|
| 458 |
+
transcriptions = result_json.get('transcriptions', [])
|
| 459 |
+
self.logger.debug(f"转录结果: {transcriptions}")
|
| 460 |
+
result_text = "\n\n".join([
|
| 461 |
+
f"文件: {t.get('file_url', '').split('/')[-1]}\n{t.get('text', '')}"
|
| 462 |
+
for t in transcriptions if t.get('text')
|
| 463 |
+
])
|
| 464 |
+
|
| 465 |
+
# 生成统计表格
|
| 466 |
+
stats_data = []
|
| 467 |
+
for t in transcriptions:
|
| 468 |
+
if 'error' not in t:
|
| 469 |
+
stats_data.append([
|
| 470 |
+
t.get('file_url', '').split('/')[-1],
|
| 471 |
+
f"{t.get('duration', 0):.1f}s",
|
| 472 |
+
len(t.get('text', '')),
|
| 473 |
+
t.get('language', 'unknown'),
|
| 474 |
+
round(t.get('confidence', 0), 3)
|
| 475 |
+
])
|
| 476 |
+
else:
|
| 477 |
+
self.logger.debug("任务已完成但没有转录结果")
|
| 478 |
+
elif task.status == TaskStatus.FAILED:
|
| 479 |
+
# 如果任务失败,显示错误信息
|
| 480 |
+
if task.result and task.result.error_message:
|
| 481 |
+
log_text += f"\n[{self._get_timestamp()}] 任务失败: {task.result.error_message}"
|
| 482 |
+
|
| 483 |
+
# 更新系统信息
|
| 484 |
+
system_info = self._get_system_info()
|
| 485 |
+
|
| 486 |
+
return status_text, task_info, result_text, result_json, stats_data, system_info, log_text
|
| 487 |
+
|
| 488 |
+
def _collect_task_logs(self, task) -> str:
|
| 489 |
+
"""收集任务的详细日志
|
| 490 |
+
|
| 491 |
+
Args:
|
| 492 |
+
task: 任务对象
|
| 493 |
+
|
| 494 |
+
Returns:
|
| 495 |
+
格式化的日志字符串
|
| 496 |
+
"""
|
| 497 |
+
if not task:
|
| 498 |
+
return "无任务信息"
|
| 499 |
+
|
| 500 |
+
log_lines = []
|
| 501 |
+
log_lines.append(f"[{self._get_timestamp()}] 任务ID: {task.id}")
|
| 502 |
+
log_lines.append(f"[{self._get_timestamp()}] 任务状态: {task.status.value}")
|
| 503 |
+
log_lines.append(f"[{self._get_timestamp()}] 任务创建时间: {task.created_at}")
|
| 504 |
+
|
| 505 |
+
# 添加进度信息
|
| 506 |
+
if task.progress:
|
| 507 |
+
log_lines.append(f"[{self._get_timestamp()}] 进度信息: {task.progress.message}")
|
| 508 |
+
# TaskProgress对象没有details属性,只使用message
|
| 509 |
+
|
| 510 |
+
# 添加文件信息
|
| 511 |
+
if hasattr(task, 'file_paths') and task.file_paths:
|
| 512 |
+
log_lines.append(f"[{self._get_timestamp()}] 文件列表:")
|
| 513 |
+
for i, file_path in enumerate(task.file_paths):
|
| 514 |
+
try:
|
| 515 |
+
file_size = file_path.stat().st_size
|
| 516 |
+
log_lines.append(f" {i+1}. {file_path.name} ({file_size} bytes)")
|
| 517 |
+
except Exception as e:
|
| 518 |
+
log_lines.append(f" {i+1}. {file_path.name} (无法获取文件信息: {str(e)})")
|
| 519 |
+
|
| 520 |
+
# 添加结果信息(如果任务已完成)
|
| 521 |
+
if task.status == TaskStatus.COMPLETED and task.result:
|
| 522 |
+
log_lines.append(f"[{self._get_timestamp()}] 任务完成时间: {task.completed_at}")
|
| 523 |
+
if hasattr(task.result, 'transcription_results') and task.result.transcription_results:
|
| 524 |
+
transcriptions = task.result.transcription_results.get('transcriptions', [])
|
| 525 |
+
log_lines.append(f"[{self._get_timestamp()}] 转录结果: {len(transcriptions)} 个文件")
|
| 526 |
+
|
| 527 |
+
# 添加错误信息(如果有的话)
|
| 528 |
+
# Task对象没有error属性,错误信息在result中
|
| 529 |
+
|
| 530 |
+
return "\n".join(log_lines)
|
| 531 |
+
|
| 532 |
+
def _on_task_status_change(self, task):
|
| 533 |
+
"""任务状态变化回调"""
|
| 534 |
+
self.logger.debug(f"任务状态变化: {task.id} -> {task.status.value}")
|
| 535 |
+
# 当任务状态变化时,不直接更新界面,而是依赖定时更新机制
|
| 536 |
+
# Gradio的回调中不能直接更新界面组件
|
| 537 |
+
|
| 538 |
+
def launch(self, **kwargs):
    """Launch the Gradio app.

    Args:
        **kwargs: Passed to ``Blocks.launch``; any key given here overrides
            the defaults assembled below.

    Returns:
        Whatever ``self.interface.launch`` returns.
    """
    debug_enabled = self.config.app.debug
    launch_options = {
        'server_name': '0.0.0.0',  # bind all interfaces to allow external access
        'server_port': self.config.app.port,
        'share': True,  # request a share link
        'debug': debug_enabled,
        'show_error': True,
        'quiet': not debug_enabled,
        **kwargs,
    }

    address = f"http://{launch_options['server_name']}:{launch_options['server_port']}"
    self.logger.info(f"启动Gradio界面: {address}")

    return self.interface.launch(**launch_options)
|
| 553 |
+
|
| 554 |
+
|
| 555 |
+
# Global interface instance, constructed as a side effect of importing this module.
gradio_interface = GradioInterface()
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
def get_gradio_interface() -> GradioInterface:
    """Return the module-level Gradio interface instance.

    Returns:
        The shared `GradioInterface` object created at import time.
    """
    return gradio_interface
|
| 566 |
+
|
| 567 |
+
|
| 568 |
+
def create_demo_interface() -> gr.Blocks:
    """Return the Blocks object of the module-level interface instance.

    Despite the name, no new interface is built; the ``interface``
    attribute of the shared instance is returned.

    Returns:
        The Gradio Blocks object.
    """
    return gradio_interface.interface
|
src/core/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""核心模块
|
| 2 |
+
|
| 3 |
+
包含应用程序的核心功能和基础组件。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .config import Config, get_config, reload_config
|
| 7 |
+
from .task_manager import TaskManager, TaskStatus, TaskPriority, Task, get_task_manager, task_manager
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"Config",
|
| 11 |
+
"get_config",
|
| 12 |
+
"reload_config",
|
| 13 |
+
"TaskManager",
|
| 14 |
+
"TaskStatus",
|
| 15 |
+
"TaskPriority",
|
| 16 |
+
"Task",
|
| 17 |
+
"get_task_manager",
|
| 18 |
+
"task_manager"
|
| 19 |
+
]
|
src/core/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (536 Bytes). View file
|
|
|
src/core/__pycache__/config.cpython-310.pyc
ADDED
|
Binary file (5.46 kB). View file
|
|
|
src/core/__pycache__/task_manager.cpython-310.pyc
ADDED
|
Binary file (14.3 kB). View file
|
|
|
src/core/config.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""配置管理模块
|
| 2 |
+
|
| 3 |
+
提供应用程序配置的加载和管理功能。
|
| 4 |
+
支持多环境配置和环境变量覆盖。
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Any, Dict, Optional
|
| 10 |
+
import yaml
|
| 11 |
+
from pydantic import Field
|
| 12 |
+
from pydantic_settings import BaseSettings
|
| 13 |
+
from pydantic_settings import SettingsConfigDict
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class AppConfig(BaseSettings):
    """Application-level settings (environment prefix ``APP_``, ``.env`` file)."""
    model_config = SettingsConfigDict(
        env_prefix="APP_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore"  # ignore extra fields
    )

    name: str = "音频转文字服务"
    version: str = "1.0.0"
    debug: bool = False
    host: str = "127.0.0.1"
    port: int = 7860
    max_file_size: int = 2147483648  # 2GB
    max_files_count: int = 100
    concurrent_tasks: int = 5
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class OSSConfig(BaseSettings):
    """OSS settings (environment prefix ``OSS_``, ``.env`` file).

    The first four fields have no default and must be provided.
    """
    model_config = SettingsConfigDict(
        env_prefix="OSS_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore"
    )

    endpoint: str = Field(..., description="OSS服务端点")
    access_key_id: str = Field(..., description="访问密钥ID")
    access_key_secret: str = Field(..., description="访问密钥密码")
    bucket_name: str = Field(..., description="存储桶名称")
    upload_timeout: int = 300
    url_expire_hours: int = 24
    temp_prefix: str = "temp/audio"
    auto_cleanup_days: int = 7
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class DashScopeConfig(BaseSettings):
    """Alibaba Cloud Bailian (DashScope) API settings.

    Uses environment prefix ``DASHSCOPE_`` and the ``.env`` file.
    ``api_key`` has no default and must be provided.
    """
    model_config = SettingsConfigDict(
        env_prefix="DASHSCOPE_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore"
    )

    api_key: str = Field(..., description="API密钥")
    base_url: str = "https://dashscope.aliyuncs.com/api/v1"
    model: str = "paraformer-v2"
    timeout: int = 300
    max_retries: int = 3
    retry_delay: int = 5
    language_hints: list[str] = ["zh", "en"]
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
class TaskConfig(BaseSettings):
    """Task settings (environment prefix ``TASK_``, ``.env`` file)."""
    model_config = SettingsConfigDict(
        env_prefix="TASK_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore"
    )

    status_check_interval: int = 2
    max_processing_time: int = 3600  # 1 hour
    queue_size: int = 1000
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class LoggingConfig(BaseSettings):
    """Logging settings (environment prefix ``LOGGING_``, ``.env`` file)."""
    model_config = SettingsConfigDict(
        env_prefix="LOGGING_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore"
    )

    level: str = "INFO"
    format: str = "structured"
    file_max_size: str = "10MB"
    backup_count: int = 5
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class Config:
    """Configuration manager.

    Loads ``config/environments/<environment>.yaml`` from the project root
    and builds the per-area settings objects from it.
    """

    def __init__(self, environment: Optional[str] = None):
        """Initialize the configuration manager.

        Args:
            environment: Environment name (development/production). When
                omitted, the ``ENVIRONMENT`` variable is used, falling back
                to ``"development"``.

        Raises:
            FileNotFoundError: If the environment's YAML file is missing.
            ValueError: If the YAML file's top level is not a mapping.
        """
        self.environment = environment or os.getenv("ENVIRONMENT", "development")
        self._config_data = self._load_config()

        # NOTE(review): values passed as keyword arguments presumably take
        # precedence over environment variables in pydantic-settings, so the
        # YAML values for app/task/logging may mask APP_*/TASK_*/LOGGING_*
        # variables — confirm this is intended.
        self.app = AppConfig(**self._section("app"))

        # OSS settings are built without YAML input so that environment
        # variables supply them.
        self.oss = OSSConfig()

        # DashScope settings likewise come from the environment only.
        self.dashscope = DashScopeConfig()

        self.task = TaskConfig(**self._section("task"))
        self.logging = LoggingConfig(**self._section("logging"))

    def _section(self, name: str) -> Dict[str, Any]:
        """Return one top-level YAML section, or ``{}`` if absent or empty."""
        # A key written with no values (e.g. a bare ``app:``) parses as None.
        return self._config_data.get(name) or {}

    def _load_config(self) -> Dict[str, Any]:
        """Load the YAML file for the current environment.

        Returns:
            The parsed mapping; an empty file yields ``{}``.
        """
        config_dir = Path(__file__).parent.parent.parent / "config" / "environments"
        config_file = config_dir / f"{self.environment}.yaml"

        if not config_file.exists():
            raise FileNotFoundError(f"配置文件不存在: {config_file}")

        with open(config_file, 'r', encoding='utf-8') as file:
            data = yaml.safe_load(file)

        # safe_load returns None for an empty document; callers expect a dict.
        if data is None:
            return {}
        if not isinstance(data, dict):
            raise ValueError(f"配置文件格式错误: {config_file}")
        return data

    def get_project_root(self) -> Path:
        """Return the project root (three levels above this file)."""
        return Path(__file__).parent.parent.parent

    def get_logs_dir(self) -> Path:
        """Return the ``logs`` directory under the project root, creating it if needed."""
        logs_dir = self.get_project_root() / "logs"
        logs_dir.mkdir(exist_ok=True)
        return logs_dir

    def get_temp_dir(self) -> Path:
        """Return the ``temp`` directory under the project root, creating it if needed."""
        temp_dir = self.get_project_root() / "temp"
        temp_dir.mkdir(exist_ok=True)
        return temp_dir
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
# Global configuration instance, constructed as a side effect of importing this module.
config = Config()
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def get_config() -> Config:
    """Return the current module-level configuration instance."""
    return config
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def reload_config(environment: Optional[str] = None) -> Config:
    """Rebuild the module-level configuration and return the new instance.

    Args:
        environment: Environment name passed through to `Config`.

    NOTE(review): objects that stored an earlier `get_config()` result keep
    the old instance; only this module's global is rebound — confirm that
    callers re-fetch after a reload.
    """
    global config
    config = Config(environment)
    return config
|
src/core/task_manager.py
ADDED
|
@@ -0,0 +1,462 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""任务管理模块
|
| 2 |
+
|
| 3 |
+
提供任务状态跟踪、进度管理和任务队列功能。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import time
|
| 8 |
+
import uuid
|
| 9 |
+
from dataclasses import dataclass, field
|
| 10 |
+
from datetime import datetime, timedelta
|
| 11 |
+
from enum import Enum
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Dict, List, Optional, Callable, Any
|
| 14 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 15 |
+
|
| 16 |
+
from ..core.config import get_config
|
| 17 |
+
from ..utils.logger import get_task_logger
|
| 18 |
+
from ..services.file_validator import get_file_validator
|
| 19 |
+
from ..services.oss_service import get_oss_service
|
| 20 |
+
from ..services.paraformer_service import get_paraformer_service
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class TaskStatus(Enum):
    """Lifecycle states of a task."""
    # Default state of a newly created Task.
    PENDING = "pending"
    # Stages a running task is moved through, in this order.
    VALIDATING = "validating"
    UPLOADING = "uploading"
    TRANSCRIBING = "transcribing"
    # Finished states: such tasks cannot be cancelled and are eligible for cleanup.
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class TaskPriority(Enum):
    """Task priority levels; a larger value means a higher priority."""
    # NOTE(review): the priority is stored on each Task, but the manager in this
    # module queues task ids in a plain asyncio.Queue, so it does not appear to
    # influence execution order here - confirm whether that is intended.
    LOW = 1
    NORMAL = 2
    HIGH = 3
    URGENT = 4
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@dataclass
class TaskProgress:
    """Progress of the stage a task is currently in.

    ``percentage`` is derived from ``current`` and ``total`` every time
    :meth:`update` runs; it is not computed at construction time.
    """
    stage: str = ""          # label of the current processing stage
    current: int = 0         # units of work finished so far
    total: int = 100         # units of work expected in this stage
    message: str = ""        # human-readable status text
    percentage: float = 0.0  # completion in the range 0.0 - 100.0

    def update(self, current: Optional[int] = None, total: Optional[int] = None,
               message: Optional[str] = None):
        """Update the given fields and recompute ``percentage``.

        Args:
            current: New amount of finished work; ``None`` keeps the old value.
            total: New amount of expected work; ``None`` keeps the old value.
            message: New status text; ``None`` keeps the old value.
        """
        if current is not None:
            self.current = current
        if total is not None:
            self.total = total
        if message is not None:
            self.message = message

        if self.total > 0:
            ratio = (self.current / self.total) * 100
            # Clamp on both sides so a negative or overshooting counter can
            # never produce a percentage outside 0 - 100.
            self.percentage = max(0.0, min(100.0, ratio))
        else:
            # With no work expected a previously computed percentage would be
            # stale, so reset it instead of leaving it untouched.
            self.percentage = 0.0
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
@dataclass
class TaskResult:
    """Outcome of a task."""
    success: bool = False
    data: Optional[Dict] = None
    error_message: Optional[str] = None
    processed_files: List[str] = field(default_factory=list)
    failed_files: List[str] = field(default_factory=list)
    transcription_results: Optional[Dict] = None
    duration: float = 0.0

    def to_dict(self) -> Dict:
        """Return the result as a plain dict keyed by field name.

        Values are the attribute objects themselves, not copies.
        """
        exported = (
            'success',
            'data',
            'error_message',
            'processed_files',
            'failed_files',
            'transcription_results',
            'duration',
        )
        return {name: getattr(self, name) for name in exported}
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
@dataclass
class Task:
    """A unit of work: a batch of files plus its status, progress and result."""
    id: str = field(default_factory=lambda: str(uuid.uuid4())[:8])
    status: TaskStatus = TaskStatus.PENDING
    priority: TaskPriority = TaskPriority.NORMAL
    file_paths: List[Path] = field(default_factory=list)
    progress: TaskProgress = field(default_factory=TaskProgress)
    result: TaskResult = field(default_factory=TaskResult)
    created_at: datetime = field(default_factory=datetime.now)
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    callback: Optional[Callable] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Return a dict view of the task.

        Enums are exported by value, timestamps as ISO-8601 strings (or
        ``None`` when unset) and files by base name only. ``callback`` is
        not included.
        """
        def _iso(moment):
            # Unset timestamps are exported as None.
            return moment.isoformat() if moment else None

        names = [path.name for path in self.file_paths]
        progress = self.progress
        progress_view = {
            'stage': progress.stage,
            'current': progress.current,
            'total': progress.total,
            'percentage': progress.percentage,
            'message': progress.message
        }

        return {
            'id': self.id,
            'status': self.status.value,
            'priority': self.priority.value,
            'file_count': len(names),
            'file_names': names,
            'progress': progress_view,
            'result': self.result.to_dict(),
            'created_at': _iso(self.created_at),
            'started_at': _iso(self.started_at),
            'completed_at': _iso(self.completed_at),
            'metadata': self.metadata
        }
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
class TaskManager:
    """Creates, queues, runs and tracks tasks.

    Tasks are stored in ``self.tasks`` (id -> Task). Their ids travel through
    an ``asyncio.Queue`` to one background consumer coroutine, which takes
    each task through file validation, OSS upload and transcription in turn.
    """

    # Statuses after which a task is finished: it can no longer be cancelled
    # and becomes eligible for cleanup.
    _TERMINAL_STATUSES = (TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.CANCELLED)

    def __init__(self):
        """Set up storage, services and (when a loop is running) the consumer."""
        self.config = get_config()
        self.logger = get_task_logger(logger_name="transcript_service.task")

        # Task storage
        self.tasks: Dict[str, Task] = {}
        self.task_queue: asyncio.Queue = asyncio.Queue(maxsize=self.config.task.queue_size)

        # Service instances
        self.file_validator = get_file_validator()
        self.oss_service = get_oss_service()
        self.paraformer_service = get_paraformer_service()

        # Worker thread pool
        self.executor = ThreadPoolExecutor(max_workers=self.config.app.concurrent_tasks)

        # Status-change callbacks
        self.status_callbacks: List[Callable] = []

        # Consumer state. The asyncio task is kept on the instance because the
        # event loop only holds a weak reference to it; without a strong
        # reference the consumer could be garbage-collected while running.
        self._processor_started = False
        self._processor_task: Optional[asyncio.Task] = None

        # Start the consumer now if an event loop is already running
        self._start_task_processor()

    def add_status_callback(self, callback: Callable):
        """Register a callback invoked with the Task whenever its status changes.

        Args:
            callback: Callable taking the task as its only argument.
        """
        self.status_callbacks.append(callback)

    def _notify_status_change(self, task: Task):
        """Call every registered callback; a failing callback is logged, not raised."""
        for callback in self.status_callbacks:
            try:
                callback(task)
            except Exception as e:
                self.logger.error(f"回调函数执行失败: {str(e)}")

    async def create_task(self, file_paths: List[Path], priority: TaskPriority = TaskPriority.NORMAL,
                          metadata: Optional[Dict[str, Any]] = None) -> str:
        """Create a task and enqueue it for processing.

        Args:
            file_paths: Files the task should process.
            priority: Priority recorded on the task.
            metadata: Optional task metadata; ``None`` becomes an empty dict.

        Returns:
            The id of the new task.
        """
        # Make sure a live consumer exists (also restarts one that has died).
        self._ensure_processor_started()

        task = Task(
            file_paths=file_paths,
            priority=priority,
            metadata=metadata or {}
        )

        self.tasks[task.id] = task

        # Waits while the queue is full.
        await self.task_queue.put(task.id)

        self.logger.info(f"创建任务: {task.id}, 文件数量: {len(file_paths)}")
        return task.id

    def get_task(self, task_id: str) -> Optional[Task]:
        """Return the task with ``task_id``, or ``None`` if it is unknown."""
        return self.tasks.get(task_id)

    def get_all_tasks(self) -> List[Task]:
        """Return all known tasks."""
        return list(self.tasks.values())

    def get_tasks_by_status(self, status: TaskStatus) -> List[Task]:
        """Return the tasks whose status equals ``status``."""
        return [task for task in self.tasks.values() if task.status == status]

    async def cancel_task(self, task_id: str) -> bool:
        """Mark a task as cancelled.

        Args:
            task_id: Id of the task to cancel.

        Returns:
            ``True`` if the task was cancelled, ``False`` if it does not
            exist or has already finished.
        """
        task = self.get_task(task_id)
        if not task:
            return False

        if task.status in self._TERMINAL_STATUSES:
            return False

        task.status = TaskStatus.CANCELLED
        task.completed_at = datetime.now()
        task.progress.message = "任务已取消"

        self._notify_status_change(task)
        self.logger.info(f"任务已取消: {task_id}")
        return True

    def _spawn_processor(self) -> bool:
        """Start the consumer coroutine on the running loop.

        Returns:
            ``True`` if the consumer was scheduled, ``False`` if no event
            loop is running in the current thread.
        """
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            return False
        self._processor_task = loop.create_task(self._process_tasks())
        return True

    def _start_task_processor(self):
        """Start the consumer if a loop is running; otherwise defer quietly."""
        if self._spawn_processor():
            self._processor_started = True
        else:
            # No running loop yet: the consumer is started on first use.
            self.logger.debug("没有运行的事件循环，任务处理器将在需要时启动")
            self._processor_started = False

    def _ensure_processor_started(self):
        """Start the consumer unless one is already running."""
        worker = self._processor_task
        if self._processor_started and (worker is None or not worker.done()):
            return

        if self._spawn_processor():
            self._processor_started = True
        else:
            self._processor_started = False
            self.logger.warning("无法启动任务处理器：没有运行的事件循环")

    async def _process_tasks(self):
        """Consume task ids from the queue forever, running one task at a time."""
        while True:
            try:
                task_id = await self.task_queue.get()
                try:
                    task = self.get_task(task_id)
                    # Unknown ids and tasks cancelled while queued are dropped.
                    if task and task.status != TaskStatus.CANCELLED:
                        await self._execute_task(task)
                finally:
                    # Always balance get() so the queue's unfinished counter
                    # cannot drift when something above raises.
                    self.task_queue.task_done()

            except Exception as e:
                self.logger.exception(f"处理任务队列时发生错误: {str(e)}")
                await asyncio.sleep(1)

    def _stop_if_cancelled(self, task: Task) -> bool:
        """Return ``True`` (and log it) if ``task`` was cancelled while running."""
        if task.status != TaskStatus.CANCELLED:
            return False
        self.logger.info(f"任务已取消，停止执行: {task.id}")
        return True

    def _fail_task(self, task: Task, error_message: str):
        """Mark ``task`` as failed with ``error_message`` and notify callbacks."""
        task.status = TaskStatus.FAILED
        task.result.error_message = error_message
        task.completed_at = datetime.now()
        self._notify_status_change(task)

    async def _execute_task(self, task: Task):
        """Run one task through validation, upload and transcription.

        A cancellation that arrives while a stage is awaiting is honoured
        before the next stage starts, so the CANCELLED status set by
        ``cancel_task`` is never overwritten.

        Args:
            task: The task to run.
        """
        try:
            # Attach the task id to subsequent log records
            self.logger.set_task_id(task.id)

            task.status = TaskStatus.VALIDATING
            task.started_at = datetime.now()
            task.progress.stage = "文件验证"
            task.progress.update(0, 100, "开始验证文件")
            self._notify_status_change(task)

            # 1. File validation
            valid_files, invalid_files = await self._validate_files(task)
            # Record rejected files even when others pass, so partial
            # failures are visible in the result.
            task.result.failed_files = [str(entry[0]) for entry in invalid_files]
            if not valid_files:
                self._fail_task(task, "没有有效的文件")
                return
            if self._stop_if_cancelled(task):
                return

            # 2. File upload
            task.status = TaskStatus.UPLOADING
            task.progress.stage = "文件上传"
            task.progress.update(0, len(valid_files), "开始上传文件到OSS")
            self._notify_status_change(task)

            upload_results = await self._upload_files(task, valid_files)
            if self._stop_if_cancelled(task):
                return

            # Each upload result is (file name, success, url or error, object key).
            successful_uploads = [r for r in upload_results if r[1]]
            task.result.failed_files.extend(r[0] for r in upload_results if not r[1])

            if not successful_uploads:
                self._fail_task(task, "文件上传失败")
                return

            # 3. Transcription
            task.status = TaskStatus.TRANSCRIBING
            task.progress.stage = "语音转录"
            task.progress.update(0, 100, "开始语音转录")
            self._notify_status_change(task)

            file_urls = [r[2] for r in successful_uploads]
            success, transcription_result, error = await self._transcribe_audio(task, file_urls)
            if self._stop_if_cancelled(task):
                return

            # 4. Finish the task
            task.completed_at = datetime.now()
            task.result.duration = (task.completed_at - task.started_at).total_seconds()

            if success:
                task.status = TaskStatus.COMPLETED
                task.result.success = True
                task.result.transcription_results = transcription_result
                task.result.processed_files = [r[0] for r in successful_uploads]
                task.progress.update(100, 100, "转录完成")
            else:
                task.status = TaskStatus.FAILED
                task.result.error_message = error

            self._notify_status_change(task)

        except Exception as e:
            self.logger.exception(f"执行任务时发生错误: {task.id}")
            # A task the user already cancelled stays cancelled.
            if task.status != TaskStatus.CANCELLED:
                self._fail_task(task, f"任务执行失败: {str(e)}")
        finally:
            self.logger.clear_task_id()

    async def _validate_files(self, task: Task) -> tuple:
        """Validate the task's files.

        Returns:
            ``(valid_files, invalid_files)`` as produced by the file validator.
        """
        self.logger.info(f"开始验证 {len(task.file_paths)} 个文件")

        valid_files, invalid_files = self.file_validator.validate_multiple_files(task.file_paths)

        task.progress.update(100, 100, f"验证完成: {len(valid_files)} 个有效文件")
        self.logger.info(f"文件验证完成: {len(valid_files)} 个有效文件, {len(invalid_files)} 个无效文件")

        return valid_files, invalid_files

    async def _upload_files(self, task: Task, file_paths: List[Path]) -> List[tuple]:
        """Upload files one by one, stopping early if the task is cancelled.

        Returns:
            One ``(file name, success, url or error, object key)`` tuple per
            upload that was attempted.
        """
        self.logger.info(f"开始上传 {len(file_paths)} 个文件")

        results = []
        total = len(file_paths)
        for done, file_path in enumerate(file_paths, start=1):
            if task.status == TaskStatus.CANCELLED:
                break

            success, url_or_error, object_key = await self.oss_service.upload_file(file_path, task.id)
            results.append((file_path.name, success, url_or_error, object_key))

            # The task may have been cancelled during the upload; leave the
            # cancellation message in place instead of overwriting it.
            if task.status == TaskStatus.CANCELLED:
                break

            # Update progress
            task.progress.update(done, total, f"已上传 {done}/{total} 个文件")
            self._notify_status_change(task)

        uploaded = sum(1 for r in results if r[1])
        self.logger.info(f"文件上传完成: {uploaded} 个成功")
        return results

    async def _transcribe_audio(self, task: Task, file_urls: List[str]) -> tuple:
        """Transcribe the uploaded files.

        Uses ``task.metadata['paraformer_params']`` as custom parameters when
        that key is present.

        Returns:
            ``(success, results, error)`` as returned by the Paraformer service.
        """
        self.logger.info(f"开始转录 {len(file_urls)} 个音频文件")

        # Optional per-task Paraformer parameters
        paraformer_params = None
        if 'paraformer_params' in task.metadata:
            paraformer_params = task.metadata['paraformer_params']
            self.logger.info(f"使用自定义Paraformer参数: {paraformer_params}")

        success, results, error = await self.paraformer_service.batch_process_with_retry(
            file_urls, task.id, paraformer_params
        )

        if success:
            # Do not replace the cancellation message of a cancelled task.
            if task.status != TaskStatus.CANCELLED:
                task.progress.update(100, 100, "转录完成")
            self.logger.info(f"转录完成: {len(file_urls)} 个文件")
        else:
            self.logger.error(f"转录失败: {error}")

        return success, results, error

    def cleanup_completed_tasks(self, hours: int = 24):
        """Forget finished tasks that completed more than ``hours`` ago.

        Args:
            hours: Retention period in hours.
        """
        cutoff_time = datetime.now() - timedelta(hours=hours)
        expired = [
            task_id
            for task_id, task in self.tasks.items()
            if task.status in self._TERMINAL_STATUSES
            and task.completed_at
            and task.completed_at < cutoff_time
        ]

        for task_id in expired:
            del self.tasks[task_id]

        self.logger.info(f"清理了 {len(expired)} 个过期任务")

    def get_statistics(self) -> Dict:
        """Return the total task count, one count per status and the queue size."""
        # One pass over the tasks instead of one scan per status.
        counts = {status.value: 0 for status in TaskStatus}
        for task in self.tasks.values():
            counts[task.status.value] += 1

        return {
            'total_tasks': len(self.tasks),
            **counts,
            'queue_size': self.task_queue.qsize()
        }
|
| 447 |
+
|
| 448 |
+
|
| 449 |
+
# Global task manager instance; stays None until get_task_manager() creates it.
task_manager: Optional["TaskManager"] = None
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
def get_task_manager() -> TaskManager:
    """Return the shared TaskManager, creating it on the first call.

    Returns:
        The module-level task manager instance.
    """
    global task_manager
    if task_manager is not None:
        return task_manager

    task_manager = TaskManager()
    return task_manager
|
src/services/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""服务模块
|
| 2 |
+
|
| 3 |
+
包含应用程序的核心业务逻辑服务。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .file_validator import FileValidator, get_file_validator, file_validator
|
| 7 |
+
from .oss_service import OSSService, get_oss_service, oss_service
|
| 8 |
+
from .paraformer_service import ParaformerService, get_paraformer_service, paraformer_service
|
| 9 |
+
|
| 10 |
+
# Names re-exported by this package from the three service modules.
__all__ = [
    "FileValidator",
    "get_file_validator",
    "file_validator",
    "OSSService",
    "get_oss_service",
    "oss_service",
    "ParaformerService",
    "get_paraformer_service",
    "paraformer_service"
]
|
src/services/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (585 Bytes). View file
|
|
|
src/services/__pycache__/file_validator.cpython-310.pyc
ADDED
|
Binary file (7.32 kB). View file
|
|
|
src/services/__pycache__/oss_service.cpython-310.pyc
ADDED
|
Binary file (8.51 kB). View file
|
|
|
src/services/__pycache__/paraformer_service.cpython-310.pyc
ADDED
|
Binary file (9.86 kB). View file
|
|
|
src/services/file_validator.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""文件验证模块
|
| 2 |
+
|
| 3 |
+
提供音频文件格式验证、大小检查等功能。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import magic
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import List, Optional, Tuple
|
| 9 |
+
import mimetypes
|
| 10 |
+
|
| 11 |
+
from ..core.config import get_config
|
| 12 |
+
from ..utils.logger import get_task_logger
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class FileValidator:
    """Validates media files before processing: existence, size, extension,
    MIME type and basic readability."""

    # Supported file extensions (lower case, with leading dot)
    SUPPORTED_EXTENSIONS = {
        '.aac', '.amr', '.avi', '.flac', '.flv', '.m4a', '.mkv',
        '.mov', '.mp3', '.mp4', '.mpeg', '.ogg', '.opus', '.wav',
        '.webm', '.wma', '.wmv'
    }

    # Supported MIME types
    SUPPORTED_MIME_TYPES = {
        'audio/aac', 'audio/amr', 'audio/flac', 'audio/mp3', 'audio/mpeg',
        'audio/mp4', 'audio/ogg', 'audio/opus', 'audio/wav', 'audio/webm',
        'audio/x-wav', 'audio/x-flac', 'audio/x-m4a',
        'video/mp4', 'video/avi', 'video/x-flv', 'video/quicktime',
        'video/x-msvideo', 'video/webm', 'video/x-ms-wmv'
    }

    def __init__(self):
        """Load configuration and try to initialise libmagic."""
        self.config = get_config()
        self.logger = get_task_logger(logger_name="transcript_service.validator")

        # libmagic is optional: without it only the fallback checks are used.
        try:
            self.magic = magic.Magic(mime=True)
        except Exception as e:
            self.logger.warning(f"无法初始化libmagic: {str(e)}, 将使用基础验证")
            self.magic = None

    def validate_file(self, file_path: Path) -> Tuple[bool, Optional[str]]:
        """Validate a single file.

        Args:
            file_path: Path of the file; a plain string is accepted as well.

        Returns:
            ``(is_valid, error_message)``; the message is ``None`` on success.
        """
        # Coerce first so the error handler below can rely on ``.name``.
        try:
            file_path = Path(file_path)
        except TypeError:
            return False, f"无效的文件路径: {file_path!r}"

        try:
            # The path must exist ...
            if not file_path.exists():
                return False, f"文件不存在: {file_path}"

            # ... and be a regular file
            if not file_path.is_file():
                return False, f"不是有效的文件: {file_path}"

            # Size limits
            file_size = file_path.stat().st_size
            if file_size == 0:
                return False, f"文件为空: {file_path.name}"

            if file_size > self.config.app.max_file_size:
                size_mb = file_size / (1024 * 1024)
                max_size_mb = self.config.app.max_file_size / (1024 * 1024)
                return False, f"文件大小 {size_mb:.1f}MB 超过限制 {max_size_mb:.1f}MB: {file_path.name}"

            # Extension
            file_ext = file_path.suffix.lower()
            if file_ext not in self.SUPPORTED_EXTENSIONS:
                return False, f"不支持的文件格式 {file_ext}: {file_path.name}"

            # MIME type
            if not self._check_mime_type(file_path):
                return False, f"文件内容与扩展名不匹配: {file_path.name}"

            # Readability
            if not self._check_file_integrity(file_path):
                return False, f"文件可能损坏或不完整: {file_path.name}"

            self.logger.info(f"文件验证通过: {file_path.name}")
            return True, None

        except Exception as e:
            error_msg = f"验证文件时发生错误: {file_path.name}, 错误: {str(e)}"
            self.logger.exception(error_msg)
            return False, error_msg

    def validate_multiple_files(self, file_paths: List[Path]) -> Tuple[List[Path], List[Tuple[Path, str]]]:
        """Validate a batch of files.

        Only the first ``max_files_count`` files are validated. Files beyond
        that limit are returned as invalid with an explanatory message
        rather than being dropped silently.

        Args:
            file_paths: Paths of the files to validate.

        Returns:
            ``(valid_files, invalid_files)`` where ``invalid_files`` holds
            ``(path, error_message)`` pairs.
        """
        limit = self.config.app.max_files_count
        if len(file_paths) > limit:
            self.logger.warning(f"文件数量 {len(file_paths)} 超过限制 {limit}")

        valid_files = []
        invalid_files = []

        for file_path in file_paths[:limit]:
            is_valid, error_msg = self.validate_file(file_path)
            if is_valid:
                valid_files.append(file_path)
            else:
                invalid_files.append((file_path, error_msg))

        # Report the files that did not fit into the batch.
        skipped = file_paths[limit:]
        if skipped:
            self.logger.warning(f"跳过了 {len(skipped)} 个文件（超过批处理限制）")
            invalid_files.extend(
                (file_path, f"超过批处理文件数量限制 {limit}: {Path(file_path).name}")
                for file_path in skipped
            )

        self.logger.info(f"文件验证完成: {len(valid_files)} 个有效文件, {len(invalid_files)} 个无效文件")
        return valid_files, invalid_files

    def _check_mime_type(self, file_path: Path) -> bool:
        """Check the file's MIME type, leniently.

        Tries libmagic first, then the extension-based ``mimetypes`` guess,
        then the header check. Any error during the check counts as a pass.

        Args:
            file_path: Path of the file.

        Returns:
            Whether the file is accepted.
        """
        try:
            # Content-based detection via libmagic
            if self.magic:
                mime_type = self.magic.from_file(str(file_path))
                if mime_type in self.SUPPORTED_MIME_TYPES:
                    return True

            # Fallback: guess from the file name
            mime_type, _ = mimetypes.guess_type(str(file_path))
            if mime_type and mime_type in self.SUPPORTED_MIME_TYPES:
                return True

            # Last resort: inspect the file header
            return self._check_file_header(file_path)

        except Exception as e:
            self.logger.warning(f"检查MIME类型时发生错误: {file_path.name}, 错误: {str(e)}")
            # A failed MIME check does not reject a file with a valid extension
            return True

    def _check_file_header(self, file_path: Path) -> bool:
        """Inspect the first 16 bytes of the file for known signatures.

        The check is deliberately permissive: an unrecognised header is
        logged but still accepted. Only a file with no readable bytes is
        rejected.

        Args:
            file_path: Path of the file.

        Returns:
            ``False`` if no bytes could be read, ``True`` otherwise.
        """
        try:
            with open(file_path, 'rb') as f:
                header = f.read(16)

            if not header:
                return False

            recognised = (
                # ID3 tag, FLAC, Ogg and MPEG audio frame sync markers
                header.startswith((b'ID3', b'fLaC', b'OggS', b'\xff\xfb', b'\xff\xfa'))
                # ISO base media (MP4 family) box
                or header[4:8] == b'ftyp'
                # RIFF container holding WAVE data
                or (header.startswith(b'RIFF') and b'WAVE' in header)
            )
            if not recognised:
                self.logger.debug(f"未识别的文件头，按扩展名放行: {file_path.name}")

            # The extension has already been validated, so accept either way
            return True

        except Exception as e:
            self.logger.warning(f"检查文件头时发生错误: {file_path.name}, 错误: {str(e)}")
            return True

    def _check_file_integrity(self, file_path: Path) -> bool:
        """Check that the beginning and the end of the file can be read.

        Args:
            file_path: Path of the file.

        Returns:
            ``True`` if both reads succeed, ``False`` on any error.
        """
        try:
            with open(file_path, 'rb') as f:
                f.read(1024)  # first 1 KB
                # Seek relative to the end of the file (whence=2) to read the
                # last 1 KB, or the whole file when it is smaller than that.
                f.seek(-min(1024, file_path.stat().st_size), 2)
                f.read()

            return True

        except Exception as e:
            self.logger.warning(f"检查文件完整性时发生错误: {file_path.name}, 错误: {str(e)}")
            return False

    def get_file_info(self, file_path: Path) -> dict:
        """Collect basic information about a file.

        Args:
            file_path: Path of the file.

        Returns:
            A dict with name, size, extension, MIME type, modification time
            and whether the extension is supported; on failure a dict with
            only ``name`` and ``error``.
        """
        try:
            stat = file_path.stat()

            # Prefer libmagic, fall back to the extension-based guess
            mime_type = None
            if self.magic:
                try:
                    mime_type = self.magic.from_file(str(file_path))
                except Exception as e:
                    # Best effort only, but do not swallow KeyboardInterrupt
                    # or SystemExit as a bare ``except`` would.
                    self.logger.debug(f"libmagic 检测失败: {file_path.name}, 错误: {str(e)}")

            if not mime_type:
                mime_type, _ = mimetypes.guess_type(str(file_path))

            return {
                'name': file_path.name,
                'size': stat.st_size,
                'size_mb': round(stat.st_size / (1024 * 1024), 2),
                'extension': file_path.suffix.lower(),
                'mime_type': mime_type,
                'modified_time': stat.st_mtime,
                'is_supported': file_path.suffix.lower() in self.SUPPORTED_EXTENSIONS
            }

        except Exception as e:
            self.logger.error(f"获取文件信息失败: {file_path.name}, 错误: {str(e)}")
            return {
                'name': file_path.name,
                'error': str(e)
            }

    def get_supported_formats(self) -> dict:
        """Describe the supported formats and the configured limits.

        Returns:
            A dict with the sorted extensions, sorted MIME types, maximum
            file size in MB and maximum number of files per batch.
        """
        return {
            'extensions': sorted(self.SUPPORTED_EXTENSIONS),
            'mime_types': sorted(self.SUPPORTED_MIME_TYPES),
            'max_file_size_mb': self.config.app.max_file_size / (1024 * 1024),
            'max_files_count': self.config.app.max_files_count
        }
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
# Global file validator instance, constructed when this module is imported.
file_validator = FileValidator()
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def get_file_validator() -> FileValidator:
    """Return the module-level file validator instance.

    Returns:
        The shared ``FileValidator``.
    """
    return file_validator
|
src/services/oss_service.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OSS云存储服务模块
|
| 2 |
+
|
| 3 |
+
提供阿里云OSS文件上传、下载和管理功能。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import uuid
|
| 8 |
+
from datetime import datetime, timedelta
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import List, Optional, Tuple
|
| 11 |
+
import asyncio
|
| 12 |
+
import aiohttp
|
| 13 |
+
import oss2
|
| 14 |
+
from oss2.exceptions import OssError
|
| 15 |
+
|
| 16 |
+
from ..core.config import get_config
|
| 17 |
+
from ..utils.logger import get_task_logger
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class OSSService:
    """Alibaba Cloud OSS storage service.

    Wraps one ``oss2.Bucket`` and provides upload, URL generation, deletion,
    cleanup and metadata helpers for the temporary files that are handed to
    the transcription API.
    """

    def __init__(self):
        """Create the OSS client from the application configuration."""
        self.config = get_config()
        self.oss_config = self.config.oss

        # Build the authenticated bucket client.
        auth = oss2.Auth(
            self.oss_config.access_key_id,
            self.oss_config.access_key_secret
        )
        self.bucket = oss2.Bucket(
            auth,
            self.oss_config.endpoint,
            self.oss_config.bucket_name
        )

        self.logger = get_task_logger(logger_name="transcript_service.oss")

    def _generate_object_key(self, filename: str, task_id: str) -> str:
        """Build a unique object key for an upload.

        Args:
            filename: Original file name; only its suffix is reused.
            task_id: Task identifier embedded in the key.

        Returns:
            A key shaped like
            ``<temp_prefix>/<YYYY>/<MM>/<DD>/<timestamp>_<task_id>_<8 hex chars><suffix>``.
        """
        now = datetime.now()
        date_path = now.strftime("%Y/%m/%d")
        timestamp = now.strftime("%Y%m%d_%H%M%S")

        # Only the extension of the caller-supplied name is kept; the random
        # suffix keeps keys unique within the same second.
        file_ext = Path(filename).suffix
        safe_filename = f"{timestamp}_{task_id}_{uuid.uuid4().hex[:8]}{file_ext}"

        return f"{self.oss_config.temp_prefix}/{date_path}/{safe_filename}"

    def _upload_file_sync(self, file_path: Path, task_id: str) -> Tuple[bool, str, Optional[str]]:
        """Upload one file using the blocking oss2 client.

        The upload and the ACL change are handled separately so that a failed
        upload is never mistaken for a failed ACL change: if the object was
        stored but could not be made public-read, a signed URL is returned
        instead.

        Args:
            file_path: Local file to upload.
            task_id: Task identifier used in the object key.

        Returns:
            ``(success, url_or_error_message, object_key_or_None)``.
        """
        try:
            self.logger.info(f"开始上传文件到OSS: {file_path.name}")

            object_key = self._generate_object_key(file_path.name, task_id)

            # A failure here means nothing was stored; it is reported by the
            # outer handlers.
            self.bucket.put_object_from_file(object_key, str(file_path))

            try:
                # Make the object publicly readable so a plain URL works.
                self.bucket.put_object_acl(object_key, oss2.OBJECT_ACL_PUBLIC_READ)
                url = self._generate_public_url(object_key)
            except OssError as acl_err:
                # The object exists but is private: fall back to a signed URL.
                self.logger.warning(f"ACL设置失败,使用签名URL: {acl_err}")
                url = self._generate_signed_url(object_key)

            self.logger.info(f"文件上传成功: {object_key}, URL: {url}")
            return True, url, object_key

        except OssError as e:
            error_msg = f"OSS错误: {str(e)}"
            self.logger.error(error_msg)
            return False, error_msg, None
        except Exception as e:
            error_msg = f"上传文件时发生未知错误: {str(e)}"
            self.logger.exception(error_msg)
            return False, error_msg, None

    async def upload_file(self, file_path: Path, task_id: str) -> Tuple[bool, str, Optional[str]]:
        """Upload a file to OSS without blocking the event loop.

        Args:
            file_path: Local file path.
            task_id: Task identifier.

        Returns:
            ``(success, public_or_signed_url_or_error_message, object_key)``;
            ``object_key`` is ``None`` on failure.
        """
        # oss2 is a blocking client, so the transfer runs in a worker thread.
        return await asyncio.to_thread(self._upload_file_sync, file_path, task_id)

    async def upload_multiple_files(self, file_paths: List[Path], task_id: str) -> List[Tuple[str, bool, str, Optional[str]]]:
        """Upload several files to OSS concurrently.

        Args:
            file_paths: Local file paths.
            task_id: Task identifier shared by all uploads.

        Returns:
            One ``(file_name, success, url_or_error_message, object_key)``
            tuple per input path, in input order.
        """
        paths = list(file_paths)

        # gather() starts every upload immediately and keeps input order.
        outcomes = await asyncio.gather(
            *(self._upload_single_file_async(path, task_id) for path in paths)
        )

        return [
            (path.name, success, url_or_error, object_key)
            for path, (success, url_or_error, object_key) in zip(paths, outcomes)
        ]

    async def _upload_single_file_async(self, file_path: Path, task_id: str) -> Tuple[bool, str, Optional[str]]:
        """Upload a single file; thin wrapper kept for existing callers."""
        return await self.upload_file(file_path, task_id)

    def _generate_public_url(self, object_key: str) -> str:
        """Build the unsigned public URL of an object.

        Args:
            object_key: OSS object key.

        Returns:
            ``https://<bucket_name>.<endpoint host>/<object_key>``.
        """
        # The configured endpoint may carry a scheme and/or a trailing slash;
        # the URL needs the bare host.
        endpoint = self.oss_config.endpoint
        for scheme in ('http://', 'https://'):
            if endpoint.startswith(scheme):
                endpoint = endpoint[len(scheme):]
                break
        endpoint = endpoint.rstrip('/')

        url = f"https://{self.oss_config.bucket_name}.{endpoint}/{object_key}"

        # Logged to help debug URL construction problems.
        self.logger.debug(f"生成公网URL: {url}")

        return url

    def _generate_signed_url(self, object_key: str) -> str:
        """Build a time-limited signed GET URL (fallback when ACL fails).

        Args:
            object_key: OSS object key.

        Returns:
            A signed URL valid for ``oss_config.url_expire_hours`` hours.
        """
        # Bucket.sign_url() takes the lifetime in seconds relative to now,
        # not an absolute timestamp.
        expires_in = int(timedelta(hours=self.oss_config.url_expire_hours).total_seconds())
        return self.bucket.sign_url('GET', object_key, expires_in)

    def delete_file(self, object_key: str) -> bool:
        """Delete one object from the bucket.

        Args:
            object_key: OSS object key.

        Returns:
            ``True`` if the delete call succeeded, ``False`` if it raised.
        """
        try:
            self.bucket.delete_object(object_key)
            self.logger.info(f"文件删除成功: {object_key}")
            return True
        except OssError as e:
            self.logger.error(f"删除文件失败: {object_key}, 错误: {str(e)}")
            return False
        except Exception as e:
            self.logger.exception(f"删除文件时发生未知错误: {object_key}, 错误: {str(e)}")
            return False

    def cleanup_old_files(self, days: Optional[int] = None) -> int:
        """Delete temporary objects older than the retention period.

        Args:
            days: Retention in days. ``None`` uses
                ``oss_config.auto_cleanup_days``; an explicit ``0`` is
                honoured and removes every temporary object.

        Returns:
            Number of objects deleted (possibly partial if listing failed).
        """
        cleanup_days = self.oss_config.auto_cleanup_days if days is None else days
        # Object listings report last_modified as a Unix timestamp (int), so
        # the cutoff is compared as a timestamp as well.
        cutoff_ts = (datetime.now() - timedelta(days=cleanup_days)).timestamp()

        deleted_count = 0
        # Keys are generated as "<temp_prefix>/...". Listing with a trailing
        # slash avoids matching sibling prefixes such as "<temp_prefix>-x/".
        prefix = f"{self.oss_config.temp_prefix.rstrip('/')}/"

        try:
            for obj in oss2.ObjectIterator(self.bucket, prefix=prefix):
                if obj.last_modified < cutoff_ts:
                    if self.delete_file(obj.key):
                        deleted_count += 1

            self.logger.info(f"清理完成,删除了 {deleted_count} 个过期文件")
            return deleted_count

        except Exception as e:
            self.logger.exception(f"清理过期文件时发生错误: {str(e)}")
            return deleted_count

    def get_file_info(self, object_key: str) -> Optional[dict]:
        """Fetch metadata of one object.

        Args:
            object_key: OSS object key.

        Returns:
            A dict with ``size``, ``last_modified``, ``etag`` and
            ``content_type`` taken from the HEAD response, or ``None`` if the
            request raised an ``OssError``.
        """
        try:
            info = self.bucket.head_object(object_key)
            return {
                'size': info.content_length,
                'last_modified': info.last_modified,
                'etag': info.etag,
                'content_type': info.content_type
            }
        except OssError as e:
            self.logger.error(f"获取文件信息失败: {object_key}, 错误: {str(e)}")
            return None

    def check_bucket_exists(self) -> bool:
        """Check whether the configured bucket exists.

        Returns:
            The result of ``bucket.bucket_exists()``, or ``False`` if the
            call raised.
        """
        # NOTE(review): confirm that the installed oss2 version exposes
        # Bucket.bucket_exists(); if it does not, the AttributeError is
        # swallowed below and this method always returns False.
        try:
            return self.bucket.bucket_exists()
        except Exception as e:
            self.logger.error(f"检查存储桶失败: {str(e)}")
            return False

    def get_bucket_info(self) -> Optional[dict]:
        """Fetch basic information about the configured bucket.

        Returns:
            A dict with ``name``, ``location``, ``creation_date`` and
            ``storage_class``, or ``None`` if the request raised.
        """
        try:
            info = self.bucket.get_bucket_info()
            return {
                'name': info.name,
                'location': info.location,
                'creation_date': info.creation_date,
                'storage_class': info.storage_class
            }
        except Exception as e:
            self.logger.error(f"获取存储桶信息失败: {str(e)}")
            return None
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
# 全局OSS服务实例
|
| 284 |
+
oss_service = OSSService()
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
def get_oss_service() -> OSSService:
    """Return the shared module-level OSS service.

    Returns:
        The ``OSSService`` instance created when this module was imported.
    """
    return oss_service
|
src/services/paraformer_service.py
ADDED
|
@@ -0,0 +1,407 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Paraformer转录服务模块
|
| 2 |
+
|
| 3 |
+
提供阿里云百炼平台Paraformer-v2模型的语音转录功能。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import json
|
| 8 |
+
import time
|
| 9 |
+
from typing import Dict, List, Optional, Tuple
|
| 10 |
+
from enum import Enum
|
| 11 |
+
import httpx
|
| 12 |
+
from dashscope import audio
|
| 13 |
+
|
| 14 |
+
from ..core.config import get_config
|
| 15 |
+
from ..utils.logger import get_task_logger
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class TaskStatus(Enum):
    """States a transcription task can be in.

    Values are compared against the ``task_status`` string reported by the
    transcription API.
    """

    # Not started yet.
    PENDING = "PENDING"
    # Currently being processed.
    RUNNING = "RUNNING"
    # Finished successfully.
    SUCCEEDED = "SUCCEEDED"
    # Finished with an error.
    FAILED = "FAILED"
    # NOTE(review): verify the exact spelling the API reports for cancelled
    # tasks (one "L" or two); a mismatch makes TaskStatus(value) raise.
    CANCELLED = "CANCELLED"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class ParaformerService:
    """Transcription service backed by the DashScope Paraformer API.

    The instance is shared between tasks, so no per-request state is kept on
    ``self``; request data is passed through method arguments.
    """

    # Optional request parameters, in the order they are added to the request.
    # The flag says whether the value must be truthy to be forwarded; the
    # boolean switches use False so an explicit ``False`` still reaches the API.
    _OPTIONAL_PARAMS = (
        ('channel_id', True),                    # audio track selection
        ('disfluency_removal_enabled', False),   # filler-word removal
        ('timestamp_alignment_enabled', False),  # timestamp calibration
        ('diarization_enabled', False),          # speaker separation
        ('speaker_count', True),                 # expected number of speakers
        ('vocabulary_id', True),                 # hot-word list id (v2)
        ('phrase_id', True),                     # hot-word list id (v1)
        ('special_word_filter', True),           # sensitive-word filtering
    )

    def __init__(self):
        """Load configuration and set up logging and the API key."""
        self.config = get_config()
        self.api_config = self.config.dashscope
        self.logger = get_task_logger(logger_name="transcript_service.api")

        # Kept for backward compatibility.
        # NOTE(review): confirm the SDK actually reads this module attribute;
        # the key is also passed explicitly on every call below.
        audio.api_key = self.api_config.api_key

    def _api_key(self) -> Optional[str]:
        """Return the configured API key, or None so the SDK uses its default."""
        return self.api_config.api_key or None

    def _build_transcription_params(self, file_urls: List[str], paraformer_params: Optional[Dict] = None) -> dict:
        """Assemble the keyword arguments for the transcription request.

        Args:
            file_urls: Audio file URLs.
            paraformer_params: Optional caller overrides.

        Returns:
            A dict with ``model``, ``file_urls``, ``language_hints`` and any
            optional parameters the caller supplied.
        """
        overrides = paraformer_params or {}
        params = {
            'model': self.api_config.model,
            'file_urls': file_urls,
            # Caller hints win when non-empty; otherwise use the configured default.
            'language_hints': overrides.get('language_hints') or self.api_config.language_hints,
        }
        for key, must_be_truthy in self._OPTIONAL_PARAMS:
            if key in overrides and (overrides[key] or not must_be_truthy):
                params[key] = overrides[key]
        return params

    async def submit_transcription_task(
        self,
        file_urls: List[str],
        task_id: str,
        paraformer_params: Optional[Dict] = None
    ) -> Tuple[bool, str, Optional[str]]:
        """Submit a transcription task.

        Args:
            file_urls: Audio file URLs.
            task_id: Local task identifier (not sent to the API).
            paraformer_params: Optional extra Paraformer parameters.

        Returns:
            ``(success, message, api_task_id)``; ``api_task_id`` is ``None``
            on failure.
        """
        try:
            self.logger.info(f"提交转录任务: {len(file_urls)} 个文件")

            transcription_params = self._build_transcription_params(file_urls, paraformer_params)

            # Logged for debugging; the API key is deliberately not part of it.
            self.logger.info(f"转录参数: {transcription_params}")

            # The SDK call is blocking HTTP, so run it in a worker thread.
            response = await asyncio.to_thread(
                audio.asr.Transcription.async_call,
                api_key=self._api_key(),
                **transcription_params
            )

            if response.status_code == 200:
                api_task_id = response.output.task_id
                self.logger.info(f"任务提交成功, API任务ID: {api_task_id}")
                return True, "任务提交成功", api_task_id
            else:
                error_msg = f"API调用失败, 状态码: {response.status_code}, 错误: {response.message}"
                self.logger.error(error_msg)
                return False, error_msg, None

        except Exception as e:
            error_msg = f"提交转录任务时发生错误: {str(e)}"
            self.logger.exception(error_msg)
            return False, error_msg, None

    async def check_task_status(
        self,
        api_task_id: str,
        original_urls: Optional[List[str]] = None
    ) -> "Tuple[TaskStatus, Optional[dict], Optional[str]]":
        """Query the state of a submitted task.

        Args:
            api_task_id: Task id returned by the API on submission.
            original_urls: URLs that were submitted, used to label results by
                position. Optional; without it the URL reported by the API is
                used.

        Returns:
            ``(status, parsed_results, error_message)``. Results are set only
            for ``SUCCEEDED``. Any error (including an unrecognised status
            string) is reported as ``FAILED`` with a message.
        """
        try:
            # Blocking SDK call; keep it off the event loop.
            response = await asyncio.to_thread(
                audio.asr.Transcription.fetch,
                task=api_task_id,
                api_key=self._api_key()
            )

            if response.status_code == 200:
                task_status = TaskStatus(response.output.task_status)

                if task_status == TaskStatus.SUCCEEDED:
                    results = await self._parse_transcription_results(
                        response.output.results, original_urls
                    )
                    return task_status, results, None
                elif task_status == TaskStatus.FAILED:
                    error_msg = getattr(response.output, 'message', '转录失败')
                    return task_status, None, error_msg
                else:
                    # Still pending/running (or cancelled): nothing to return yet.
                    return task_status, None, None
            else:
                error_msg = f"检查任务状态失败: {response.message}"
                self.logger.error(error_msg)
                return TaskStatus.FAILED, None, error_msg

        except Exception as e:
            error_msg = f"检查任务状态时发生错误: {str(e)}"
            self.logger.exception(error_msg)
            return TaskStatus.FAILED, None, error_msg

    async def process_audio_files(
        self,
        file_urls: List[str],
        task_id: str,
        paraformer_params: Optional[Dict] = None
    ) -> Tuple[bool, Optional[dict], Optional[str]]:
        """Run the full flow: submit the task, then poll until it finishes.

        Args:
            file_urls: Audio file URLs.
            task_id: Local task identifier.
            paraformer_params: Optional extra Paraformer parameters.

        Returns:
            ``(success, transcription_results, error_message)``.
        """
        try:
            # Keep the submitted URLs local to this call; the service object is
            # shared, so storing them on self would let concurrent tasks
            # overwrite each other's mapping.
            original_urls = list(file_urls)
            self.logger.info(f"保存原始URL: {original_urls}")

            # 1. Submit the task.
            success, message, api_task_id = await self.submit_transcription_task(file_urls, task_id, paraformer_params)
            if not success:
                return False, None, message

            self.logger.info(f"开始监控任务状态: {api_task_id}")

            # 2. Poll until done or timed out.
            max_wait_time = self.api_config.timeout
            check_interval = self.config.task.status_check_interval
            # Monotonic clock: unaffected by wall-clock adjustments.
            start_time = time.monotonic()

            while time.monotonic() - start_time < max_wait_time:
                status, results, error = await self.check_task_status(api_task_id, original_urls)

                if status == TaskStatus.SUCCEEDED:
                    self.logger.info(f"转录完成: {api_task_id}")
                    return True, results, None
                elif status == TaskStatus.FAILED:
                    self.logger.error(f"转录失败: {api_task_id}, 错误: {error}")
                    return False, None, error
                elif status in [TaskStatus.PENDING, TaskStatus.RUNNING]:
                    self.logger.debug(f"任务进行中: {api_task_id}, 状态: {status.value}")
                    await asyncio.sleep(check_interval)
                else:
                    error_msg = f"未知任务状态: {status}"
                    self.logger.error(error_msg)
                    return False, None, error_msg

            # Timed out.
            error_msg = f"任务超时: {api_task_id} (等待时间: {max_wait_time}秒)"
            self.logger.error(error_msg)
            return False, None, error_msg

        except Exception as e:
            error_msg = f"处理音频文件时发生错误: {str(e)}"
            self.logger.exception(error_msg)
            return False, None, error_msg

    async def _download_transcription(self, transcription_url: str) -> Optional[dict]:
        """Download one transcription result document.

        Returns:
            The decoded JSON body, or ``None`` (after logging) when the HTTP
            status is not 200.
        """
        async with httpx.AsyncClient() as client:
            response = await client.get(transcription_url)
            if response.status_code == 200:
                return response.json()
            self.logger.warning(f"下载转录结果失败,状态码: {response.status_code}")
            self.logger.warning(f"响应内容: {response.text}")
            return None

    @staticmethod
    def _extract_transcript_fields(transcription_data: dict) -> Tuple[str, float, float]:
        """Pull text, duration and mean confidence out of a result document.

        Returns:
            ``(text, duration_seconds, confidence)``. Text comes from the
            first transcript only; confidence is the mean over sentences that
            carry a ``confidence`` key, or ``0`` when none do.
        """
        duration_ms = transcription_data.get('properties', {}).get('original_duration_in_milliseconds', 0)
        duration = duration_ms / 1000.0  # milliseconds -> seconds

        text = ''
        sentences = []
        transcripts = transcription_data.get('transcripts', [])
        if transcripts:
            first_transcript = transcripts[0]
            text = first_transcript.get('text', '')
            sentences = first_transcript.get('sentences', [])

        confidences = [s.get('confidence', 0) for s in sentences if 'confidence' in s]
        confidence = sum(confidences) / len(confidences) if confidences else 0
        return text, duration, confidence

    async def _parse_transcription_results(self, results: List, original_urls: Optional[List[str]] = None) -> dict:
        """Convert the API result list into the service's result structure.

        Args:
            results: Per-file result entries returned by the API.
            original_urls: Submitted URLs, matched to ``results`` by position.

        Returns:
            A dict with ``transcriptions`` (one entry per result; failed
            entries carry ``error`` and ``raw_result``) and ``summary``
            (``total_files``, ``total_duration``, ``total_text_length``,
            ``languages_detected``).
        """
        if original_urls is None:
            # Backward compatibility for callers that set the attribute themselves.
            original_urls = getattr(self, '_original_urls', None)

        transcriptions = []
        total_duration = 0
        total_text_length = 0
        languages = set()

        for i, result in enumerate(results):
            has_original = original_urls is not None and i < len(original_urls)
            try:
                # Prefer the URL we submitted over the one echoed by the API.
                if has_original:
                    original_url = original_urls[i]
                    self.logger.info(f"使用原始URL[{i}]: {original_url}")
                else:
                    original_url = result.get('file_url', '')
                    self.logger.warning(f"未找到原始URL[{i}],使用API返回的URL: {original_url}")

                transcription_text = ''
                duration = 0
                language = 'unknown'
                confidence = 0
                # NOTE(review): segments is always empty; the downloaded
                # sentences are only used for the confidence average. Confirm
                # whether they should be exposed here.
                segments = []

                if result.get('subtask_status') == 'SUCCEEDED' and result.get('transcription_url'):
                    try:
                        data = await self._download_transcription(result['transcription_url'])
                        if data is not None:
                            transcription_text, duration, confidence = self._extract_transcript_fields(data)
                            # NOTE(review): the language is hard-coded rather
                            # than detected; confirm the intended source.
                            language = 'en'
                    except Exception as e:
                        # Download/parse problems degrade to an empty entry.
                        self.logger.warning(f"下载转录结果时发生错误: {str(e)}")

                transcription = {
                    'file_url': original_url,
                    'text': transcription_text,
                    'duration': duration,
                    'language': language,
                    'confidence': confidence,
                    'segments': segments
                }

                # Keep the API-reported URL for debugging when it differs.
                api_file_url = result.get('file_url', '')
                if api_file_url and api_file_url != original_url:
                    transcription['api_file_url'] = api_file_url

                transcriptions.append(transcription)

                total_duration += duration
                total_text_length += len(transcription_text)
                languages.add(language)

            except Exception as e:
                self.logger.warning(f"解析单个转录结果时发生错误: {str(e)}")
                # Record the failure so the output still has one entry per input.
                transcriptions.append({
                    'file_url': original_urls[i] if has_original else '',
                    'error': str(e),
                    'raw_result': result
                })

        return {
            'transcriptions': transcriptions,
            'summary': {
                'total_files': len(results),
                'total_duration': total_duration,
                'total_text_length': total_text_length,
                # Sets are not JSON-serialisable; expose a list.
                'languages_detected': list(languages)
            }
        }

    async def batch_process_with_retry(
        self,
        file_urls: List[str],
        task_id: str,
        paraformer_params: Optional[Dict] = None
    ) -> Tuple[bool, Optional[dict], Optional[str]]:
        """Process audio files, retrying the whole flow on failure.

        Makes up to ``max_retries + 1`` attempts; the pause before retry
        ``k`` is ``retry_delay * k`` seconds.

        Args:
            file_urls: Audio file URLs.
            task_id: Local task identifier.
            paraformer_params: Optional extra Paraformer parameters.

        Returns:
            ``(success, transcription_results, error_message)``.
        """
        max_retries = self.api_config.max_retries
        retry_delay = self.api_config.retry_delay

        for attempt in range(max_retries + 1):
            # Linearly increasing back-off.
            delay = retry_delay * (attempt + 1)
            try:
                success, results, error = await self.process_audio_files(file_urls, task_id, paraformer_params)

                if success:
                    return True, results, None

                # Last attempt: give up and report the error.
                if attempt == max_retries:
                    return False, None, error

                # Report the delay that is actually applied.
                self.logger.warning(f"第 {attempt + 1} 次尝试失败,{delay} 秒后重试: {error}")
                await asyncio.sleep(delay)

            except Exception as e:
                error_msg = f"重试过程中发生错误: {str(e)}"
                self.logger.exception(error_msg)

                if attempt == max_retries:
                    return False, None, error_msg

                await asyncio.sleep(delay)

        return False, None, "重试次数已达上限"

    def get_service_info(self) -> dict:
        """Return the service's effective configuration.

        Returns:
            A dict with ``model``, ``base_url``, ``timeout``, ``max_retries``,
            ``retry_delay``, ``language_hints`` and ``status_check_interval``.
        """
        return {
            'model': self.api_config.model,
            'base_url': self.api_config.base_url,
            'timeout': self.api_config.timeout,
            'max_retries': self.api_config.max_retries,
            'retry_delay': self.api_config.retry_delay,
            'language_hints': self.api_config.language_hints,
            'status_check_interval': self.config.task.status_check_interval
        }
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
# 全局Paraformer服务实例
|
| 398 |
+
paraformer_service = ParaformerService()
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
def get_paraformer_service() -> ParaformerService:
    """Return the shared module-level Paraformer service.

    Returns:
        The ``ParaformerService`` instance created when this module was
        imported.
    """
    return paraformer_service
|
src/utils/__init__.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""工具模块
|
| 2 |
+
|
| 3 |
+
包含应用程序的工具函数和辅助类。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .logger import Logger, TaskLogger, get_logger, get_task_logger, logger
|
| 7 |
+
from .error_handler import (
|
| 8 |
+
ErrorCode, TranscriptServiceError, FileValidationError, NetworkError,
|
| 9 |
+
APIError, OSSError, SystemError, RetryStrategy, ErrorHandler,
|
| 10 |
+
retry_async, retry_sync, safe_execute, safe_execute_async, get_error_handler, error_handler
|
| 11 |
+
)
|
| 12 |
+
|
| 13 |
+
__all__ = [
|
| 14 |
+
"Logger",
|
| 15 |
+
"TaskLogger",
|
| 16 |
+
"get_logger",
|
| 17 |
+
"get_task_logger",
|
| 18 |
+
"logger",
|
| 19 |
+
"ErrorCode",
|
| 20 |
+
"TranscriptServiceError",
|
| 21 |
+
"FileValidationError",
|
| 22 |
+
"NetworkError",
|
| 23 |
+
"APIError",
|
| 24 |
+
"OSSError",
|
| 25 |
+
"SystemError",
|
| 26 |
+
"RetryStrategy",
|
| 27 |
+
"ErrorHandler",
|
| 28 |
+
"retry_async",
|
| 29 |
+
"retry_sync",
|
| 30 |
+
"safe_execute",
|
| 31 |
+
"safe_execute_async",
|
| 32 |
+
"get_error_handler",
|
| 33 |
+
"error_handler"
|
| 34 |
+
]
|
src/utils/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (850 Bytes). View file
|
|
|
src/utils/__pycache__/error_handler.cpython-310.pyc
ADDED
|
Binary file (10.2 kB). View file
|
|
|
src/utils/__pycache__/logger.cpython-310.pyc
ADDED
|
Binary file (7.74 kB). View file
|
|
|
src/utils/error_handler.py
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""错误处理和容错机制模块
|
| 2 |
+
|
| 3 |
+
提供统一的错误处理、重试逻辑和异常恢复功能。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
|
| 7 |
+
import functools
|
| 8 |
+
import time
|
| 9 |
+
from typing import Any, Callable, Dict, Optional, Type, Union
|
| 10 |
+
from enum import Enum
|
| 11 |
+
|
| 12 |
+
from ..core.config import get_config
|
| 13 |
+
from ..utils.logger import get_task_logger
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ErrorCode(Enum):
    """Stable string codes identifying each failure category of the service.

    Every value has the form ``<GROUP>_<NNN>``; the group prefix tells which
    subsystem the failure belongs to (FILE, NET, API, OSS, SYS, GEN).
    """

    # ---- FILE_*: problems with the input file ----
    FILE_NOT_FOUND = "FILE_001"
    FILE_TOO_LARGE = "FILE_002"
    FILE_FORMAT_UNSUPPORTED = "FILE_003"
    FILE_CORRUPTED = "FILE_004"

    # ---- NET_*: network-level failures ----
    NETWORK_TIMEOUT = "NET_001"
    NETWORK_CONNECTION_ERROR = "NET_002"
    NETWORK_DNS_ERROR = "NET_003"

    # ---- API_*: failures reported by the remote API ----
    API_KEY_INVALID = "API_001"
    API_QUOTA_EXCEEDED = "API_002"
    API_SERVICE_UNAVAILABLE = "API_003"
    API_RATE_LIMITED = "API_004"

    # ---- OSS_*: object-storage failures ----
    OSS_ACCESS_DENIED = "OSS_001"
    OSS_BUCKET_NOT_FOUND = "OSS_002"
    OSS_UPLOAD_FAILED = "OSS_003"

    # ---- SYS_*: host/system resource failures ----
    SYSTEM_OUT_OF_MEMORY = "SYS_001"
    SYSTEM_DISK_FULL = "SYS_002"
    SYSTEM_PERMISSION_DENIED = "SYS_003"

    # ---- GEN_*: generic / uncategorised failures ----
    UNKNOWN_ERROR = "GEN_001"
    TIMEOUT_ERROR = "GEN_002"
    VALIDATION_ERROR = "GEN_003"
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class TranscriptServiceError(Exception):
    """Base class for every exception raised by the transcript service.

    Besides the human-readable message it carries a machine-readable
    ``ErrorCode``, an optional ``details`` mapping and the creation time.
    """

    def __init__(self, message: str, error_code: ErrorCode = ErrorCode.UNKNOWN_ERROR, details: Dict = None):
        """Create the exception.

        Args:
            message: Human-readable description of the failure.
            error_code: Category of the failure; defaults to ``UNKNOWN_ERROR``.
            details: Optional extra information; a falsy value becomes ``{}``.
        """
        super().__init__(message)
        # Seconds since the epoch at which the error object was built.
        self.timestamp = time.time()
        self.details = details if details else {}
        self.error_code = error_code
        self.message = message

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the error (code value, message, details, timestamp)."""
        return dict(
            error_code=self.error_code.value,
            message=self.message,
            details=self.details,
            timestamp=self.timestamp,
        )
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class FileValidationError(TranscriptServiceError):
    """Raised when an input file fails validation."""
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class NetworkError(TranscriptServiceError):
    """Raised for network-related failures."""
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class APIError(TranscriptServiceError):
    """Raised when an API call fails."""
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class OSSError(TranscriptServiceError):
    """Raised when an OSS operation fails."""
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class SystemError(TranscriptServiceError):
    """Raised for system-level failures.

    Note: inside this module the name shadows Python's built-in ``SystemError``.
    """
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class RetryStrategy:
    """Exponential back-off policy used by the retry decorators."""

    def __init__(
        self,
        max_attempts: int = 3,
        base_delay: float = 1.0,
        max_delay: float = 60.0,
        exponential_base: float = 2.0,
        jitter: bool = True
    ):
        """Store the back-off parameters.

        Args:
            max_attempts: Maximum number of attempts.
            base_delay: Delay before the first retry, in seconds.
            max_delay: Upper bound for any single delay, in seconds.
            exponential_base: Multiplier applied for each further attempt.
            jitter: Whether to randomise the computed delay.
        """
        self.max_attempts = max_attempts
        self.base_delay = base_delay
        self.max_delay = max_delay
        self.exponential_base = exponential_base
        self.jitter = jitter

    def calculate_delay(self, attempt: int) -> float:
        """Return the delay to wait after a failed attempt.

        Args:
            attempt: Number of the attempt that just failed (starting at 1).

        Returns:
            ``base_delay * exponential_base ** (attempt - 1)`` capped at
            ``max_delay``; with jitter enabled the result is additionally
            scaled by a random factor in ``[0.5, 1.0)``.
        """
        try:
            delay = self.base_delay * (self.exponential_base ** (attempt - 1))
        except OverflowError:
            # The exponential no longer fits in a float for very large attempt
            # numbers; the cap below would have applied anyway.
            delay = self.max_delay
        delay = min(delay, self.max_delay)

        if self.jitter:
            import random
            # Jitter only ever shortens the delay (50%-100% of the capped
            # value), so max_delay remains a hard upper bound.
            delay *= (0.5 + random.random() * 0.5)

        return delay
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
class ErrorHandler:
    """Classifies arbitrary exceptions into ``TranscriptServiceError`` objects.

    Also decides whether a classified error is worth retrying and logs
    handled errors.
    """

    def __init__(self):
        """Load configuration, create the logger and build the lookup tables."""
        # Stored on the instance; not read by the methods of this class.
        self.config = get_config()
        self.logger = get_task_logger(logger_name="transcript_service.error")

        # Built-in exception type -> (service exception class, error code).
        self.error_mapping = {
            # File errors
            FileNotFoundError: (FileValidationError, ErrorCode.FILE_NOT_FOUND),
            PermissionError: (SystemError, ErrorCode.SYSTEM_PERMISSION_DENIED),

            # Network errors
            asyncio.TimeoutError: (NetworkError, ErrorCode.NETWORK_TIMEOUT),
            # Before Python 3.11 the built-in TimeoutError is a different class
            # from asyncio.TimeoutError; on 3.11+ the two keys collapse into one.
            TimeoutError: (NetworkError, ErrorCode.NETWORK_TIMEOUT),
            ConnectionError: (NetworkError, ErrorCode.NETWORK_CONNECTION_ERROR),

            # Generic errors
            ValueError: (TranscriptServiceError, ErrorCode.VALIDATION_ERROR),
            RuntimeError: (TranscriptServiceError, ErrorCode.UNKNOWN_ERROR),
        }

        # Error codes for which a retry may succeed.
        self.retryable_errors = {
            ErrorCode.NETWORK_TIMEOUT,
            ErrorCode.NETWORK_CONNECTION_ERROR,
            ErrorCode.API_RATE_LIMITED,
            ErrorCode.OSS_UPLOAD_FAILED,
            ErrorCode.API_SERVICE_UNAVAILABLE
        }

    @staticmethod
    def _wrap(error: Exception, exception_class, error_code: ErrorCode) -> TranscriptServiceError:
        """Build the service exception and link the original error as its cause."""
        wrapped = exception_class(str(error), error_code)
        wrapped.__cause__ = error
        return wrapped

    def classify_error(self, error: Exception) -> TranscriptServiceError:
        """Classify and wrap an exception.

        Resolution order:
            1. Service exceptions are returned unchanged.
            2. An exact type match in ``error_mapping``.
            3. Keyword match on the lower-cased error message.
            4. A base-class match in ``error_mapping`` (so, for example,
               ``ConnectionRefusedError`` is treated as ``ConnectionError``).
            5. ``UNKNOWN_ERROR``.

        Args:
            error: The original exception.

        Returns:
            The classified service exception; for wrapped errors the original
            exception is available as ``__cause__``.
        """
        if isinstance(error, TranscriptServiceError):
            return error

        exact = self.error_mapping.get(type(error))
        if exact is not None:
            return self._wrap(error, *exact)

        # Classify by message content.
        # NOTE(review): any message containing "limit" maps to
        # API_QUOTA_EXCEEDED, which is not retryable; nothing here produces
        # API_RATE_LIMITED - confirm whether that is intended.
        error_msg = str(error).lower()

        if "timeout" in error_msg:
            return self._wrap(error, NetworkError, ErrorCode.NETWORK_TIMEOUT)
        elif "permission denied" in error_msg:
            return self._wrap(error, SystemError, ErrorCode.SYSTEM_PERMISSION_DENIED)
        elif "api key" in error_msg:
            return self._wrap(error, APIError, ErrorCode.API_KEY_INVALID)
        elif "quota" in error_msg or "limit" in error_msg:
            return self._wrap(error, APIError, ErrorCode.API_QUOTA_EXCEEDED)

        # Fall back to the nearest mapped base class. This only refines errors
        # that would otherwise end up as UNKNOWN_ERROR.
        for base in type(error).__mro__[1:]:
            inherited = self.error_mapping.get(base)
            if inherited is not None:
                return self._wrap(error, *inherited)

        return self._wrap(error, TranscriptServiceError, ErrorCode.UNKNOWN_ERROR)

    def is_retryable(self, error: TranscriptServiceError) -> bool:
        """Tell whether an error may be retried.

        Args:
            error: A classified service exception.

        Returns:
            True when the error's code is in ``retryable_errors``.
        """
        return error.error_code in self.retryable_errors

    def handle_error(self, error: Exception, context: str = "") -> TranscriptServiceError:
        """Classify an error and write it to the log.

        Args:
            error: The original exception.
            context: Short description of where the error happened.

        Returns:
            The classified service exception.
        """
        classified_error = self.classify_error(error)

        # Unknown and out-of-memory errors are logged through exception();
        # everything else through error().
        log_msg = f"错误处理 - {context}: {classified_error.message}"
        if classified_error.error_code in [ErrorCode.UNKNOWN_ERROR, ErrorCode.SYSTEM_OUT_OF_MEMORY]:
            self.logger.exception(log_msg)
        else:
            self.logger.error(log_msg)

        return classified_error
|
| 242 |
+
|
| 243 |
+
|
| 244 |
+
# 全局错误处理器实例
|
| 245 |
+
# Shared module-level ErrorHandler, created when this module is imported.
# Constructing it calls get_config() and get_task_logger() (see ErrorHandler.__init__).
error_handler = ErrorHandler()
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def retry_async(
    strategy: Optional[RetryStrategy] = None,
    exceptions: tuple = (Exception,),
    context: str = ""
):
    """Decorator factory that retries a coroutine function with back-off.

    Args:
        strategy: Back-off policy; a default ``RetryStrategy`` is used if None.
        exceptions: Exception types that are intercepted by the retry loop.
        context: Label prepended to the log messages.

    Returns:
        A decorator. The wrapped coroutine raises the classified service
        exception once attempts are exhausted or the error is not retryable.
    """
    policy = RetryStrategy() if strategy is None else strategy

    def decorator(func: Callable):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            retry_log = get_task_logger(logger_name="transcript_service.retry")

            for attempt in range(1, policy.max_attempts + 1):
                try:
                    return await func(*args, **kwargs)
                except exceptions as exc:
                    wrapped = error_handler.classify_error(exc)

                    # Give up on the last attempt or on a non-retryable error.
                    if attempt == policy.max_attempts or not error_handler.is_retryable(wrapped):
                        retry_log.error(f"{context} 最终失败 (尝试 {attempt}/{policy.max_attempts}): {str(exc)}")
                        raise wrapped

                    # Otherwise wait for the back-off delay and try again.
                    pause = policy.calculate_delay(attempt)
                    retry_log.warning(f"{context} 第 {attempt} 次尝试失败,{pause:.1f}秒后重试: {str(exc)}")

                    await asyncio.sleep(pause)

            # Only reached when the loop body never ran.
            raise TranscriptServiceError("重试逻辑异常", ErrorCode.UNKNOWN_ERROR)

        return wrapper
    return decorator
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def retry_sync(
    strategy: Optional[RetryStrategy] = None,
    exceptions: tuple = (Exception,),
    context: str = ""
):
    """Decorator factory that retries a regular function with back-off.

    Args:
        strategy: Back-off policy; a default ``RetryStrategy`` is used if None.
        exceptions: Exception types that are intercepted by the retry loop.
        context: Label prepended to the log messages.

    Returns:
        A decorator. The wrapped function raises the classified service
        exception once attempts are exhausted or the error is not retryable.
    """
    policy = RetryStrategy() if strategy is None else strategy

    def decorator(func: Callable):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            retry_log = get_task_logger(logger_name="transcript_service.retry")

            for attempt in range(1, policy.max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as exc:
                    wrapped = error_handler.classify_error(exc)

                    # Give up on the last attempt or on a non-retryable error.
                    if attempt == policy.max_attempts or not error_handler.is_retryable(wrapped):
                        retry_log.error(f"{context} 最终失败 (尝试 {attempt}/{policy.max_attempts}): {str(exc)}")
                        raise wrapped

                    # Otherwise block for the back-off delay and try again.
                    pause = policy.calculate_delay(attempt)
                    retry_log.warning(f"{context} 第 {attempt} 次尝试失败,{pause:.1f}秒后重试: {str(exc)}")

                    time.sleep(pause)

            # Only reached when the loop body never ran.
            raise TranscriptServiceError("重试逻辑异常", ErrorCode.UNKNOWN_ERROR)

        return wrapper
    return decorator
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
def safe_execute(func: Callable, *args, **kwargs) -> tuple[bool, Any, Optional[TranscriptServiceError]]:
    """Call ``func`` and report failure as a value instead of raising.

    Args:
        func: The callable to run.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        ``(True, result, None)`` on success, or ``(False, None, error)`` where
        ``error`` is the classified service exception.
    """
    try:
        result = func(*args, **kwargs)
        return True, result, None
    except Exception as e:
        # functools.partial objects and callable instances have no __name__;
        # fall back to repr() so the handler itself cannot raise.
        func_name = getattr(func, "__name__", repr(func))
        error = error_handler.handle_error(e, f"执行 {func_name}")
        return False, None, error
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
async def safe_execute_async(func: Callable, *args, **kwargs) -> tuple[bool, Any, Optional[TranscriptServiceError]]:
    """Await ``func`` and report failure as a value instead of raising.

    Args:
        func: The coroutine function to run.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        ``(True, result, None)`` on success, or ``(False, None, error)`` where
        ``error`` is the classified service exception.
    """
    try:
        result = await func(*args, **kwargs)
        return True, result, None
    except Exception as e:
        # functools.partial objects and callable instances have no __name__;
        # fall back to repr() so the handler itself cannot raise.
        func_name = getattr(func, "__name__", repr(func))
        error = error_handler.handle_error(e, f"执行 {func_name}")
        return False, None, error
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
def get_error_handler() -> ErrorHandler:
    """Return the module-level ``ErrorHandler`` instance shared by this module."""
    return error_handler
|
src/utils/logger.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""日志管理模块
|
| 2 |
+
|
| 3 |
+
提供结构化日志记录功能,支持任务跟踪和状态记录。
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
import logging.config
|
| 8 |
+
import logging.handlers
|
| 9 |
+
import uuid
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Any, Dict, Optional
|
| 12 |
+
import yaml
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
from rich.console import Console
|
| 16 |
+
from rich.logging import RichHandler
|
| 17 |
+
RICH_AVAILABLE = True
|
| 18 |
+
except ImportError:
|
| 19 |
+
RICH_AVAILABLE = False
|
| 20 |
+
|
| 21 |
+
from ..core.config import get_config
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class TaskContextFilter(logging.Filter):
    """Logging filter that guarantees every record carries a ``task_id``.

    The filter never rejects a record; it only fills in the attribute.
    """

    def __init__(self):
        super().__init__()
        # Identifier stamped onto records that do not bring their own.
        self.task_id = 'system'

    def filter(self, record):
        """Stamp the current task id on ``record`` if it has none; always accept."""
        # A missing attribute and an explicit None are treated the same way.
        if getattr(record, 'task_id', None) is None:
            record.task_id = getattr(self, 'task_id', 'system')
        return True
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class Logger:
    """Thin wrapper around ``logging`` that configures handlers and task context."""

    # Attribute set on handlers installed by _setup_default_logging so that
    # later Logger instances can detect them and avoid installing duplicates.
    _DEFAULT_HANDLER_MARK = "_transcript_service_default_handler"

    def __init__(self, name: str = "transcript_service"):
        """Configure logging and bind to the named logger.

        Args:
            name: Name of the underlying ``logging`` logger.
        """
        self.name = name
        self.config = get_config()
        self._setup_logging()
        self.logger = logging.getLogger(name)
        self.task_filter = TaskContextFilter()

        # Attach the task filter to every handler of the named logger.
        for handler in self.logger.handlers:
            handler.addFilter(self.task_filter)

        # Root handlers get the filter only if they do not have one yet.
        root_logger = logging.getLogger()
        for handler in root_logger.handlers:
            if not any(isinstance(f, TaskContextFilter) for f in handler.filters):
                handler.addFilter(self.task_filter)

    def _setup_logging(self):
        """Apply ``config/logging.yaml`` if present, else the built-in defaults."""
        logs_dir = self.config.get_logs_dir()

        config_file = self.config.get_project_root() / "config" / "logging.yaml"

        if not config_file.exists():
            self._setup_default_logging()
            return

        with open(config_file, 'r', encoding='utf-8') as file:
            logging_config = yaml.safe_load(file)

        if not isinstance(logging_config, dict):
            # An empty or non-mapping YAML file cannot be passed to dictConfig;
            # use the defaults instead of crashing.
            self._setup_default_logging()
            return

        # Point file handlers at the logs directory, keeping only the file name.
        for handler_config in logging_config.get('handlers', {}).values():
            if 'filename' in handler_config:
                handler_config['filename'] = str(logs_dir / Path(handler_config['filename']).name)

        logging.config.dictConfig(logging_config)

    def _setup_default_logging(self):
        """Install a console handler and a rotating file handler on the root logger.

        The handlers are installed once per process; later calls return early
        so constructing several ``Logger`` objects does not duplicate output or
        open ``app.log`` repeatedly.
        """
        root_logger = logging.getLogger()
        mark = self._DEFAULT_HANDLER_MARK
        if any(getattr(h, mark, False) for h in root_logger.handlers):
            return

        # Console handler
        if RICH_AVAILABLE:
            console = Console()
            console_handler = RichHandler(
                console=console,
                show_time=True,
                show_path=True,
                markup=True
            )
        else:
            console_handler = logging.StreamHandler()
            console_formatter = logging.Formatter(
                '[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s',
                datefmt='%Y-%m-%d %H:%M:%S'
            )
            console_handler.setFormatter(console_formatter)

        console_handler.setLevel(logging.DEBUG if self.config.app.debug else logging.INFO)

        # File handler: rotate at 10 MB, keep 5 backups.
        log_file = self.config.get_logs_dir() / "app.log"
        file_handler = logging.handlers.RotatingFileHandler(
            log_file,
            maxBytes=10*1024*1024,
            backupCount=5,
            encoding='utf-8'
        )
        file_handler.setLevel(logging.INFO)

        formatter = logging.Formatter(
            '[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        file_handler.setFormatter(formatter)

        setattr(console_handler, mark, True)
        setattr(file_handler, mark, True)

        # Configure the root logger.
        root_logger.setLevel(logging.DEBUG if self.config.app.debug else logging.INFO)
        root_logger.addHandler(console_handler)
        root_logger.addHandler(file_handler)

    def set_task_id(self, task_id: str):
        """Set the task id stamped on records by this logger's filter.

        Args:
            task_id: The task identifier.
        """
        self.task_filter.task_id = task_id

    def clear_task_id(self):
        """Reset the task id to the default ``'system'``."""
        self.task_filter.task_id = 'system'

    def debug(self, message: str, **kwargs):
        """Log at DEBUG level; keyword arguments are passed as ``extra``."""
        self.logger.debug(message, extra=kwargs)

    def info(self, message: str, **kwargs):
        """Log at INFO level; keyword arguments are passed as ``extra``."""
        self.logger.info(message, extra=kwargs)

    def warning(self, message: str, **kwargs):
        """Log at WARNING level; keyword arguments are passed as ``extra``."""
        self.logger.warning(message, extra=kwargs)

    def error(self, message: str, **kwargs):
        """Log at ERROR level; keyword arguments are passed as ``extra``."""
        self.logger.error(message, extra=kwargs)

    def critical(self, message: str, **kwargs):
        """Log at CRITICAL level; keyword arguments are passed as ``extra``."""
        self.logger.critical(message, extra=kwargs)

    def exception(self, message: str, **kwargs):
        """Log at ERROR level including the current stack trace."""
        self.logger.exception(message, extra=kwargs)
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
class TaskLogger:
    """Logger bound to one task id.

    Wraps a ``Logger`` and forwards every call to it; usable as a context
    manager, in which case the task id is cleared on exit.
    """

    def __init__(self, task_id: Optional[str] = None, logger_name: str = "transcript_service"):
        """Create the wrapped logger and register the task id on it.

        Args:
            task_id: Task identifier; a falsy value yields the first 8
                characters of a random UUID.
            logger_name: Name of the underlying logger.
        """
        self.task_id = task_id if task_id else uuid.uuid4().hex[:8]
        self.logger = Logger(logger_name)
        self.logger.set_task_id(self.task_id)

    # -- context-manager protocol ---------------------------------------

    def __enter__(self):
        """Return this task logger."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Clear the task id; exceptions are not suppressed."""
        self.logger.clear_task_id()

    # -- task-id management ---------------------------------------------

    def set_task_id(self, task_id: str):
        """Set the task id on the wrapped logger.

        Args:
            task_id: The task identifier.
        """
        self.logger.set_task_id(task_id)

    def clear_task_id(self):
        """Clear the task id on the wrapped logger."""
        self.logger.clear_task_id()

    # -- level methods, all forwarded to the wrapped Logger ---------------

    def debug(self, message: str, **kwargs):
        """Forward a DEBUG message."""
        self.logger.debug(message, **kwargs)

    def info(self, message: str, **kwargs):
        """Forward an INFO message."""
        self.logger.info(message, **kwargs)

    def warning(self, message: str, **kwargs):
        """Forward a WARNING message."""
        self.logger.warning(message, **kwargs)

    def error(self, message: str, **kwargs):
        """Forward an ERROR message."""
        self.logger.error(message, **kwargs)

    def critical(self, message: str, **kwargs):
        """Forward a CRITICAL message."""
        self.logger.critical(message, **kwargs)

    def exception(self, message: str, **kwargs):
        """Forward an exception message."""
        self.logger.exception(message, **kwargs)
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
# 全局日志实例
|
| 235 |
+
# Shared module-level Logger, created when this module is imported, using the
# default logger name "transcript_service".
logger = Logger()
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def get_logger(name: str = "transcript_service") -> Logger:
    """Build a new ``Logger`` for the given name.

    Args:
        name: Name of the underlying logger.

    Returns:
        A freshly constructed ``Logger``.
    """
    return Logger(name=name)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def get_task_logger(task_id: Optional[str] = None, logger_name: str = "transcript_service") -> TaskLogger:
    """Build a new ``TaskLogger``.

    Args:
        task_id: Task identifier, passed through to ``TaskLogger``.
        logger_name: Name of the underlying logger.

    Returns:
        A freshly constructed ``TaskLogger``.
    """
    return TaskLogger(task_id=task_id, logger_name=logger_name)
|