PCNUSMSE committed on
Commit
4e37375
·
verified ·
1 Parent(s): 2c5a743

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,12 +1,46 @@
1
  ---
2
  title: Transcript Service
3
- emoji: 💻
4
- colorFrom: indigo
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.49.1
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Transcript Service
3
+ emoji: 🎙️
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.9.1
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
  ---
12
 
13
+ # 🎙️ 音频转文字服务
14
+
15
+ 基于 Gradio 的智能音频转文字 Web 服务。
16
+
17
+ ## ✨ 功能特点
18
+
19
+ - 🎤 支持多种音频格式(MP3, WAV, M4A 等)
20
+ - 📝 自动语音识别转文字
21
+ - ☁️ 阿里云 OSS 云存储
22
+ - 🤖 阿里云 DashScope API 支持
23
+ - 🌐 简洁易用的 Web 界面
24
+
25
+ ## 🚀 使用方法
26
+
27
+ 1. 上传音频文件
28
+ 2. 选择语言(自动检测或手动指定)
29
+ 3. 点击"转换"按钮
30
+ 4. 等待处理完成
31
+ 5. 查看或下载转换结果
32
+
33
+ ## 🛠️ 技术栈
34
+
35
+ - **前端**: Gradio 5.9.1
36
+ - **后端**: Python 3.10
37
+ - **存储**: 阿里云 OSS
38
+ - **AI 服务**: 阿里云 DashScope
39
+
40
+ ## 📝 许可证
41
+
42
+ Apache License 2.0
43
+
44
+ ---
45
+
46
+ **部署在 Hugging Face Spaces** 🤗
app.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """音频转文字服务主应用程序
2
+
3
+ 基于Gradio的音频转文字Web服务应用程序入口。
4
+ """
5
+
6
+ import asyncio
7
+ import sys
8
+ import signal
9
+ import time
10
+ from pathlib import Path
11
+ from typing import Optional
12
+
13
+ # 添加项目根目录到Python路径
14
+ project_root = Path(__file__).parent
15
+ sys.path.insert(0, str(project_root))
16
+
17
+ # 加载环境变量
18
+ from dotenv import load_dotenv
19
+ load_dotenv(project_root / ".env")
20
+
21
+ from src.core.config import get_config, reload_config
22
+ from src.utils.logger import get_logger
23
+ from src.api.gradio_interface import get_gradio_interface
24
+ from src.core.task_manager import get_task_manager, TaskStatus
25
+
26
+
27
class TranscriptServiceApp:
    """Application wrapper for the audio transcription service.

    Ties together configuration, logging and the Gradio interface, and
    provides start-up validation, a ``/health`` endpoint and shutdown
    clean-up.
    """

    # Used only when the configuration object does not expose ``oss.endpoint``.
    _DEFAULT_OSS_ENDPOINT = "https://oss-cn-beijing.aliyuncs.com"

    def __init__(self, environment: Optional[str] = None):
        """Load configuration, create the logger and build the interface.

        Args:
            environment: Runtime environment (development/production). When
                given, the configuration is reloaded for that environment;
                otherwise the current configuration is used.
        """
        if environment:
            self.config = reload_config(environment)
        else:
            self.config = get_config()

        self.logger = get_logger("transcript_service.app")
        self.gradio_interface = get_gradio_interface()

        # Set by run(); stays None until the service has been started.
        self._start_time: Optional[float] = None
        self.is_running = False

        self._setup_health_endpoint()

        self.logger.info(f"应用程序初始化完成 - 环境: {self.config.environment}")

    def _collect_health_data(self) -> dict:
        """Build the JSON-serialisable payload returned by ``/health``."""
        started = getattr(self, "_start_time", None)
        return {
            "status": "healthy",
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "environment": self.config.environment,
            "version": self.config.app.version,
            # Zero until run() has recorded a start time.
            "uptime": time.time() - started if started is not None else 0.0,
            "services": {
                "oss": self._check_oss_connection(),
                "dashscope": self._check_dashscope_connection(),
            },
        }

    def _setup_health_endpoint(self):
        """Register ``GET /health`` on the interface's FastAPI app, if any.

        Registration is best-effort: any failure is logged as a warning and
        never prevents the application from being constructed.
        """
        try:
            app = getattr(self.gradio_interface, "app", None)
            if app is None:
                # NOTE(review): the visible part of GradioInterface stores its
                # Blocks object as ``interface``; confirm that an ``app``
                # attribute is really exposed, otherwise /health never exists.
                self.logger.warning("Gradio界面未提供 app 属性,已跳过健康检查端点注册")
                return

            from fastapi.responses import JSONResponse

            # Declared as a plain ``def`` so FastAPI runs it in its thread
            # pool: the OSS probe performs blocking network I/O and would
            # otherwise stall the event loop.
            @app.get("/health")
            def health_endpoint():
                return JSONResponse(content=self._collect_health_data())

        except Exception as e:
            self.logger.warning(f"设置健康检查端点失败: {e}")

    def _check_oss_connection(self) -> bool:
        """Return True when OSS credentials are set and a list call succeeds.

        Any error (missing dependency, network failure, bad credentials) is
        reported as ``False`` rather than raised.
        """
        try:
            oss_cfg = self.config.oss
            if not (oss_cfg.access_key_id and oss_cfg.access_key_secret):
                return False

            import oss2

            # Prefer the configured endpoint; fall back to the previous
            # hard-coded value when the config does not provide one.
            endpoint = getattr(oss_cfg, "endpoint", None) or self._DEFAULT_OSS_ENDPOINT
            if "://" not in endpoint:
                endpoint = f"https://{endpoint}"

            auth = oss2.Auth(oss_cfg.access_key_id, oss_cfg.access_key_secret)
            service = oss2.Service(auth, endpoint)

            # The call itself is the connectivity test. Its result object is
            # not iterable, so it must not be wrapped in list().
            service.list_buckets(max_keys=1)
            return True
        except Exception:
            return False

    def _check_dashscope_connection(self) -> bool:
        """Return True when a DashScope API key is set and looks well-formed.

        This is only a format check (``sk-`` prefix); no request is sent.
        """
        try:
            api_key = self.config.dashscope.api_key
            if not api_key:
                return False
            return api_key.startswith("sk-")
        except Exception:
            return False

    def setup_signal_handlers(self):
        """Intentionally a no-op.

        Graceful-shutdown signal handling was removed so the process can be
        terminated directly.
        """
        pass

    def validate_environment(self) -> bool:
        """Check required credentials and that working directories are writable.

        Returns:
            True when every required setting is present and both the log and
            temp directories accept writes; False otherwise.
        """
        try:
            required = (
                ("OSS_ACCESS_KEY_ID", self.config.oss.access_key_id),
                ("OSS_ACCESS_KEY_SECRET", self.config.oss.access_key_secret),
                ("DASHSCOPE_API_KEY", self.config.dashscope.api_key),
            )
            missing_vars = [name for name, value in required if not value]
            if missing_vars:
                self.logger.error(f"缺少必要的环境变量: {missing_vars}")
                return False

            for directory in (self.config.get_logs_dir(), self.config.get_temp_dir()):
                directory.mkdir(parents=True, exist_ok=True)

                # Probe write permission with a throw-away file.
                test_file = directory / ".write_test"
                try:
                    test_file.write_text("test")
                    test_file.unlink()
                except Exception as e:
                    self.logger.error(f"目录权限检查失败 {directory}: {str(e)}")
                    return False

            self.logger.info("环境验证通过")
            return True

        except Exception as e:
            self.logger.exception(f"环境验证失败: {str(e)}")
            return False

    def run(self, **launch_kwargs):
        """Validate the environment and launch the Gradio interface.

        If the requested port is unavailable, the launch is retried once with
        ``server_port=None`` so Gradio can pick a free port. Unrecoverable
        start-up errors terminate the process with exit status 1.

        Args:
            **launch_kwargs: Keyword arguments forwarded to the interface's
                ``launch`` method.
        """
        try:
            self.setup_signal_handlers()

            if not self.validate_environment():
                self.logger.error("环境验证失败,应用程序无法启动")
                sys.exit(1)

            self.is_running = True
            self._start_time = time.time()
            self.logger.info("正在启动音频转文字服务...")

            self.gradio_interface.launch(**launch_kwargs)

        except OSError as e:
            message = str(e).lower()
            # Match both the OS-level message and the wording Gradio uses when
            # none of the candidate ports can be bound.
            port_conflict = (
                "address already in use" in message
                or "cannot find empty port" in message
            )
            if port_conflict:
                port = launch_kwargs.get('server_port', self.config.app.port)
                self.logger.warning(f"端口 {port} 已被占用。正在尝试使用一个可用的随机端口...")

                # server_port=None lets Gradio search for a free port itself.
                launch_kwargs['server_port'] = None

                try:
                    self.gradio_interface.launch(**launch_kwargs)
                except KeyboardInterrupt:
                    self.logger.info("接收到键盘中断信号")
                    self.shutdown()
                except Exception as final_e:
                    self.logger.exception(f"尝试使用随机端口后,应用程序启动仍然失败: {str(final_e)}")
                    sys.exit(1)
            else:
                self.logger.exception(f"启动时发生未处理的网络错误: {str(e)}")
                sys.exit(1)
        except KeyboardInterrupt:
            self.logger.info("接收到键盘中断信号")
            self.shutdown()
        except Exception as e:
            self.logger.exception(f"应用程序启动失败: {str(e)}")
            sys.exit(1)

    def shutdown(self):
        """Cancel pending tasks and remove temp files; no-op if not running.

        Errors during shutdown are logged and not re-raised.
        """
        if not self.is_running:
            return

        self.logger.info("开始关闭应用程序...")
        self.is_running = False

        try:
            task_manager = get_task_manager()

            pending_tasks = task_manager.get_tasks_by_status(TaskStatus.PENDING)
            if pending_tasks:
                try:
                    loop = asyncio.get_running_loop()
                except RuntimeError:  # no running loop in this thread
                    loop = None

                if loop is not None:
                    # Keep references: the event loop only holds weak
                    # references to tasks, so unreferenced ones may be
                    # garbage-collected before they finish.
                    self._cancellation_tasks = [
                        loop.create_task(task_manager.cancel_task(task.id))
                        for task in pending_tasks
                    ]
                else:
                    async def _cancel_pending():
                        for task in pending_tasks:
                            await task_manager.cancel_task(task.id)

                    asyncio.run(_cancel_pending())

            active_tasks = (
                task_manager.get_tasks_by_status(TaskStatus.VALIDATING) +
                task_manager.get_tasks_by_status(TaskStatus.UPLOADING) +
                task_manager.get_tasks_by_status(TaskStatus.TRANSCRIBING)
            )

            if active_tasks:
                # Only reported: there is currently no actual wait for active
                # tasks, shutdown simply continues.
                self.logger.info(f"等待 {len(active_tasks)} 个活跃任务完成...")

            self.cleanup_temp_files()

            self.logger.info("应用程序已安全关闭")

        except Exception as e:
            self.logger.exception(f"关闭应用程序时发生错误: {str(e)}")

    def cleanup_temp_files(self):
        """Delete the regular files directly inside the temp directory.

        Sub-directories are left untouched. A file that cannot be removed is
        logged and skipped so the remaining files are still cleaned up.
        """
        try:
            temp_dir = self.config.get_temp_dir()
            if temp_dir.exists():
                for file_path in temp_dir.glob("*"):
                    if not file_path.is_file():
                        continue
                    try:
                        file_path.unlink()
                    except OSError as e:
                        self.logger.warning(f"清理临时文件失败: {str(e)}")
            self.logger.info("临时文件清理完成")
        except Exception as e:
            self.logger.warning(f"清理临时文件失败: {str(e)}")

    def get_app_info(self) -> dict:
        """Return basic application metadata.

        Returns:
            A dict with name, version, environment, debug flag, host, port and
            the current running state.
        """
        app_cfg = self.config.app
        return {
            "name": app_cfg.name,
            "version": app_cfg.version,
            "environment": self.config.environment,
            "debug": app_cfg.debug,
            "host": app_cfg.host,
            "port": app_cfg.port,
            "is_running": self.is_running
        }
293
+
294
+
295
def create_app(environment: Optional[str] = None) -> TranscriptServiceApp:
    """Factory for the application object.

    Args:
        environment: Runtime environment name, passed straight through to
            ``TranscriptServiceApp``.

    Returns:
        A newly constructed ``TranscriptServiceApp``.
    """
    application = TranscriptServiceApp(environment)
    return application
305
+
306
+
307
def main():
    """Command-line entry point: parse arguments, build the app and run it.

    Only options the user actually supplied are forwarded to the launch call.
    Sharing stays disabled unless ``--share`` is given.
    """
    import argparse

    parser = argparse.ArgumentParser(description="音频转文字服务")
    parser.add_argument(
        "--env",
        choices=["development", "production"],
        default="development",
        help="运行环境"
    )
    parser.add_argument(
        "--host",
        default=None,
        help="服务主机地址"
    )
    parser.add_argument(
        "--port",
        type=int,
        default=None,
        help="服务端口"
    )
    parser.add_argument(
        "--share",
        action="store_true",
        help="启用Gradio分享链接"
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="启用调试模式"
    )

    args = parser.parse_args()

    app = create_app(args.env)

    # share is always passed explicitly: False by default, True on --share.
    launch_kwargs = {'share': args.share}

    if args.host:
        launch_kwargs['server_name'] = args.host
    # Compare against None so an explicit "--port 0" is not silently dropped.
    if args.port is not None:
        launch_kwargs['server_port'] = args.port
    if args.debug:
        launch_kwargs['debug'] = True

    app.run(**launch_kwargs)
361
+
362
+
363
+ if __name__ == "__main__":
364
+ main()
365
+
config/environments/development.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 开发环境配置
2
+ app:
3
+ name: "音频转文字服务"
4
+ version: "1.0.0"
5
+ debug: true
6
+ host: "127.0.0.1"
7
+ port: 7860
8
+ max_file_size: 2147483648 # 2GB
9
+ max_files_count: 100
10
+ concurrent_tasks: 5
11
+
12
+ # OSS配置
13
+ oss:
14
+ endpoint: "oss-cn-beijing.aliyuncs.com"
15
+ bucket_name: "audio-transcript-dev"
16
+ upload_timeout: 300
17
+ url_expire_hours: 24
18
+ temp_prefix: "temp/audio"
19
+ auto_cleanup_days: 7
20
+
21
+ # 阿里云百炼API配置
22
+ dashscope:
23
+ base_url: "https://dashscope.aliyuncs.com/api/v1"
24
+ model: "paraformer-v2"
25
+ timeout: 300
26
+ max_retries: 3
27
+ retry_delay: 5
28
+ language_hints: ["zh", "en"]
29
+
30
+ # 任务配置
31
+ task:
32
+ status_check_interval: 2
33
+ max_processing_time: 3600 # 1小时
34
+ queue_size: 1000
35
+
36
+ # 日志配置
37
+ logging:
38
+ level: "DEBUG"
39
+ format: "detailed"
40
+ file_max_size: "10MB"
41
+ backup_count: 5
config/environments/production.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 生产环境配置
2
+ app:
3
+ name: "音频转文字服务"
4
+ version: "1.0.0"
5
+ debug: false
6
+ host: "0.0.0.0"
7
+ port: 8080
8
+ max_file_size: 2147483648 # 2GB
9
+ max_files_count: 100
10
+ concurrent_tasks: 10
11
+
12
+ # OSS配置
13
+ oss:
14
+ endpoint: "oss-cn-beijing.aliyuncs.com"
15
+ bucket_name: "audio-transcript-prod"
16
+ upload_timeout: 300
17
+ url_expire_hours: 24
18
+ temp_prefix: "temp/audio"
19
+ auto_cleanup_days: 7
20
+
21
+ # 阿里云百炼API配置
22
+ dashscope:
23
+ base_url: "https://dashscope.aliyuncs.com/api/v1"
24
+ model: "paraformer-v2"
25
+ timeout: 300
26
+ max_retries: 5
27
+ retry_delay: 10
28
+ language_hints: ["zh", "en"]
29
+
30
+ # 任务配置
31
+ task:
32
+ status_check_interval: 5
33
+ max_processing_time: 3600 # 1小时
34
+ queue_size: 2000
35
+
36
+ # 日志配置
37
+ logging:
38
+ level: "INFO"
39
+ format: "structured"
40
+ file_max_size: "50MB"
41
+ backup_count: 10
config/logging.yaml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 日志系统配置
2
+ version: 1
3
+ disable_existing_loggers: false
4
+
5
+ formatters:
6
+ detailed:
7
+ format: '[%(asctime)s] [%(levelname)s] [%(name)s] [%(task_id)s] %(message)s'
8
+ datefmt: '%Y-%m-%d %H:%M:%S'
9
+
10
+ structured:
11
+ format: '{"timestamp": "%(asctime)s", "level": "%(levelname)s", "module": "%(name)s", "task_id": "%(task_id)s", "message": "%(message)s"}'
12
+ datefmt: '%Y-%m-%d %H:%M:%S'
13
+
14
+ simple:
15
+ format: '[%(levelname)s] %(message)s'
16
+
17
+ handlers:
18
+ console:
19
+ class: logging.StreamHandler
20
+ level: DEBUG
21
+ formatter: detailed
22
+ stream: ext://sys.stdout
23
+
24
+ file_handler:
25
+ class: logging.handlers.RotatingFileHandler
26
+ level: INFO
27
+ formatter: structured
28
+ filename: logs/app.log
29
+ maxBytes: 10485760 # 10MB
30
+ backupCount: 5
31
+ encoding: utf8
32
+
33
+ error_file_handler:
34
+ class: logging.handlers.RotatingFileHandler
35
+ level: ERROR
36
+ formatter: detailed
37
+ filename: logs/error.log
38
+ maxBytes: 10485760 # 10MB
39
+ backupCount: 5
40
+ encoding: utf8
41
+
42
+ loggers:
43
+ transcript_service:
44
+ level: DEBUG
45
+ handlers: [console, file_handler, error_file_handler]
46
+ propagate: false
47
+
48
+ transcript_service.oss:
49
+ level: INFO
50
+ handlers: [console, file_handler]
51
+ propagate: false
52
+
53
+ transcript_service.api:
54
+ level: INFO
55
+ handlers: [console, file_handler]
56
+ propagate: false
57
+
58
+ transcript_service.task:
59
+ level: DEBUG
60
+ handlers: [console, file_handler]
61
+ propagate: false
62
+
63
+ root:
64
+ level: WARNING
65
+ handlers: [console, file_handler]
pyproject.toml ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "transcript-service"
3
+ version = "1.0.0"
4
+ description = "智能音频转文字Web服务"
5
+ authors = [{name = "Your Name", email = "your.email@example.com"}]
6
+ license = {text = "MIT"}
7
+ readme = "README.md"
8
+ requires-python = ">=3.9"
9
+
10
+ dependencies = [
11
+ # 核心Web框架依赖
12
+ "gradio>=4.44.0",
13
+ "fastapi>=0.104.0",
14
+ "uvicorn>=0.24.0",
15
+
16
+ # 云服务集成依赖
17
+ "oss2>=2.18.0",
18
+ "dashscope>=1.14.0",
19
+
20
+ # 数据处理核心依赖
21
+ "pydantic>=2.5.0",
22
+ "pydantic-settings>=2.1.0",
23
+
24
+ # 文件处理工具依赖
25
+ "python-multipart>=0.0.6",
26
+ "python-magic>=0.4.27",
27
+
28
+ # 配置管理依赖
29
+ "PyYAML>=6.0.1",
30
+ "python-dotenv>=1.0.0",
31
+
32
+ # 日志和监控依赖
33
+ "structlog>=23.2.0",
34
+ "rich>=13.7.0",
35
+
36
+ # HTTP客户端依赖
37
+ "httpx>=0.25.2",
38
+ "aiohttp>=3.9.0",
39
+
40
+ # 命令行工具依赖
41
+ "click>=8.1.7",
42
+ "typer>=0.9.0",
43
+
44
+ # 性能优化可选依赖
45
+ "orjson>=3.9.0",
46
+ "ujson>=5.8.0"
47
+ ]
48
+
49
+ [project.optional-dependencies]
50
+ dev = [
51
+ "pytest>=7.4.0",
52
+ "pytest-asyncio>=0.21.0",
53
+ "pytest-cov>=4.1.0",
54
+ "black>=23.11.0",
55
+ "flake8>=7.0.0",
56
+ "isort>=5.12.0",
57
+ "mypy>=1.7.0",
58
+ "pre-commit>=3.5.0"
59
+ ]
60
+
61
+ [project.scripts]
62
+ transcript-service = "app:main"
63
+
64
+ [build-system]
65
+ requires = ["hatchling"]
66
+ build-backend = "hatchling.build"
67
+
68
+ [tool.hatch.build.targets.wheel]
69
+ packages = ["src"]
70
+
71
+ [dependency-groups]
72
+ dev = [
73
+ "pytest>=7.4.0",
74
+ "pytest-asyncio>=0.21.0",
75
+ "pytest-cov>=4.1.0",
76
+ "black>=23.11.0",
77
+ "flake8>=7.0.0",
78
+ "isort>=5.12.0",
79
+ "mypy>=1.7.0",
80
+ "pre-commit>=3.5.0"
81
+ ]
82
+
83
+ [tool.black]
84
+ line-length = 88
85
+ target-version = ['py39']
86
+ include = '\.pyi?$'
87
+ extend-exclude = '''
88
+ /(
89
+ # directories
90
+ \.eggs
91
+ | \.git
92
+ | \.hg
93
+ | \.mypy_cache
94
+ | \.tox
95
+ | \.venv
96
+ | _build
97
+ | buck-out
98
+ | build
99
+ | dist
100
+ )/
101
+ '''
102
+
103
+ [tool.isort]
104
+ profile = "black"
105
+ multi_line_output = 3
106
+ line_length = 88
107
+ known_first_party = ["src"]
108
+
109
+ [tool.flake8]
110
+ max-line-length = 88
111
+ extend-ignore = ["E203", "W503"]
112
+ exclude = [
113
+ ".git",
114
+ "__pycache__",
115
+ ".venv",
116
+ "build",
117
+ "dist",
118
+ "*.egg-info"
119
+ ]
120
+
121
+ [tool.mypy]
122
+ python_version = "3.9"
123
+ warn_return_any = true
124
+ warn_unused_configs = true
125
+ disallow_untyped_defs = true
126
+ disallow_incomplete_defs = true
127
+ check_untyped_defs = true
128
+ disallow_untyped_decorators = true
129
+ no_implicit_optional = true
130
+ warn_redundant_casts = true
131
+ warn_unused_ignores = true
132
+ warn_no_return = true
133
+ warn_unreachable = true
134
+ strict_equality = true
135
+
136
+ [[tool.mypy.overrides]]
137
+ module = ["gradio.*", "oss2.*", "dashscope.*"]
138
+ ignore_missing_imports = true
139
+
140
+ [tool.pytest.ini_options]
141
+ minversion = "7.0"
142
+ addopts = "-ra -q --strict-markers"
143
+ testpaths = ["tests"]
144
+ markers = [
145
+ "slow: marks tests as slow (deselect with '-m \"not slow\"')",
146
+ "integration: marks tests as integration tests",
147
+ "unit: marks tests as unit tests"
148
+ ]
requirements.txt ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 核心依赖
2
+ gradio>=4.44.0
3
+ fastapi>=0.104.0
4
+ uvicorn>=0.24.0
5
+
6
+ # 云服务依赖
7
+ oss2>=2.18.0
8
+ dashscope>=1.14.0
9
+
10
+ # 数据处理依赖
11
+ pydantic>=2.5.0
12
+ pydantic-settings>=2.1.0
13
+
14
+ # 文件处理依赖
15
+ python-multipart>=0.0.6
16
+ python-magic>=0.4.27
17
+
18
+ # 配置管理
19
+ PyYAML>=6.0.1
20
+ python-dotenv>=1.0.0
21
+
22
+ # 日志和监控
23
+ structlog>=23.2.0
24
+ rich>=13.7.0
25
+
26
+ # HTTP客户端
27
+ httpx>=0.25.2
28
+ aiohttp>=3.9.0
29
+
30
+ # 工具依赖
31
+ click>=8.1.7
32
+ typer>=0.9.0
33
+
34
+ # 开发依赖
35
+ pytest>=7.4.0
36
+ pytest-asyncio>=0.21.0
37
+ black>=23.11.0
38
+ flake8>=6.1.0
39
+ isort>=5.12.0
40
+
41
+ # 可选依赖(用于性能优化)
42
+ orjson>=3.9.0
43
+ ujson>=5.8.0
src/__init__.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """源代码主模块
2
+
3
+ 应用程序源代码的根模块,集成所有功能组件。
4
+ """
5
+
6
+ # 导入核心模块
7
+ from .core import (
8
+ Config, get_config, reload_config,
9
+ TaskManager, TaskStatus, TaskPriority, Task, get_task_manager, task_manager
10
+ )
11
+
12
+ # 导入服务模块
13
+ from .services import (
14
+ FileValidator, get_file_validator, file_validator,
15
+ OSSService, get_oss_service, oss_service,
16
+ ParaformerService, get_paraformer_service, paraformer_service
17
+ )
18
+
19
+ # 导入工具模块
20
+ from .utils import (
21
+ Logger, TaskLogger, get_logger, get_task_logger, logger
22
+ )
23
+
24
+ # 导入API模块
25
+ from .api import (
26
+ GradioInterface, get_gradio_interface, create_demo_interface, gradio_interface
27
+ )
28
+
29
+ __all__ = [
30
+ # 核心模块
31
+ "Config", "get_config", "reload_config",
32
+ "TaskManager", "TaskStatus", "TaskPriority", "Task", "get_task_manager", "task_manager",
33
+
34
+ # 服务模块
35
+ "FileValidator", "get_file_validator", "file_validator",
36
+ "OSSService", "get_oss_service", "oss_service",
37
+ "ParaformerService", "get_paraformer_service", "paraformer_service",
38
+
39
+ # 工具模块
40
+ "Logger", "TaskLogger", "get_logger", "get_task_logger", "logger",
41
+
42
+ # API模块
43
+ "GradioInterface", "get_gradio_interface", "create_demo_interface", "gradio_interface"
44
+ ]
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.14 kB). View file
 
src/api/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """API模块
2
+
3
+ 包含应用程序的API接口和用户界面。
4
+ """
5
+
6
+ from .gradio_interface import GradioInterface, get_gradio_interface, create_demo_interface, gradio_interface
7
+
8
+ __all__ = [
9
+ "GradioInterface",
10
+ "get_gradio_interface",
11
+ "create_demo_interface",
12
+ "gradio_interface"
13
+ ]
src/api/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (385 Bytes). View file
 
src/api/__pycache__/gradio_interface.cpython-310.pyc ADDED
Binary file (15.5 kB). View file
 
src/api/gradio_interface.py ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio用户界面模块
2
+
3
+ 提供基于Gradio的Web界面,支持文件上传、进度显示和结果展示。
4
+ """
5
+
6
+ import asyncio
7
+ import json
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional, Tuple, Any
10
+ import gradio as gr
11
+ import pandas as pd
12
+
13
+ from ..core.config import get_config
14
+ from ..core.task_manager import get_task_manager, TaskStatus, TaskPriority
15
+ from ..utils.logger import get_task_logger
16
+ from ..services.file_validator import get_file_validator
17
+
18
+
19
+ class GradioInterface:
20
+ """Gradio界面管理器"""
21
+
22
def __init__(self):
    """Acquire shared services, build the UI and subscribe to task updates."""
    # No task has been submitted through this interface yet.
    self.current_task_id = None

    # Shared collaborators obtained from their module-level accessors.
    self.config = get_config()
    self.task_manager = get_task_manager()
    self.file_validator = get_file_validator()
    self.logger = get_task_logger(logger_name="transcript_service.gradio")

    # Building the UI reads the task manager and the file validator, so it
    # must happen after they are assigned.
    self.interface = self._create_interface()

    # Get notified whenever a task changes status.
    self.task_manager.add_status_callback(self._on_task_status_change)
37
+
38
+ def _create_interface(self) -> gr.Blocks:
39
+ """Build and return the Gradio Blocks UI: upload form, options, result panels and event wiring."""
40
+ # Supported-format description from the file validator (drives the upload filter and the info tab)
41
+ supported_formats = self.file_validator.get_supported_formats()
42
+
43
+ with gr.Blocks(
44
+ title="音频转文字服务",
45
+ theme=gr.themes.Soft(),
46
+ css="""
47
+ .main-container { max-width: 1000px; margin: 0 auto; }
48
+ .upload-area { border: 2px dashed #ccc; border-radius: 10px; padding: 20px; text-align: center; }
49
+ .result-area { margin-top: 20px; }
50
+ .status-simple { font-size: 16px; font-weight: bold; }
51
+ """
52
+ ) as interface:
53
+ # Page title
54
+ gr.Markdown("# 🎵 音频转文字服务")
55
+
56
+ with gr.Row():
57
+ with gr.Column(scale=3):
58
+ # File upload area (multiple files allowed)
59
+ file_upload = gr.File(
60
+ label="📁 选择音频文件(支持多文件)",
61
+ file_count="multiple",
62
+ file_types=list(supported_formats['extensions']),
63
+ height=120
64
+ )
65
+
66
+ # Simplified settings row
67
+ with gr.Row():
68
+ # Task priority
69
+ priority_select = gr.Radio(
70
+ label="优先级",
71
+ choices=[("普通", "NORMAL"), ("高优先级", "HIGH")],
72
+ value="NORMAL"
73
+ )
74
+
75
+ # Transcription parameters (collapsed by default)
76
+ with gr.Accordion("⚙️ 转录参数设置", open=False) as params_section:
77
+ # Recognition languages
78
+ language_select = gr.CheckboxGroup(
79
+ label="识别语言",
80
+ choices=[
81
+ ("中文", "zh"), ("英文", "en"), ("日语", "ja"),
82
+ ("粤语", "yue"), ("韩语", "ko"), ("德语", "de"),
83
+ ("法语", "fr"), ("俄语", "ru")
84
+ ],
85
+ value=["zh", "en"]
86
+ )
87
+
88
+ with gr.Row():
89
+ # Basic options
90
+ disfluency_removal = gr.Checkbox(
91
+ label="过滤语气词",
92
+ value=True
93
+ )
94
+ timestamp_alignment = gr.Checkbox(
95
+ label="时间戳校准",
96
+ value=True
97
+ )
98
+ diarization_enabled = gr.Checkbox(
99
+ label="说话人分离",
100
+ value=True
101
+ )
102
+
103
+ with gr.Row():
104
+ speaker_count = gr.Number(
105
+ label="说话人数量(可选)",
106
+ value=None,
107
+ minimum=None,
108
+ maximum=100,
109
+ step=1,
110
+ info="留空则自动判断,如需指定请输入2-100之间的数值"
111
+ )
112
+ channel_select = gr.Textbox(
113
+ label="音轨索引",
114
+ value="0",
115
+ info="多音轨文件的音轨索引,用逗号分隔"
116
+ )
117
+
118
+ # Advanced options (nested, also collapsed)
119
+ with gr.Accordion("高级选项", open=False):
120
+ vocabulary_id = gr.Textbox(
121
+ label="热词ID v2",
122
+ value="",
123
+ info="v2模型的热词ID"
124
+ )
125
+ phrase_id = gr.Textbox(
126
+ label="热词ID v1",
127
+ value="",
128
+ info="v1模型的热词ID"
129
+ )
130
+ special_word_filter = gr.Textbox(
131
+ label="敏感词过滤配置",
132
+ value="",
133
+ lines=2,
134
+ placeholder='JSON格式配置',
135
+ info="敏感词过滤的JSON配置"
136
+ )
137
+
138
+ # Control buttons
139
+ with gr.Row():
140
+ start_btn = gr.Button("🚀 开始转录", variant="primary", size="lg")
141
+ cancel_btn = gr.Button("❌ 取消", variant="secondary")
142
+ clear_btn = gr.Button("🗑️ 清空", variant="secondary")
143
+
144
+ with gr.Column(scale=2):
145
+ # Simplified status display (read-only)
146
+ status_text = gr.Textbox(
147
+ label="📊 当前状态",
148
+ value="等待上传文件...",
149
+ interactive=False,
150
+ elem_classes=["status-simple"]
151
+ )
152
+
153
+ # Transcription result
154
+ result_text = gr.Textbox(
155
+ label="📝 转录结果",
156
+ placeholder="转录结果将在这里显示...",
157
+ lines=12,
158
+ max_lines=20,
159
+ show_copy_button=True,
160
+ elem_classes=["result-area"]
161
+ )
162
+
163
+ # Per-file statistics table (hidden initially)
164
+ stats_df = gr.Dataframe(
165
+ headers=["文件名", "时长", "文本长度", "置信度"],
166
+ datatype=["str", "str", "number", "number"],
167
+ label="📈 处理统计",
168
+ visible=False
169
+ )
170
+
171
+ # Collapsible details section
172
+ with gr.Accordion("📋 详细信息", open=False) as detail_section:
173
+ with gr.Tabs():
174
+ with gr.Tab("系统信息"):
175
+ system_info = gr.JSON(
176
+ label="服务状态",
177
+ value=self._get_system_info()
178
+ )
179
+ format_info = gr.JSON(
180
+ label="支持格式",
181
+ value=supported_formats
182
+ )
183
+
184
+ with gr.Tab("任务信息"):
185
+ task_info = gr.JSON(
186
+ label="当前任务",
187
+ value={}
188
+ )
189
+
190
+ with gr.Tab("完整结果"):
191
+ result_json = gr.JSON(
192
+ label="JSON结果",
193
+ value={}
194
+ )
195
+
196
+ with gr.Tab("处理日志"):
197
+ log_text = gr.Textbox(
198
+ label="详细日志",
199
+ lines=8,
200
+ max_lines=12,
201
+ interactive=False,
202
+ show_copy_button=True
203
+ )
204
+ log_download = gr.File(
205
+ label="下载日志文件",
206
+ visible=False
207
+ )
208
+
209
+
210
+
211
+ # Manual refresh button: re-runs the same update function as the load handler below
212
+ with gr.Row():
213
+ refresh_btn = gr.Button("🔄 刷新状态", variant="secondary", size="sm")
214
+ refresh_btn.click(
215
+ fn=self._update_interface,
216
+ outputs=[status_text, task_info, result_text, result_json, stats_df, system_info, log_text]
217
+ )
218
+
219
+ # Event wiring: input order must match the parameter order of _process_files
220
+ start_btn.click(
221
+ fn=self._process_files,
222
+ inputs=[
223
+ file_upload, priority_select, language_select,
224
+ disfluency_removal, timestamp_alignment, diarization_enabled,
225
+ speaker_count, channel_select, vocabulary_id,
226
+ phrase_id, special_word_filter
227
+ ],
228
+ outputs=[status_text, task_info, log_text]
229
+ )
230
+
231
+ cancel_btn.click(
232
+ fn=self._cancel_current_task,
233
+ outputs=[status_text, task_info]
234
+ )
235
+
236
+ clear_btn.click(
237
+ fn=self._clear_interface,
238
+ outputs=[file_upload, result_text, result_json, stats_df, log_text, status_text, task_info]
239
+ )
240
+
241
+ # NOTE(review): originally labelled "periodic update", but load is wired without `every`, so presumably it runs once per page load - confirm
242
+ interface.load(
243
+ fn=self._update_interface,
244
+ outputs=[status_text, task_info, result_text, result_json, stats_df, system_info, log_text]
245
+ )
246
+
247
+ return interface
248
+
249
+ def _get_custom_css(self) -> str:
250
+ """Return a custom CSS string. NOTE(review): _create_interface passes its own inline css; confirm this helper is still used."""
251
+ return """
252
+ .gradio-container {
253
+ max-width: 1200px !important;
254
+ }
255
+ .gr-button-primary {
256
+ background: linear-gradient(45deg, #FF6B6B, #4ECDC4) !important;
257
+ border: none !important;
258
+ }
259
+ .gr-button-primary:hover {
260
+ transform: translateY(-2px) !important;
261
+ box-shadow: 0 4px 12px rgba(0,0,0,0.15) !important;
262
+ }
263
+ .progress-bar {
264
+ background: linear-gradient(90deg, #FF6B6B, #4ECDC4) !important;
265
+ }
266
+ """
267
+
268
+ def _get_system_info(self) -> Dict:
269
+ """获取系统信息"""
270
+ stats = self.task_manager.get_statistics()
271
+ return {
272
+ "服务状态": "运行中",
273
+ "当前任务数": stats['total_tasks'],
274
+ "待处理": stats['pending'],
275
+ "处理中": stats['validating'] + stats['uploading'] + stats['transcribing'],
276
+ "已完成": stats['completed'],
277
+ "失败": stats['failed'],
278
+ "队列大小": stats['queue_size']
279
+ }
280
+
281
+ def _get_timestamp(self) -> str:
282
+ """获取当前时间戳"""
283
+ from datetime import datetime
284
+ return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
285
+
286
+ async def _process_files(
287
+ self,
288
+ files: List,
289
+ priority: str,
290
+ languages: List[str],
291
+ disfluency_removal: bool,
292
+ timestamp_alignment: bool,
293
+ diarization_enabled: bool,
294
+ speaker_count: Optional[int] | None,
295
+ channel_id: str,
296
+ vocabulary_id: str,
297
+ phrase_id: str,
298
+ special_word_filter: str
299
+ ) -> Tuple[str, Dict, str]:
300
+ """处理上传的文件
301
+
302
+ Args:
303
+ files: 上传的文件列表
304
+ languages: 选择的语言
305
+ priority: 任务优先级
306
+ channel_id: 音轨索引
307
+ disfluency_removal: 是否过滤语气词
308
+ timestamp_alignment: 是否启用时间戳校准
309
+ diarization_enabled: 是否启用说话人分离
310
+ speaker_count: 说话人数量参考值
311
+ vocabulary_id: 热词ID v2
312
+ phrase_id: 热词ID v1
313
+ special_word_filter: 敏感词过滤配置
314
+
315
+ Returns:
316
+ (状态信息, 任务信息, 日志信息)
317
+ """
318
+ try:
319
+ if not files:
320
+ return "请先上传音频文件", {}, "错误: 未选择任何文件"
321
+
322
+ # 记录详细日志
323
+ log_messages = []
324
+ log_messages.append(f"[{self._get_timestamp()}] 开始处理文件上传请求")
325
+ log_messages.append(f"[{self._get_timestamp()}] 接收到 {len(files)} 个文件")
326
+
327
+ # 转换文件路径
328
+ file_paths = [Path(f.name) for f in files]
329
+ log_messages.append(f"[{self._get_timestamp()}] 转换文件路径完成")
330
+
331
+ # 显示文件信息
332
+ for i, file_path in enumerate(file_paths):
333
+ try:
334
+ file_size = file_path.stat().st_size
335
+ log_messages.append(f"[{self._get_timestamp()}] 文件 {i+1}: {file_path.name} (大小: {file_size} 字节)")
336
+ except Exception as e:
337
+ log_messages.append(f"[{self._get_timestamp()}] 文件 {i+1}: {file_path.name} (无法获取文件信息: {str(e)})")
338
+
339
+ # 解析音轨参数
340
+ try:
341
+ channel_list = [int(x.strip()) for x in channel_id.split(',') if x.strip()]
342
+ except ValueError:
343
+ channel_list = [0] # 默认为第一条音轨
344
+
345
+ # 验证说话人数量参数
346
+ validated_speaker_count = None
347
+ if speaker_count is not None:
348
+ if isinstance(speaker_count, (int, float)) and speaker_count >= 2 and speaker_count <= 100:
349
+ validated_speaker_count = int(speaker_count)
350
+ else:
351
+ log_messages.append(f"[{self._get_timestamp()}] 警告: 说话人数量无效({speaker_count}),将使用自动判断")
352
+
353
+ # 解析敏感词过滤参数
354
+ special_filter = None
355
+ if special_word_filter.strip():
356
+ try:
357
+ special_filter = json.loads(special_word_filter)
358
+ except json.JSONDecodeError as e:
359
+ log_messages.append(f"[{self._get_timestamp()}] 警告: 敏感词过滤配置格式错误,将使���默认设置")
360
+
361
+ # 创建任务
362
+ task_priority = TaskPriority.HIGH if priority == "HIGH" else TaskPriority.NORMAL
363
+
364
+ # 准备元数据,包含所有Paraformer参数
365
+ metadata = {
366
+ "languages": languages,
367
+ "file_count": len(file_paths),
368
+ "paraformer_params": {
369
+ "language_hints": languages,
370
+ "channel_id": channel_list,
371
+ "disfluency_removal_enabled": disfluency_removal,
372
+ "timestamp_alignment_enabled": timestamp_alignment,
373
+ "diarization_enabled": diarization_enabled,
374
+ "speaker_count": validated_speaker_count,
375
+ "vocabulary_id": vocabulary_id.strip() if vocabulary_id.strip() else None,
376
+ "phrase_id": phrase_id.strip() if phrase_id.strip() else None,
377
+ "special_word_filter": json.dumps(special_filter) if special_filter else None
378
+ }
379
+ }
380
+
381
+ log_messages.append(f"[{self._get_timestamp()}] 创建任务,优先级: {task_priority.value}")
382
+ log_messages.append(f"[{self._get_timestamp()}] 选择语言: {', '.join(languages) if languages else '自动识别'}")
383
+
384
+ self.current_task_id = await self.task_manager.create_task(
385
+ file_paths=file_paths,
386
+ priority=task_priority,
387
+ metadata=metadata
388
+ )
389
+
390
+ task = self.task_manager.get_task(self.current_task_id)
391
+
392
+ log_messages.append(f"[{self._get_timestamp()}] 任务创建成功,任务ID: {self.current_task_id}")
393
+
394
+ return (
395
+ f"任务已创建: {self.current_task_id}",
396
+ task.to_dict() if task else {},
397
+ "\n".join(log_messages) + f"\n开始处理 {len(file_paths)} 个文件...\n"
398
+ )
399
+
400
+ except Exception as e:
401
+ error_msg = f"创建任务失败: {str(e)}"
402
+ self.logger.exception(error_msg)
403
+ return error_msg, {}, f"错误: {error_msg}\n"
404
+
405
+ def _cancel_current_task(self) -> Tuple[str, Dict]:
406
+ """取消当前任务"""
407
+ if not self.current_task_id:
408
+ return "没有正在执行的任务", {}
409
+
410
+ success = asyncio.create_task(
411
+ self.task_manager.cancel_task(self.current_task_id)
412
+ )
413
+
414
+ if success:
415
+ return f"任务 {self.current_task_id} 已取消", {}
416
+ else:
417
+ return "取消任务失败", {}
418
+
419
+ def _clear_interface(self) -> Tuple[None, str, Dict, List, str, str, Dict]:
420
+ """清空界面"""
421
+ self.current_task_id = None
422
+ return (
423
+ None, # file_upload
424
+ "", # result_text
425
+ {}, # result_json
426
+ [], # stats_df
427
+ "", # log_text
428
+ "界面已清空,等待上传文件...", # status_text
429
+ {} # task_info
430
+ )
431
+
432
+ def _update_interface(self) -> Tuple[str, Dict, str, Dict, List, Dict, str]:
433
+ """更新界面状态"""
434
+ # 更新当前任务状态
435
+ status_text = "等待上传文件..."
436
+ task_info = {}
437
+ result_text = ""
438
+ result_json = {}
439
+ stats_data = []
440
+ log_text = ""
441
+
442
+ if self.current_task_id:
443
+ task = self.task_manager.get_task(self.current_task_id)
444
+ if task:
445
+ task_info = task.to_dict()
446
+ status_text = f"[{task.status.value}] {task.progress.message}"
447
+
448
+ # 收集详细日志
449
+ log_text = self._collect_task_logs(task)
450
+
451
+ # 如果任务完成,显示结果
452
+ if task.status == TaskStatus.COMPLETED:
453
+ self.logger.debug(f"任务已完成,检查转录结果: {task.result.transcription_results}")
454
+ if task.result.transcription_results:
455
+ result_json = task.result.transcription_results
456
+
457
+ # 提取转录文本
458
+ transcriptions = result_json.get('transcriptions', [])
459
+ self.logger.debug(f"转录结果: {transcriptions}")
460
+ result_text = "\n\n".join([
461
+ f"文件: {t.get('file_url', '').split('/')[-1]}\n{t.get('text', '')}"
462
+ for t in transcriptions if t.get('text')
463
+ ])
464
+
465
+ # 生成统计表格
466
+ stats_data = []
467
+ for t in transcriptions:
468
+ if 'error' not in t:
469
+ stats_data.append([
470
+ t.get('file_url', '').split('/')[-1],
471
+ f"{t.get('duration', 0):.1f}s",
472
+ len(t.get('text', '')),
473
+ t.get('language', 'unknown'),
474
+ round(t.get('confidence', 0), 3)
475
+ ])
476
+ else:
477
+ self.logger.debug("任务已完成但没有转录结果")
478
+ elif task.status == TaskStatus.FAILED:
479
+ # 如果任务失败,显示错误信息
480
+ if task.result and task.result.error_message:
481
+ log_text += f"\n[{self._get_timestamp()}] 任务失败: {task.result.error_message}"
482
+
483
+ # 更新系统信息
484
+ system_info = self._get_system_info()
485
+
486
+ return status_text, task_info, result_text, result_json, stats_data, system_info, log_text
487
+
488
+ def _collect_task_logs(self, task) -> str:
489
+ """收集任务的详细日志
490
+
491
+ Args:
492
+ task: 任务对象
493
+
494
+ Returns:
495
+ 格式化的日志字符串
496
+ """
497
+ if not task:
498
+ return "无任务信息"
499
+
500
+ log_lines = []
501
+ log_lines.append(f"[{self._get_timestamp()}] 任务ID: {task.id}")
502
+ log_lines.append(f"[{self._get_timestamp()}] 任务状态: {task.status.value}")
503
+ log_lines.append(f"[{self._get_timestamp()}] 任务创建时间: {task.created_at}")
504
+
505
+ # 添加进度信息
506
+ if task.progress:
507
+ log_lines.append(f"[{self._get_timestamp()}] 进度信息: {task.progress.message}")
508
+ # TaskProgress对象没有details属性,只使用message
509
+
510
+ # 添加文件信息
511
+ if hasattr(task, 'file_paths') and task.file_paths:
512
+ log_lines.append(f"[{self._get_timestamp()}] 文件列表:")
513
+ for i, file_path in enumerate(task.file_paths):
514
+ try:
515
+ file_size = file_path.stat().st_size
516
+ log_lines.append(f" {i+1}. {file_path.name} ({file_size} bytes)")
517
+ except Exception as e:
518
+ log_lines.append(f" {i+1}. {file_path.name} (无法获取文件信息: {str(e)})")
519
+
520
+ # 添加结果信息(如果任务已完成)
521
+ if task.status == TaskStatus.COMPLETED and task.result:
522
+ log_lines.append(f"[{self._get_timestamp()}] 任务完成时间: {task.completed_at}")
523
+ if hasattr(task.result, 'transcription_results') and task.result.transcription_results:
524
+ transcriptions = task.result.transcription_results.get('transcriptions', [])
525
+ log_lines.append(f"[{self._get_timestamp()}] 转录结果: {len(transcriptions)} 个文件")
526
+
527
+ # 添加错误信息(如果有的话)
528
+ # Task对象没有error属性,错误信息在result中
529
+
530
+ return "\n".join(log_lines)
531
+
532
+ def _on_task_status_change(self, task):
533
+ """任务状态变化回调"""
534
+ self.logger.debug(f"任务状态变化: {task.id} -> {task.status.value}")
535
+ # 当任务状态变化时,不直接更新界面,而是依赖定时更新机制
536
+ # Gradio的回调中不能直接更新界面组件
537
+
538
def launch(self, **kwargs):
    """Launch the Gradio interface.

    Args:
        **kwargs: Launch options that override the defaults built here.

    Returns:
        Whatever ``self.interface.launch`` returns.
    """
    app_settings = self.config.app
    launch_options = {
        'server_name': '0.0.0.0',  # listen on all interfaces for external access
        'server_port': app_settings.port,
        'share': True,  # enable the public share link
        'debug': app_settings.debug,
        'show_error': True,
        'quiet': not app_settings.debug,
        **kwargs,  # caller-supplied options win over the defaults
    }

    address = f"http://{launch_options['server_name']}:{launch_options['server_port']}"
    self.logger.info(f"启动Gradio界面: {address}")

    return self.interface.launch(**launch_options)
553
+
554
+
555
# Module-level interface instance, constructed when this module is imported.
gradio_interface = GradioInterface()
557
+
558
+
559
def get_gradio_interface() -> GradioInterface:
    """Return the module-level Gradio interface instance.

    Returns:
        The shared ``GradioInterface`` object.
    """
    shared_instance = gradio_interface
    return shared_instance
566
+
567
+
568
def create_demo_interface() -> gr.Blocks:
    """Return the Gradio object held by the module-level interface instance.

    Returns:
        The ``interface`` attribute of the shared ``GradioInterface``.
    """
    demo = gradio_interface.interface
    return demo
src/core/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Core package.

Contains the application's core functionality and base components.
"""

from .config import Config, get_config, reload_config
from .task_manager import TaskManager, TaskStatus, TaskPriority, Task, get_task_manager, task_manager

# Public names re-exported by the package.
__all__ = [
    "Config",
    "get_config",
    "reload_config",
    "TaskManager",
    "TaskStatus",
    "TaskPriority",
    "Task",
    "get_task_manager",
    "task_manager"
]
src/core/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (536 Bytes). View file
 
src/core/__pycache__/config.cpython-310.pyc ADDED
Binary file (5.46 kB). View file
 
src/core/__pycache__/task_manager.cpython-310.pyc ADDED
Binary file (14.3 kB). View file
 
src/core/config.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """配置管理模块
2
+
3
+ 提供应用程序配置的加载和管理功能。
4
+ 支持多环境配置和环境变量覆盖。
5
+ """
6
+
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, Dict, Optional
10
+ import yaml
11
+ from pydantic import Field
12
+ from pydantic_settings import BaseSettings
13
+ from pydantic_settings import SettingsConfigDict
14
+
15
+
16
class AppConfig(BaseSettings):
    """Application-level settings.

    Declared as pydantic settings with the ``APP_`` environment prefix and a
    ``.env`` file as sources (see ``model_config``).
    """
    model_config = SettingsConfigDict(
        env_prefix="APP_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore"  # ignore unknown fields
    )

    name: str = "音频转文字服务"
    version: str = "1.0.0"
    debug: bool = False
    host: str = "127.0.0.1"
    port: int = 7860
    max_file_size: int = 2147483648  # 2 GB, in bytes
    max_files_count: int = 100
    concurrent_tasks: int = 5
34
+
35
+
36
class OSSConfig(BaseSettings):
    """OSS (object storage) settings.

    Uses the ``OSS_`` environment prefix. ``endpoint``, ``access_key_id``,
    ``access_key_secret`` and ``bucket_name`` are required fields without
    defaults.
    """
    model_config = SettingsConfigDict(
        env_prefix="OSS_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore"
    )

    endpoint: str = Field(..., description="OSS服务端点")
    access_key_id: str = Field(..., description="访问密钥ID")
    access_key_secret: str = Field(..., description="访问密钥密码")
    bucket_name: str = Field(..., description="存储桶名称")
    upload_timeout: int = 300
    url_expire_hours: int = 24
    temp_prefix: str = "temp/audio"
    auto_cleanup_days: int = 7
54
+
55
+
56
class DashScopeConfig(BaseSettings):
    """Alibaba Cloud Bailian (DashScope) API settings.

    Uses the ``DASHSCOPE_`` environment prefix. ``api_key`` is a required
    field without a default.
    """
    model_config = SettingsConfigDict(
        env_prefix="DASHSCOPE_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore"
    )

    api_key: str = Field(..., description="API密钥")
    base_url: str = "https://dashscope.aliyuncs.com/api/v1"
    model: str = "paraformer-v2"
    timeout: int = 300
    max_retries: int = 3
    retry_delay: int = 5
    language_hints: list[str] = ["zh", "en"]
73
+
74
+
75
class TaskConfig(BaseSettings):
    """Task-processing settings (``TASK_`` environment prefix)."""
    model_config = SettingsConfigDict(
        env_prefix="TASK_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore"
    )

    status_check_interval: int = 2
    max_processing_time: int = 3600  # 1 hour
    queue_size: int = 1000
88
+
89
+
90
class LoggingConfig(BaseSettings):
    """Logging settings (``LOGGING_`` environment prefix)."""
    model_config = SettingsConfigDict(
        env_prefix="LOGGING_",
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore"
    )

    level: str = "INFO"
    format: str = "structured"
    file_max_size: str = "10MB"
    backup_count: int = 5
104
+
105
+
106
class Config:
    """Configuration manager.

    Loads the YAML file for the selected environment and builds the
    per-area settings objects from it.
    """

    def __init__(self, environment: Optional[str] = None):
        """Initialise the configuration manager.

        Args:
            environment: Environment name (development/production). Falls
                back to the ``ENVIRONMENT`` variable, then ``"development"``.

        Raises:
            FileNotFoundError: If the environment's YAML file does not exist.
        """
        self.environment = environment or os.getenv("ENVIRONMENT", "development")
        self._config_data = self._load_config()

        # Build each settings section from its YAML mapping (if any).
        self.app = AppConfig(**self._config_data.get("app", {}))

        # OSS settings are instantiated directly (no YAML values passed) so
        # they come from the environment.
        self.oss = OSSConfig()

        # DashScope settings are instantiated directly for the same reason.
        self.dashscope = DashScopeConfig()

        self.task = TaskConfig(**self._config_data.get("task", {}))
        self.logging = LoggingConfig(**self._config_data.get("logging", {}))

    def _load_config(self) -> Dict[str, Any]:
        """Load the YAML file for the current environment.

        Returns:
            The parsed mapping; an empty dict when the file has no content.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        config_dir = Path(__file__).parent.parent.parent / "config" / "environments"
        config_file = config_dir / f"{self.environment}.yaml"

        if not config_file.exists():
            raise FileNotFoundError(f"配置文件不存在: {config_file}")

        with open(config_file, 'r', encoding='utf-8') as file:
            # safe_load yields None for an empty document; normalise to {} so
            # the .get(...) lookups in __init__ keep working.
            return yaml.safe_load(file) or {}

    def get_project_root(self) -> Path:
        """Return the project root (three levels above this file)."""
        return Path(__file__).parent.parent.parent

    def get_logs_dir(self) -> Path:
        """Return the ``logs`` directory under the project root, creating it if needed."""
        logs_dir = self.get_project_root() / "logs"
        logs_dir.mkdir(exist_ok=True)
        return logs_dir

    def get_temp_dir(self) -> Path:
        """Return the ``temp`` directory under the project root, creating it if needed."""
        temp_dir = self.get_project_root() / "temp"
        temp_dir.mkdir(exist_ok=True)
        return temp_dir
156
+
157
+
158
# Module-level configuration instance, built when this module is imported.
config = Config()
160
+
161
+
162
def get_config() -> Config:
    """Return the current module-level configuration instance."""
    current = config
    return current
165
+
166
+
167
def reload_config(environment: Optional[str] = None) -> Config:
    """Rebuild the module-level configuration and return the new instance.

    Args:
        environment: Environment name passed through to ``Config``.
    """
    global config
    refreshed = Config(environment)
    config = refreshed
    return refreshed
src/core/task_manager.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """任务管理模块
2
+
3
+ 提供任务状态跟踪、进度管理和任务队列功能。
4
+ """
5
+
6
+ import asyncio
7
+ import time
8
+ import uuid
9
+ from dataclasses import dataclass, field
10
+ from datetime import datetime, timedelta
11
+ from enum import Enum
12
+ from pathlib import Path
13
+ from typing import Dict, List, Optional, Callable, Any
14
+ from concurrent.futures import ThreadPoolExecutor
15
+
16
+ from ..core.config import get_config
17
+ from ..utils.logger import get_task_logger
18
+ from ..services.file_validator import get_file_validator
19
+ from ..services.oss_service import get_oss_service
20
+ from ..services.paraformer_service import get_paraformer_service
21
+
22
+
23
class TaskStatus(Enum):
    """Lifecycle states of a task, each carrying its lowercase string value."""

    # Waiting and in-progress stages.
    PENDING = "pending"
    VALIDATING = "validating"
    UPLOADING = "uploading"
    TRANSCRIBING = "transcribing"

    # Terminal states.
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
32
+
33
+
34
class TaskPriority(Enum):
    """Task priority levels; larger numbers are declared for higher levels."""

    LOW = 1
    NORMAL = 2
    HIGH = 3
    URGENT = 4
40
+
41
+
42
@dataclass
class TaskProgress:
    """Progress information for a task.

    ``percentage`` is derived from ``current`` and ``total`` whenever
    ``update`` is called.
    """
    stage: str = ""
    current: int = 0
    total: int = 100
    message: str = ""
    percentage: float = 0.0

    def update(self, current: Optional[int] = None, total: Optional[int] = None,
               message: Optional[str] = None):
        """Update progress fields and recompute ``percentage``.

        Args:
            current: New completed amount; ``None`` leaves it unchanged.
            total: New total amount; ``None`` leaves it unchanged.
            message: New status message; ``None`` leaves it unchanged.
        """
        if current is not None:
            self.current = current
        if total is not None:
            self.total = total
        if message is not None:
            self.message = message

        if self.total > 0:
            # Clamp to 0..100 so out-of-range counters never leak into the UI.
            self.percentage = max(0.0, min(100.0, (self.current / self.total) * 100))
        else:
            # Nothing to measure against: reset rather than keep a stale value.
            self.percentage = 0.0
62
+
63
+
64
@dataclass
class TaskResult:
    """Outcome of a task: success flag, error text, file lists and payload."""
    success: bool = False
    data: Optional[Dict] = None
    error_message: Optional[str] = None
    processed_files: List[str] = field(default_factory=list)
    failed_files: List[str] = field(default_factory=list)
    transcription_results: Optional[Dict] = None
    duration: float = 0.0

    def to_dict(self) -> Dict:
        """Return the result as a plain dict (values are not copied)."""
        exported = (
            'success',
            'data',
            'error_message',
            'processed_files',
            'failed_files',
            'transcription_results',
            'duration',
        )
        return {key: getattr(self, key) for key in exported}
86
+
87
+
88
@dataclass
class Task:
    """A unit of work: input files plus status, progress, result and timing."""
    id: str = field(default_factory=lambda: str(uuid.uuid4())[:8])
    status: TaskStatus = TaskStatus.PENDING
    priority: TaskPriority = TaskPriority.NORMAL
    file_paths: List[Path] = field(default_factory=list)
    progress: TaskProgress = field(default_factory=TaskProgress)
    result: TaskResult = field(default_factory=TaskResult)
    created_at: datetime = field(default_factory=datetime.now)
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    callback: Optional[Callable] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Return a dict view of the task; timestamps become ISO strings or None."""

        def _iso(moment):
            return moment.isoformat() if moment else None

        progress = self.progress
        progress_view = {
            'stage': progress.stage,
            'current': progress.current,
            'total': progress.total,
            'percentage': progress.percentage,
            'message': progress.message
        }
        file_names = [path.name for path in self.file_paths]

        return {
            'id': self.id,
            'status': self.status.value,
            'priority': self.priority.value,
            'file_count': len(self.file_paths),
            'file_names': file_names,
            'progress': progress_view,
            'result': self.result.to_dict(),
            'created_at': _iso(self.created_at),
            'started_at': _iso(self.started_at),
            'completed_at': _iso(self.completed_at),
            'metadata': self.metadata
        }
124
+
125
+
126
class TaskManager:
    """Task manager: stores tasks, queues them and runs the processing pipeline.

    Each task goes through validation, upload and transcription; status
    changes are reported to registered callbacks.
    """

    def __init__(self):
        """Initialise storage, services and (if a loop is running) the processor."""
        self.config = get_config()
        self.logger = get_task_logger(logger_name="transcript_service.task")

        # Task storage and the queue of pending task IDs.
        self.tasks: Dict[str, Task] = {}
        self.task_queue: asyncio.Queue = asyncio.Queue(maxsize=self.config.task.queue_size)

        # Service instances used by the pipeline stages.
        self.file_validator = get_file_validator()
        self.oss_service = get_oss_service()
        self.paraformer_service = get_paraformer_service()

        # Worker thread pool.
        self.executor = ThreadPoolExecutor(max_workers=self.config.app.concurrent_tasks)

        # Status-change callbacks.
        self.status_callbacks: List[Callable] = []

        # Processor state. The asyncio.Task is kept on the instance because
        # the event loop only holds weak references to tasks; without a strong
        # reference the background consumer could be garbage-collected.
        self._processor_started = False
        self._processor_task: Optional[asyncio.Task] = None

        self._start_task_processor()

    def add_status_callback(self, callback: Callable):
        """Register a callback invoked with the task on every status change.

        Args:
            callback: Callable taking the task as its only argument.
        """
        self.status_callbacks.append(callback)

    def _notify_status_change(self, task: Task):
        """Invoke every registered callback; a failing callback is logged, not raised."""
        for callback in self.status_callbacks:
            try:
                callback(task)
            except Exception as e:
                self.logger.error(f"回调函数执行失败: {str(e)}")

    async def create_task(self, file_paths: List[Path], priority: TaskPriority = TaskPriority.NORMAL, metadata = None) -> str:
        """Create a task and put it on the queue.

        Args:
            file_paths: Paths of the files to process.
            priority: Task priority.
            metadata: Optional task metadata.

        Returns:
            The new task's ID.
        """
        # Make sure the queue consumer is running.
        if not self._processor_started:
            self._ensure_processor_started()

        task = Task(
            file_paths=file_paths,
            priority=priority,
            metadata=metadata or {}
        )

        self.tasks[task.id] = task

        await self.task_queue.put(task.id)

        self.logger.info(f"创建任务: {task.id}, 文件数量: {len(file_paths)}")
        return task.id

    def get_task(self, task_id: str) -> Optional[Task]:
        """Return the task with the given ID, or ``None`` if unknown.

        Args:
            task_id: Task ID.
        """
        return self.tasks.get(task_id)

    def get_all_tasks(self) -> List[Task]:
        """Return all known tasks."""
        return list(self.tasks.values())

    def get_tasks_by_status(self, status: TaskStatus) -> List[Task]:
        """Return the tasks currently in the given status."""
        return [task for task in self.tasks.values() if task.status == status]

    async def cancel_task(self, task_id: str) -> bool:
        """Mark a task as cancelled.

        Args:
            task_id: Task ID.

        Returns:
            ``True`` if the task was cancelled; ``False`` if it is unknown or
            already completed, failed or cancelled.
        """
        task = self.get_task(task_id)
        if not task:
            return False

        if task.status in [TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.CANCELLED]:
            return False

        task.status = TaskStatus.CANCELLED
        task.completed_at = datetime.now()
        task.progress.message = "任务已取消"

        self._notify_status_change(task)
        self.logger.info(f"任务已取消: {task_id}")
        return True

    def _start_task_processor(self):
        """Start the queue consumer if an event loop is running; otherwise defer."""
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No running loop yet: the processor is started lazily later.
            self.logger.debug("没有运行的事件循环,任务处理器将在需要时启动")
            self._processor_started = False
        else:
            self._processor_task = asyncio.create_task(self._process_tasks())
            self._processor_started = True

    def _ensure_processor_started(self):
        """Start the queue consumer now if it has not been started yet."""
        if not self._processor_started:
            try:
                asyncio.get_running_loop()
            except RuntimeError:
                self.logger.warning("无法启动任务处理器:没有运行的事件循环")
            else:
                self._processor_task = asyncio.create_task(self._process_tasks())
                self._processor_started = True

    @staticmethod
    def _was_cancelled(task: Task) -> bool:
        """Return whether the task has been marked as cancelled."""
        return task.status == TaskStatus.CANCELLED

    async def _process_tasks(self):
        """Consume task IDs from the queue forever, executing one task at a time."""
        while True:
            try:
                task_id = await self.task_queue.get()
                try:
                    task = self.get_task(task_id)
                    # Unknown or already-cancelled tasks are just acknowledged.
                    if task and not self._was_cancelled(task):
                        await self._execute_task(task)
                finally:
                    # Always acknowledge the item so queue accounting stays
                    # correct even if handling raised.
                    self.task_queue.task_done()

            except Exception as e:
                self.logger.exception(f"处理任务队列时发生错误: {str(e)}")
                await asyncio.sleep(1)

    async def _execute_task(self, task: Task):
        """Run the validate, upload and transcribe pipeline for one task.

        A cancellation observed between stages stops the pipeline and leaves
        the CANCELLED status (set by ``cancel_task``) untouched.

        Args:
            task: The task to execute.
        """
        try:
            # Attach the task ID to log records for the duration of the run.
            self.logger.set_task_id(task.id)

            task.status = TaskStatus.VALIDATING
            task.started_at = datetime.now()
            task.progress.stage = "文件验证"
            task.progress.update(0, 100, "开始验证文件")
            self._notify_status_change(task)

            # 1. Validation
            valid_files, invalid_files = await self._validate_files(task)
            if self._was_cancelled(task):
                return
            if not valid_files:
                task.status = TaskStatus.FAILED
                task.result.error_message = "没有有效的文件"
                task.result.failed_files = [str(f[0]) for f in invalid_files]
                task.completed_at = datetime.now()
                self._notify_status_change(task)
                return

            # 2. Upload
            task.status = TaskStatus.UPLOADING
            task.progress.stage = "文件上传"
            task.progress.update(0, len(valid_files), "开始上传文件到OSS")
            self._notify_status_change(task)

            upload_results = await self._upload_files(task, valid_files)
            if self._was_cancelled(task):
                # Cancelled while uploading: do not start transcription and
                # do not overwrite the CANCELLED status.
                return
            successful_uploads = [r for r in upload_results if r[1]]

            if not successful_uploads:
                task.status = TaskStatus.FAILED
                task.result.error_message = "文件上传失败"
                task.completed_at = datetime.now()
                self._notify_status_change(task)
                return

            # 3. Transcription
            task.status = TaskStatus.TRANSCRIBING
            task.progress.stage = "语音转录"
            task.progress.update(0, 100, "开始语音转录")
            self._notify_status_change(task)

            file_urls = [r[2] for r in successful_uploads]
            success, transcription_result, error = await self._transcribe_audio(task, file_urls)
            if self._was_cancelled(task):
                # Cancelled while transcribing: keep the cancelled state.
                return

            # 4. Finish
            task.completed_at = datetime.now()
            task.result.duration = (task.completed_at - task.started_at).total_seconds()

            if success:
                task.status = TaskStatus.COMPLETED
                task.result.success = True
                task.result.transcription_results = transcription_result
                task.result.processed_files = [r[0] for r in successful_uploads]
                task.progress.update(100, 100, "转录完成")
            else:
                task.status = TaskStatus.FAILED
                task.result.error_message = error

            self._notify_status_change(task)

        except Exception as e:
            task.status = TaskStatus.FAILED
            task.result.error_message = f"任务执行失败: {str(e)}"
            task.completed_at = datetime.now()
            self.logger.exception(f"执行任务时发生错误: {task.id}")
            self._notify_status_change(task)
        finally:
            self.logger.clear_task_id()

    async def _validate_files(self, task: Task) -> tuple:
        """Validate the task's files.

        Returns:
            ``(valid_files, invalid_files)`` as produced by the file validator.
        """
        self.logger.info(f"开始验证 {len(task.file_paths)} 个文件")

        valid_files, invalid_files = self.file_validator.validate_multiple_files(task.file_paths)

        task.progress.update(100, 100, f"验证完成: {len(valid_files)} 个有效文件")
        self.logger.info(f"文件验证完成: {len(valid_files)} 个有效文件, {len(invalid_files)} 个无效文件")

        return valid_files, invalid_files

    async def _upload_files(self, task: Task, file_paths: List[Path]) -> List[tuple]:
        """Upload files one by one, stopping early if the task is cancelled.

        Returns:
            A list of ``(file name, success, url_or_error, object_key)`` tuples.
        """
        self.logger.info(f"开始上传 {len(file_paths)} 个文件")

        results = []
        for i, file_path in enumerate(file_paths):
            if task.status == TaskStatus.CANCELLED:
                break

            success, url_or_error, object_key = await self.oss_service.upload_file(file_path, task.id)
            results.append((file_path.name, success, url_or_error, object_key))

            # Report per-file progress.
            task.progress.update(i + 1, len(file_paths), f"已上传 {i + 1}/{len(file_paths)} 个文件")
            self._notify_status_change(task)

        self.logger.info(f"文件上传完成: {len([r for r in results if r[1]])} 个成功")
        return results

    async def _transcribe_audio(self, task: Task, file_urls: List[str]) -> tuple:
        """Transcribe the uploaded audio files.

        Returns:
            ``(success, results, error)`` from the Paraformer service.
        """
        self.logger.info(f"开始转录 {len(file_urls)} 个音频文件")

        # Use the task's Paraformer parameters when its metadata provides them.
        paraformer_params = None
        if 'paraformer_params' in task.metadata:
            paraformer_params = task.metadata['paraformer_params']
            self.logger.info(f"使用自定义Paraformer参数: {paraformer_params}")

        success, results, error = await self.paraformer_service.batch_process_with_retry(
            file_urls, task.id, paraformer_params
        )

        if success:
            task.progress.update(100, 100, "转录完成")
            self.logger.info(f"转录完成: {len(file_urls)} 个文件")
        else:
            self.logger.error(f"转录失败: {error}")

        return success, results, error

    def cleanup_completed_tasks(self, hours: int = 24):
        """Drop finished tasks whose completion time is older than the cutoff.

        Args:
            hours: Retention time in hours.
        """
        cutoff_time = datetime.now() - timedelta(hours=hours)
        to_remove = []

        for task_id, task in self.tasks.items():
            if (task.status in [TaskStatus.COMPLETED, TaskStatus.FAILED, TaskStatus.CANCELLED] and
                    task.completed_at and task.completed_at < cutoff_time):
                to_remove.append(task_id)

        # Delete after the scan so the dict is not mutated while iterating.
        for task_id in to_remove:
            del self.tasks[task_id]

        self.logger.info(f"清理了 {len(to_remove)} 个过期任务")

    def get_statistics(self) -> Dict:
        """Return task counts per status plus the current queue size."""
        stats = {
            'total_tasks': len(self.tasks),
            'pending': len(self.get_tasks_by_status(TaskStatus.PENDING)),
            'validating': len(self.get_tasks_by_status(TaskStatus.VALIDATING)),
            'uploading': len(self.get_tasks_by_status(TaskStatus.UPLOADING)),
            'transcribing': len(self.get_tasks_by_status(TaskStatus.TRANSCRIBING)),
            'completed': len(self.get_tasks_by_status(TaskStatus.COMPLETED)),
            'failed': len(self.get_tasks_by_status(TaskStatus.FAILED)),
            'cancelled': len(self.get_tasks_by_status(TaskStatus.CANCELLED)),
            'queue_size': self.task_queue.qsize()
        }
        return stats
447
+
448
+
449
# Module-level task manager instance; stays None until get_task_manager()
# creates it on first use.
task_manager = None
451
+
452
+
453
def get_task_manager() -> TaskManager:
    """Return the shared task manager, creating it on first call.

    Returns:
        The module-level ``TaskManager`` instance.
    """
    global task_manager
    if task_manager is not None:
        return task_manager
    task_manager = TaskManager()
    return task_manager
src/services/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Services package.

Contains the application's core business-logic services.
"""

from .file_validator import FileValidator, get_file_validator, file_validator
from .oss_service import OSSService, get_oss_service, oss_service
from .paraformer_service import ParaformerService, get_paraformer_service, paraformer_service

# Public names re-exported by the package.
__all__ = [
    "FileValidator",
    "get_file_validator",
    "file_validator",
    "OSSService",
    "get_oss_service",
    "oss_service",
    "ParaformerService",
    "get_paraformer_service",
    "paraformer_service"
]
src/services/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (585 Bytes). View file
 
src/services/__pycache__/file_validator.cpython-310.pyc ADDED
Binary file (7.32 kB). View file
 
src/services/__pycache__/oss_service.cpython-310.pyc ADDED
Binary file (8.51 kB). View file
 
src/services/__pycache__/paraformer_service.cpython-310.pyc ADDED
Binary file (9.86 kB). View file
 
src/services/file_validator.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """文件验证模块
2
+
3
+ 提供音频文件格式验证、大小检查等功能。
4
+ """
5
+
6
+ import magic
7
+ from pathlib import Path
8
+ from typing import List, Optional, Tuple
9
+ import mimetypes
10
+
11
+ from ..core.config import get_config
12
+ from ..utils.logger import get_task_logger
13
+
14
+
15
class FileValidator:
    """Validate local media files before they are uploaded for transcription.

    Checks run from cheapest to most expensive: existence, size limits,
    extension whitelist, MIME sniffing and a basic readability probe.

    The content checks are deliberately permissive: once the extension is
    whitelisted, an unrecognised MIME type or file header does NOT reject
    the file. Only an unreadable or empty file fails the content stage.
    """

    # Accepted file extensions (lower-case, with leading dot).
    SUPPORTED_EXTENSIONS = {
        '.aac', '.amr', '.avi', '.flac', '.flv', '.m4a', '.mkv',
        '.mov', '.mp3', '.mp4', '.mpeg', '.ogg', '.opus', '.wav',
        '.webm', '.wma', '.wmv'
    }

    # Accepted MIME types.
    SUPPORTED_MIME_TYPES = {
        'audio/aac', 'audio/amr', 'audio/flac', 'audio/mp3', 'audio/mpeg',
        'audio/mp4', 'audio/ogg', 'audio/opus', 'audio/wav', 'audio/webm',
        'audio/x-wav', 'audio/x-flac', 'audio/x-m4a',
        'video/mp4', 'video/avi', 'video/x-flv', 'video/quicktime',
        'video/x-msvideo', 'video/webm', 'video/x-ms-wmv'
    }

    # Leading byte signatures recognised by _check_file_header
    # (ID3-tagged MP3, FLAC, Ogg, raw MPEG audio frames).
    _PREFIX_SIGNATURES = (b'ID3', b'fLaC', b'OggS', b'\xff\xfb', b'\xff\xfa')

    def __init__(self):
        """Load the configuration, create a logger and try to set up libmagic.

        If libmagic cannot be initialised, ``self.magic`` is set to ``None``
        and MIME detection falls back to extension-based guessing.
        """
        self.config = get_config()
        self.logger = get_task_logger(logger_name="transcript_service.validator")

        try:
            self.magic = magic.Magic(mime=True)
        except Exception as e:
            self.logger.warning(f"无法初始化libmagic: {str(e)}, 将使用基础验证")
            self.magic = None

    def validate_file(self, file_path: Path) -> Tuple[bool, Optional[str]]:
        """Validate a single file.

        Args:
            file_path: Path of the file to check (a plain string is accepted
                and converted to ``Path``).

        Returns:
            ``(True, None)`` when every check passes, otherwise
            ``(False, message)`` describing the first failed check.
        """
        # Accept str as well as Path so callers holding raw path strings
        # do not crash on the first attribute access.
        file_path = Path(file_path)

        try:
            if not file_path.exists():
                return False, f"文件不存在: {file_path}"

            if not file_path.is_file():
                return False, f"不是有效的文件: {file_path}"

            file_size = file_path.stat().st_size
            if file_size == 0:
                return False, f"文件为空: {file_path.name}"

            max_size = self.config.app.max_file_size
            if file_size > max_size:
                size_mb = file_size / (1024 * 1024)
                max_size_mb = max_size / (1024 * 1024)
                return False, f"文件大小 {size_mb:.1f}MB 超过限制 {max_size_mb:.1f}MB: {file_path.name}"

            file_ext = file_path.suffix.lower()
            if file_ext not in self.SUPPORTED_EXTENSIONS:
                return False, f"不支持的文件格式 {file_ext}: {file_path.name}"

            if not self._check_mime_type(file_path):
                return False, f"文件内容与扩展名不匹配: {file_path.name}"

            if not self._check_file_integrity(file_path):
                return False, f"文件可能损坏或不完整: {file_path.name}"

            self.logger.info(f"文件验证通过: {file_path.name}")
            return True, None

        except Exception as e:
            # Boundary handler: validation must report failure, not raise.
            error_msg = f"验证文件时发生错误: {file_path.name}, 错误: {str(e)}"
            self.logger.exception(error_msg)
            return False, error_msg

    def validate_multiple_files(self, file_paths: List[Path]) -> Tuple[List[Path], List[Tuple[Path, str]]]:
        """Validate a batch of files.

        Only the first ``config.app.max_files_count`` entries are examined;
        the remainder is skipped (logged, but reported in neither list).

        Args:
            file_paths: Paths of the files to check.

        Returns:
            ``(valid_files, invalid_files)`` where ``invalid_files`` holds
            ``(path, error_message)`` tuples.
        """
        limit = self.config.app.max_files_count
        total = len(file_paths)
        over_limit = total > limit

        if over_limit:
            self.logger.warning(f"文件数量 {total} 超过限制 {limit}")

        valid_files = []
        invalid_files = []

        for file_path in file_paths[:limit]:
            is_valid, error_msg = self.validate_file(file_path)
            if is_valid:
                valid_files.append(file_path)
            else:
                invalid_files.append((file_path, error_msg))

        if over_limit:
            skipped_count = total - limit
            self.logger.warning(f"跳过了 {skipped_count} 个文件(超过批处理限制)")

        self.logger.info(f"文件验证完成: {len(valid_files)} 个有效文件, {len(invalid_files)} 个无效文件")
        return valid_files, invalid_files

    def _check_mime_type(self, file_path: Path) -> bool:
        """Check the file's MIME type against the whitelist.

        Tries libmagic first (when available), then extension-based
        guessing via ``mimetypes``, then the header check.

        Args:
            file_path: Path of the file to check.

        Returns:
            ``True`` if any stage accepts the file. Errors during detection
            are logged and treated as a pass, because the extension has
            already been validated by the caller.
        """
        try:
            if self.magic:
                mime_type = self.magic.from_file(str(file_path))
                if mime_type in self.SUPPORTED_MIME_TYPES:
                    return True

            mime_type, _ = mimetypes.guess_type(str(file_path))
            if mime_type and mime_type in self.SUPPORTED_MIME_TYPES:
                return True

            return self._check_file_header(file_path)

        except Exception as e:
            self.logger.warning(f"检查MIME类型时发生错误: {file_path.name}, 错误: {str(e)}")
            return True

    def _check_file_header(self, file_path: Path) -> bool:
        """Inspect the first bytes of the file for a known media signature.

        Args:
            file_path: Path of the file to check.

        Returns:
            ``False`` only when the file yields no bytes at all. A header
            that matches no known signature still passes (best-effort
            policy: the extension was already accepted), as does a file
            that cannot be opened here.
        """
        try:
            with open(file_path, 'rb') as f:
                header = f.read(16)
        except Exception as e:
            self.logger.warning(f"检查文件头时发生错误: {file_path.name}, 错误: {str(e)}")
            return True

        if not header:
            return False

        recognised = (
            header.startswith(self._PREFIX_SIGNATURES)
            or header[4:8] == b'ftyp'  # ISO base media (MP4/M4A/MOV)
            or (header.startswith(b'RIFF') and b'WAVE' in header)  # WAV
        )
        if not recognised:
            # Informational only; the outcome is the same either way.
            self.logger.debug("unrecognised file header for %s", file_path.name)

        return True

    def _check_file_integrity(self, file_path: Path) -> bool:
        """Probe that the beginning and the end of the file are readable.

        This is a basic readability check, not a format-level validation.

        Args:
            file_path: Path of the file to check.

        Returns:
            ``True`` if the first and last KiB could be read, else ``False``.
        """
        try:
            with open(file_path, 'rb') as f:
                f.read(1024)
                # whence=2 seeks relative to the end; the returned position
                # is the file size, so no second stat() call is needed.
                size = f.seek(0, 2)
                f.seek(max(size - 1024, 0))
                f.read()

            return True

        except Exception as e:
            self.logger.warning(f"检查文件完整性时发生错误: {file_path.name}, 错误: {str(e)}")
            return False

    def get_file_info(self, file_path: Path) -> dict:
        """Collect descriptive metadata about a file.

        Args:
            file_path: Path of the file to describe.

        Returns:
            A dict with ``name``, ``size``, ``size_mb``, ``extension``,
            ``mime_type``, ``modified_time`` and ``is_supported``; or a dict
            with ``name`` and ``error`` if the file could not be inspected.
        """
        try:
            stat = file_path.stat()

            mime_type = None
            if self.magic:
                try:
                    mime_type = self.magic.from_file(str(file_path))
                except Exception:
                    # libmagic failure is non-fatal; fall back to guessing.
                    # (Not a bare except: Ctrl-C / SystemExit must propagate.)
                    mime_type = None

            if not mime_type:
                mime_type, _ = mimetypes.guess_type(str(file_path))

            extension = file_path.suffix.lower()
            return {
                'name': file_path.name,
                'size': stat.st_size,
                'size_mb': round(stat.st_size / (1024 * 1024), 2),
                'extension': extension,
                'mime_type': mime_type,
                'modified_time': stat.st_mtime,
                'is_supported': extension in self.SUPPORTED_EXTENSIONS
            }

        except Exception as e:
            self.logger.error(f"获取文件信息失败: {file_path.name}, 错误: {str(e)}")
            return {
                'name': file_path.name,
                'error': str(e)
            }

    def get_supported_formats(self) -> dict:
        """Describe the accepted formats and the configured limits.

        Returns:
            A dict with sorted ``extensions`` and ``mime_types`` lists plus
            ``max_file_size_mb`` and ``max_files_count`` from the config.
        """
        return {
            'extensions': sorted(self.SUPPORTED_EXTENSIONS),
            'mime_types': sorted(self.SUPPORTED_MIME_TYPES),
            'max_file_size_mb': self.config.app.max_file_size / (1024 * 1024),
            'max_files_count': self.config.app.max_files_count
        }
265
+
266
+
267
# Module-level validator instance, created once at import time and shared
# by every caller of get_file_validator().
file_validator = FileValidator()


def get_file_validator() -> FileValidator:
    """Return the shared module-level ``FileValidator`` instance.

    Returns:
        The validator created when this module was imported.
    """
    return file_validator
src/services/oss_service.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OSS云存储服务模块
2
+
3
+ 提供阿里云OSS文件上传、下载和管理功能。
4
+ """
5
+
6
+ import os
7
+ import uuid
8
+ from datetime import datetime, timedelta
9
+ from pathlib import Path
10
+ from typing import List, Optional, Tuple
11
+ import asyncio
12
+ import aiohttp
13
+ import oss2
14
+ from oss2.exceptions import OssError
15
+
16
+ from ..core.config import get_config
17
+ from ..utils.logger import get_task_logger
18
+
19
+
20
class OSSService:
    """Wrapper around an Aliyun OSS bucket used for temporary media uploads."""

    def __init__(self):
        """Read the OSS configuration and create the bucket client."""
        self.config = get_config()
        self.oss_config = self.config.oss

        auth = oss2.Auth(
            self.oss_config.access_key_id,
            self.oss_config.access_key_secret
        )
        self.bucket = oss2.Bucket(
            auth,
            self.oss_config.endpoint,
            self.oss_config.bucket_name
        )

        self.logger = get_task_logger(logger_name="transcript_service.oss")

    def _generate_object_key(self, filename: str, task_id: str) -> str:
        """Build a unique object key for an upload.

        Args:
            filename: Original file name; only its extension is reused.
            task_id: Identifier of the owning task.

        Returns:
            ``<temp_prefix>/<YYYY/MM/DD>/<timestamp>_<task_id>_<8 hex><ext>``.
        """
        now = datetime.now()
        date_path = now.strftime("%Y/%m/%d")
        timestamp = now.strftime("%Y%m%d_%H%M%S")

        file_ext = Path(filename).suffix
        safe_filename = f"{timestamp}_{task_id}_{uuid.uuid4().hex[:8]}{file_ext}"

        return f"{self.oss_config.temp_prefix}/{date_path}/{safe_filename}"

    def _upload_file_sync(self, file_path: Path, task_id: str) -> Tuple[bool, str, Optional[str]]:
        """Blocking implementation behind :meth:`upload_file`.

        Uploads the file, then tries to make the object public-read. If the
        ACL call fails (for example because the bucket forbids public
        objects), the already-uploaded object is exposed through a
        time-limited signed URL instead.
        """
        try:
            self.logger.info(f"开始上传文件到OSS: {file_path.name}")

            object_key = self._generate_object_key(file_path.name, task_id)
            self.bucket.put_object_from_file(object_key, str(file_path))

            try:
                # NOTE(review): this makes the uploaded media world-readable.
                self.bucket.put_object_acl(object_key, oss2.OBJECT_ACL_PUBLIC_READ)
                url = self._generate_public_url(object_key)
            except OssError as oss_err:
                # The upload itself succeeded, so any ACL failure is
                # recoverable; do not depend on the error text.
                self.logger.warning(f"ACL设置失败,使用签名URL: {oss_err}")
                url = self._generate_signed_url(object_key)

            self.logger.info(f"文件上传成功: {object_key}, URL: {url}")
            return True, url, object_key

        except OssError as e:
            error_msg = f"OSS错误: {str(e)}"
            self.logger.error(error_msg)
            return False, error_msg, None
        except Exception as e:
            error_msg = f"上传文件时发生未知错误: {str(e)}"
            self.logger.exception(error_msg)
            return False, error_msg, None

    async def upload_file(self, file_path: Path, task_id: str) -> Tuple[bool, str, Optional[str]]:
        """Upload one file to OSS without blocking the event loop.

        Args:
            file_path: Local file to upload.
            task_id: Identifier of the owning task.

        Returns:
            ``(success, url_or_error_message, object_key_or_None)``.
            This coroutine does not raise; failures are reported in the tuple.
        """
        # The oss2 client is synchronous, so run it in a worker thread.
        return await asyncio.to_thread(self._upload_file_sync, file_path, task_id)

    async def upload_multiple_files(self, file_paths: List[Path], task_id: str) -> List[Tuple[str, bool, str, Optional[str]]]:
        """Upload several files concurrently.

        Args:
            file_paths: Local files to upload.
            task_id: Identifier of the owning task.

        Returns:
            One ``(file_name, success, url_or_error_message, object_key)``
            tuple per input, in input order.
        """
        outcomes = await asyncio.gather(
            *(self.upload_file(file_path, task_id) for file_path in file_paths)
        )
        return [
            (file_path.name, *outcome)
            for file_path, outcome in zip(file_paths, outcomes)
        ]

    async def _upload_single_file_async(self, file_path: Path, task_id: str) -> Tuple[bool, str, Optional[str]]:
        """Upload a single file; kept as an alias of :meth:`upload_file`."""
        return await self.upload_file(file_path, task_id)

    def _generate_public_url(self, object_key: str) -> str:
        """Build the unsigned public URL of an object.

        Args:
            object_key: Key of the object inside the bucket.

        Returns:
            ``https://<bucket>.<endpoint host>/<object_key>``.
        """
        # The configured endpoint may carry a scheme; the host part is needed.
        endpoint = self.oss_config.endpoint
        if endpoint.startswith('http://'):
            endpoint = endpoint[7:]
        elif endpoint.startswith('https://'):
            endpoint = endpoint[8:]

        url = f"https://{self.oss_config.bucket_name}.{endpoint}/{object_key}"
        self.logger.debug(f"生成公网URL: {url}")
        return url

    def _generate_signed_url(self, object_key: str) -> str:
        """Build a time-limited signed GET URL (fallback for private objects).

        Args:
            object_key: Key of the object inside the bucket.

        Returns:
            A signed URL valid for ``oss_config.url_expire_hours`` hours.
        """
        # Bucket.sign_url expects the lifetime in seconds from now, not an
        # absolute timestamp.
        expires_in = int(timedelta(hours=self.oss_config.url_expire_hours).total_seconds())
        return self.bucket.sign_url('GET', object_key, expires_in)

    def delete_file(self, object_key: str) -> bool:
        """Delete one object.

        Args:
            object_key: Key of the object to delete.

        Returns:
            ``True`` on success, ``False`` if the deletion failed.
        """
        try:
            self.bucket.delete_object(object_key)
            self.logger.info(f"文件删除成功: {object_key}")
            return True
        except OssError as e:
            self.logger.error(f"删除文件失败: {object_key}, 错误: {str(e)}")
            return False
        except Exception as e:
            self.logger.exception(f"删除文件时发生未知错误: {object_key}, 错误: {str(e)}")
            return False

    @staticmethod
    def _to_timestamp(last_modified) -> float:
        """Normalise an object's modification time to epoch seconds.

        The oss2 object iterator reports ``last_modified`` as an integer
        Unix timestamp; ``datetime`` values are accepted defensively too.
        """
        if isinstance(last_modified, datetime):
            return last_modified.timestamp()
        return float(last_modified)

    def cleanup_old_files(self, days: Optional[int] = None) -> int:
        """Delete temporary objects older than the retention period.

        Args:
            days: Retention in days. ``None`` (or ``0``) falls back to
                ``oss_config.auto_cleanup_days``.

        Returns:
            Number of objects deleted (partial count if listing failed).
        """
        cleanup_days = days or self.oss_config.auto_cleanup_days
        cutoff_ts = (datetime.now() - timedelta(days=cleanup_days)).timestamp()

        deleted_count = 0
        prefix = self.oss_config.temp_prefix

        try:
            for obj in oss2.ObjectIterator(self.bucket, prefix=prefix):
                is_expired = self._to_timestamp(obj.last_modified) < cutoff_ts
                if is_expired and self.delete_file(obj.key):
                    deleted_count += 1

            self.logger.info(f"清理完成,删除了 {deleted_count} 个过期文件")
            return deleted_count

        except Exception as e:
            self.logger.exception(f"清理过期文件时发生错误: {str(e)}")
            return deleted_count

    def get_file_info(self, object_key: str) -> Optional[dict]:
        """Fetch metadata of one object.

        Args:
            object_key: Key of the object.

        Returns:
            A dict with ``size``, ``last_modified``, ``etag`` and
            ``content_type``, or ``None`` if the HEAD request failed.
        """
        try:
            info = self.bucket.head_object(object_key)
            return {
                'size': info.content_length,
                'last_modified': info.last_modified,
                'etag': info.etag,
                'content_type': info.content_type
            }
        except OssError as e:
            self.logger.error(f"获取文件信息失败: {object_key}, 错误: {str(e)}")
            return None

    def check_bucket_exists(self) -> bool:
        """Report whether the configured bucket exists.

        Returns:
            The result of ``bucket.bucket_exists()``, or ``False`` if the
            check itself failed.
        """
        try:
            return self.bucket.bucket_exists()
        except Exception as e:
            self.logger.error(f"检查存储桶失败: {str(e)}")
            return False

    def get_bucket_info(self) -> Optional[dict]:
        """Fetch basic information about the configured bucket.

        Returns:
            A dict with ``name``, ``location``, ``creation_date`` and
            ``storage_class``, or ``None`` if the request failed.
        """
        try:
            info = self.bucket.get_bucket_info()
            return {
                'name': info.name,
                'location': info.location,
                'creation_date': info.creation_date,
                'storage_class': info.storage_class
            }
        except Exception as e:
            self.logger.error(f"获取存储桶信息失败: {str(e)}")
            return None
281
+
282
+
283
# Module-level OSS service instance, created once at import time and shared
# by every caller of get_oss_service().
oss_service = OSSService()


def get_oss_service() -> OSSService:
    """Return the shared module-level ``OSSService`` instance.

    Returns:
        The service created when this module was imported.
    """
    return oss_service
src/services/paraformer_service.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Paraformer转录服务模块
2
+
3
+ 提供阿里云百炼平台Paraformer-v2模型的语音转录功能。
4
+ """
5
+
6
+ import asyncio
7
+ import json
8
+ import time
9
+ from typing import Dict, List, Optional, Tuple
10
+ from enum import Enum
11
+ import httpx
12
+ from dashscope import audio
13
+
14
+ from ..core.config import get_config
15
+ from ..utils.logger import get_task_logger
16
+
17
+
18
class TaskStatus(Enum):
    """Lifecycle states of a transcription task."""
    PENDING = "PENDING"
    RUNNING = "RUNNING"
    SUCCEEDED = "SUCCEEDED"
    FAILED = "FAILED"
    CANCELLED = "CANCELLED"

    @classmethod
    def _missing_(cls, value):
        """Accept the American spelling ``CANCELED`` as ``CANCELLED``.

        NOTE(review): DashScope appears to report cancelled tasks as
        "CANCELED" - confirm against the API documentation.
        """
        if value == "CANCELED":
            return cls.CANCELLED
        return None


class ParaformerService:
    """Client for Paraformer speech transcription via the DashScope SDK."""

    # Optional request parameters, in the order they are added to the
    # request. The flag says whether the value must be truthy to be
    # forwarded; boolean switches are forwarded whenever present so that an
    # explicit False is honoured.
    _OPTIONAL_PARAMS = (
        ('channel_id', True),
        ('disfluency_removal_enabled', False),
        ('timestamp_alignment_enabled', False),
        ('diarization_enabled', False),
        ('speaker_count', True),
        ('vocabulary_id', True),
        ('phrase_id', True),
        ('special_word_filter', True),
    )

    def __init__(self):
        """Read the DashScope configuration and create a logger."""
        self.config = get_config()
        self.api_config = self.config.dashscope
        self.logger = get_task_logger(logger_name="transcript_service.api")

        # Kept for backward compatibility; the key is also passed explicitly
        # on every SDK call (see _auth_kwargs).
        audio.api_key = self.api_config.api_key

    def _auth_kwargs(self) -> Dict:
        """Return the ``api_key`` keyword for SDK calls, if one is configured.

        Kept separate from the request parameters so that the key is never
        written to the log.
        """
        api_key = getattr(self.api_config, 'api_key', None)
        return {'api_key': api_key} if api_key else {}

    def _effective_language_hints(self, paraformer_params: Optional[Dict]):
        """Return the caller's language hints, or the configured default."""
        if paraformer_params and paraformer_params.get('language_hints'):
            return paraformer_params['language_hints']
        return self.api_config.language_hints

    def _build_transcription_params(self, file_urls: List[str], paraformer_params: Optional[Dict]) -> Dict:
        """Assemble the keyword arguments for ``Transcription.async_call``."""
        transcription_params = {
            'model': self.api_config.model,
            'file_urls': file_urls,
            'language_hints': self._effective_language_hints(paraformer_params),
        }

        if paraformer_params:
            for key, must_be_truthy in self._OPTIONAL_PARAMS:
                if key not in paraformer_params:
                    continue
                value = paraformer_params[key]
                if must_be_truthy and not value:
                    continue
                transcription_params[key] = value

        return transcription_params

    async def submit_transcription_task(
        self,
        file_urls: List[str],
        task_id: str,
        paraformer_params: Optional[Dict] = None
    ) -> Tuple[bool, str, Optional[str]]:
        """Submit an asynchronous transcription task.

        Args:
            file_urls: URLs of the audio files to transcribe.
            task_id: Local task identifier (not sent to the API).
            paraformer_params: Optional extra request parameters.

        Returns:
            ``(success, message, api_task_id_or_None)``. Does not raise.
        """
        try:
            self.logger.info(f"提交转录任务: {len(file_urls)} 个文件")

            transcription_params = self._build_transcription_params(file_urls, paraformer_params)
            self.logger.info(f"转录参数: {transcription_params}")

            # The SDK call is synchronous; keep it off the event loop.
            response = await asyncio.to_thread(
                audio.asr.Transcription.async_call,
                **transcription_params,
                **self._auth_kwargs()
            )

            if response.status_code == 200:
                api_task_id = response.output.task_id
                self.logger.info(f"任务提交成功, API任务ID: {api_task_id}")
                return True, "任务提交成功", api_task_id

            error_msg = f"API调用失败, 状态码: {response.status_code}, 错误: {response.message}"
            self.logger.error(error_msg)
            return False, error_msg, None

        except Exception as e:
            error_msg = f"提交转录任务时发生错误: {str(e)}"
            self.logger.exception(error_msg)
            return False, error_msg, None

    async def check_task_status(
        self,
        api_task_id: str,
        original_urls: Optional[List[str]] = None,
        language_hints: Optional[List[str]] = None
    ) -> Tuple[TaskStatus, Optional[dict], Optional[str]]:
        """Query the state of a submitted task.

        Args:
            api_task_id: Task identifier returned by the API.
            original_urls: URLs originally submitted, used to label the
                results by position. When omitted, the URLs reported by the
                API are used.
            language_hints: Language hints in effect for the request, used
                to label the result language.

        Returns:
            ``(status, parsed_results_or_None, error_message_or_None)``.
            Any failure to query is reported as ``TaskStatus.FAILED``.
        """
        try:
            response = await asyncio.to_thread(
                audio.asr.Transcription.fetch,
                task=api_task_id,
                **self._auth_kwargs()
            )

            if response.status_code != 200:
                error_msg = f"检查任务状态失败: {response.message}"
                self.logger.error(error_msg)
                return TaskStatus.FAILED, None, error_msg

            task_status = TaskStatus(response.output.task_status)

            if task_status == TaskStatus.SUCCEEDED:
                results = await self._parse_transcription_results(
                    response.output.results,
                    original_urls=original_urls,
                    language_hints=language_hints
                )
                return task_status, results, None

            if task_status == TaskStatus.FAILED:
                error_msg = getattr(response.output, 'message', '转录失败')
                return task_status, None, error_msg

            # Still pending or running (or cancelled).
            return task_status, None, None

        except Exception as e:
            error_msg = f"检查任务状态时发生错误: {str(e)}"
            self.logger.exception(error_msg)
            return TaskStatus.FAILED, None, error_msg

    async def process_audio_files(
        self,
        file_urls: List[str],
        task_id: str,
        paraformer_params: Optional[Dict] = None
    ) -> Tuple[bool, Optional[dict], Optional[str]]:
        """Submit a task and poll it until it finishes or times out.

        Args:
            file_urls: URLs of the audio files to transcribe.
            task_id: Local task identifier.
            paraformer_params: Optional extra request parameters.

        Returns:
            ``(success, parsed_results_or_None, error_message_or_None)``.
        """
        try:
            # Per-call copies: this service is shared between tasks, so
            # request state must not be stored on the instance.
            original_urls = list(file_urls)
            language_hints = self._effective_language_hints(paraformer_params)

            success, message, api_task_id = await self.submit_transcription_task(
                file_urls, task_id, paraformer_params
            )
            if not success:
                return False, None, message

            self.logger.info(f"开始监控任务状态: {api_task_id}")

            max_wait_time = self.api_config.timeout
            check_interval = self.config.task.status_check_interval
            # Monotonic clock: immune to wall-clock adjustments.
            deadline = time.monotonic() + max_wait_time

            while time.monotonic() < deadline:
                status, results, error = await self.check_task_status(
                    api_task_id,
                    original_urls=original_urls,
                    language_hints=language_hints
                )

                if status == TaskStatus.SUCCEEDED:
                    self.logger.info(f"转录完成: {api_task_id}")
                    return True, results, None
                if status == TaskStatus.FAILED:
                    self.logger.error(f"转录失败: {api_task_id}, 错误: {error}")
                    return False, None, error
                if status in (TaskStatus.PENDING, TaskStatus.RUNNING):
                    self.logger.debug(f"任务进行中: {api_task_id}, 状态: {status.value}")
                    await asyncio.sleep(check_interval)
                    continue

                error_msg = f"未知任务状态: {status}"
                self.logger.error(error_msg)
                return False, None, error_msg

            error_msg = f"任务超时: {api_task_id} (等待时间: {max_wait_time}秒)"
            self.logger.error(error_msg)
            return False, None, error_msg

        except Exception as e:
            error_msg = f"处理音频文件时发生错误: {str(e)}"
            self.logger.exception(error_msg)
            return False, None, error_msg

    async def _download_transcription(self, url: str) -> Optional[dict]:
        """Download and decode one transcription result document.

        Returns:
            The decoded JSON document, or ``None`` on a non-200 response.
        """
        async with httpx.AsyncClient() as client:
            response = await client.get(url)

        if response.status_code == 200:
            return response.json()

        self.logger.warning(f"下载转录结果失败,状态码: {response.status_code}")
        self.logger.warning(f"响应内容: {response.text}")
        return None

    @staticmethod
    def _extract_transcript_fields(transcription_data: dict, language_hints: Optional[List[str]] = None):
        """Pull the fields of interest out of a result document.

        Only the first entry of ``transcripts`` is used.

        Returns:
            ``(text, duration_seconds, language, confidence)``. The language
            is the single language hint when exactly one was in effect,
            otherwise ``'unknown'``: the fields read here carry no detected
            language. Confidence is the mean over the sentences that report
            one, or ``0``.
        """
        properties = transcription_data.get('properties', {})
        duration = properties.get('original_duration_in_milliseconds', 0) / 1000.0

        text = ''
        sentences = []
        transcripts = transcription_data.get('transcripts', [])
        if transcripts:
            first_transcript = transcripts[0]
            text = first_transcript.get('text', '')
            sentences = first_transcript.get('sentences', [])

        confidence = 0
        scores = [s.get('confidence', 0) for s in sentences if 'confidence' in s]
        if scores:
            confidence = sum(scores) / len(scores)

        language = 'unknown'
        if language_hints and len(language_hints) == 1:
            language = language_hints[0]

        return text, duration, language, confidence

    async def _parse_transcription_results(
        self,
        results: List,
        original_urls: Optional[List[str]] = None,
        language_hints: Optional[List[str]] = None
    ) -> dict:
        """Turn the API result list into the service's result structure.

        Args:
            results: Per-file result dicts returned by the API.
            original_urls: URLs originally submitted. Results are matched
                to them by position (assumes the API preserves request
                order - TODO confirm).
            language_hints: Language hints in effect for the request.

        Returns:
            ``{'transcriptions': [...], 'summary': {...}}``. Entries that
            could not be parsed carry ``error`` and ``raw_result`` instead
            of transcript fields.
        """
        results = results or []
        transcriptions = []
        total_duration = 0
        total_text_length = 0
        languages = set()

        for i, result in enumerate(results):
            has_original = original_urls is not None and i < len(original_urls)
            try:
                if has_original:
                    original_url = original_urls[i]
                    self.logger.info(f"使用原始URL[{i}]: {original_url}")
                else:
                    original_url = result.get('file_url', '')
                    self.logger.warning(f"未找到原始URL[{i}],使用API返回的URL: {original_url}")

                text, duration, language, confidence = '', 0, 'unknown', 0

                if result.get('subtask_status') == 'SUCCEEDED' and result.get('transcription_url'):
                    try:
                        data = await self._download_transcription(result['transcription_url'])
                        if data is not None:
                            text, duration, language, confidence = self._extract_transcript_fields(
                                data, language_hints
                            )
                    except Exception as e:
                        # Best effort: keep the entry with empty fields.
                        self.logger.warning(f"下载转录结果时发生错误: {str(e)}")

                transcription = {
                    'file_url': original_url,
                    'text': text,
                    'duration': duration,
                    'language': language,
                    'confidence': confidence,
                    # Sentence-level segments are not populated.
                    'segments': []
                }

                # Keep the API-reported URL for debugging when it differs.
                api_file_url = result.get('file_url', '')
                if api_file_url and api_file_url != original_url:
                    transcription['api_file_url'] = api_file_url

                transcriptions.append(transcription)
                total_duration += duration
                total_text_length += len(text)
                languages.add(language)

            except Exception as e:
                self.logger.warning(f"解析单个转录结果时发生错误: {str(e)}")
                transcriptions.append({
                    'file_url': original_urls[i] if has_original else '',
                    'error': str(e),
                    'raw_result': result
                })

        return {
            'transcriptions': transcriptions,
            'summary': {
                'total_files': len(results),
                'total_duration': total_duration,
                'total_text_length': total_text_length,
                'languages_detected': list(languages)
            }
        }

    async def batch_process_with_retry(
        self,
        file_urls: List[str],
        task_id: str,
        paraformer_params: Optional[Dict] = None
    ) -> Tuple[bool, Optional[dict], Optional[str]]:
        """Run :meth:`process_audio_files`, retrying on failure.

        Up to ``api_config.max_retries`` retries are made; the wait before
        retry *k* is ``api_config.retry_delay * k`` seconds.

        Args:
            file_urls: URLs of the audio files to transcribe.
            task_id: Local task identifier.
            paraformer_params: Optional extra request parameters.

        Returns:
            ``(success, parsed_results_or_None, error_message_or_None)``.
        """
        max_retries = self.api_config.max_retries
        retry_delay = self.api_config.retry_delay

        for attempt in range(max_retries + 1):
            try:
                success, results, error = await self.process_audio_files(
                    file_urls, task_id, paraformer_params
                )
                if success:
                    return True, results, None
            except Exception as e:
                error = f"重试过程中发生错误: {str(e)}"
                self.logger.exception(error)

            if attempt == max_retries:
                return False, None, error

            # Linearly increasing back-off; log the delay actually used.
            delay = retry_delay * (attempt + 1)
            self.logger.warning(f"第 {attempt + 1} 次尝试失败,{delay} 秒后重试: {error}")
            await asyncio.sleep(delay)

        return False, None, "重试次数已达上限"

    def get_service_info(self) -> dict:
        """Return the service's effective configuration (without the API key).

        Returns:
            A dict with ``model``, ``base_url``, ``timeout``, ``max_retries``,
            ``retry_delay``, ``language_hints`` and ``status_check_interval``.
        """
        return {
            'model': self.api_config.model,
            'base_url': self.api_config.base_url,
            'timeout': self.api_config.timeout,
            'max_retries': self.api_config.max_retries,
            'retry_delay': self.api_config.retry_delay,
            'language_hints': self.api_config.language_hints,
            'status_check_interval': self.config.task.status_check_interval
        }
395
+
396
+
397
# Module-level Paraformer service instance, created once at import time and
# shared by every caller of get_paraformer_service().
paraformer_service = ParaformerService()


def get_paraformer_service() -> ParaformerService:
    """Return the shared module-level ``ParaformerService`` instance.

    Returns:
        The service created when this module was imported.
    """
    return paraformer_service
src/utils/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """工具模块
2
+
3
+ 包含应用程序的工具函数和辅助类。
4
+ """
5
+
6
+ from .logger import Logger, TaskLogger, get_logger, get_task_logger, logger
7
+ from .error_handler import (
8
+ ErrorCode, TranscriptServiceError, FileValidationError, NetworkError,
9
+ APIError, OSSError, SystemError, RetryStrategy, ErrorHandler,
10
+ retry_async, retry_sync, safe_execute, safe_execute_async, get_error_handler, error_handler
11
+ )
12
+
13
+ __all__ = [
14
+ "Logger",
15
+ "TaskLogger",
16
+ "get_logger",
17
+ "get_task_logger",
18
+ "logger",
19
+ "ErrorCode",
20
+ "TranscriptServiceError",
21
+ "FileValidationError",
22
+ "NetworkError",
23
+ "APIError",
24
+ "OSSError",
25
+ "SystemError",
26
+ "RetryStrategy",
27
+ "ErrorHandler",
28
+ "retry_async",
29
+ "retry_sync",
30
+ "safe_execute",
31
+ "safe_execute_async",
32
+ "get_error_handler",
33
+ "error_handler"
34
+ ]
src/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (850 Bytes). View file
 
src/utils/__pycache__/error_handler.cpython-310.pyc ADDED
Binary file (10.2 kB). View file
 
src/utils/__pycache__/logger.cpython-310.pyc ADDED
Binary file (7.74 kB). View file
 
src/utils/error_handler.py ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """错误处理和容错机制模块
2
+
3
+ 提供统一的错误处理、重试逻辑和异常恢复功能。
4
+ """
5
+
6
+ import asyncio
7
+ import functools
8
+ import time
9
+ from typing import Any, Callable, Dict, Optional, Type, Union
10
+ from enum import Enum
11
+
12
+ from ..core.config import get_config
13
+ from ..utils.logger import get_task_logger
14
+
15
+
16
class ErrorCode(Enum):
    """Stable string codes identifying each category of failure."""

    # File-related errors
    FILE_NOT_FOUND = "FILE_001"
    FILE_TOO_LARGE = "FILE_002"
    FILE_FORMAT_UNSUPPORTED = "FILE_003"
    FILE_CORRUPTED = "FILE_004"

    # Network-related errors
    NETWORK_TIMEOUT = "NET_001"
    NETWORK_CONNECTION_ERROR = "NET_002"
    NETWORK_DNS_ERROR = "NET_003"

    # API-related errors
    API_KEY_INVALID = "API_001"
    API_QUOTA_EXCEEDED = "API_002"
    API_SERVICE_UNAVAILABLE = "API_003"
    API_RATE_LIMITED = "API_004"

    # OSS-related errors
    OSS_ACCESS_DENIED = "OSS_001"
    OSS_BUCKET_NOT_FOUND = "OSS_002"
    OSS_UPLOAD_FAILED = "OSS_003"

    # System-related errors
    SYSTEM_OUT_OF_MEMORY = "SYS_001"
    SYSTEM_DISK_FULL = "SYS_002"
    SYSTEM_PERMISSION_DENIED = "SYS_003"

    # Generic errors
    UNKNOWN_ERROR = "GEN_001"
    TIMEOUT_ERROR = "GEN_002"
    VALIDATION_ERROR = "GEN_003"


class TranscriptServiceError(Exception):
    """Base class of every exception raised by the transcript service."""

    def __init__(self, message: str, error_code: ErrorCode = ErrorCode.UNKNOWN_ERROR, details: Dict = None):
        """Create the exception and stamp it with the current time.

        Args:
            message: Human-readable description of the failure.
            error_code: Category of the failure.
            details: Optional extra context; an empty dict when omitted.
        """
        super().__init__(message)
        self.timestamp = time.time()
        self.details = details if details else {}
        self.error_code = error_code
        self.message = message

    def to_dict(self) -> Dict[str, Any]:
        """Return the error as a plain dict (code value, message, details, timestamp)."""
        payload: Dict[str, Any] = {}
        payload['error_code'] = self.error_code.value
        payload['message'] = self.message
        payload['details'] = self.details
        payload['timestamp'] = self.timestamp
        return payload


class FileValidationError(TranscriptServiceError):
    """Raised when a file fails validation."""


class NetworkError(TranscriptServiceError):
    """Raised for network-level failures."""


class APIError(TranscriptServiceError):
    """Raised when an API call fails."""


class OSSError(TranscriptServiceError):
    """Raised when an OSS operation fails."""


class SystemError(TranscriptServiceError):
    """Raised for system-level failures.

    Note: within this module the name shadows the built-in ``SystemError``.
    """
101
+
102
+
103
class RetryStrategy:
    """Exponential back-off policy with an upper bound and optional jitter."""

    def __init__(
        self,
        max_attempts: int = 3,
        base_delay: float = 1.0,
        max_delay: float = 60.0,
        exponential_base: float = 2.0,
        jitter: bool = True
    ):
        """Store the policy parameters.

        Args:
            max_attempts: Maximum number of attempts.
            base_delay: Delay before the first retry, in seconds.
            max_delay: Upper bound for any single delay, in seconds.
            exponential_base: Growth factor applied per attempt.
            jitter: Whether to randomise the delay.
        """
        self.max_attempts = max_attempts
        self.base_delay = base_delay
        self.max_delay = max_delay
        self.exponential_base = exponential_base
        self.jitter = jitter

    def calculate_delay(self, attempt: int) -> float:
        """Compute the delay to wait after the given attempt.

        Args:
            attempt: Attempt number, counted from 1. Values below 1 are
                treated as 1.

        Returns:
            ``base_delay * exponential_base ** (attempt - 1)`` capped at
            ``max_delay``. With jitter enabled the result is scaled by a
            random factor in ``[0.5, 1.0)``, so jitter only ever shortens
            the delay and the cap still holds.
        """
        import random

        exponent = max(attempt - 1, 0)
        try:
            delay = self.base_delay * (self.exponential_base ** exponent)
        except OverflowError:
            # Very large attempt numbers overflow float arithmetic; the
            # result would have been capped at max_delay anyway.
            delay = self.max_delay if self.base_delay > 0 else 0.0
        delay = min(delay, self.max_delay)

        if self.jitter:
            delay *= (0.5 + random.random() * 0.5)

        return delay
146
+
147
+
148
class ErrorHandler:
    """Classify raw exceptions into service errors, decide retryability and log them."""

    def __init__(self):
        """Load the configuration, create the error logger and build the lookup tables."""
        self.config = get_config()
        self.logger = get_task_logger(logger_name="transcript_service.error")

        # Exception type -> (service exception class, error code).
        self.error_mapping = {
            # File errors
            FileNotFoundError: (FileValidationError, ErrorCode.FILE_NOT_FOUND),
            PermissionError: (SystemError, ErrorCode.SYSTEM_PERMISSION_DENIED),

            # Network errors
            asyncio.TimeoutError: (NetworkError, ErrorCode.NETWORK_TIMEOUT),
            # Builtin TimeoutError is a different class from asyncio.TimeoutError
            # before Python 3.11 (the same key afterwards, which is harmless).
            TimeoutError: (NetworkError, ErrorCode.NETWORK_TIMEOUT),
            ConnectionError: (NetworkError, ErrorCode.NETWORK_CONNECTION_ERROR),

            # Generic errors
            ValueError: (TranscriptServiceError, ErrorCode.VALIDATION_ERROR),
            RuntimeError: (TranscriptServiceError, ErrorCode.UNKNOWN_ERROR),
        }

        # Error codes for which a retry may succeed.
        self.retryable_errors = {
            ErrorCode.NETWORK_TIMEOUT,
            ErrorCode.NETWORK_CONNECTION_ERROR,
            ErrorCode.API_RATE_LIMITED,
            ErrorCode.OSS_UPLOAD_FAILED,
            ErrorCode.API_SERVICE_UNAVAILABLE
        }

    @staticmethod
    def _wrap(error: Exception, exception_class, error_code) -> TranscriptServiceError:
        """Build the service exception for *error*, keeping the original as ``__cause__``."""
        wrapped = exception_class(str(error), error_code)
        wrapped.__cause__ = error
        return wrapped

    def classify_error(self, error: Exception) -> TranscriptServiceError:
        """Classify and wrap an exception.

        Resolution order: service errors are returned untouched; then an exact
        type match in ``error_mapping``; then keywords in the message; then the
        nearest mapped ancestor type (so e.g. ``ConnectionRefusedError`` is
        treated like ``ConnectionError``); finally ``UNKNOWN_ERROR``.

        Args:
            error: Original exception.

        Returns:
            The classified service exception.
        """
        if isinstance(error, TranscriptServiceError):
            return error

        exact = self.error_mapping.get(type(error))
        if exact is not None:
            return self._wrap(error, *exact)

        # Classify by message content.
        error_msg = str(error).lower()

        if "timeout" in error_msg:
            return self._wrap(error, NetworkError, ErrorCode.NETWORK_TIMEOUT)
        if "permission denied" in error_msg:
            return self._wrap(error, SystemError, ErrorCode.SYSTEM_PERMISSION_DENIED)
        if "api key" in error_msg:
            return self._wrap(error, APIError, ErrorCode.API_KEY_INVALID)
        if "rate limit" in error_msg:
            # Must precede the broader "limit" check: throttling is retryable,
            # an exhausted quota is not.
            return self._wrap(error, APIError, ErrorCode.API_RATE_LIMITED)
        if "quota" in error_msg or "limit" in error_msg:
            return self._wrap(error, APIError, ErrorCode.API_QUOTA_EXCEEDED)

        # Fall back to the closest mapped base class of the exception type.
        for ancestor in type(error).__mro__[1:]:
            inherited = self.error_mapping.get(ancestor)
            if inherited is not None:
                return self._wrap(error, *inherited)

        return self._wrap(error, TranscriptServiceError, ErrorCode.UNKNOWN_ERROR)

    def is_retryable(self, error: TranscriptServiceError) -> bool:
        """Tell whether an error is worth retrying.

        Args:
            error: Service exception.

        Returns:
            True when the error code is in ``retryable_errors``.
        """
        return error.error_code in self.retryable_errors

    def handle_error(self, error: Exception, context: str = "") -> TranscriptServiceError:
        """Classify an error and log it.

        Args:
            error: Original exception.
            context: Description of what was being done when it failed.

        Returns:
            The classified service exception.
        """
        classified_error = self.classify_error(error)

        log_msg = f"错误处理 - {context}: {classified_error.message}"
        # Unknown and out-of-memory errors are logged through exception() so a
        # stack trace is attached; everything else is a plain error line.
        if classified_error.error_code in [ErrorCode.UNKNOWN_ERROR, ErrorCode.SYSTEM_OUT_OF_MEMORY]:
            self.logger.exception(log_msg)
        else:
            self.logger.error(log_msg)

        return classified_error
242
+
243
+
244
+ # Global error-handler instance, used by the retry decorators and safe_execute helpers.
245
+ error_handler = ErrorHandler()
246
+
247
+
248
def retry_async(
    strategy: Optional[RetryStrategy] = None,
    exceptions: tuple = (Exception,),
    context: str = ""
):
    """Decorator that retries a coroutine function according to a strategy.

    A failure is retried only when the classified error is retryable and
    attempts remain; otherwise the classified error is raised.

    Args:
        strategy: Retry strategy; a default ``RetryStrategy()`` when omitted.
        exceptions: Exception types that are caught and considered for retry.
        context: Label prepended to the log messages.
    """
    if strategy is None:
        strategy = RetryStrategy()

    def decorator(func: Callable):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            logger = get_task_logger(logger_name="transcript_service.retry")

            # Always make at least one call: with max_attempts < 1 the loop
            # would otherwise be skipped and the function never invoked.
            total_attempts = max(1, strategy.max_attempts)

            for attempt in range(1, total_attempts + 1):
                try:
                    return await func(*args, **kwargs)
                except exceptions as e:
                    classified_error = error_handler.classify_error(e)

                    # Give up when out of attempts or the error is not retryable.
                    if attempt == total_attempts or not error_handler.is_retryable(classified_error):
                        logger.error(f"{context} 最终失败 (尝试 {attempt}/{total_attempts}): {str(e)}")
                        if classified_error is e:
                            # Already a service error: re-raise with its traceback intact.
                            raise
                        raise classified_error from e

                    delay = strategy.calculate_delay(attempt)
                    logger.warning(f"{context} 第 {attempt} 次尝试失败,{delay:.1f}秒后重试: {str(e)}")

                    await asyncio.sleep(delay)

            # Defensive: the loop above always returns or raises.
            raise TranscriptServiceError("重试逻辑异常", ErrorCode.UNKNOWN_ERROR)

        return wrapper
    return decorator
290
+
291
+
292
def retry_sync(
    strategy: Optional[RetryStrategy] = None,
    exceptions: tuple = (Exception,),
    context: str = ""
):
    """Decorator that retries a synchronous function according to a strategy.

    A failure is retried only when the classified error is retryable and
    attempts remain; otherwise the classified error is raised.

    Args:
        strategy: Retry strategy; a default ``RetryStrategy()`` when omitted.
        exceptions: Exception types that are caught and considered for retry.
        context: Label prepended to the log messages.
    """
    if strategy is None:
        strategy = RetryStrategy()

    def decorator(func: Callable):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            logger = get_task_logger(logger_name="transcript_service.retry")

            # Always make at least one call: with max_attempts < 1 the loop
            # would otherwise be skipped and the function never invoked.
            total_attempts = max(1, strategy.max_attempts)

            for attempt in range(1, total_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except exceptions as e:
                    classified_error = error_handler.classify_error(e)

                    # Give up when out of attempts or the error is not retryable.
                    if attempt == total_attempts or not error_handler.is_retryable(classified_error):
                        logger.error(f"{context} 最终失败 (尝试 {attempt}/{total_attempts}): {str(e)}")
                        if classified_error is e:
                            # Already a service error: re-raise with its traceback intact.
                            raise
                        raise classified_error from e

                    delay = strategy.calculate_delay(attempt)
                    logger.warning(f"{context} 第 {attempt} 次尝试失败,{delay:.1f}秒后重试: {str(e)}")

                    time.sleep(delay)

            # Defensive: the loop above always returns or raises.
            raise TranscriptServiceError("重试逻辑异常", ErrorCode.UNKNOWN_ERROR)

        return wrapper
    return decorator
334
+
335
+
336
def safe_execute(func: Callable, *args, **kwargs) -> tuple[bool, Any, Optional[TranscriptServiceError]]:
    """Call *func* and report the outcome instead of raising.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to *func*.
        **kwargs: Keyword arguments forwarded to *func*.

    Returns:
        ``(True, result, None)`` on success, or ``(False, None, error)`` where
        *error* is the classified service exception.
    """
    try:
        result = func(*args, **kwargs)
    except Exception as e:
        # functools.partial objects and callable instances have no __name__;
        # fall back to repr() so the handler itself cannot raise.
        func_name = getattr(func, "__name__", repr(func))
        error = error_handler.handle_error(e, f"执行 {func_name}")
        return False, None, error
    return True, result, None
353
+
354
+
355
async def safe_execute_async(func: Callable, *args, **kwargs) -> tuple[bool, Any, Optional[TranscriptServiceError]]:
    """Await *func* and report the outcome instead of raising.

    Args:
        func: Coroutine function to execute.
        *args: Positional arguments forwarded to *func*.
        **kwargs: Keyword arguments forwarded to *func*.

    Returns:
        ``(True, result, None)`` on success, or ``(False, None, error)`` where
        *error* is the classified service exception.
    """
    try:
        result = await func(*args, **kwargs)
    except Exception as e:
        # functools.partial objects and callable instances have no __name__;
        # fall back to repr() so the handler itself cannot raise.
        func_name = getattr(func, "__name__", repr(func))
        error = error_handler.handle_error(e, f"执行 {func_name}")
        return False, None, error
    return True, result, None
372
+
373
+
374
def get_error_handler() -> ErrorHandler:
    """Return the module-level ``error_handler`` instance.

    Returns:
        The shared ``ErrorHandler``.
    """
    return error_handler
src/utils/logger.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """日志管理模块
2
+
3
+ 提供结构化日志记录功能,支持任务跟踪和状态记录。
4
+ """
5
+
6
+ import logging
7
+ import logging.config
8
+ import logging.handlers
9
+ import uuid
10
+ from pathlib import Path
11
+ from typing import Any, Dict, Optional
12
+ import yaml
13
+
14
+ try:
15
+ from rich.console import Console
16
+ from rich.logging import RichHandler
17
+ RICH_AVAILABLE = True
18
+ except ImportError:
19
+ RICH_AVAILABLE = False
20
+
21
+ from ..core.config import get_config
22
+
23
+
24
class TaskContextFilter(logging.Filter):
    """Logging filter that guarantees every record carries a ``task_id``.

    Records that lack the attribute, or have it set to ``None``, receive the
    filter's current ``task_id``; records with a value are left alone.
    """

    def __init__(self):
        super().__init__()
        # Id applied to records that do not bring their own.
        self.task_id = 'system'

    def filter(self, record):
        """Fill in ``record.task_id`` when absent or None; never drops a record."""
        if getattr(record, 'task_id', None) is None:
            record.task_id = getattr(self, 'task_id', 'system')
        return True
42
+
43
+
44
class Logger:
    """Thin wrapper around a stdlib logger that adds task-id context.

    Handler configuration is performed once per process; later instances
    reuse it. Each instance stamps its own task id onto the records it emits
    via ``extra``, so the id is present regardless of which
    ``TaskContextFilter`` is attached to the handlers.

    Note: keyword arguments given to the logging methods are forwarded as
    ``extra``; keys that clash with built-in ``LogRecord`` attributes (for
    example ``message`` or ``name``) are rejected by the logging module.
    """

    # Process-wide guard. Re-running the setup for every instance would add
    # another console/file handler pair to the root logger each time.
    _logging_configured = False

    def __init__(self, name: str = "transcript_service"):
        """Create the wrapper.

        Args:
            name: Name of the underlying stdlib logger.
        """
        self.name = name
        self.config = get_config()
        if not Logger._logging_configured:
            self._setup_logging()
            Logger._logging_configured = True
        self.logger = logging.getLogger(name)
        self.task_filter = TaskContextFilter()

        # Make sure every handler (this logger's and the root logger's) has a
        # task filter, without stacking a new one per Logger instance.
        root_logger = logging.getLogger()
        for handler in (*self.logger.handlers, *root_logger.handlers):
            if not any(isinstance(f, TaskContextFilter) for f in handler.filters):
                handler.addFilter(self.task_filter)

    def _setup_logging(self):
        """Configure logging from ``config/logging.yaml``, or use the defaults."""
        # presumably get_logs_dir() also creates the directory - confirm in config
        logs_dir = self.config.get_logs_dir()

        config_file = self.config.get_project_root() / "config" / "logging.yaml"
        if not config_file.exists():
            self._setup_default_logging()
            return

        with open(config_file, 'r', encoding='utf-8') as file:
            logging_config = yaml.safe_load(file)

        if not logging_config:
            # An empty YAML document parses to None; use the defaults instead.
            self._setup_default_logging()
            return

        # Redirect every file handler into the logs directory (absolute path).
        for handler_config in (logging_config.get('handlers') or {}).values():
            if 'filename' in handler_config:
                handler_config['filename'] = str(logs_dir / Path(handler_config['filename']).name)

        logging.config.dictConfig(logging_config)

    def _setup_default_logging(self):
        """Attach a console handler and a rotating file handler to the root logger."""
        level = logging.DEBUG if self.config.app.debug else logging.INFO
        formatter = logging.Formatter(
            '[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )

        # Console handler: rich output when available, plain stream otherwise.
        if RICH_AVAILABLE:
            console = Console()
            console_handler = RichHandler(
                console=console,
                show_time=True,
                show_path=True,
                markup=True
            )
        else:
            console_handler = logging.StreamHandler()
            console_handler.setFormatter(formatter)
        console_handler.setLevel(level)

        # File handler: app.log, rotated at 10 MB with 5 backups.
        log_file = self.config.get_logs_dir() / "app.log"
        file_handler = logging.handlers.RotatingFileHandler(
            log_file,
            maxBytes=10*1024*1024,
            backupCount=5,
            encoding='utf-8'
        )
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)

        root_logger = logging.getLogger()
        root_logger.setLevel(level)
        root_logger.addHandler(console_handler)
        root_logger.addHandler(file_handler)

    def _extra(self, fields: Dict[str, Any]) -> Dict[str, Any]:
        """Merge caller fields with this logger's task id (an explicit ``task_id`` wins)."""
        return {'task_id': self.task_filter.task_id, **fields}

    def set_task_id(self, task_id: str):
        """Set the current task id.

        Args:
            task_id: Task id to attach to subsequent records.
        """
        self.task_filter.task_id = task_id

    def clear_task_id(self):
        """Reset the current task id to ``'system'``."""
        self.task_filter.task_id = 'system'

    def debug(self, message: str, **kwargs):
        """Log a debug message."""
        self.logger.debug(message, extra=self._extra(kwargs))

    def info(self, message: str, **kwargs):
        """Log an informational message."""
        self.logger.info(message, extra=self._extra(kwargs))

    def warning(self, message: str, **kwargs):
        """Log a warning."""
        self.logger.warning(message, extra=self._extra(kwargs))

    def error(self, message: str, **kwargs):
        """Log an error."""
        self.logger.error(message, extra=self._extra(kwargs))

    def critical(self, message: str, **kwargs):
        """Log a critical error."""
        self.logger.critical(message, extra=self._extra(kwargs))

    def exception(self, message: str, **kwargs):
        """Log an error together with the active exception's stack trace."""
        self.logger.exception(message, extra=self._extra(kwargs))
170
+
171
+
172
class TaskLogger:
    """Logger bound to one task id.

    Usable as a context manager: leaving the ``with`` block resets the
    underlying logger's task id.
    """

    def __init__(self, task_id: Optional[str] = None, logger_name: str = "transcript_service"):
        """Bind a ``Logger`` to a task.

        Args:
            task_id: Task id; when falsy, the first 8 characters of a random
                UUID4 are used.
            logger_name: Name of the underlying logger.
        """
        self.task_id = task_id if task_id else uuid.uuid4().hex[:8]
        self.logger = Logger(logger_name)
        self.logger.set_task_id(self.task_id)

    def __enter__(self):
        """Enter the context manager; returns this logger."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the context manager, clearing the task id; exceptions propagate."""
        self.logger.clear_task_id()

    def debug(self, message: str, **kwargs):
        """Log a debug message."""
        self.logger.debug(message, **kwargs)

    def info(self, message: str, **kwargs):
        """Log an informational message."""
        self.logger.info(message, **kwargs)

    def warning(self, message: str, **kwargs):
        """Log a warning."""
        self.logger.warning(message, **kwargs)

    def error(self, message: str, **kwargs):
        """Log an error."""
        self.logger.error(message, **kwargs)

    def critical(self, message: str, **kwargs):
        """Log a critical error."""
        self.logger.critical(message, **kwargs)

    def exception(self, message: str, **kwargs):
        """Log an exception."""
        self.logger.exception(message, **kwargs)

    def set_task_id(self, task_id: str):
        """Set the task id on the underlying logger.

        Args:
            task_id: New task id.
        """
        self.logger.set_task_id(task_id)

    def clear_task_id(self):
        """Clear the task id on the underlying logger."""
        self.logger.clear_task_id()
232
+
233
+
234
+ # Global logger instance, created when this module is imported.
235
+ logger = Logger()
236
+
237
+
238
def get_logger(name: str = "transcript_service") -> Logger:
    """Build a ``Logger`` for the given logger name.

    Args:
        name: Logger name.

    Returns:
        A new ``Logger`` instance.
    """
    return Logger(name=name)
248
+
249
+
250
def get_task_logger(task_id: Optional[str] = None, logger_name: str = "transcript_service") -> TaskLogger:
    """Build a ``TaskLogger`` for the given task.

    Args:
        task_id: Task id; forwarded unchanged to ``TaskLogger``.
        logger_name: Logger name.

    Returns:
        A new ``TaskLogger`` instance.
    """
    return TaskLogger(task_id=task_id, logger_name=logger_name)