# ============================================================
# Stage 2: AI visual audit (Qwen-VL director core)
# ============================================================
import dashscope
from dashscope import MultiModalConversation

# Qwen key from the environment; a key typed into the UI takes priority
# (resolution happens inside call_qwen_vision).
DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")


def call_qwen_vision(frames: list, chunk_idx: int, total_chunks: int,
                     video_duration: float, api_key: str) -> list:
    """
    Feed one chunk of frame images to Qwen-VL-Max and parse its clip directives.

    Args:
        frames: list of ``pathlib.Path``-like objects pointing at extracted
            frame images (must support ``.absolute()``).
        chunk_idx: zero-based index of the chunk being audited.
        total_chunks: total number of chunks in this audit run.
        video_duration: full video length in seconds. Currently unused;
            kept for interface stability.
        api_key: key entered in the UI; falls back to ``DASHSCOPE_API_KEY``.

    Returns:
        A list of validated clip dicts with keys
        ``start`` / ``duration`` / ``speed`` / ``reason``. Empty list when
        ``frames`` is empty or the model output contains no parsable JSON.

    Raises:
        ValueError: no API key is available from either source.
        RuntimeError: the Qwen API returned a non-200 status.
    """
    # UI-supplied key wins; otherwise fall back to the environment variable.
    effective_key = api_key.strip() or DASHSCOPE_API_KEY
    if not effective_key:
        raise ValueError("❌ 缺少 DashScope API Key,请在界面或环境变量中设置")

    # Nothing to audit — avoid a pointless (and billable) API call.
    if not frames:
        return []

    # FPS_AUDIT / CHUNK_SIZE are module-level constants defined earlier in
    # the file (frame-sampling rate and frames-per-chunk, respectively).
    time_per_frame = 1.0 / FPS_AUDIT
    chunk_start_time = (chunk_idx * CHUNK_SIZE) * time_per_frame

    # 1. Director prompt, including a chain-of-thought requirement.
    # NOTE(review): "请先在 标签内" appears to be missing the tag name
    # (presumably "<think>") — string kept verbatim; confirm upstream.
    prompt_text = (
        f"你现在是一位精通非遗竹编手艺的纪录片导演。现在审计第 {chunk_idx+1}/{total_chunks} 包素材。\n"
        f"时间范围:{chunk_start_time:.2f}s 起。请先在 标签内分析画面的手法的精准度、"
        f"光影的治愈感以及动作的连贯性,然后给出剪辑 JSON 指令。\n"
        f"要求:start 绝不能是整数(如 3.0 必须写成 3.47),duration 在 1.5-8s 之间。"
    )

    # 2. Multimodal payload: subsample at most 8 frames to stay under the
    #    token limit.
    sample_frames = frames[::max(1, len(frames) // 8)][:8]
    content = [{"text": prompt_text}]
    for fp in sample_frames:
        # Qwen-VL accepts local paths via the file:// scheme.
        content.append({"image": f"file://{fp.absolute()}"})

    # 3. Fire the call.
    responses = MultiModalConversation.call(
        model='qwen-vl-max',  # or the dated snapshot qwen-vl-max-2025-01-25
        api_key=effective_key,
        messages=[{"role": "user", "content": content}],
    )
    if responses.status_code != 200:
        raise RuntimeError(f"Qwen API 报错: {responses.message}")
    raw_output = responses.output.choices[0].message.content[0]["text"]

    # 4. Extract the JSON array, discarding any chain-of-thought preamble
    #    the model wrapped around it.
    match = re.search(r'\[\s*\{.*\}\s*\]', raw_output, re.DOTALL)
    if not match:
        return []
    try:
        clips = json.loads(match.group())
    except json.JSONDecodeError:
        # Model emitted malformed JSON — skip this chunk rather than crash.
        return []

    # 5. Hard-rule validation: clamp every clip into the allowed envelope.
    validated = []
    for c in clips:
        try:
            s = float(c["start"])
            if s == int(s):
                s += 0.47  # enforce the "start must never be an integer" rule
            validated.append({
                "start": round(s, 2),
                # duration clamped to [1.5, 8.0], speed to [0.8, 1.2]
                "duration": max(1.5, min(float(c.get("duration", 3)), 8.0)),
                "speed": max(0.8, min(float(c.get("speed", 1.0)), 1.2)),
                "reason": str(c.get("reason", "未分类"))[:15],
            })
        except (KeyError, TypeError, ValueError):
            continue  # malformed entry — drop it, keep the rest
    return validated