heyong-ai commited on
Commit
84eafbf
·
verified ·
1 Parent(s): 22aa0b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -638
app.py CHANGED
@@ -1,648 +1,76 @@
1
- """
2
- VideoAuto-R1 (Qwen3-VL) Demo
3
- A Gradio-based chat interface for adaptive inference with image/video inputs.
4
- """
5
- import spaces
6
- import os
7
- import base64
8
- from io import BytesIO
9
 
10
- import torch
11
- import gradio as gr
12
- from PIL import Image
13
- from transformers import AutoProcessor, AutoTokenizer
14
 
15
- from videoauto_r1.qwen_vl_utils.vision_process import process_vision_info
16
- from videoauto_r1.modeling_qwen3_vl_patched import Qwen3VLForConditionalGeneration
17
- from videoauto_r1.early_exit import compute_first_boxed_answer_probs
18
-
19
-
20
- # ============================================================================
21
- # Constants
22
- # ============================================================================
23
-
24
- COT_SYSTEM_PROMPT_ANSWER_TWICE = (
25
- "You are a helpful assistant.\n"
26
- "FIRST: Output your initial answer inside the first \\boxed{...} without any analysis or explanations. "
27
- "If you cannot determine the answer without reasoning, output \\boxed{Let's analyze the problem step by step.} instead.\n"
28
- "THEN: Think through the reasoning as an internal monologue enclosed within <think>...</think>.\n"
29
- "AT LAST: Output the final answer again inside \\boxed{...}. If you believe the previous answer was correct, repeat it; otherwise, correct it.\n"
30
- "Output format: \\boxed{...}<think>...</think>\\boxed{...}"
31
- )
32
-
33
- VIDEO_EXTS = (".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".webm")
34
- IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp", ".tiff")
35
-
36
- CUSTOM_CSS = """
37
- #chatbot .message[class*="user"] {
38
- max-width: 50% !important;
39
- }
40
-
41
- #chatbot .message[class*="bot"],
42
- #chatbot .message[class*="assistant"] {
43
- max-width: 60% !important;
44
- }
45
-
46
- #chatbot .message > div {
47
- width: 100% !important;
48
- max-width: 100% !important;
49
- }
50
- """
51
-
52
- MODEL_PATH = "IVUL-KAUST/VideoAuto-R1-Qwen3-VL-8B"
53
-
54
-
55
- # ============================================================================
56
- # Global Model Variables
57
- # ============================================================================
58
-
59
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
60
-
61
- # Load model
62
- model = (
63
- Qwen3VLForConditionalGeneration.from_pretrained(
64
- MODEL_PATH,
65
- dtype="bfloat16",
66
- attn_implementation="sdpa",
67
- )
68
- .to("cuda")
69
- .eval()
70
- )
71
-
72
- processor = AutoProcessor.from_pretrained(MODEL_PATH)
73
- tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
74
-
75
-
76
- # ============================================================================
77
- # Utility Functions
78
- # ============================================================================
79
-
80
-
81
- def detect_media_type(file_path: str | None) -> str | None:
82
- """
83
- Detect media type from file extension.
84
-
85
- Args:
86
- file_path: Path to the media file
87
-
88
- Returns:
89
- 'image', 'video', or None
90
- """
91
- if not file_path:
92
- return None
93
-
94
- p = file_path.lower()
95
- if p.endswith(VIDEO_EXTS):
96
- return "video"
97
- if p.endswith(IMAGE_EXTS):
98
- return "image"
99
-
100
- # Fallback: try to open as image
101
- try:
102
- Image.open(file_path)
103
- return "image"
104
- except Exception:
105
- return "video"
106
-
107
-
108
- def process_image(
109
- image_path: str,
110
- image_min_pixels: int = 128 * 28 * 28,
111
- image_max_pixels: int = 16384 * 28 * 28,
112
- ) -> dict | None:
113
  """
114
- Process image file to base64 format.
115
-
116
- Args:
117
- image_path: Path to image file
118
- image_min_pixels: Minimum pixel count
119
- image_max_pixels: Maximum pixel count
120
-
121
- Returns:
122
- Dictionary with image data or None
123
  """
124
- if image_path is None:
125
- return None
126
-
127
- image = Image.open(image_path).convert("RGB")
128
- buffer = BytesIO()
129
- image.save(buffer, format="JPEG")
130
- base64_bytes = base64.b64encode(buffer.getvalue())
131
- base64_string = base64_bytes.decode("utf-8")
132
-
133
- return {
134
- "type": "image",
135
- "image": f"data:image/jpeg;base64,{base64_string}",
136
- "min_pixels": image_min_pixels,
137
- "max_pixels": image_max_pixels,
138
- }
139
-
140
-
141
- def process_video(
142
- video_path: str,
143
- video_min_pixels: int = 16 * 28 * 28,
144
- video_max_pixels: int = 768 * 28 * 28,
145
- video_total_pixels: int = 128000 * 28 * 28,
146
- min_frames: int = 4,
147
- max_frames: int = 64,
148
- fps: float = 2.0,
149
- ) -> dict | None:
150
- """
151
- Process video file configuration.
152
-
153
- Args:
154
- video_path: Path to video file
155
- video_min_pixels: Minimum pixels per frame
156
- video_max_pixels: Maximum pixels per frame
157
- video_total_pixels: Total pixels across all frames
158
- min_frames: Minimum number of frames
159
- max_frames: Maximum number of frames
160
- fps: Frames per second for sampling
161
-
162
- Returns:
163
- Dictionary with video configuration or None
164
- """
165
- if video_path is None:
166
- return None
167
-
168
- return {
169
- "type": "video",
170
- "video": video_path,
171
- "min_pixels": video_min_pixels,
172
- "max_pixels": video_max_pixels,
173
- "total_pixels": video_total_pixels,
174
- "min_frames": min_frames,
175
- "max_frames": max_frames,
176
- "fps": fps,
177
- }
178
-
179
-
180
@spaces.GPU(duration=180)
def generate(
    media_input: str | None,
    prompt: str,
    early_exit_thresh: float,
    temperature: float,
    max_new_tokens: int = 4096,
) -> dict:
    """
    Generate response with adaptive inference.

    Runs the "answer twice" protocol defined by COT_SYSTEM_PROMPT_ANSWER_TWICE:
    the model emits an initial \\boxed{} answer, optionally a <think>...</think>
    chain of thought, and a final \\boxed{} answer. The confidence of the first
    boxed answer decides whether the CoT was needed (early exit).

    Args:
        media_input: Path to media file
        prompt: Text prompt
        early_exit_thresh: Confidence threshold for early exit
        temperature: Sampling temperature
        max_new_tokens: Maximum tokens to generate

    Returns:
        Dictionary containing response and metadata
    """
    # Prepare message with the answer-twice system prompt
    message = [{"role": "system", "content": COT_SYSTEM_PROMPT_ANSWER_TWICE}]
    content_parts = []

    # Process media input (image vs. video decided by extension / PIL probe)
    if media_input is not None:
        media_type = detect_media_type(media_input)

        if media_type == "video":
            video_dict = process_video(media_input)
            if video_dict:
                content_parts.append(video_dict)
        elif media_type == "image":
            image_dict = process_image(media_input)
            if image_dict:
                content_parts.append(image_dict)

    # Add text prompt
    content_parts.append({"type": "text", "text": prompt})
    message.append({"role": "user", "content": content_parts})

    # Apply chat template
    text = processor.apply_chat_template([message], tokenize=False, add_generation_prompt=True)

    # Process vision inputs
    image_inputs, video_inputs, video_kwargs = process_vision_info(
        [message],
        image_patch_size=16,
        return_video_kwargs=True,
        return_video_metadata=True,
    )

    # With return_video_metadata=True, each video entry is a (frames, metadata)
    # pair; split them into two parallel lists for the processor call below.
    if video_inputs is not None:
        video_inputs, video_metadatas = zip(*video_inputs)
        video_inputs = list(video_inputs)
        video_metadatas = list(video_metadatas)
    else:
        video_metadatas = None

    # Prepare inputs. do_resize=False because frames were already sized by
    # process_vision_info -- presumably to the min/max pixel budgets; confirm.
    inputs = processor(
        text=text,
        images=image_inputs,
        videos=video_inputs,
        video_metadata=video_metadatas,
        do_resize=False,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    )
    inputs = inputs.to(device)

    # Generation configuration: greedy decoding when temperature == 0,
    # otherwise nucleus sampling with top_p=0.9. output_scores is required
    # for the per-token confidence computation below.
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature if temperature > 0 else None,
        "do_sample": temperature > 0,
        "top_p": 0.9 if temperature > 0 else None,
        "num_beams": 1,
        "use_cache": True,
        "return_dict_in_generate": True,
        "output_scores": True,
    }

    # Generate response
    with torch.no_grad():
        gen_out = model.generate(
            **inputs,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            **gen_kwargs,
        )

    # Decode only the newly generated tokens (strip the prompt prefix)
    generated_ids = gen_out.sequences[0][len(inputs.input_ids[0]) :]
    answer = processor.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

    # Confidence of the first \boxed{} answer, used for the early-exit decision
    first_box_probs = compute_first_boxed_answer_probs(
        b=0,
        gen_ids=generated_ids,
        gen_out=gen_out,
        ans=answer,
        task="",
        tokenizer=tokenizer,
    )

    # Parse response: text before <think> is the initial answer, text after
    # </think> is the reviewed answer; fall back to the initial answer when
    # the model skipped the CoT.
    first_answer = answer.split("<think>")[0]
    second_answer = answer.split("</think>")[-1] if "</think>" in answer else first_answer
    reasoning = answer.split("<think>")[-1].split("</think>")[0] if "<think>" in answer else "N/A"

    # Determine inference mode
    if first_box_probs >= early_exit_thresh:
        need_cot = False
        # NOTE(review): this overwrites the parsed reasoning string with the
        # boolean False on early exit. chat_generate only reads "reasoning"
        # when need_cot is True, so it is unused downstream, but confirm no
        # other caller expects a string here.
        reasoning = False
    else:
        need_cot = True

    return {
        "full_response": answer,
        "first_answer": first_answer,
        "confidence": f"{first_box_probs:.4f}",
        "need_cot": need_cot,
        "reasoning": reasoning,
        "second_answer": second_answer,
    }
308
 
 
 
 
 
 
 
309
 
310
- # ============================================================================
311
- # Gradio Callback Functions
312
- # ============================================================================
313
-
314
-
315
def update_preview(file_path: str | None):
    """Sync the image/video preview widgets with the uploaded file's type."""
    mtype = detect_media_type(file_path)

    show_image = mtype == "image"
    show_video = mtype == "video"

    # Exactly one preview is visible (or neither when no/unknown media);
    # the hidden widget is always cleared to None.
    image_update = gr.update(value=file_path if show_image else None, visible=show_image)
    video_update = gr.update(value=file_path if show_video else None, visible=show_video)
    return (image_update, video_update)
334
-
335
-
336
def chat_generate(
    media_path,
    user_text,
    messages_state,
    chatbot_state,
    last_media_state,
    early_exit_thresh,
    temperature,
):
    """Handle chat message generation.

    Args:
        media_path: Path of the currently uploaded image/video, or None.
        user_text: Chat message typed by the user.
        messages_state: Accumulated model-facing message history.
        chatbot_state: Accumulated display history for the Chatbot widget.
        last_media_state: Media path used on the previous turn.
        early_exit_thresh: Confidence threshold forwarded to generate().
        temperature: Sampling temperature forwarded to generate().

    Returns:
        Updated (messages_state, chatbot_state, last_media_state) plus
        gr.update objects that clear and disable the textbox and send button.

    Raises:
        gr.Error: When the chat message is empty.
    """
    if user_text is None or str(user_text).strip() == "":
        raise gr.Error("Chat message cannot be empty.")

    # Clear history if media changed. Compared by basename -- presumably
    # because Gradio copies uploads into temp directories; confirm.
    if (
        (media_path is not None)
        and (last_media_state is not None)
        and (os.path.basename(media_path) != os.path.basename(last_media_state))
    ):
        messages_state = []
        chatbot_state = []

    # Initialize system prompt
    if len(messages_state) == 0:
        messages_state.append({"role": "system", "content": COT_SYSTEM_PROMPT_ANSWER_TWICE})

    # Prepare user message
    content_parts = []
    if media_path is not None:
        mtype = detect_media_type(media_path)
        if mtype == "video":
            vd = process_video(media_path)
            if vd:
                content_parts.append(vd)
        elif mtype == "image":
            imd = process_image(media_path)
            if imd:
                content_parts.append(imd)

    content_parts.append({"type": "text", "text": user_text})
    messages_state.append({"role": "user", "content": content_parts})

    # Generate response.
    # NOTE(review): generate() builds its own single-turn message from
    # media_path and user_text -- the accumulated messages_state history is
    # never passed to the model, so multi-turn context is effectively
    # ignored. Confirm this is intended.
    result = generate(media_path, user_text, early_exit_thresh, temperature)

    # Format assistant response
    first_ans = (result.get("first_answer") or "").strip()
    conf = result.get("confidence", "N/A")
    need_cot = result.get("need_cot", "")
    reasoning = result.get("reasoning", "")
    final_ans = (result.get("second_answer") or "").strip()

    if need_cot:
        decision_prompt = f"Continue CoT Reasoning (confidence = {conf})"
    else:
        decision_prompt = f"Early Exit (confidence = {conf})"

    assistant_display_1 = f"**Initial Answer:**\n{first_ans}\n\n" f"**{decision_prompt}**\n\n"

    # Update state: messages_state mirrors what is shown, chatbot_state
    # drives the Chatbot widget (user turn + assistant turn)
    messages_state.append({"role": "assistant", "content": assistant_display_1})
    chatbot_state.append({"role": "user", "content": user_text})
    chatbot_state.append({"role": "assistant", "content": assistant_display_1})

    # Only show the reasoning + reviewed answer when the CoT was needed
    if need_cot:
        assistant_display_2 = (
            f"\n\n**<think>**\n\n{reasoning}\n**</think>**\n\n" f"**Reviewed Answer:**\n{final_ans}\n\n"
        )

        messages_state.append({"role": "assistant", "content": assistant_display_2})
        chatbot_state.append({"role": "assistant", "content": assistant_display_2})

    # Disable textbox and send button after generation to prevent interleaved conversation
    return (
        messages_state,
        chatbot_state,
        media_path,
        gr.update(value="", interactive=False),  # Disable and clear textbox
        gr.update(interactive=False),  # Disable send button
    )
416
-
417
-
418
def clear_history():
    """Reset chat state and media widgets, and re-enable the input controls."""
    fresh_messages: list = []  # messages_state
    fresh_chat: list = []      # chatbot_state
    return (
        fresh_messages,
        fresh_chat,
        None,                                   # last_media_state
        gr.update(value=None),                  # file upload widget
        gr.update(value=None, visible=False),   # image_preview hidden
        gr.update(value=None, visible=False),   # video_preview hidden
        gr.update(value="", interactive=True),  # textbox cleared + re-enabled
        gr.update(interactive=True),            # send button re-enabled
    )
430
-
431
-
432
- # ============================================================================
433
- # Example Data
434
- # ============================================================================
435
-
436
- EXAMPLES = [
437
- [
438
- "assets/yt--MAYaJ5cyOE_70.mp4",
439
- "Question: Which one of these descriptions correctly matches the actions in the video?\nOptions:\n(A) officiating\n(B) skating\n(C) stopping\n(D) playing sports\nPut your final answer in \\boxed{}.",
440
- # GT is B
441
- ],
442
- [
443
- "assets/validation_Finance_2.mp4",
444
- "Using the Arbitrage Pricing Theory model shown above, calculate the expected return E(rp) if the risk-free rate increases to 5%. All other risk premiums (RP) and beta (\\beta) values remain unchanged.\nOptions:\nA. 13.4%\nB. 14.8%\nC. 15.6%\nD. 16.1%\nE. 16.5%\nF. 16.9%\nG. 17.5%\nH. 17.8%\nI. 17.2%\nJ. 18.1%\nPut your final answer in \\boxed{}.",
445
- # GT is I
446
- ],
447
- [
448
- "assets/M3CoT-25169-0.png",
449
- "Within the image, you'll notice several purchased items. And we assume that the water temperature is 4 ° C at this time.\nWithin the image, can you identify the count of items among the provided options that will go below the waterline?\nA. 0\nB. 1\nC. 2\nD. 3\nPut your final answer in \\boxed{}.",
450
- # GT is B
451
- ],
452
- [
453
- None,
454
- "Determine the value of the parameter $m$ such that the equation $(m-2)x^2 + (m^2-4m+3)x - (6m^2-2) = 0$ has real solutions, and the sum of the cubes of these solutions is equal to zero.\nPut your final answer in \\boxed{}.",
455
- # GT is 3
456
- ],
457
- ]
458
-
459
-
460
- # ============================================================================
461
- # Gradio Interface
462
- # ============================================================================
463
-
464
- demo = gr.Blocks(title="VideoAuto-R1 Demo")
465
-
466
- with demo:
467
- gr.Markdown("# [VideoAuto-R1 Demo](https://github.com/IVUL-KAUST/VideoAuto-R1/)")
468
-
469
- # Display system prompt
470
- with gr.Accordion("System Prompt", open=False):
471
- gr.Markdown(f"```\n{COT_SYSTEM_PROMPT_ANSWER_TWICE}\n```")
472
-
473
- # State variables
474
- messages_state = gr.State([])
475
- chatbot_state = gr.State([])
476
- last_media_state = gr.State(None)
477
-
478
- with gr.Row():
479
- # Left column: Media input and settings
480
- with gr.Column(scale=3):
481
- media_input = gr.File(
482
- label="Upload Image or Video",
483
- file_types=["image", "video"],
484
- type="filepath",
485
- )
486
- image_preview = gr.Image(label="Image Preview", visible=False)
487
- video_preview = gr.Video(label="Video Preview", visible=False)
488
-
489
- with gr.Accordion("Advanced Settings", open=True):
490
- early_exit_thresh = gr.Slider(
491
- minimum=0.0,
492
- maximum=1.0,
493
- value=0.98,
494
- step=0.01,
495
- label="Early Exit Threshold",
496
- )
497
- temperature = gr.Slider(
498
- minimum=0.0,
499
- maximum=2.0,
500
- value=0.0,
501
- step=0.1,
502
- label="Temperature",
503
- )
504
-
505
- # Right column: Chat interface
506
- with gr.Column(scale=7):
507
- chatbot = gr.Chatbot(
508
- label="Chat",
509
- elem_id="chatbot",
510
- height=600,
511
- sanitize_html=False,
512
- )
513
- textbox = gr.Textbox(
514
- show_label=False,
515
- placeholder="Enter text and press ENTER",
516
- lines=2,
517
- )
518
- with gr.Row():
519
- send_btn = gr.Button("Send", variant="primary")
520
- clear_btn = gr.Button("Clear")
521
-
522
- gr.Markdown("Please click the **Clear** button before starting a new conversation or trying a new example.")
523
-
524
- # Event handlers
525
- media_input.change(
526
- fn=update_preview,
527
- inputs=[media_input],
528
- outputs=[image_preview, video_preview],
529
- )
530
-
531
- # Send button click: generate response and disable input controls
532
- send_btn.click(
533
- fn=chat_generate,
534
- inputs=[
535
- media_input,
536
- textbox,
537
- messages_state,
538
- chatbot_state,
539
- last_media_state,
540
- early_exit_thresh,
541
- temperature,
542
- ],
543
- outputs=[messages_state, chatbot_state, last_media_state, textbox, send_btn],
544
- ).then(
545
- fn=lambda cs: cs,
546
- inputs=[chatbot_state],
547
- outputs=[chatbot],
548
- )
549
-
550
- # Textbox submit: generate response and disable input controls
551
- textbox.submit(
552
- fn=chat_generate,
553
- inputs=[
554
- media_input,
555
- textbox,
556
- messages_state,
557
- chatbot_state,
558
- last_media_state,
559
- early_exit_thresh,
560
- temperature,
561
- ],
562
- outputs=[messages_state, chatbot_state, last_media_state, textbox, send_btn],
563
- ).then(
564
- fn=lambda cs: cs,
565
- inputs=[chatbot_state],
566
- outputs=[chatbot],
567
- )
568
-
569
- # Clear button: reset all states and re-enable input controls
570
- clear_btn.click(
571
- fn=clear_history,
572
- inputs=[],
573
- outputs=[
574
- messages_state,
575
- chatbot_state,
576
- last_media_state,
577
- media_input,
578
- image_preview,
579
- video_preview,
580
- textbox,
581
- send_btn,
582
- ],
583
- ).then(
584
- fn=lambda cs: cs,
585
- inputs=[chatbot_state],
586
- outputs=[chatbot],
587
- )
588
-
589
- examples_ds = gr.Dataset(
590
- components=[media_input, textbox],
591
- samples=EXAMPLES,
592
- label="Examples",
593
- type="index", # important: pass selected row index to fn
594
- )
595
-
596
    def load_example(idx: int | None):
        """Load one EXAMPLES row into the UI after resetting all state.

        idx is the selected row index (the Dataset uses type="index"); it
        can be None when the selection is cleared, in which case this
        behaves exactly like the Clear button.
        """
        # idx can be None when deselecting
        if idx is None:
            # just clear everything
            return clear_history()

        media, text = EXAMPLES[idx][0], EXAMPLES[idx][1]

        # 1) clear all states + re-enable inputs
        ms, cs, last, file_u, img_u, vid_u, tb_u, send_u = clear_history()

        # 2) set selected example values
        file_u = gr.update(value=media)
        tb_u = gr.update(value=text, interactive=True)
        send_u = gr.update(interactive=True)

        # 3) update preview explicitly (don't rely on File.change always firing)
        img_u, vid_u = update_preview(media)

        # 4) optionally set last_media_state to current media
        last = media

        return ms, cs, last, file_u, img_u, vid_u, tb_u, send_u
619
-
620
- examples_ds.select(
621
- fn=load_example,
622
- inputs=[examples_ds],
623
- outputs=[
624
- messages_state,
625
- chatbot_state,
626
- last_media_state,
627
- media_input,
628
- image_preview,
629
- video_preview,
630
- textbox,
631
- send_btn,
632
- ],
633
- ).then(
634
- fn=lambda cs: cs,
635
- inputs=[chatbot_state],
636
- outputs=[chatbot],
637
- )
638
-
639
-
640
- # Launch demo
641
- demo.launch(
642
- share=True,
643
- server_name="0.0.0.0",
644
- server_port=7860,
645
- allowed_paths=["assets"],
646
- debug=True,
647
- css=CUSTOM_CSS,
648
- )
 
1
+ # ============================================================
2
+ # 第二阶段:AI 视觉审计(Qwen-VL 导演核心)
3
+ # ============================================================
4
+ import dashscope
5
+ from dashscope import MultiModalConversation
 
 
 
6
 
7
+ # 从环境变量或界面获取千问密钥
8
+ DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY", "")
 
 
9
 
10
def call_qwen_vision(frames: list, chunk_idx: int, total_chunks: int,
                     video_duration: float, api_key: str) -> list:
    """
    Audit one chunk of sampled frames with Qwen-VL-Max and return validated
    clip-editing instructions.

    The model is prompted (in Chinese) to reason inside <think> tags and then
    emit a JSON array of clips; anything around the first JSON array in the
    output is discarded.

    Args:
        frames: Frame image paths (Path-like, must support .absolute()).
        chunk_idx: Zero-based index of the current chunk.
        total_chunks: Total number of chunks for the video.
        video_duration: Full video duration in seconds (currently unused by
            the prompt; kept for interface compatibility).
        api_key: Key entered in the UI; may be None/empty, in which case the
            DASHSCOPE_API_KEY environment value is used.

    Returns:
        List of sanitized clip dicts (start/duration/speed/reason), or an
        empty list when the model output contains no parseable JSON array.

    Raises:
        ValueError: When no API key is available from either source.
        RuntimeError: When the DashScope API returns a non-200 status.
    """
    # Prefer the key typed into the UI; fall back to the environment.
    # (api_key can be None when the textbox was never touched -- the old
    # api_key.strip() crashed on that.)
    effective_key = (api_key or "").strip() or DASHSCOPE_API_KEY
    if not effective_key:
        raise ValueError("❌ 缺少 DashScope API Key,请在界面或环境变量中设置")

    # NOTE(review): FPS_AUDIT and CHUNK_SIZE are module-level constants
    # defined elsewhere in this file -- confirm they exist.
    time_per_frame = 1.0 / FPS_AUDIT
    chunk_start_time = (chunk_idx * CHUNK_SIZE) * time_per_frame

    # 1. Director instruction (asks for CoT inside <think>, then a JSON clip
    #    list with non-integer start times and 1.5-8s durations).
    prompt_text = (
        f"你现在是一位精通非遗竹编手艺的纪录片导演。现在审计第 {chunk_idx+1}/{total_chunks} 包素材。\n"
        f"时间范围:{chunk_start_time:.2f}s 起。请先在 <think> 标签内分析画面的手法的精准度、"
        f"光影的治愈感以及动作的连贯性,然后给出剪辑 JSON 指令。\n"
        f"要求:start 绝不能是整数(如 3.0 必须写成 3.47),duration 在 1.5-8s 之间。"
    )

    # 2. Build the multimodal payload: subsample at most 8 frames so the
    #    request stays within the model's token limit.
    sample_frames = frames[::max(1, len(frames) // 8)][:8]
    content = [{"text": prompt_text}]
    for fp in sample_frames:
        # Qwen-VL accepts local paths via the file:// scheme
        content.append({"image": f"file://{fp.absolute()}"})

    # 3. Fire the API call
    responses = MultiModalConversation.call(
        model='qwen-vl-max',  # or a dated snapshot such as qwen-vl-max-2025-01-25
        api_key=effective_key,
        messages=[{"role": "user", "content": content}]
    )

    if responses.status_code != 200:
        raise RuntimeError(f"Qwen API 报错: {responses.message}")

    raw_output = responses.output.choices[0].message.content[0]["text"]

    # 4. Extract the JSON instruction array (skips the <think> reasoning)
    match = re.search(r'\[\s*\{.*\}\s*\]', raw_output, re.DOTALL)
    if not match:
        return []

    try:
        clips = json.loads(match.group())
    except json.JSONDecodeError:
        # JSON-looking text that doesn't parse: treat the chunk as empty
        # rather than aborting the whole audit (was a bare except).
        return []

    # 5. Enforce the editing constraints on each clip
    return _validate_clips(clips)


def _validate_clips(clips: list) -> list:
    """Clamp and sanitize raw clip dicts produced by the model.

    Enforces: non-integer start (offset by +0.47s), duration clamped to
    [1.5, 8.0], speed clamped to [0.8, 1.2], reason truncated to 15 chars.
    Entries missing 'start' or with non-numeric fields are skipped.
    """
    validated = []
    for c in clips:
        try:
            s = float(c["start"])
            if s == int(s):
                s += 0.47  # force a non-integer start offset
            validated.append({
                "start": round(s, 2),
                "duration": max(1.5, min(float(c.get("duration", 3)), 8.0)),
                "speed": max(0.8, min(float(c.get("speed", 1.0)), 1.2)),
                "reason": str(c.get("reason", "未分类"))[:15]
            })
        except (KeyError, TypeError, ValueError):
            # Skip malformed entries instead of aborting the whole chunk
            # (was a bare except that also swallowed programming errors).
            continue

    return validated