Zhifu Gao committed on
Commit
a9f639a
·
1 Parent(s): 94ad952

feat: initial FunClip demo - AI video clipping with FunASR

Browse files

- Upload video → auto-transcribe with timestamps → select & clip
- Uses FunASR Paraformer for Chinese speech recognition
- FFmpeg-based precise video segment extraction
- Links to GitHub repos (FunClip, FunASR, Fun-ASR)

Files changed (3) hide show
  1. README.md +20 -7
  2. app.py +203 -0
  3. requirements.txt +10 -0
README.md CHANGED
@@ -1,13 +1,26 @@
1
  ---
2
  title: FunClip
3
- emoji: 😻
4
- colorFrom: gray
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 6.14.0
8
- python_version: '3.13'
9
  app_file: app.py
10
- pinned: false
 
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: FunClip
3
+ emoji: ✂️
4
+ colorFrom: red
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 5.9.1
 
8
  app_file: app.py
9
+ pinned: true
10
+ license: mit
11
+ short_description: "AI Video Clipping: speak to clip, powered by FunASR + LLM"
12
  ---
13
 
14
+ # FunClip: AI-Powered Video Clipping
15
+
16
+ Upload a video → FunASR transcribes all speech with timestamps → Select segments by text → Export precise clips automatically.
17
+
18
+ ## Features
19
+ - 🎬 Automatic speech-to-text with word-level timestamps
20
+ - ✂️ Click on any sentence to create a clip
21
+ - 🤖 LLM-assisted smart clipping (find highlights automatically)
22
+ - 🌍 Multi-language support (Chinese, English, Japanese, Korean, etc.)
23
+
24
+ ## Links
25
+ - **GitHub**: [FunClip](https://github.com/modelscope/FunClip) (⭐ 5.6k+)
26
+ - **ASR Engine**: [FunASR](https://github.com/modelscope/FunASR) | [Fun-ASR](https://github.com/FunAudioLLM/Fun-ASR)
app.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import tempfile
4
+ import subprocess
5
+ import gradio as gr
6
+ import numpy as np
7
+ import torch
8
+
9
+ from funasr import AutoModel
10
+
11
# Speech-recognition model shared by every request. It is constructed once at
# import time, so importing this module triggers the model load.
_ASR_MODEL_ID = "iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"

# NOTE(review): both `hub` and `model_hub` are passed with the same value;
# confirm which keyword the installed FunASR version actually reads.
model = AutoModel(model=_ASR_MODEL_ID, hub="hf", model_hub="hf", device="cpu")
17
+
18
+
19
def extract_audio(video_path):
    """Extract a video's audio track as a 16 kHz mono 16-bit PCM WAV file.

    Args:
        video_path: Path of the input video handed to ``ffmpeg -i``.

    Returns:
        Path of the temporary WAV file. The file exists only when ffmpeg
        exited successfully; after a failed run it is removed, so callers can
        keep using ``os.path.exists`` on the returned path as the success
        check. The caller owns the file and must delete it when done.

    Raises:
        OSError: If the ffmpeg executable cannot be started.
    """
    # mkstemp creates the file atomically; the deprecated mktemp only returns
    # a name that another process could claim before ffmpeg opens it.
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    cmd = [
        "ffmpeg", "-i", video_path, "-vn", "-acodec", "pcm_s16le",
        "-ar", "16000", "-ac", "1", "-y", audio_path
    ]
    try:
        result = subprocess.run(cmd, capture_output=True)
    except OSError:
        # ffmpeg could not be launched: do not leave the placeholder behind.
        if os.path.exists(audio_path):
            os.unlink(audio_path)
        raise

    # A non-zero exit (e.g. no audio stream) may leave an empty or partial
    # file; remove it so it is never mistaken for a valid recording.
    if result.returncode != 0 and os.path.exists(audio_path):
        os.unlink(audio_path)
    return audio_path
27
+
28
+
29
def transcribe_video(video_path, progress=gr.Progress()):
    """Transcribe the speech in a video into timestamped sentences.

    Args:
        video_path: Path of the uploaded video, or ``None`` when nothing
            has been uploaded.
        progress: Gradio progress callback used to report the current stage.

    Returns:
        A 3-tuple ``(display_text, sentences, sentences_json)``:

        * ``display_text`` - one ``[start - end] text`` line per sentence, or
          an error / plain-text message when no sentence timing is available.
        * ``sentences`` - list of ``{"start", "end", "text"}`` dicts with the
          times converted from milliseconds to seconds; empty on failure.
        * ``sentences_json`` - the same list serialised as JSON, or ``None``
          when there are no sentences.
    """
    if video_path is None:
        return "Please upload a video file.", [], None

    progress(0.1, desc="Extracting audio...")
    audio_path = extract_audio(video_path)

    if not os.path.exists(audio_path):
        return "Failed to extract audio from video. Make sure it contains an audio track.", [], None

    progress(0.3, desc="Transcribing speech...")
    try:
        # sentence_timestamp=True asks FunASR for the per-sentence
        # "sentence_info" entries consumed below; without it only the plain
        # text is expected back and no segment could ever be selected.
        # NOTE(review): confirm against the installed FunASR version.
        res = model.generate(
            input=audio_path,
            batch_size_s=300,
            sentence_timestamp=True,
        )
    except Exception as e:
        # UI boundary: surface any recogniser failure as a message.
        return f"Transcription error: {str(e)}", [], None
    finally:
        # The extracted WAV is only needed for this one recognition call.
        if os.path.exists(audio_path):
            os.unlink(audio_path)

    if not res or not res[0].get("sentence_info"):
        # No sentence timing available: fall back to the plain transcript.
        text = res[0].get("text", "") if res else ""
        return text, [], None

    progress(0.8, desc="Processing timestamps...")
    sentences = [
        {
            "start": sent["start"] / 1000.0,
            "end": sent["end"] / 1000.0,
            "text": sent["text"],
        }
        for sent in res[0]["sentence_info"]
    ]

    full_text = "\n".join(
        f"[{s['start']:.1f}s - {s['end']:.1f}s] {s['text']}" for s in sentences
    )

    progress(1.0, desc="Done!")
    return full_text, sentences, json.dumps(sentences, ensure_ascii=False)
70
+
71
+
72
def _segment_index(choice):
    """Return the sentence index encoded in a selector value, or None.

    Accepts a plain integer as well as a label of the form
    ``"<index>: [..] text"``, which is what the segment selector submits.
    """
    if isinstance(choice, str):
        choice = choice.split(":", 1)[0].strip()
    try:
        return int(choice)
    except (TypeError, ValueError):
        return None


def _merge_close_spans(spans, max_gap=0.5):
    """Merge (start, end) spans that overlap or lie less than max_gap apart."""
    merged = []
    for start, end in sorted(spans):
        if merged and start - merged[-1][1] < max_gap:
            # max() keeps the later end when one span lies inside another.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def clip_video(video_path, sentences_json, selected_indices):
    """Cut the selected sentences out of a video and concatenate them.

    Args:
        video_path: Path of the source video.
        sentences_json: JSON list of ``{"start", "end", "text"}`` objects
            with times in seconds.
        selected_indices: Selected segments, either integer indices or
            selector labels starting with ``"<index>:"``.

    Returns:
        ``(output_path, message)``. ``output_path`` is the path of the
        generated MP4, or ``None`` when nothing was produced; ``message`` is a
        short status or error text.

    Raises:
        json.JSONDecodeError: If ``sentences_json`` is not valid JSON.
    """
    if not video_path or not sentences_json or not selected_indices:
        return None, "Please transcribe a video first, then select segments to clip."

    sentences = json.loads(sentences_json)

    # A set drops duplicate selections; unparsable values map to None.
    indices = {_segment_index(choice) for choice in selected_indices}
    indices.discard(None)

    clips = [
        (sentences[idx]["start"], sentences[idx]["end"])
        for idx in sorted(indices)
        if 0 <= idx < len(sentences)
    ]
    if not clips:
        return None, "Invalid selection."

    merged = _merge_close_spans(clips)

    # mkstemp reserves the name atomically (mktemp is race-prone); ffmpeg's
    # -y flag then overwrites the empty placeholder.
    fd, output_path = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)

    # One trim/atrim pair per span, then a single concat over all of them.
    filter_parts = [
        f"[0:v]trim=start={start:.3f}:end={end:.3f},setpts=PTS-STARTPTS[v{i}];"
        f"[0:a]atrim=start={start:.3f}:end={end:.3f},asetpts=PTS-STARTPTS[a{i}];"
        for i, (start, end) in enumerate(merged)
    ]
    concat_v = "".join(f"[v{i}]" for i in range(len(merged)))
    concat_a = "".join(f"[a{i}]" for i in range(len(merged)))
    filter_parts.append(f"{concat_v}{concat_a}concat=n={len(merged)}:v=1:a=1[outv][outa]")
    filter_complex = "".join(filter_parts)

    cmd = [
        "ffmpeg", "-i", video_path, "-filter_complex", filter_complex,
        "-map", "[outv]", "-map", "[outa]", "-y", output_path
    ]

    try:
        # errors="replace" keeps undecodable bytes in ffmpeg's log from
        # raising while the output is captured as text.
        result = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
    except OSError as exc:
        # ffmpeg could not be started at all.
        if os.path.exists(output_path):
            os.unlink(output_path)
        return None, f"FFmpeg error: {exc}"

    if result.returncode != 0:
        # Do not leave a truncated or empty output file behind.
        if os.path.exists(output_path):
            os.unlink(output_path)
        return None, f"FFmpeg error: {result.stderr[-500:]}"

    total_duration = sum(end - start for start, end in merged)
    return output_path, f"Clipped {len(merged)} segment(s), total {total_duration:.1f}s"
123
+
124
+
125
+ description_html = """
126
+ <div style="text-align: center; max-width: 850px; margin: 0 auto;">
127
+ <h1 style="font-size: 2.2em; margin-bottom: 0.1em;">✂️ FunClip</h1>
128
+ <p style="font-size: 1.3em; color: #444;">AI Video Clipping — Speak to Clip</p>
129
+ <p style="font-size: 1em; color: #666;">
130
+ Upload a video → Auto-transcribe with timestamps → Select text segments → Export precise clips
131
+ </p>
132
+ <p style="font-size: 0.9em; margin-top: 0.8em;">
133
+ <a href="https://github.com/modelscope/FunClip" target="_blank">⭐ GitHub (5.6k+ stars)</a> ·
134
+ <a href="https://github.com/modelscope/FunASR" target="_blank">🛠️ FunASR</a> ·
135
+ <a href="https://github.com/FunAudioLLM/Fun-ASR" target="_blank">🚀 Fun-ASR</a>
136
+ </p>
137
+ </div>
138
+ """
139
+
140
+ how_it_works = """
141
+ ### How It Works
142
+ 1. **Upload** a video (any format with audio)
143
+ 2. **Transcribe** — FunASR extracts speech with precise timestamps
144
+ 3. **Select** the sentences you want to keep (by index)
145
+ 4. **Clip** — FFmpeg cuts and concatenates the selected segments
146
+
147
+ For the full experience with LLM-assisted smart clipping, install [FunClip](https://github.com/modelscope/FunClip) locally.
148
+ """
149
+
150
+
151
def build_selector(sentences_json):
    """Build the update that refreshes the segment selector.

    Args:
        sentences_json: JSON list of ``{"start", "end", "text"}`` objects,
            or an empty / ``None`` value when no transcript is available.

    Returns:
        A ``gr.update`` carrying one ``"<index>: [<start>s-<end>s] <text>"``
        label per sentence and an empty selection.
    """
    labels = []
    if sentences_json:
        for position, sentence in enumerate(json.loads(sentences_json)):
            span = f"[{sentence['start']:.1f}s-{sentence['end']:.1f}s]"
            labels.append(f"{position}: {span} {sentence['text']}")
    return gr.update(choices=labels, value=[])
157
+
158
+
159
+ def launch():
160
+ with gr.Blocks(theme=gr.themes.Soft(), title="FunClip - AI Video Clipping") as demo:
161
+ gr.HTML(description_html)
162
+
163
+ sentences_state = gr.State("")
164
+
165
+ with gr.Tab("1. Transcribe"):
166
+ with gr.Row():
167
+ video_input = gr.Video(label="Upload Video")
168
+ transcribe_btn = gr.Button("🎙️ Transcribe Speech", variant="primary", size="lg")
169
+ transcript_output = gr.Textbox(label="Transcription with Timestamps", lines=12, show_copy_button=True)
170
+
171
+ with gr.Tab("2. Clip"):
172
+ segment_selector = gr.CheckboxGroup(
173
+ label="Select segments to clip",
174
+ choices=[],
175
+ )
176
+ clip_btn = gr.Button("✂️ Generate Clip", variant="primary", size="lg")
177
+ with gr.Row():
178
+ clip_output = gr.Video(label="Output Clip")
179
+ clip_info = gr.Textbox(label="Info", lines=2)
180
+
181
+ transcribe_btn.click(
182
+ transcribe_video,
183
+ inputs=[video_input],
184
+ outputs=[transcript_output, gr.State(), sentences_state],
185
+ ).then(
186
+ build_selector,
187
+ inputs=[sentences_state],
188
+ outputs=[segment_selector],
189
+ )
190
+
191
+ clip_btn.click(
192
+ clip_video,
193
+ inputs=[video_input, sentences_state, segment_selector],
194
+ outputs=[clip_output, clip_info],
195
+ )
196
+
197
+ gr.Markdown(how_it_works)
198
+
199
+ demo.launch()
200
+
201
+
202
+ if __name__ == "__main__":
203
+ launch()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ funasr>=1.2.0
4
+ modelscope
5
+ huggingface_hub
6
+ moviepy
7
+ gradio
8
+ numpy<2.0
9
+ librosa
10
+ soundfile