Spaces:

FunAudioLLM
/

FunClip

Runtime error

Zhifu Gao committed 21 days ago

Commit

a9f639a

1 Parent(s): 94ad952

feat: initial FunClip demo - AI video clipping with FunASR

- Upload video → auto-transcribe with timestamps → select & clip
- Uses FunASR Paraformer for Chinese speech recognition
- FFmpeg-based precise video segment extraction
- Links to GitHub repos (FunClip, FunASR, Fun-ASR)

Files changed (3) hide show

README.md +20 -7
app.py +203 -0
requirements.txt +10 -0

README.md CHANGED Viewed

@@ -1,13 +1,26 @@
 ---
 title: FunClip
-emoji: 😻
-colorFrom: gray
-colorTo: blue
 sdk: gradio
-sdk_version: 6.14.0
-python_version: '3.13'
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: FunClip
+emoji: ✂️
+colorFrom: red
+colorTo: yellow
 sdk: gradio
+sdk_version: 5.9.1
 app_file: app.py
+pinned: true
+license: mit
+short_description: "AI Video Clipping: speak to clip, powered by FunASR + LLM"
 ---
+# FunClip: AI-Powered Video Clipping
+Upload a video → FunASR transcribes all speech with timestamps → Select segments by text → Export precise clips automatically.
+## Features
+- 🎬 Automatic speech-to-text with word-level timestamps
+- ✂️ Click on any sentence to create a clip
+- 🤖 LLM-assisted smart clipping (find highlights automatically)
+- 🌍 Multi-language support (Chinese, English, Japanese, Korean, etc.)
+## Links
+- **GitHub**: [FunClip](https://github.com/modelscope/FunClip) (⭐ 5.6k+)
+- **ASR Engine**: [FunASR](https://github.com/modelscope/FunASR) | [Fun-ASR](https://github.com/FunAudioLLM/Fun-ASR)

app.py ADDED Viewed

	@@ -0,0 +1,203 @@

+import os
+import json
+import tempfile
+import subprocess
+import gradio as gr
+import numpy as np
+import torch
+from funasr import AutoModel
# Build the FunASR model once, at import time, so that every call to
# transcribe_video() shares this single instance. Inference is pinned to CPU.
# NOTE(review): both `hub` and `model_hub` are set to "hf"; confirm which of
# the two keywords the installed funasr version actually reads, and that this
# model id can be resolved from the Hugging Face hub.
model = AutoModel(
    model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
    hub="hf",
    model_hub="hf",
    device="cpu",
)
def extract_audio(video_path):
    """Extract the audio track of *video_path* into a temporary WAV file.

    ffmpeg converts the audio to 16 kHz, mono, signed 16-bit PCM.

    Args:
        video_path: Path of the input media file handed to ffmpeg.

    Returns:
        Path of the temporary ``.wav`` file. If ffmpeg cannot be started or
        exits with a non-zero status, the file is removed before returning, so
        callers detect failure by checking whether the returned path exists.
    """
    # mkstemp creates the file atomically; the deprecated mktemp only returns
    # a name and leaves a window in which another process can claim it.
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    cmd = [
        "ffmpeg", "-i", video_path, "-vn", "-acodec", "pcm_s16le",
        "-ar", "16000", "-ac", "1", "-y", audio_path
    ]
    try:
        succeeded = subprocess.run(cmd, capture_output=True).returncode == 0
    except OSError:
        # ffmpeg is not installed or could not be executed.
        succeeded = False
    if not succeeded and os.path.exists(audio_path):
        # Never hand a missing/partial WAV to the recogniser: a non-existent
        # path is the failure signal the caller already checks for.
        os.unlink(audio_path)
    return audio_path
def transcribe_video(video_path, progress=gr.Progress()):
    """Transcribe the speech in a video and return per-sentence timestamps.

    Args:
        video_path: Path of the uploaded video, or ``None`` if nothing was
            uploaded.
        progress: Gradio progress callback used to report the current stage.

    Returns:
        A 3-tuple ``(display_text, sentences, sentences_json)``:

        * ``display_text`` - one ``[start - end] text`` line per sentence, or
          an error/plain-text message when no sentence list is available.
        * ``sentences`` - list of ``{"start", "end", "text"}`` dicts with the
          times converted from milliseconds to seconds (empty on failure).
        * ``sentences_json`` - the same list serialised as JSON, or ``None``
          when there are no sentences.
    """
    if video_path is None:
        return "Please upload a video file.", [], None

    progress(0.1, desc="Extracting audio...")
    audio_path = extract_audio(video_path)
    if not os.path.exists(audio_path):
        return "Failed to extract audio from video. Make sure it contains an audio track.", [], None

    progress(0.3, desc="Transcribing speech...")
    try:
        # FunASR only adds "sentence_info" to its result when sentence-level
        # timestamps are requested; without this flag the sentence list (and
        # therefore the clip selector) would always stay empty.
        res = model.generate(
            input=audio_path,
            batch_size_s=300,
            sentence_timestamp=True,
        )
    except Exception as e:
        # UI boundary: report any recogniser failure to the user instead of
        # crashing the Gradio event handler.
        return f"Transcription error: {str(e)}", [], None
    finally:
        # The intermediate WAV is no longer needed, whatever the outcome.
        if os.path.exists(audio_path):
            os.unlink(audio_path)

    if not res or not res[0].get("sentence_info"):
        # No sentence segmentation available: fall back to the raw text.
        text = res[0].get("text", "") if res else ""
        return text, [], None

    progress(0.8, desc="Processing timestamps...")
    sentences = [
        {
            "start": sent["start"] / 1000.0,  # milliseconds -> seconds
            "end": sent["end"] / 1000.0,
            "text": sent["text"],
        }
        for sent in res[0]["sentence_info"]
    ]
    full_text = "\n".join(
        f"[{s['start']:.1f}s - {s['end']:.1f}s] {s['text']}" for s in sentences
    )
    progress(1.0, desc="Done!")
    return full_text, sentences, json.dumps(sentences, ensure_ascii=False)
def _selection_to_indices(selected_indices):
    """Return the sentence indices encoded in the selected checkbox values."""
    indices = []
    for item in selected_indices:
        # Selector labels look like "3: [1.0s-2.5s] text"; plain integers and
        # numeric strings are accepted as well.
        head = str(item).split(":", 1)[0].strip()
        try:
            indices.append(int(head))
        except ValueError:
            # Not a recognisable label: ignore it rather than abort the clip.
            continue
    return indices


def _merge_close_segments(clips, max_gap=0.5):
    """Merge time-sorted (start, end) pairs separated by less than *max_gap* seconds."""
    merged = [clips[0]]
    for start, end in clips[1:]:
        prev_start, prev_end = merged[-1]
        if start - prev_end < max_gap:
            # max() keeps the merged range from shrinking when ranges overlap.
            merged[-1] = (prev_start, max(prev_end, end))
        else:
            merged.append((start, end))
    return merged


def _remove_if_exists(path):
    """Delete *path* if it is present."""
    if os.path.exists(path):
        os.unlink(path)


def clip_video(video_path, sentences_json, selected_indices):
    """Cut the selected sentences out of a video and join them into one clip.

    Args:
        video_path: Path of the source video.
        sentences_json: JSON list of ``{"start", "end", "text"}`` objects with
            times in seconds, as produced by the transcription step.
        selected_indices: Selected checkbox values. Each entry is either a
            selector label of the form ``"<index>: ..."`` or a bare index.

    Returns:
        ``(output_path, message)``. ``output_path`` is the generated ``.mp4``
        file, or ``None`` when nothing could be clipped, in which case
        ``message`` explains why.
    """
    if not video_path or not sentences_json or not selected_indices:
        return None, "Please transcribe a video first, then select segments to clip."

    sentences = json.loads(sentences_json)
    indices = _selection_to_indices(selected_indices)
    if not indices:
        return None, "No segments selected."

    clips = [
        (sentences[idx]["start"], sentences[idx]["end"])
        for idx in sorted(set(indices))
        if 0 <= idx < len(sentences)
    ]
    if not clips:
        return None, "Invalid selection."

    merged = _merge_close_segments(clips)

    # mkstemp creates the file atomically (mktemp is deprecated and racy);
    # ffmpeg overwrites it because of "-y".
    fd, output_path = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)

    chains = []
    concat_inputs = []
    for i, (start, end) in enumerate(merged):
        chains.append(
            f"[0:v]trim=start={start:.3f}:end={end:.3f},setpts=PTS-STARTPTS[v{i}]"
        )
        chains.append(
            f"[0:a]atrim=start={start:.3f}:end={end:.3f},asetpts=PTS-STARTPTS[a{i}]"
        )
        # The concat filter expects its inputs grouped per segment
        # ([v0][a0][v1][a1]...), not all video pads followed by all audio pads.
        concat_inputs.append(f"[v{i}][a{i}]")
    chains.append(
        f"{''.join(concat_inputs)}concat=n={len(merged)}:v=1:a=1[outv][outa]"
    )
    filter_complex = ";".join(chains)

    cmd = [
        "ffmpeg", "-i", video_path, "-filter_complex", filter_complex,
        "-map", "[outv]", "-map", "[outa]", "-y", output_path
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # ffmpeg is not installed or could not be executed.
        _remove_if_exists(output_path)
        return None, f"FFmpeg error: {exc}"
    if result.returncode != 0:
        _remove_if_exists(output_path)
        # Only the tail of stderr is shown; that is where ffmpeg reports the error.
        return None, f"FFmpeg error: {result.stderr[-500:]}"

    total_duration = sum(end - start for start, end in merged)
    return output_path, f"Clipped {len(merged)} segment(s), total {total_duration:.1f}s"
# HTML header rendered at the top of the page via gr.HTML(): title, tagline,
# a one-line summary of the workflow and links to the related GitHub projects.
description_html = """
<div style="text-align: center; max-width: 850px; margin: 0 auto;">
    <h1 style="font-size: 2.2em; margin-bottom: 0.1em;">✂️ FunClip</h1>
    <p style="font-size: 1.3em; color: #444;">AI Video Clipping — Speak to Clip</p>
    <p style="font-size: 1em; color: #666;">
        Upload a video → Auto-transcribe with timestamps → Select text segments → Export precise clips
    </p>
    <p style="font-size: 0.9em; margin-top: 0.8em;">
        <a href="https://github.com/modelscope/FunClip" target="_blank">⭐ GitHub (5.6k+ stars)</a> ·
        <a href="https://github.com/modelscope/FunASR" target="_blank">🛠️ FunASR</a> ·
        <a href="https://github.com/FunAudioLLM/Fun-ASR" target="_blank">🚀 Fun-ASR</a>
    </p>
</div>
"""
# Markdown help text rendered below the tabs via gr.Markdown(); it lists the
# four steps of the workflow and points to the full FunClip project.
how_it_works = """
### How It Works
1. **Upload** a video (any format with audio)
2. **Transcribe** — FunASR extracts speech with precise timestamps
3. **Select** the sentences you want to keep (by index)
4. **Clip** — FFmpeg cuts and concatenates the selected segments
For the full experience with LLM-assisted smart clipping, install [FunClip](https://github.com/modelscope/FunClip) locally.
"""
def build_selector(sentences_json):
    """Build the update that refreshes the segment checkbox group.

    Args:
        sentences_json: JSON list of ``{"start", "end", "text"}`` objects, or
            an empty/``None`` value when no transcription is available.

    Returns:
        A ``gr.update`` whose choices are one ``"<index>: [<start>s-<end>s]
        <text>"`` label per sentence (none when there is no transcription) and
        whose selection is cleared.
    """
    labels = []
    if sentences_json:
        for position, entry in enumerate(json.loads(sentences_json)):
            span = f"[{entry['start']:.1f}s-{entry['end']:.1f}s]"
            labels.append(f"{position}: {span} {entry['text']}")
    return gr.update(choices=labels, value=[])
def launch():
    """Build the two-tab Gradio interface, wire its events and start the app."""
    with gr.Blocks(theme=gr.themes.Soft(), title="FunClip - AI Video Clipping") as demo:
        gr.HTML(description_html)
        # Holds the JSON-encoded sentence list produced by transcribe_video;
        # it is the hand-off between the transcription and clipping steps.
        sentences_state = gr.State("")
        with gr.Tab("1. Transcribe"):
            with gr.Row():
                video_input = gr.Video(label="Upload Video")
            transcribe_btn = gr.Button("🎙️ Transcribe Speech", variant="primary", size="lg")
            transcript_output = gr.Textbox(label="Transcription with Timestamps", lines=12, show_copy_button=True)
        with gr.Tab("2. Clip"):
            # Choices start empty and are filled in by build_selector.
            segment_selector = gr.CheckboxGroup(
                label="Select segments to clip",
                choices=[],
            )
            clip_btn = gr.Button("✂️ Generate Clip", variant="primary", size="lg")
            with gr.Row():
                clip_output = gr.Video(label="Output Clip")
                clip_info = gr.Textbox(label="Info", lines=2)
        # transcribe_video returns three values. The second one (the sentence
        # list) goes to a throwaway gr.State() that nothing else reads; the
        # third (its JSON form) goes to sentences_state. The chained step then
        # rebuilds the checkbox choices from sentences_state.
        transcribe_btn.click(
            transcribe_video,
            inputs=[video_input],
            outputs=[transcript_output, gr.State(), sentences_state],
        ).then(
            build_selector,
            inputs=[sentences_state],
            outputs=[segment_selector],
        )
        # Clipping reads the original upload, the stored sentences and the
        # current checkbox selection.
        clip_btn.click(
            clip_video,
            inputs=[video_input, sentences_state, segment_selector],
            outputs=[clip_output, clip_info],
        )
        gr.Markdown(how_it_works)
    demo.launch()
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+torch
+torchaudio
+funasr>=1.2.0
+modelscope
+huggingface_hub
+moviepy
+gradio
+numpy<2.0
+librosa
+soundfile