# Spaces: FunAudioLLM / FunClip
# Runtime error
# File size: 6,794 Bytes
# a9f639a

import os
import json
import tempfile
import subprocess
import gradio as gr
import numpy as np
import torch

from funasr import AutoModel

# Speech-recognition model, constructed once at import time and reused by
# every call to transcribe_video(); inference is pinned to the CPU.
# NOTE(review): both `hub` and `model_hub` are passed with the same value —
# presumably to cover different funasr releases; confirm which keyword the
# installed version actually reads.
model = AutoModel(
    model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
    hub="hf",
    model_hub="hf",
    device="cpu",
)


def extract_audio(video_path):
    """Extract the audio track of a media file into a temporary WAV file.

    ffmpeg is invoked with the video stream dropped and the audio converted
    to 16 kHz, mono, signed 16-bit PCM.

    Args:
        video_path: Path of the input media file.

    Returns:
        Path of a temporary ``.wav`` file. The file exists only when the
        extraction succeeded; on failure the path is still returned but
        nothing is left on disk, so callers can test ``os.path.exists`` on
        it. The caller owns the file and is responsible for deleting it.

    Raises:
        OSError: If the ffmpeg executable cannot be started (for example
            when it is not installed).
    """
    # mkstemp creates the file atomically; the deprecated mktemp only picks
    # a name, which another process could claim before ffmpeg opens it.
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    cmd = [
        "ffmpeg", "-i", video_path, "-vn", "-acodec", "pcm_s16le",
        "-ar", "16000", "-ac", "1", "-y", audio_path
    ]

    try:
        result = subprocess.run(cmd, capture_output=True)
    except OSError:
        # ffmpeg could not be launched: do not leak the placeholder file.
        if os.path.exists(audio_path):
            os.unlink(audio_path)
        raise

    # A non-zero exit status or an empty output means no usable audio was
    # written; remove the leftover so "file exists" keeps meaning "success".
    succeeded = (
        result.returncode == 0
        and os.path.exists(audio_path)
        and os.path.getsize(audio_path) > 0
    )
    if not succeeded and os.path.exists(audio_path):
        os.unlink(audio_path)

    return audio_path


def transcribe_video(video_path, progress=gr.Progress()):
    """Run speech recognition on a video and format the timestamped result.

    Args:
        video_path: Path of the uploaded video, or None when nothing was
            uploaded.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        A 3-tuple ``(display_text, sentences, sentences_json)``:
        ``display_text`` is either the transcript (one
        ``[start - end] text`` line per sentence) or a user-facing message;
        ``sentences`` is a list of ``{"start", "end", "text"}`` dicts with
        times in seconds; ``sentences_json`` is that list serialized as
        JSON. On any failure, or when the recognizer returns no
        ``sentence_info``, the last two are ``[]`` and ``None``.
    """
    if video_path is None:
        return "Please upload a video file.", [], None

    progress(0.1, desc="Extracting audio...")
    wav_path = extract_audio(video_path)
    if not os.path.exists(wav_path):
        return "Failed to extract audio from video. Make sure it contains an audio track.", [], None

    progress(0.3, desc="Transcribing speech...")
    try:
        recognized = model.generate(input=wav_path, batch_size_s=300)
    except Exception as err:
        return "Transcription error: " + str(err), [], None
    finally:
        # The temporary WAV is no longer needed whether or not decoding worked.
        if os.path.exists(wav_path):
            os.unlink(wav_path)

    # Without per-sentence timing only the plain text (if any) can be shown.
    # NOTE(review): whether "sentence_info" is present presumably depends on
    # funasr options not set in this call — confirm against the library.
    if not recognized:
        return "", [], None
    first_result = recognized[0]
    if not first_result.get("sentence_info"):
        return first_result.get("text", ""), [], None

    progress(0.8, desc="Processing timestamps...")
    # The recognizer's start/end values are treated as milliseconds and
    # converted to seconds here.
    raw_fields = (
        (item["start"], item["end"], item["text"])
        for item in first_result["sentence_info"]
    )
    sentences = [
        {"start": begin_ms / 1000.0, "end": finish_ms / 1000.0, "text": spoken}
        for begin_ms, finish_ms, spoken in raw_fields
    ]

    transcript_lines = []
    for entry in sentences:
        transcript_lines.append(
            f"[{entry['start']:.1f}s - {entry['end']:.1f}s] {entry['text']}"
        )
    full_text = "\n".join(transcript_lines)

    progress(1.0, desc="Done!")
    serialized = json.dumps(sentences, ensure_ascii=False)
    return full_text, sentences, serialized


def _parse_segment_index(choice):
    """Return the sentence index encoded in one selector entry, or None."""
    try:
        if isinstance(choice, (int, float)):
            return int(choice)
        # Selector labels look like "3: [1.0s-2.5s] text"; the index is the
        # part before the first colon. A bare "3" is accepted as well.
        return int(str(choice).split(":", 1)[0].strip())
    except (ValueError, OverflowError):
        return None


def _merge_segments(segments, max_gap=0.5):
    """Sort (start, end) pairs and fuse those less than max_gap seconds apart."""
    merged = []
    for start, end in sorted(segments):
        if merged and start - merged[-1][1] < max_gap:
            prev_start, prev_end = merged[-1]
            # max() keeps a longer earlier segment from being cut short.
            merged[-1] = (prev_start, max(prev_end, end))
        else:
            merged.append((start, end))
    return merged


def _discard_file(path):
    """Delete path if it exists."""
    if os.path.exists(path):
        os.unlink(path)


def clip_video(video_path, sentences_json, selected_indices):
    """Cut the selected sentences out of a video and join them into one clip.

    Args:
        video_path: Path of the source video.
        sentences_json: JSON list of ``{"start", "end", "text"}`` objects
            with times in seconds, as produced by ``transcribe_video``.
        selected_indices: Selected entries. Each may be an integer index, a
            numeric string, or a selector label of the form
            ``"<index>: [<start>s-<end>s] <text>"``; entries that carry no
            index are ignored.

    Returns:
        ``(output_path, message)``. ``output_path`` is the path of the
        generated ``.mp4`` or ``None`` when nothing was produced, in which
        case ``message`` explains why.
    """
    if not video_path or not sentences_json or not selected_indices:
        return None, "Please transcribe a video first, then select segments to clip."

    sentences = json.loads(sentences_json)

    parsed = (_parse_segment_index(choice) for choice in selected_indices)
    indices = sorted({idx for idx in parsed if idx is not None})
    if not indices:
        return None, "No segments selected."

    clips = [
        (sentences[idx]["start"], sentences[idx]["end"])
        for idx in indices
        if 0 <= idx < len(sentences)
    ]
    if not clips:
        return None, "Invalid selection."

    merged = _merge_segments(clips)

    # One trim/atrim pair per segment, each reset to start at t=0, followed
    # by a concat filter that joins all of them in order.
    filter_parts = [
        f"[0:v]trim=start={start:.3f}:end={end:.3f},setpts=PTS-STARTPTS[v{i}];"
        f"[0:a]atrim=start={start:.3f}:end={end:.3f},asetpts=PTS-STARTPTS[a{i}];"
        for i, (start, end) in enumerate(merged)
    ]
    concat_v = "".join(f"[v{i}]" for i in range(len(merged)))
    concat_a = "".join(f"[a{i}]" for i in range(len(merged)))
    filter_parts.append(f"{concat_v}{concat_a}concat=n={len(merged)}:v=1:a=1[outv][outa]")
    filter_complex = "".join(filter_parts)

    # mkstemp reserves the output name atomically (mktemp is deprecated and
    # race-prone); ffmpeg overwrites the placeholder because of "-y".
    fd, output_path = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)

    cmd = [
        "ffmpeg", "-i", video_path, "-filter_complex", filter_complex,
        "-map", "[outv]", "-map", "[outa]", "-y", output_path
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # ffmpeg could not be started at all (e.g. not installed).
        _discard_file(output_path)
        return None, f"FFmpeg error: {exc}"

    if result.returncode != 0:
        _discard_file(output_path)
        return None, f"FFmpeg error: {result.stderr[-500:]}"

    total_duration = sum(end - start for start, end in merged)
    return output_path, f"Clipped {len(merged)} segment(s), total {total_duration:.1f}s"


# HTML banner rendered at the top of the page by launch(): title, tagline,
# a one-line workflow summary and links to the related projects.
description_html = """
<div style="text-align: center; max-width: 850px; margin: 0 auto;">
    <h1 style="font-size: 2.2em; margin-bottom: 0.1em;">✂️ FunClip</h1>
    <p style="font-size: 1.3em; color: #444;">AI Video Clipping — Speak to Clip</p>
    <p style="font-size: 1em; color: #666;">
        Upload a video → Auto-transcribe with timestamps → Select text segments → Export precise clips
    </p>
    <p style="font-size: 0.9em; margin-top: 0.8em;">
        <a href="https://github.com/modelscope/FunClip" target="_blank">⭐ GitHub (5.6k+ stars)</a> ·
        <a href="https://github.com/modelscope/FunASR" target="_blank">🛠️ FunASR</a> ·
        <a href="https://github.com/FunAudioLLM/Fun-ASR" target="_blank">🚀 Fun-ASR</a>
    </p>
</div>
"""

# Markdown help text rendered at the bottom of the page by launch().
how_it_works = """
### How It Works
1. **Upload** a video (any format with audio)
2. **Transcribe** — FunASR extracts speech with precise timestamps
3. **Select** the sentences you want to keep (by index)
4. **Clip** — FFmpeg cuts and concatenates the selected segments

For the full experience with LLM-assisted smart clipping, install [FunClip](https://github.com/modelscope/FunClip) locally.
"""


def build_selector(sentences_json):
    """Build the checkbox choices for the segment selector.

    Args:
        sentences_json: JSON list of ``{"start", "end", "text"}`` objects,
            or an empty/None value when no transcript is available.

    Returns:
        A ``gr.update`` that replaces the selector's choices with one
        ``"<index>: [<start>s-<end>s] <text>"`` label per sentence (no
        choices at all for an empty input) and clears the current selection.
    """
    labels = []
    if sentences_json:
        for position, entry in enumerate(json.loads(sentences_json)):
            labels.append(
                f"{position}: [{entry['start']:.1f}s-{entry['end']:.1f}s] {entry['text']}"
            )
    return gr.update(choices=labels, value=[])


def launch():
    """Build the two-tab Gradio interface, wire its events and start the app.

    Tab 1 uploads a video and shows the timestamped transcript; tab 2 lets
    the user tick transcript segments and renders the resulting clip.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="FunClip - AI Video Clipping") as demo:
        gr.HTML(description_html)

        # Holds the JSON-serialized sentence list between the two steps.
        sentences_state = gr.State("")

        with gr.Tab("1. Transcribe"):
            with gr.Row():
                video_input = gr.Video(label="Upload Video")
            transcribe_btn = gr.Button("🎙️ Transcribe Speech", variant="primary", size="lg")
            transcript_output = gr.Textbox(label="Transcription with Timestamps", lines=12, show_copy_button=True)

        with gr.Tab("2. Clip"):
            # Starts empty; build_selector fills in the choices after a
            # transcription has run.
            segment_selector = gr.CheckboxGroup(
                label="Select segments to clip",
                choices=[],
            )
            clip_btn = gr.Button("✂️ Generate Clip", variant="primary", size="lg")
            with gr.Row():
                clip_output = gr.Video(label="Output Clip")
                clip_info = gr.Textbox(label="Info", lines=2)

        # transcribe_video returns three values; the middle one (the raw
        # sentence list) goes to a throwaway gr.State() that nothing else
        # reads. Once it finishes, the selector is rebuilt from the JSON
        # stored in sentences_state.
        transcribe_btn.click(
            transcribe_video,
            inputs=[video_input],
            outputs=[transcript_output, gr.State(), sentences_state],
        ).then(
            build_selector,
            inputs=[sentences_state],
            outputs=[segment_selector],
        )

        # Clipping needs the original video, the stored sentence JSON and
        # the entries currently ticked in the selector.
        clip_btn.click(
            clip_video,
            inputs=[video_input, sentences_state, segment_selector],
            outputs=[clip_output, clip_info],
        )

        gr.Markdown(how_it_works)

    demo.launch()


# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    launch()