Fun-ASR-Nano-GPU-Debug

Running on Zero

File size: 7,041 Bytes

cb8606e
4a8414a
a198709
59b606f
9c07b4e
59b606f
cb8606e
a483939
 
 
 
 
 
 
59b606f
 
 
4e69efc
 
a483939
 
59b606f
 
 
a483939
59b606f
 
399aaa2
a483939
 
 
cb8606e
 
 
 
 
 
 
 
 
 
 
59b606f
 
cb8606e
 
59b606f
 
 
cb8606e
 
59b606f
 
 
 
 
 
 
 
 
cb8606e
59b606f
 
 
 
 
 
 
 
 
cb8606e
59b606f
cb8606e
59b606f
 
cb8606e
 
59b606f
 
 
 
cb8606e
 
 
 
 
 
59b606f
 
 
 
 
 
 
 
cb8606e
59b606f
cb8606e
59b606f
cb8606e
59b606f
8ced29b
59b606f
 
 
8ced29b
 
59b606f
 
 
 
 
 
 
 
 
 
 
d666310
59b606f
cb8606e
 
59b606f
 
cb8606e
 
 
59b606f
cb8606e
59b606f
cb8606e
59b606f
 
 
 
cb8606e
59b606f
cb8606e
 
59b606f
 
cb8606e
59b606f
cb8606e
 
59b606f
 
 
 
 
 
 
 
 
 
 
cb8606e
 
59b606f
cb8606e
 
59b606f
 
 
 
 
8ced29b
 
59b606f
 
 
 
 
cb8606e
59b606f
 
 
 
 
 
cb8606e
59b606f
 
 
 
 
cb8606e
 
59b606f

import os
import spaces

# Hub identifier for the model source. Not referenced again in the visible
# code: the AutoModel calls below pass hub="hf" as a literal instead.
REPO_TYPE = "hf"

from huggingface_hub import snapshot_download

# Hugging Face repositories the model snapshots are downloaded from.
FUN_ASR_NANO_REPO_ID = "FunAudioLLM/Fun-ASR-Nano-2512"
SENSE_VOICE_SMALL_REPO_ID = "FunAudioLLM/SenseVoiceSmall"
VAD_MODEL_REPO_ID = "funasr/fsmn-vad"

# Each model gets its own sub-directory under one shared cache root.
MODEL_CACHE_DIR = "./models"
(
    FUN_ASR_NANO_LOCAL_PATH,
    SENSE_VOICE_SMALL_LOCAL_PATH,
    VAD_MODEL_LOCAL_PATH,
) = (
    os.path.join(MODEL_CACHE_DIR, folder)
    for folder in ("Fun-ASR-Nano", "SenseVoiceSmall", "fsmn-vad")
)

# Create the cache root up front; a pre-existing directory is not an error.
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)


def download_model_if_not_exists(repo_id, local_path, model_name):
    """Download a model snapshot unless ``local_path`` already exists.

    Only the existence of ``local_path`` is checked, so a directory left
    behind by an interrupted download is treated as already present.

    Args:
        repo_id: Repository identifier passed to ``snapshot_download``.
        local_path: Directory the snapshot is written to.
        model_name: Human-readable label used in the progress messages.
    """
    already_present = os.path.exists(local_path)
    if already_present:
        print(f"{model_name} found locally, skipping download.")
        return

    print(f"Downloading {model_name} to {local_path} ...")
    # ONNX exports are excluded from the download.
    snapshot_download(
        repo_id=repo_id,
        local_dir=local_path,
        ignore_patterns=["*.onnx"],
    )
    print(f"{model_name} downloaded.")


# Runs at import time: fetch each model snapshot that is not on disk yet.
for _repo_id, _local_path, _label in (
    (FUN_ASR_NANO_REPO_ID, FUN_ASR_NANO_LOCAL_PATH, "Fun-ASR-Nano"),
    (SENSE_VOICE_SMALL_REPO_ID, SENSE_VOICE_SMALL_LOCAL_PATH, "SenseVoiceSmall"),
    (VAD_MODEL_REPO_ID, VAD_MODEL_LOCAL_PATH, "VAD Model"),
):
    download_model_if_not_exists(_repo_id, _local_path, _label)

import gradio as gr
import time
import tempfile
import logging
import torch
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Cache of already-built models keyed by pipeline type. Nothing is loaded at
# import time; entries are added on first use, from inside the
# @spaces.GPU-decorated handler.
loaded_models = {}


def get_or_load_model(pipeline_type):
    """Return the model for ``pipeline_type``, building and caching it on first use.

    Args:
        pipeline_type: ``"fun-asr-nano"`` or ``"sensevoice"``.

    Returns:
        The cached ``AutoModel`` instance for that pipeline.

    Raises:
        ValueError: If ``pipeline_type`` is not cached and is not one of the
            two known pipeline names.
    """
    try:
        return loaded_models[pipeline_type]
    except KeyError:
        pass

    # The two pipelines differ only in the weights directory and in whether
    # the model's own remote code is allowed to run.
    if pipeline_type == "fun-asr-nano":
        model_dir, allow_remote_code = FUN_ASR_NANO_LOCAL_PATH, True
    elif pipeline_type == "sensevoice":
        model_dir, allow_remote_code = SENSE_VOICE_SMALL_LOCAL_PATH, False
    else:
        raise ValueError(f"Unknown pipeline type: {pipeline_type}")

    model = loaded_models[pipeline_type] = AutoModel(
        model=model_dir,
        trust_remote_code=allow_remote_code,
        vad_model=VAD_MODEL_LOCAL_PATH,
        vad_kwargs={"max_single_segment_time": 30000},
        device="cuda",
        disable_update=True,
        hub="hf",
    )
    return model


@spaces.GPU(duration=120)
def transcribe_audio(audio_input, audio_url, pipeline_type, start_time=None, end_time=None):
    """Transcribe a local audio file or a downloaded URL with the selected model.

    Args:
        audio_input: Path to a local audio file, or ``None``/empty when unused.
            Takes precedence over ``audio_url``.
        audio_url: URL fetched with ``requests`` when no local file is given.
        pipeline_type: Forwarded to ``get_or_load_model``. ``"fun-asr-nano"``
            uses the batch-of-one generate call; any other accepted value uses
            the SenseVoice-style call.
        start_time: Clip start in seconds. ``None`` or a value <= 0 means
            "from the beginning".
        end_time: Clip end in seconds. ``None`` or a value <= 0 means
            "until the end of the audio".

    Returns:
        A ``(metrics, transcription, txt_path)`` tuple. On any failure the
        first element carries the error message and the other two are ``""``
        and ``None``.
    """
    # Every temp file created here is registered so that `finally` removes all
    # of them, including a download that is later superseded by a trimmed copy.
    temp_paths = []
    try:
        # Determine audio source
        if audio_input:
            audio_path = audio_input
        elif audio_url and audio_url.strip():
            import requests as req

            # NOTE(review): the URL is user-supplied and fetched server-side
            # with no host allow-list or size cap; consider restricting it.
            with req.get(audio_url.strip(), stream=True, timeout=30) as response:
                if response.status_code != 200:
                    return f"Failed to download audio: HTTP {response.status_code}", "", None
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
                    # Register before writing so a failed download is cleaned up too.
                    temp_paths.append(f.name)
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            audio_path = f.name
        else:
            return "No audio provided. Upload a file, record, or enter a URL.", "", None

        # Normalise the trim bounds: None (the defaults, or a cleared number
        # field) and negative values both mean "not set".
        start_s = max(float(start_time or 0), 0.0)
        end_s = max(float(end_time or 0), 0.0)

        # Trim if needed
        if start_s > 0 or end_s > 0:
            from pydub import AudioSegment

            audio = AudioSegment.from_file(audio_path)
            duration_s = len(audio) / 1000  # len() of an AudioSegment is in milliseconds
            clip_end_s = end_s if end_s > 0 else duration_s
            if clip_end_s <= start_s:
                return (
                    f"Invalid time range: start ({start_s:g}s) must be before end ({clip_end_s:g}s).",
                    "",
                    None,
                )
            clip = audio[int(start_s * 1000):int(clip_end_s * 1000)]

            # Reserve a path and close our handle before pydub reopens it.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                trimmed_path = tmp.name
            temp_paths.append(trimmed_path)
            # export() returns the open output file; close it so the data is
            # flushed and the handle is not leaked.
            clip.export(trimmed_path, format="wav").close()
            audio_path = trimmed_path

        # Load model (lazy, inside GPU context)
        model = get_or_load_model(pipeline_type)

        # Transcribe
        t0 = time.time()
        if pipeline_type == "fun-asr-nano":
            res = model.generate(input=[audio_path], use_itn=True, batch_size=1)
        else:
            res = model.generate(
                input=audio_path, cache={}, language="auto",
                use_itn=True, batch_size_s=60, merge_vad=True, merge_length_s=15,
            )

        # An empty result list is reported as an empty transcription rather
        # than as an IndexError.
        transcription = rich_transcription_postprocess(res[0]["text"]) if res else ""
        elapsed = time.time() - t0

        metrics = f"Transcription time: {elapsed:.2f}s\nPipeline: {pipeline_type}\nDevice: cuda"

        # Save transcription file. It is deliberately not registered in
        # temp_paths because its path is returned to the caller.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".txt", mode="w", encoding="utf-8"
        ) as txt_file:
            txt_file.write(transcription)

        return metrics, transcription, txt_file.name

    except Exception as e:
        # Top-level boundary for the UI handler: log with traceback and report
        # the message instead of letting the exception propagate.
        logging.exception("Transcription error: %s", e)
        return f"Error: {str(e)}", "", None
    finally:
        for path in temp_paths:
            try:
                os.remove(path)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                logging.warning("Could not remove temp file %s: %s", path, cleanup_error)


# Markdown shown above the controls.
_INTRO_MARKDOWN = """
# Fun-ASR-Nano: LLM-Powered Speech Recognition (GPU)

End-to-end ASR model trained on tens of millions of hours, supporting **31 languages** including Chinese dialects.

- **GitHub**: [Fun-ASR](https://github.com/FunAudioLLM/Fun-ASR) | [FunASR Toolkit](https://github.com/modelscope/FunASR)
- **Model**: [Fun-ASR-Nano-2512](https://huggingface.co/FunAudioLLM/Fun-ASR-Nano-2512)
    """

# Markdown shown below the outputs.
_LANGUAGES_MARKDOWN = """
### Supported Languages
- **Fun-ASR-Nano**: 31 languages + Chinese dialects (Cantonese, Sichuan, Shanghai, Minnan, etc.)
- **SenseVoice**: Chinese, English, Cantonese, Japanese, Korean
    """

with gr.Blocks(title="Fun-ASR-Nano | GPU Demo") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Row 1: the two alternative audio sources (file/microphone or URL).
    with gr.Row():
        audio_input = gr.Audio(
            label="Upload or Record Audio",
            sources=["upload", "microphone"],
            type="filepath",
        )
        audio_url = gr.Textbox(
            label="Or Enter Audio URL",
            placeholder="https://example.com/audio.wav",
        )

    # Row 2: model choice and the optional trim window (both default to 0).
    with gr.Row():
        pipeline_type = gr.Dropdown(
            label="Model",
            choices=["fun-asr-nano", "sensevoice"],
            value="fun-asr-nano",
        )
        start_time = gr.Number(label="Start Time (s)", value=0, minimum=0)
        end_time = gr.Number(label="End Time (s)", value=0, minimum=0)

    transcribe_btn = gr.Button("Transcribe", variant="primary")

    # Row 3: the three values returned by transcribe_audio, in order.
    with gr.Row():
        metrics_output = gr.Textbox(label="Metrics", lines=4)
        transcription_output = gr.Textbox(label="Transcription", lines=10)
        transcription_file = gr.File(label="Download")

    handler_inputs = [audio_input, audio_url, pipeline_type, start_time, end_time]
    handler_outputs = [metrics_output, transcription_output, transcription_file]
    transcribe_btn.click(
        transcribe_audio,
        inputs=handler_inputs,
        outputs=handler_outputs,
    )

    gr.Markdown(_LANGUAGES_MARKDOWN)

# Enable request queuing, then start the app.
demo.queue().launch()