# xiaoyunchong.xyc
# fix: gradio TypeError with Number value=None, pin gradio version
# 8ced29b
import os
import spaces
# Names "hf" as the model hub in use.
# NOTE(review): nothing in the visible code reads this constant (the AutoModel
# calls pass hub="hf" literally) — confirm whether it is still needed.
REPO_TYPE = "hf"
from huggingface_hub import snapshot_download
# Repository identifiers passed to snapshot_download for each model.
FUN_ASR_NANO_REPO_ID = "FunAudioLLM/Fun-ASR-Nano-2512"
SENSE_VOICE_SMALL_REPO_ID = "FunAudioLLM/SenseVoiceSmall"
VAD_MODEL_REPO_ID = "funasr/fsmn-vad"

# Local cache root; it is created at import time, and every model gets its own
# sub-directory beneath it.
MODEL_CACHE_DIR = "./models"
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
FUN_ASR_NANO_LOCAL_PATH, SENSE_VOICE_SMALL_LOCAL_PATH, VAD_MODEL_LOCAL_PATH = (
    os.path.join(MODEL_CACHE_DIR, folder)
    for folder in ("Fun-ASR-Nano", "SenseVoiceSmall", "fsmn-vad")
)
def download_model_if_not_exists(repo_id, local_path, model_name):
    """Fetch a model snapshot into *local_path* unless that path already exists.

    Args:
        repo_id: Repository identifier handed to ``snapshot_download``.
        local_path: Target directory; its mere existence counts as
            "already downloaded".
        model_name: Human-readable name used only in the progress messages.
    """
    already_present = os.path.exists(local_path)
    if already_present:
        print(f"{model_name} found locally, skipping download.")
        return

    print(f"Downloading {model_name} to {local_path} ...")
    # Files matching *.onnx are excluded from the snapshot.
    snapshot_download(
        repo_id=repo_id,
        local_dir=local_path,
        ignore_patterns=["*.onnx"],
    )
    print(f"{model_name} downloaded.")
# Ensure each model the app uses is present locally; this runs at import time,
# before the UI further down is built.
for _repo_id, _local_path, _model_name in (
    (FUN_ASR_NANO_REPO_ID, FUN_ASR_NANO_LOCAL_PATH, "Fun-ASR-Nano"),
    (SENSE_VOICE_SMALL_REPO_ID, SENSE_VOICE_SMALL_LOCAL_PATH, "SenseVoiceSmall"),
    (VAD_MODEL_REPO_ID, VAD_MODEL_LOCAL_PATH, "VAD Model"),
):
    download_model_if_not_exists(_repo_id, _local_path, _model_name)
import gradio as gr
import time
import tempfile
import logging
import torch
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
# Configure logging once at import time: INFO level, each record prefixed with
# its timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Cache of already-built models, keyed by pipeline type. Nothing is loaded at
# import time; a model is built the first time it is requested, which happens
# inside the @spaces.GPU-decorated handler.
loaded_models = {}


def get_or_load_model(pipeline_type):
    """Return the model for *pipeline_type*, building and caching it on first use.

    Args:
        pipeline_type: Either ``"fun-asr-nano"`` or ``"sensevoice"``.

    Returns:
        The ``AutoModel`` instance stored in ``loaded_models`` for that pipeline.

    Raises:
        ValueError: If *pipeline_type* is not one of the supported names.
    """
    try:
        return loaded_models[pipeline_type]
    except KeyError:
        pass  # Not built yet; fall through and construct it.

    # The two pipelines differ only in model directory and whether remote code
    # is trusted; everything else passed to AutoModel is shared.
    if pipeline_type == "fun-asr-nano":
        model_dir, allow_remote_code = FUN_ASR_NANO_LOCAL_PATH, True
    elif pipeline_type == "sensevoice":
        model_dir, allow_remote_code = SENSE_VOICE_SMALL_LOCAL_PATH, False
    else:
        raise ValueError(f"Unknown pipeline type: {pipeline_type}")

    built = AutoModel(
        model=model_dir,
        trust_remote_code=allow_remote_code,
        vad_model=VAD_MODEL_LOCAL_PATH,
        vad_kwargs={"max_single_segment_time": 30000},
        device="cuda",
        disable_update=True,
        hub="hf",
    )
    loaded_models[pipeline_type] = built
    return built
@spaces.GPU(duration=120)
def transcribe_audio(audio_input, audio_url, pipeline_type, start_time=None, end_time=None):
    """Transcribe an uploaded/recorded file or a remote audio URL.

    The local file in *audio_input* wins when both sources are given. The audio
    can optionally be trimmed to ``[start_time, end_time]`` before recognition.

    Args:
        audio_input: Path of the uploaded or recorded audio file, or None/empty.
        audio_url: URL to download the audio from when no file is supplied.
        pipeline_type: ``"fun-asr-nano"`` or ``"sensevoice"``; forwarded to
            ``get_or_load_model``.
        start_time: Trim start in seconds. None or a value <= 0 means
            "from the beginning".
        end_time: Trim end in seconds. None or a value <= 0 means
            "to the end of the audio".

    Returns:
        A 3-tuple ``(metrics, transcription, txt_path)``. On success *txt_path*
        is a temporary ``.txt`` file holding the transcription. On any failure
        the tuple is ``(error_message, "", None)``; exceptions are logged and
        never propagate to the caller.
    """
    # Every temporary audio file created on the way is recorded here so the
    # finally block can remove all of them, including a downloaded file that is
    # later superseded by its trimmed copy.
    temp_audio_paths = []
    try:
        # Determine audio source
        if audio_input is not None and len(audio_input) > 0:
            audio_path = audio_input
        elif audio_url is not None and len(audio_url.strip()) > 0:
            import requests as req

            # NOTE(review): the URL is user-supplied and fetched server-side;
            # consider restricting schemes/hosts if this runs on a private network.
            with req.get(audio_url.strip(), stream=True, timeout=30) as response:
                if response.status_code != 200:
                    return f"Failed to download audio: HTTP {response.status_code}", "", None
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
                    # Register before writing so a failed stream is still cleaned up.
                    temp_audio_paths.append(f.name)
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            audio_path = f.name
        else:
            return "No audio provided. Upload a file, record, or enter a URL.", "", None

        # A cleared number field (or an omitted argument) arrives as None;
        # treat it as "no trimming on that side" instead of comparing None > 0.
        start_s = float(start_time) if start_time is not None else 0.0
        end_s = float(end_time) if end_time is not None else 0.0

        # Trim if needed
        if start_s > 0 or end_s > 0:
            from pydub import AudioSegment

            audio = AudioSegment.from_file(audio_path)
            duration = len(audio) / 1000  # pydub lengths are in milliseconds
            clip_start = start_s if start_s > 0 else 0
            clip_end = end_s if end_s > 0 else duration
            if clip_end <= clip_start:
                return (
                    f"Invalid trim range: start {clip_start:g}s must be before "
                    f"end {clip_end:g}s (audio length {duration:.2f}s).",
                    "",
                    None,
                )
            trimmed = audio[int(clip_start * 1000):int(clip_end * 1000)]
            # Only the unique name is needed; close the handle right away so it
            # is not leaked, then let pydub write to that path.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                temp_audio_paths.append(tmp.name)
            trimmed.export(tmp.name, format="wav")
            audio_path = tmp.name

        # Load model (lazy, inside GPU context)
        model = get_or_load_model(pipeline_type)

        # Transcribe
        t0 = time.time()
        if pipeline_type == "fun-asr-nano":
            res = model.generate(input=[audio_path], use_itn=True, batch_size=1)
        else:
            res = model.generate(
                input=audio_path, cache={}, language="auto",
                use_itn=True, batch_size_s=60, merge_vad=True, merge_length_s=15,
            )
        transcription = rich_transcription_postprocess(res[0]["text"])
        elapsed = time.time() - t0
        metrics = f"Transcription time: {elapsed:.2f}s\nPipeline: {pipeline_type}\nDevice: cuda"

        # Save transcription file; it is kept (delete=False) because its path is
        # returned to the caller as the downloadable result.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".txt", mode="w", encoding="utf-8"
        ) as txt_file:
            txt_file.write(transcription)
        return metrics, transcription, txt_file.name
    except Exception as e:
        # Top-level boundary for the UI handler: log with traceback and report
        # the message to the user instead of raising.
        logging.exception("Transcription error: %s", e)
        return f"Error: {str(e)}", "", None
    finally:
        for stale_path in temp_audio_paths:
            try:
                os.remove(stale_path)
            except FileNotFoundError:
                pass  # Already gone; nothing to clean up.
            except OSError as cleanup_error:
                logging.warning("Could not remove temp file %s: %s", stale_path, cleanup_error)
# Gradio UI definition. `demo` is the Blocks app launched at the bottom.
# NOTE(review): the nesting below was reconstructed from a listing that had lost
# its indentation — confirm which components belong inside each Row.
with gr.Blocks(title="Fun-ASR-Nano | GPU Demo") as demo:
    # Page header with links to the project and model pages.
    gr.Markdown("""
# Fun-ASR-Nano: LLM-Powered Speech Recognition (GPU)
End-to-end ASR model trained on tens of millions of hours, supporting **31 languages** including Chinese dialects.
- **GitHub**: [Fun-ASR](https://github.com/FunAudioLLM/Fun-ASR) | [FunASR Toolkit](https://github.com/modelscope/FunASR)
- **Model**: [Fun-ASR-Nano-2512](https://huggingface.co/FunAudioLLM/Fun-ASR-Nano-2512)
""")
    # Audio sources: a file/microphone widget that yields a file path, or a URL.
    with gr.Row():
        audio_input = gr.Audio(label="Upload or Record Audio", sources=["upload", "microphone"], type="filepath")
        audio_url = gr.Textbox(label="Or Enter Audio URL", placeholder="https://example.com/audio.wav")
    # Model choice and optional trim bounds; 0 in either field means "no bound"
    # to transcribe_audio.
    with gr.Row():
        pipeline_type = gr.Dropdown(
            choices=["fun-asr-nano", "sensevoice"],
            label="Model",
            value="fun-asr-nano"
        )
        start_time = gr.Number(label="Start Time (s)", value=0, minimum=0)
        end_time = gr.Number(label="End Time (s)", value=0, minimum=0)
    transcribe_btn = gr.Button("Transcribe", variant="primary")
    # Outputs, in the same order as transcribe_audio's 3-tuple return value.
    with gr.Row():
        metrics_output = gr.Textbox(label="Metrics", lines=4)
        transcription_output = gr.Textbox(label="Transcription", lines=10)
    transcription_file = gr.File(label="Download")
    # Wire the button to the handler; inputs map positionally onto its parameters.
    transcribe_btn.click(
        transcribe_audio,
        inputs=[audio_input, audio_url, pipeline_type, start_time, end_time],
        outputs=[metrics_output, transcription_output, transcription_file],
    )
    gr.Markdown("""
### Supported Languages
- **Fun-ASR-Nano**: 31 languages + Chinese dialects (Cantonese, Sichuan, Shanghai, Minnan, etc.)
- **SenseVoice**: Chinese, English, Cantonese, Japanese, Korean
""")
# Enable the request queue, then start serving the app.
demo.queue().launch()