# app.py - Speech to ASL Avatar on Hugging Face Spaces (ZeroGPU compatible)
import gradio as gr
import whisper
import requests
import tempfile
import os

from spaces import GPU  # Required for ZeroGPU hardware

# Load API key from HF Space secrets (Settings → Secrets)
API_KEY = os.environ.get("SIGN_SPEAK_API_KEY")
if not API_KEY:
    raise ValueError("SIGN_SPEAK_API_KEY not set in Space secrets!")

BASE_URL = "https://api.sign-speak.com"
PRODUCE_SIGN_URL = f"{BASE_URL}/produce-sign"

# (connect, read) timeout in seconds for the Sign-Speak API.
# Video generation can be slow, so the read timeout is generous; without any
# timeout a stalled API call would block the Gradio worker indefinitely.
REQUEST_TIMEOUT = (10, 120)

# Lazily-initialized Whisper model, cached across requests so the weights are
# not re-loaded on every invocation of the GPU function.
_WHISPER_MODEL = None


def get_sign_language(text: str, request_class: str = "BLOCKING", identity: str = "MALE") -> str:
    """Generate an ASL avatar video for *text* via the Sign-Speak API.

    Runs on CPU (external API call), no GPU needed here.

    Args:
        text: English sentence to translate into ASL.
        request_class: "BLOCKING" to wait for the video, or a batch mode
            (the API may respond 202 with a batch id instead of video bytes).
        identity: Avatar identity, e.g. "MALE" / "FEMALE".

    Returns:
        Filesystem path to a temporary .mp4 file containing the signing video.

    Raises:
        ValueError: on a 202 (batch accepted, no polling implemented) or any
            non-200 API response.
        requests.RequestException: on network failure or timeout.
    """
    headers = {
        "X-api-key": API_KEY,
        "Content-Type": "application/json",
    }
    payload = {
        "english": text.strip(),
        "request_class": request_class.upper(),
        "identity": identity.upper(),
        # Optional: "model_version": "SLP.2.xs" for faster/smaller model
    }

    response = requests.post(
        PRODUCE_SIGN_URL,
        json=payload,
        headers=headers,
        timeout=REQUEST_TIMEOUT,  # fail fast instead of hanging the worker
    )

    if response.status_code == 200:
        # Save MP4 bytes to temp file (Gradio Video component needs filepath)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
            tmp.write(response.content)
            return tmp.name
    elif response.status_code == 202:
        # API accepted the job for async processing; polling is not implemented.
        data = response.json()
        batch_id = data.get("batch_id")
        raise ValueError(f"Batch processing started (ID: {batch_id}). Add polling if needed.")
    else:
        raise ValueError(f"Sign-Speak API error {response.status_code}: {response.text}")


@GPU(duration=120)  # ← ZeroGPU decorator: request GPU for up to 120 seconds
def transcribe_and_translate(audio_filepath):
    """Transcribe recorded audio with Whisper, then generate an ASL video.

    Heavy function: loads the Whisper model and transcribes audio.
    Marked with @GPU so ZeroGPU hardware is allocated for this call.

    Args:
        audio_filepath: Path to the recorded/uploaded audio file, or None
            when nothing was recorded.

    Returns:
        Tuple of (status/transcript message, video filepath or None).
    """
    global _WHISPER_MODEL

    if audio_filepath is None:
        return "No audio recorded.", None

    try:
        # Load Whisper once and cache it ("small" for better speed on ZeroGPU).
        # device="cuda" forces GPU usage inside this decorated function.
        # NOTE(review): assumes the cached CUDA model stays valid across
        # ZeroGPU allocations — confirm; otherwise drop the cache.
        if _WHISPER_MODEL is None:
            _WHISPER_MODEL = whisper.load_model("small", device="cuda")  # or "base" if even faster needed

        # Transcribe the audio file
        result = _WHISPER_MODEL.transcribe(audio_filepath, language="en")
        text = result["text"].strip()

        if not text:
            return "No speech detected in the recording.", None

        # Generate ASL video from text
        video_path = get_sign_language(text)
        return f"Transcribed: \"{text}\"", video_path

    except Exception as e:
        # Surface the error in the UI textbox rather than crashing the app.
        return f"Error: {str(e)}", None


# ── Gradio UI ────────────────────────────────────────────────────────────────
with gr.Blocks(title="Speech → ASL Avatar Translator") as demo:
    gr.Markdown("""
    # Speech to ASL Avatar (ZeroGPU)
    1. Record your voice using the microphone below
    2. Click **Translate**
    3. Whisper transcribes → Sign-Speak generates ASL signing video
    """)

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone", "upload"],  # added upload fallback
            type="filepath",
            label="Speak here (click record) or upload audio",
            format="wav",
        )

    submit_btn = gr.Button("Sign Translate", variant="primary")
    transcript_output = gr.Textbox(label="Transcribed Text / Status", lines=3)
    video_output = gr.Video(label="ASL Avatar Signing Video", autoplay=True)

    # Wire up the button
    submit_btn.click(
        fn=transcribe_and_translate,
        inputs=audio_input,
        outputs=[transcript_output, video_output],
    )

# Launch (HF Spaces ignores server_name/port)
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=False,
    debug=False,
    ssr_mode=False,  # disable experimental SSR to avoid proxy issues
)