# Inclusion_demo / app.py
# (Hugging Face Space — author: Devubiodee, commit 6897d69, verified)
# app.py - Speech to ASL Avatar on Hugging Face Spaces (ZeroGPU compatible)
import gradio as gr
import whisper
import requests
import tempfile
import os
from spaces import GPU # Required for ZeroGPU hardware
# Sign-Speak API key, injected via the Space's secret store (Settings → Secrets).
API_KEY = os.environ.get("SIGN_SPEAK_API_KEY")
if not API_KEY:
    # Fail fast at import time: every request needs this key, so a missing
    # secret should stop the Space from starting rather than fail per request.
    raise ValueError("SIGN_SPEAK_API_KEY not set in Space secrets!")
# Sign-Speak REST endpoint used by get_sign_language() below.
BASE_URL = "https://api.sign-speak.com"
PRODUCE_SIGN_URL = f"{BASE_URL}/produce-sign"
def get_sign_language(text: str, request_class="BLOCKING", identity="MALE", timeout=120):
    """
    Call the Sign-Speak API to generate an ASL avatar video for *text*.

    Runs entirely on CPU (it is an external HTTP call), so no GPU is needed.

    Args:
        text: English text to translate into ASL.
        request_class: "BLOCKING" waits for the rendered video; other classes
            may return a 202 batch response (see below).
        identity: Avatar identity, e.g. "MALE" or "FEMALE".
        timeout: Seconds to wait for the HTTP response. Added so a hung API
            call cannot stall the Gradio worker indefinitely; raising it is
            backward-compatible for existing callers.

    Returns:
        Filesystem path to a temporary .mp4 file with the signing video.

    Raises:
        ValueError: On a 202 batch response (polling not implemented) or on
            any non-200 API error.
        requests.RequestException: On network failure or timeout.
    """
    headers = {
        "X-api-key": API_KEY,
        "Content-Type": "application/json",
    }
    payload = {
        "english": text.strip(),
        "request_class": request_class.upper(),
        "identity": identity.upper(),
        # Optional: "model_version": "SLP.2.xs" for faster/smaller model
    }
    # BUG FIX: the original call passed no timeout, so a stalled Sign-Speak
    # endpoint would block this worker forever.
    response = requests.post(
        PRODUCE_SIGN_URL, json=payload, headers=headers, timeout=timeout
    )
    if response.status_code == 200:
        # Save MP4 bytes to a temp file (Gradio Video component needs filepath)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
            tmp.write(response.content)
        return tmp.name
    elif response.status_code == 202:
        # Non-blocking request classes return a batch id instead of video bytes.
        data = response.json()
        batch_id = data.get("batch_id")
        raise ValueError(f"Batch processing started (ID: {batch_id}). Add polling if needed.")
    else:
        raise ValueError(f"Sign-Speak API error {response.status_code}: {response.text}")
# PERF FIX: the original loaded the Whisper checkpoint from disk on every
# request, which dominated per-call latency. Cache the model once on CPU and
# move it to the GPU inside the @GPU window — this matches the ZeroGPU
# lifecycle, where the GPU is only attached while the decorated call runs.
_whisper_model = None

def _load_whisper():
    """Return the cached Whisper 'small' model, loading it on CPU on first use."""
    global _whisper_model
    if _whisper_model is None:
        # "small" balances speed and accuracy; switch to "base" if even
        # faster turnaround is needed.
        _whisper_model = whisper.load_model("small", device="cpu")
    return _whisper_model

@GPU(duration=120)  # ZeroGPU decorator: request a GPU for up to 120 seconds
def transcribe_and_translate(audio_filepath):
    """
    Transcribe recorded audio with Whisper, then generate an ASL video.

    Marked with @GPU so ZeroGPU hardware is allocated for the duration of
    this call only.

    Args:
        audio_filepath: Path to the recorded/uploaded audio file, or None
            when nothing was recorded (Gradio passes None in that case).

    Returns:
        (status_text, video_path): a human-readable transcription/status
        string, and the path to the ASL video file (None on failure).
    """
    if audio_filepath is None:
        return "No audio recorded.", None
    try:
        # Move the cached CPU model onto the ZeroGPU-attached device.
        model = _load_whisper().to("cuda")
        # Transcribe the audio file (English-only for this demo).
        result = model.transcribe(audio_filepath, language="en")
        text = result["text"].strip()
        if not text:
            return "No speech detected in the recording.", None
        # CPU-side external API call that renders the signing avatar.
        video_path = get_sign_language(text)
        return f"Transcribed: \"{text}\"", video_path
    except Exception as e:
        # Surface the error in the UI textbox instead of crashing the Space.
        return f"Error: {str(e)}", None
# ── Gradio UI ────────────────────────────────────────────────────────────────
with gr.Blocks(title="Speech β†’ ASL Avatar Translator") as demo:
    # Usage instructions rendered at the top of the page.
    gr.Markdown("""
    # Speech to ASL Avatar (ZeroGPU)
    1. Record your voice using the microphone below
    2. Click **Translate**
    3. Whisper transcribes β†’ Sign-Speak generates ASL signing video
    """)
    with gr.Row():
        # type="filepath" so transcribe_and_translate receives an on-disk path.
        audio_input = gr.Audio(
            sources=["microphone", "upload"],  # added upload fallback
            type="filepath",
            label="Speak here (click record) or upload audio",
            format="wav"
        )
    submit_btn = gr.Button("Sign Translate", variant="primary")
    # Outputs: transcription / error status text, plus the signing video.
    transcript_output = gr.Textbox(label="Transcribed Text / Status", lines=3)
    video_output = gr.Video(label="ASL Avatar Signing Video", autoplay=True)
    # Wire the button to the GPU-decorated transcription pipeline.
    submit_btn.click(
        fn=transcribe_and_translate,
        inputs=audio_input,
        outputs=[transcript_output, video_output]
    )
# Launch (HF Spaces ignores server_name/port; kept for local runs)
demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=False,
    debug=False,
    ssr_mode=False  # disable experimental SSR to avoid proxy issues
)