# ASR_API / app.py
# palli23's picture
# Update app.py
# 4c5bf36 verified
# app.py — Clean output, no timestamps crash, stronger anti-hallucination
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
import gradio as gr
import spaces
from transformers import pipeline
import torch
import gc
import re
# Post-processing to remove common ASR junk
def clean_asr_text(text: str) -> str:
    """Strip ASR artefacts from a transcript and tidy its spacing.

    Angle-bracket tokens, square-bracket annotations and a fixed list of
    standalone noise words (matched case-insensitively) are removed. Runs of
    whitespace are then collapsed to one space and any space left in front
    of ``,``, ``.``, ``?`` or ``!`` is dropped.

    Args:
        text: Raw transcript. Any falsy value (``None``, ``""``) is accepted.

    Returns:
        The cleaned transcript, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""

    # Regex passes, applied strictly in this order.
    substitutions = (
        (r'<[^>]+>', ''),  # <UNK>, <|0.00|>, etc.
        (r'\[.*?\]', ''),  # [HIK:xxx], [laughter], etc.
        # Bare noise words that show up outside brackets
        (r'(?i)\b(unk|hik|laughter|music|cough|applause|noise|background)\b', ''),
        (r'\s+', ' '),  # collapse whitespace runs
        (r'^\s+|\s+$', ''),  # trim both ends
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Removals can leave punctuation detached from the preceding word;
    # pull it back.
    for detached, attached in ((' ,', ','), (' .', '.'), (' ?', '?'), (' !', '!')):
        cleaned = cleaned.replace(detached, attached)
    cleaned = re.sub(r' +([.,!?])', r'\1', cleaned)

    return cleaned.strip()
# ──────────────────────────────────────────────
# Transcription function
# ──────────────────────────────────────────────
@spaces.GPU(duration=180)
def transcribe_3min(audio_path):
    """Transcribe an audio file and return the cleaned transcript.

    A fresh ASR pipeline is built on every call and released afterwards.
    Any failure while loading the model or transcribing is reported to the
    caller as text rather than raised.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy value
            means nothing was uploaded.

    Returns:
        One of: the cleaned transcript, an upload prompt (no input), an
        error message (any exception), or a fallback notice when the
        cleaned transcript is empty.
    """
    if not audio_path:
        return "Hlaðið upp hljóðskrá"

    pipe = None
    try:
        # Built inside the try block so that model-download / device errors
        # are reported through the same error message as transcription errors.
        pipe = pipeline(
            "automatic-speech-recognition",
            model="palli23/whisper-tiny-distilled-spjallromur-polish-v5",
            torch_dtype=torch.float16,
            device=0,
        )
        result = pipe(
            audio_path,
            chunk_length_s=30,
            batch_size=8,
            return_timestamps=False,  # ← avoids generation_config crash
            generate_kwargs={
                "num_beams": 5,
                "repetition_penalty": 1.3,  # stronger than before
                "no_repeat_ngram_size": 4,  # prevents repeating phrases
                # NOTE(review): meant to disable sampling; together with
                # num_beams=5 this is presumably beam search rather than
                # greedy decoding — confirm intent.
                "temperature": 0.0,
                # NOTE(review): meant to block <unk>. -1 is not a real token
                # id and may override the model's default suppress list —
                # verify against the transformers generation docs.
                "suppress_tokens": [-1],
                "max_new_tokens": 444,  # safety limit
            },
        )
        raw_text = result.get("text", "")
        cleaned = clean_asr_text(raw_text)
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing.
        cleaned = f"Villa við umritun: {e}"
    finally:
        # Clean up GPU memory on every exit path.
        del pipe
        gc.collect()
        torch.cuda.empty_cache()

    return cleaned or "(ekkert texti fannst eða villa kom upp)"
# ──────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────
# Builds the page: header text, audio input, transcribe button, output box,
# and an optional example clip. The button runs transcribe_3min on the audio.
with gr.Blocks() as demo:
    gr.Markdown("# Íslenskt ASR – 3 mínútur (hreinsuð útgáfa)")
    gr.Markdown(
        "**Model:** palli23/whisper-tiny-distilled-spjallromur-polish-v5 \n"
        "**Stillingar:** no timestamps, temperature=0.0, repetition_penalty=1.3, no_repeat_ngram_size=4 \n"
        # <UNK> is wrapped in backticks so Markdown shows it literally
        # instead of treating it as an HTML tag.
        "Reynir að fjarlægja `<UNK>`, [HIK...], [laughter] o.s.frv."
    )
    gr.Markdown("**Hafa samband:** pallinr1@protonmail.com")

    audio_in = gr.Audio(
        type="filepath",
        label="Hlaðið upp .mp3 / .wav / .m4a (allt að ~5 mín)",
        format="mp3",
    )
    btn = gr.Button("Umrita", variant="primary", size="lg")
    output = gr.Textbox(
        lines=25,
        label="Útskrift (hreinsuð)",
        placeholder="Hér kemur textinn...",
    )

    # The example clip is optional (see the label text), so only register
    # it when the file is actually present next to the app.
    examples = None
    example_clip = "example_clip_14nov2025.mp3"
    if os.path.exists(example_clip):
        examples = gr.Examples(
            examples=[
                [example_clip],
            ],
            inputs=audio_in,
            label="Dæmi (ef þú hefur sett upp dæmi skrá)",
        )

    btn.click(
        fn=transcribe_3min,
        inputs=audio_in,
        outputs=output,
    )
# ──────────────────────────────────────────────
# Launch
# ──────────────────────────────────────────────
if __name__ == "__main__":
    # Launch settings are collected first and then passed to demo.launch.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,  # ← set to True locally if you want public link
        "debug": False,
    }
    demo.launch(**launch_options)