# app.py — Clean output, no timestamp crash, stronger anti-hallucination
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
import gradio as gr
import spaces
from transformers import pipeline
import torch
import gc
import re
# Post-processing to remove common ASR junk
def clean_asr_text(text: str) -> str:
    """Strip tag-like tokens and noise words from a raw transcript.

    Removes ``<...>`` and ``[...]`` spans, deletes a fixed list of noise
    words (case-insensitive, whole words only), collapses whitespace runs to
    single spaces and drops spaces that precede ``.``, ``,``, ``?`` or ``!``.

    Args:
        text: Raw transcript; any falsy value yields an empty string.

    Returns:
        The cleaned transcript with no leading or trailing whitespace.
    """
    if not text:
        return ""

    # (pattern, replacement) pairs; the order matters and is applied as listed.
    substitutions = (
        (r'<[^>]+>', ''),  # <UNK>, <|0.00|>, etc.
        (r'\[.*?\]', ''),  # [HIK:xxx], [laughter], etc.
        # Extra Icelandic/common noise patterns
        (r'(?i)\b(unk|hik|laughter|music|cough|applause|noise|background)\b', ''),
        # Normalize whitespace runs, then trim both ends
        (r'\s+', ' '),
        (r'^\s+|\s+$', ''),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Pull punctuation back against the preceding word.
    for mark in (',', '.', '?', '!'):
        cleaned = cleaned.replace(' ' + mark, mark)
    cleaned = re.sub(r' +([.,!?])', r'\1', cleaned)

    return cleaned.strip()
# ──────────────────────────────────────────────
# Transcription function
# ──────────────────────────────────────────────
@spaces.GPU(duration=180)
def transcribe_3min(audio_path):
    """Transcribe an audio file and return the cleaned-up text.

    The ASR pipeline is created inside the call and released again before
    returning, so nothing model-related is kept alive between requests.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy value
            (nothing uploaded) returns a prompt asking for a file.

    Returns:
        The transcript after ``clean_asr_text`` post-processing; a message
        starting with "Villa við umritun:" if loading the model or running
        the transcription raised; or a placeholder string when the cleaned
        transcript is empty.
    """
    if not audio_path:
        return "Hlaðið upp hljóðskrá"

    pipe = None
    try:
        # Built inside the ``try`` so that a model download/load failure is
        # reported to the user the same way as a transcription failure.
        pipe = pipeline(
            "automatic-speech-recognition",
            model="palli23/whisper-tiny-distilled-spjallromur-polish-v5",
            torch_dtype=torch.float16,
            device=0,
        )
        result = pipe(
            audio_path,
            chunk_length_s=30,
            batch_size=8,
            # Original author's note: avoids a generation_config crash.
            return_timestamps=False,
            generate_kwargs={
                "num_beams": 5,
                "repetition_penalty": 1.3,
                "no_repeat_ngram_size": 4,  # prevents repeating phrases
                # NOTE(review): the original comment called this "greedy
                # decoding", but num_beams=5 requests beam search — confirm
                # which of the two is intended.
                "temperature": 0.0,
                # NOTE(review): meant to block <unk>; -1 is not a real token
                # id and this may override the model's default suppress
                # list — verify against the installed transformers version.
                "suppress_tokens": [-1],
                "max_new_tokens": 444,  # safety limit
            },
        )
        cleaned = clean_asr_text(result.get("text", ""))
    except Exception as e:
        # UI boundary: show the failure in the output box instead of raising.
        cleaned = f"Villa við umritun: {e}"
    finally:
        # Always drop the pipeline and free cached GPU memory, whichever way
        # the block above was left.
        del pipe
        gc.collect()
        torch.cuda.empty_cache()

    return cleaned or "(ekkert texti fannst eða villa kom upp)"
# ──────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────
with gr.Blocks() as demo:
    # Page header and a short description of the model and decoding settings.
    gr.Markdown("# Íslenskt ASR – 3 mínútur (hreinsuð útgáfa)")
    gr.Markdown(
        # Two trailing spaces before the newline form a Markdown hard line
        # break; with a single space the three lines merge into one.
        "**Model:** palli23/whisper-tiny-distilled-spjallromur-polish-v5  \n"
        "**Stillingar:** no timestamps, temperature=0.0, repetition_penalty=1.3, no_repeat_ngram_size=4  \n"
        # Tokens are wrapped in backticks so that `<UNK>` is shown literally
        # instead of being treated as an HTML tag by the Markdown renderer.
        "Reynir að fjarlægja `<UNK>`, `[HIK...]`, `[laughter]` o.s.frv."
    )
    gr.Markdown("**Hafa samband:** pallinr1@protonmail.com")

    # Input: the uploaded audio is handed to the callback as a file path.
    audio_in = gr.Audio(
        type="filepath",
        label="Hlaðið upp .mp3 / .wav / .m4a (allt að ~5 mín)",
        format="mp3",
    )
    btn = gr.Button("Umrita", variant="primary", size="lg")

    # Output: the cleaned transcript (or an error/placeholder message).
    output = gr.Textbox(
        lines=25,
        label="Útskrift (hreinsuð)",
        placeholder="Hér kemur textinn...",
    )

    # The examples label itself says the clip is optional, so the example is
    # only registered when the file is actually present next to the app.
    example_clip = "example_clip_14nov2025.mp3"
    examples = None
    if os.path.exists(example_clip):
        examples = gr.Examples(
            examples=[
                [example_clip],
            ],
            inputs=audio_in,
            label="Dæmi (ef þú hefur sett upp dæmi skrá)",
        )

    # Run the transcription when the button is pressed.
    btn.click(
        fn=transcribe_3min,
        inputs=audio_in,
        outputs=output,
    )
# ──────────────────────────────────────────────
# Launch
# ──────────────────────────────────────────────
if __name__ == "__main__":
    # Only start the server when the file is run as a script.
    demo.launch(
        server_name="0.0.0.0",  # listen on all network interfaces
        server_port=7860,
        share=False,  # set to True locally if you want a public link
        debug=False,
    )