# app.py — Clean output, no timestamps crash, stronger anti-hallucination
import os

# These are set before the heavy imports below (presumably so torch/OpenMP
# pick them up when they initialise) — keep this ordering.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import gradio as gr
import spaces
from transformers import pipeline
import torch
import gc
import re

# Hugging Face model used for transcription.
_MODEL_ID = "palli23/whisper-tiny-distilled-spjallromur-polish-v5"

# Patterns used by clean_asr_text, compiled once at import time.
_ANGLE_TAG_RE = re.compile(r'<[^>]+>')        # <unk>, <|0.00|>, etc.
_BRACKET_TAG_RE = re.compile(r'\[.*?\]')      # [HIK:xxx], [laughter], etc.
_NOISE_WORD_RE = re.compile(
    r'(?i)\b(unk|hik|laughter|music|cough|applause|noise|background)\b'
)
_WHITESPACE_RE = re.compile(r'\s+')
_SPACE_BEFORE_PUNCT_RE = re.compile(r' +([.,!?])')


# Post-processing to remove common ASR junk
def clean_asr_text(text: str) -> str:
    """Strip ASR artefacts from *text* and normalise its spacing.

    Removes ``<...>`` and ``[...]`` tags, deletes a fixed list of noise
    words (case-insensitive, whole words only), collapses whitespace runs
    to a single space and removes spaces before ``. , ! ?``.

    Note that the noise-word filter also deletes those words when they are
    genuinely spoken (e.g. "music").

    Args:
        text: Raw transcription text; may be empty or ``None``.

    Returns:
        The cleaned text, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""

    # Remove bracketed/angled garbage
    text = _ANGLE_TAG_RE.sub('', text)
    text = _BRACKET_TAG_RE.sub('', text)

    # Extra Icelandic/common noise patterns
    text = _NOISE_WORD_RE.sub('', text)

    # Normalize spaces & punctuation. After whitespace is collapsed, a
    # single pass removing spaces before punctuation is sufficient.
    text = _WHITESPACE_RE.sub(' ', text).strip()
    text = _SPACE_BEFORE_PUNCT_RE.sub(r'\1', text)

    return text.strip()


# ──────────────────────────────────────────────
# Transcription function
# ──────────────────────────────────────────────
@spaces.GPU(duration=180)
def transcribe_3min(audio_path):
    """Transcribe an audio file and return cleaned text.

    The ASR pipeline is created on every call and released again before
    returning, so no model stays resident between requests.

    Args:
        audio_path: Path to the audio file to transcribe. A falsy value
            means nothing was uploaded.

    Returns:
        The cleaned transcription; an upload prompt when *audio_path* is
        falsy; an error message when loading the model or transcribing
        fails; or a fallback notice when the cleaned text is empty.
    """
    if not audio_path:
        return "Hlaðið upp hljóðskrá"

    pipe = None
    try:
        # Built inside the try block so that a model-load failure is
        # reported like any other error and still reaches the cleanup below.
        pipe = pipeline(
            "automatic-speech-recognition",
            model=_MODEL_ID,
            torch_dtype=torch.float16,
            device=0,
        )
        result = pipe(
            audio_path,
            chunk_length_s=30,
            batch_size=8,
            return_timestamps=False,  # ← avoids generation_config crash
            generate_kwargs={
                "num_beams": 5,
                "repetition_penalty": 1.3,    # stronger than before
                "no_repeat_ngram_size": 4,    # prevents repeating phrases
                # No sampling. With num_beams=5 this is beam search rather
                # than greedy decoding.
                "temperature": 0.0,
                # NOTE(review): intended to block <unk>-style tokens, but
                # -1 is not a real token id and passing this list may
                # override the model's default suppression list — confirm
                # against the installed transformers version.
                "suppress_tokens": [-1],
                "max_new_tokens": 444,        # safety limit
            },
        )
        cleaned = clean_asr_text(result.get("text", ""))
    except Exception as e:
        cleaned = f"Villa við umritun: {e}"
    finally:
        # Clean up GPU memory whether or not transcription succeeded.
        del pipe
        gc.collect()
        torch.cuda.empty_cache()

    return cleaned or "(ekkert texti fannst eða villa kom upp)"
# ──────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────
# Optional example clip; the examples widget is only added when it exists.
_EXAMPLE_CLIP = "example_clip_14nov2025.mp3"

with gr.Blocks() as demo:
    gr.Markdown("# Íslenskt ASR – 3 mínútur (hreinsuð útgáfa)")
    # Two trailing spaces before each "\n" force a Markdown line break, and
    # <unk> is wrapped in backticks so it is not swallowed as an HTML tag.
    gr.Markdown(
        "**Model:** palli23/whisper-tiny-distilled-spjallromur-polish-v5  \n"
        "**Stillingar:** no timestamps, temperature=0.0, "
        "repetition_penalty=1.3, no_repeat_ngram_size=4  \n"
        "Reynir að fjarlægja `<unk>`, [HIK...], [laughter] o.s.frv."
    )
    gr.Markdown("**Hafa samband:** pallinr1@protonmail.com")

    audio_in = gr.Audio(
        type="filepath",
        label="Hlaðið upp .mp3 / .wav / .m4a (allt að ~5 mín)",
        format="mp3",
    )

    btn = gr.Button("Umrita", variant="primary", size="lg")

    output = gr.Textbox(
        lines=25,
        label="Útskrift (hreinsuð)",
        placeholder="Hér kemur textinn...",
    )

    # The label itself says the example file is optional, so only register
    # it when present instead of pointing the UI at a missing file.
    if os.path.isfile(_EXAMPLE_CLIP):
        gr.Examples(
            examples=[
                [_EXAMPLE_CLIP],
            ],
            inputs=audio_in,
            label="Dæmi (ef þú hefur sett upp dæmi skrá)",
        )

    btn.click(
        fn=transcribe_3min,
        inputs=audio_in,
        outputs=output,
    )

# ──────────────────────────────────────────────
# Launch
# ──────────────────────────────────────────────
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,  # ← set to True locally if you want public link
        debug=False,
    )