Spaces:

palli23
/

ASR_API

Running on Zero

File size: 4,523 Bytes

6d433ff
2102ae8
6adf5a9
 
3b102fc
6adf5a9
 
 
a0182fe
45c12a4
 
a39d532
 
6d433ff
a39d532
 
6d433ff
 
 
 
 
 
 
 
 
 
 
 
 
 
a39d532
 
e37e472
6d433ff
 
 
2102ae8
9648db0
 
 
2102ae8
9648db0
a39d532
6d433ff
9648db0
6d433ff
3b102fc
2102ae8
6d433ff
 
 
 
 
 
 
 
 
 
 
 
4c5bf36
6d433ff
 
2102ae8
6d433ff
 
a39d532
6d433ff
 
 
 
9648db0
 
 
2102ae8
6d433ff
c675e00
6d433ff
 
 
2102ae8
a39d532
6d433ff
 
 
 
 
a0182fe
2102ae8
6d433ff
 
 
 
 
 
 
 
 
 
 
 
 
2102ae8
6d433ff
 
 
 
 
 
 
 
 
 
 
 
 
ac10614
6d433ff

# app.py — Clean output, no timestamps crash, stronger anti-hallucination

import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import gradio as gr
import spaces
from transformers import pipeline
import torch
import gc
import re

# Post-processing to remove common ASR junk
def clean_asr_text(text: str) -> str:
    """Strip ASR artefacts from *text* and tidy whitespace and punctuation.

    Removes ``<...>`` tags, ``[...]`` annotations and a fixed list of
    stand-alone noise words (case-insensitive), collapses every whitespace
    run to a single space, and deletes spaces that sit directly before
    ``.``, ``,``, ``!`` or ``?``.

    Args:
        text: Raw transcript. Any falsy value yields an empty string.

    Returns:
        The cleaned transcript without leading or trailing whitespace.
    """
    if not text:
        return ""

    # Applied strictly in this order: junk removal first, whitespace last,
    # so gaps left behind by removed tokens are collapsed as well.
    regex_passes = (
        (r'<[^>]+>', ''),      # <UNK>, <|0.00|>, etc.
        (r'\[.*?\]', ''),      # [HIK:xxx], [laughter], etc.
        (r'(?i)\b(unk|hik|laughter|music|cough|applause|noise|background)\b', ''),
        (r'\s+', ' '),
        (r'^\s+|\s+$', ''),
    )
    cleaned = text
    for pattern, replacement in regex_passes:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Pull punctuation back onto the preceding word.
    for spaced, tight in ((' ,', ','), (' .', '.'), (' ?', '?'), (' !', '!')):
        cleaned = cleaned.replace(spaced, tight)
    cleaned = re.sub(r' +([.,!?])', r'\1', cleaned)

    return cleaned.strip()

# ──────────────────────────────────────────────
# Transcription function
# ──────────────────────────────────────────────
@spaces.GPU(duration=180)
def transcribe_3min(audio_path):
    """Transcribe an uploaded audio file and return the cleaned text.

    The ASR pipeline is created for this call only and dropped again before
    returning. Model loading and inference share one ``try`` block so that a
    failure in either is reported the same way, and the memory clean-up runs
    in ``finally`` regardless of the outcome.

    Args:
        audio_path: Path of the audio file to transcribe; a falsy value
            means nothing was uploaded.

    Returns:
        One of: the cleaned transcript, a prompt to upload a file, an error
        message starting with ``"Villa við umritun:"``, or a fallback notice
        when the transcript is empty after cleaning.
    """
    if not audio_path:
        return "Hlaðið upp hljóðskrá"

    pipe = None
    try:
        # Loading is inside the try so a download/load failure yields the
        # same user-facing error text as a failure during transcription.
        pipe = pipeline(
            "automatic-speech-recognition",
            model="palli23/whisper-tiny-distilled-spjallromur-polish-v5",
            torch_dtype=torch.float16,
            device=0,
        )

        result = pipe(
            audio_path,
            chunk_length_s=30,
            batch_size=8,
            return_timestamps=False,           # avoids generation_config crash
            generate_kwargs={
                "num_beams": 5,
                "repetition_penalty": 1.3,     # stronger than before
                "no_repeat_ngram_size": 4,     # prevents repeating phrases
                # No sampling; combined with num_beams=5 above this is beam
                # search rather than plain greedy decoding.
                "temperature": 0.0,
                # NOTE(review): meant to block <unk>; confirm that -1 is
                # honoured by the installed transformers version and does not
                # override the model's default suppression list.
                "suppress_tokens": [-1],
                "max_new_tokens": 444,         # safety limit
            }
        )

        raw_text = result.get("text", "")
        cleaned = clean_asr_text(raw_text)

    except Exception as e:
        cleaned = f"Villa við umritun: {str(e)}"

    finally:
        # Drop the pipeline reference and release cached GPU memory even
        # when loading or inference failed.
        pipe = None
        gc.collect()
        torch.cuda.empty_cache()

    return cleaned or "(ekkert texti fannst eða villa kom upp)"

# ──────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────
# Page layout: heading and settings summary, audio input, transcribe button,
# transcript box, optional example clip. Components render in creation order.
with gr.Blocks() as demo:
    gr.Markdown("# Íslenskt ASR – 3 mínútur (hreinsuð útgáfa)")
    gr.Markdown(
        "**Model:** palli23/whisper-tiny-distilled-spjallromur-polish-v5  \n"
        "**Stillingar:** no timestamps, temperature=0.0, repetition_penalty=1.3, no_repeat_ngram_size=4  \n"
        # <UNK> is wrapped in a code span: bare, Markdown treats it as an
        # HTML tag and it does not show up as text.
        "Reynir að fjarlægja `<UNK>`, [HIK...], [laughter] o.s.frv."
    )
    gr.Markdown("**Hafa samband:** pallinr1@protonmail.com")

    # NOTE(review): the label mentions ~5 min while the heading says 3 min;
    # confirm which limit is intended.
    audio_in = gr.Audio(
        type="filepath",
        label="Hlaðið upp .mp3 / .wav / .m4a (allt að ~5 mín)",
        format="mp3"
    )

    btn = gr.Button("Umrita", variant="primary", size="lg")

    output = gr.Textbox(
        lines=25,
        label="Útskrift (hreinsuð)",
        placeholder="Hér kemur textinn..."
    )

    # The example clip is optional (see its label). Register it only when the
    # file is present; a missing example file can otherwise break start-up,
    # depending on the Gradio version.
    example_clip = "example_clip_14nov2025.mp3"
    examples = (
        gr.Examples(
            examples=[
                [example_clip],
            ],
            inputs=audio_in,
            label="Dæmi (ef þú hefur sett upp dæmi skrá)"
        )
        if os.path.exists(example_clip)
        else None
    )

    # Clicking the button sends the uploaded file path to the transcriber and
    # shows the returned text in the output box.
    btn.click(
        fn=transcribe_3min,
        inputs=audio_in,
        outputs=output
    )

# ──────────────────────────────────────────────
# Launch
# ──────────────────────────────────────────────
if __name__ == "__main__":
    # Start the Gradio server only when this file is executed directly.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,   # set to True locally if you want a public link
        "debug": False,
    }
    demo.launch(**launch_options)