File size: 4,523 Bytes
6d433ff
2102ae8
6adf5a9
 
3b102fc
6adf5a9
 
 
a0182fe
45c12a4
 
a39d532
 
6d433ff
a39d532
 
6d433ff
 
 
 
 
 
 
 
 
 
 
 
 
 
a39d532
 
e37e472
6d433ff
 
 
2102ae8
9648db0
 
 
2102ae8
9648db0
a39d532
6d433ff
9648db0
6d433ff
3b102fc
2102ae8
6d433ff
 
 
 
 
 
 
 
 
 
 
 
4c5bf36
6d433ff
 
2102ae8
6d433ff
 
a39d532
6d433ff
 
 
 
9648db0
 
 
2102ae8
6d433ff
c675e00
6d433ff
 
 
2102ae8
a39d532
6d433ff
 
 
 
 
a0182fe
2102ae8
6d433ff
 
 
 
 
 
 
 
 
 
 
 
 
2102ae8
6d433ff
 
 
 
 
 
 
 
 
 
 
 
 
ac10614
6d433ff
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# app.py — Clean output, no timestamps crash, stronger anti-hallucination

import os
# These assignments sit above the torch/transformers imports further down,
# so the values are already in the process environment when those load.
# presumably limits OpenMP to a single CPU thread — confirm this is intended
os.environ["OMP_NUM_THREADS"] = "1"
# presumably tunes the PyTorch CUDA caching allocator (max split size 128 MB)
# to limit memory fragmentation — confirm against the PyTorch docs
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import gradio as gr
import spaces
from transformers import pipeline
import torch
import gc
import re

# Post-processing to remove common ASR junk
def clean_asr_text(text: str) -> str:
    """Strip ASR artefacts from a transcript and tidy its spacing.

    Removes ``<...>`` tags, ``[...]`` annotations and a fixed list of noise
    words (case-insensitive, whole words only), collapses every whitespace
    run to a single space, and drops spaces that sit directly before
    ``.``, ``,``, ``!`` or ``?``.

    Args:
        text: Raw transcript; any falsy value yields an empty string.

    Returns:
        The cleaned transcript without leading or trailing whitespace.
    """
    if not text:
        return ""

    junk_patterns = (
        r'<[^>]+>',      # angle-bracket tokens such as <UNK> or <|0.00|>
        r'\[.*?\]',      # square-bracket annotations such as [HIK:xxx]
        r'(?i)\b(unk|hik|laughter|music|cough|applause|noise|background)\b',
    )
    for junk in junk_patterns:
        text = re.sub(junk, '', text)

    # After this step the only whitespace left is single interior spaces.
    single_spaced = re.sub(r'\s+', ' ', text).strip()

    # Delete any space that directly precedes sentence punctuation.
    tightened = re.sub(r' +(?=[.,!?])', '', single_spaced)

    return tightened.strip()

# ──────────────────────────────────────────────
# Transcription function
# ──────────────────────────────────────────────
@spaces.GPU(duration=180)
def transcribe_3min(audio_path):
    """Transcribe an audio file and return the cleaned transcript.

    A fresh ASR pipeline is built on every call and released again before
    the function returns.

    Args:
        audio_path: Path of the audio file to transcribe. A falsy value
            (nothing uploaded) short-circuits with an upload prompt.

    Returns:
        The transcript after ``clean_asr_text``; a message starting with
        "Villa við umritun:" if loading or running the model raises; or a
        placeholder string when the cleaned transcript is empty.
    """
    if not audio_path:
        return "Hlaðið upp hljóðskrá"

    pipe = None
    try:
        # Built inside the try so that a model-load or device failure is
        # reported through the same error message as a transcription failure
        # instead of escaping as an unhandled exception.
        pipe = pipeline(
            "automatic-speech-recognition",
            model="palli23/whisper-tiny-distilled-spjallromur-polish-v5",
            torch_dtype=torch.float16,
            device=0,
        )

        result = pipe(
            audio_path,
            chunk_length_s=30,
            batch_size=8,
            return_timestamps=False,           # avoids generation_config crash
            generate_kwargs={
                "num_beams": 5,
                "repetition_penalty": 1.3,     # stronger than before
                "no_repeat_ngram_size": 4,     # prevents repeating phrases
                # NOTE(review): temperature 0.0 is meant to disable sampling,
                # but with num_beams=5 decoding is beam search rather than
                # plain greedy — confirm this combination is intended.
                "temperature": 0.0,
                # NOTE(review): intended to block <unk>; verify that index -1
                # is interpreted that way by the installed transformers.
                "suppress_tokens": [-1],
                "max_new_tokens": 444,         # safety limit
            }
        )

        cleaned = clean_asr_text(result.get("text", ""))

    except Exception as e:
        # Boundary handler: the UI shows the message instead of a stack trace.
        cleaned = f"Villa við umritun: {str(e)}"

    finally:
        # Release the model and cached GPU memory on every exit path.
        pipe = None
        gc.collect()
        torch.cuda.empty_cache()

    return cleaned or "(ekkert texti fannst eða villa kom upp)"

# ──────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────
# Page layout: header text, audio upload, transcribe button, transcript box,
# and one example clip. Components are declared in display order.
with gr.Blocks() as demo:
    gr.Markdown("# Íslenskt ASR – 3 mínútur (hreinsuð útgáfa)")
    gr.Markdown(
        "**Model:** palli23/whisper-tiny-distilled-spjallromur-polish-v5  \n"
        "**Stillingar:** no timestamps, temperature=0.0, repetition_penalty=1.3, no_repeat_ngram_size=4  \n"
        # Token examples are wrapped in backticks: a bare <UNK> is parsed as an
        # HTML tag by the Markdown renderer and would vanish from the page.
        "Reynir að fjarlægja `<UNK>`, `[HIK...]`, `[laughter]` o.s.frv."
    )
    gr.Markdown("**Hafa samband:** pallinr1@protonmail.com")

    # type="filepath" so the click handler receives a path, which is what
    # transcribe_3min passes on to the pipeline.
    audio_in = gr.Audio(
        type="filepath",
        label="Hlaðið upp .mp3 / .wav / .m4a (allt að ~5 mín)",
        format="mp3"
    )

    btn = gr.Button("Umrita", variant="primary", size="lg")

    output = gr.Textbox(
        lines=25,
        label="Útskrift (hreinsuð)",
        placeholder="Hér kemur textinn..."
    )

    # NOTE(review): the example file is referenced by relative path; confirm
    # it is shipped next to this script.
    examples = gr.Examples(
        examples=[
            ["example_clip_14nov2025.mp3"],
        ],
        inputs=audio_in,
        label="Dæmi (ef þú hefur sett upp dæmi skrá)"
    )

    # Clicking the button runs the transcription and fills the textbox.
    btn.click(
        fn=transcribe_3min,
        inputs=audio_in,
        outputs=output
    )

# ──────────────────────────────────────────────
# Launch
# ──────────────────────────────────────────────
if __name__ == "__main__":
    # Launch settings collected in one mapping and handed to Gradio as
    # keyword arguments; only runs when the file is executed as a script.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,    # set to True locally if you want a public link
        "debug": False,
    }
    demo.launch(**launch_options)