Spaces:

palli23
/

ASR_API

Running on Zero

App Files Files Community

ASR_API / app.py

palli23

Update app.py

4c5bf36 verified 5 days ago

raw

history blame contribute delete

4.52 kB

	# app.py — Clean output, no timestamps crash, stronger anti-hallucination

	import os
	os.environ["OMP_NUM_THREADS"] = "1"
	os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

	import gradio as gr
	import spaces
	from transformers import pipeline
	import torch
	import gc
	import re

	# Post-processing to remove common ASR junk
	def clean_asr_text(text: str) -> str:
	    """Strip common ASR artifacts from a transcript and tidy its spacing.

	    The cleanup removes, in order:
	      1. angle-bracket tokens such as ``<UNK>`` or ``<|0.00|>``,
	      2. square-bracket annotations such as ``[HIK:xxx]`` or ``[laughter]``,
	      3. stand-alone noise words (case-insensitive): unk, hik, laughter,
	         music, cough, applause, noise, background.
	    Whitespace runs are then collapsed to a single space, the result is
	    trimmed, and any spaces directly before ``. , ! ?`` are dropped.

	    Args:
	        text: Raw transcript; may be empty or ``None``.

	    Returns:
	        The cleaned transcript, or ``""`` when *text* is falsy.
	    """
	    if not text:
	        return ""

	    # Remove bracketed/angled garbage
	    text = re.sub(r'<[^>]+>', '', text)   # <UNK>, <|0.00|>, etc.
	    text = re.sub(r'\[.*?\]', '', text)   # [HIK:xxx], [laughter], etc.

	    # Stand-alone noise words. The alternation must use a bare `|`; an
	    # escaped `\|` would only ever match a literal pipe character.
	    text = re.sub(
	        r'(?i)\b(unk|hik|laughter|music|cough|applause|noise|background)\b',
	        '',
	        text,
	    )

	    # Collapse every whitespace run to one space and trim both ends.
	    text = re.sub(r'\s+', ' ', text).strip()

	    # Drop spaces that ended up in front of punctuation. After the collapse
	    # above this single pass covers " ," " ." " ?" and " !" alike.
	    text = re.sub(r' +([.,!?])', r'\1', text)

	    return text

	# ──────────────────────────────────────────────
	# Transcription function
	# ──────────────────────────────────────────────
	@spaces.GPU(duration=180)
	def transcribe_3min(audio_path):
	    """Transcribe an audio file and return the cleaned transcript.

	    A fresh ASR pipeline is created for each call and released afterwards,
	    followed by a garbage-collection pass and a CUDA cache flush.

	    Args:
	        audio_path: Path to the audio file to transcribe. A falsy value
	            returns a prompt asking the user to upload a file.

	    Returns:
	        The cleaned transcript; a message starting with
	        "Villa við umritun:" if loading the model or transcribing raises;
	        or a fallback notice when the cleaned transcript is empty.
	    """
	    if not audio_path:
	        return "Hlaðið upp hljóðskrá"

	    pipe = None
	    try:
	        # Built inside the try block so a failed model load is reported
	        # through the same user-facing error message as a failed run.
	        pipe = pipeline(
	            "automatic-speech-recognition",
	            model="palli23/whisper-tiny-distilled-spjallromur-polish-v5",
	            torch_dtype=torch.float16,
	            device=0,
	        )

	        result = pipe(
	            audio_path,
	            chunk_length_s=30,
	            batch_size=8,
	            return_timestamps=False,  # avoids generation_config crash
	            generate_kwargs={
	                "num_beams": 5,
	                "repetition_penalty": 1.3,   # stronger than before
	                "no_repeat_ngram_size": 4,   # prevents repeating phrases
	                # Intended to keep decoding deterministic; note that with
	                # num_beams=5 this is beam search, not greedy decoding.
	                "temperature": 0.0,
	                # Attempt to block <unk>.
	                # NOTE(review): confirm how -1 is interpreted by the
	                # installed transformers version.
	                "suppress_tokens": [-1],
	                "max_new_tokens": 444,       # safety limit
	            },
	        )

	        cleaned = clean_asr_text(result.get("text", ""))

	    except Exception as e:
	        # UI boundary: surface the failure as text instead of crashing.
	        cleaned = f"Villa við umritun: {e}"

	    finally:
	        # Clean up GPU memory on every exit path, including failures.
	        pipe = None
	        gc.collect()
	        torch.cuda.empty_cache()

	    return cleaned or "(ekkert texti fannst eða villa kom upp)"

	# ──────────────────────────────────────────────
	# Gradio UI
	# ──────────────────────────────────────────────
	with gr.Blocks() as demo:
	    # Header: title, model/settings summary and contact address.
	    gr.Markdown("# Íslenskt ASR – 3 mínútur (hreinsuð útgáfa)")
	    gr.Markdown(
	        "Model: palli23/whisper-tiny-distilled-spjallromur-polish-v5 \n"
	        "Stillingar: no timestamps, temperature=0.0, repetition_penalty=1.3, no_repeat_ngram_size=4 \n"
	        "Reynir að fjarlægja <UNK>, [HIK...], [laughter] o.s.frv."
	    )
	    gr.Markdown("Hafa samband: pallinr1@protonmail.com")

	    # Input: the uploaded audio is handed to the callback as a file path.
	    audio_in = gr.Audio(
	        type="filepath",
	        label="Hlaðið upp .mp3 / .wav / .m4a (allt að ~5 mín)",
	        format="mp3",
	    )

	    btn = gr.Button("Umrita", variant="primary", size="lg")

	    # Output: the cleaned transcript.
	    output = gr.Textbox(
	        lines=25,
	        label="Útskrift (hreinsuð)",
	        placeholder="Hér kemur textinn...",
	    )

	    # The example clip is optional (its label says so). Register it only
	    # when the file is present, so a missing clip cannot break app startup
	    # (some Gradio versions are reported to fail on missing example files).
	    example_clip = "example_clip_14nov2025.mp3"
	    examples = None
	    if os.path.exists(example_clip):
	        examples = gr.Examples(
	            examples=[
	                [example_clip],
	            ],
	            inputs=audio_in,
	            label="Dæmi (ef þú hefur sett upp dæmi skrá)",
	        )

	    # Wire the button: audio path in, cleaned transcript out.
	    btn.click(
	        fn=transcribe_3min,
	        inputs=audio_in,
	        outputs=output,
	    )

	# ──────────────────────────────────────────────
	# Launch
	# ──────────────────────────────────────────────
	if __name__ == "__main__":
	    # Collect the launch options first, then start the app with them.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,  # set to True locally if you want a public link
	        "debug": False,
	    }
	    demo.launch(**launch_options)