Spaces:

USF00
/

TTS_Deploy

Running

App Files Files Community

TTS_Deploy / utils /text_utils.py

USF00

Initial commit: FastAPI TTS Project ready for Vast.ai

493b3af 13 days ago

raw

history blame contribute delete

3.09 kB

	import re
	from typing import List

	# Arabic combining marks to strip: U+0610-061A, U+064B-065F, U+0670 and
	# U+06D6-06ED.
	_AR_DIACRITICS = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")
	# Sentence boundary: whitespace that follows '.', '!', '?', the Arabic
	# question mark (U+061F) or the Arabic full stop (U+06D4), or a line break
	# together with any whitespace after it.  The punctuation stays attached to
	# the sentence on its left because it is only looked behind, never consumed.
	_SENT_SPLIT = re.compile(r"(?<=[.!?\u061F\u06D4])\s+|\n\s*")
	# Soft (clause) boundary: whitespace that follows a Latin or Arabic comma,
	# semicolon, or a colon.
	_SOFT_SPLIT = re.compile(r"(?<=[,;:،؛])\s+")

	def normalize_whitespace(s: str) -> str:
	    """Collapse redundant whitespace while keeping line structure.

	    Steps, in order:
	      * non-breaking spaces (U+00A0) become ordinary spaces;
	      * CRLF and bare CR line endings become LF;
	      * runs of spaces/tabs become a single space;
	      * three or more consecutive newlines become exactly two;
	      * leading and trailing whitespace is stripped.

	    Args:
	        s: Text to normalise.

	    Returns:
	        The normalised text.
	    """
	    s = s.replace("\u00A0", " ")
	    # Unify line endings first; otherwise "\r\n\r\n\r\n" is never seen as a
	    # run of newlines by the collapse below and stray "\r" leaks through.
	    s = s.replace("\r\n", "\n").replace("\r", "\n")
	    s = re.sub(r"[ \t]+", " ", s)
	    s = re.sub(r"\n{3,}", "\n\n", s)
	    return s.strip()

	def normalize_punctuation(s: str) -> str:
	    """Map typographic punctuation to ASCII and tidy spacing around it.

	    * The ellipsis character becomes "...", curly quotes become straight
	      quotes, and em/en dashes become "-".
	    * Whitespace directly before any of , . ; : ! ? is removed.
	    * A single space is inserted after any of , . ; : ! ? when it is glued
	      to the following text, except where that would damage the text:
	        - between two digits ("3.14", "1,000", "10:30" stay intact);
	        - inside a run of punctuation ("...", "?!" stay intact);
	        - before a closing bracket, or before quote(s) that close a
	          quotation (i.e. quotes followed by whitespace, punctuation,
	          a closing bracket, or the end of the text).

	    Args:
	        s: Text to normalise.

	    Returns:
	        The text with normalised punctuation.
	    """
	    s = s.replace("…", "...")
	    s = s.replace("“", '"').replace("”", '"').replace("’", "'").replace("‘", "'")
	    s = s.replace("—", "-").replace("–", "-")
	    s = re.sub(r"\s+([,.;:!?])", r"\1", s)
	    # Alternative 1 swallows "digit + separator" when another digit follows,
	    # so numeric tokens are copied through untouched.  Alternative 2 is the
	    # generic "punctuation glued to following text" case that gets a space.
	    s = re.sub(
	        r"(\d[.,:](?=\d))"
	        r"|([,.;:!?])(?![\s,.;:!?)\]}]|[\"']+(?:[\s,.;:!?)\]}]|$)|$)",
	        lambda m: m.group(1) or m.group(2) + " ",
	        s,
	    )
	    return s

	def normalize_arabic(s: str) -> str:
	    """Simplify Arabic-script text.

	    Removes the tatweel (U+0640) and every character matched by the
	    module-level ``_AR_DIACRITICS`` pattern, puts a space after the Arabic
	    comma, question mark and semicolon, then collapses every run of
	    whitespace (newlines included) to one space and strips the ends.

	    Args:
	        s: Text to normalise.

	    Returns:
	        The normalised, single-line text.
	    """
	    without_tatweel = s.replace("\u0640", "")
	    bare = _AR_DIACRITICS.sub("", without_tatweel)
	    # Guarantee a gap after Arabic punctuation; any doubled space this
	    # creates is removed by the whitespace collapse below.
	    spaced_marks = (("،", "، "), ("؟", "؟ "), ("؛", "؛ "))
	    for mark, mark_with_gap in spaced_marks:
	        bare = bare.replace(mark, mark_with_gap)
	    return re.sub(r"\s+", " ", bare).strip()

	def clean_text(s: str, lang: str = "en") -> str:
	    """Run the full text clean-up pipeline.

	    Applies ``normalize_whitespace`` and ``normalize_punctuation``, then
	    ``normalize_arabic`` when ``lang`` starts with "ar" (case-insensitive),
	    removes the characters U+200B-U+200F and U+202A-U+202E, and finishes
	    with another ``normalize_whitespace`` pass.

	    Args:
	        s: Raw input text.
	        lang: Language tag; only an "ar" prefix changes behaviour.

	    Returns:
	        The cleaned text.
	    """
	    cleaned = normalize_punctuation(normalize_whitespace(s))
	    is_arabic = lang.lower().startswith("ar")
	    if is_arabic:
	        cleaned = normalize_arabic(cleaned)
	    # Zero-width characters and bidirectional control marks.
	    cleaned = re.sub(r"[\u200B-\u200F\u202A-\u202E]", "", cleaned)
	    return normalize_whitespace(cleaned)

	def split_into_sentences(text: str) -> List[str]:
	    """Split text into sentences.

	    The text is first passed through ``normalize_whitespace`` and then
	    split with the module-level ``_SENT_SPLIT`` pattern.

	    Args:
	        text: Text to split.

	    Returns:
	        The stripped, non-empty pieces in their original order.
	    """
	    sentences = []
	    for piece in _SENT_SPLIT.split(normalize_whitespace(text)):
	        piece = piece.strip()
	        if piece:
	            sentences.append(piece)
	    return sentences

	def hard_wrap_by_words(s: str, max_chars: int) -> List[str]:
	    """Wrap text into lines of at most ``max_chars`` characters.

	    Words (whitespace-separated tokens) are packed greedily, joined by a
	    single space.  A word longer than ``max_chars`` is cut into consecutive
	    ``max_chars``-sized slices; its final, shorter slice may share a line
	    with the words that follow it.

	    Args:
	        s: Text to wrap.
	        max_chars: Maximum length of each returned line.  When it is less
	            than 1 no limit can be honoured and every word is returned as
	            its own line.

	    Returns:
	        The wrapped lines, in order; an empty list for blank input.
	    """
	    words = s.split()
	    if max_chars < 1:
	        return words

	    out: List[str] = []
	    cur: List[str] = []
	    cur_len = 0  # length of " ".join(cur), tracked to avoid re-joining

	    for w in words:
	        # Slice a word that can never fit on one line, whatever precedes it.
	        while len(w) > max_chars:
	            if cur:
	                out.append(" ".join(cur))
	                cur, cur_len = [], 0
	            out.append(w[:max_chars])
	            w = w[max_chars:]
	        if not w:
	            # The word was an exact multiple of max_chars; nothing is left.
	            continue

	        needed = cur_len + 1 + len(w) if cur else len(w)
	        if needed <= max_chars:
	            cur.append(w)
	            cur_len = needed
	        else:
	            out.append(" ".join(cur))
	            cur, cur_len = [w], len(w)

	    if cur:
	        out.append(" ".join(cur))
	    return out

	def chunk_text(text: str, lang: str = "en", max_chars: int = 220, min_chars: int = 20) -> List[str]:
	    """Clean text and split it into chunks of roughly ``max_chars`` characters.

	    Pipeline:
	      1. ``clean_text`` normalises the input for ``lang``.
	      2. ``split_into_sentences`` yields sentences.  A sentence longer than
	         ``max_chars`` is split at clause punctuation (``_SOFT_SPLIT``), and
	         any resulting piece that is still too long is wrapped on word
	         boundaries with ``hard_wrap_by_words``.
	      3. Pieces are packed greedily, joined by one space, while the result
	         stays within ``max_chars``.
	      4. A chunk shorter than ``min_chars`` is appended to the chunk before
	         it.  This step does not re-check the length, so a merged chunk may
	         exceed ``max_chars``; a short *first* chunk is left as is.

	    Args:
	        text: Raw input text.
	        lang: Language tag forwarded to ``clean_text``.
	        max_chars: Target upper bound for the length of a chunk.
	        min_chars: Chunks shorter than this are merged into their predecessor.

	    Returns:
	        The chunks in reading order; an empty list for blank input.
	    """
	    text = clean_text(text, lang=lang)

	    # Stage 1: reduce every sentence to pieces that fit into max_chars.
	    parts: List[str] = []
	    for sent in split_into_sentences(text):
	        if len(sent) <= max_chars:
	            parts.append(sent)
	            continue
	        for piece in _SOFT_SPLIT.split(sent):
	            piece = piece.strip()
	            if not piece:
	                continue
	            if len(piece) <= max_chars:
	                parts.append(piece)
	            else:
	                # A clause on its own can still be too long; wrap it as well
	                # rather than only wrapping sentences that have no clause
	                # punctuation at all.
	                parts.extend(hard_wrap_by_words(piece, max_chars=max_chars))

	    # Stage 2: greedily pack consecutive pieces into chunks.
	    chunks: List[str] = []
	    cur = ""
	    for part in parts:
	        if not cur:
	            cur = part
	        elif len(cur) + 1 + len(part) <= max_chars:
	            cur = cur + " " + part
	        else:
	            chunks.append(cur)
	            cur = part
	    if cur:
	        chunks.append(cur)

	    # Stage 3: fold very short chunks into the preceding one.
	    merged: List[str] = []
	    for ch in chunks:
	        if merged and len(ch) < min_chars:
	            merged[-1] = (merged[-1] + " " + ch).strip()
	        else:
	            merged.append(ch)
	    return merged