# Spaces: USF00 / TTS_Deploy (Running) - File size: 3,091 Bytes - 493b3af

import re
from typing import List

# Arabic combining marks removed by normalize_arabic(): U+0610-061A,
# U+064B-065F, U+0670 and U+06D6-06ED.
_AR_DIACRITICS = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")
# Sentence boundary: whitespace that follows '.', '!' or '?', or the Arabic
# question mark (U+061F) / Arabic full stop (U+06D4); also the position right
# after a newline (plus any whitespace there). Lookbehinds keep the terminator
# attached to the preceding sentence.
# The Latin class is written without backslashes: inside a raw string "\\"
# denotes a literal backslash, which would wrongly end a sentence.
_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+|(?<=[\u061F\u06D4])\s+|(?<=\n)\s*")
# Clause boundary: whitespace that follows a Latin or Arabic comma, semicolon
# or a colon. Used as a softer split point for over-long sentences.
_SOFT_SPLIT = re.compile(r"(?<=[,;:،؛])\s+")

def normalize_whitespace(s: str) -> str:
    """Return *s* with horizontal whitespace and blank-line runs collapsed.

    Non-breaking spaces (U+00A0) become ordinary spaces, every run of spaces
    and tabs shrinks to one space, three or more consecutive newlines shrink
    to two, and the result is stripped at both ends. Single and double
    newlines are left in place.
    """
    text = s.replace("\u00A0", " ")
    # Order matters only in that NBSPs must already be plain spaces here.
    for pattern, replacement in ((r"[ \t]+", " "), (r"\n{3,}", "\n\n")):
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Typographic characters mapped to their plain ASCII equivalents.
_PUNCT_ASCII = str.maketrans({
    "…": "...",
    "“": '"',
    "”": '"',
    "’": "'",
    "‘": "'",
    "—": "-",
    "–": "-",
})

# First alternative: '.', ',' or ':' between two digits (3.14, 1,000, 10:30);
# matched only so it can be left untouched. Second alternative: a mark that is
# glued to the next token, excluding a following punctuation mark, closing
# bracket or quote so that "...", "?!" and '."' stay intact.
_GLUED_PUNCT = re.compile(
    r"(?<=\d)[.,:](?=\d)|([,.;:!?])(?=[^\s,.;:!?\"')\]}])"
)


def normalize_punctuation(s: str) -> str:
    """Return *s* with ASCII punctuation and regular spacing around it.

    Steps, in order:

    1. Ellipsis, curly quotes and en/em dashes become ``...``, straight
       quotes and ``-``.
    2. Whitespace directly before ``, . ; : ! ?`` is removed.
    3. A single space is inserted after one of those marks when it is
       immediately followed by another token.

    Step 3 deliberately skips marks inside punctuation runs (``...``,
    ``?!``), marks followed by a closing quote or bracket, and ``. , :``
    sitting between two digits. Splitting those would corrupt ellipses and
    turn numbers such as ``3.14`` into ``3. 14``, which a sentence splitter
    then treats as a sentence end.
    """
    s = s.translate(_PUNCT_ASCII)
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)

    def _space_after(match):
        mark = match.group(1)
        # group(1) is unset for the digit-separator alternative: keep as is.
        return match.group(0) if mark is None else mark + " "

    return _GLUED_PUNCT.sub(_space_after, s)

def normalize_arabic(s: str) -> str:
    """Return *s* with Arabic-specific noise removed.

    Drops the tatweel (U+0640) and every character matched by
    ``_AR_DIACRITICS``, puts a space after the Arabic comma, question mark
    and semicolon, then collapses all whitespace runs (newlines included)
    to a single space and strips the ends.
    """
    stripped = _AR_DIACRITICS.sub("", s.replace("\u0640", ""))
    # The extra space may double up with an existing one; the collapse below
    # takes care of that.
    for mark, spaced in (("،", "، "), ("؟", "؟ "), ("؛", "؛ ")):
        stripped = stripped.replace(mark, spaced)
    return re.sub(r"\s+", " ", stripped).strip()

def clean_text(s: str, lang: str = "en") -> str:
    """Run the full normalization pipeline on *s*.

    Whitespace and punctuation are normalized first. When *lang* starts
    with ``"ar"`` (case-insensitive) the Arabic normalization is applied as
    well. Finally the characters U+200B-U+200F and U+202A-U+202E are
    removed and whitespace is normalized once more.
    """
    cleaned = normalize_punctuation(normalize_whitespace(s))
    wants_arabic = lang.lower().startswith("ar")
    if wants_arabic:
        cleaned = normalize_arabic(cleaned)
    # Zero-width and directional formatting characters.
    cleaned = re.sub(r"[\u200B-\u200F\u202A-\u202E]", "", cleaned)
    # Removing those characters can leave fresh whitespace runs behind.
    return normalize_whitespace(cleaned)

def split_into_sentences(text: str) -> List[str]:
    """Split *text* into stripped, non-empty sentences.

    The text is whitespace-normalized and then cut at every ``_SENT_SPLIT``
    match; pieces that are empty after stripping are dropped.
    """
    pieces = _SENT_SPLIT.split(normalize_whitespace(text))
    # filter(None, ...) discards the empty strings left after stripping.
    return list(filter(None, map(str.strip, pieces)))

def hard_wrap_by_words(s: str, max_chars: int) -> List[str]:
    """Greedily wrap *s* into lines of at most *max_chars* characters.

    Words (whitespace-separated tokens) are packed onto a line, joined by
    single spaces, for as long as the line stays within *max_chars*. A word
    that is itself longer than *max_chars* is cut into consecutive slices of
    *max_chars* characters, wherever it appears in the input; its last
    slice starts the next line.

    Args:
        s: Text to wrap. Existing whitespace is not preserved.
        max_chars: Maximum line length. For values below 1 no limit can be
            honoured, so each word is returned as its own line.

    Returns:
        The wrapped lines, in order; empty when *s* has no words.
    """
    words = s.split()
    if max_chars < 1:
        # Slicing by a non-positive width would never make progress.
        return words

    lines: List[str] = []
    current = ""
    for word in words:
        candidate = current + " " + word if current else word
        if len(candidate) <= max_chars:
            current = candidate
            continue

        if current:
            lines.append(current)
        # Emit full-width slices until the remainder fits on a line.
        while len(word) > max_chars:
            lines.append(word[:max_chars])
            word = word[max_chars:]
        current = word

    if current:
        lines.append(current)
    return lines

def chunk_text(text: str, lang: str = "en", max_chars: int = 220, min_chars: int = 20) -> List[str]:
    """Clean *text* and split it into chunks of roughly *max_chars* characters.

    The text is cleaned with ``clean_text`` and split into sentences. A
    sentence longer than *max_chars* is split at clause punctuation
    (``_SOFT_SPLIT``), and any resulting fragment that is still too long is
    wrapped on word boundaries with ``hard_wrap_by_words``. The pieces are
    then packed greedily, joined by single spaces, without exceeding
    *max_chars*.

    Args:
        text: Raw input text.
        lang: Language code forwarded to ``clean_text``.
        max_chars: Target upper bound for the length of a packed chunk.
        min_chars: Chunks shorter than this (other than the first) are
            appended to the previous chunk.

    Returns:
        The chunks in reading order. Because short chunks are merged
        backwards after packing, a returned chunk may be longer than
        *max_chars*.
    """
    text = clean_text(text, lang=lang)

    # Step 1: reduce every sentence to pieces that fit in a chunk.
    pieces: List[str] = []
    for sent in split_into_sentences(text):
        if len(sent) <= max_chars:
            pieces.append(sent)
            continue
        for fragment in _SOFT_SPLIT.split(sent):
            fragment = fragment.strip()
            if not fragment:
                continue
            if len(fragment) > max_chars:
                # A clause can still be too long on its own; wrap each such
                # fragment, not only sentences without any clause break.
                pieces.extend(hard_wrap_by_words(fragment, max_chars=max_chars))
            else:
                pieces.append(fragment)

    # Step 2: pack pieces greedily; the "+ 1" accounts for the joining space.
    chunks: List[str] = []
    cur = ""
    for part in pieces:
        if not cur:
            cur = part
        elif len(cur) + 1 + len(part) <= max_chars:
            cur = cur + " " + part
        else:
            chunks.append(cur)
            cur = part
    if cur:
        chunks.append(cur)

    # Step 3: fold very short chunks into their predecessor.
    merged: List[str] = []
    for ch in chunks:
        if merged and len(ch) < min_chars:
            merged[-1] = (merged[-1] + " " + ch).strip()
        else:
            merged.append(ch)
    return merged