Spaces:
Running
Running
| import re | |
| from typing import List | |
# Arabic combining marks stripped by normalize_arabic: U+0610-U+061A,
# U+064B-U+065F, U+0670 and U+06D6-U+06ED.
_AR_DIACRITICS = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")

# Sentence boundary: whitespace that follows ". ! ?", the Arabic question mark
# (U+061F) or the Arabic full stop (U+06D4), or the position right after a
# newline.  The ASCII class is written without backslashes so that a literal
# backslash in the text is not mistaken for a sentence terminator.
_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+|(?<=[\u061F\u06D4])\s+|(?<=\n)\s*")

# Clause boundary: whitespace that follows ", ; :" or the Arabic comma (U+060C)
# / Arabic semicolon (U+061B).  The Arabic marks are spelled as escapes so the
# pattern survives any re-encoding of this file.
# NOTE(review): the two Arabic marks were reconstructed from mis-decoded bytes
# in the original literal; confirm against the upstream file.
_SOFT_SPLIT = re.compile(r"(?<=[,;:\u060C\u061B])\s+")
def normalize_whitespace(s: str) -> str:
    """Collapse horizontal whitespace and excess blank lines in *s*.

    - Non-breaking spaces (U+00A0) become ordinary spaces.
    - CRLF / CR line endings become LF.
    - Runs of spaces and tabs collapse to a single space.
    - Spaces adjacent to a newline are dropped, so that lines holding only
      whitespace count as blank lines.
    - Three or more consecutive newlines collapse to exactly two.
    - Leading and trailing whitespace is stripped.
    """
    s = s.replace("\u00A0", " ")
    # Unify line endings first; otherwise "\r" sits between newlines and
    # prevents the blank-line collapse below from matching.
    s = s.replace("\r\n", "\n").replace("\r", "\n")
    s = re.sub(r"[ \t]+", " ", s)
    # After the collapse above at most one space can sit on either side of a
    # newline; remove it so "\n \n \n" is seen as three consecutive newlines.
    s = re.sub(r" ?\n ?", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()
# Typographic punctuation -> ASCII.  Code points are written as escapes so the
# table cannot be corrupted by a re-encoding of this file.
_PUNCT_ASCII = str.maketrans({
    "\u2026": "...",  # horizontal ellipsis
    "\u201C": '"',    # left double quotation mark
    "\u201D": '"',    # right double quotation mark
    "\u2018": "'",    # left single quotation mark
    "\u2019": "'",    # right single quotation mark
    "\u2013": "-",    # en dash
    "\u2014": "-",    # em dash
})

# First alternative: a separator between two digits ("3.14", "1,000", "10:30");
# it is matched only so that it can be left untouched.
# Second alternative: a punctuation mark directly followed by text that is not
# whitespace, further punctuation, or a closing quote/bracket.
_SPACE_AFTER_PUNCT = re.compile(
    r"""(?<=\d)[,.:](?=\d)|([,.;:!?])(?=[^\s,.;:!?"')\]}])"""
)


def normalize_punctuation(s: str) -> str:
    """Map typographic punctuation to ASCII and tidy spacing around marks.

    Steps, in order:

    1. Ellipsis becomes "...", curly quotes become straight quotes, and
       en/em dashes become "-".
    2. Whitespace in front of any of ``, . ; : ! ?`` is removed.
    3. A single space is inserted after one of those marks when text follows
       it directly.  Runs of punctuation ("...", "?!"), separators between
       digits, and marks followed by a closing quote or bracket are left
       alone, so step 1's "..." is not torn apart again.

    Returns the normalized string; surrounding whitespace is not stripped.
    """
    s = s.translate(_PUNCT_ASCII)
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)
    # group(1) is None when the digit-separator alternative matched: keep as is.
    return _SPACE_AFTER_PUNCT.sub(
        lambda m: m.group(0) if m.group(1) is None else m.group(1) + " ",
        s,
    )
def normalize_arabic(s: str) -> str:
    """Apply orthographic normalization for Arabic-script text.

    - Removes tatweel (U+0640).
    - Removes the combining marks matched by the module-level
      ``_AR_DIACRITICS`` pattern.
    - Folds alef with hamza above (U+0623), alef with hamza below (U+0625)
      and alef with madda (U+0622) to bare alef (U+0627).
    - Collapses every whitespace run (including newlines) to one space and
      strips the ends.

    NOTE(review): the alef-folding literals were mis-decoded in the original
    source (three distinct characters mapped to one replacement); the
    mapping above is the reconstruction. Confirm against the upstream file.
    """
    s = s.replace("\u0640", "")  # tatweel
    s = _AR_DIACRITICS.sub("", s)  # diacritics
    # Escapes instead of literal Arabic letters keep this immune to the file
    # being re-encoded.
    for alef_variant in ("\u0623", "\u0625", "\u0622"):
        s = s.replace(alef_variant, "\u0627")
    s = re.sub(r"\s+", " ", s)
    return s.strip()
def clean_text(s: str, lang: str = "en") -> str:
    """Run the full cleaning pipeline on *s* and return the result.

    Order of operations: whitespace normalization, punctuation
    normalization, Arabic normalization (only when *lang*, compared
    case-insensitively, starts with "ar"), removal of the characters
    U+200B-U+200F and U+202A-U+202E, then a final whitespace normalization.
    """
    cleaned = normalize_punctuation(normalize_whitespace(s))

    wants_arabic = lang.lower().startswith("ar")
    if wants_arabic:
        cleaned = normalize_arabic(cleaned)

    # Zero-width and bidirectional formatting characters.
    cleaned = re.sub(r"[\u200B-\u200F\u202A-\u202E]", "", cleaned)
    return normalize_whitespace(cleaned)
def split_into_sentences(text: str) -> List[str]:
    """Split *text* into sentences.

    The text is whitespace-normalized, cut wherever the module-level
    ``_SENT_SPLIT`` pattern matches, and each piece is stripped. Pieces
    that are empty after stripping are dropped.
    """
    sentences = []
    for piece in _SENT_SPLIT.split(normalize_whitespace(text)):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
def hard_wrap_by_words(s: str, max_chars: int) -> List[str]:
    """Wrap *s* into lines of at most *max_chars* characters.

    Words (whitespace-separated tokens) are packed greedily, joined by single
    spaces. A word longer than *max_chars* is cut into consecutive slices of
    *max_chars* characters, wherever it occurs in the input; its final,
    shorter slice may share a line with the words that follow.

    Args:
        s: Text to wrap; all whitespace is treated as a word separator.
        max_chars: Maximum length of each returned line. For values below 1
            no line length can be honoured, so each word is returned as its
            own line.

    Returns:
        The wrapped lines, none empty. An empty or whitespace-only *s* gives
        an empty list.
    """
    words = s.split()
    if max_chars < 1:
        return words

    lines: List[str] = []
    current: List[str] = []
    current_len = 0  # length of " ".join(current), tracked to avoid re-joining

    for word in words:
        # A word that fits yields a single slice (itself).
        for start in range(0, len(word), max_chars):
            piece = word[start:start + max_chars]
            needed = current_len + 1 + len(piece) if current else len(piece)
            if current and needed > max_chars:
                lines.append(" ".join(current))
                current = []
                needed = len(piece)
            current.append(piece)
            current_len = needed

    if current:
        lines.append(" ".join(current))
    return lines
def chunk_text(text: str, lang: str = "en", max_chars: int = 220, min_chars: int = 20) -> List[str]:
    """Clean *text* and split it into chunks of roughly *max_chars* characters.

    Pipeline:

    1. ``clean_text`` then ``split_into_sentences``.
    2. A sentence longer than *max_chars* is cut at the module-level
       ``_SOFT_SPLIT`` boundaries; any resulting clause that is still too long
       is passed to ``hard_wrap_by_words``.
    3. The resulting parts are packed greedily, in order, into chunks whose
       length (parts joined by one space) does not exceed *max_chars*.
    4. A chunk shorter than *min_chars* is appended to the chunk before it.
       The first chunk is never merged, and a merged chunk may end up longer
       than *max_chars*.

    Args:
        text: Raw input text.
        lang: Language tag forwarded to ``clean_text``.
        max_chars: Target upper bound for a chunk's length.
        min_chars: Chunks shorter than this are merged into their predecessor.

    Returns:
        The list of chunks in reading order; empty if the cleaned text is empty.
    """
    sentences = split_into_sentences(clean_text(text, lang=lang))

    # Step 2: make every part fit on its own before packing.
    parts: List[str] = []
    for sentence in sentences:
        if len(sentence) <= max_chars:
            parts.append(sentence)
            continue
        for clause in _SOFT_SPLIT.split(sentence):
            clause = clause.strip()
            if not clause:
                continue
            if len(clause) <= max_chars:
                parts.append(clause)
            else:
                # Covers both "no soft boundary at all" and "a clause between
                # soft boundaries is itself too long".
                parts.extend(hard_wrap_by_words(clause, max_chars=max_chars))

    # Step 3: greedy packing.
    chunks: List[str] = []
    current = ""
    for part in parts:
        if not current:
            current = part
        elif len(current) + 1 + len(part) <= max_chars:
            current = current + " " + part
        else:
            chunks.append(current)
            current = part
    if current.strip():
        chunks.append(current.strip())

    # Step 4: fold very short chunks into the preceding one.
    merged: List[str] = []
    for chunk in chunks:
        if merged and len(chunk) < min_chars:
            merged[-1] = (merged[-1] + " " + chunk).strip()
        else:
            merged.append(chunk)
    return merged