Spaces:
Running
Running
File size: 3,091 Bytes
493b3af | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | import re
from typing import List
# Arabic combining marks removed during normalization: U+0610-U+061A,
# U+064B-U+065F, U+0670 and U+06D6-U+06ED.
_AR_DIACRITICS = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")

# Sentence boundaries: whitespace that follows Latin terminal punctuation
# (. ! ?), whitespace that follows the Arabic question mark (U+061F) or the
# Arabic full stop (U+06D4), or the position right after a newline.
# The Latin class is written without backslashes: inside a raw string
# "[\\.]" would also match a literal backslash.
_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+|(?<=[\u061F\u06D4])\s+|(?<=\n)\s*")

# Clause boundaries: whitespace that follows a comma, semicolon or colon,
# including the Arabic comma (U+060C) and Arabic semicolon (U+061B).
# Non-ASCII characters are spelled as escapes so the pattern survives any
# re-encoding of this source file.
_SOFT_SPLIT = re.compile(r"(?<=[,;:\u060C\u061B])\s+")
def normalize_whitespace(s: str) -> str:
    """Return *s* with horizontal whitespace and blank-line runs collapsed.

    No-break spaces (U+00A0) become ordinary spaces, every run of spaces
    and tabs is reduced to one space, three or more consecutive newlines
    are reduced to two, and the result is stripped at both ends.
    """
    without_nbsp = " ".join(s.split("\u00A0"))
    single_spaced = re.sub(r"[ \t]+", " ", without_nbsp)
    compact = re.sub(r"\n{3,}", "\n\n", single_spaced)
    return compact.strip()
def normalize_punctuation(s: str) -> str:
    """Convert typographic punctuation to ASCII and tidy spacing around it.

    Steps, in order:

    1. Map the ellipsis character to ``...``, curly double/single quotes to
       straight quotes, and en/em dashes to ``-``.
    2. Remove whitespace that precedes ``, . ; : ! ?``.
    3. Insert one space after ``, . ; : ! ?`` when the next character is
       glued to it, except when that would damage the text:
       inside a punctuation run (``...``, ``?!``), before a quote or a
       closing bracket (``."``, ``!)``), or between two digits
       (``3.14``, ``1,000``, ``10:30``).

    Args:
        s: Text to normalize.

    Returns:
        The normalized text.
    """
    # The typographic characters are spelled as escapes so they cannot be
    # corrupted by a re-encoding of this source file.
    ascii_equivalents = str.maketrans({
        "\u2026": "...",  # horizontal ellipsis
        "\u201C": '"',    # left double quotation mark
        "\u201D": '"',    # right double quotation mark
        "\u2018": "'",    # left single quotation mark
        "\u2019": "'",    # right single quotation mark
        "\u2013": "-",    # en dash
        "\u2014": "-",    # em dash
    })
    s = s.translate(ascii_equivalents)

    # "word ," -> "word,"
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)

    # Group 1 matches a digit plus a separator that sits between two digits
    # and is returned untouched; group 2 is punctuation that needs a space.
    s = re.sub(
        r"(\d[,.:](?=\d))|([,.;:!?])(?=[^\s,.;:!?\"')\]}])",
        lambda m: m.group(1) or m.group(2) + " ",
        s,
    )
    return s
def normalize_arabic(s: str) -> str:
    """Strip decorative Arabic marks and normalize spacing after punctuation.

    Removes the tatweel (U+0640) and every character matched by
    ``_AR_DIACRITICS``, makes sure the Arabic comma (U+060C), semicolon
    (U+061B) and question mark (U+061F) are followed by a space, then
    collapses every whitespace run (newlines included) to a single space
    and strips the result.

    Args:
        s: Text to normalize.

    Returns:
        The normalized, single-line text.
    """
    s = s.replace("\u0640", "")  # tatweel (kashida) elongation character
    s = _AR_DIACRITICS.sub("", s)  # diacritics
    # Escapes instead of literal characters: the literals do not survive a
    # wrong-codec round trip of this file and then all three replacements
    # silently target the same wrong character.
    for mark in ("\u060C", "\u061B", "\u061F"):
        s = s.replace(mark, mark + " ")
    # Also removes any double space introduced just above.
    s = re.sub(r"\s+", " ", s)
    return s.strip()
def clean_text(s: str, lang: str = "en") -> str:
    """Run the full normalization pipeline on *s*.

    Whitespace and punctuation are normalized first; Arabic-specific
    normalization is applied when *lang* starts with ``"ar"`` (compared
    case-insensitively). Afterwards the characters U+200B-U+200F and
    U+202A-U+202E (zero-width and directional formatting characters) are
    removed and whitespace is normalized once more.

    Args:
        s: Raw input text.
        lang: Language code; only an ``"ar"`` prefix changes the behavior.

    Returns:
        The cleaned text.
    """
    cleaned = normalize_punctuation(normalize_whitespace(s))

    wants_arabic = lang.lower().startswith("ar")
    if wants_arabic:
        cleaned = normalize_arabic(cleaned)

    # Drop invisible zero-width / bidirectional control characters.
    visible_only = re.sub(r"[\u200B-\u200F\u202A-\u202E]", "", cleaned)
    return normalize_whitespace(visible_only)
def split_into_sentences(text: str) -> List[str]:
    """Split *text* into sentences at the boundaries of ``_SENT_SPLIT``.

    The text is whitespace-normalized before splitting. Each fragment is
    stripped, and fragments that are empty after stripping are dropped.
    """
    sentences = []
    for fragment in _SENT_SPLIT.split(normalize_whitespace(text)):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
def hard_wrap_by_words(s: str, max_chars: int) -> List[str]:
    """Wrap *s* into lines of at most *max_chars* characters.

    Words (whitespace-separated tokens) are packed greedily, joined by
    single spaces. A word that is longer than *max_chars* is cut into
    consecutive slices of *max_chars* characters wherever it occurs in the
    text, so no returned line exceeds the limit.

    Args:
        s: Text to wrap.
        max_chars: Maximum length of each returned line.

    Returns:
        The wrapped lines, in order. Empty for empty or whitespace-only
        input. If *max_chars* is smaller than 1 the limit cannot be met
        and every word is returned as its own line.
    """
    words = s.split()
    if max_chars < 1:
        # Slicing by a non-positive width would never make progress.
        return words

    lines: List[str] = []
    current = ""
    for word in words:
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= max_chars:
            current = candidate
            continue

        # The word does not fit on the current line: close that line.
        if current:
            lines.append(current)

        # Slice an oversized word until the remainder fits; the remainder
        # starts the next line and may still take further words.
        while len(word) > max_chars:
            lines.append(word[:max_chars])
            word = word[max_chars:]
        current = word

    if current:
        lines.append(current)
    return lines
def chunk_text(text: str, lang: str = "en", max_chars: int = 220, min_chars: int = 20) -> List[str]:
    """Clean *text* and split it into chunks of roughly *max_chars* characters.

    Pipeline:

    1. ``clean_text`` normalizes the input for *lang*.
    2. The text is split into sentences. A sentence longer than
       *max_chars* is split into clauses with ``_SOFT_SPLIT``; every clause
       that is still too long is wrapped with ``hard_wrap_by_words``.
    3. The resulting parts are packed greedily, joined by single spaces,
       into chunks that stay within *max_chars*.
    4. A chunk shorter than *min_chars* is appended to the chunk before it.
       This merge is unconditional, so a merged chunk may exceed
       *max_chars*; the very first chunk has no predecessor and is kept
       even when it is short.

    Args:
        text: Raw input text.
        lang: Language code forwarded to ``clean_text``.
        max_chars: Target maximum chunk length.
        min_chars: Chunks shorter than this are merged into the previous one.

    Returns:
        The chunks in reading order; empty when the cleaned text is empty.
    """
    text = clean_text(text, lang=lang)

    # Step 2: break the text into parts that individually respect max_chars.
    parts: List[str] = []
    for sent in split_into_sentences(text):
        if len(sent) <= max_chars:
            parts.append(sent)
            continue
        for clause in _SOFT_SPLIT.split(sent):
            clause = clause.strip()
            if not clause:
                continue
            if len(clause) > max_chars:
                # A clause can still be too long after the soft split
                # (including the case where no soft boundary was found).
                parts.extend(hard_wrap_by_words(clause, max_chars=max_chars))
            else:
                parts.append(clause)

    # Step 3: greedy packing.
    chunks: List[str] = []
    cur = ""
    for part in parts:
        if not cur:
            cur = part
        elif len(cur) + 1 + len(part) <= max_chars:  # +1 for the joining space
            cur = f"{cur} {part}"
        else:
            chunks.append(cur)
            cur = part
    if cur:
        chunks.append(cur)

    # Step 4: fold undersized chunks into their predecessor.
    merged: List[str] = []
    for ch in chunks:
        if merged and len(ch) < min_chars:
            merged[-1] = f"{merged[-1]} {ch}"
        else:
            merged.append(ch)
    return merged
|