File size: 3,091 Bytes
493b3af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import re
from typing import List

# Code points deleted as diacritics by normalize_arabic: U+0610-U+061A,
# U+064B-U+065F (short-vowel / tanween marks), U+0670 (superscript alef) and
# U+06D6-U+06ED.
_AR_DIACRITICS = re.compile(r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06ED]")

# Sentence boundary: whitespace that follows a terminator ('.', '!', '?',
# Arabic question mark U+061F, Arabic full stop U+06D4), or the position right
# after a newline (plus any whitespace that follows it).
# The terminators are listed unescaped: inside a character class they are
# already literal, and writing "\\." in a raw string would add the backslash
# itself to the set of terminators.
_SENT_SPLIT = re.compile(r"(?<=[.!?\u061F\u06D4])\s+|(?<=\n)\s*")

# Soft (clause) boundary: whitespace that follows ',', ';', ':', the Arabic
# comma (U+060C) or the Arabic semicolon (U+061B). Escapes are used instead of
# literal characters so the pattern survives a wrong source-file encoding.
_SOFT_SPLIT = re.compile(r"(?<=[,;:\u060C\u061B])\s+")

def normalize_whitespace(s: str) -> str:
    """Return *s* with horizontal whitespace and blank-line runs collapsed.

    Non-breaking spaces become ordinary spaces, every run of spaces/tabs is
    reduced to one space, three or more consecutive newlines are reduced to
    two, and leading/trailing whitespace is stripped.
    """
    without_nbsp = s.replace("\u00A0", " ")
    single_spaced = re.sub(r"[ \t]+", " ", without_nbsp)
    # Keep at most one empty line between paragraphs.
    return re.sub(r"\n{3,}", "\n\n", single_spaced).strip()

# Typographic characters replaced with ASCII by normalize_punctuation.
# Written as escapes so the mapping cannot be corrupted by a mis-decoded file.
_TYPOGRAPHIC_TO_ASCII = str.maketrans({
    "\u2026": "...",  # horizontal ellipsis
    "\u201C": '"',    # left double quotation mark
    "\u201D": '"',    # right double quotation mark
    "\u2019": "'",    # right single quotation mark / apostrophe
    "\u2018": "'",    # left single quotation mark
    "\u2014": "-",    # em dash
    "\u2013": "-",    # en dash
})

def normalize_punctuation(s: str) -> str:
    """Return *s* with typographic punctuation ASCII-fied and spacing tidied.

    Steps:
      1. Ellipsis, curly quotes and en/em dashes are replaced by ``...``,
         straight quotes and ``-``.
      2. Whitespace directly before ``, . ; : ! ?`` is removed.
      3. A single space is inserted after ``, . ; : ! ?`` when the next
         character is not whitespace, except
         - inside a run of punctuation (``...`` and ``?!`` stay intact), and
         - between two digits separated by ``,``, ``.`` or ``:``
           (``3.14``, ``1,000`` and ``10:30`` stay intact).
    """
    s = s.translate(_TYPOGRAPHIC_TO_ASCII)
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)
    # (?=...) : only when followed by a non-space, non-punctuation character.
    # (?!...) : skip digit-separator-digit so numbers and times are not split.
    s = re.sub(r"([,.;:!?])(?=[^\s,.;:!?])(?!(?<=\d[,.:])\d)", r"\1 ", s)
    return s

def normalize_arabic(s: str) -> str:
    """Return *s* with Arabic-specific decoration removed and spacing tidied.

    - Tatweel (U+0640) is deleted.
    - Every code point matched by ``_AR_DIACRITICS`` is deleted.
    - A space is inserted after the Arabic comma (U+060C), Arabic semicolon
      (U+061B) and Arabic question mark (U+061F).
    - Every run of whitespace, newlines included, collapses to one space and
      the result is stripped.
    """
    s = s.replace("\u0640", "")            # tatweel
    s = _AR_DIACRITICS.sub("", s)          # diacritics
    # Escapes rather than literal characters: the literals are easily
    # corrupted when the file is saved or read with the wrong encoding.
    s = re.sub(r"[\u060C\u061B\u061F]", r"\g<0> ", s)
    s = re.sub(r"\s+", " ", s)             # also removes the doubled spaces added above
    return s.strip()

def clean_text(s: str, lang: str = "en") -> str:
    """Return a cleaned copy of *s* ready for sentence splitting.

    Args:
        s: Raw input text.
        lang: Language tag. When its lower-cased form starts with ``"ar"``
            the text is additionally passed through ``normalize_arabic``.

    Returns:
        The text with zero-width / bidi control characters
        (U+200B-U+200F, U+202A-U+202E) removed, whitespace normalised and
        punctuation normalised.
    """
    # Strip invisible characters first. They are not whitespace, so a mark
    # sitting between a space and a punctuation sign (common in RTL text)
    # would otherwise stop the punctuation pass from removing that space.
    s = re.sub(r"[\u200B-\u200F\u202A-\u202E]", "", s)  # bidi junk
    s = normalize_whitespace(s)
    s = normalize_punctuation(s)
    if lang.lower().startswith("ar"):
        s = normalize_arabic(s)
    # Final pass: earlier steps may have produced new adjacent spaces.
    return normalize_whitespace(s)

def split_into_sentences(text: str) -> List[str]:
    """Split *text* into sentences using ``_SENT_SPLIT``.

    The text is whitespace-normalised first; each resulting piece is stripped
    and empty pieces are dropped.
    """
    pieces = _SENT_SPLIT.split(normalize_whitespace(text))
    # filter(None, ...) discards the strings that are empty after stripping.
    return list(filter(None, map(str.strip, pieces)))

def hard_wrap_by_words(s: str, max_chars: int) -> List[str]:
    """Wrap *s* into lines of at most ``max_chars`` characters.

    Words (whitespace-separated tokens) are packed greedily, joined by single
    spaces. A word longer than ``max_chars`` is cut into consecutive slices of
    ``max_chars`` characters; its last, shorter slice starts the next line so
    following words may still be appended to it.

    Args:
        s: Text to wrap.
        max_chars: Maximum length of each returned line; must be >= 1.

    Returns:
        The wrapped lines, none empty and none longer than ``max_chars``.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    lines: List[str] = []
    current = ""
    for word in s.split():
        candidate = current + " " + word if current else word
        if len(candidate) <= max_chars:
            current = candidate
            continue
        if current:
            lines.append(current)
        # The word starts a new line. Slice it for as long as it does not fit,
        # so no emitted line can exceed the limit regardless of word length
        # or of its position in the input.
        while len(word) > max_chars:
            lines.append(word[:max_chars])
            word = word[max_chars:]
        current = word
    if current:
        lines.append(current)
    return lines

def chunk_text(text: str, lang: str = "en", max_chars: int = 220, min_chars: int = 20) -> List[str]:
    """Clean *text* and split it into chunks of about ``max_chars`` characters.

    Pipeline:
      1. ``clean_text`` then ``split_into_sentences``.
      2. A sentence longer than ``max_chars`` is split after soft punctuation
         (``_SOFT_SPLIT``); every piece that is still too long is wrapped on
         word boundaries with ``hard_wrap_by_words``.
      3. Pieces are packed greedily, joined by single spaces, into chunks no
         longer than ``max_chars``.
      4. A chunk shorter than ``min_chars`` is appended to the chunk before
         it. This merge is not re-checked against ``max_chars``, so a merged
         chunk may exceed the limit by up to ``min_chars`` characters. The
         first chunk has no predecessor and is kept even when short.

    Args:
        text: Raw input text.
        lang: Language tag forwarded to ``clean_text``.
        max_chars: Target maximum chunk length used while packing.
        min_chars: Chunks shorter than this are merged into the previous one.

    Returns:
        The list of chunks, in input order.
    """
    sentences = split_into_sentences(clean_text(text, lang=lang))

    chunks: List[str] = []
    current = ""
    for sentence in sentences:
        if len(sentence) <= max_chars:
            parts = [sentence]
        else:
            parts = []
            for piece in _SOFT_SPLIT.split(sentence):
                piece = piece.strip()
                if not piece:
                    continue
                if len(piece) > max_chars:
                    # A clause can itself be over-long; soft splitting alone
                    # does not guarantee the limit, so wrap it by words.
                    parts.extend(hard_wrap_by_words(piece, max_chars=max_chars))
                else:
                    parts.append(piece)

        for part in parts:
            if not current:
                current = part
            elif len(current) + 1 + len(part) <= max_chars:  # +1 for the joining space
                current = current + " " + part
            else:
                chunks.append(current)
                current = part

    if current:
        chunks.append(current)

    # Fold undersized chunks into their predecessor.
    merged: List[str] = []
    for chunk in chunks:
        if merged and len(chunk) < min_chars:
            merged[-1] = (merged[-1] + " " + chunk).strip()
        else:
            merged.append(chunk)
    return merged