File size: 7,062 Bytes
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c78c2c1
6c9b8f1
 
 
 
 
 
 
 
 
c78c2c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c9b8f1
 
c78c2c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c78c2c1
6c9b8f1
 
 
 
 
 
c78c2c1
6c9b8f1
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""
PhilVerify — Text Preprocessor
Handles cleaning, tokenizing, and normalizing Filipino/English/Taglish text.
"""
import re
import string
import unicodedata
from dataclasses import dataclass, field

# ── Filipino + English stopwords ──────────────────────────────────────────────
# Tagalog words treated as stopwords (markers, pronouns, particles,
# conjunctions and other very frequent words). Stored as a mutable set so
# membership tests are O(1).
TAGALOG_STOPWORDS = set(
    """
    ang ng na sa at ay mga ni nang si
    ko mo siya kami kayo sila ito iyon iyan
    dito doon diyan nito noon niyan rin din pa
    lang lamang nga naman kaya pero dahil kung
    kapag habang bilang upang para mula hanggang
    ayon sinabi raw daw ba po ho oh oo
    hindi wala may mayroon talaga pala sana
    """.split()
)

# English words treated as stopwords (articles, prepositions, auxiliaries,
# negations, determiners and pronouns).
ENGLISH_STOPWORDS = set(
    """
    a an the and or but in on at to
    for of with by from is are was were
    be been being have has had do does did
    will would could should may might shall can
    not no nor so yet both either neither
    this that these those it its i me my
    we our you your they their he his she her
    """.split()
)

# Combined lookup set used when filtering mixed Tagalog/English text.
ALL_STOPWORDS = TAGALOG_STOPWORDS.union(ENGLISH_STOPWORDS)

# ── Patterns ──────────────────────────────────────────────────────────────────
_URL_PATTERN = re.compile(
    r"http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)
_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
_MENTION_PATTERN = re.compile(r"@\w+")
_HASHTAG_PATTERN = re.compile(r"#\w+")
_REPEATED_CHAR_PATTERN = re.compile(r"(.)\1{2,}")  # "graaabe" β†’ "grabe"
_EXCESSIVE_PUNCT_PATTERN = re.compile(r"([!?.]){2,}")
_WHITESPACE_PATTERN = re.compile(r"\s+")

# Emoji removal via unicode category
def _remove_emojis(text: str) -> str:
    return "".join(
        ch for ch in text
        if not unicodedata.category(ch).startswith("So")  # Symbol, Other
        and unicodedata.category(ch) not in ("Mn",)       # Modifier letters
    )


@dataclass
class PreprocessResult:
    """Structured output of one TextPreprocessor.preprocess() run.

    Holds the input text together with every intermediate stage of the
    pipeline so callers can pick the representation they need.
    """

    # Input text exactly as passed to preprocess().
    original: str
    # Output of TextPreprocessor.clean() applied to the original text.
    cleaned: str
    # Output of TextPreprocessor.normalize() applied to the cleaned text.
    normalized: str
    # Tokens produced by TextPreprocessor.tokenize() from the normalized text.
    tokens: list[str] = field(default_factory=list)
    # Tokens remaining after TextPreprocessor.remove_stopwords().
    filtered_tokens: list[str] = field(default_factory=list)
    # Lemmatized form of filtered_tokens; preprocess() leaves this empty when
    # the preprocessor was created with lemmatize=False.
    lemmatized_tokens: list[str] = field(default_factory=list)
    # preprocess() sets this to the length of the normalized string.
    char_count: int = 0
    # preprocess() sets this to the number of tokens (before stopword removal).
    word_count: int = 0

class TextPreprocessor:
    """
    Multi-step text cleaner for Tagalog / English / Taglish content.

    Pipeline:
        1. strip_html        — remove HTML tags
        2. strip_urls        — remove hyperlinks
        3. strip_mentions    — remove @user
        4. strip_hashtags    — drop the leading "#" but keep the tag word
        5. strip_emojis      — remove Unicode emoji
        6. lowercase         — normalize case
        7. normalize_chars   — collapse repeated chars, excessive !??
        8. strip_punct       — remove ASCII and Unicode punctuation except
                               the apostrophe
        9. tokenize          — split on whitespace, drop 1-character tokens
       10. remove_stopwords  — drop EN + TL stopwords
       11. lemmatize         — WordNet lemmatization (opt-in, English-biased;
                               Tagalog tokens are returned unchanged)

    Args:
        lemmatize: if True, step 11 is applied and lemmatized_tokens is populated.
                   Off by default — transformer models handle subword tokenization
                   themselves and do not benefit from lemmatization.
    """

    def __init__(self, lemmatize: bool = False):
        self.lemmatize = lemmatize

    def _lemmatize_tokens(self, tokens: list[str]) -> list[str]:
        """
        POS-aware WordNet lemmatization. Downloads NLTK data on first call.
        Falls back to identity on any error (e.g. missing corpus).

        Args:
            tokens: tokens to lemmatize (typically the stopword-filtered list).

        Returns:
            A list of lemmas, one per input token, or the input list itself
            when it is empty or when NLTK / its data cannot be used.
        """
        # Nothing to do: skip the NLTK import and resource checks entirely.
        if not tokens:
            return tokens

        try:
            import nltk
            from nltk.corpus import wordnet
            from nltk.stem import WordNetLemmatizer

            # Make sure the corpus and the tagger model are available,
            # fetching whichever one is missing.
            for resource, path in [
                ("wordnet", "corpora/wordnet"),
                ("averaged_perceptron_tagger_eng", "taggers/averaged_perceptron_tagger_eng"),
            ]:
                try:
                    nltk.data.find(path)
                except LookupError:
                    nltk.download(resource, quiet=True)

            def _wn_pos(tag: str) -> str:
                # Map a tagger tag prefix to a WordNet POS; noun is the default.
                if tag.startswith("J"):
                    return wordnet.ADJ
                if tag.startswith("V"):
                    return wordnet.VERB
                if tag.startswith("R"):
                    return wordnet.ADV
                return wordnet.NOUN

            lemmatizer = WordNetLemmatizer()
            tagged = nltk.pos_tag(tokens)
            return [lemmatizer.lemmatize(w, _wn_pos(t)) for w, t in tagged]
        except Exception:
            # Deliberate best-effort: lemmatization is optional, so any
            # failure (NLTK not installed, data unavailable, ...) degrades to
            # returning the tokens unchanged instead of failing the pipeline.
            return tokens

    def clean(self, text: str) -> str:
        """Steps 1-6: structural cleaning.

        Removes HTML tags, URLs and @mentions, drops the "#" of hashtags,
        removes emoji, lowercases, then collapses whitespace and trims.
        """
        text = _HTML_TAG_PATTERN.sub(" ", text)
        text = _URL_PATTERN.sub(" ", text)
        text = _MENTION_PATTERN.sub(" ", text)
        text = _HASHTAG_PATTERN.sub(lambda m: m.group(0)[1:], text)  # Keep word, drop #
        text = _remove_emojis(text)
        text = text.lower()
        return _WHITESPACE_PATTERN.sub(" ", text).strip()

    @staticmethod
    def _strip_punctuation(text: str) -> str:
        """Replace punctuation with spaces, keeping apostrophes.

        ASCII punctuation (``string.punctuation``) and any character whose
        Unicode general category starts with "P" (curly quotes, dashes,
        ellipsis, inverted marks, ...) become a single space so they cannot
        stay glued to a token. The typographic single quotes U+2018/U+2019
        are rewritten to the ASCII apostrophe so contractions typed with
        either form yield the same token.
        """
        pieces = []
        for ch in text:
            if ch in "'\u2018\u2019":
                pieces.append("'")
            elif ch in string.punctuation or unicodedata.category(ch).startswith("P"):
                pieces.append(" ")
            else:
                pieces.append(ch)
        return "".join(pieces)

    def normalize(self, text: str) -> str:
        """Steps 7-8: character-level normalization.

        Shortens runs matched by the repeated-character pattern to two
        characters, collapses runs of "!", "?" and "." to one mark, strips
        punctuation (apostrophes are kept), then collapses whitespace.
        """
        text = _REPEATED_CHAR_PATTERN.sub(r"\1\1", text)   # "graaabe" → "graabe"
        text = _EXCESSIVE_PUNCT_PATTERN.sub(r"\1", text)   # "!!!" → "!"
        # Keep apostrophes (di, 'di, hindi), remove other punct
        text = self._strip_punctuation(text)
        return _WHITESPACE_PATTERN.sub(" ", text).strip()

    def tokenize(self, text: str) -> list[str]:
        """Step 9: whitespace tokenization; single-character tokens are dropped."""
        return [t for t in text.split() if len(t) > 1]

    def remove_stopwords(self, tokens: list[str]) -> list[str]:
        """Step 10: remove EN + TL stopwords."""
        return [t for t in tokens if t not in ALL_STOPWORDS]

    def preprocess(self, text: str) -> "PreprocessResult":
        """Run the full pipeline and return a structured result.

        Args:
            text: raw input text.

        Returns:
            A PreprocessResult holding every intermediate stage.
            ``lemmatized_tokens`` is an empty list unless the instance was
            created with ``lemmatize=True``; ``char_count`` is the length of
            the normalized text and ``word_count`` the number of tokens
            before stopword removal.
        """
        cleaned = self.clean(text)
        normalized = self.normalize(cleaned)
        tokens = self.tokenize(normalized)
        filtered = self.remove_stopwords(tokens)
        lemmatized = self._lemmatize_tokens(filtered) if self.lemmatize else []
        return PreprocessResult(
            original=text,
            cleaned=cleaned,
            normalized=normalized,
            tokens=tokens,
            filtered_tokens=filtered,
            lemmatized_tokens=lemmatized,
            char_count=len(normalized),
            word_count=len(tokens),
        )