"""
PhilVerify — Text Preprocessor
Handles cleaning, tokenizing, and normalizing Filipino/English/Taglish text.
"""
import re
import string
import unicodedata
from dataclasses import dataclass, field
# ── Filipino + English stopwords ──────────────────────────────────────────────
# Tagalog function words (markers, pronouns, particles, conjunctions) that are
# dropped by the stopword-removal step.
TAGALOG_STOPWORDS = set(
    """
    ang ng na sa at ay mga ni nang si
    ko mo siya kami kayo sila ito iyon iyan
    dito doon diyan nito noon niyan rin din pa
    lang lamang nga naman kaya pero dahil kung
    kapag habang bilang upang para mula hanggang
    ayon sinabi raw daw ba po ho oh oo
    hindi wala may mayroon talaga pala sana
    """.split()
)

# English articles, prepositions, auxiliaries, determiners and pronouns.
ENGLISH_STOPWORDS = set(
    """
    a an the and or but in on at to
    for of with by from is are was were
    be been being have has had do does did
    will would could should may might shall can
    not no nor so yet both either neither
    this that these those it its i me my
    we our you your they their he his she her
    """.split()
)

# Combined lookup set; "at" and "may" occur in both languages.
ALL_STOPWORDS = TAGALOG_STOPWORDS.union(ENGLISH_STOPWORDS)
# ── Patterns ──────────────────────────────────────────────────────────────────
# Absolute http(s) URL: the scheme followed by everything up to the next
# whitespace, angle bracket or quote, so the path, query string and fragment
# are removed together with the host instead of leaking into the tokens.
_URL_PATTERN = re.compile(r"""https?://[^\s<>"']+""", re.IGNORECASE)
# Any "<...>" span is treated as an HTML tag.
_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
# "@user" style mentions.
_MENTION_PATTERN = re.compile(r"@\w+")
# "#tag" style hashtags (the match includes the leading "#").
_HASHTAG_PATTERN = re.compile(r"#\w+")
# The same letter three or more times in a row. Substituting r"\1\1" turns
# "graaabe" into "graabe". Restricted to letters so that figures such as
# "10000" are never rewritten.
_REPEATED_CHAR_PATTERN = re.compile(r"([^\W\d_])\1{2,}")
# Runs of two or more of ! ? . ; group 1 holds the LAST character of the run,
# so substituting r"\1" turns "?!!" into "!".
_EXCESSIVE_PUNCT_PATTERN = re.compile(r"([!?.]){2,}")
# Any run of whitespace (spaces, tabs, newlines).
_WHITESPACE_PATTERN = re.compile(r"\s+")
# Emoji removal via Unicode general category plus emoji-specific code points.
def _remove_emojis(text: str) -> str:
    """Return *text* with emoji and their modifier code points removed.

    A character is dropped when any of the following holds:

    * its general category is ``So`` (Symbol, other), which covers most
      pictographic emoji;
    * its general category is ``Mn`` (Mark, nonspacing), which covers the
      emoji variation selector U+FE0F. Note that this also strips combining
      accents from text in decomposed (NFD) form;
    * it is an emoji skin-tone modifier (U+1F3FB to U+1F3FF, category ``Sk``)
      or the zero-width joiner U+200D used to build emoji sequences. These
      would otherwise be left behind once the base emoji is removed.

    Args:
        text: Input string; may be empty.

    Returns:
        The filtered string. Whitespace is not collapsed here.
    """
    zero_width_joiner = "\u200d"
    dropped_categories = ("So", "Mn")

    def _keep(ch: str) -> bool:
        # Skin-tone modifiers are category "Sk"; match them by range so other
        # "Sk" characters such as "^" and "`" are left alone.
        if ch == zero_width_joiner or 0x1F3FB <= ord(ch) <= 0x1F3FF:
            return False
        return unicodedata.category(ch) not in dropped_categories

    return "".join(ch for ch in text if _keep(ch))
@dataclass
class PreprocessResult:
    """Container for every intermediate product of one preprocessing run.

    The three text fields are required; the token lists default to fresh
    empty lists and the counters default to zero.
    """

    # --- text at successive stages ---
    original: str    # input exactly as received
    cleaned: str     # after structural cleaning
    normalized: str  # after character-level normalization

    # --- token lists (each instance gets its own list objects) ---
    tokens: list[str] = field(default_factory=list)             # all tokens
    filtered_tokens: list[str] = field(default_factory=list)    # stopwords removed
    lemmatized_tokens: list[str] = field(default_factory=list)  # lemmatized tokens

    # --- simple size statistics ---
    char_count: int = 0
    word_count: int = 0
class TextPreprocessor:
    """
    Multi-step text cleaner for Tagalog / English / Taglish content.

    Pipeline:
        1. strip_html       — remove HTML tags
        2. strip_urls       — remove hyperlinks
        3. strip_mentions   — remove @user
        4. strip_hashtags   — drop the "#" but keep the tag word
        5. strip_emojis     — remove Unicode emoji
        6. lowercase        — normalize case
        7. normalize_chars  — collapse repeated chars and runs of ! ? .
        8. strip_punct      — remove punctuation except the apostrophe
        9. tokenize         — split on whitespace
        10. remove_stopwords — drop EN + TL stopwords
        11. lemmatize       — WordNet lemmatization (opt-in, English-biased;
                              Tagalog tokens are returned unchanged)

    Args:
        lemmatize: if True, step 11 is applied and ``lemmatized_tokens`` is
            populated. Off by default — transformer models handle subword
            tokenization themselves and do not benefit from lemmatization.
    """

    # Characters kept as an apostrophe: ASCII "'" plus the typographic single
    # quotes U+2018 / U+2019 that phones and word processors substitute for it.
    _APOSTROPHES = frozenset("'\u2018\u2019")

    def __init__(self, lemmatize: bool = False):
        self.lemmatize = lemmatize

    def _lemmatize_tokens(self, tokens: list[str]) -> list[str]:
        """
        POS-aware WordNet lemmatization.

        NLTK is imported lazily, and the ``wordnet`` and
        ``averaged_perceptron_tagger_eng`` resources are downloaded when they
        cannot be found locally.

        Args:
            tokens: tokens to lemmatize.

        Returns:
            A new list with one lemma per input token. On any error (NLTK not
            installed, missing corpus, failed download) the failure is logged
            and a copy of ``tokens`` is returned unchanged.
        """
        if not tokens:
            # Nothing to do; avoid importing NLTK or touching the network.
            return []
        try:
            import nltk
            from nltk.corpus import wordnet
            from nltk.stem import WordNetLemmatizer

            for resource, path in [
                ("wordnet", "corpora/wordnet"),
                ("averaged_perceptron_tagger_eng", "taggers/averaged_perceptron_tagger_eng"),
            ]:
                try:
                    nltk.data.find(path)
                except LookupError:
                    nltk.download(resource, quiet=True)

            def _wn_pos(tag: str) -> str:
                # Map a Penn Treebank tag prefix to a WordNet POS constant;
                # anything that is not J*/V*/R* is treated as a noun.
                if tag.startswith("J"):
                    return wordnet.ADJ
                if tag.startswith("V"):
                    return wordnet.VERB
                if tag.startswith("R"):
                    return wordnet.ADV
                return wordnet.NOUN

            lemmatizer = WordNetLemmatizer()
            tagged = nltk.pos_tag(tokens)
            return [lemmatizer.lemmatize(w, _wn_pos(t)) for w, t in tagged]
        except Exception:
            # Deliberate best-effort: lemmatization is optional, so degrade to
            # the identity instead of failing the pipeline, but leave a trace.
            import logging

            logging.getLogger(__name__).warning(
                "Lemmatization unavailable; returning tokens unchanged.",
                exc_info=True,
            )
            # Copy so the result never aliases the caller's list.
            return list(tokens)

    def clean(self, text: str) -> str:
        """Steps 1-6: structural cleaning.

        Removes HTML tags, URLs and @mentions (each replaced by a space),
        strips the leading "#" from hashtags, removes emoji, lowercases, and
        finally collapses whitespace and trims both ends.
        """
        text = _HTML_TAG_PATTERN.sub(" ", text)
        text = _URL_PATTERN.sub(" ", text)
        text = _MENTION_PATTERN.sub(" ", text)
        text = _HASHTAG_PATTERN.sub(lambda m: m.group(0)[1:], text)  # Keep word, drop #
        text = _remove_emojis(text)
        text = text.lower()
        return _WHITESPACE_PATTERN.sub(" ", text).strip()

    @classmethod
    def _fold_punct(cls, ch: str) -> str:
        """Map one character for step 8: apostrophes to "'", punctuation to " "."""
        if ch in cls._APOSTROPHES:
            return "'"
        if ch in string.punctuation:
            return " "
        # string.punctuation is ASCII-only; also catch Unicode punctuation
        # (general category P*) such as curly double quotes, dashes, the
        # ellipsis character and the inverted ?/! marks.
        if not ch.isascii() and unicodedata.category(ch).startswith("P"):
            return " "
        return ch

    def normalize(self, text: str) -> str:
        """Steps 7-8: character-level normalization.

        Shortens runs matched by ``_REPEATED_CHAR_PATTERN`` to two characters
        ("graaabe" -> "graabe"), reduces runs of ! ? . to their last character
        ("!!!" -> "!"), replaces every punctuation character except the
        apostrophe with a space, and collapses whitespace.
        """
        text = _REPEATED_CHAR_PATTERN.sub(r"\1\1", text)
        text = _EXCESSIVE_PUNCT_PATTERN.sub(r"\1", text)
        # Keep apostrophes ('di, di'), remove all other punctuation.
        text = "".join(self._fold_punct(ch) for ch in text)
        return _WHITESPACE_PATTERN.sub(" ", text).strip()

    def tokenize(self, text: str) -> list[str]:
        """Step 9: whitespace tokenization; single-character tokens are dropped."""
        return [t for t in text.split() if len(t) > 1]

    def remove_stopwords(self, tokens: list[str]) -> list[str]:
        """Step 10: remove EN + TL stopwords (membership in ``ALL_STOPWORDS``)."""
        return [t for t in tokens if t not in ALL_STOPWORDS]

    def preprocess(self, text: str) -> "PreprocessResult":
        """Run the full pipeline and return a structured result.

        ``lemmatized_tokens`` is an empty list unless the instance was created
        with ``lemmatize=True``. ``char_count`` is the length of the normalized
        text and ``word_count`` is the number of tokens before stopword
        removal.
        """
        cleaned = self.clean(text)
        normalized = self.normalize(cleaned)
        tokens = self.tokenize(normalized)
        filtered = self.remove_stopwords(tokens)
        lemmatized = self._lemmatize_tokens(filtered) if self.lemmatize else []
        return PreprocessResult(
            original=text,
            cleaned=cleaned,
            normalized=normalized,
            tokens=tokens,
            filtered_tokens=filtered,
            lemmatized_tokens=lemmatized,
            char_count=len(normalized),
            word_count=len(tokens),
        )
|