# Spaces: SemiAutomat1c / philverify-api (Running)
# File size: 7,062 Bytes

"""
PhilVerify — Text Preprocessor
Handles cleaning, tokenizing, and normalizing Filipino/English/Taglish text.
"""
import re
import string
import unicodedata
from dataclasses import dataclass, field

# ── Filipino + English stopwords ──────────────────────────────────────────────
# Each vocabulary is written as one whitespace-separated word list and split
# into a mutable set at import time; the words themselves are all lowercase.
TAGALOG_STOPWORDS = set(
    """
    ang ng na sa at ay mga ni nang si
    ko mo siya kami kayo sila ito iyon iyan
    dito doon diyan nito noon niyan rin din pa
    lang lamang nga naman kaya pero dahil kung
    kapag habang bilang upang para mula hanggang
    ayon sinabi raw daw ba po ho oh oo
    hindi wala may mayroon talaga pala sana
    """.split()
)

ENGLISH_STOPWORDS = set(
    """
    a an the and or but in on at to
    for of with by from is are was were
    be been being have has had do does did
    will would could should may might shall can
    not no nor so yet both either neither
    this that these those it its i me my
    we our you your they their he his she her
    """.split()
)

# Combined vocabulary: a word present in either language list is a stopword.
ALL_STOPWORDS = TAGALOG_STOPWORDS.union(ENGLISH_STOPWORDS)

# ── Patterns ──────────────────────────────────────────────────────────────────
# A URL is the scheme plus everything up to the next whitespace, angle bracket
# or double quote, so the path, query string and fragment are removed together
# with the host. Case-insensitive because URL stripping runs before lowercasing.
_URL_PATTERN = re.compile(r"https?://[^\s<>\"]+", re.IGNORECASE)
# Any "<...>" span with at least one character between the brackets.
_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
# "@" followed by word characters (user handles).
_MENTION_PATTERN = re.compile(r"@\w+")
# "#" followed by word characters; the "#" is included in the match.
_HASHTAG_PATTERN = re.compile(r"#\w+")
# Three or more of the same character in a row, e.g. the "aaa" in "graaabe".
# Digits are excluded so that numbers such as "1000000" are never shortened.
_REPEATED_CHAR_PATTERN = re.compile(r"([^\d\n])\1{2,}")
# Runs of two or more of "!", "?" or "."; group 1 holds the last one in the run.
_EXCESSIVE_PUNCT_PATTERN = re.compile(r"([!?.]){2,}")
# One or more whitespace characters.
_WHITESPACE_PATTERN = re.compile(r"\s+")

# Emoji removal via unicode category
# Code points that only modify or glue emoji but are not in category "So":
# ZERO WIDTH JOINER (category Cf) and the five Fitzpatrick skin-tone
# modifiers U+1F3FB..U+1F3FF (category Sk).
_EMOJI_COMPANIONS = frozenset(
    "\u200d\U0001F3FB\U0001F3FC\U0001F3FD\U0001F3FE\U0001F3FF"
)


def _remove_emojis(text: str) -> str:
    """
    Return *text* with emoji and their invisible companions removed.

    A character is dropped when it is:
      * in Unicode category "So" (Symbol, other), which covers pictographic
        emoji and other symbols in that category;
      * in Unicode category "Mn" (Mark, nonspacing), which covers the emoji
        variation selector U+FE0F. Combining accents are also "Mn", so
        decomposed accented letters lose their accent, while precomposed
        letters such as "ñ" are untouched;
      * a zero width joiner or a skin-tone modifier, which would otherwise be
        left behind once the emoji they belong to has been removed.
    """
    return "".join(
        ch
        for ch in text
        if ch not in _EMOJI_COMPANIONS
        and unicodedata.category(ch) not in ("So", "Mn")
    )


@dataclass
class PreprocessResult:
    """Container for the text and token lists produced by one pipeline run."""

    # Required text fields, one per text-level stage.
    original: str      # input text
    cleaned: str       # text after the cleaning stage
    normalized: str    # text after the normalization stage

    # Token lists; each instance gets its own fresh empty list by default.
    tokens: list[str] = field(default_factory=list)
    filtered_tokens: list[str] = field(default_factory=list)
    lemmatized_tokens: list[str] = field(default_factory=list)

    # Size counters, zero unless supplied.
    char_count: int = 0
    word_count: int = 0


class TextPreprocessor:
    """
    Multi-step text cleaner for Tagalog / English / Taglish content.

    Pipeline:
        1. strip_html        — remove HTML tags
        2. strip_urls        — remove hyperlinks
        3. strip_mentions    — remove @user
        4. strip_hashtags    — drop the leading "#", keep the tag word
        5. strip_emojis      — remove Unicode emoji
        6. lowercase         — normalize case
        7. normalize_chars   — collapse repeated chars, excessive !??
        8. strip_punct       — remove ASCII and Unicode punctuation except
                               apostrophe
        9. tokenize          — split on whitespace, drop 1-character tokens
       10. remove_stopwords  — drop EN + TL stopwords
       11. lemmatize         — WordNet lemmatization (opt-in, English-biased;
                               Tagalog tokens are returned unchanged)

    Args:
        lemmatize: if True, step 11 is applied and lemmatized_tokens is populated.
                   Off by default — transformer models handle subword tokenization
                   themselves and do not benefit from lemmatization.
    """

    # Typographic single quotes are folded to the ASCII apostrophe so that
    # contractions such as ’di are kept the same way as 'di.
    _APOSTROPHE_FOLD = str.maketrans({"\u2018": "'", "\u2019": "'"})

    def __init__(self, lemmatize: bool = False):
        self.lemmatize = lemmatize

    def _lemmatize_tokens(self, tokens: list[str]) -> list[str]:
        """
        POS-aware WordNet lemmatization.

        Looks up the required NLTK resources and downloads any that are
        missing, tags the tokens, then lemmatizes each one using the WordNet
        part of speech mapped from its tag (noun when the tag is not an
        adjective, verb or adverb tag).

        Returns:
            The lemmatized tokens. Falls back to the input tokens unchanged
            on any error (e.g. NLTK not installed, missing corpus).
        """
        if not tokens:
            # Nothing to lemmatize: skip the NLTK import and resource checks.
            return []
        try:
            import nltk
            from nltk.corpus import wordnet
            from nltk.stem import WordNetLemmatizer

            for resource, path in [
                ("wordnet", "corpora/wordnet"),
                ("averaged_perceptron_tagger_eng", "taggers/averaged_perceptron_tagger_eng"),
            ]:
                try:
                    nltk.data.find(path)
                except LookupError:
                    nltk.download(resource, quiet=True)

            def _wn_pos(tag: str) -> str:
                # Map a tagger tag to the WordNet POS constant by first letter.
                if tag.startswith("J"):
                    return wordnet.ADJ
                if tag.startswith("V"):
                    return wordnet.VERB
                if tag.startswith("R"):
                    return wordnet.ADV
                return wordnet.NOUN

            lemmatizer = WordNetLemmatizer()
            tagged = nltk.pos_tag(tokens)
            return [lemmatizer.lemmatize(w, _wn_pos(t)) for w, t in tagged]
        except Exception:
            # Deliberate best-effort: lemmatization is optional, so any
            # failure degrades to returning the tokens as they came in.
            return tokens

    def clean(self, text: str) -> str:
        """Steps 1-6: structural cleaning; returns lowercased, whitespace-collapsed text."""
        text = _HTML_TAG_PATTERN.sub(" ", text)
        text = _URL_PATTERN.sub(" ", text)
        text = _MENTION_PATTERN.sub(" ", text)
        text = _HASHTAG_PATTERN.sub(lambda m: m.group(0)[1:], text)  # Keep word, drop #
        text = _remove_emojis(text)
        text = text.lower()
        return _WHITESPACE_PATTERN.sub(" ", text).strip()

    @staticmethod
    def _is_strippable_punct(ch: str) -> bool:
        """Return True when *ch* is punctuation that step 8 replaces with a space."""
        if ch == "'":
            return False
        if ch in string.punctuation:
            return True
        # Typographic quotes, dashes, ellipsis, etc. are not in
        # string.punctuation but belong to a Unicode "P*" category.
        return not ch.isascii() and unicodedata.category(ch).startswith("P")

    def normalize(self, text: str) -> str:
        """
        Steps 7-8: character-level normalization.

        Collapses repeated characters and excessive punctuation, folds curly
        single quotes to the ASCII apostrophe, replaces every other
        punctuation character (ASCII or Unicode) with a space, then collapses
        whitespace.
        """
        text = _REPEATED_CHAR_PATTERN.sub(r"\1\1", text)   # "graaabe" → "graabe"
        text = _EXCESSIVE_PUNCT_PATTERN.sub(r"\1", text)   # "!!!" → "!"
        text = text.translate(self._APOSTROPHE_FOLD)
        # Keep apostrophes (di, 'di, hindi), remove other punct
        text = "".join(
            " " if self._is_strippable_punct(ch) else ch
            for ch in text
        )
        return _WHITESPACE_PATTERN.sub(" ", text).strip()

    def tokenize(self, text: str) -> list[str]:
        """Step 9: whitespace tokenization; tokens of a single character are dropped."""
        return [t for t in text.split() if len(t) > 1]

    def remove_stopwords(self, tokens: list[str]) -> list[str]:
        """Step 10: remove EN + TL stopwords."""
        return [t for t in tokens if t not in ALL_STOPWORDS]

    def preprocess(self, text: str) -> "PreprocessResult":
        """
        Run the full pipeline and return a structured result.

        lemmatized_tokens is populated only when the instance was created
        with lemmatize=True; otherwise it is an empty list. char_count is the
        length of the normalized text and word_count the number of tokens
        before stopword removal.
        """
        cleaned = self.clean(text)
        normalized = self.normalize(cleaned)
        tokens = self.tokenize(normalized)
        filtered = self.remove_stopwords(tokens)
        lemmatized = self._lemmatize_tokens(filtered) if self.lemmatize else []
        return PreprocessResult(
            original=text,
            cleaned=cleaned,
            normalized=normalized,
            tokens=tokens,
            filtered_tokens=filtered,
            lemmatized_tokens=lemmatized,
            char_count=len(normalized),
            word_count=len(tokens),
        )