Spaces:

MusoraProductDepartment
/

Sentiment_analysis

Sleeping

File size: 5,499 Bytes

58db664

"""
HTML Cleaner for HelpScout message bodies.

Strategy:
  1. Remove blockquotes (quoted previous email threads).
  2. Remove Gmail/Outlook quoted-reply wrappers (ex-gmail_extra, gmail_quote, etc.).
  3. Remove HelpScout / marketing email boilerplate sections.
  4. Extract plain text from the remaining DOM.
  5. Strip invisible Unicode spacers (\\u200c, \\u00ad, etc.) and collapse whitespace.
"""

import re
import unicodedata
from bs4 import BeautifulSoup, Comment

# CSS class / id fragments that indicate quoted / boilerplate content.
# NOTE: every entry must be lowercase. They are substring-matched against the
# lowercased "class" + "id" text of each tag, so a mixed-case entry can never
# match.
_QUOTED_CLASS_PATTERNS = [
    "gmail_extra",
    "gmail_quote",
    "ex-gmail",
    "yahoo_quoted",
    "moz-cite-prefix",
    "outlookmessageheader",
    "protonmail_quote",
    "apple-mail-previous",
]

# Markers that indicate the start of a quoted section (text-based heuristics)
_QUOTE_TEXT_MARKERS = [
    r"On .{5,80} wrote:",          # "On Mar 2, 2026 ... wrote:"
    r"From:\s",
    r"Sent:\s",
    r"To:\s.*\nCc:",
    # "> quoted" lines (plain text fallback). Anchored to the start of a line
    # so that an ordinary ">" inside a sentence ("Settings > Account", "->")
    # is not mistaken for the beginning of a quoted reply.
    r"(?m)^[ \t]*>+",
]

_COMPILED_QUOTE_MARKERS = [re.compile(p, re.IGNORECASE) for p in _QUOTE_TEXT_MARKERS]

# Tags whose entire sub-tree we drop unconditionally
_DROP_TAGS = {"script", "style", "head", "meta", "link", "img", "table"}

# Invisible / spacer Unicode characters that are deleted outright.
# U+00A0 (no-break space) is deliberately NOT listed: it separates words, so
# deleting it would glue them together. The NFKC normalization step in
# _clean_text converts it to a regular space instead.
_INVISIBLE_CHARS = re.compile(
    r"[\u00ad\u200b\u200c\u200d\u2060\ufeff\u034f]"
)

# Collapse multiple blank lines to one
_MULTI_BLANK = re.compile(r"\n{3,}")


def _remove_quoted_sections(soup: BeautifulSoup) -> None:
    """Strip quoted / threaded email history from *soup* in place.

    Three passes are made over the tree:
      1. every <blockquote> is decomposed;
      2. every tag whose lowercased ``class`` + ``id`` text contains one of
         the _QUOTED_CLASS_PATTERNS fragments is decomposed;
      3. every HTML comment node is extracted.
    """
    # Pass 1: blockquotes.
    for quoted in soup.find_all("blockquote"):
        quoted.decompose()

    # Pass 2: known quoted-reply wrappers, identified by class / id.
    # The tag list is materialized up front, so it can still contain
    # descendants of a tag decomposed earlier in this loop; those report
    # ``attrs`` as None and are skipped.
    for node in soup.find_all(True):
        if node.attrs is None:
            continue
        class_text = " ".join(node.get("class") or []).lower()
        id_text = (node.get("id") or "").lower()
        haystack = class_text + " " + id_text
        for fragment in _QUOTED_CLASS_PATTERNS:
            if fragment in haystack:
                node.decompose()
                break

    # Pass 3: HTML comments never carry user-visible text.
    for note in soup.find_all(string=lambda s: isinstance(s, Comment)):
        note.extract()


def _remove_boilerplate(soup: BeautifulSoup) -> None:
    """Strip layout tags and short footer / unsubscribe elements in place.

    First every tag named in _DROP_TAGS is decomposed. Then any remaining tag
    whose flattened, lowercased text is shorter than 300 characters and
    contains one of the footer keywords is decomposed.

    NOTE(review): tags are visited in the order find_all() returns them, so a
    container is tested before its descendants. A short container that merely
    contains a keyword is therefore removed whole - confirm this is intended.
    """
    # Layout / non-text tags go first, together with everything inside them.
    for layout_node in soup.find_all(_DROP_TAGS):
        layout_node.decompose()

    footer_keywords = ["unsubscribe", "musora media", "31265 wheel", "customeriomail"]
    for node in soup.find_all(True):
        # Skip descendants of a tag that was already decomposed in this loop.
        if node.attrs is None:
            continue
        flattened = node.get_text(separator=" ", strip=True).lower()
        # Long elements are kept even if they mention a keyword.
        if len(flattened) >= 300:
            continue
        if any(keyword in flattened for keyword in footer_keywords):
            node.decompose()


def _extract_text(soup: BeautifulSoup) -> str:
    """Get plain text from the cleaned soup, preserving line breaks.

    Walks every descendant of *soup* in document order. Each non-empty text
    node contributes its stripped text; each <br>, <p>, <div>, <li>, <h1>,
    <h2> or <h3> tag contributes a "\\n" at the position where the tag opens.

    Markup-only string nodes (doctype, declarations, processing instructions
    and comments) are skipped. They are ``str`` subclasses in BeautifulSoup,
    so without this check "<!DOCTYPE html>" would leak the word "html" into
    the extracted message.

    Args:
        soup: Parsed document, already stripped of quoted / boilerplate nodes.

    Returns:
        The collected fragments joined with single spaces.
    """
    # Imported locally so this function does not depend on which bs4 names
    # the module header happens to import.
    from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

    markup_only = (Comment, Declaration, Doctype, ProcessingInstruction)
    line_break_tags = {"br", "p", "div", "li", "h1", "h2", "h3"}

    pieces = []
    # .descendants is the supported name for the deprecated
    # recursiveChildGenerator() alias.
    for node in soup.descendants:
        if isinstance(node, str):
            if isinstance(node, markup_only):
                continue
            stripped = node.strip()
            if stripped:
                pieces.append(stripped)
        elif getattr(node, "name", None) in line_break_tags:
            pieces.append("\n")
    return " ".join(pieces)


def _clean_text(raw: str) -> str:
    """Final text cleanup: invisible chars, excessive whitespace, quote markers.

    Steps, in order:
      1. delete the characters matched by _INVISIBLE_CHARS;
      2. apply NFKC Unicode normalization;
      3. collapse runs of spaces / tabs and trim spaces around newlines;
      4. drop every line that starts with ">" (plain-text quoting);
      5. collapse runs of blank lines left behind by the previous steps;
      6. truncate at the first quoted-reply marker that matches more than
         20 characters into the text.

    Args:
        raw: Text produced by _extract_text.

    Returns:
        The cleaned text with leading / trailing whitespace stripped.
    """
    # Remove invisible spacers
    text = _INVISIBLE_CHARS.sub("", raw)

    # Normalize unicode (compatibility forms -> canonical equivalents)
    text = unicodedata.normalize("NFKC", text)

    # Collapse horizontal whitespace and trim it around newlines
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" \n", "\n", text)
    text = re.sub(r"\n ", "\n", text)

    # Remove lines that are purely quote markers ("> some text")
    kept_lines = [ln for ln in text.split("\n") if not ln.strip().startswith(">")]
    text = "\n".join(kept_lines)

    # Collapse blank-line runs only AFTER the quote lines are gone; otherwise
    # each removed line can leave three or more consecutive newlines behind.
    text = _MULTI_BLANK.sub("\n\n", text)

    # Cut off at the first quoted-reply marker. Patterns are tried in list
    # order and only the first occurrence of each is considered.
    for pattern in _COMPILED_QUOTE_MARKERS:
        match = pattern.search(text)
        if match and match.start() > 20:   # don't cut if marker is at very start
            text = text[: match.start()].strip()
            break

    return text.strip()


def clean_html(html_body: str) -> str:
    """
    Convert a raw HTML message body into cleaned plain text.

    The body is parsed with the "html.parser" backend, quoted sections and
    boilerplate are removed from the tree, the remaining text is extracted
    and finally normalized.

    Args:
        html_body: Raw HTML string from CONVERSATION_THREADS.BODY

    Returns:
        The cleaned plain-text string; "" when the input is empty or
        whitespace-only.
    """
    # Nothing to parse: missing, empty, or whitespace-only input.
    if not html_body:
        return ""
    if not html_body.strip():
        return ""

    soup = BeautifulSoup(html_body, "html.parser")

    # Both helpers mutate the tree in place.
    _remove_quoted_sections(soup)
    _remove_boilerplate(soup)

    return _clean_text(_extract_text(soup))


def clean_html_series(series):
    """
    Run clean_html over every element of a pandas Series.

    Missing values are replaced with "" before cleaning, so they come back
    as empty strings.

    Args:
        series: pd.Series of HTML strings

    Returns:
        pd.Series of cleaned plain text strings
    """
    without_nulls = series.fillna("")
    return without_nulls.apply(clean_html)