"""
HTML cleaner for HelpScout message bodies.

Strategy:
  1. Remove blockquotes (quoted previous email threads).
  2. Remove Gmail/Outlook quoted-reply wrappers (ex-gmail_extra, gmail_quote, etc.).
  3. Remove HelpScout / marketing email boilerplate sections.
  4. Extract plain text from the remaining DOM.
  5. Strip invisible Unicode spacers (\\u200c, \\u00ad, etc.) and collapse whitespace.
"""

import re
import unicodedata

from bs4 import BeautifulSoup, Comment, Declaration, Doctype, ProcessingInstruction

# CSS class / id fragments that indicate quoted / boilerplate content
_QUOTED_CLASS_PATTERNS = [
    "gmail_extra",
    "gmail_quote",
    "ex-gmail",
    "yahoo_quoted",
    "moz-cite-prefix",
    "OutlookMessageHeader",
    "protonmail_quote",
    "apple-mail-previous",
]

# Class/id text is lowercased before matching, so the fragments must be
# lowercased too; otherwise mixed-case entries such as "OutlookMessageHeader"
# can never match.
_QUOTED_CLASS_PATTERNS_LOWER = tuple(p.lower() for p in _QUOTED_CLASS_PATTERNS)

# Markers that indicate the start of a quoted section (text-based heuristics).
# Order matters: _clean_text cuts at the first pattern *in this list* that
# matches, not at the earliest position in the text.
_QUOTE_TEXT_MARKERS = [
    r"\bOn .{5,80} wrote:",  # "On Mar 2, 2026 ... wrote:"; \b keeps "question ..." from matching
    r"From:\s",
    r"Sent:\s",
    r"To:\s.*\nCc:",
    r"^[ \t]*>+",            # > quoted lines (plain text fallback), line start only
]
_COMPILED_QUOTE_MARKERS = [
    re.compile(p, re.IGNORECASE | re.MULTILINE) for p in _QUOTE_TEXT_MARKERS
]

# Tags whose entire sub-tree we drop unconditionally
_DROP_TAGS = {"script", "style", "head", "meta", "link", "img", "table"}

# Tags that introduce a line break in the extracted text
_LINE_BREAK_TAGS = frozenset({"br", "p", "div", "li", "h1", "h2", "h3"})

# String node types that are markup, not message text
_NON_TEXT_STRINGS = (Comment, Declaration, Doctype, ProcessingInstruction)

# Invisible / spacer Unicode characters that are deleted outright.
# U+00A0 (no-break space) is deliberately NOT listed: deleting it would glue
# the neighbouring words together. NFKC normalization in _clean_text turns it
# into a regular space instead.
_INVISIBLE_CHARS = re.compile(
    r"[\u00ad\u200b\u200c\u200d\u2060\ufeff\u034f]"
)

# Collapse multiple blank lines to one
_MULTI_BLANK = re.compile(r"\n{3,}")


def _remove_quoted_sections(soup: BeautifulSoup) -> None:
    """Remove DOM nodes that represent quoted/threaded email history.

    Mutates ``soup`` in place: drops every ``<blockquote>``, every element
    whose class or id contains one of ``_QUOTED_CLASS_PATTERNS``
    (case-insensitive substring match), and every HTML comment.
    """
    # 1. All <blockquote> tags
    for tag in soup.find_all("blockquote"):
        if tag.attrs is None:
            # Nested inside a blockquote that was already decomposed
            continue
        tag.decompose()

    # 2. Divs / spans with known quoted-reply class names.
    # The result list is collected up front; decompose() wipes the state of
    # child nodes that may still appear later in the list, so each candidate
    # is checked before use.
    for tag in soup.find_all(True):
        if tag.attrs is None:
            # Already decomposed (child of a previously decomposed parent)
            continue
        css_classes = " ".join(tag.get("class") or []).lower()
        tag_id = (tag.get("id") or "").lower()
        combined = css_classes + " " + tag_id
        if any(pattern in combined for pattern in _QUOTED_CLASS_PATTERNS_LOWER):
            tag.decompose()

    # 3. HTML comments (<!-- ... --> contain no user text)
    for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
        comment.extract()


def _remove_boilerplate(soup: BeautifulSoup) -> None:
    """Remove marketing / footer / unsubscribe sections.

    Mutates ``soup`` in place: drops every tag listed in ``_DROP_TAGS``, then
    drops any element whose text is shorter than 300 characters and contains
    one of the footer keywords.
    """
    # Drop heavy layout tags entirely (tables, images carry no message text)
    for tag in soup.find_all(list(_DROP_TAGS)):
        tag.decompose()

    # Drop any element whose text is an unsubscribe / footer line.
    # NOTE(review): elements are visited in document order, so an outer
    # container is tested before its children. A short message (< 300 chars)
    # that merely mentions a keyword such as "unsubscribe" is removed as a
    # whole, including any customer text it contains. Confirm this is
    # acceptable.
    footer_keywords = ["unsubscribe", "musora media", "31265 wheel", "customeriomail"]
    for tag in soup.find_all(True):
        if tag.attrs is None:
            # Already decomposed together with an ancestor
            continue
        text = tag.get_text(separator=" ", strip=True).lower()
        if len(text) < 300 and any(kw in text for kw in footer_keywords):
            tag.decompose()


def _extract_text(soup: BeautifulSoup) -> str:
    """Get plain text from the cleaned soup, preserving line breaks.

    Walks the tree in document order. Every non-empty text node contributes
    its stripped text; every tag in ``_LINE_BREAK_TAGS`` contributes a
    newline. The pieces are joined with single spaces, so newlines come out
    surrounded by spaces; ``_clean_text`` tidies that up.

    Returns:
        The concatenated text, not yet whitespace-normalized.
    """
    pieces = []
    for node in soup.descendants:
        if isinstance(node, str):
            # Doctype, declarations, processing instructions and comments are
            # string nodes too, but they are markup, not message text.
            if isinstance(node, _NON_TEXT_STRINGS):
                continue
            stripped = node.strip()
            if stripped:
                pieces.append(stripped)
        elif getattr(node, "name", None) in _LINE_BREAK_TAGS:
            pieces.append("\n")
    return " ".join(pieces)


def _clean_text(raw: str) -> str:
    """Final text cleanup: invisible chars, excessive whitespace, quote markers.

    Args:
        raw: Text produced by ``_extract_text``.

    Returns:
        The text with invisible spacers removed, NFKC-normalized, whitespace
        collapsed, ">"-quoted lines dropped, and everything from the first
        matching quote marker onward cut off.
    """
    # Remove invisible spacers
    text = _INVISIBLE_CHARS.sub("", raw)

    # Normalize unicode (compatibility forms; also maps no-break and other
    # special spaces to a plain space so they are collapsed below)
    text = unicodedata.normalize("NFKC", text)

    # Collapse whitespace sequences (single newlines are kept intentionally)
    text = re.sub(r"[ \t]+", " ", text)
    text = text.replace(" \n", "\n")
    text = text.replace("\n ", "\n")
    text = _MULTI_BLANK.sub("\n\n", text)

    # Remove lines that are quoted text ("> some text")
    text = "\n".join(
        ln for ln in text.split("\n") if not ln.strip().startswith(">")
    )

    # Cut off at the first "On ... wrote:"-style marker (inline quoted replies).
    # Patterns are tried in list order; the first one that matches past the
    # opening 20 characters wins.
    for pattern in _COMPILED_QUOTE_MARKERS:
        match = pattern.search(text)
        if match and match.start() > 20:  # don't cut if marker is at very start
            text = text[: match.start()].strip()
            break

    return text.strip()


def clean_html(html_body: str) -> str:
    """
    Full pipeline: HTML → clean plain text containing only the customer's message.

    Args:
        html_body: Raw HTML string from CONVERSATION_THREADS.BODY

    Returns:
        Clean plain text string; "" when the input is empty or whitespace only.
    """
    if not html_body or not html_body.strip():
        return ""

    soup = BeautifulSoup(html_body, "html.parser")
    _remove_quoted_sections(soup)
    _remove_boilerplate(soup)
    raw_text = _extract_text(soup)
    return _clean_text(raw_text)


def clean_html_series(series):
    """
    Apply ``clean_html`` to every element of a pandas Series.

    Missing values are replaced with "" before cleaning.

    Args:
        series: pd.Series of HTML strings

    Returns:
        pd.Series of cleaned plain text strings
    """
    return series.fillna("").apply(clean_html)