| """ |
| HTML Cleaner for HelpScout message bodies. |
| |
| Strategy: |
| 1. Remove blockquotes (quoted previous email threads). |
| 2. Remove Gmail/Outlook quoted-reply wrappers (ex-gmail_extra, gmail_quote, etc.). |
| 3. Remove HelpScout / marketing email boilerplate sections. |
| 4. Extract plain text from the remaining DOM. |
| 5. Strip invisible Unicode spacers (\\u200c, \\u00ad, etc.) and collapse whitespace. |
| """ |
|
|
| import re |
| import unicodedata |
| from bs4 import BeautifulSoup, Comment |
|
|
| |
# Substrings that mark an element as a quoted-reply container when they occur
# in its ``class`` or ``id`` attribute (Gmail, Yahoo, Thunderbird, Outlook,
# ProtonMail and Apple Mail wrappers).
# Entries are kept lower-case because the attribute text they are compared
# against is lower-cased before the substring test.
_QUOTED_CLASS_PATTERNS = [
    "gmail_extra",
    "gmail_quote",
    "ex-gmail",
    "yahoo_quoted",
    "moz-cite-prefix",
    "outlookmessageheader",
    "protonmail_quote",
    "apple-mail-previous",
]
|
|
| |
# Regex sources for text that introduces quoted history inside the extracted
# plain text: reply attribution lines ("On ... wrote:"), forwarded-header
# fields ("From:", "Sent:", "To: ... / Cc:") and ">" quote prefixes.
#
# The ">" marker is anchored to the start of a line (inline MULTILINE flag).
# A bare ">" is ordinary punctuation ("Settings > Billing", "->"), and an
# unanchored pattern would make the caller cut the message at the first such
# character.
_QUOTE_TEXT_MARKERS = [
    r"On .{5,80} wrote:",
    r"From:\s",
    r"Sent:\s",
    r"To:\s.*\nCc:",
    r"(?m)^[ \t]*>+",
]


# Case-insensitive compiled form of each marker, in the same order.
_COMPILED_QUOTE_MARKERS = [re.compile(p, re.IGNORECASE) for p in _QUOTE_TEXT_MARKERS]
|
|
| |
# Tag names that are removed wholesale, together with everything inside them,
# before any text is extracted.
_DROP_TAGS = {"script", "style", "head", "meta", "link", "img", "table"}


# Zero-width / formatting code points that are deleted outright:
# soft hyphen, zero-width space / non-joiner / joiner, word joiner,
# BOM / zero-width no-break space, and combining grapheme joiner.
# U+00A0 (no-break space) is intentionally NOT in this class: it is real
# whitespace, and deleting it would glue the neighbouring words together.
# NFKC normalisation in `_clean_text` maps it to a regular space instead.
_INVISIBLE_CHARS = re.compile(
    r"[\u00ad\u200b\u200c\u200d\u2060\ufeff\u034f]"
)


# Three or more consecutive newlines, i.e. more than one blank line in a row.
_MULTI_BLANK = re.compile(r"\n{3,}")
|
|
|
|
def _remove_quoted_sections(soup: BeautifulSoup) -> None:
    """Strip quoted / threaded email history from *soup* in place.

    Three passes are made over the tree:

    1. every ``blockquote`` element is destroyed;
    2. every element whose ``class`` or ``id`` attribute contains one of
       ``_QUOTED_CLASS_PATTERNS`` (compared case-insensitively) is destroyed;
    3. every HTML comment node is detached from the tree.

    Args:
        soup: Parsed message body. It is modified in place.
    """
    # ``find_all`` returns tags in document order, so an ancestor always
    # precedes its descendants. Destroying in reverse order handles nested
    # quotes without ever touching a tag that an earlier ``decompose()``
    # call has already torn down.
    for quote in reversed(soup.find_all("blockquote")):
        quote.decompose()

    # The attribute text is lower-cased below, so the patterns are lower-cased
    # too; otherwise a mixed-case pattern could never match.
    needles = [pattern.lower() for pattern in _QUOTED_CLASS_PATTERNS]

    def _is_quote_wrapper(tag) -> bool:
        """Return True when the tag's class/id text contains a quote pattern."""
        classes = " ".join(tag.get("class") or []).lower()
        ident = (tag.get("id") or "").lower()
        haystack = classes + " " + ident
        return any(needle in haystack for needle in needles)

    # Decide on the intact tree first, then remove deepest-first (see above).
    wrappers = [tag for tag in soup.find_all(True) if _is_quote_wrapper(tag)]
    for wrapper in reversed(wrappers):
        wrapper.decompose()

    # Comments are strings, not tags, so they need their own pass.
    for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
        comment.extract()
|
|
|
|
def _remove_boilerplate(soup: BeautifulSoup) -> None:
    """Remove non-content tags and short footer-like elements from *soup* in place.

    First every tag named in ``_DROP_TAGS`` is destroyed together with its
    contents. Then every remaining element whose own text (lower-cased,
    whitespace-stripped) is shorter than 300 characters and contains one of
    the footer keywords is destroyed.

    Args:
        soup: Parsed message body. It is modified in place.
    """
    # ``find_all`` yields document order (ancestors before descendants).
    # Removing in reverse means nested matches, such as a table inside a table
    # or ``meta`` inside ``head``, are destroyed innermost-first, so no
    # already-destroyed tag is ever touched again.
    for tag in reversed(soup.find_all(list(_DROP_TAGS))):
        tag.decompose()

    footer_keywords = ("unsubscribe", "musora media", "31265 wheel", "customeriomail")

    def _looks_like_footer(tag) -> bool:
        """Return True for a short element whose text mentions a footer keyword."""
        text = tag.get_text(separator=" ", strip=True).lower()
        return len(text) < 300 and any(kw in text for kw in footer_keywords)

    # NOTE(review): ancestors are tested as well as leaves, so when the whole
    # body is under 300 characters and mentions a keyword (for example a
    # customer writing "please unsubscribe me"), the outermost wrapper matches
    # and the entire message is removed. Confirm this is intended.
    footers = [tag for tag in soup.find_all(True) if _looks_like_footer(tag)]
    for tag in reversed(footers):
        tag.decompose()
|
|
|
|
def _extract_text(soup: BeautifulSoup) -> str:
    """Flatten *soup* into a single string of its visible text.

    Every text node is whitespace-stripped and kept if non-empty. A ``"\\n"``
    marker is emitted when a ``br``, ``p``, ``div``, ``li``, ``h1``, ``h2`` or
    ``h3`` tag is entered. All pieces are joined with single spaces, so the
    result still needs whitespace normalisation by the caller.

    Args:
        soup: Parsed (and already pruned) message body.

    Returns:
        The space-joined text fragments and newline markers.
    """
    # Function-scope import so this block does not depend on which bs4 names
    # the module header happens to import.
    from bs4.element import Comment, Declaration, Doctype, ProcessingInstruction

    # These node types subclass ``str`` but carry markup, not message text:
    # ``<!DOCTYPE ...>``, ``<?xml ... ?>``, ``<!-- ... -->``. Without this
    # filter their content would leak into the output.
    markup_only = (Comment, Declaration, Doctype, ProcessingInstruction)
    line_break_tags = {"br", "p", "div", "li", "h1", "h2", "h3"}

    pieces = []
    # ``descendants`` is the current name for the deprecated
    # ``recursiveChildGenerator()`` and walks the tree in document order.
    for node in soup.descendants:
        if isinstance(node, str):
            if isinstance(node, markup_only):
                continue
            fragment = node.strip()
            if fragment:
                pieces.append(fragment)
        elif getattr(node, "name", None) in line_break_tags:
            pieces.append("\n")
    return " ".join(pieces)
|
|
|
|
def _clean_text(raw: str) -> str:
    """Normalise extracted text and cut off trailing quoted history.

    Steps, in order:

    1. turn no-break spaces (U+00A0) into regular spaces, then delete the
       characters matched by ``_INVISIBLE_CHARS``;
    2. apply Unicode NFKC normalisation;
    3. collapse runs of spaces/tabs and trim spaces next to newlines;
    4. drop every line that starts with ``>``;
    5. collapse runs of three or more newlines to a single blank line;
    6. truncate at the first quote marker found (see below).

    Args:
        raw: Text produced by ``_extract_text``.

    Returns:
        The cleaned text, stripped of leading and trailing whitespace.
    """
    # A no-break space is real whitespace. Deleting it with the zero-width
    # characters would glue words together ("Hello&nbsp;John" -> "HelloJohn").
    text = raw.replace("\u00a0", " ")
    text = _INVISIBLE_CHARS.sub("", text)

    text = unicodedata.normalize("NFKC", text)

    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" \n", "\n", text)
    text = re.sub(r"\n ", "\n", text)

    kept = [line for line in text.split("\n") if not line.strip().startswith(">")]
    text = "\n".join(kept)

    # Collapse blank lines only after the ">" lines are gone, because removing
    # them can itself create new runs of empty lines.
    text = _MULTI_BLANK.sub("\n\n", text)

    # Markers are tried in list order. The first one whose first match starts
    # beyond character 20 wins and everything from there on is discarded.
    # Earlier matches are ignored, presumably so that a message which itself
    # opens with such text is not emptied. TODO confirm the threshold's intent.
    for marker in _COMPILED_QUOTE_MARKERS:
        hit = marker.search(text)
        if hit and hit.start() > 20:
            text = text[: hit.start()].strip()
            break

    return text.strip()
|
|
|
|
def clean_html(html_body: str) -> str:
    """Run the full cleaning pipeline on one HTML message body.

    The body is parsed with the ``html.parser`` backend, quoted history and
    boilerplate are pruned from the tree, the remaining text is extracted,
    and that text is normalised.

    Args:
        html_body: Raw HTML string (for example the ``BODY`` column of
            ``CONVERSATION_THREADS``).

    Returns:
        The cleaned plain text, or ``""`` when the input is empty, ``None``
        or whitespace only.
    """
    is_blank = not html_body or not html_body.strip()
    if is_blank:
        return ""

    soup = BeautifulSoup(html_body, "html.parser")

    # Both passes mutate the tree in place; quoted history goes first.
    for prune in (_remove_quoted_sections, _remove_boilerplate):
        prune(soup)

    return _clean_text(_extract_text(soup))
|
|
|
|
def clean_html_series(series):
    """Clean every HTML body in a pandas Series.

    Missing values are replaced with ``""`` first, then ``clean_html`` is
    applied to each element.

    Args:
        series: pd.Series of HTML strings.

    Returns:
        pd.Series of cleaned plain-text strings, one per input element.
    """
    without_nulls = series.fillna("")
    return without_nulls.apply(clean_html)