"""
HTML Cleaner for HelpScout message bodies.
Strategy:
1. Remove blockquotes (quoted previous email threads).
2. Remove Gmail/Outlook quoted-reply wrappers (ex-gmail_extra, gmail_quote, etc.).
3. Remove HelpScout / marketing email boilerplate sections.
4. Extract plain text from the remaining DOM.
5. Strip invisible Unicode spacers (\\u200c, \\u00ad, etc.) and collapse whitespace.
"""
import re
import unicodedata

from bs4 import BeautifulSoup, Comment, Declaration, Doctype, ProcessingInstruction
# CSS class / id fragments that indicate quoted / boilerplate content.
# Entries must be lowercase: they are matched by substring against the
# lowercased class and id attributes of each element, so a mixed-case entry
# could never match.
_QUOTED_CLASS_PATTERNS = [
    "gmail_extra",
    "gmail_quote",
    "ex-gmail",
    "yahoo_quoted",
    "moz-cite-prefix",
    "outlookmessageheader",
    "protonmail_quote",
    "apple-mail-previous",
]

# Markers that indicate the start of a quoted section (text-based heuristics).
_QUOTE_TEXT_MARKERS = [
    r"On .{5,80} wrote:",  # "On Mar 2, 2026 ... wrote:"
    r"From:\s",
    r"Sent:\s",
    r"To:\s.*\nCc:",
    # "> quoted" lines (plain-text fallback). Anchored to the start of a line
    # so that an inline ">" (e.g. "Settings > Billing") is not mistaken for
    # the beginning of quoted history.
    r"(?m)^[ \t]*>+",
]
_COMPILED_QUOTE_MARKERS = [re.compile(p, re.IGNORECASE) for p in _QUOTE_TEXT_MARKERS]

# Tags whose entire sub-tree we drop unconditionally.
_DROP_TAGS = {"script", "style", "head", "meta", "link", "img", "table"}

# Invisible / zero-width spacer characters that are deleted outright.
# U+00A0 (no-break space) is deliberately not listed: it separates words, so
# deleting it would glue them together. NFKC normalization turns it into a
# regular space instead.
_INVISIBLE_CHARS = re.compile(
    r"[\u00ad\u200b\u200c\u200d\u2060\ufeff\u034f]"
)

# Collapse multiple blank lines to one.
_MULTI_BLANK = re.compile(r"\n{3,}")
def _remove_quoted_sections(soup: BeautifulSoup) -> None:
    """Remove DOM nodes that represent quoted/threaded email history.

    Three passes are made over the tree, all modifying ``soup`` in place:

    1. every ``<blockquote>`` is removed;
    2. every element whose ``class`` or ``id`` contains one of the
       ``_QUOTED_CLASS_PATTERNS`` fragments (case-insensitively) is removed;
    3. every HTML comment is removed.

    Args:
        soup: Parsed document to prune.
    """
    # 1. All <blockquote> tags.
    for tag in soup.find_all("blockquote"):
        tag.decompose()

    # 2. Elements with known quoted-reply class / id fragments.
    # The attribute text is lowercased below, so the patterns are lowercased
    # too; otherwise a mixed-case pattern such as "OutlookMessageHeader"
    # could never match.
    patterns = [pattern.lower() for pattern in _QUOTED_CLASS_PATTERNS]

    # find_all() returns a snapshot, so it can still contain descendants of
    # a node that was decomposed earlier in this loop. Such nodes report
    # ``attrs`` as None and are skipped.
    for tag in soup.find_all(True):
        if tag.attrs is None:
            continue
        css_classes = " ".join(tag.get("class") or []).lower()
        tag_id = (tag.get("id") or "").lower()
        combined = css_classes + " " + tag_id
        if any(pattern in combined for pattern in patterns):
            tag.decompose()

    # 3. HTML comments (<!-- --> contain no user text).
    for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
        comment.extract()
def _remove_boilerplate(soup: BeautifulSoup) -> None:
    """Prune layout-only tags and short footer-like elements from ``soup``.

    The tree is modified in place in two passes:

    1. every tag named in ``_DROP_TAGS`` is removed with its whole sub-tree;
    2. every remaining element whose lowercased text is shorter than 300
       characters and contains a footer keyword is removed.

    Args:
        soup: Parsed document to prune.
    """
    # Pass 1: heavy layout / non-text tags go away unconditionally.
    layout_nodes = soup.find_all(_DROP_TAGS)
    for node in layout_nodes:
        node.decompose()

    # Pass 2: short elements that mention an unsubscribe / footer keyword.
    # NOTE(review): the length test applies to every element, outer wrappers
    # included, so a short message that merely mentions a keyword is removed
    # as a whole - confirm this is the intended behavior.
    footer_markers = ("unsubscribe", "musora media", "31265 wheel", "customeriomail")
    for node in soup.find_all(True):
        # Nodes already decomposed along with an ancestor report attrs=None.
        if node.attrs is not None:
            flattened = node.get_text(separator=" ", strip=True).lower()
            if len(flattened) < 300 and any(marker in flattened for marker in footer_markers):
                node.decompose()
def _extract_text(soup: BeautifulSoup) -> str:
    """Get plain text from the cleaned soup, preserving line breaks.

    Walks every descendant of ``soup`` in document order. Each non-empty
    text node contributes its stripped text, and each block-level tag
    (``br``, ``p``, ``div``, ``li``, ``h1``-``h3``) contributes a newline
    marker at the point where the tag starts. All pieces are joined with a
    single space; the extra spaces around newlines are expected to be
    tidied by the caller.

    Args:
        soup: Parsed (and already pruned) document.

    Returns:
        The concatenated text with newline markers.
    """
    line_break_tags = {"br", "p", "div", "li", "h1", "h2", "h3"}
    # These node types are str subclasses in Beautiful Soup but carry markup
    # metadata (e.g. the contents of <!DOCTYPE ...>), not message text.
    non_text_nodes = (Comment, Declaration, Doctype, ProcessingInstruction)

    lines = []
    # .descendants is the supported spelling of the deprecated
    # recursiveChildGenerator() alias.
    for element in soup.descendants:
        if isinstance(element, str):
            if isinstance(element, non_text_nodes):
                continue
            stripped = element.strip()
            if stripped:
                lines.append(stripped)
        elif getattr(element, "name", None) in line_break_tags:
            lines.append("\n")
    return " ".join(lines)
def _clean_text(raw: str) -> str:
    """Final text cleanup: invisible chars, excessive whitespace, quote markers.

    Steps, in order:

    1. turn no-break spaces into regular spaces, then delete the invisible
       spacer characters matched by ``_INVISIBLE_CHARS``;
    2. apply NFKC Unicode normalization;
    3. collapse runs of spaces/tabs and trim spaces around newlines;
    4. drop every line that starts with ``>`` (plain-text quoting);
    5. collapse runs of blank lines;
    6. truncate at a quoted-reply marker from ``_COMPILED_QUOTE_MARKERS``.

    Args:
        raw: Text extracted from the HTML tree.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # A no-break space separates words; deleting it would glue them together
    # ("Hello\u00a0world" -> "Helloworld"), so it becomes a plain space
    # before the invisible-character pass runs.
    text = raw.replace("\u00a0", " ")
    # Remove invisible spacers.
    text = _INVISIBLE_CHARS.sub("", text)
    # Normalize unicode (compatibility forms fold to their plain equivalents).
    text = unicodedata.normalize("NFKC", text)

    # Collapse whitespace sequences (single newlines are intentional).
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" \n", "\n", text)
    text = re.sub(r"\n ", "\n", text)

    # Remove lines that are purely quoted text ("> some text").
    text = "\n".join(
        ln for ln in text.split("\n") if not ln.strip().startswith(">")
    )
    # Collapse blank lines only after the quoted lines are gone: removing a
    # quoted line between two blank lines would otherwise leave a run of
    # three or more newlines behind.
    text = _MULTI_BLANK.sub("\n\n", text)

    # Cut off at a quoted-reply marker. Patterns are tried in list order and
    # the first one whose first match starts past character 20 wins; a
    # marker at the very start of the text is not treated as a cut point.
    for pattern in _COMPILED_QUOTE_MARKERS:
        match = pattern.search(text)
        if match and match.start() > 20:
            text = text[: match.start()].strip()
            break
    return text.strip()
def clean_html(html_body: str) -> str:
    """
    Run the full pipeline: raw HTML in, cleaned plain text out.

    The body is parsed with ``html.parser``, quoted history and boilerplate
    are pruned from the tree, the remaining text is extracted, and the final
    text cleanup is applied.

    Args:
        html_body: Raw HTML string from CONVERSATION_THREADS.BODY
    Returns:
        Cleaned plain text; ``""`` when the input is empty or whitespace only.
    """
    has_content = bool(html_body) and bool(html_body.strip())
    if not has_content:
        return ""

    tree = BeautifulSoup(html_body, "html.parser")
    # Both pruning steps mutate the tree in place, in this order.
    for prune in (_remove_quoted_sections, _remove_boilerplate):
        prune(tree)
    return _clean_text(_extract_text(tree))
def clean_html_series(series):
    """
    Apply ``clean_html`` to every element of a pandas Series.

    Missing values are replaced with ``""`` before cleaning, so they come
    out as empty strings.

    Args:
        series: pd.Series of HTML strings
    Returns:
        pd.Series of cleaned plain text strings
    """
    return series.fillna("").apply(clean_html)