Spaces:

MusoraProductDepartment
/

Sentiment_analysis

Sleeping

App Files Files Community

Sentiment_analysis / process_helpscout /html_cleaner.py

Danialebrat

Adding HelpScout to UI

58db664 about 1 month ago

raw

history blame contribute delete

5.5 kB

	"""
	HTML Cleaner for HelpScout message bodies.

	Strategy:
	1. Remove blockquotes (quoted previous email threads).
	2. Remove Gmail/Outlook quoted-reply wrappers (ex-gmail_extra, gmail_quote, etc.).
	3. Remove HelpScout / marketing email boilerplate sections.
	4. Extract plain text from the remaining DOM.
	5. Strip invisible Unicode spacers (\\u200c, \\u00ad, etc.) and collapse whitespace.
	"""

	import re
	import unicodedata
	from bs4 import BeautifulSoup, Comment

	# CSS class / id fragments that indicate quoted / boilerplate content.
	# Tags are matched by substring against their *lower-cased* class/id string,
	# so every fragment listed here must itself be lower-case or it can never match.
	_QUOTED_CLASS_PATTERNS = [
	    "gmail_extra",
	    "gmail_quote",
	    "ex-gmail",
	    "yahoo_quoted",
	    "moz-cite-prefix",
	    "outlookmessageheader",  # Outlook emits "OutlookMessageHeader"
	    "protonmail_quote",
	    "apple-mail-previous",
	]

	# Markers that indicate the start of a quoted section (text-based heuristics).
	# Every pattern is compiled with IGNORECASE | MULTILINE; only the last pattern
	# uses ``^``, so MULTILINE leaves the behaviour of the others unchanged.
	_QUOTE_TEXT_MARKERS = [
	    r"On .{5,80} wrote:",  # "On Mar 2, 2026 ... wrote:"
	    r"From:\s",
	    r"Sent:\s",
	    r"To:\s.*\nCc:",
	    # "> quoted" lines (plain text fallback). Anchored to the start of a line
	    # so an inline ">" ("Settings > Billing", "->") is not taken for a quote.
	    r"^[ \t]*>+",
	]

	_COMPILED_QUOTE_MARKERS = [
	    re.compile(p, re.IGNORECASE | re.MULTILINE) for p in _QUOTE_TEXT_MARKERS
	]

	# Tags whose entire sub-tree we drop unconditionally
	_DROP_TAGS = {"script", "style", "head", "meta", "link", "img", "table"}

	# Invisible / spacer Unicode characters that are deleted outright:
	# soft hyphen, zero-width space / non-joiner / joiner, word joiner, BOM and
	# combining grapheme joiner.
	# U+00A0 (no-break space) is deliberately NOT listed: it separates words
	# (HTML "&nbsp;"), so deleting it would glue neighbouring words together.
	# NFKC normalisation maps it to an ordinary space instead.
	_INVISIBLE_CHARS = re.compile(
	    r"[\u00ad\u200b\u200c\u200d\u2060\ufeff\u034f]"
	)

	# Three or more consecutive newlines (i.e. two or more blank lines)
	_MULTI_BLANK = re.compile(r"\n{3,}")


	def _remove_quoted_sections(soup: BeautifulSoup) -> None:
	    """Remove DOM nodes that represent quoted/threaded email history.

	    Three passes are made over the tree:

	    1. every ``<blockquote>`` is deleted;
	    2. every tag whose ``class`` or ``id`` contains one of the fragments in
	       ``_QUOTED_CLASS_PATTERNS`` is deleted (case-insensitive substring
	       match);
	    3. HTML comments are detached.

	    Args:
	        soup: Parsed message body. It is modified in place.
	    """
	    # 1. All <blockquote> tags
	    for quote in soup.find_all("blockquote"):
	        quote.decompose()

	    # 2. Divs / spans with known quoted-reply class names.
	    # The tag's class/id string is lower-cased below, so the fragments must be
	    # lower-cased too; otherwise a mixed-case fragment such as
	    # "OutlookMessageHeader" could never match.
	    needles = [fragment.lower() for fragment in _QUOTED_CLASS_PATTERNS]

	    # The candidate list is collected up front. Decomposing a tag also
	    # destroys its descendants, which may still come up later in this list,
	    # hence the ``attrs is None`` guard.
	    for tag in soup.find_all(True):
	        if tag.attrs is None:
	            # Already decomposed (child of a previously decomposed parent)
	            continue
	        css_classes = " ".join(tag.get("class") or []).lower()
	        tag_id = (tag.get("id") or "").lower()
	        haystack = css_classes + " " + tag_id
	        if any(needle in haystack for needle in needles):
	            tag.decompose()

	    # 3. HTML comments (<!-- --> contain no user text)
	    for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
	        comment.extract()


	def _remove_boilerplate(soup: BeautifulSoup) -> None:
	    """Remove marketing / footer / unsubscribe sections from ``soup`` in place.

	    Two passes:

	    1. every tag named in ``_DROP_TAGS`` is deleted together with its sub-tree;
	    2. any remaining element whose text is shorter than 300 characters and
	       contains a footer keyword is deleted.

	    Pass 2 walks the tree in reverse document order, so children are examined
	    before their ancestors. A footer line is therefore removed at the innermost
	    element that contains it. Walking ancestors first would delete the
	    outermost wrapper, and with it an entire short message that merely
	    includes a footer line.

	    Args:
	        soup: Parsed message body. It is modified in place.
	    """
	    # Drop heavy layout tags entirely (tables, images carry no message text)
	    for tag in soup.find_all(list(_DROP_TAGS)):
	        tag.decompose()

	    # Drop the innermost elements whose text is a short unsubscribe / footer line
	    footer_keywords = ("unsubscribe", "musora media", "31265 wheel", "customeriomail")
	    for tag in reversed(soup.find_all(True)):
	        if tag.attrs is None:
	            # Defensive: skip anything that was already decomposed
	            continue
	        # Text is re-read on every visit, so an ancestor is judged on what is
	        # left after its footer children have been removed.
	        text = tag.get_text(separator=" ", strip=True).lower()
	        if len(text) < 300 and any(kw in text for kw in footer_keywords):
	            tag.decompose()


	def _extract_text(soup: BeautifulSoup) -> str:
	    """Get plain text from the cleaned soup, preserving line breaks.

	    Walks every descendant in document order. Non-empty text nodes are
	    collected with surrounding whitespace stripped, and a newline marker is
	    emitted each time a line-breaking tag (``br``, ``p``, ``div``, ``li``,
	    ``h1``-``h3``) is entered.

	    The pieces are joined with single spaces, so the result contains stray
	    spaces next to the newlines; tidying those up is left to the caller.

	    Args:
	        soup: Parsed (and already pruned) message body.

	    Returns:
	        The concatenated text with ``"\\n"`` markers at block boundaries.
	    """
	    line_break_tags = {"br", "p", "div", "li", "h1", "h2", "h3"}
	    pieces = []
	    # ``descendants`` is the supported name; ``recursiveChildGenerator`` is a
	    # deprecated alias for the same iterator.
	    for node in soup.descendants:
	        if isinstance(node, str):
	            stripped = node.strip()
	            if stripped:
	                pieces.append(stripped)
	        elif getattr(node, "name", None) in line_break_tags:
	            pieces.append("\n")
	    return " ".join(pieces)


	def _clean_text(raw: str) -> str:
	    """Final text cleanup: invisible chars, excessive whitespace, quote markers.

	    Steps, in order:

	    1. no-break spaces become ordinary spaces, then the characters matched by
	       ``_INVISIBLE_CHARS`` are deleted;
	    2. the text is NFKC-normalised;
	    3. runs of spaces/tabs collapse to one space and spaces adjacent to a
	       newline are removed;
	    4. lines starting with ``>`` (plain-text quotes) are dropped;
	    5. runs of blank lines collapse to a single blank line;
	    6. the text is cut at a quoted-reply marker, if one is found.

	    Args:
	        raw: Text extracted from the HTML body.

	    Returns:
	        The cleaned text, stripped of leading/trailing whitespace.
	    """
	    # A no-break space (HTML "&nbsp;") separates words. Turn it into a normal
	    # space *before* the invisible-character strip so that words are never
	    # glued together, whether or not that strip pattern includes U+00A0.
	    text = raw.replace("\u00a0", " ")

	    # Remove invisible spacers
	    text = _INVISIBLE_CHARS.sub("", text)

	    # Normalize unicode (compatibility forms -> canonical equivalents)
	    text = unicodedata.normalize("NFKC", text)

	    # Collapse horizontal whitespace; keep newlines but trim spaces around them
	    text = re.sub(r"[ \t]+", " ", text)
	    text = re.sub(r" \n", "\n", text)
	    text = re.sub(r"\n ", "\n", text)

	    # Remove lines that are purely quote markers ("> some text")
	    kept = [ln for ln in text.split("\n") if not ln.strip().startswith(">")]
	    text = "\n".join(kept)

	    # Collapse blank-line runs only now: dropping quoted lines above can itself
	    # leave several consecutive blank lines behind.
	    text = _MULTI_BLANK.sub("\n\n", text)

	    # Cut off inline quoted replies. Patterns are tried in list order (not by
	    # position in the text): the first pattern whose first match starts after
	    # character 20 decides the cut. A match at or before character 20 is
	    # ignored so a message that *opens* with a marker is not emptied.
	    for pattern in _COMPILED_QUOTE_MARKERS:
	        match = pattern.search(text)
	        if match and match.start() > 20:
	            text = text[: match.start()].strip()
	            break

	    return text.strip()


	def clean_html(html_body: str) -> str:
	    """
	    Run the whole cleaning pipeline on one HTML message body.

	    The body is parsed with the built-in ``html.parser``, quoted-reply sections
	    and boilerplate are pruned from the tree, the remaining text is extracted
	    and finally normalised.

	    Args:
	        html_body: Raw HTML string from CONVERSATION_THREADS.BODY

	    Returns:
	        The cleaned plain-text string; ``""`` when the input is empty or
	        consists only of whitespace.
	    """
	    has_content = bool(html_body) and bool(html_body.strip())
	    if not has_content:
	        return ""

	    document = BeautifulSoup(html_body, "html.parser")

	    # Both passes mutate the tree in place; quoted sections go first.
	    for prune in (_remove_quoted_sections, _remove_boilerplate):
	        prune(document)

	    return _clean_text(_extract_text(document))


	def clean_html_series(series):
	    """
	    Clean every element of a pandas Series with ``clean_html``.

	    Missing values are replaced by ``""`` first, then ``clean_html`` is applied
	    element by element (a per-row Python call, not a true vectorised
	    operation).

	    Args:
	        series: pd.Series of HTML strings

	    Returns:
	        pd.Series of cleaned plain text strings
	    """
	    without_missing = series.fillna("")
	    return without_missing.apply(clean_html)