File size: 5,499 Bytes
58db664
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""
HTML Cleaner for HelpScout message bodies.

Strategy:
  1. Remove blockquotes (quoted previous email threads).
  2. Remove Gmail/Outlook quoted-reply wrappers (ex-gmail_extra, gmail_quote, etc.).
  3. Remove HelpScout / marketing email boilerplate sections.
  4. Extract plain text from the remaining DOM.
  5. Strip invisible Unicode spacers (\\u200c, \\u00ad, etc.) and collapse whitespace.
"""

import re
import unicodedata
from bs4 import BeautifulSoup, Comment

# CSS class / id fragments that indicate quoted / boilerplate content.
# Entries must be lowercase: they are substring-matched against the
# *lowercased* "class" and "id" attribute values of each tag, so an entry
# containing an uppercase letter could never match anything.
_QUOTED_CLASS_PATTERNS = [
    "gmail_extra",
    "gmail_quote",
    "ex-gmail",
    "yahoo_quoted",
    "moz-cite-prefix",
    "outlookmessageheader",
    "protonmail_quote",
    "apple-mail-previous",
]

# Markers that indicate the start of a quoted section (text-based heuristics).
# The cleaned text is truncated at a match, so every pattern must be specific
# enough not to fire on ordinary prose.
_QUOTE_TEXT_MARKERS = [
    r"On .{5,80} wrote:",          # "On Mar 2, 2026 ... wrote:"
    r"From:\s",
    r"Sent:\s",
    r"To:\s.*\nCc:",
    # "> quoted" lines (plain text fallback). Anchored to the start of a line:
    # an unanchored ">" also matches mid-sentence uses such as
    # "Settings > Account" or "a > b" and would cut the message there.
    r"^[ \t]*>+",
]

# MULTILINE lets "^" match at the start of every line, not only at the start
# of the whole text; it has no effect on the patterns without an anchor.
_COMPILED_QUOTE_MARKERS = [
    re.compile(p, re.IGNORECASE | re.MULTILINE) for p in _QUOTE_TEXT_MARKERS
]

# Tags whose entire sub-tree we drop unconditionally
_DROP_TAGS = {"script", "style", "head", "meta", "link", "img", "table"}

# Invisible / zero-width spacer characters that are deleted outright:
# soft hyphen (U+00AD), zero-width space / non-joiner / joiner
# (U+200B-U+200D), word joiner (U+2060), BOM (U+FEFF) and combining grapheme
# joiner (U+034F).
# U+00A0 (no-break space) is deliberately NOT in this set: it is a real word
# separator, and deleting it glues words together ("hello<NBSP>world" would
# become "helloworld"). The NFKC normalisation applied during text cleanup
# turns it into an ordinary space instead.
_INVISIBLE_CHARS = re.compile(
    r"[\u00ad\u200b\u200c\u200d\u2060\ufeff\u034f]"
)

# Collapse multiple blank lines to one
_MULTI_BLANK = re.compile(r"\n{3,}")


def _remove_quoted_sections(soup: BeautifulSoup) -> None:
    """Remove DOM nodes that represent quoted/threaded email history.

    The tree is modified in place, in three steps:

    1. every ``<blockquote>`` is decomposed;
    2. every tag whose ``class`` or ``id`` attribute contains one of
       ``_QUOTED_CLASS_PATTERNS`` (case-insensitive substring match) is
       decomposed;
    3. every HTML comment node is extracted.

    Args:
        soup: Parsed message body; mutated in place.

    Returns:
        None.
    """

    # 1. All <blockquote> tags
    for tag in soup.find_all("blockquote"):
        tag.decompose()

    # 2. Divs / spans with known quoted-reply class names.
    # The attribute text is lowercased below, so the patterns are lowercased
    # as well; without this a pattern containing an uppercase letter could
    # never match.
    needles = [pattern.lower() for pattern in _QUOTED_CLASS_PATTERNS]

    # find_all() returns a snapshot list, so it can still contain descendants
    # of a tag that was decomposed earlier in this very loop.
    for tag in soup.find_all(True):
        # `decomposed` is the public flag for "decompose() was called" on
        # newer bs4 releases; the attrs-is-None probe is kept for releases
        # that do not provide it.
        if getattr(tag, "decomposed", False) or tag.attrs is None:
            continue
        css_classes = " ".join(tag.get("class") or []).lower()
        tag_id = (tag.get("id") or "").lower()
        combined = css_classes + " " + tag_id
        if any(needle in combined for needle in needles):
            tag.decompose()

    # 3. HTML comments (<!-- --> contain no user text)
    for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
        comment.extract()


def _remove_boilerplate(soup: BeautifulSoup) -> None:
    """Strip layout-only tags and short footer / unsubscribe elements in place.

    Two passes over the tree:

    * every tag named in ``_DROP_TAGS`` is decomposed together with its
      sub-tree;
    * every remaining tag whose flattened, lowercased text is shorter than
      300 characters and contains one of the footer keywords is decomposed.

    NOTE(review): the second pass walks the list returned by
    ``find_all(True)`` front to back and tests each tag's *whole* text, so a
    short container (for example the wrapper of a brief message that merely
    mentions "unsubscribe") is removed together with everything inside it.

    Args:
        soup: Parsed message body; mutated in place.
    """
    footer_keywords = ("unsubscribe", "musora media", "31265 wheel", "customeriomail")

    # Pass 1: heavy layout tags (tables, images, ...) are dropped wholesale.
    for layout_node in soup.find_all(_DROP_TAGS):
        layout_node.decompose()

    # Pass 2: short elements that mention a footer keyword.
    for node in soup.find_all(True):
        if node.attrs is None:
            # Already destroyed as part of a previously decomposed parent.
            continue
        flattened = node.get_text(separator=" ", strip=True).lower()
        if len(flattened) >= 300:
            # Too long to be a bare footer line; keep it.
            continue
        for keyword in footer_keywords:
            if keyword in flattened:
                node.decompose()
                break


def _extract_text(soup: BeautifulSoup) -> str:
    """Get plain text from the cleaned soup, preserving line breaks.

    Walks every descendant of ``soup`` in document order. Each non-empty text
    node contributes its stripped text; each ``br``/``p``/``div``/``li``/
    ``h1``-``h3`` tag contributes a ``"\\n"`` marker at the point where the
    tag starts. All pieces are joined with single spaces, so the newline
    markers end up surrounded by spaces (the later cleanup step trims them).

    Non-content string nodes -- doctype, declarations, processing
    instructions and comments -- are skipped so that markup such as
    ``<!DOCTYPE html ...>`` does not leak into the message text. CDATA
    sections are kept because they carry document text.

    Args:
        soup: Parsed (and already pruned) message body.

    Returns:
        The space-joined text pieces and newline markers.
    """
    # Local import: these node classes are only needed here.
    from bs4.element import CData, PreformattedString

    break_tags = {"br", "p", "div", "li", "h1", "h2", "h3"}
    pieces = []
    # `.descendants` is the supported spelling of the deprecated
    # recursiveChildGenerator() alias.
    for element in soup.descendants:
        if isinstance(element, str):
            # Doctype / Declaration / ProcessingInstruction / Comment all
            # derive from PreformattedString and are markup, not message text.
            if isinstance(element, PreformattedString) and not isinstance(element, CData):
                continue
            stripped = element.strip()
            if stripped:
                pieces.append(stripped)
        elif getattr(element, "name", None) in break_tags:
            pieces.append("\n")
    return " ".join(pieces)


def _clean_text(raw: str) -> str:
    """Final text cleanup: invisible chars, excessive whitespace, quote markers.

    Steps, in order:

    1. turn no-break spaces into ordinary spaces and normalise CR / CRLF line
       endings to ``"\\n"``;
    2. delete the characters matched by ``_INVISIBLE_CHARS``;
    3. apply NFKC Unicode normalisation;
    4. collapse runs of spaces/tabs, trim spaces around newlines and squeeze
       three or more newlines down to two;
    5. drop every line that starts with ``">"``;
    6. truncate the text at a quoted-reply marker (see below).

    Args:
        raw: Text as produced by the extraction step.

    Returns:
        The cleaned text, stripped of leading and trailing whitespace.
    """

    # A no-break space separates words; it must become a space *before* the
    # invisible-character pass, otherwise "hello<NBSP>world" could be glued
    # into "helloworld".
    text = raw.replace("\u00a0", " ")

    # Make every line ending a bare "\n" so the newline handling below sees
    # them (a stray "\r" would otherwise survive into the output).
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Remove invisible spacers
    text = _INVISIBLE_CHARS.sub("", text)

    # Normalize unicode (compatibility forms, exotic space characters, ...)
    text = unicodedata.normalize("NFKC", text)

    # Collapse whitespace sequences (keep single newlines intentional)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" \n", "\n", text)
    text = re.sub(r"\n ", "\n", text)
    text = _MULTI_BLANK.sub("\n\n", text)

    # Remove lines that start with a quote marker ("> some text")
    kept_lines = [ln for ln in text.split("\n") if not ln.strip().startswith(">")]
    text = "\n".join(kept_lines)

    # Cut off the quoted tail. Patterns are tried in list order and only the
    # first occurrence of each is considered; the first pattern whose match
    # starts after character 20 wins. A marker at the very start is ignored so
    # that a message consisting only of a marker-like header is not emptied.
    for pattern in _COMPILED_QUOTE_MARKERS:
        match = pattern.search(text)
        if match and match.start() > 20:
            text = text[: match.start()].strip()
            break

    return text.strip()


def clean_html(html_body: str) -> str:
    """
    Turn a raw HTML message body into clean plain text.

    The body is parsed with the ``html.parser`` backend, quoted reply history
    and boilerplate are pruned from the tree, the remaining text is extracted
    and finally normalised.

    Args:
        html_body: Raw HTML string from CONVERSATION_THREADS.BODY. A falsy or
            whitespace-only value short-circuits to an empty string.

    Returns:
        The cleaned plain-text string, or "" when there is nothing to clean.
    """
    # Nothing to parse: None, "", or only whitespace.
    if not (html_body and html_body.strip()):
        return ""

    document = BeautifulSoup(html_body, "html.parser")

    # Both helpers prune the tree in place; quoted history goes first.
    _remove_quoted_sections(document)
    _remove_boilerplate(document)

    return _clean_text(_extract_text(document))


def clean_html_series(series):
    """
    Run ``clean_html`` over every element of a pandas Series.

    Missing values are replaced with "" before cleaning, so they come out as
    empty strings. The work is done element by element through
    ``Series.apply``; it is a convenience wrapper, not a true vectorised
    implementation.

    Args:
        series: pd.Series of HTML strings

    Returns:
        pd.Series of cleaned plain text strings
    """
    without_missing = series.fillna("")
    return without_missing.apply(clean_html)