# Sentiment_analysis/visualization/utils/helpscout_pdf.py
# Uploaded by Danialebrat
# Last commit: "Adding members sections" (5f1963f)
"""
HelpScout PDF exporters.

Two exporter classes, each of which builds its report with a MusoraPDF
instance (from pdf_exporter.py):
- HelpScoutDashboardPDF : full HelpScout dashboard report
- HelpScoutAnalysisPDF  : filtered analysis report + optional LLM summary
"""
import logging
import os
import sys
import tempfile
from datetime import datetime
from pathlib import Path
import plotly.io as pio
# Put the directory two levels above this file on sys.path (once) so the
# absolute ``utils.*`` / ``visualizations.*`` imports below can resolve
# regardless of the current working directory.
_parent = Path(__file__).resolve().parent.parent
if str(_parent) not in sys.path:
    sys.path.insert(0, str(_parent))
from utils.pdf_exporter import MusoraPDF # PDF document class driven by both exporters
from utils.helpscout_utils import boolean_flag_counts, topic_label, load_topic_taxonomy
from visualizations.helpscout_charts import HelpScoutCharts
# Module-level logger; used to record chart rendering failures.
logger = logging.getLogger(__name__)
# Passed as ``scale=`` to ``pio.to_image`` when rasterising charts to PNG.
_RENDER_SCALE = 3
# ---------------------------------------------------------------------------
# Shared rendering helpers (mixin-style functions)
# ---------------------------------------------------------------------------
def _prepare_fig(fig, is_side_by_side=False):
    """
    Restyle *fig* in place for embedding in a PDF page.

    Applies a white background, black text and fixed margins, then turns on
    automatic axis margins. Side-by-side charts get a slightly smaller base
    font and tighter left/bottom margins than full-width charts.

    Args:
        fig: Figure object exposing ``update_layout``, ``update_xaxes`` and
            ``update_yaxes``.
        is_side_by_side: True when the chart is rendered next to another one.
    """
    if is_side_by_side:
        font_size = 13
        margins = dict(l=60, r=40, t=60, b=60)
    else:
        font_size = 14
        margins = dict(l=80, r=40, t=60, b=80)

    fig.update_layout(
        paper_bgcolor="white",
        plot_bgcolor="white",
        font={"color": "black", "size": font_size},
        # Title is always four points larger than the body font.
        title_font_size=font_size + 4,
        margin=margins,
    )
    fig.update_xaxes(automargin=True)
    fig.update_yaxes(automargin=True)
def _fig_to_tmp(fig, width=800, height=400, is_side_by_side=False) -> str:
    """
    Render *fig* to a PNG stored in a temporary file and return its path.

    The figure is first restyled by ``_prepare_fig`` (this mutates *fig*),
    then rasterised with ``pio.to_image`` at ``_RENDER_SCALE``. The file is
    created with ``delete=False``, so the caller owns it and must remove it
    (see ``_cleanup``).

    Args:
        fig: Figure to render.
        width: Image width passed to ``pio.to_image``.
        height: Image height passed to ``pio.to_image``.
        is_side_by_side: Forwarded to ``_prepare_fig``.

    Returns:
        Path of the PNG file that was written.

    Raises:
        Whatever ``pio.to_image`` or the file write raises. If the write
        fails, the partially written temp file is removed before re-raising.
    """
    _prepare_fig(fig, is_side_by_side)
    # Render before creating the file so a rendering failure leaves nothing on disk.
    img = pio.to_image(fig, format="png", width=width, height=height,
                       scale=_RENDER_SCALE, engine="kaleido")
    tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    try:
        # ``with`` guarantees the handle is closed even if write() raises.
        with tmp:
            tmp.write(img)
    except BaseException:
        # The caller never learns the path when we raise, so it could not
        # clean the file up itself; remove it here and propagate the error.
        try:
            os.unlink(tmp.name)
        except OSError:
            pass
        raise
    return tmp.name
def _cleanup(paths):
    """
    Best-effort removal of the files listed in *paths*.

    Each path is unlinked in turn; an ``OSError`` for one path (for example
    because the file no longer exists) is ignored so the remaining paths are
    still processed.
    """
    for tmp_path in paths:
        try:
            os.unlink(tmp_path)
        except OSError:
            # Deliberately ignored: cleanup must never fail report generation.
            continue
# ---------------------------------------------------------------------------
# HelpScoutDashboardPDF
# ---------------------------------------------------------------------------
class HelpScoutDashboardPDF:
    """
    Generates a comprehensive HelpScout dashboard PDF report.

    The report is assembled section by section on a fresh ``MusoraPDF``
    document; charts come from ``HelpScoutCharts`` and are rasterised to
    temporary PNG files that are removed once the PDF bytes are produced.
    """

    def __init__(self):
        self.charts = HelpScoutCharts()
        self.taxonomy = load_topic_taxonomy()
        # Paths of temporary chart images created during the current report.
        self._tmp: list = []

    def generate_report(self, df, filter_info: dict = None) -> bytes:
        """
        Build and return the full dashboard PDF.

        Args:
            df: HelpScout conversations DataFrame.
            filter_info: Optional dict describing applied filters (accepted
                for interface parity; the dashboard sections shown here do
                not render it).

        Returns:
            The PDF document as bytes.
        """
        self.pdf = MusoraPDF()
        self._tmp = []
        try:
            self._cover(df, filter_info)
            self._executive_summary(df)
            self._sentiment_section(df)
            self._topic_section(df)
            self._emotion_section(df)
            self._flags_section(df)
            self._status_source_section(df)
            self._timelines_section(df)
            self._depth_section(df)
            self._member_section(df)
            self._data_summary(df, filter_info)
            return bytes(self.pdf.output())
        finally:
            # Temp PNGs are only needed until output() has embedded them.
            _cleanup(self._tmp)

    # ── Rendering helpers ──

    def _add_chart(self, fig, width=180, img_w=800, img_h=400):
        """
        Render *fig* and place it full-width on the current page.

        Any failure is logged and replaced by a placeholder line so one bad
        chart does not abort the whole report.
        """
        try:
            p = _fig_to_tmp(fig, img_w, img_h)
            self._tmp.append(p)
            # Height on the page follows the rendered aspect ratio.
            h_mm = width * (img_h / img_w)
            self.pdf.check_page_break(h_mm + 5)
            self.pdf.image(p, x=10, w=width)
            self.pdf.ln(3)
        except Exception:
            logger.exception("Chart render failed")
            self.pdf.body_text("[Chart could not be rendered]")

    def _add_two_charts(self, fig1, fig2, width=92):
        """
        Render two figures and place them side by side at the same y position.

        Any failure is logged and replaced by a single placeholder line.
        """
        try:
            p1 = _fig_to_tmp(fig1, 700, 450, is_side_by_side=True)
            self._tmp.append(p1)
            p2 = _fig_to_tmp(fig2, 700, 450, is_side_by_side=True)
            self._tmp.append(p2)
            h_mm = width * (450 / 700)
            self.pdf.check_page_break(h_mm + 5)
            y = self.pdf.get_y()
            self.pdf.image(p1, x=10, y=y, w=width)
            # Second image starts after the first plus a 4-unit gutter.
            self.pdf.image(p2, x=10 + width + 4, y=y, w=width)
            # Images were placed at an explicit y, so advance the cursor manually.
            self.pdf.set_y(y + h_mm + 3)
        except Exception:
            logger.exception("Side-by-side render failed")
            self.pdf.body_text("[Charts could not be rendered]")

    # ── Metric helpers ──

    @staticmethod
    def _polarity_pct(df, labels) -> float:
        """
        Return the percentage of rows whose ``sentiment_polarity`` is in *labels*.

        Returns 0.0 for an empty frame or when the column is absent, matching
        the tolerance ``_avg_score`` already has for a missing column.
        """
        total = len(df)
        if not total or "sentiment_polarity" not in df.columns:
            return 0.0
        return float(df["sentiment_polarity"].isin(list(labels)).sum()) / total * 100

    @staticmethod
    def _mean_or_zero(df, column) -> float:
        """
        Return the mean of ``df[column]`` ignoring missing values.

        Returns 0.0 when the column is absent or holds no non-null values, so
        the report never prints "nan" as a metric.
        """
        if column not in df.columns:
            return 0.0
        values = df[column].dropna()
        if values.empty:
            return 0.0
        return float(values.mean())

    # ── Sections ──

    def _cover(self, df, filter_info):
        """Add the cover page: title, generation time, totals and data period."""
        self.pdf.add_page()
        self.pdf.ln(40)
        r, g, b = MusoraPDF.PRIMARY
        self.pdf.set_fill_color(r, g, b)
        self.pdf.rect(0, 60, 210, 4, style="F")
        self.pdf.ln(20)
        self.pdf.set_font("Helvetica", "B", 28)
        self.pdf.set_text_color(r, g, b)
        self.pdf.cell(0, 15, "Musora", align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.set_font("Helvetica", "", 16)
        self.pdf.set_text_color(80, 80, 80)
        self.pdf.cell(0, 10, "HelpScout Support Dashboard Report",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(10)
        self.pdf.set_font("Helvetica", "", 12)
        self.pdf.set_text_color(100, 100, 100)
        self.pdf.cell(0, 8, f"Generated: {datetime.now().strftime('%B %d, %Y at %H:%M')}",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(5)
        self.pdf.set_font("Helvetica", "", 10)
        self.pdf.cell(0, 7, f"Total Conversations: {len(df):,}",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        if "first_message_at" in df.columns and not df.empty:
            # NOTE(review): assumes first_message_at holds datetime-like
            # values (strftime is called on min/max) - confirm upstream.
            valid = df["first_message_at"].dropna()
            if not valid.empty:
                dr = f"{valid.min().strftime('%b %d, %Y')} to {valid.max().strftime('%b %d, %Y')}"
                self.pdf.ln(3)
                self.pdf.set_font("Helvetica", "I", 9)
                self.pdf.set_text_color(120, 120, 120)
                self.pdf.cell(0, 6, MusoraPDF._sanitize(f"Data period: {dr}"),
                              align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(20)
        self.pdf.set_font("Helvetica", "I", 8)
        self.pdf.set_text_color(150, 150, 150)
        self.pdf.cell(0, 6, "Confidential - For Internal Use Only",
                      align="C", new_x="LMARGIN", new_y="NEXT")

    def _executive_summary(self, df):
        """Add the headline metrics page (sentiment split, duration, flags)."""
        self.pdf.add_page()
        self.pdf.section_header("Executive Summary")
        total = len(df)
        flags = boolean_flag_counts(df)
        neg_pct = self._polarity_pct(df, ["negative", "very_negative"])
        pos_pct = self._polarity_pct(df, ["positive", "very_positive"])
        esc = int(df["is_escalation"].sum()) if "is_escalation" in df.columns else 0
        avg_dur = self._mean_or_zero(df, "duration_hours")
        self.pdf.metric_row([
            ("Total Conversations", f"{total:,}"),
            ("Positive %", f"{pos_pct:.1f}%"),
            ("Negative %", f"{neg_pct:.1f}%"),
            ("Avg Duration (h)", f"{avg_dur:.1f}"),
        ])
        self.pdf.metric_row([
            ("Escalations", f"{esc:,}"),
            ("Refund Requests", f"{flags['is_refund_request']:,}"),
            ("Cancellations", f"{flags['is_cancellation']:,}"),
            ("Membership Joins", f"{flags['is_membership']:,}"),
        ])

    def _sentiment_section(self, df):
        """Add the sentiment page: distribution pie next to the score gauge."""
        self.pdf.add_page()
        self.pdf.section_header("Sentiment Distribution")
        pie = self.charts.create_sentiment_pie_chart(df, title="Sentiment Distribution")
        gauge = self.charts.create_sentiment_score_gauge(self._avg_score(df))
        self._add_two_charts(pie, gauge)

    def _topic_section(self, df):
        """Add the topic page: bar + pie side by side, then the heatmap."""
        self.pdf.add_page()
        self.pdf.section_header("Topic Analysis")
        bar = self.charts.create_topic_bar_chart(df, title="Conversations by Topic")
        pie = self.charts.create_topic_pie_chart(df, title="Topic Share")
        self._add_two_charts(bar, pie)
        self._add_chart(self.charts.create_topic_sentiment_heatmap(df), img_h=500)

    def _emotion_section(self, df):
        """Add the emotion page; skipped when there is no emotion data."""
        if "emotions" not in df.columns or df["emotions"].dropna().empty:
            return
        self.pdf.add_page()
        self.pdf.section_header("Emotion Analysis")
        self._add_chart(self.charts.create_emotion_bar_chart(df, title="Emotion Distribution"))

    def _flags_section(self, df):
        """Add the billing/membership flags page."""
        self.pdf.add_page()
        self.pdf.section_header("Billing & Membership Flags")
        flags_chart = self.charts.create_boolean_flags_chart(df)
        esc_chart = self.charts.create_escalation_breakdown(df)
        self._add_two_charts(flags_chart, esc_chart)

    def _status_source_section(self, df):
        """Add the status and source distribution page."""
        self.pdf.add_page()
        self.pdf.section_header("Status & Source Distribution")
        status_chart = self.charts.create_status_distribution(df)
        source_chart = self.charts.create_source_distribution(df)
        self._add_two_charts(status_chart, source_chart)

    def _timelines_section(self, df):
        """Add the weekly volume, sentiment and refund/cancel timelines."""
        self.pdf.add_page()
        self.pdf.section_header("Volume & Trends (Weekly)")
        self._add_chart(self.charts.create_volume_timeline(df, freq="W"))
        self._add_chart(self.charts.create_sentiment_timeline(df, freq="W"))
        self._add_chart(self.charts.create_refund_cancel_timeline(df, freq="W"))

    def _depth_section(self, df):
        """Add the conversation depth page (duration and thread-count histograms)."""
        self.pdf.add_page()
        self.pdf.section_header("Conversation Depth")
        dur = self.charts.create_duration_histogram(df)
        thd = self.charts.create_thread_count_histogram(df)
        self._add_two_charts(dur, thd)

    def _member_section(self, df):
        """Add the member vs non-member page; skipped without an ``is_member`` column."""
        if "is_member" not in df.columns:
            return
        self.pdf.add_page()
        self.pdf.section_header("Member vs Non-Member Analysis")
        total = len(df)
        member_count = int(df["is_member"].sum())
        non_member_count = total - member_count
        match_pct = member_count / total * 100 if total else 0
        self.pdf.metric_row([
            ("Members", f"{member_count:,}"),
            ("Non-Members", f"{non_member_count:,}"),
            ("Email Match Rate", f"{match_pct:.1f}%"),
        ])
        self.pdf.body_text(
            "Members are customers whose email was matched against Musora user records. "
            "Non-Members contacted support without an associated Musora account."
        )
        self._add_two_charts(
            self.charts.create_member_status_chart(df, title="Member vs Non-Member"),
            self.charts.create_member_sentiment_chart(df, title="Sentiment by Member Status"),
        )
        self._add_chart(
            self.charts.create_member_topic_chart(df, title="Top Topics by Member Status"),
            img_h=500,
        )

    def _data_summary(self, df, filter_info):
        """Add the closing page with generation time, row count and data source."""
        self.pdf.add_page()
        self.pdf.section_header("Data Summary")
        self.pdf.body_text(f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        self.pdf.body_text(f"Total conversations: {len(df):,}")
        self.pdf.callout_box(
            "Data source: SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES\n"
            "This report is confidential and intended for internal Musora team use only.",
            bg_color=(245, 245, 245),
        )

    @staticmethod
    def _avg_score(df) -> float:
        """
        Return the mean sentiment score on a -2..2 scale.

        Polarity labels are mapped to integers; labels outside the map (and
        missing values) count as 0. Returns 0.0 for an empty frame or when
        the ``sentiment_polarity`` column is absent.
        """
        score_map = {"very_positive": 2, "positive": 1, "neutral": 0,
                     "negative": -1, "very_negative": -2}
        if "sentiment_polarity" not in df.columns or df.empty:
            return 0.0
        return float(df["sentiment_polarity"].map(score_map).fillna(0).mean())
# ---------------------------------------------------------------------------
# HelpScoutAnalysisPDF
# ---------------------------------------------------------------------------
class HelpScoutAnalysisPDF:
    """
    Generates a focused analysis PDF from the HelpScout Analysis page.
    Includes filter summary, distributions, and optionally the LLM summary report.

    The report is assembled on a fresh ``MusoraPDF`` document; charts come
    from ``HelpScoutCharts`` and are rasterised to temporary PNG files that
    are removed once the PDF bytes are produced.
    """

    def __init__(self):
        self.charts = HelpScoutCharts()
        self.taxonomy = load_topic_taxonomy()
        # Paths of temporary chart images created during the current report.
        self._tmp: list = []

    def generate_report(self, df, filter_info: dict = None,
                        summary_result: dict = None) -> bytes:
        """
        Build and return the analysis PDF.

        Args:
            df: Filtered HelpScout analysis DataFrame.
            filter_info: Dict of filter descriptions for the cover.
            summary_result: Output from HelpScoutSummaryAgent.process() or None.

        Returns:
            The PDF document as bytes.
        """
        self.pdf = MusoraPDF()
        self._tmp = []
        try:
            self._cover(df, filter_info)
            self._filter_summary_section(filter_info, df)
            self._kpi_section(df)
            self._distributions_section(df)
            self._summary_section(summary_result)
            self._data_summary(df, filter_info)
            return bytes(self.pdf.output())
        finally:
            # Temp PNGs are only needed until output() has embedded them.
            _cleanup(self._tmp)

    # ── Rendering helpers ──

    def _add_chart(self, fig, width=180, img_w=800, img_h=400):
        """
        Render *fig* and place it full-width on the current page.

        Any failure is logged and replaced by a placeholder line so one bad
        chart does not abort the whole report.
        """
        try:
            p = _fig_to_tmp(fig, img_w, img_h)
            self._tmp.append(p)
            # Height on the page follows the rendered aspect ratio.
            h_mm = width * (img_h / img_w)
            self.pdf.check_page_break(h_mm + 5)
            self.pdf.image(p, x=10, w=width)
            self.pdf.ln(3)
        except Exception:
            logger.exception("Chart render failed")
            self.pdf.body_text("[Chart could not be rendered]")

    def _add_two_charts(self, fig1, fig2, width=92):
        """
        Render two figures and place them side by side at the same y position.

        Any failure is logged and replaced by a single placeholder line.
        """
        try:
            p1 = _fig_to_tmp(fig1, 700, 450, is_side_by_side=True)
            self._tmp.append(p1)
            p2 = _fig_to_tmp(fig2, 700, 450, is_side_by_side=True)
            self._tmp.append(p2)
            h_mm = width * (450 / 700)
            self.pdf.check_page_break(h_mm + 5)
            y = self.pdf.get_y()
            self.pdf.image(p1, x=10, y=y, w=width)
            # Second image starts after the first plus a 4-unit gutter.
            self.pdf.image(p2, x=10 + width + 4, y=y, w=width)
            # Images were placed at an explicit y, so advance the cursor manually.
            self.pdf.set_y(y + h_mm + 3)
        except Exception:
            logger.exception("Side-by-side render failed")
            self.pdf.body_text("[Charts could not be rendered]")

    # ── Formatting / metric helpers ──

    @staticmethod
    def _polarity_pct(df, labels) -> float:
        """
        Return the percentage of rows whose ``sentiment_polarity`` is in *labels*.

        Returns 0.0 for an empty frame or when the column is absent.
        """
        total = len(df)
        if not total or "sentiment_polarity" not in df.columns:
            return 0.0
        return float(df["sentiment_polarity"].isin(list(labels)).sum()) / total * 100

    @staticmethod
    def _mean_or_zero(df, column) -> float:
        """
        Return the mean of ``df[column]`` ignoring missing values.

        Returns 0.0 when the column is absent or holds no non-null values, so
        the report never prints "nan" as a metric.
        """
        if column not in df.columns:
            return 0.0
        values = df[column].dropna()
        if values.empty:
            return 0.0
        return float(values.mean())

    @staticmethod
    def _format_theme(theme: dict) -> str:
        """
        Build the one-line text for a theme entry.

        Produces ``"<theme> (<prevalence>): <description>"``; the parenthesised
        part is omitted when no prevalence is given. Missing or ``None``
        fields render as empty strings.
        """
        def _text(value) -> str:
            return "" if value is None else str(value)

        name = _text(theme.get("theme"))
        prevalence = _text(theme.get("prevalence"))
        description = _text(theme.get("description"))
        label = f"{name} ({prevalence})" if prevalence else name
        return f"{label}: {description}"

    @staticmethod
    def _format_count(value) -> str:
        """Format *value* with thousands separators, or ``"N/A"`` if it is not numeric."""
        try:
            return f"{value:,}"
        except (TypeError, ValueError):
            # None, strings, etc. do not support the ',' format spec.
            return "N/A"

    # ── Sections ──

    def _cover(self, df, filter_info):
        """Add the cover page: title, generation time, match count and filters."""
        self.pdf.add_page()
        self.pdf.ln(40)
        r, g, b = MusoraPDF.PRIMARY
        self.pdf.set_fill_color(r, g, b)
        self.pdf.rect(0, 60, 210, 4, style="F")
        self.pdf.ln(20)
        self.pdf.set_font("Helvetica", "B", 28)
        self.pdf.set_text_color(r, g, b)
        self.pdf.cell(0, 15, "Musora", align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.set_font("Helvetica", "", 16)
        self.pdf.set_text_color(80, 80, 80)
        self.pdf.cell(0, 10, "HelpScout Analysis Report",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(10)
        self.pdf.set_font("Helvetica", "", 12)
        self.pdf.set_text_color(100, 100, 100)
        self.pdf.cell(0, 8, f"Generated: {datetime.now().strftime('%B %d, %Y at %H:%M')}",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(5)
        self.pdf.set_font("Helvetica", "", 10)
        self.pdf.cell(0, 7, f"Matched Conversations: {len(df):,}",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        if filter_info:
            self.pdf.ln(8)
            self.pdf.set_font("Helvetica", "B", 9)
            self.pdf.set_text_color(80, 80, 80)
            self.pdf.cell(0, 6, "Applied Filters:", align="C", new_x="LMARGIN", new_y="NEXT")
            self.pdf.set_font("Helvetica", "", 9)
            for k, v in filter_info.items():
                # Filters with empty/falsy values are treated as "not applied".
                if v:
                    self.pdf.cell(0, 5, MusoraPDF._sanitize(f"{k}: {v}"),
                                  align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(20)
        self.pdf.set_font("Helvetica", "I", 8)
        self.pdf.set_text_color(150, 150, 150)
        self.pdf.cell(0, 6, "Confidential - For Internal Use Only",
                      align="C", new_x="LMARGIN", new_y="NEXT")

    def _filter_summary_section(self, filter_info, df):
        """
        Add the filter summary page.

        Renders a two-column table of the active (truthy) filters. When there
        are none - ``filter_info`` is empty/None or every value is falsy - a
        "no filters" note is shown instead of leaving the page blank.
        """
        self.pdf.add_page()
        self.pdf.section_header("Filter Set Summary")
        rows = [
            (MusoraPDF._sanitize(str(k)), MusoraPDF._sanitize(str(v)))
            for k, v in (filter_info or {}).items()
            if v
        ]
        if rows:
            self.pdf.add_table(["Filter", "Value"], rows, col_widths=[80, 110])
        else:
            # Plain ASCII hyphen (as on the cover page) instead of an em dash,
            # which falls outside Latin-1 and the built-in Helvetica font.
            self.pdf.body_text("No filters applied - report covers all available conversations.")

    def _kpi_section(self, df):
        """Add the key-metrics rows (continues on the current page)."""
        total = len(df)
        flags = boolean_flag_counts(df)
        neg_pct = self._polarity_pct(df, ["negative", "very_negative"])
        pos_pct = self._polarity_pct(df, ["positive", "very_positive"])
        avg_dur = self._mean_or_zero(df, "duration_hours")
        esc = int(df["is_escalation"].sum()) if "is_escalation" in df.columns else 0
        self.pdf.section_header("Key Metrics")
        self.pdf.metric_row([
            ("Conversations", f"{total:,}"),
            ("Positive %", f"{pos_pct:.1f}%"),
            ("Negative %", f"{neg_pct:.1f}%"),
            ("Avg Duration (h)", f"{avg_dur:.1f}"),
        ])
        self.pdf.metric_row([
            ("Escalations", f"{esc:,}"),
            ("Refund Requests", f"{flags['is_refund_request']:,}"),
            ("Cancellations", f"{flags['is_cancellation']:,}"),
            ("Membership Joins", f"{flags['is_membership']:,}"),
        ])
        if "is_member" in df.columns:
            member_count = int(df["is_member"].sum())
            non_member_count = total - member_count
            self.pdf.metric_row([
                ("Members", f"{member_count:,}"),
                ("Non-Members", f"{non_member_count:,}"),
                ("Email Match Rate", f"{member_count / total * 100:.1f}%" if total else "N/A"),
            ])

    def _distributions_section(self, df):
        """Add the distribution charts, plus a member breakdown page when available."""
        self.pdf.add_page()
        self.pdf.section_header("Distributions")
        pie = self.charts.create_sentiment_pie_chart(df, title="Sentiment Distribution")
        tbar = self.charts.create_topic_bar_chart(df, title="Topic Distribution")
        self._add_two_charts(pie, tbar)
        self._add_chart(self.charts.create_topic_sentiment_heatmap(df), img_h=500)
        if "is_member" in df.columns:
            self.pdf.add_page()
            self.pdf.section_header("Member vs Non-Member Breakdown")
            self._add_two_charts(
                self.charts.create_member_status_chart(df, title="Member vs Non-Member"),
                self.charts.create_member_sentiment_chart(df, title="Sentiment by Member Status"),
            )
            self._add_chart(
                self.charts.create_member_topic_chart(df, title="Top Topics by Member Status"),
                img_h=500,
            )

    def _summary_section(self, result: dict):
        """
        Add the AI summary page.

        When *result* is None or not flagged successful, a callout explains
        how to generate the summary. Otherwise the executive summary, themes,
        complaints, insights and quotes are rendered (each only if present),
        followed by a metadata callout.
        """
        self.pdf.add_page()
        self.pdf.section_header("AI Summary Report")
        if result is None or not result.get("success"):
            self.pdf.callout_box(
                "AI summary not generated. To include it, click 'Generate Summary Report' "
                "in the app before exporting the PDF.",
                bg_color=(255, 250, 230),
            )
            return
        # ``or {}`` / ``or []`` also covers keys that are present but None.
        summary = result.get("summary") or {}
        meta = result.get("metadata") or {}
        exec_summary = MusoraPDF._sanitize(summary.get("executive_summary") or "")
        if exec_summary:
            self.pdf.subsection_header("Executive Summary")
            self.pdf.section_description(exec_summary)
        themes = summary.get("top_themes") or []
        if themes:
            self.pdf.subsection_header("Top Themes")
            for t in themes:
                theme_text = MusoraPDF._sanitize(self._format_theme(t))
                self.pdf.body_text(f" * {theme_text}")
        complaints = summary.get("top_complaints") or []
        if complaints:
            self.pdf.subsection_header("Top Complaints")
            for c in complaints:
                self.pdf.body_text(f" * {MusoraPDF._sanitize(c)}")
        insights = summary.get("unexpected_insights") or []
        if insights:
            self.pdf.subsection_header("Unexpected Insights")
            for ins in insights:
                self.pdf.body_text(f" * {MusoraPDF._sanitize(ins)}")
        quotes = summary.get("notable_quotes") or []
        if quotes:
            self.pdf.subsection_header("Notable Quotes")
            for q in quotes:
                self.pdf.body_text(f' "{MusoraPDF._sanitize(q)}"')
        self.pdf.ln(4)
        self.pdf.callout_box(
            f"Analysis based on {meta.get('total_conversations_analyzed', 0)} conversations "
            f"| Model: {meta.get('model_used', 'N/A')} "
            f"| Tokens: {self._format_count(meta.get('tokens_used', 0))}",
            bg_color=(240, 248, 255),
        )

    def _data_summary(self, df, filter_info):
        """Add the closing page with generation time, row count and data source."""
        self.pdf.add_page()
        self.pdf.section_header("Data Summary")
        self.pdf.body_text(f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        self.pdf.body_text(f"Total conversations in report: {len(df):,}")
        self.pdf.callout_box(
            "Data source: SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES\n"
            "This report is confidential and intended for internal Musora team use only.",
            bg_color=(245, 245, 245),
        )