Spaces:

MusoraProductDepartment
/

Sentiment_analysis

Sleeping

App Files Files Community

Sentiment_analysis / visualization /utils /helpscout_pdf.py

Danialebrat

Adding members sections

5f1963f 26 days ago

raw

history blame contribute delete

21.7 kB

	"""
	HelpScout PDF Exporters.

	Two classes that each build their report on a MusoraPDF document from pdf_exporter.py:
	- HelpScoutDashboardPDF : full HelpScout dashboard report
	- HelpScoutAnalysisPDF : filtered analysis report + optional LLM summary
	"""
	import logging
	import os
	import sys
	import tempfile
	from datetime import datetime
	from pathlib import Path

	import plotly.io as pio

	_parent = Path(__file__).resolve().parent.parent
	if str(_parent) not in sys.path:
	sys.path.insert(0, str(_parent))

	from utils.pdf_exporter import MusoraPDF # reuse base class
	from utils.helpscout_utils import boolean_flag_counts, topic_label, load_topic_taxonomy
	from visualizations.helpscout_charts import HelpScoutCharts

	logger = logging.getLogger(__name__)

	_RENDER_SCALE = 3


	# ---------------------------------------------------------------------------
	# Shared rendering helpers (mixin-style functions)
	# ---------------------------------------------------------------------------

	def _prepare_fig(fig, is_side_by_side=False):
	    """Restyle ``fig`` in place for embedding in the PDF.

	    Applies white paper/plot backgrounds, black text, and a font-size and
	    margin preset. The side-by-side preset uses a smaller font and tighter
	    left/bottom margins than the full-width preset. Automatic margins are
	    then enabled on the x and y axes.

	    Args:
	        fig: Figure object exposing update_layout / update_xaxes /
	            update_yaxes (a Plotly figure in this module).
	        is_side_by_side: Select the compact preset when true.
	    """
	    if is_side_by_side:
	        font_size = 13
	        margins = dict(l=60, r=40, t=60, b=60)
	    else:
	        font_size = 14
	        margins = dict(l=80, r=40, t=60, b=80)

	    layout_options = {
	        "paper_bgcolor": "white",
	        "plot_bgcolor": "white",
	        "font": dict(color="black", size=font_size),
	        # Title is always four points larger than the body font.
	        "title_font_size": font_size + 4,
	        "margin": margins,
	    }
	    fig.update_layout(**layout_options)

	    fig.update_xaxes(automargin=True)
	    fig.update_yaxes(automargin=True)


	def _fig_to_tmp(fig, width=800, height=400, is_side_by_side=False) -> str:
	    """Render ``fig`` to a temporary PNG file and return the file's path.

	    The figure is first restyled in place by _prepare_fig(), then converted
	    to PNG bytes with plotly.io.to_image (kaleido engine) using the given
	    pixel ``width``/``height`` and ``scale=_RENDER_SCALE``.

	    The file is created with ``delete=False``, so it outlives this call;
	    the caller owns the returned path and must remove it.

	    Args:
	        fig: Figure to render; it is mutated by the restyling step.
	        width: Image width passed to the renderer.
	        height: Image height passed to the renderer.
	        is_side_by_side: Forwarded to _prepare_fig() to pick the preset.

	    Returns:
	        Path of the written ``.png`` file.

	    Raises:
	        Any exception raised by the renderer or by writing the file. When
	        the write fails, the partially written file is removed first.
	    """
	    _prepare_fig(fig, is_side_by_side)
	    img = pio.to_image(
	        fig,
	        format="png",
	        width=width,
	        height=height,
	        scale=_RENDER_SCALE,
	        engine="kaleido",
	    )
	    tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
	    try:
	        # The context manager closes the handle even if write() raises.
	        with tmp:
	            tmp.write(img)
	    except BaseException:
	        # The path has not been handed to anyone yet, so nobody else could
	        # clean it up: remove the partial file before propagating.
	        try:
	            os.unlink(tmp.name)
	        except OSError:
	            pass
	        raise
	    return tmp.name


	def _cleanup(paths):
	    """Delete every file in ``paths``, ignoring files that cannot be removed.

	    Removal is best-effort: an OSError for one path (for example, the file
	    no longer exists) is swallowed and the remaining paths are still tried.
	    """
	    for tmp_path in paths:
	        try:
	            os.unlink(tmp_path)
	        except OSError:
	            # Best-effort cleanup: move on to the next path.
	            continue


	# ---------------------------------------------------------------------------
	# HelpScoutDashboardPDF
	# ---------------------------------------------------------------------------

	class HelpScoutDashboardPDF:
	    """
	    Generates a comprehensive HelpScout dashboard PDF report.

	    Each call to generate_report() creates a fresh MusoraPDF document, renders
	    the sections in a fixed order and returns the document as bytes. Charts
	    are rasterised to temporary PNG files whose paths are collected in
	    ``self._tmp`` and removed when generate_report() finishes or fails.
	    """

	    # sentiment_polarity labels counted as positive / negative in the KPIs.
	    _POSITIVE_LABELS = ["positive", "very_positive"]
	    _NEGATIVE_LABELS = ["negative", "very_negative"]

	    def __init__(self):
	        self.charts = HelpScoutCharts()
	        self.taxonomy = load_topic_taxonomy()
	        # The document is (re)created by generate_report().
	        self.pdf = None
	        self._tmp: list = []

	    def generate_report(self, df, filter_info: dict = None) -> bytes:
	        """Build and return the full dashboard PDF.

	        Args:
	            df: DataFrame of HelpScout conversations to report on.
	            filter_info: Optional filter description; forwarded to the cover
	                and data-summary sections.

	        Returns:
	            The rendered PDF document as bytes.
	        """
	        self.pdf = MusoraPDF()
	        self._tmp = []
	        try:
	            self._cover(df, filter_info)
	            self._executive_summary(df)
	            self._sentiment_section(df)
	            self._topic_section(df)
	            self._emotion_section(df)
	            self._flags_section(df)
	            self._status_source_section(df)
	            self._timelines_section(df)
	            self._depth_section(df)
	            self._member_section(df)
	            self._data_summary(df, filter_info)
	            return bytes(self.pdf.output())
	        finally:
	            # Chart PNGs are only needed until the document has been output.
	            _cleanup(self._tmp)

	    # ── Rendering helpers ──

	    def _add_chart(self, fig, width=180, img_w=800, img_h=400):
	        """Render ``fig`` and place it full-width at x=10.

	        ``width`` is the image width in document units; ``img_w``/``img_h``
	        are the pixel dimensions used for rendering. Any failure is logged
	        and replaced by a placeholder line so the report still completes.
	        """
	        try:
	            p = _fig_to_tmp(fig, img_w, img_h)
	            self._tmp.append(p)
	            # Height the image will occupy at ``width``, keeping its aspect ratio.
	            h_mm = width * (img_h / img_w)
	            self.pdf.check_page_break(h_mm + 5)
	            self.pdf.image(p, x=10, w=width)
	            self.pdf.ln(3)
	        except Exception:
	            logger.exception("Chart render failed")
	            self.pdf.body_text("[Chart could not be rendered]")

	    def _add_two_charts(self, fig1, fig2, width=92):
	        """Render two figures and place them next to each other.

	        Both figures are rendered at 700x450 pixels with the side-by-side
	        preset; the second image starts 4 units to the right of the first.
	        Any failure is logged and replaced by a placeholder line.
	        """
	        try:
	            p1 = _fig_to_tmp(fig1, 700, 450, is_side_by_side=True)
	            self._tmp.append(p1)
	            p2 = _fig_to_tmp(fig2, 700, 450, is_side_by_side=True)
	            self._tmp.append(p2)
	            h_mm = width * (450 / 700)
	            self.pdf.check_page_break(h_mm + 5)
	            # Both images share the same top edge; the cursor is then moved
	            # below them explicitly.
	            y = self.pdf.get_y()
	            self.pdf.image(p1, x=10, y=y, w=width)
	            self.pdf.image(p2, x=10 + width + 4, y=y, w=width)
	            self.pdf.set_y(y + h_mm + 3)
	        except Exception:
	            logger.exception("Side-by-side render failed")
	            self.pdf.body_text("[Charts could not be rendered]")

	    # ── Metric helpers ──

	    @staticmethod
	    def _polarity_pct(df, labels) -> float:
	        """Return the percentage of rows whose sentiment_polarity is in ``labels``.

	        Returns 0.0 for an empty frame or when the column is missing, so the
	        KPI tiles never raise on partially populated data.
	        """
	        total = len(df)
	        if not total or "sentiment_polarity" not in df.columns:
	            return 0.0
	        matches = df["sentiment_polarity"].isin(labels).sum()
	        return float(matches) / total * 100

	    @staticmethod
	    def _mean_duration(df) -> float:
	        """Return the mean of ``duration_hours``, or 0.0 when there is no value.

	        NaN entries are dropped first; without this an empty or all-NaN
	        column would yield NaN and be printed as "nan" in the report.
	        """
	        if "duration_hours" not in df.columns:
	            return 0.0
	        durations = df["duration_hours"].dropna()
	        if durations.empty:
	            return 0.0
	        return float(durations.mean())

	    # ── Sections ──

	    def _cover(self, df, filter_info):
	        """Render the cover page: title, generation time, totals, data period."""
	        self.pdf.add_page()
	        self.pdf.ln(40)
	        r, g, b = MusoraPDF.PRIMARY
	        self.pdf.set_fill_color(r, g, b)
	        self.pdf.rect(0, 60, 210, 4, style="F")
	        self.pdf.ln(20)
	        self.pdf.set_font("Helvetica", "B", 28)
	        self.pdf.set_text_color(r, g, b)
	        self.pdf.cell(0, 15, "Musora", align="C", new_x="LMARGIN", new_y="NEXT")
	        self.pdf.set_font("Helvetica", "", 16)
	        self.pdf.set_text_color(80, 80, 80)
	        self.pdf.cell(0, 10, "HelpScout Support Dashboard Report",
	                      align="C", new_x="LMARGIN", new_y="NEXT")
	        self.pdf.ln(10)
	        self.pdf.set_font("Helvetica", "", 12)
	        self.pdf.set_text_color(100, 100, 100)
	        self.pdf.cell(0, 8, f"Generated: {datetime.now().strftime('%B %d, %Y at %H:%M')}",
	                      align="C", new_x="LMARGIN", new_y="NEXT")
	        self.pdf.ln(5)
	        self.pdf.set_font("Helvetica", "", 10)
	        self.pdf.cell(0, 7, f"Total Conversations: {len(df):,}",
	                      align="C", new_x="LMARGIN", new_y="NEXT")
	        # The data period line is only shown when at least one timestamp exists.
	        if "first_message_at" in df.columns and not df.empty:
	            valid = df["first_message_at"].dropna()
	            if not valid.empty:
	                dr = f"{valid.min().strftime('%b %d, %Y')} to {valid.max().strftime('%b %d, %Y')}"
	                self.pdf.ln(3)
	                self.pdf.set_font("Helvetica", "I", 9)
	                self.pdf.set_text_color(120, 120, 120)
	                self.pdf.cell(0, 6, MusoraPDF._sanitize(f"Data period: {dr}"),
	                              align="C", new_x="LMARGIN", new_y="NEXT")
	        self.pdf.ln(20)
	        self.pdf.set_font("Helvetica", "I", 8)
	        self.pdf.set_text_color(150, 150, 150)
	        self.pdf.cell(0, 6, "Confidential - For Internal Use Only",
	                      align="C", new_x="LMARGIN", new_y="NEXT")

	    def _executive_summary(self, df):
	        """Render two rows of headline metrics on a new page."""
	        self.pdf.add_page()
	        self.pdf.section_header("Executive Summary")
	        total = len(df)
	        flags = boolean_flag_counts(df)
	        pos_pct = self._polarity_pct(df, self._POSITIVE_LABELS)
	        neg_pct = self._polarity_pct(df, self._NEGATIVE_LABELS)
	        esc = int(df["is_escalation"].sum()) if "is_escalation" in df.columns else 0
	        avg_dur = self._mean_duration(df)

	        self.pdf.metric_row([
	            ("Total Conversations", f"{total:,}"),
	            ("Positive %", f"{pos_pct:.1f}%"),
	            ("Negative %", f"{neg_pct:.1f}%"),
	            ("Avg Duration (h)", f"{avg_dur:.1f}"),
	        ])
	        self.pdf.metric_row([
	            ("Escalations", f"{esc:,}"),
	            ("Refund Requests", f"{flags['is_refund_request']:,}"),
	            ("Cancellations", f"{flags['is_cancellation']:,}"),
	            ("Membership Joins", f"{flags['is_membership']:,}"),
	        ])

	    def _sentiment_section(self, df):
	        """Render the sentiment pie chart next to the average-score gauge."""
	        self.pdf.add_page()
	        self.pdf.section_header("Sentiment Distribution")
	        pie = self.charts.create_sentiment_pie_chart(df, title="Sentiment Distribution")
	        gauge = self.charts.create_sentiment_score_gauge(self._avg_score(df))
	        self._add_two_charts(pie, gauge)

	    def _topic_section(self, df):
	        """Render topic bar/pie charts side by side plus the topic-sentiment heatmap."""
	        self.pdf.add_page()
	        self.pdf.section_header("Topic Analysis")
	        bar = self.charts.create_topic_bar_chart(df, title="Conversations by Topic")
	        pie = self.charts.create_topic_pie_chart(df, title="Topic Share")
	        self._add_two_charts(bar, pie)
	        self._add_chart(self.charts.create_topic_sentiment_heatmap(df), img_h=500)

	    def _emotion_section(self, df):
	        """Render the emotion chart; skipped when there is no emotion data."""
	        if "emotions" not in df.columns or df["emotions"].dropna().empty:
	            return
	        self.pdf.add_page()
	        self.pdf.section_header("Emotion Analysis")
	        self._add_chart(self.charts.create_emotion_bar_chart(df, title="Emotion Distribution"))

	    def _flags_section(self, df):
	        """Render the boolean-flags chart next to the escalation breakdown."""
	        self.pdf.add_page()
	        self.pdf.section_header("Billing & Membership Flags")
	        flags_chart = self.charts.create_boolean_flags_chart(df)
	        esc_chart = self.charts.create_escalation_breakdown(df)
	        self._add_two_charts(flags_chart, esc_chart)

	    def _status_source_section(self, df):
	        """Render the status and source distribution charts side by side."""
	        self.pdf.add_page()
	        self.pdf.section_header("Status & Source Distribution")
	        status_chart = self.charts.create_status_distribution(df)
	        source_chart = self.charts.create_source_distribution(df)
	        self._add_two_charts(status_chart, source_chart)

	    def _timelines_section(self, df):
	        """Render the volume, sentiment and refund/cancel timelines (freq="W")."""
	        self.pdf.add_page()
	        self.pdf.section_header("Volume & Trends (Weekly)")
	        self._add_chart(self.charts.create_volume_timeline(df, freq="W"))
	        self._add_chart(self.charts.create_sentiment_timeline(df, freq="W"))
	        self._add_chart(self.charts.create_refund_cancel_timeline(df, freq="W"))

	    def _depth_section(self, df):
	        """Render the duration and thread-count histograms side by side."""
	        self.pdf.add_page()
	        self.pdf.section_header("Conversation Depth")
	        dur = self.charts.create_duration_histogram(df)
	        thd = self.charts.create_thread_count_histogram(df)
	        self._add_two_charts(dur, thd)

	    def _member_section(self, df):
	        """Render member vs non-member metrics and charts.

	        Skipped entirely when the frame has no ``is_member`` column.
	        """
	        if "is_member" not in df.columns:
	            return
	        self.pdf.add_page()
	        self.pdf.section_header("Member vs Non-Member Analysis")
	        total = len(df)
	        member_count = int(df["is_member"].sum())
	        non_member_count = total - member_count
	        match_pct = member_count / total * 100 if total else 0
	        self.pdf.metric_row([
	            ("Members", f"{member_count:,}"),
	            ("Non-Members", f"{non_member_count:,}"),
	            ("Email Match Rate", f"{match_pct:.1f}%"),
	        ])
	        self.pdf.body_text(
	            "Members are customers whose email was matched against Musora user records. "
	            "Non-Members contacted support without an associated Musora account."
	        )
	        self._add_two_charts(
	            self.charts.create_member_status_chart(df, title="Member vs Non-Member"),
	            self.charts.create_member_sentiment_chart(df, title="Sentiment by Member Status"),
	        )
	        self._add_chart(
	            self.charts.create_member_topic_chart(df, title="Top Topics by Member Status"),
	            img_h=500,
	        )

	    def _data_summary(self, df, filter_info):
	        """Render the closing page: generation time, row count and data source."""
	        self.pdf.add_page()
	        self.pdf.section_header("Data Summary")
	        self.pdf.body_text(f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
	        self.pdf.body_text(f"Total conversations: {len(df):,}")
	        self.pdf.callout_box(
	            "Data source: SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES\n"
	            "This report is confidential and intended for internal Musora team use only.",
	            bg_color=(245, 245, 245),
	        )

	    @staticmethod
	    def _avg_score(df) -> float:
	        """Return the mean sentiment score on a -2..2 scale.

	        Polarity labels are mapped to integers (very_negative=-2 ...
	        very_positive=2); unknown or missing labels count as 0. Returns 0.0
	        for an empty frame or when the column is absent.
	        """
	        score_map = {"very_positive": 2, "positive": 1, "neutral": 0,
	                     "negative": -1, "very_negative": -2}
	        if "sentiment_polarity" not in df.columns or df.empty:
	            return 0.0
	        return float(df["sentiment_polarity"].map(score_map).fillna(0).mean())


	# ---------------------------------------------------------------------------
	# HelpScoutAnalysisPDF
	# ---------------------------------------------------------------------------

	class HelpScoutAnalysisPDF:
	    """
	    Generates a focused analysis PDF from the HelpScout Analysis page.
	    Includes filter summary, distributions, and optionally the LLM summary report.

	    Each call to generate_report() creates a fresh MusoraPDF document. Chart
	    images are written to temporary PNG files tracked in ``self._tmp`` and
	    removed when generate_report() finishes or fails.
	    """

	    # sentiment_polarity labels counted as positive / negative in the KPIs.
	    _POSITIVE_LABELS = ["positive", "very_positive"]
	    _NEGATIVE_LABELS = ["negative", "very_negative"]

	    def __init__(self):
	        self.charts = HelpScoutCharts()
	        self.taxonomy = load_topic_taxonomy()
	        # The document is (re)created by generate_report().
	        self.pdf = None
	        self._tmp: list = []

	    def generate_report(self, df, filter_info: dict = None,
	                        summary_result: dict = None) -> bytes:
	        """
	        Build and return the analysis PDF.

	        Args:
	            df: Filtered HelpScout analysis DataFrame.
	            filter_info: Dict of filter descriptions for the cover.
	            summary_result: Output from HelpScoutSummaryAgent.process() or None.

	        Returns:
	            The rendered PDF document as bytes.
	        """
	        self.pdf = MusoraPDF()
	        self._tmp = []
	        try:
	            self._cover(df, filter_info)
	            self._filter_summary_section(filter_info, df)
	            self._kpi_section(df)
	            self._distributions_section(df)
	            self._summary_section(summary_result)
	            self._data_summary(df, filter_info)
	            return bytes(self.pdf.output())
	        finally:
	            # Chart PNGs are only needed until the document has been output.
	            _cleanup(self._tmp)

	    # ── Rendering helpers ──

	    def _add_chart(self, fig, width=180, img_w=800, img_h=400):
	        """Render ``fig`` and place it full-width at x=10.

	        ``width`` is the image width in document units; ``img_w``/``img_h``
	        are the pixel dimensions used for rendering. Any failure is logged
	        and replaced by a placeholder line so the report still completes.
	        """
	        try:
	            p = _fig_to_tmp(fig, img_w, img_h)
	            self._tmp.append(p)
	            # Height the image will occupy at ``width``, keeping its aspect ratio.
	            h_mm = width * (img_h / img_w)
	            self.pdf.check_page_break(h_mm + 5)
	            self.pdf.image(p, x=10, w=width)
	            self.pdf.ln(3)
	        except Exception:
	            logger.exception("Chart render failed")
	            self.pdf.body_text("[Chart could not be rendered]")

	    def _add_two_charts(self, fig1, fig2, width=92):
	        """Render two figures and place them next to each other.

	        Both figures are rendered at 700x450 pixels with the side-by-side
	        preset; the second image starts 4 units to the right of the first.
	        Any failure is logged and replaced by a placeholder line.
	        """
	        try:
	            p1 = _fig_to_tmp(fig1, 700, 450, is_side_by_side=True)
	            self._tmp.append(p1)
	            p2 = _fig_to_tmp(fig2, 700, 450, is_side_by_side=True)
	            self._tmp.append(p2)
	            h_mm = width * (450 / 700)
	            self.pdf.check_page_break(h_mm + 5)
	            # Both images share the same top edge; the cursor is then moved
	            # below them explicitly.
	            y = self.pdf.get_y()
	            self.pdf.image(p1, x=10, y=y, w=width)
	            self.pdf.image(p2, x=10 + width + 4, y=y, w=width)
	            self.pdf.set_y(y + h_mm + 3)
	        except Exception:
	            logger.exception("Side-by-side render failed")
	            self.pdf.body_text("[Charts could not be rendered]")

	    # ── Metric / text helpers ──

	    @staticmethod
	    def _polarity_pct(df, labels) -> float:
	        """Return the percentage of rows whose sentiment_polarity is in ``labels``.

	        Returns 0.0 for an empty frame or when the column is missing.
	        """
	        total = len(df)
	        if not total or "sentiment_polarity" not in df.columns:
	            return 0.0
	        matches = df["sentiment_polarity"].isin(labels).sum()
	        return float(matches) / total * 100

	    @staticmethod
	    def _mean_duration(df) -> float:
	        """Return the mean of ``duration_hours``, or 0.0 when there is no value.

	        NaN entries are dropped first; without this an empty or all-NaN
	        column would yield NaN and be printed as "nan" in the report.
	        """
	        if "duration_hours" not in df.columns:
	            return 0.0
	        durations = df["duration_hours"].dropna()
	        if durations.empty:
	            return 0.0
	        return float(durations.mean())

	    @staticmethod
	    def _meta_footer(meta) -> str:
	        """Build the one-line footer describing how the AI summary was produced.

	        Missing keys fall back to 0 / 'N/A'. A token count that is None is
	        shown as 0, and one that cannot take thousands separators is shown
	        via str() instead of raising.
	        """
	        meta = meta or {}
	        tokens = meta.get("tokens_used") or 0
	        try:
	            tokens_text = f"{tokens:,}"
	        except (TypeError, ValueError):
	            tokens_text = str(tokens)
	        return (
	            f"Analysis based on {meta.get('total_conversations_analyzed', 0)} conversations "
	            f"| Model: {meta.get('model_used', 'N/A')} "
	            f"| Tokens: {tokens_text}"
	        )

	    # ── Sections ──

	    def _cover(self, df, filter_info):
	        """Render the cover page: title, generation time, match count, filters."""
	        self.pdf.add_page()
	        self.pdf.ln(40)
	        r, g, b = MusoraPDF.PRIMARY
	        self.pdf.set_fill_color(r, g, b)
	        self.pdf.rect(0, 60, 210, 4, style="F")
	        self.pdf.ln(20)
	        self.pdf.set_font("Helvetica", "B", 28)
	        self.pdf.set_text_color(r, g, b)
	        self.pdf.cell(0, 15, "Musora", align="C", new_x="LMARGIN", new_y="NEXT")
	        self.pdf.set_font("Helvetica", "", 16)
	        self.pdf.set_text_color(80, 80, 80)
	        self.pdf.cell(0, 10, "HelpScout Analysis Report",
	                      align="C", new_x="LMARGIN", new_y="NEXT")
	        self.pdf.ln(10)
	        self.pdf.set_font("Helvetica", "", 12)
	        self.pdf.set_text_color(100, 100, 100)
	        self.pdf.cell(0, 8, f"Generated: {datetime.now().strftime('%B %d, %Y at %H:%M')}",
	                      align="C", new_x="LMARGIN", new_y="NEXT")
	        self.pdf.ln(5)
	        self.pdf.set_font("Helvetica", "", 10)
	        self.pdf.cell(0, 7, f"Matched Conversations: {len(df):,}",
	                      align="C", new_x="LMARGIN", new_y="NEXT")
	        if filter_info:
	            self.pdf.ln(8)
	            self.pdf.set_font("Helvetica", "B", 9)
	            self.pdf.set_text_color(80, 80, 80)
	            self.pdf.cell(0, 6, "Applied Filters:", align="C", new_x="LMARGIN", new_y="NEXT")
	            self.pdf.set_font("Helvetica", "", 9)
	            # Filters with an empty/falsy value are not listed.
	            for k, v in filter_info.items():
	                if v:
	                    self.pdf.cell(0, 5, MusoraPDF._sanitize(f"{k}: {v}"),
	                                  align="C", new_x="LMARGIN", new_y="NEXT")
	        self.pdf.ln(20)
	        self.pdf.set_font("Helvetica", "I", 8)
	        self.pdf.set_text_color(150, 150, 150)
	        self.pdf.cell(0, 6, "Confidential - For Internal Use Only",
	                      align="C", new_x="LMARGIN", new_y="NEXT")

	    def _filter_summary_section(self, filter_info, df):
	        """Render the active filters as a table, or a note when none are active.

	        A filter counts as active when its value is truthy. The note is shown
	        both when ``filter_info`` is empty/None and when every value in it is
	        falsy, so the section is never left blank under its header.
	        """
	        self.pdf.add_page()
	        self.pdf.section_header("Filter Set Summary")
	        rows = []
	        if filter_info:
	            rows = [(k, MusoraPDF._sanitize(str(v))) for k, v in filter_info.items() if v]
	        if rows:
	            self.pdf.add_table(["Filter", "Value"], rows, col_widths=[80, 110])
	        else:
	            self.pdf.body_text("No filters applied — report covers all available conversations.")

	    def _kpi_section(self, df):
	        """Render the key-metric rows (no page break; continues the current page)."""
	        total = len(df)
	        flags = boolean_flag_counts(df)
	        neg_pct = self._polarity_pct(df, self._NEGATIVE_LABELS)
	        pos_pct = self._polarity_pct(df, self._POSITIVE_LABELS)
	        avg_dur = self._mean_duration(df)
	        esc = int(df["is_escalation"].sum()) if "is_escalation" in df.columns else 0

	        self.pdf.section_header("Key Metrics")
	        self.pdf.metric_row([
	            ("Conversations", f"{total:,}"),
	            ("Positive %", f"{pos_pct:.1f}%"),
	            ("Negative %", f"{neg_pct:.1f}%"),
	            ("Avg Duration (h)", f"{avg_dur:.1f}"),
	        ])
	        self.pdf.metric_row([
	            ("Escalations", f"{esc:,}"),
	            ("Refund Requests", f"{flags['is_refund_request']:,}"),
	            ("Cancellations", f"{flags['is_cancellation']:,}"),
	            ("Membership Joins", f"{flags['is_membership']:,}"),
	        ])
	        if "is_member" in df.columns:
	            member_count = int(df["is_member"].sum())
	            non_member_count = total - member_count
	            self.pdf.metric_row([
	                ("Members", f"{member_count:,}"),
	                ("Non-Members", f"{non_member_count:,}"),
	                ("Email Match Rate", f"{member_count / total * 100:.1f}%" if total else "N/A"),
	            ])

	    def _distributions_section(self, df):
	        """Render sentiment/topic charts and, if available, the member breakdown."""
	        self.pdf.add_page()
	        self.pdf.section_header("Distributions")
	        pie = self.charts.create_sentiment_pie_chart(df, title="Sentiment Distribution")
	        tbar = self.charts.create_topic_bar_chart(df, title="Topic Distribution")
	        self._add_two_charts(pie, tbar)
	        self._add_chart(self.charts.create_topic_sentiment_heatmap(df), img_h=500)
	        if "is_member" in df.columns:
	            self.pdf.add_page()
	            self.pdf.section_header("Member vs Non-Member Breakdown")
	            self._add_two_charts(
	                self.charts.create_member_status_chart(df, title="Member vs Non-Member"),
	                self.charts.create_member_sentiment_chart(df, title="Sentiment by Member Status"),
	            )
	            self._add_chart(
	                self.charts.create_member_topic_chart(df, title="Top Topics by Member Status"),
	                img_h=500,
	            )

	    def _summary_section(self, result: dict):
	        """Render the AI summary, or a hint on how to generate it.

	        Args:
	            result: Summary payload with ``success``, ``summary`` and
	                ``metadata`` keys, or None. When it is None or ``success``
	                is falsy, only a callout explaining how to include the
	                summary is rendered.
	        """
	        self.pdf.add_page()
	        self.pdf.section_header("AI Summary Report")

	        if result is None or not result.get("success"):
	            self.pdf.callout_box(
	                "AI summary not generated. To include it, click 'Generate Summary Report' "
	                "in the app before exporting the PDF.",
	                bg_color=(255, 250, 230),
	            )
	            return

	        # ``or`` fallbacks also cover keys that are present but set to None.
	        summary = result.get("summary") or {}
	        meta = result.get("metadata") or {}

	        exec_summary = MusoraPDF._sanitize(summary.get("executive_summary") or "")
	        if exec_summary:
	            self.pdf.subsection_header("Executive Summary")
	            self.pdf.section_description(exec_summary)

	        themes = summary.get("top_themes") or []
	        if themes:
	            self.pdf.subsection_header("Top Themes")
	            for t in themes:
	                if isinstance(t, dict):
	                    raw_theme = (
	                        f"{t.get('theme', '')} — {t.get('prevalence', '')}: {t.get('description', '')}"
	                    )
	                else:
	                    # Tolerate a theme delivered as plain text.
	                    raw_theme = str(t)
	                theme_text = MusoraPDF._sanitize(raw_theme)
	                self.pdf.body_text(f" * {theme_text}")

	        complaints = summary.get("top_complaints") or []
	        if complaints:
	            self.pdf.subsection_header("Top Complaints")
	            for c in complaints:
	                self.pdf.body_text(f" * {MusoraPDF._sanitize(c)}")

	        insights = summary.get("unexpected_insights") or []
	        if insights:
	            self.pdf.subsection_header("Unexpected Insights")
	            for ins in insights:
	                self.pdf.body_text(f" * {MusoraPDF._sanitize(ins)}")

	        quotes = summary.get("notable_quotes") or []
	        if quotes:
	            self.pdf.subsection_header("Notable Quotes")
	            for q in quotes:
	                self.pdf.body_text(f' "{MusoraPDF._sanitize(q)}"')

	        self.pdf.ln(4)
	        self.pdf.callout_box(self._meta_footer(meta), bg_color=(240, 248, 255))

	    def _data_summary(self, df, filter_info):
	        """Render the closing page: generation time, row count and data source."""
	        self.pdf.add_page()
	        self.pdf.section_header("Data Summary")
	        self.pdf.body_text(f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
	        self.pdf.body_text(f"Total conversations in report: {len(df):,}")
	        self.pdf.callout_box(
	            "Data source: SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES\n"
	            "This report is confidential and intended for internal Musora team use only.",
	            bg_color=(245, 245, 245),
	        )