Spaces:

MusoraProductDepartment
/

Sentiment_analysis

Sleeping

File size: 21,723 Bytes

"""
HelpScout PDF Exporters.

Two classes, each building its report on a MusoraPDF instance from pdf_exporter.py:
  - HelpScoutDashboardPDF  : full HelpScout dashboard report
  - HelpScoutAnalysisPDF   : filtered analysis report + optional LLM summary
"""
import logging
import os
import sys
import tempfile
from datetime import datetime
from pathlib import Path

import plotly.io as pio

# Put the parent of this file's directory at the front of sys.path (only if it
# is not already listed) — presumably the project root, so that the `utils` and
# `visualizations` imports below resolve; verify against the repo layout.
_parent = Path(__file__).resolve().parent.parent
if str(_parent) not in sys.path:
    sys.path.insert(0, str(_parent))

from utils.pdf_exporter import MusoraPDF          # reuse base class
from utils.helpscout_utils import boolean_flag_counts, topic_label, load_topic_taxonomy
from visualizations.helpscout_charts import HelpScoutCharts

# Module-level logger; the exporters use it to record chart-rendering failures.
logger = logging.getLogger(__name__)

# Scale factor passed to pio.to_image() for every chart rendered to PNG.
_RENDER_SCALE = 3


# ---------------------------------------------------------------------------
# Shared rendering helpers (mixin-style functions)
# ---------------------------------------------------------------------------

def _prepare_fig(fig, is_side_by_side=False):
    base_fs = 13 if is_side_by_side else 14
    fig.update_layout(
        paper_bgcolor="white", plot_bgcolor="white",
        font=dict(color="black", size=base_fs),
        title_font_size=base_fs + 4,
        margin=(dict(l=60, r=40, t=60, b=60) if is_side_by_side else dict(l=80, r=40, t=60, b=80)),
    )
    fig.update_xaxes(automargin=True)
    fig.update_yaxes(automargin=True)


def _fig_to_tmp(fig, width=800, height=400, is_side_by_side=False) -> str:
    """
    Render ``fig`` to a temporary PNG file and return the file's path.

    The figure is first restyled in place by ``_prepare_fig`` and then
    rasterised with ``pio.to_image`` (kaleido engine) at ``_RENDER_SCALE``.

    Args:
        fig:             Figure to render; it is modified in place.
        width:           Image width passed to ``pio.to_image``.
        height:          Image height passed to ``pio.to_image``.
        is_side_by_side: Selects the compact styling in ``_prepare_fig``.

    Returns:
        Path of the PNG file. The file is created with ``delete=False``, so
        the caller is responsible for removing it.
    """
    _prepare_fig(fig, is_side_by_side)
    img = pio.to_image(fig, format="png", width=width, height=height,
                       scale=_RENDER_SCALE, engine="kaleido")
    tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    try:
        # The context manager closes the handle even when the write fails.
        with tmp:
            tmp.write(img)
    except Exception:
        # The caller never receives the path on failure, so the partial file
        # has to be removed here or it would be orphaned on disk.
        try:
            os.unlink(tmp.name)
        except OSError:
            pass
        raise
    return tmp.name


def _cleanup(paths):
    for p in paths:
        try:
            os.unlink(p)
        except OSError:
            pass


# ---------------------------------------------------------------------------
# HelpScoutDashboardPDF
# ---------------------------------------------------------------------------

class HelpScoutDashboardPDF:
    """
    Generates a comprehensive HelpScout dashboard PDF report.

    Each ``generate_report`` call builds the document on a fresh ``MusoraPDF``
    instance, section by section. Charts are rendered to temporary PNG files
    that are deleted once the PDF bytes have been produced (or on failure).
    """

    def __init__(self):
        self.charts = HelpScoutCharts()
        self.taxonomy = load_topic_taxonomy()
        # Created per report in generate_report(); declared here so the
        # attribute always exists on the instance.
        self.pdf = None
        self._tmp: list = []

    def generate_report(self, df, filter_info: dict = None) -> bytes:
        """
        Build and return the full dashboard PDF.

        Args:
            df:          HelpScout conversations DataFrame to report on.
            filter_info: Optional dict of filter descriptions. It is forwarded
                         to the cover and data-summary sections, which do not
                         currently render it.

        Returns:
            The rendered PDF as bytes.
        """
        self.pdf = MusoraPDF()
        self._tmp = []
        try:
            self._cover(df, filter_info)
            self._executive_summary(df)
            self._sentiment_section(df)
            self._topic_section(df)
            self._emotion_section(df)
            self._flags_section(df)
            self._status_source_section(df)
            self._timelines_section(df)
            self._depth_section(df)
            self._member_section(df)
            self._data_summary(df, filter_info)
            return bytes(self.pdf.output())
        finally:
            # Temp chart images are only needed until the PDF is serialised.
            _cleanup(self._tmp)

    # ── Rendering helpers ──

    def _add_chart(self, fig, width=180, img_w=800, img_h=400):
        """
        Render ``fig`` and place it full-width on the current page.

        ``width`` is the placed image width in PDF units; ``img_w``/``img_h``
        are the render dimensions. A render or placement failure is logged
        and replaced by a placeholder line instead of aborting the report.
        """
        try:
            p = _fig_to_tmp(fig, img_w, img_h)
            self._tmp.append(p)
            # Placed height follows from the render aspect ratio.
            h_mm = width * (img_h / img_w)
            self.pdf.check_page_break(h_mm + 5)
            self.pdf.image(p, x=10, w=width)
            self.pdf.ln(3)
        except Exception:
            logger.exception("Chart render failed")
            self.pdf.body_text("[Chart could not be rendered]")

    def _add_two_charts(self, fig1, fig2, width=92):
        """
        Render two figures and place them side by side at the same height.

        Both figures are rendered at 700x450; ``width`` is the placed width
        of each image. If either step fails, the error is logged and a single
        placeholder line is written instead.
        """
        try:
            p1 = _fig_to_tmp(fig1, 700, 450, is_side_by_side=True)
            self._tmp.append(p1)
            p2 = _fig_to_tmp(fig2, 700, 450, is_side_by_side=True)
            self._tmp.append(p2)
            h_mm = width * (450 / 700)
            self.pdf.check_page_break(h_mm + 5)
            # Pin both images to the same y so they line up horizontally.
            y = self.pdf.get_y()
            self.pdf.image(p1, x=10, y=y, w=width)
            self.pdf.image(p2, x=10 + width + 4, y=y, w=width)
            self.pdf.set_y(y + h_mm + 3)
        except Exception:
            logger.exception("Side-by-side render failed")
            self.pdf.body_text("[Charts could not be rendered]")

    # ── Metric helpers ──

    @staticmethod
    def _polarity_pct(df, labels) -> float:
        """
        Return the percentage of rows whose ``sentiment_polarity`` is in ``labels``.

        Returns 0.0 for an empty frame or when the column is absent, instead
        of raising ``KeyError``.
        """
        total = len(df)
        if not total or "sentiment_polarity" not in df.columns:
            return 0.0
        matched = df["sentiment_polarity"].isin(list(labels)).sum()
        return float(matched / total * 100)

    @staticmethod
    def _column_mean(df, column) -> float:
        """
        Return the mean of ``df[column]``, or 0.0 when it cannot be computed.

        A missing column and a NaN mean (empty or all-null column) both give
        0.0 so the report never prints "nan".
        """
        if column not in df.columns:
            return 0.0
        mean = float(df[column].mean())
        # NaN is the only float that is not equal to itself.
        return 0.0 if mean != mean else mean

    # ── Sections ──

    def _cover(self, df, filter_info):
        """Add the cover page: title, generation time, row count, data period."""
        self.pdf.add_page()
        self.pdf.ln(40)
        r, g, b = MusoraPDF.PRIMARY
        self.pdf.set_fill_color(r, g, b)
        # Filled accent bar in the primary colour.
        self.pdf.rect(0, 60, 210, 4, style="F")
        self.pdf.ln(20)
        self.pdf.set_font("Helvetica", "B", 28)
        self.pdf.set_text_color(r, g, b)
        self.pdf.cell(0, 15, "Musora", align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.set_font("Helvetica", "", 16)
        self.pdf.set_text_color(80, 80, 80)
        self.pdf.cell(0, 10, "HelpScout Support Dashboard Report",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(10)
        self.pdf.set_font("Helvetica", "", 12)
        self.pdf.set_text_color(100, 100, 100)
        self.pdf.cell(0, 8, f"Generated: {datetime.now().strftime('%B %d, %Y at %H:%M')}",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(5)
        self.pdf.set_font("Helvetica", "", 10)
        self.pdf.cell(0, 7, f"Total Conversations: {len(df):,}",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        if "first_message_at" in df.columns and not df.empty:
            # NOTE(review): assumes first_message_at holds datetime-like
            # values (strftime is called on min/max) — confirm with the loader.
            valid = df["first_message_at"].dropna()
            if not valid.empty:
                dr = f"{valid.min().strftime('%b %d, %Y')} to {valid.max().strftime('%b %d, %Y')}"
                self.pdf.ln(3)
                self.pdf.set_font("Helvetica", "I", 9)
                self.pdf.set_text_color(120, 120, 120)
                self.pdf.cell(0, 6, MusoraPDF._sanitize(f"Data period: {dr}"),
                              align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(20)
        self.pdf.set_font("Helvetica", "I", 8)
        self.pdf.set_text_color(150, 150, 150)
        self.pdf.cell(0, 6, "Confidential - For Internal Use Only",
                      align="C", new_x="LMARGIN", new_y="NEXT")

    def _executive_summary(self, df):
        """Add the headline metrics page (volume, sentiment split, flags)."""
        self.pdf.add_page()
        self.pdf.section_header("Executive Summary")
        total = len(df)
        flags = boolean_flag_counts(df)
        neg_pct = self._polarity_pct(df, ("negative", "very_negative"))
        pos_pct = self._polarity_pct(df, ("positive", "very_positive"))
        esc = int(df["is_escalation"].sum()) if "is_escalation" in df.columns else 0
        avg_dur = self._column_mean(df, "duration_hours")

        self.pdf.metric_row([
            ("Total Conversations", f"{total:,}"),
            ("Positive %", f"{pos_pct:.1f}%"),
            ("Negative %", f"{neg_pct:.1f}%"),
            ("Avg Duration (h)", f"{avg_dur:.1f}"),
        ])
        self.pdf.metric_row([
            ("Escalations", f"{esc:,}"),
            ("Refund Requests", f"{flags['is_refund_request']:,}"),
            ("Cancellations", f"{flags['is_cancellation']:,}"),
            ("Membership Joins", f"{flags['is_membership']:,}"),
        ])

    def _sentiment_section(self, df):
        """Add the sentiment page: distribution pie next to the score gauge."""
        self.pdf.add_page()
        self.pdf.section_header("Sentiment Distribution")
        pie = self.charts.create_sentiment_pie_chart(df, title="Sentiment Distribution")
        gauge = self.charts.create_sentiment_score_gauge(self._avg_score(df))
        self._add_two_charts(pie, gauge)

    def _topic_section(self, df):
        """Add the topic page: bar + pie side by side, then the heatmap."""
        self.pdf.add_page()
        self.pdf.section_header("Topic Analysis")
        bar = self.charts.create_topic_bar_chart(df, title="Conversations by Topic")
        pie = self.charts.create_topic_pie_chart(df, title="Topic Share")
        self._add_two_charts(bar, pie)
        self._add_chart(self.charts.create_topic_sentiment_heatmap(df), img_h=500)

    def _emotion_section(self, df):
        """Add the emotion page; skipped when there is no ``emotions`` data."""
        if "emotions" not in df.columns or df["emotions"].dropna().empty:
            return
        self.pdf.add_page()
        self.pdf.section_header("Emotion Analysis")
        self._add_chart(self.charts.create_emotion_bar_chart(df, title="Emotion Distribution"))

    def _flags_section(self, df):
        """Add the billing/membership flags page."""
        self.pdf.add_page()
        self.pdf.section_header("Billing & Membership Flags")
        flags_chart = self.charts.create_boolean_flags_chart(df)
        esc_chart = self.charts.create_escalation_breakdown(df)
        self._add_two_charts(flags_chart, esc_chart)

    def _status_source_section(self, df):
        """Add the status and source distribution page."""
        self.pdf.add_page()
        self.pdf.section_header("Status & Source Distribution")
        status_chart = self.charts.create_status_distribution(df)
        source_chart = self.charts.create_source_distribution(df)
        self._add_two_charts(status_chart, source_chart)

    def _timelines_section(self, df):
        """Add the weekly timelines: volume, sentiment, refunds/cancellations."""
        self.pdf.add_page()
        self.pdf.section_header("Volume & Trends (Weekly)")
        self._add_chart(self.charts.create_volume_timeline(df, freq="W"))
        self._add_chart(self.charts.create_sentiment_timeline(df, freq="W"))
        self._add_chart(self.charts.create_refund_cancel_timeline(df, freq="W"))

    def _depth_section(self, df):
        """Add the conversation depth page (duration and thread-count histograms)."""
        self.pdf.add_page()
        self.pdf.section_header("Conversation Depth")
        dur = self.charts.create_duration_histogram(df)
        thd = self.charts.create_thread_count_histogram(df)
        self._add_two_charts(dur, thd)

    def _member_section(self, df):
        """Add the member vs non-member page; skipped without ``is_member``."""
        if "is_member" not in df.columns:
            return
        self.pdf.add_page()
        self.pdf.section_header("Member vs Non-Member Analysis")
        total = len(df)
        member_count = int(df["is_member"].sum())
        non_member_count = total - member_count
        match_pct = member_count / total * 100 if total else 0
        self.pdf.metric_row([
            ("Members",          f"{member_count:,}"),
            ("Non-Members",      f"{non_member_count:,}"),
            ("Email Match Rate", f"{match_pct:.1f}%"),
        ])
        self.pdf.body_text(
            "Members are customers whose email was matched against Musora user records. "
            "Non-Members contacted support without an associated Musora account."
        )
        self._add_two_charts(
            self.charts.create_member_status_chart(df, title="Member vs Non-Member"),
            self.charts.create_member_sentiment_chart(df, title="Sentiment by Member Status"),
        )
        self._add_chart(
            self.charts.create_member_topic_chart(df, title="Top Topics by Member Status"),
            img_h=500,
        )

    def _data_summary(self, df, filter_info):
        """Add the closing page with generation time, row count and data source."""
        self.pdf.add_page()
        self.pdf.section_header("Data Summary")
        self.pdf.body_text(f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        self.pdf.body_text(f"Total conversations: {len(df):,}")
        self.pdf.callout_box(
            "Data source: SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES\n"
            "This report is confidential and intended for internal Musora team use only.",
            bg_color=(245, 245, 245),
        )

    @staticmethod
    def _avg_score(df) -> float:
        """
        Return the mean sentiment score on a -2 (very negative) to +2 scale.

        Unmapped or missing polarity values count as 0. Returns 0.0 for an
        empty frame or when ``sentiment_polarity`` is absent.
        """
        score_map = {"very_positive": 2, "positive": 1, "neutral": 0,
                     "negative": -1, "very_negative": -2}
        if "sentiment_polarity" not in df.columns or df.empty:
            return 0.0
        return float(df["sentiment_polarity"].map(score_map).fillna(0).mean())


# ---------------------------------------------------------------------------
# HelpScoutAnalysisPDF
# ---------------------------------------------------------------------------

class HelpScoutAnalysisPDF:
    """
    Generates a focused analysis PDF from the HelpScout Analysis page.
    Includes filter summary, distributions, and optionally the LLM summary report.

    Each ``generate_report`` call builds the document on a fresh ``MusoraPDF``
    instance; temporary chart images are removed once the PDF is produced.
    """

    def __init__(self):
        self.charts = HelpScoutCharts()
        self.taxonomy = load_topic_taxonomy()
        # Created per report in generate_report(); declared here so the
        # attribute always exists on the instance.
        self.pdf = None
        self._tmp: list = []

    def generate_report(self, df, filter_info: dict = None,
                        summary_result: dict = None) -> bytes:
        """
        Build and return the analysis PDF.

        Args:
            df:             Filtered HelpScout analysis DataFrame.
            filter_info:    Dict of filter descriptions for the cover.
            summary_result: Output from HelpScoutSummaryAgent.process() or None.

        Returns:
            The rendered PDF as bytes.
        """
        self.pdf = MusoraPDF()
        self._tmp = []
        try:
            self._cover(df, filter_info)
            self._filter_summary_section(filter_info, df)
            self._kpi_section(df)
            self._distributions_section(df)
            self._summary_section(summary_result)
            self._data_summary(df, filter_info)
            return bytes(self.pdf.output())
        finally:
            # Temp chart images are only needed until the PDF is serialised.
            _cleanup(self._tmp)

    # ── Rendering helpers ──

    def _add_chart(self, fig, width=180, img_w=800, img_h=400):
        """
        Render ``fig`` and place it full-width on the current page.

        ``width`` is the placed image width in PDF units; ``img_w``/``img_h``
        are the render dimensions. A render or placement failure is logged
        and replaced by a placeholder line instead of aborting the report.
        """
        try:
            p = _fig_to_tmp(fig, img_w, img_h)
            self._tmp.append(p)
            # Placed height follows from the render aspect ratio.
            h_mm = width * (img_h / img_w)
            self.pdf.check_page_break(h_mm + 5)
            self.pdf.image(p, x=10, w=width)
            self.pdf.ln(3)
        except Exception:
            logger.exception("Chart render failed")
            self.pdf.body_text("[Chart could not be rendered]")

    def _add_two_charts(self, fig1, fig2, width=92):
        """
        Render two figures and place them side by side at the same height.

        Both figures are rendered at 700x450; ``width`` is the placed width
        of each image. If either step fails, the error is logged and a single
        placeholder line is written instead.
        """
        try:
            p1 = _fig_to_tmp(fig1, 700, 450, is_side_by_side=True)
            self._tmp.append(p1)
            p2 = _fig_to_tmp(fig2, 700, 450, is_side_by_side=True)
            self._tmp.append(p2)
            h_mm = width * (450 / 700)
            self.pdf.check_page_break(h_mm + 5)
            # Pin both images to the same y so they line up horizontally.
            y = self.pdf.get_y()
            self.pdf.image(p1, x=10, y=y, w=width)
            self.pdf.image(p2, x=10 + width + 4, y=y, w=width)
            self.pdf.set_y(y + h_mm + 3)
        except Exception:
            logger.exception("Side-by-side render failed")
            self.pdf.body_text("[Charts could not be rendered]")

    # ── Pure helpers ──

    @staticmethod
    def _active_filters(filter_info) -> list:
        """Return ``(name, value)`` pairs from ``filter_info`` whose value is truthy."""
        if not filter_info:
            return []
        return [(k, v) for k, v in filter_info.items() if v]

    @staticmethod
    def _polarity_pct(df, labels) -> float:
        """
        Return the percentage of rows whose ``sentiment_polarity`` is in ``labels``.

        Returns 0.0 for an empty frame or when the column is absent, instead
        of raising ``KeyError``.
        """
        total = len(df)
        if not total or "sentiment_polarity" not in df.columns:
            return 0.0
        matched = df["sentiment_polarity"].isin(list(labels)).sum()
        return float(matched / total * 100)

    @staticmethod
    def _column_mean(df, column) -> float:
        """
        Return the mean of ``df[column]``, or 0.0 when it cannot be computed.

        A missing column and a NaN mean (empty or all-null column) both give
        0.0 so the report never prints "nan".
        """
        if column not in df.columns:
            return 0.0
        mean = float(df[column].mean())
        # NaN is the only float that is not equal to itself.
        return 0.0 if mean != mean else mean

    @staticmethod
    def _as_int(value) -> int:
        """Coerce ``value`` to ``int``; anything that cannot be converted becomes 0."""
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return 0

    @staticmethod
    def _theme_text(theme) -> str:
        """
        Format one ``top_themes`` entry as a single (unsanitized) line.

        Dict entries are rendered as "theme — prevalence: description"; any
        other value (e.g. a plain string) is rendered via ``str``.
        """
        if isinstance(theme, dict):
            return (f"{theme.get('theme', '')} — {theme.get('prevalence', '')}: "
                    f"{theme.get('description', '')}")
        return str(theme)

    # ── Sections ──

    def _cover(self, df, filter_info):
        """Add the cover page: title, generation time, row count, active filters."""
        self.pdf.add_page()
        self.pdf.ln(40)
        r, g, b = MusoraPDF.PRIMARY
        self.pdf.set_fill_color(r, g, b)
        # Filled accent bar in the primary colour.
        self.pdf.rect(0, 60, 210, 4, style="F")
        self.pdf.ln(20)
        self.pdf.set_font("Helvetica", "B", 28)
        self.pdf.set_text_color(r, g, b)
        self.pdf.cell(0, 15, "Musora", align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.set_font("Helvetica", "", 16)
        self.pdf.set_text_color(80, 80, 80)
        self.pdf.cell(0, 10, "HelpScout Analysis Report",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(10)
        self.pdf.set_font("Helvetica", "", 12)
        self.pdf.set_text_color(100, 100, 100)
        self.pdf.cell(0, 8, f"Generated: {datetime.now().strftime('%B %d, %Y at %H:%M')}",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(5)
        self.pdf.set_font("Helvetica", "", 10)
        self.pdf.cell(0, 7, f"Matched Conversations: {len(df):,}",
                      align="C", new_x="LMARGIN", new_y="NEXT")
        # Only print the heading when at least one filter has a value, so the
        # cover never shows an "Applied Filters:" title with nothing under it.
        active = self._active_filters(filter_info)
        if active:
            self.pdf.ln(8)
            self.pdf.set_font("Helvetica", "B", 9)
            self.pdf.set_text_color(80, 80, 80)
            self.pdf.cell(0, 6, "Applied Filters:", align="C", new_x="LMARGIN", new_y="NEXT")
            self.pdf.set_font("Helvetica", "", 9)
            for k, v in active:
                self.pdf.cell(0, 5, MusoraPDF._sanitize(f"{k}: {v}"),
                              align="C", new_x="LMARGIN", new_y="NEXT")
        self.pdf.ln(20)
        self.pdf.set_font("Helvetica", "I", 8)
        self.pdf.set_text_color(150, 150, 150)
        self.pdf.cell(0, 6, "Confidential - For Internal Use Only",
                      align="C", new_x="LMARGIN", new_y="NEXT")

    def _filter_summary_section(self, filter_info, df):
        """
        Add the filter summary page.

        Shows a Filter/Value table when at least one filter has a value and a
        "no filters" note otherwise (including when every value is empty).
        ``df`` is accepted for interface compatibility and is not used.
        """
        self.pdf.add_page()
        self.pdf.section_header("Filter Set Summary")
        rows = [(k, MusoraPDF._sanitize(str(v)))
                for k, v in self._active_filters(filter_info)]
        if rows:
            self.pdf.add_table(["Filter", "Value"], rows, col_widths=[80, 110])
        else:
            # Sanitized like every other em-dash string in this class;
            # NOTE(review): presumably needed for the core Helvetica font —
            # confirm whether body_text sanitizes on its own.
            self.pdf.body_text(MusoraPDF._sanitize(
                "No filters applied — report covers all available conversations."
            ))

    def _kpi_section(self, df):
        """Add the key-metrics rows on the current page (no page break)."""
        total = len(df)
        flags = boolean_flag_counts(df)
        neg_pct = self._polarity_pct(df, ("negative", "very_negative"))
        pos_pct = self._polarity_pct(df, ("positive", "very_positive"))
        avg_dur = self._column_mean(df, "duration_hours")
        esc = int(df["is_escalation"].sum()) if "is_escalation" in df.columns else 0

        self.pdf.section_header("Key Metrics")
        self.pdf.metric_row([
            ("Conversations",     f"{total:,}"),
            ("Positive %",        f"{pos_pct:.1f}%"),
            ("Negative %",        f"{neg_pct:.1f}%"),
            ("Avg Duration (h)",  f"{avg_dur:.1f}"),
        ])
        self.pdf.metric_row([
            ("Escalations",       f"{esc:,}"),
            ("Refund Requests",   f"{flags['is_refund_request']:,}"),
            ("Cancellations",     f"{flags['is_cancellation']:,}"),
            ("Membership Joins",  f"{flags['is_membership']:,}"),
        ])
        if "is_member" in df.columns:
            member_count = int(df["is_member"].sum())
            non_member_count = total - member_count
            self.pdf.metric_row([
                ("Members",          f"{member_count:,}"),
                ("Non-Members",      f"{non_member_count:,}"),
                ("Email Match Rate", f"{member_count / total * 100:.1f}%" if total else "N/A"),
            ])

    def _distributions_section(self, df):
        """Add the distribution charts, plus a member breakdown page if available."""
        self.pdf.add_page()
        self.pdf.section_header("Distributions")
        pie = self.charts.create_sentiment_pie_chart(df, title="Sentiment Distribution")
        tbar = self.charts.create_topic_bar_chart(df, title="Topic Distribution")
        self._add_two_charts(pie, tbar)
        self._add_chart(self.charts.create_topic_sentiment_heatmap(df), img_h=500)
        if "is_member" in df.columns:
            self.pdf.add_page()
            self.pdf.section_header("Member vs Non-Member Breakdown")
            self._add_two_charts(
                self.charts.create_member_status_chart(df, title="Member vs Non-Member"),
                self.charts.create_member_sentiment_chart(df, title="Sentiment by Member Status"),
            )
            self._add_chart(
                self.charts.create_member_topic_chart(df, title="Top Topics by Member Status"),
                img_h=500,
            )

    def _summary_section(self, result: dict):
        """
        Add the AI summary page.

        When ``result`` is None or not marked successful, a hint on how to
        generate the summary is shown instead. Otherwise the ``summary`` dict
        (executive_summary, top_themes, top_complaints, unexpected_insights,
        notable_quotes) and the ``metadata`` dict are rendered. Missing/None
        entries and loosely typed values are tolerated so that malformed LLM
        output cannot abort the whole export.
        """
        self.pdf.add_page()
        self.pdf.section_header("AI Summary Report")

        if result is None or not result.get("success"):
            self.pdf.callout_box(
                "AI summary not generated. To include it, click 'Generate Summary Report' "
                "in the app before exporting the PDF.",
                bg_color=(255, 250, 230),
            )
            return

        # `or {}` also covers keys that are present but explicitly None.
        summary = result.get("summary") or {}
        meta = result.get("metadata") or {}

        exec_summary = MusoraPDF._sanitize(str(summary.get("executive_summary") or ""))
        if exec_summary:
            self.pdf.subsection_header("Executive Summary")
            self.pdf.section_description(exec_summary)

        themes = summary.get("top_themes") or []
        if themes:
            self.pdf.subsection_header("Top Themes")
            for t in themes:
                theme_text = MusoraPDF._sanitize(self._theme_text(t))
                self.pdf.body_text(f"  * {theme_text}")

        complaints = summary.get("top_complaints") or []
        if complaints:
            self.pdf.subsection_header("Top Complaints")
            for c in complaints:
                self.pdf.body_text(f"  * {MusoraPDF._sanitize(str(c))}")

        insights = summary.get("unexpected_insights") or []
        if insights:
            self.pdf.subsection_header("Unexpected Insights")
            for ins in insights:
                self.pdf.body_text(f"  * {MusoraPDF._sanitize(str(ins))}")

        quotes = summary.get("notable_quotes") or []
        if quotes:
            self.pdf.subsection_header("Notable Quotes")
            for q in quotes:
                self.pdf.body_text(f'  "{MusoraPDF._sanitize(str(q))}"')

        self.pdf.ln(4)
        # tokens_used is coerced because the ",' format spec only accepts numbers.
        self.pdf.callout_box(
            f"Analysis based on {meta.get('total_conversations_analyzed', 0)} conversations "
            f"| Model: {meta.get('model_used', 'N/A')} "
            f"| Tokens: {self._as_int(meta.get('tokens_used', 0)):,}",
            bg_color=(240, 248, 255),
        )

    def _data_summary(self, df, filter_info):
        """Add the closing page with generation time, row count and data source."""
        self.pdf.add_page()
        self.pdf.section_header("Data Summary")
        self.pdf.body_text(f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        self.pdf.body_text(f"Total conversations in report: {len(df):,}")
        self.pdf.callout_box(
            "Data source: SOCIAL_MEDIA_DB.ML_FEATURES.HELPSCOUT_CONVERSATION_FEATURES\n"
            "This report is confidential and intended for internal Musora team use only.",
            bg_color=(245, 245, 245),
        )