# Spaces: MusoraProductDepartment / Sentiment_analysis (Sleeping)
# File size: 8,101 Bytes
# 599973c

"""
Learning Paths utility helpers — pure functions, no Streamlit dependency.
"""
import json
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pandas as pd


# ── Config helpers ─────────────────────────────────────────────────────────────

def load_lp_config(config_path: Optional[str] = None) -> dict:
    """
    Load the ``learning_paths`` section of the visualization config file.

    Args:
        config_path: Path to a JSON config file. When ``None``, defaults to
            ``config/viz_config.json`` under the parent of this module's
            directory.

    Returns:
        The value stored under the top-level ``"learning_paths"`` key, or an
        empty dict when that key is absent or its value is null/empty.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if config_path is None:
        config_path = Path(__file__).resolve().parent.parent / "config" / "viz_config.json"
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which may mis-read non-ASCII labels.
    with open(config_path, encoding="utf-8") as f:
        # `or {}` keeps the dict return contract when the key is present but null.
        return json.load(f).get("learning_paths") or {}


def get_brands(config: dict) -> List[str]:
    """Return the value stored under ``"brands"`` in *config*, or ``[]`` if absent."""
    if "brands" in config:
        return config["brands"]
    return []


def get_brand_color(brand: str, config: dict) -> str:
    """
    Look up the color configured for *brand* under ``config["brand_colors"]``.

    Falls back to ``"#607D8B"`` when the mapping or the brand entry is missing.
    """
    fallback = "#607D8B"
    palette = config.get("brand_colors", {})
    return palette.get(brand, fallback)


def label_for_path(path_id, config: dict) -> str:
    """
    Return the display label for a learning path.

    Labels are looked up in ``config["path_labels"]`` by ``str(path_id)``;
    when no entry exists the label defaults to ``"Path <path_id>"``.
    """
    default_label = f"Path {path_id}"
    labels = config.get("path_labels", {})
    lookup_key = str(path_id)
    if lookup_key in labels:
        return labels[lookup_key]
    return default_label


# ── DataFrame merge helpers ────────────────────────────────────────────────────

def _left_join_metrics(
    base: pd.DataFrame,
    metrics: pd.DataFrame,
    key: List[str],
    value_cols: List[str],
) -> pd.DataFrame:
    """Left-join the available *value_cols* of *metrics* onto *base*; no-op if *metrics* is empty or lacks a key column."""
    if metrics.empty or not set(key).issubset(metrics.columns):
        return base
    keep = list(key) + [c for c in value_cols if c in metrics.columns]
    return base.merge(metrics[keep], on=key, how="left")


def merge_lesson_metrics(
    lesson_map: pd.DataFrame,
    per_path_df: pd.DataFrame,
    video_df: pd.DataFrame,
    sentiment_df: pd.DataFrame,
) -> pd.DataFrame:
    """
    Join all lesson-level metric DataFrames into one tidy frame.

    ``lesson_map`` supplies one row per lesson; each metrics frame is
    left-joined onto it, so lessons without metrics are kept:

    * ``per_path_df`` and ``video_df`` are joined on
      ``(learning_path_id, lesson_content_id)`` after their ``content_id``
      column is renamed to ``lesson_content_id``. A frame without a
      ``content_id`` column is ignored.
    * ``sentiment_df`` is joined on ``(learning_path_id, lesson_order)``.

    A metrics frame that is empty or lacks one of its join-key columns is
    skipped instead of raising; optional value columns are included only when
    present.

    Returns:
        A frame sorted by ``(learning_path_id, lesson_order)`` with a fresh
        0..n-1 index, or an empty frame if ``lesson_map`` is empty. Count
        columns are null-filled with 0 and cast to ``int``; rate/score
        columns keep NaN for lessons without data.

    Raises:
        KeyError: If ``lesson_map`` lacks any of the base columns selected
            below.
    """
    if lesson_map.empty:
        return pd.DataFrame()

    base = lesson_map[["brand", "learning_path_id", "first_lesson_content_id",
                        "lesson_order", "lesson_content_id", "content_title"]].copy()

    content_key = ["learning_path_id", "lesson_content_id"]
    order_key = ["learning_path_id", "lesson_order"]

    if "content_id" in per_path_df.columns:
        base = _left_join_metrics(
            base,
            per_path_df.rename(columns={"content_id": "lesson_content_id"}),
            content_key,
            ["lesson_number", "students_completed",
             "denominator_students", "completion_rate"],
        )

    if "content_id" in video_df.columns:
        base = _left_join_metrics(
            base,
            video_df.rename(columns={"content_id": "lesson_content_id"}),
            content_key,
            ["total_starts", "total_completions", "video_completion_rate"],
        )

    base = _left_join_metrics(
        base,
        sentiment_df,
        order_key,
        ["total_comments", "very_positive", "positive", "neutral",
         "negative", "very_negative", "avg_sentiment_score"],
    )

    # Counts default to 0 for lessons with no matching metrics row; rates and
    # scores are deliberately left as NaN so "no data" is not read as zero.
    count_cols = ["students_completed", "denominator_students", "total_starts",
                  "total_completions", "total_comments",
                  "very_positive", "positive", "neutral", "negative", "very_negative"]
    for col in count_cols:
        if col in base.columns:
            base[col] = base[col].fillna(0).astype(int)

    base = base.sort_values(["learning_path_id", "lesson_order"])
    return base.reset_index(drop=True)


def merge_method_wide(
    method_df: pd.DataFrame,
    video_df: pd.DataFrame,
    sentiment_df: pd.DataFrame,
    config: dict,
) -> pd.DataFrame:
    """
    Build the method-wide lesson frame.

    Starts from ``method_df`` (its ``content_id`` column renamed to
    ``lesson_content_id``), left-joins video metrics on
    ``(learning_path_id, lesson_content_id)`` and sentiment metrics on
    ``(learning_path_id, lesson_order)``, then adds a ``path_label`` column
    resolved through ``label_for_path``.

    A metrics frame is skipped when it is empty or when either side of the
    join lacks one of the key columns; optional value columns are included
    only when present.

    Returns:
        The enriched frame sorted by ``method_lesson_number`` with a fresh
        0..n-1 index, or an empty frame if ``method_df`` is empty. Count
        columns are null-filled with 0 and cast to ``int``.

    Raises:
        KeyError: If ``method_df`` lacks ``learning_path_id`` or
            ``method_lesson_number``.
    """
    if method_df.empty:
        return pd.DataFrame()

    base = method_df.rename(columns={"content_id": "lesson_content_id"}).copy()
    content_key = ["learning_path_id", "lesson_content_id"]
    order_key = ["learning_path_id", "lesson_order"]

    if not video_df.empty and "content_id" in video_df.columns:
        vd = video_df.rename(columns={"content_id": "lesson_content_id"})
        # A merge needs the full key on both sides; skip rather than raise.
        if all(k in vd.columns and k in base.columns for k in content_key):
            video_cols = content_key + [
                c for c in ["total_starts", "total_completions",
                            "video_completion_rate"] if c in vd.columns
            ]
            base = base.merge(vd[video_cols], on=content_key, how="left")

    if not sentiment_df.empty and all(
        k in sentiment_df.columns and k in base.columns for k in order_key
    ):
        sent_cols = order_key + [
            c for c in ["total_comments", "very_positive", "positive", "neutral",
                        "negative", "very_negative", "avg_sentiment_score"]
            if c in sentiment_df.columns
        ]
        base = base.merge(sentiment_df[sent_cols], on=order_key, how="left")

    # Add path label
    base["path_label"] = base["learning_path_id"].apply(
        lambda pid: label_for_path(pid, config)
    )

    # Counts default to 0 where no metrics row matched.
    for col in ["students_completed", "total_starts", "total_completions",
                "total_comments", "very_positive", "positive", "neutral",
                "negative", "very_negative"]:
        if col in base.columns:
            base[col] = base[col].fillna(0).astype(int)

    base = base.sort_values("method_lesson_number")
    return base.reset_index(drop=True)


# ── Analysis helpers ───────────────────────────────────────────────────────────

def find_top_dropoffs(df: pd.DataFrame, n: int = 5,
                      rate_col: str = "completion_rate",
                      order_col: str = "lesson_order") -> pd.DataFrame:
    """
    Return the top-N lessons with the largest completion-rate drop
    compared to the previous lesson (within the same learning_path_id).

    Args:
        df: Lesson-level frame; must contain ``learning_path_id``,
            *order_col* and *rate_col* for any result to be produced.
        n: Maximum number of rows to return.
        rate_col: Column holding the rate to compare between lessons.
        order_col: Column defining lesson order within a path.

    Returns:
        Up to *n* rows with a strictly positive drop, largest first, with the
        columns ``learning_path_id``, *order_col*, ``content_title`` (if
        present), ``prev_rate``, *rate_col* and ``dropoff``. An empty frame is
        returned when *df* is empty or any required column is missing. *df*
        itself is not modified.
    """
    required = ("learning_path_id", order_col, rate_col)
    if df.empty or any(col not in df.columns for col in required):
        return pd.DataFrame()

    # sort_values returns a new frame, so the caller's df is left untouched.
    ordered = df.sort_values(["learning_path_id", order_col])
    prev_rate = ordered.groupby("learning_path_id")[rate_col].shift(1)
    ordered = ordered.assign(prev_rate=prev_rate,
                             dropoff=prev_rate - ordered[rate_col])

    # The first lesson of each path has no predecessor (NaN dropoff); the
    # `> 0` comparison is False for NaN, so those rows are excluded too.
    drops = ordered[ordered["dropoff"] > 0]
    out_cols = [c for c in ["learning_path_id", order_col, "content_title",
                            "prev_rate", rate_col, "dropoff"] if c in drops.columns]
    return drops.nlargest(n, "dropoff")[out_cols].reset_index(drop=True)


def get_overview_kpis(merged: pd.DataFrame) -> dict:
    """
    Return a dict of high-level KPI values from the merged metrics frame.

    Keys (each falls back to 0 / 0.0 when its source column is missing):
        total_students: maximum of ``denominator_students`` (0 if all null).
        avg_completion_pct: mean of ``completion_rate`` multiplied by 100.
        avg_sentiment_score: mean of ``avg_sentiment_score``.
        total_comments: sum of ``total_comments``.
        n_paths: number of distinct ``learning_path_id`` values.
        n_lessons: number of rows in *merged*.

    The two averages are NaN when their column exists but holds no non-null
    values. An empty *merged* frame yields an empty dict.
    """
    if merged.empty:
        return {}

    columns = merged.columns

    total_students = 0
    if "denominator_students" in columns:
        peak = merged["denominator_students"].max()
        # max() is NaN when every value is null, and int(NaN) raises
        # ValueError, so only convert a real number.
        if pd.notna(peak):
            total_students = int(peak)

    avg_completion = float(merged["completion_rate"].mean()) if "completion_rate" in columns else 0.0
    avg_sentiment = float(merged["avg_sentiment_score"].mean()) if "avg_sentiment_score" in columns else 0.0
    total_comments = int(merged["total_comments"].sum()) if "total_comments" in columns else 0
    n_paths = merged["learning_path_id"].nunique() if "learning_path_id" in columns else 0
    n_lessons = len(merged)

    return {
        "total_students": total_students,
        "avg_completion_pct": avg_completion * 100,
        "avg_sentiment_score": avg_sentiment,
        "total_comments": total_comments,
        "n_paths": n_paths,
        "n_lessons": n_lessons,
    }


def filter_by_paths(df: pd.DataFrame,
                    path_ids: Optional[List[int]]) -> pd.DataFrame:
    """
    Restrict *df* to rows whose ``learning_path_id`` is in *path_ids*.

    *df* is returned as-is (same object) when *path_ids* is None/empty, when
    *df* is empty, or when it has no ``learning_path_id`` column. Otherwise a
    filtered frame with a fresh 0..n-1 index is returned.
    """
    nothing_to_filter = (
        not path_ids
        or df.empty
        or "learning_path_id" not in df.columns
    )
    if nothing_to_filter:
        return df

    selected = df["learning_path_id"].isin(path_ids)
    subset = df.loc[selected]
    return subset.reset_index(drop=True)


def short_title(title: Optional[str], max_len: int = 35) -> str:
    """
    Truncate a content title for display in labels.

    Args:
        title: Raw title; may be ``None``, NaN, ``pd.NA`` or blank.
        max_len: Maximum number of title characters kept before the ellipsis.

    Returns:
        ``"—"`` for a missing, empty or whitespace-only title; otherwise the
        stripped title, cut to *max_len* characters with ``"…"`` appended
        when it was longer.
    """
    placeholder = "—"
    # The identity checks must run before `not title`: bool(pd.NA) raises
    # TypeError, so pd.NA may never reach the truthiness test.
    if title is None or title is pd.NA or not title or pd.isna(title):
        return placeholder
    t = str(title).strip()
    if not t:
        # Whitespace-only titles carry no text; show the placeholder instead
        # of an empty label.
        return placeholder
    return t if len(t) <= max_len else t[:max_len] + "…"