""" Learning Paths utility helpers — pure functions, no Streamlit dependency. """ import json from pathlib import Path from typing import Dict, List, Optional, Tuple import pandas as pd # ── Config helpers ───────────────────────────────────────────────────────────── def load_lp_config(config_path: str = None) -> dict: if config_path is None: config_path = Path(__file__).resolve().parent.parent / "config" / "viz_config.json" with open(config_path) as f: return json.load(f).get("learning_paths", {}) def get_brands(config: dict) -> List[str]: return config.get("brands", []) def get_brand_color(brand: str, config: dict) -> str: return config.get("brand_colors", {}).get(brand, "#607D8B") def label_for_path(path_id, config: dict) -> str: return config.get("path_labels", {}).get(str(path_id), f"Path {path_id}") # ── DataFrame merge helpers ──────────────────────────────────────────────────── def merge_lesson_metrics( lesson_map: pd.DataFrame, per_path_df: pd.DataFrame, video_df: pd.DataFrame, sentiment_df: pd.DataFrame, ) -> pd.DataFrame: """ Join all lesson-level metric DataFrames into one tidy frame indexed by (learning_path_id, lesson_order). Returns an empty frame if lesson_map is empty. """ if lesson_map.empty: return pd.DataFrame() base = lesson_map[["brand", "learning_path_id", "first_lesson_content_id", "lesson_order", "lesson_content_id", "content_title"]].copy() join_key = ["learning_path_id", "lesson_content_id"] if not per_path_df.empty and "content_id" in per_path_df.columns: pp = per_path_df.rename(columns={"content_id": "lesson_content_id"}) cols = ["lesson_content_id", "learning_path_id", "lesson_number", "students_completed", "denominator_students", "completion_rate"] cols = [c for c in cols if c in pp.columns] base = base.merge(pp[cols], on=join_key, how="left") if not video_df.empty and "content_id" in video_df.columns: vd = video_df.rename(columns={"content_id": "lesson_content_id"}) cols = ["lesson_content_id", "learning_path_id", "total_starts", "total_completions", "video_completion_rate"] cols = [c for c in cols if c in vd.columns] base = base.merge(vd[cols], on=join_key, how="left") if not sentiment_df.empty: sent_key = ["learning_path_id", "lesson_order"] sent_cols = [c for c in [ "learning_path_id", "lesson_order", "total_comments", "very_positive", "positive", "neutral", "negative", "very_negative", "avg_sentiment_score", ] if c in sentiment_df.columns] base = base.merge(sentiment_df[sent_cols], on=sent_key, how="left") # Fill numeric nulls with 0 / NaN as appropriate for col in ["students_completed", "denominator_students", "total_starts", "total_completions", "total_comments", "very_positive", "positive", "neutral", "negative", "very_negative"]: if col in base.columns: base[col] = base[col].fillna(0).astype(int) base.sort_values(["learning_path_id", "lesson_order"], inplace=True) return base.reset_index(drop=True) def merge_method_wide( method_df: pd.DataFrame, video_df: pd.DataFrame, sentiment_df: pd.DataFrame, config: dict, ) -> pd.DataFrame: """Same as merge_lesson_metrics but uses method-wide completion and adds method_lesson_number as the continuous x-axis.""" if method_df.empty: return pd.DataFrame() base = method_df.rename(columns={"content_id": "lesson_content_id"}).copy() join_key = ["learning_path_id", "lesson_content_id"] if not video_df.empty and "content_id" in video_df.columns: vd = video_df.rename(columns={"content_id": "lesson_content_id"}) cols = [c for c in ["lesson_content_id", "learning_path_id", "total_starts", "total_completions", "video_completion_rate"] if c in vd.columns] base = base.merge(vd[cols], on=join_key, how="left") if not sentiment_df.empty and "lesson_order" in base.columns: sent_key = ["learning_path_id", "lesson_order"] sent_cols = [c for c in [ "learning_path_id", "lesson_order", "total_comments", "very_positive", "positive", "neutral", "negative", "very_negative", "avg_sentiment_score", ] if c in sentiment_df.columns] base = base.merge(sentiment_df[sent_cols], on=sent_key, how="left") # Add path label base["path_label"] = base["learning_path_id"].apply( lambda pid: label_for_path(pid, config) ) for col in ["students_completed", "total_starts", "total_completions", "total_comments", "very_positive", "positive", "neutral", "negative", "very_negative"]: if col in base.columns: base[col] = base[col].fillna(0).astype(int) base.sort_values("method_lesson_number", inplace=True) return base.reset_index(drop=True) # ── Analysis helpers ─────────────────────────────────────────────────────────── def find_top_dropoffs(df: pd.DataFrame, n: int = 5, rate_col: str = "completion_rate", order_col: str = "lesson_order") -> pd.DataFrame: """ Return the top-N lessons with the largest completion-rate drop compared to the previous lesson (within the same learning_path_id). """ if df.empty or rate_col not in df.columns: return pd.DataFrame() result = df.copy().sort_values(["learning_path_id", order_col]) result["prev_rate"] = result.groupby("learning_path_id")[rate_col].shift(1) result["dropoff"] = result["prev_rate"] - result[rate_col] result = result[result["dropoff"].notna() & (result["dropoff"] > 0)] return result.nlargest(n, "dropoff")[ [c for c in ["learning_path_id", order_col, "content_title", "prev_rate", rate_col, "dropoff"] if c in result.columns] ].reset_index(drop=True) def get_overview_kpis(merged: pd.DataFrame) -> dict: """Return a dict of high-level KPI values from the merged metrics frame.""" if merged.empty: return {} total_students = int(merged["denominator_students"].max()) if "denominator_students" in merged.columns else 0 avg_completion = float(merged["completion_rate"].mean()) if "completion_rate" in merged.columns else 0.0 avg_sentiment = float(merged["avg_sentiment_score"].mean()) if "avg_sentiment_score" in merged.columns else 0.0 total_comments = int(merged["total_comments"].sum()) if "total_comments" in merged.columns else 0 n_paths = merged["learning_path_id"].nunique() if "learning_path_id" in merged.columns else 0 n_lessons = len(merged) return { "total_students": total_students, "avg_completion_pct": avg_completion * 100, "avg_sentiment_score": avg_sentiment, "total_comments": total_comments, "n_paths": n_paths, "n_lessons": n_lessons, } def filter_by_paths(df: pd.DataFrame, path_ids: Optional[List[int]]) -> pd.DataFrame: """Filter df to a subset of learning_path_ids. None or empty = all.""" if not path_ids or df.empty or "learning_path_id" not in df.columns: return df return df[df["learning_path_id"].isin(path_ids)].reset_index(drop=True) def short_title(title: Optional[str], max_len: int = 35) -> str: """Truncate a content title for display in labels.""" if not title or pd.isna(title): return "—" t = str(title).strip() return t if len(t) <= max_len else t[:max_len] + "…"