{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [],
      "metadata": {
        "id": "KpnKlKtP2pnS"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Step 1: Data Preparation and Cleaning"
      ],
      "metadata": {
        "id": "3GX4xb43H-Sp"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "1.1 Environment Setup & Data loading"
      ],
      "metadata": {
        "id": "OpjCtFymIB9w"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# ==================================================\n",
        "# UNIVERSAL SETUP CELL\n",
        "# Works in BOTH Google Colab and Hugging Face Spaces\n",
        "# ==================================================\n",
        "\n",
        "import os\n",
        "import random\n",
        "import warnings\n",
        "from pathlib import Path\n",
        "\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "\n",
        "# Reproducibility\n",
        "random.seed(42)\n",
        "np.random.seed(42)\n",
        "\n",
        "# Detect environment automatically\n",
        "if Path(\"/app\").exists():\n",
        "    BASE_PATH = Path(\"/app\")          # Hugging Face Space\n",
        "elif Path(\"/content\").exists():\n",
        "    BASE_PATH = Path(\"/content\")      # Google Colab\n",
        "else:\n",
        "    BASE_PATH = Path.cwd()            # Local fallback\n",
        "\n",
        "DATA_PROCESSED = BASE_PATH / \"data_processed\"\n",
        "OUTPUTS = BASE_PATH / \"outputs\"\n",
        "\n",
        "DATA_PROCESSED.mkdir(exist_ok=True)\n",
        "OUTPUTS.mkdir(exist_ok=True)\n",
        "\n",
        "print(\"Environment ready.\")\n",
        "print(\"Using BASE_PATH:\", BASE_PATH)\n",
        "\n",
        "# Find CSV files anywhere under BASE_PATH\n",
        "csv_paths = [\n",
        "    p for p in BASE_PATH.rglob(\"*.csv\")\n",
        "    if \"sample_data\" not in str(p)\n",
        "]\n",
        "\n",
        "print(\"Found CSV files:\")\n",
        "for p in csv_paths:\n",
        "    print(\"-\", p)\n",
        "\n",
        "# Locate required files\n",
        "reviews_matches = [\n",
        "    p for p in csv_paths\n",
        "    if \"clothing\" in p.name.lower()\n",
        "]\n",
        "\n",
        "returns_matches = [\n",
        "    p for p in csv_paths\n",
        "    if \"return\" in p.name.lower()\n",
        "]\n",
        "\n",
        "if not reviews_matches:\n",
        "    raise FileNotFoundError(\n",
        "        \"Could not find the Womens Clothing E-Commerce Reviews CSV. \"\n",
        "        \"Upload it to the Colab runtime OR put it in the same folder as app.py on Hugging Face.\"\n",
        "    )\n",
        "\n",
        "if not returns_matches:\n",
        "    raise FileNotFoundError(\n",
        "        \"Could not find the ecommerce returns CSV. \"\n",
        "        \"Upload it to the Colab runtime OR put it in the same folder as app.py on Hugging Face.\"\n",
        "    )\n",
        "\n",
        "reviews_path = reviews_matches[0]\n",
        "returns_path = returns_matches[0]\n",
        "\n",
        "print(\"Using reviews file:\", reviews_path)\n",
        "print(\"Using returns file:\", returns_path)\n",
        "\n",
        "reviews_df = pd.read_csv(reviews_path)\n",
        "returns_df = pd.read_csv(returns_path)\n",
        "\n",
        "# Main dataframe used by the rest of this notebook\n",
        "df = reviews_df.copy()\n",
        "\n",
        "print(\"Loaded successfully.\")\n",
        "print(\"Reviews shape:\", reviews_df.shape)\n",
        "print(\"Returns shape:\", returns_df.shape)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bsHVIP13nWFe",
        "outputId": "e91ab6ca-f560-49f7-db56-0929a55488a4"
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Environment ready.\n",
            "Using BASE_PATH: /content\n",
            "Found CSV files:\n",
            "- /content/Womens Clothing E-Commerce Reviews.csv\n",
            "- /content/ecommerce_returns_cleaned.csv\n",
            "Using reviews file: /content/Womens Clothing E-Commerce Reviews.csv\n",
            "Using returns file: /content/ecommerce_returns_cleaned.csv\n",
            "Loaded successfully.\n",
            "Reviews shape: (23486, 11)\n",
            "Returns shape: (113314, 29)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "1.2 Missing value analysis"
      ],
      "metadata": {
        "id": "7TKoOsszIJBX"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "missing_summary = pd.DataFrame({\n",
        "    \"column\": df.columns,\n",
        "    \"missing_count\": df.isna().sum().values,\n",
        "    \"missing_pct\": (df.isna().mean().values * 100).round(2)\n",
        "}).sort_values(by=\"missing_pct\", ascending=False)\n",
        "\n",
        "display(missing_summary)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 394
        },
        "id": "3qu3XSzfnV4-",
        "outputId": "9db9967a-19ff-4533-9ac1-485ac363bf26"
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "                     column  missing_count  missing_pct\n",
              "3                     Title           3810        16.22\n",
              "4               Review Text            845         3.60\n",
              "9           Department Name             14         0.06\n",
              "10               Class Name             14         0.06\n",
              "8             Division Name             14         0.06\n",
              "0                Unnamed: 0              0         0.00\n",
              "1               Clothing ID              0         0.00\n",
              "2                       Age              0         0.00\n",
              "5                    Rating              0         0.00\n",
              "6           Recommended IND              0         0.00\n",
              "7   Positive Feedback Count              0         0.00"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-4b1265c3-5bc4-4181-b2fc-b56ffe5ed67c\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>column</th>\n",
              "      <th>missing_count</th>\n",
              "      <th>missing_pct</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Title</td>\n",
              "      <td>3810</td>\n",
              "      <td>16.22</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Review Text</td>\n",
              "      <td>845</td>\n",
              "      <td>3.60</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9</th>\n",
              "      <td>Department Name</td>\n",
              "      <td>14</td>\n",
              "      <td>0.06</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>10</th>\n",
              "      <td>Class Name</td>\n",
              "      <td>14</td>\n",
              "      <td>0.06</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>8</th>\n",
              "      <td>Division Name</td>\n",
              "      <td>14</td>\n",
              "      <td>0.06</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Unnamed: 0</td>\n",
              "      <td>0</td>\n",
              "      <td>0.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Clothing ID</td>\n",
              "      <td>0</td>\n",
              "      <td>0.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Age</td>\n",
              "      <td>0</td>\n",
              "      <td>0.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>Rating</td>\n",
              "      <td>0</td>\n",
              "      <td>0.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>Recommended IND</td>\n",
              "      <td>0</td>\n",
              "      <td>0.00</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>Positive Feedback Count</td>\n",
              "      <td>0</td>\n",
              "      <td>0.00</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-4b1265c3-5bc4-4181-b2fc-b56ffe5ed67c')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-4b1265c3-5bc4-4181-b2fc-b56ffe5ed67c button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-4b1265c3-5bc4-4181-b2fc-b56ffe5ed67c');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "  <div id=\"id_2a77162d-939b-4759-a8b0-6c5eab1e0170\">\n",
              "    <style>\n",
              "      .colab-df-generate {\n",
              "        background-color: #E8F0FE;\n",
              "        border: none;\n",
              "        border-radius: 50%;\n",
              "        cursor: pointer;\n",
              "        display: none;\n",
              "        fill: #1967D2;\n",
              "        height: 32px;\n",
              "        padding: 0 0 0 0;\n",
              "        width: 32px;\n",
              "      }\n",
              "\n",
              "      .colab-df-generate:hover {\n",
              "        background-color: #E2EBFA;\n",
              "        box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "        fill: #174EA6;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate {\n",
              "        background-color: #3B4455;\n",
              "        fill: #D2E3FC;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate:hover {\n",
              "        background-color: #434B5C;\n",
              "        box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "        filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "        fill: #FFFFFF;\n",
              "      }\n",
              "    </style>\n",
              "    <button class=\"colab-df-generate\" onclick=\"generateWithVariable('missing_summary')\"\n",
              "            title=\"Generate code using this dataframe.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "    <script>\n",
              "      (() => {\n",
              "      const buttonEl =\n",
              "        document.querySelector('#id_2a77162d-939b-4759-a8b0-6c5eab1e0170 button.colab-df-generate');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      buttonEl.onclick = () => {\n",
              "        google.colab.notebook.generateWithVariable('missing_summary');\n",
              "      }\n",
              "      })();\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "missing_summary",
              "summary": "{\n  \"name\": \"missing_summary\",\n  \"rows\": 11,\n  \"fields\": [\n    {\n      \"column\": \"column\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 11,\n        \"samples\": [\n          \"Unnamed: 0\",\n          \"Title\",\n          \"Recommended IND\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"missing_count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 1150,\n        \"min\": 0,\n        \"max\": 3810,\n        \"num_unique_values\": 4,\n        \"samples\": [\n          845,\n          0,\n          3810\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"missing_pct\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 4.895871358975474,\n        \"min\": 0.0,\n        \"max\": 16.22,\n        \"num_unique_values\": 4,\n        \"samples\": [\n          3.6,\n          0.0,\n          16.22\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "1.3 Data cleaning & feature engineering"
      ],
      "metadata": {
        "id": "EfQ7CQguINjr"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "df_clean = df.copy()\n",
        "\n",
        "# remove useless index column\n",
        "df_clean = df_clean.drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n",
        "\n",
        "# fill text columns\n",
        "df_clean[\"Title\"] = df_clean[\"Title\"].fillna(\"No Title\")\n",
        "df_clean[\"Review Text\"] = df_clean[\"Review Text\"].fillna(\"No Review\")\n",
        "\n",
        "# fill category columns\n",
        "for col in [\"Department Name\", \"Class Name\", \"Division Name\"]:\n",
        "    df_clean[col] = df_clean[col].fillna(\"Unknown\")\n",
        "\n",
        "# create sentiment label from rating\n",
        "df_clean[\"sentiment\"] = df_clean[\"Rating\"].apply(\n",
        "    lambda x: \"positive\" if x >= 4 else \"negative\"\n",
        ")\n",
        "\n",
        "# create engagement score\n",
        "df_clean[\"engagement_score\"] = (\n",
        "    df_clean[\"Positive Feedback Count\"] + df_clean[\"Recommended IND\"]\n",
        ")\n",
        "\n",
        "print(\"Cleaned shape:\", df_clean.shape)\n",
        "display(df_clean.head())\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 643
        },
        "id": "da3lANHOnV13",
        "outputId": "c76e93fa-6555-4e73-a99e-866c9f076640"
      },
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Cleaned shape: (23486, 12)\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "   Clothing ID  Age                    Title  \\\n",
              "0          767   33                 No Title   \n",
              "1         1080   34                 No Title   \n",
              "2         1077   60  Some major design flaws   \n",
              "3         1049   50         My favorite buy!   \n",
              "4          847   47         Flattering shirt   \n",
              "\n",
              "                                         Review Text  Rating  Recommended IND  \\\n",
              "0  Absolutely wonderful - silky and sexy and comf...       4                1   \n",
              "1  Love this dress!  it's sooo pretty.  i happene...       5                1   \n",
              "2  I had such high hopes for this dress and reall...       3                0   \n",
              "3  I love, love, love this jumpsuit. it's fun, fl...       5                1   \n",
              "4  This shirt is very flattering to all due to th...       5                1   \n",
              "\n",
              "   Positive Feedback Count   Division Name Department Name Class Name  \\\n",
              "0                        0       Initmates        Intimate  Intimates   \n",
              "1                        4         General         Dresses    Dresses   \n",
              "2                        0         General         Dresses    Dresses   \n",
              "3                        0  General Petite         Bottoms      Pants   \n",
              "4                        6         General            Tops    Blouses   \n",
              "\n",
              "  sentiment  engagement_score  \n",
              "0  positive                 1  \n",
              "1  positive                 5  \n",
              "2  negative                 0  \n",
              "3  positive                 1  \n",
              "4  positive                 7  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-06dd2fca-54df-4f3e-a583-fb483bc59aca\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Clothing ID</th>\n",
              "      <th>Age</th>\n",
              "      <th>Title</th>\n",
              "      <th>Review Text</th>\n",
              "      <th>Rating</th>\n",
              "      <th>Recommended IND</th>\n",
              "      <th>Positive Feedback Count</th>\n",
              "      <th>Division Name</th>\n",
              "      <th>Department Name</th>\n",
              "      <th>Class Name</th>\n",
              "      <th>sentiment</th>\n",
              "      <th>engagement_score</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>767</td>\n",
              "      <td>33</td>\n",
              "      <td>No Title</td>\n",
              "      <td>Absolutely wonderful - silky and sexy and comf...</td>\n",
              "      <td>4</td>\n",
              "      <td>1</td>\n",
              "      <td>0</td>\n",
              "      <td>Initmates</td>\n",
              "      <td>Intimate</td>\n",
              "      <td>Intimates</td>\n",
              "      <td>positive</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1080</td>\n",
              "      <td>34</td>\n",
              "      <td>No Title</td>\n",
              "      <td>Love this dress!  it's sooo pretty.  i happene...</td>\n",
              "      <td>5</td>\n",
              "      <td>1</td>\n",
              "      <td>4</td>\n",
              "      <td>General</td>\n",
              "      <td>Dresses</td>\n",
              "      <td>Dresses</td>\n",
              "      <td>positive</td>\n",
              "      <td>5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>1077</td>\n",
              "      <td>60</td>\n",
              "      <td>Some major design flaws</td>\n",
              "      <td>I had such high hopes for this dress and reall...</td>\n",
              "      <td>3</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>General</td>\n",
              "      <td>Dresses</td>\n",
              "      <td>Dresses</td>\n",
              "      <td>negative</td>\n",
              "      <td>0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>1049</td>\n",
              "      <td>50</td>\n",
              "      <td>My favorite buy!</td>\n",
              "      <td>I love, love, love this jumpsuit. it's fun, fl...</td>\n",
              "      <td>5</td>\n",
              "      <td>1</td>\n",
              "      <td>0</td>\n",
              "      <td>General Petite</td>\n",
              "      <td>Bottoms</td>\n",
              "      <td>Pants</td>\n",
              "      <td>positive</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>847</td>\n",
              "      <td>47</td>\n",
              "      <td>Flattering shirt</td>\n",
              "      <td>This shirt is very flattering to all due to th...</td>\n",
              "      <td>5</td>\n",
              "      <td>1</td>\n",
              "      <td>6</td>\n",
              "      <td>General</td>\n",
              "      <td>Tops</td>\n",
              "      <td>Blouses</td>\n",
              "      <td>positive</td>\n",
              "      <td>7</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-06dd2fca-54df-4f3e-a583-fb483bc59aca')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-06dd2fca-54df-4f3e-a583-fb483bc59aca button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-06dd2fca-54df-4f3e-a583-fb483bc59aca');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"display(df_clean\",\n  \"rows\": 5,\n  \"fields\": [\n    {\n      \"column\": \"Clothing ID\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 146,\n        \"min\": 767,\n        \"max\": 1080,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          1080,\n          847,\n          1077\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Age\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 11,\n        \"min\": 33,\n        \"max\": 60,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          34,\n          47,\n          60\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Title\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 4,\n        \"samples\": [\n          \"Some major design flaws\",\n          \"Flattering shirt\",\n          \"No Title\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Review Text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 5,\n        \"samples\": [\n          \"Love this dress!  it's sooo pretty.  i happened to find it in a store, and i'm glad i did bc i never would have ordered it online bc it's petite.  i bought a petite and am 5'8\\\".  i love the length on me- hits just a little below the knee.  would definitely be a true midi on someone who is truly petite.\",\n          \"This shirt is very flattering to all due to the adjustable front tie. it is the perfect length to wear with leggings and it is sleeveless so it pairs well with any cardigan. love this shirt!!!\",\n          \"I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i could not zip it up! i reordered it in petite medium, which was just ok. overall, the top half was comfortable and fit nicely, but the bottom half had a very tight under layer and several somewhat cheap (net) over layers. imo, a major design flaw was the net over layer sewn directly into the zipper - it c\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Rating\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0,\n        \"min\": 3,\n        \"max\": 5,\n        \"num_unique_values\": 3,\n        \"samples\": [\n          4,\n          5,\n          3\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Recommended IND\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0,\n        \"min\": 0,\n        \"max\": 1,\n        \"num_unique_values\": 2,\n        \"samples\": [\n          0,\n          1\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Positive Feedback Count\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 2,\n        \"min\": 0,\n        \"max\": 6,\n        \"num_unique_values\": 3,\n        \"samples\": [\n          0,\n          4\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Division Name\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"Initmates\",\n          \"General\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Department Name\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 4,\n        \"samples\": [\n          \"Dresses\",\n          \"Tops\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Class Name\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 4,\n        \"samples\": [\n          \"Dresses\",\n          \"Blouses\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"sentiment\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 2,\n        \"samples\": [\n          \"negative\",\n          \"positive\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"engagement_score\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 3,\n        \"min\": 0,\n        \"max\": 7,\n        \"num_unique_values\": 4,\n        \"samples\": [\n          5,\n          7\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "1.4 Product category analysis"
      ],
      "metadata": {
        "id": "gf5CqL9IITIu"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "class_summary = (\n",
        "    df_clean.groupby(\"Class Name\")\n",
        "    .agg(\n",
        "        reviews=(\"Rating\", \"count\"),\n",
        "        avg_rating=(\"Rating\", \"mean\"),\n",
        "        recommendation_rate=(\"Recommended IND\", \"mean\"),\n",
        "        avg_feedback=(\"Positive Feedback Count\", \"mean\")\n",
        "    )\n",
        "    .reset_index()\n",
        ")\n",
        "\n",
        "# keep only classes with enough reviews\n",
        "class_summary = class_summary[class_summary[\"reviews\"] >= 100]\n",
        "\n",
        "# sort by rating\n",
        "class_summary = class_summary.sort_values(\n",
        "    by=\"avg_rating\",\n",
        "    ascending=False\n",
        ")\n",
        "\n",
        "display(class_summary.head(10))\n",
        "display(class_summary.tail(10))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 709
        },
        "id": "MAW6wxDCnVzA",
        "outputId": "7cfc9d0b-783a-41f5-ddbc-8d7b0284b7bd"
      },
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "    Class Name  reviews  avg_rating  recommendation_rate  avg_feedback\n",
              "9     Layering      146    4.376712             0.883562      1.315068\n",
              "7        Jeans     1147    4.360942             0.881430      1.759372\n",
              "11      Lounge      691    4.301013             0.859624      2.321274\n",
              "6      Jackets      704    4.295455             0.845170      2.826705\n",
              "16       Sleep      228    4.285088             0.855263      1.750000\n",
              "5    Intimates      154    4.279221             0.857143      0.779221\n",
              "10     Legwear      165    4.278788             0.860606      1.272727\n",
              "13       Pants     1388    4.265850             0.832853      2.396974\n",
              "4   Fine gauge     1100    4.260909             0.837273      2.013636\n",
              "14      Shorts      317    4.255521             0.839117      1.675079"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-e39a8ef8-51f5-48c2-982f-98b63e6e7d0b\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Class Name</th>\n",
              "      <th>reviews</th>\n",
              "      <th>avg_rating</th>\n",
              "      <th>recommendation_rate</th>\n",
              "      <th>avg_feedback</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>9</th>\n",
              "      <td>Layering</td>\n",
              "      <td>146</td>\n",
              "      <td>4.376712</td>\n",
              "      <td>0.883562</td>\n",
              "      <td>1.315068</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>Jeans</td>\n",
              "      <td>1147</td>\n",
              "      <td>4.360942</td>\n",
              "      <td>0.881430</td>\n",
              "      <td>1.759372</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>11</th>\n",
              "      <td>Lounge</td>\n",
              "      <td>691</td>\n",
              "      <td>4.301013</td>\n",
              "      <td>0.859624</td>\n",
              "      <td>2.321274</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>Jackets</td>\n",
              "      <td>704</td>\n",
              "      <td>4.295455</td>\n",
              "      <td>0.845170</td>\n",
              "      <td>2.826705</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>16</th>\n",
              "      <td>Sleep</td>\n",
              "      <td>228</td>\n",
              "      <td>4.285088</td>\n",
              "      <td>0.855263</td>\n",
              "      <td>1.750000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>Intimates</td>\n",
              "      <td>154</td>\n",
              "      <td>4.279221</td>\n",
              "      <td>0.857143</td>\n",
              "      <td>0.779221</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>10</th>\n",
              "      <td>Legwear</td>\n",
              "      <td>165</td>\n",
              "      <td>4.278788</td>\n",
              "      <td>0.860606</td>\n",
              "      <td>1.272727</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>13</th>\n",
              "      <td>Pants</td>\n",
              "      <td>1388</td>\n",
              "      <td>4.265850</td>\n",
              "      <td>0.832853</td>\n",
              "      <td>2.396974</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Fine gauge</td>\n",
              "      <td>1100</td>\n",
              "      <td>4.260909</td>\n",
              "      <td>0.837273</td>\n",
              "      <td>2.013636</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>14</th>\n",
              "      <td>Shorts</td>\n",
              "      <td>317</td>\n",
              "      <td>4.255521</td>\n",
              "      <td>0.839117</td>\n",
              "      <td>1.675079</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-e39a8ef8-51f5-48c2-982f-98b63e6e7d0b')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-e39a8ef8-51f5-48c2-982f-98b63e6e7d0b button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-e39a8ef8-51f5-48c2-982f-98b63e6e7d0b');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"display(class_summary\",\n  \"rows\": 10,\n  \"fields\": [\n    {\n      \"column\": \"Class Name\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 10,\n        \"samples\": [\n          \"Fine gauge\",\n          \"Jeans\",\n          \"Intimates\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"reviews\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 471,\n        \"min\": 146,\n        \"max\": 1388,\n        \"num_unique_values\": 10,\n        \"samples\": [\n          1100,\n          1147,\n          154\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"avg_rating\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.041140875708169405,\n        \"min\": 4.255520504731861,\n        \"max\": 4.376712328767123,\n        \"num_unique_values\": 10,\n        \"samples\": [\n          4.260909090909091,\n          4.360941586748038,\n          4.279220779220779\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"recommendation_rate\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.017400503275162123,\n        \"min\": 0.8328530259365994,\n        \"max\": 0.8835616438356164,\n        \"num_unique_values\": 10,\n        \"samples\": [\n          0.8372727272727273,\n          0.8814298169136879,\n          0.8571428571428571\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"avg_feedback\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.6056747210865839,\n        \"min\": 0.7792207792207793,\n        \"max\": 2.8267045454545454,\n        \"num_unique_values\": 10,\n        \"samples\": [\n          2.0136363636363637,\n          1.7593722755013077,\n          0.7792207792207793\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "    Class Name  reviews  avg_rating  recommendation_rate  avg_feedback\n",
              "4   Fine gauge     1100    4.260909             0.837273      2.013636\n",
              "14      Shorts      317    4.255521             0.839117      1.675079\n",
              "15      Skirts      945    4.245503             0.845503      2.293122\n",
              "12   Outerwear      328    4.198171             0.817073      2.823171\n",
              "18        Swim      350    4.197143             0.805714      2.142857\n",
              "17    Sweaters     1428    4.179272             0.800420      2.208683\n",
              "8        Knits     4843    4.161677             0.817675      2.394797\n",
              "0      Blouses     3097    4.154020             0.810139      2.725218\n",
              "3      Dresses     6319    4.150815             0.808197      3.087514\n",
              "19       Trend      119    3.815126             0.739496      3.369748"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-98a9e9de-10ea-4444-b6be-80f2fab20e4c\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Class Name</th>\n",
              "      <th>reviews</th>\n",
              "      <th>avg_rating</th>\n",
              "      <th>recommendation_rate</th>\n",
              "      <th>avg_feedback</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Fine gauge</td>\n",
              "      <td>1100</td>\n",
              "      <td>4.260909</td>\n",
              "      <td>0.837273</td>\n",
              "      <td>2.013636</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>14</th>\n",
              "      <td>Shorts</td>\n",
              "      <td>317</td>\n",
              "      <td>4.255521</td>\n",
              "      <td>0.839117</td>\n",
              "      <td>1.675079</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>15</th>\n",
              "      <td>Skirts</td>\n",
              "      <td>945</td>\n",
              "      <td>4.245503</td>\n",
              "      <td>0.845503</td>\n",
              "      <td>2.293122</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>12</th>\n",
              "      <td>Outerwear</td>\n",
              "      <td>328</td>\n",
              "      <td>4.198171</td>\n",
              "      <td>0.817073</td>\n",
              "      <td>2.823171</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>18</th>\n",
              "      <td>Swim</td>\n",
              "      <td>350</td>\n",
              "      <td>4.197143</td>\n",
              "      <td>0.805714</td>\n",
              "      <td>2.142857</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>17</th>\n",
              "      <td>Sweaters</td>\n",
              "      <td>1428</td>\n",
              "      <td>4.179272</td>\n",
              "      <td>0.800420</td>\n",
              "      <td>2.208683</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>8</th>\n",
              "      <td>Knits</td>\n",
              "      <td>4843</td>\n",
              "      <td>4.161677</td>\n",
              "      <td>0.817675</td>\n",
              "      <td>2.394797</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Blouses</td>\n",
              "      <td>3097</td>\n",
              "      <td>4.154020</td>\n",
              "      <td>0.810139</td>\n",
              "      <td>2.725218</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Dresses</td>\n",
              "      <td>6319</td>\n",
              "      <td>4.150815</td>\n",
              "      <td>0.808197</td>\n",
              "      <td>3.087514</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>19</th>\n",
              "      <td>Trend</td>\n",
              "      <td>119</td>\n",
              "      <td>3.815126</td>\n",
              "      <td>0.739496</td>\n",
              "      <td>3.369748</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-98a9e9de-10ea-4444-b6be-80f2fab20e4c')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-98a9e9de-10ea-4444-b6be-80f2fab20e4c button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-98a9e9de-10ea-4444-b6be-80f2fab20e4c');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"display(class_summary\",\n  \"rows\": 10,\n  \"fields\": [\n    {\n      \"column\": \"Class Name\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 10,\n        \"samples\": [\n          \"Dresses\",\n          \"Shorts\",\n          \"Sweaters\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"reviews\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 2158,\n        \"min\": 119,\n        \"max\": 6319,\n        \"num_unique_values\": 10,\n        \"samples\": [\n          6319,\n          317,\n          1428\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"avg_rating\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.12860513401132947,\n        \"min\": 3.8151260504201683,\n        \"max\": 4.260909090909091,\n        \"num_unique_values\": 10,\n        \"samples\": [\n          4.150815002373793,\n          4.255520504731861,\n          4.179271708683474\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"recommendation_rate\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.02981935453944708,\n        \"min\": 0.7394957983193278,\n        \"max\": 0.8455026455026455,\n        \"num_unique_values\": 10,\n        \"samples\": [\n          0.8081974996043678,\n          0.8391167192429022,\n          0.8004201680672269\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"avg_feedback\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.5199400215099952,\n        \"min\": 1.6750788643533123,\n        \"max\": 3.369747899159664,\n        \"num_unique_values\": 10,\n        \"samples\": [\n          3.08751384712771,\n          1.6750788643533123,\n          2.208683473389356\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "1.5 Negative review text analysis"
      ],
      "metadata": {
        "id": "0r3IC4d4IZBj"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "negative_reviews = df_clean[df_clean[\"sentiment\"] == \"negative\"].copy()\n",
        "\n",
        "print(\"Negative reviews:\", negative_reviews.shape)\n",
        "\n",
        "negative_reviews[\"Review Text\"] = negative_reviews[\"Review Text\"].astype(str)\n",
        "\n",
        "common_words = (\n",
        "    negative_reviews[\"Review Text\"]\n",
        "    .str.lower()\n",
        "    .str.split(expand=True)\n",
        "    .stack()\n",
        "    .value_counts()\n",
        "    .head(30)\n",
        ")\n",
        "\n",
        "display(common_words)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "C6TafBY7nVv3",
        "outputId": "f45d6bed-bd34-4a3a-e185-35132ffdc3e3"
      },
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Negative reviews: (5278, 12)\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "the      20383\n",
              "i        13685\n",
              "and      10365\n",
              "it        9411\n",
              "a         8957\n",
              "is        6315\n",
              "this      5886\n",
              "to        5825\n",
              "was       4965\n",
              "in        4754\n",
              "but       4483\n",
              "on        3695\n",
              "of        3609\n",
              "for       3067\n",
              "not       2993\n",
              "so        2563\n",
              "my        2405\n",
              "that      2155\n",
              "like      2117\n",
              "have      2041\n",
              "very      2005\n",
              "with      1924\n",
              "dress     1913\n",
              "too       1764\n",
              "would     1687\n",
              "be        1609\n",
              "just      1581\n",
              "as        1519\n",
              "are       1484\n",
              "top       1418\n",
              "Name: count, dtype: int64"
            ],
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>count</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>the</th>\n",
              "      <td>20383</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>i</th>\n",
              "      <td>13685</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>and</th>\n",
              "      <td>10365</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>it</th>\n",
              "      <td>9411</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>a</th>\n",
              "      <td>8957</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>is</th>\n",
              "      <td>6315</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>this</th>\n",
              "      <td>5886</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>to</th>\n",
              "      <td>5825</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>was</th>\n",
              "      <td>4965</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>in</th>\n",
              "      <td>4754</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>but</th>\n",
              "      <td>4483</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>on</th>\n",
              "      <td>3695</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>of</th>\n",
              "      <td>3609</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>for</th>\n",
              "      <td>3067</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>not</th>\n",
              "      <td>2993</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>so</th>\n",
              "      <td>2563</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>my</th>\n",
              "      <td>2405</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>that</th>\n",
              "      <td>2155</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>like</th>\n",
              "      <td>2117</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>have</th>\n",
              "      <td>2041</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>very</th>\n",
              "      <td>2005</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>with</th>\n",
              "      <td>1924</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>dress</th>\n",
              "      <td>1913</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>too</th>\n",
              "      <td>1764</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>would</th>\n",
              "      <td>1687</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>be</th>\n",
              "      <td>1609</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>just</th>\n",
              "      <td>1581</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>as</th>\n",
              "      <td>1519</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>are</th>\n",
              "      <td>1484</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>top</th>\n",
              "      <td>1418</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div><br><label><b>dtype:</b> int64</label>"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "1.6 Synthetic dataset generation"
      ],
      "metadata": {
        "id": "RrZfwc7SIdmx"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "\n",
        "np.random.seed(42)\n",
        "\n",
        "synthetic_df = pd.DataFrame({\n",
        "    \"customer_id\": range(1, 501),\n",
        "    \"predicted_return_risk\": np.random.choice(\n",
        "        [\"low\", \"medium\", \"high\"],\n",
        "        size=500,\n",
        "        p=[0.5, 0.3, 0.2]\n",
        "    ),\n",
        "    \"predicted_size_issue\": np.random.choice(\n",
        "        [\"yes\", \"no\"],\n",
        "        size=500,\n",
        "        p=[0.25, 0.75]\n",
        "    ),\n",
        "    \"predicted_satisfaction_next_purchase\": np.random.randint(1, 6, 500)\n",
        "})\n",
        "\n",
        "print(synthetic_df.shape)\n",
        "display(synthetic_df.head())\n",
        "\n",
        "# Save outputs for the Hugging Face app / next analysis notebook\n",
        "df_clean.to_csv(DATA_PROCESSED / \"reviews_cleaned.csv\", index=False)\n",
        "class_summary.to_csv(DATA_PROCESSED / \"class_summary.csv\", index=False)\n",
        "synthetic_df.to_csv(DATA_PROCESSED / \"synthetic_return_risk.csv\", index=False)\n",
        "returns_df.to_csv(DATA_PROCESSED / \"returns_input.csv\", index=False)\n",
        "\n",
        "common_words.reset_index().rename(\n",
        "    columns={\"index\": \"word\", \"Review Text\": \"count\", 0: \"count\"}\n",
        ").to_csv(DATA_PROCESSED / \"common_negative_words.csv\", index=False)\n",
        "\n",
        "print(\"Saved processed files to:\", DATA_PROCESSED)\n",
        "print([p.name for p in DATA_PROCESSED.glob(\"*.csv\")])\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 280
        },
        "id": "gmcDmANBnVsP",
        "outputId": "f831e19b-889c-4610-c348-a50ff7025ac8"
      },
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "(500, 4)\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "   customer_id predicted_return_risk predicted_size_issue  \\\n",
              "0            1                   low                   no   \n",
              "1            2                  high                   no   \n",
              "2            3                medium                   no   \n",
              "3            4                medium                   no   \n",
              "4            5                   low                   no   \n",
              "\n",
              "   predicted_satisfaction_next_purchase  \n",
              "0                                     4  \n",
              "1                                     3  \n",
              "2                                     5  \n",
              "3                                     1  \n",
              "4                                     5  "
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-712eec04-3e23-428a-9507-c79acc234ed6\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>customer_id</th>\n",
              "      <th>predicted_return_risk</th>\n",
              "      <th>predicted_size_issue</th>\n",
              "      <th>predicted_satisfaction_next_purchase</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1</td>\n",
              "      <td>low</td>\n",
              "      <td>no</td>\n",
              "      <td>4</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>2</td>\n",
              "      <td>high</td>\n",
              "      <td>no</td>\n",
              "      <td>3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>3</td>\n",
              "      <td>medium</td>\n",
              "      <td>no</td>\n",
              "      <td>5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>4</td>\n",
              "      <td>medium</td>\n",
              "      <td>no</td>\n",
              "      <td>1</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>5</td>\n",
              "      <td>low</td>\n",
              "      <td>no</td>\n",
              "      <td>5</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-712eec04-3e23-428a-9507-c79acc234ed6')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-712eec04-3e23-428a-9507-c79acc234ed6 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-712eec04-3e23-428a-9507-c79acc234ed6');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "summary": "{\n  \"name\": \"print([p\",\n  \"rows\": 5,\n  \"fields\": [\n    {\n      \"column\": \"customer_id\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 1,\n        \"min\": 1,\n        \"max\": 5,\n        \"num_unique_values\": 5,\n        \"samples\": [\n          2,\n          5,\n          3\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"predicted_return_risk\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"low\",\n          \"high\",\n          \"medium\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"predicted_size_issue\",\n      \"properties\": {\n        \"dtype\": \"category\",\n        \"num_unique_values\": 1,\n        \"samples\": [\n          \"no\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"predicted_satisfaction_next_purchase\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 1,\n        \"min\": 1,\n        \"max\": 5,\n        \"num_unique_values\": 4,\n        \"samples\": [\n          3\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saved processed files to: /content/data_processed\n",
            "['synthetic_return_risk.csv', 'common_negative_words.csv', 'reviews_cleaned.csv', 'class_summary.csv', 'returns_input.csv']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "1.7 sentiment analysis with VADER"
      ],
      "metadata": {
        "id": "jv1UG6gdIhu2"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# ==================================================\n",
        "# SENTIMENT ANALYSIS WITH VADER\n",
        "# ==================================================\n",
        "\n",
        "!pip install vaderSentiment --quiet\n",
        "\n",
        "from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
        "\n",
        "analyzer = SentimentIntensityAnalyzer()\n",
        "\n",
        "def get_vader_sentiment(text):\n",
        "    if text == \"No Review\" or not isinstance(text, str):\n",
        "        return \"neutral\"\n",
        "    score = analyzer.polarity_scores(text)[\"compound\"]\n",
        "    if score >= 0.05:\n",
        "        return \"positive\"\n",
        "    elif score <= -0.05:\n",
        "        return \"negative\"\n",
        "    else:\n",
        "        return \"neutral\"\n",
        "\n",
        "df_clean[\"vader_sentiment\"] = df_clean[\"Review Text\"].apply(get_vader_sentiment)\n",
        "\n",
        "# Compare rule-based vs NLP-based sentiment\n",
        "comparison = pd.crosstab(\n",
        "    df_clean[\"sentiment\"],\n",
        "    df_clean[\"vader_sentiment\"],\n",
        "    margins=True\n",
        ")\n",
        "print(\"Rule-based vs VADER Sentiment Comparison:\")\n",
        "display(comparison)\n",
        "\n",
        "# Distribution of VADER sentiment\n",
        "vader_dist = df_clean[\"vader_sentiment\"].value_counts()\n",
        "print(\"\\nVADER Sentiment Distribution:\")\n",
        "display(vader_dist)\n",
        "\n",
        "# Average rating per VADER sentiment (validation check)\n",
        "vader_rating = (\n",
        "    df_clean.groupby(\"vader_sentiment\")[\"Rating\"]\n",
        "    .mean()\n",
        "    .round(2)\n",
        "    .reset_index()\n",
        ")\n",
        "print(\"\\nAverage Rating per VADER Sentiment:\")\n",
        "display(vader_rating)\n",
        "\n",
        "# Save enriched dataset\n",
        "df_clean.to_csv(DATA_PROCESSED / \"reviews_cleaned.csv\", index=False)\n",
        "print(\"\\nSaved enriched dataset with VADER sentiment.\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 619
        },
        "id": "og5QuNju33HT",
        "outputId": "a29606b2-0ef4-44d5-f325-0da6c1ff79af"
      },
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Rule-based vs VADER Sentiment Comparison:\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "vader_sentiment  negative  neutral  positive    All\n",
              "sentiment                                          \n",
              "negative              992      249      4037   5278\n",
              "positive              343      860     17005  18208\n",
              "All                  1335     1109     21042  23486"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-7ff8ef54-a8d7-4b00-a210-10ce471292e1\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th>vader_sentiment</th>\n",
              "      <th>negative</th>\n",
              "      <th>neutral</th>\n",
              "      <th>positive</th>\n",
              "      <th>All</th>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>sentiment</th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "      <th></th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>negative</th>\n",
              "      <td>992</td>\n",
              "      <td>249</td>\n",
              "      <td>4037</td>\n",
              "      <td>5278</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>positive</th>\n",
              "      <td>343</td>\n",
              "      <td>860</td>\n",
              "      <td>17005</td>\n",
              "      <td>18208</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>All</th>\n",
              "      <td>1335</td>\n",
              "      <td>1109</td>\n",
              "      <td>21042</td>\n",
              "      <td>23486</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-7ff8ef54-a8d7-4b00-a210-10ce471292e1')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-7ff8ef54-a8d7-4b00-a210-10ce471292e1 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-7ff8ef54-a8d7-4b00-a210-10ce471292e1');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "  <div id=\"id_f5390f3e-5318-4c38-b49b-c82437534f5b\">\n",
              "    <style>\n",
              "      .colab-df-generate {\n",
              "        background-color: #E8F0FE;\n",
              "        border: none;\n",
              "        border-radius: 50%;\n",
              "        cursor: pointer;\n",
              "        display: none;\n",
              "        fill: #1967D2;\n",
              "        height: 32px;\n",
              "        padding: 0 0 0 0;\n",
              "        width: 32px;\n",
              "      }\n",
              "\n",
              "      .colab-df-generate:hover {\n",
              "        background-color: #E2EBFA;\n",
              "        box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "        fill: #174EA6;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate {\n",
              "        background-color: #3B4455;\n",
              "        fill: #D2E3FC;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate:hover {\n",
              "        background-color: #434B5C;\n",
              "        box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "        filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "        fill: #FFFFFF;\n",
              "      }\n",
              "    </style>\n",
              "    <button class=\"colab-df-generate\" onclick=\"generateWithVariable('comparison')\"\n",
              "            title=\"Generate code using this dataframe.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "    <script>\n",
              "      (() => {\n",
              "      const buttonEl =\n",
              "        document.querySelector('#id_f5390f3e-5318-4c38-b49b-c82437534f5b button.colab-df-generate');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      buttonEl.onclick = () => {\n",
              "        google.colab.notebook.generateWithVariable('comparison');\n",
              "      }\n",
              "      })();\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "comparison",
              "summary": "{\n  \"name\": \"comparison\",\n  \"rows\": 3,\n  \"fields\": [\n    {\n      \"column\": \"sentiment\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"negative\",\n          \"positive\",\n          \"All\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"negative\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 503,\n        \"min\": 343,\n        \"max\": 1335,\n        \"num_unique_values\": 3,\n        \"samples\": [\n          992,\n          343,\n          1335\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"neutral\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 442,\n        \"min\": 249,\n        \"max\": 1109,\n        \"num_unique_values\": 3,\n        \"samples\": [\n          249,\n          860,\n          1109\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"positive\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 8884,\n        \"min\": 4037,\n        \"max\": 21042,\n        \"num_unique_values\": 3,\n        \"samples\": [\n          4037,\n          17005,\n          21042\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"All\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 9368,\n        \"min\": 5278,\n        \"max\": 23486,\n        \"num_unique_values\": 3,\n        \"samples\": [\n          5278,\n          18208,\n          23486\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "VADER Sentiment Distribution:\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "vader_sentiment\n",
              "positive    21042\n",
              "negative     1335\n",
              "neutral      1109\n",
              "Name: count, dtype: int64"
            ],
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>count</th>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>vader_sentiment</th>\n",
              "      <th></th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>positive</th>\n",
              "      <td>21042</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>negative</th>\n",
              "      <td>1335</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>neutral</th>\n",
              "      <td>1109</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div><br><label><b>dtype:</b> int64</label>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "Average Rating per VADER Sentiment:\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "  vader_sentiment  Rating\n",
              "0        negative    2.69\n",
              "1         neutral    4.20\n",
              "2        positive    4.29"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-19d5a63a-62b0-4850-9028-68f0f7855363\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>vader_sentiment</th>\n",
              "      <th>Rating</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>negative</td>\n",
              "      <td>2.69</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>neutral</td>\n",
              "      <td>4.20</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>positive</td>\n",
              "      <td>4.29</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-19d5a63a-62b0-4850-9028-68f0f7855363')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-19d5a63a-62b0-4850-9028-68f0f7855363 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-19d5a63a-62b0-4850-9028-68f0f7855363');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "  <div id=\"id_d50e31db-cdb5-41fa-a445-b8867a9f8c18\">\n",
              "    <style>\n",
              "      .colab-df-generate {\n",
              "        background-color: #E8F0FE;\n",
              "        border: none;\n",
              "        border-radius: 50%;\n",
              "        cursor: pointer;\n",
              "        display: none;\n",
              "        fill: #1967D2;\n",
              "        height: 32px;\n",
              "        padding: 0 0 0 0;\n",
              "        width: 32px;\n",
              "      }\n",
              "\n",
              "      .colab-df-generate:hover {\n",
              "        background-color: #E2EBFA;\n",
              "        box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "        fill: #174EA6;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate {\n",
              "        background-color: #3B4455;\n",
              "        fill: #D2E3FC;\n",
              "      }\n",
              "\n",
              "      [theme=dark] .colab-df-generate:hover {\n",
              "        background-color: #434B5C;\n",
              "        box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "        filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "        fill: #FFFFFF;\n",
              "      }\n",
              "    </style>\n",
              "    <button class=\"colab-df-generate\" onclick=\"generateWithVariable('vader_rating')\"\n",
              "            title=\"Generate code using this dataframe.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "    <script>\n",
              "      (() => {\n",
              "      const buttonEl =\n",
              "        document.querySelector('#id_d50e31db-cdb5-41fa-a445-b8867a9f8c18 button.colab-df-generate');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      buttonEl.onclick = () => {\n",
              "        google.colab.notebook.generateWithVariable('vader_rating');\n",
              "      }\n",
              "      })();\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "dataframe",
              "variable_name": "vader_rating",
              "summary": "{\n  \"name\": \"vader_rating\",\n  \"rows\": 3,\n  \"fields\": [\n    {\n      \"column\": \"vader_sentiment\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"negative\",\n          \"neutral\",\n          \"positive\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Rating\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 0.8989067434018578,\n        \"min\": 2.69,\n        \"max\": 4.29,\n        \"num_unique_values\": 3,\n        \"samples\": [\n          2.69,\n          4.2,\n          4.29\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}"
            }
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "Saved enriched dataset with VADER sentiment.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Automation Opportunities\n",
        "\n",
        "## Automation 1 — Review Sentiment Alert\n",
        "Automatically flag clothing classes when average rating drops below 3.5.\n",
        "\n",
        "## Automation 2 — Product Improvement Suggestions\n",
        "Use negative review keywords to automatically suggest:\n",
        "- sizing guide improvements\n",
        "- fabric description clarification\n",
        "- fit recommendations\n",
        "- photo quality updates\n",
        "\n",
        "## Automation 3 — Future Return Risk Dashboard\n",
        "Combine real reviews with synthetic future risk signals to monitor:\n",
        "- high-risk customer segments\n",
        "- classes with repeated size complaints\n",
        "- products likely to receive negative reviews next season"
      ],
      "metadata": {
        "id": "fmUnLL36pT_z"
      }
    }
  ]
}