{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [], "metadata": { "id": "KpnKlKtP2pnS" } }, { "cell_type": "markdown", "source": [ "Step 1: Data Preparation and Cleaning" ], "metadata": { "id": "3GX4xb43H-Sp" } }, { "cell_type": "markdown", "source": [ "1.1 Environment Setup & Data loading" ], "metadata": { "id": "OpjCtFymIB9w" } }, { "cell_type": "code", "source": [
"# ==================================================\n",
"# UNIVERSAL SETUP CELL\n",
"# Works in BOTH Google Colab and Hugging Face Spaces\n",
"# ==================================================\n",
"\n",
"import os\n",
"import random\n",
"import warnings\n",
"from pathlib import Path\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"# Reproducibility\n",
"random.seed(42)\n",
"np.random.seed(42)\n",
"\n",
"# Detect environment automatically\n",
"if Path(\"/app\").exists():\n",
"    BASE_PATH = Path(\"/app\")  # Hugging Face Space\n",
"elif Path(\"/content\").exists():\n",
"    BASE_PATH = Path(\"/content\")  # Google Colab\n",
"else:\n",
"    BASE_PATH = Path.cwd()  # Local fallback\n",
"\n",
"DATA_PROCESSED = BASE_PATH / \"data_processed\"\n",
"OUTPUTS = BASE_PATH / \"outputs\"\n",
"\n",
"DATA_PROCESSED.mkdir(exist_ok=True)\n",
"OUTPUTS.mkdir(exist_ok=True)\n",
"\n",
"print(\"Environment ready.\")\n",
"print(\"Using BASE_PATH:\", BASE_PATH)\n",
"\n",
"\n",
"def is_input_csv(path):\n",
"    \"\"\"Return True if `path` is a raw input CSV rather than a bundled or generated file.\n",
"\n",
"    Skips any path containing `sample_data` and everything located under\n",
"    DATA_PROCESSED or OUTPUTS. This notebook later writes files such as\n",
"    `returns_input.csv` into DATA_PROCESSED; without this filter a re-run would\n",
"    match them on the \"return\" keyword and could load a generated file instead\n",
"    of the original dataset.\n",
"    \"\"\"\n",
"    if \"sample_data\" in str(path):\n",
"        return False\n",
"    return not any(\n",
"        generated_dir in path.parents\n",
"        for generated_dir in (DATA_PROCESSED, OUTPUTS)\n",
"    )\n",
"\n",
"\n",
"# Find input CSV files anywhere under BASE_PATH.\n",
"# Sorted so the file picked below does not depend on filesystem order.\n",
"csv_paths = sorted(p for p in BASE_PATH.rglob(\"*.csv\") if is_input_csv(p))\n",
"\n",
"print(\"Found CSV files:\")\n",
"for p in csv_paths:\n",
"    print(\"-\", p)\n",
"\n",
"# Locate required files by keyword in the file name\n",
"reviews_matches = [p for p in csv_paths if \"clothing\" in p.name.lower()]\n",
"returns_matches = [p for p in csv_paths if \"return\" in p.name.lower()]\n",
"\n",
"if not reviews_matches:\n",
"    raise FileNotFoundError(\n",
"        \"Could not find the Womens Clothing E-Commerce Reviews CSV. \"\n",
"        \"Upload it to the Colab runtime OR put it in the same folder as app.py on Hugging Face.\"\n",
"    )\n",
"\n",
"if not returns_matches:\n",
"    raise FileNotFoundError(\n",
"        \"Could not find the ecommerce returns CSV. \"\n",
"        \"Upload it to the Colab runtime OR put it in the same folder as app.py on Hugging Face.\"\n",
"    )\n",
"\n",
"reviews_path = reviews_matches[0]\n",
"returns_path = returns_matches[0]\n",
"\n",
"print(\"Using reviews file:\", reviews_path)\n",
"print(\"Using returns file:\", returns_path)\n",
"\n",
"reviews_df = pd.read_csv(reviews_path)\n",
"returns_df = pd.read_csv(returns_path)\n",
"\n",
"# Main dataframe used by the rest of this notebook\n",
"df = reviews_df.copy()\n",
"\n",
"print(\"Loaded successfully.\")\n",
"print(\"Reviews shape:\", reviews_df.shape)\n",
"print(\"Returns shape:\", returns_df.shape)\n"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bsHVIP13nWFe", "outputId": "e91ab6ca-f560-49f7-db56-0929a55488a4" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Environment ready.\n", "Using BASE_PATH: /content\n", "Found CSV files:\n", "- /content/Womens Clothing E-Commerce Reviews.csv\n", "- /content/ecommerce_returns_cleaned.csv\n", "Using reviews file: /content/Womens Clothing E-Commerce Reviews.csv\n", "Using returns file: /content/ecommerce_returns_cleaned.csv\n", "Loaded successfully.\n", "Reviews shape: (23486, 11)\n", "Returns shape: (113314, 29)\n" ] } ] }, { "cell_type": "markdown", "source": [ "1.2 Missing value analysis" ], "metadata": { "id": "7TKoOsszIJBX" } }, { "cell_type": "code", "source": [
"# Build the null mask once; both the count and the percentage are derived from it\n",
"null_mask = df.isna()\n",
"\n",
"missing_summary = pd.DataFrame({\n",
"    \"column\": df.columns,\n",
"    \"missing_count\": null_mask.sum().values,\n",
"    \"missing_pct\": (null_mask.mean().values * 100).round(2),\n",
"}).sort_values(by=\"missing_pct\", ascending=False)\n",
"\n",
"display(missing_summary)"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 394 }, "id": "3qu3XSzfnV4-", "outputId": "9db9967a-19ff-4533-9ac1-485ac363bf26" }, "execution_count": 11, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ " column missing_count missing_pct\n", "3 Title 3810 16.22\n", "4 Review Text 845 3.60\n", "9 Department Name 14 0.06\n", "10 Class Name 14 0.06\n", "8 Division Name 14 0.06\n", "0 Unnamed: 0 0 0.00\n", "1 Clothing ID 0 0.00\n", "2 Age 0 0.00\n", "5 Rating 0 0.00\n", "6 Recommended IND 0 0.00\n", "7 Positive Feedback Count 0 0.00" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
columnmissing_countmissing_pct
3Title381016.22
4Review Text8453.60
9Department Name140.06
10Class Name140.06
8Division Name140.06
0Unnamed: 000.00
1Clothing ID00.00
2Age00.00
5Rating00.00
6Recommended IND00.00
7Positive Feedback Count00.00
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "missing_summary", "summary": "{\n \"name\": \"missing_summary\",\n \"rows\": 11,\n \"fields\": [\n {\n \"column\": \"column\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 11,\n \"samples\": [\n \"Unnamed: 0\",\n \"Title\",\n \"Recommended IND\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"missing_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1150,\n \"min\": 0,\n \"max\": 3810,\n \"num_unique_values\": 4,\n \"samples\": [\n 845,\n 0,\n 3810\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"missing_pct\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4.895871358975474,\n \"min\": 0.0,\n \"max\": 16.22,\n \"num_unique_values\": 4,\n \"samples\": [\n 3.6,\n 0.0,\n 16.22\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "1.3 Data cleaning & feature engineering" ], "metadata": { "id": "EfQ7CQguINjr" } }, { "cell_type": "code", "source": [
"# Placeholder written into each column wherever its value is missing\n",
"fill_values = {\n",
"    \"Title\": \"No Title\",\n",
"    \"Review Text\": \"No Review\",\n",
"    \"Department Name\": \"Unknown\",\n",
"    \"Class Name\": \"Unknown\",\n",
"    \"Division Name\": \"Unknown\",\n",
"}\n",
"\n",
"# Work on a copy so the raw frame `df` stays untouched, and discard the\n",
"# \"Unnamed: 0\" index column if the CSV carried one\n",
"df_clean = df.copy().drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n",
"\n",
"for column_name, placeholder in fill_values.items():\n",
"    df_clean[column_name] = df_clean[column_name].fillna(placeholder)\n",
"\n",
"# Sentiment label derived from the rating: 4 and above is \"positive\", everything else \"negative\"\n",
"is_high_rating = df_clean[\"Rating\"] >= 4\n",
"df_clean[\"sentiment\"] = is_high_rating.map({True: \"positive\", False: \"negative\"})\n",
"\n",
"# Engagement score: positive feedback count plus the recommended indicator\n",
"df_clean[\"engagement_score\"] = df_clean[\"Positive Feedback Count\"] + df_clean[\"Recommended IND\"]\n",
"\n",
"print(\"Cleaned shape:\", df_clean.shape)\n",
"display(df_clean.head())\n"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 643 }, "id": "da3lANHOnV13", "outputId": "c76e93fa-6555-4e73-a99e-866c9f076640" }, "execution_count": 12, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Cleaned shape: (23486, 12)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " Clothing ID Age Title \\\n", "0 767 33 No Title \n", "1 1080 34 No Title \n", "2 1077 60 Some major design flaws \n", "3 1049 50 My favorite buy! \n", "4 847 47 Flattering shirt \n", "\n", " Review Text Rating Recommended IND \\\n", "0 Absolutely wonderful - silky and sexy and comf... 4 1 \n", "1 Love this dress! it's sooo pretty. i happene... 5 1 \n", "2 I had such high hopes for this dress and reall... 3 0 \n", "3 I love, love, love this jumpsuit. it's fun, fl... 5 1 \n", "4 This shirt is very flattering to all due to th... 5 1 \n", "\n", " Positive Feedback Count Division Name Department Name Class Name \\\n", "0 0 Initmates Intimate Intimates \n", "1 4 General Dresses Dresses \n", "2 0 General Dresses Dresses \n", "3 0 General Petite Bottoms Pants \n", "4 6 General Tops Blouses \n", "\n", " sentiment engagement_score \n", "0 positive 1 \n", "1 positive 5 \n", "2 negative 0 \n", "3 positive 1 \n", "4 positive 7 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Clothing IDAgeTitleReview TextRatingRecommended INDPositive Feedback CountDivision NameDepartment NameClass Namesentimentengagement_score
076733No TitleAbsolutely wonderful - silky and sexy and comf...410InitmatesIntimateIntimatespositive1
1108034No TitleLove this dress! it's sooo pretty. i happene...514GeneralDressesDressespositive5
2107760Some major design flawsI had such high hopes for this dress and reall...300GeneralDressesDressesnegative0
3104950My favorite buy!I love, love, love this jumpsuit. it's fun, fl...510General PetiteBottomsPantspositive1
484747Flattering shirtThis shirt is very flattering to all due to th...516GeneralTopsBlousespositive7
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"display(df_clean\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Clothing ID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 146,\n \"min\": 767,\n \"max\": 1080,\n \"num_unique_values\": 5,\n \"samples\": [\n 1080,\n 847,\n 1077\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11,\n \"min\": 33,\n \"max\": 60,\n \"num_unique_values\": 5,\n \"samples\": [\n 34,\n 47,\n 60\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Some major design flaws\",\n \"Flattering shirt\",\n \"No Title\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Review Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Love this dress! it's sooo pretty. i happened to find it in a store, and i'm glad i did bc i never would have ordered it online bc it's petite. i bought a petite and am 5'8\\\". i love the length on me- hits just a little below the knee. would definitely be a true midi on someone who is truly petite.\",\n \"This shirt is very flattering to all due to the adjustable front tie. it is the perfect length to wear with leggings and it is sleeveless so it pairs well with any cardigan. love this shirt!!!\",\n \"I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i could not zip it up! i reordered it in petite medium, which was just ok. overall, the top half was comfortable and fit nicely, but the bottom half had a very tight under layer and several somewhat cheap (net) over layers. 
imo, a major design flaw was the net over layer sewn directly into the zipper - it c\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 3,\n \"max\": 5,\n \"num_unique_values\": 3,\n \"samples\": [\n 4,\n 5,\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Recommended IND\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Positive Feedback Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2,\n \"min\": 0,\n \"max\": 6,\n \"num_unique_values\": 3,\n \"samples\": [\n 0,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Division Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Initmates\",\n \"General\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Department Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Dresses\",\n \"Tops\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Class Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Dresses\",\n \"Blouses\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"engagement_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 0,\n \"max\": 7,\n \"num_unique_values\": 4,\n \"samples\": [\n 5,\n 7\n ],\n \"semantic_type\": \"\",\n 
\"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "1.4 Product category analysis" ], "metadata": { "id": "gf5CqL9IITIu" } }, { "cell_type": "code", "source": [
"# Classes with fewer reviews than this are left out of the ranking\n",
"min_reviews = 100\n",
"\n",
"# Per-class statistics, filtered to well-reviewed classes and ranked best-first\n",
"class_summary = (\n",
"    df_clean.groupby(\"Class Name\")\n",
"    .agg(\n",
"        reviews=(\"Rating\", \"count\"),\n",
"        avg_rating=(\"Rating\", \"mean\"),\n",
"        recommendation_rate=(\"Recommended IND\", \"mean\"),\n",
"        avg_feedback=(\"Positive Feedback Count\", \"mean\"),\n",
"    )\n",
"    .reset_index()\n",
"    .loc[lambda stats: stats[\"reviews\"] >= min_reviews]\n",
"    .sort_values(by=\"avg_rating\", ascending=False)\n",
")\n",
"\n",
"# Highest-rated classes first, then the lowest-rated ones\n",
"display(class_summary.head(10))\n",
"display(class_summary.tail(10))"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 709 }, "id": "MAW6wxDCnVzA", "outputId": "7cfc9d0b-783a-41f5-ddbc-8d7b0284b7bd" }, "execution_count": 13, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ " Class Name reviews avg_rating recommendation_rate avg_feedback\n", "9 Layering 146 4.376712 0.883562 1.315068\n", "7 Jeans 1147 4.360942 0.881430 1.759372\n", "11 Lounge 691 4.301013 0.859624 2.321274\n", "6 Jackets 704 4.295455 0.845170 2.826705\n", "16 Sleep 228 4.285088 0.855263 1.750000\n", "5 Intimates 154 4.279221 0.857143 0.779221\n", "10 Legwear 165 4.278788 0.860606 1.272727\n", "13 Pants 1388 4.265850 0.832853 2.396974\n", "4 Fine gauge 1100 4.260909 0.837273 2.013636\n", "14 Shorts 317 4.255521 0.839117 1.675079" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Class Namereviewsavg_ratingrecommendation_rateavg_feedback
9Layering1464.3767120.8835621.315068
7Jeans11474.3609420.8814301.759372
11Lounge6914.3010130.8596242.321274
6Jackets7044.2954550.8451702.826705
16Sleep2284.2850880.8552631.750000
5Intimates1544.2792210.8571430.779221
10Legwear1654.2787880.8606061.272727
13Pants13884.2658500.8328532.396974
4Fine gauge11004.2609090.8372732.013636
14Shorts3174.2555210.8391171.675079
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"display(class_summary\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"Class Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Fine gauge\",\n \"Jeans\",\n \"Intimates\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reviews\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 471,\n \"min\": 146,\n \"max\": 1388,\n \"num_unique_values\": 10,\n \"samples\": [\n 1100,\n 1147,\n 154\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"avg_rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.041140875708169405,\n \"min\": 4.255520504731861,\n \"max\": 4.376712328767123,\n \"num_unique_values\": 10,\n \"samples\": [\n 4.260909090909091,\n 4.360941586748038,\n 4.279220779220779\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"recommendation_rate\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.017400503275162123,\n \"min\": 0.8328530259365994,\n \"max\": 0.8835616438356164,\n \"num_unique_values\": 10,\n \"samples\": [\n 0.8372727272727273,\n 0.8814298169136879,\n 0.8571428571428571\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"avg_feedback\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6056747210865839,\n \"min\": 0.7792207792207793,\n \"max\": 2.8267045454545454,\n \"num_unique_values\": 10,\n \"samples\": [\n 2.0136363636363637,\n 1.7593722755013077,\n 0.7792207792207793\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ " Class Name reviews avg_rating recommendation_rate avg_feedback\n", "4 Fine gauge 1100 4.260909 0.837273 2.013636\n", "14 Shorts 317 4.255521 0.839117 1.675079\n", "15 Skirts 945 4.245503 
0.845503 2.293122\n", "12 Outerwear 328 4.198171 0.817073 2.823171\n", "18 Swim 350 4.197143 0.805714 2.142857\n", "17 Sweaters 1428 4.179272 0.800420 2.208683\n", "8 Knits 4843 4.161677 0.817675 2.394797\n", "0 Blouses 3097 4.154020 0.810139 2.725218\n", "3 Dresses 6319 4.150815 0.808197 3.087514\n", "19 Trend 119 3.815126 0.739496 3.369748" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Class Namereviewsavg_ratingrecommendation_rateavg_feedback
4Fine gauge11004.2609090.8372732.013636
14Shorts3174.2555210.8391171.675079
15Skirts9454.2455030.8455032.293122
12Outerwear3284.1981710.8170732.823171
18Swim3504.1971430.8057142.142857
17Sweaters14284.1792720.8004202.208683
8Knits48434.1616770.8176752.394797
0Blouses30974.1540200.8101392.725218
3Dresses63194.1508150.8081973.087514
19Trend1193.8151260.7394963.369748
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"display(class_summary\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"Class Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Dresses\",\n \"Shorts\",\n \"Sweaters\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reviews\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2158,\n \"min\": 119,\n \"max\": 6319,\n \"num_unique_values\": 10,\n \"samples\": [\n 6319,\n 317,\n 1428\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"avg_rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.12860513401132947,\n \"min\": 3.8151260504201683,\n \"max\": 4.260909090909091,\n \"num_unique_values\": 10,\n \"samples\": [\n 4.150815002373793,\n 4.255520504731861,\n 4.179271708683474\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"recommendation_rate\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.02981935453944708,\n \"min\": 0.7394957983193278,\n \"max\": 0.8455026455026455,\n \"num_unique_values\": 10,\n \"samples\": [\n 0.8081974996043678,\n 0.8391167192429022,\n 0.8004201680672269\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"avg_feedback\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5199400215099952,\n \"min\": 1.6750788643533123,\n \"max\": 3.369747899159664,\n \"num_unique_values\": 10,\n \"samples\": [\n 3.08751384712771,\n 1.6750788643533123,\n 2.208683473389356\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "1.5 Negative review text analysis" ], "metadata": { "id": "0r3IC4d4IZBj" } }, { "cell_type": "code", "source": [
"# Value the cleaning step writes into \"Review Text\" when the review is missing\n",
"PLACEHOLDER_REVIEW = \"No Review\"\n",
"# How many of the most frequent words to keep\n",
"TOP_N_WORDS = 30\n",
"\n",
"negative_reviews = df_clean[df_clean[\"sentiment\"] == \"negative\"].copy()\n",
"\n",
"print(\"Negative reviews:\", negative_reviews.shape)\n",
"\n",
"negative_reviews[\"Review Text\"] = negative_reviews[\"Review Text\"].astype(str)\n",
"\n",
"# Rows holding only the placeholder contain no customer-written words;\n",
"# counting them would inflate \"no\" and \"review\".\n",
"has_real_text = negative_reviews[\"Review Text\"] != PLACEHOLDER_REVIEW\n",
"\n",
"common_words = (\n",
"    negative_reviews.loc[has_real_text, \"Review Text\"]\n",
"    .str.lower()\n",
"    .str.split()\n",
"    # One row per token; avoids the wide padded frame that split(expand=True).stack() builds\n",
"    .explode()\n",
"    .value_counts()\n",
"    # Keep the index unnamed so a later reset_index() still yields an \"index\" column\n",
"    .rename_axis(None)\n",
"    .head(TOP_N_WORDS)\n",
")\n",
"\n",
"display(common_words)"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "C6TafBY7nVv3", "outputId": "f45d6bed-bd34-4a3a-e185-35132ffdc3e3" }, "execution_count": 14, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Negative reviews: (5278, 12)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "the 20383\n", "i 13685\n", "and 10365\n", "it 9411\n", "a 8957\n", "is 6315\n", "this 5886\n", "to 5825\n", "was 4965\n", "in 4754\n", "but 4483\n", "on 3695\n", "of 3609\n", "for 3067\n", "not 2993\n", "so 2563\n", "my 2405\n", "that 2155\n", "like 2117\n", "have 2041\n", "very 2005\n", "with 1924\n", "dress 1913\n", "too 1764\n", "would 1687\n", "be 1609\n", "just 1581\n", "as 1519\n", "are 1484\n", "top 1418\n", "Name: count, dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
the20383
i13685
and10365
it9411
a8957
is6315
this5886
to5825
was4965
in4754
but4483
on3695
of3609
for3067
not2993
so2563
my2405
that2155
like2117
have2041
very2005
with1924
dress1913
too1764
would1687
be1609
just1581
as1519
are1484
top1418
\n", "

" ] }, "metadata": {} } ] }, { "cell_type": "markdown", "source": [ "1.6 Synthetic dataset generation" ], "metadata": { "id": "RrZfwc7SIdmx" } }, { "cell_type": "code", "source": [
"import numpy as np\n",
"\n",
"# Number of synthetic customers to generate\n",
"N_SYNTHETIC = 500\n",
"\n",
"# Re-seed here so this cell produces the same sample every time it is run\n",
"np.random.seed(42)\n",
"\n",
"synthetic_df = pd.DataFrame({\n",
"    \"customer_id\": range(1, N_SYNTHETIC + 1),\n",
"    \"predicted_return_risk\": np.random.choice(\n",
"        [\"low\", \"medium\", \"high\"],\n",
"        size=N_SYNTHETIC,\n",
"        p=[0.5, 0.3, 0.2],\n",
"    ),\n",
"    \"predicted_size_issue\": np.random.choice(\n",
"        [\"yes\", \"no\"],\n",
"        size=N_SYNTHETIC,\n",
"        p=[0.25, 0.75],\n",
"    ),\n",
"    \"predicted_satisfaction_next_purchase\": np.random.randint(1, 6, N_SYNTHETIC),\n",
"})\n",
"\n",
"print(synthetic_df.shape)\n",
"display(synthetic_df.head())\n",
"\n",
"# Name the index \"word\" and the values \"count\" explicitly, rather than renaming\n",
"# whatever default labels reset_index() produces (these differ across pandas versions).\n",
"common_words_table = common_words.rename_axis(\"word\").reset_index(name=\"count\")\n",
"\n",
"# Save outputs for the Hugging Face app / next analysis notebook\n",
"processed_frames = {\n",
"    \"reviews_cleaned.csv\": df_clean,\n",
"    \"class_summary.csv\": class_summary,\n",
"    \"synthetic_return_risk.csv\": synthetic_df,\n",
"    \"returns_input.csv\": returns_df,\n",
"    \"common_negative_words.csv\": common_words_table,\n",
"}\n",
"for file_name, frame in processed_frames.items():\n",
"    frame.to_csv(DATA_PROCESSED / file_name, index=False)\n",
"\n",
"print(\"Saved processed files to:\", DATA_PROCESSED)\n",
"print([p.name for p in DATA_PROCESSED.glob(\"*.csv\")])\n"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 280 }, "id": "gmcDmANBnVsP", "outputId": "f831e19b-889c-4610-c348-a50ff7025ac8" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(500, 4)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " customer_id predicted_return_risk predicted_size_issue \\\n", "0 1 low no \n", "1 2 high no \n", "2 3 medium no \n", "3 4 medium no \n", "4 5 low no \n", "\n", " predicted_satisfaction_next_purchase \n", "0 4 \n", "1 3 \n", 
"2 5 \n", "3 1 \n", "4 5 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idpredicted_return_riskpredicted_size_issuepredicted_satisfaction_next_purchase
01lowno4
12highno3
23mediumno5
34mediumno1
45lowno5
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "summary": "{\n \"name\": \"print([p\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"customer_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_return_risk\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"low\",\n \"high\",\n \"medium\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_size_issue\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"no\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_satisfaction_next_purchase\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 4,\n \"samples\": [\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Saved processed files to: /content/data_processed\n", "['synthetic_return_risk.csv', 'common_negative_words.csv', 'reviews_cleaned.csv', 'class_summary.csv', 'returns_input.csv']\n" ] } ] }, { "cell_type": "markdown", "source": [ "1.7 sentiment analysis with VADER" ], "metadata": { "id": "jv1UG6gdIhu2" } }, { "cell_type": "code", "source": [
"# ==================================================\n",
"# SENTIMENT ANALYSIS WITH VADER\n",
"# ==================================================\n",
"\n",
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"# NOTE(review): version pinned for reproducibility - confirm 3.3.2 is the release used for the saved outputs.\n",
"%pip install --quiet vaderSentiment==3.3.2\n",
"\n",
"from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
"\n",
"analyzer = SentimentIntensityAnalyzer()\n",
"\n",
"# Cut-offs applied to the \"compound\" polarity score\n",
"POSITIVE_THRESHOLD = 0.05\n",
"NEGATIVE_THRESHOLD = -0.05\n",
"\n",
"\n",
"def get_vader_sentiment(text):\n",
"    \"\"\"Label one review as \"positive\", \"negative\" or \"neutral\".\n",
"\n",
"    The \"No Review\" placeholder and any non-string value are labelled \"neutral\"\n",
"    without being scored. Otherwise the label follows the compound score:\n",
"    at or above POSITIVE_THRESHOLD is \"positive\", at or below NEGATIVE_THRESHOLD\n",
"    is \"negative\", anything in between is \"neutral\".\n",
"    \"\"\"\n",
"    if text == \"No Review\" or not isinstance(text, str):\n",
"        return \"neutral\"\n",
"    score = analyzer.polarity_scores(text)[\"compound\"]\n",
"    if score >= POSITIVE_THRESHOLD:\n",
"        return \"positive\"\n",
"    if score <= NEGATIVE_THRESHOLD:\n",
"        return \"negative\"\n",
"    return \"neutral\"\n",
"\n",
"\n",
"df_clean[\"vader_sentiment\"] = df_clean[\"Review Text\"].apply(get_vader_sentiment)\n",
"\n",
"# Compare rule-based vs NLP-based sentiment\n",
"comparison = pd.crosstab(\n",
"    df_clean[\"sentiment\"],\n",
"    df_clean[\"vader_sentiment\"],\n",
"    margins=True\n",
")\n",
"print(\"Rule-based vs VADER Sentiment Comparison:\")\n",
"display(comparison)\n",
"\n",
"# Distribution of VADER sentiment\n",
"vader_dist = df_clean[\"vader_sentiment\"].value_counts()\n",
"print(\"\\nVADER Sentiment Distribution:\")\n",
"display(vader_dist)\n",
"\n",
"# Average rating per VADER sentiment (validation check)\n",
"vader_rating = (\n",
"    df_clean.groupby(\"vader_sentiment\")[\"Rating\"]\n",
"    .mean()\n",
"    .round(2)\n",
"    .reset_index()\n",
")\n",
"print(\"\\nAverage Rating per VADER Sentiment:\")\n",
"display(vader_rating)\n",
"\n",
"# Save enriched dataset (overwrites the reviews_cleaned.csv written earlier, now with the extra column)\n",
"df_clean.to_csv(DATA_PROCESSED / \"reviews_cleaned.csv\", index=False)\n",
"print(\"\\nSaved enriched dataset with VADER sentiment.\")"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 619 }, "id": "og5QuNju33HT", "outputId": "a29606b2-0ef4-44d5-f325-0da6c1ff79af" }, "execution_count": 16, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Rule-based vs VADER Sentiment Comparison:\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "vader_sentiment negative neutral positive All\n", "sentiment \n", "negative 992 249 4037 5278\n", "positive 343 860 17005 18208\n", "All 1335 1109 21042 23486" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
vader_sentimentnegativeneutralpositiveAll
sentiment
negative99224940375278
positive3438601700518208
All133511092104223486
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "comparison", "summary": "{\n \"name\": \"comparison\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"sentiment\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"negative\",\n \"positive\",\n \"All\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"negative\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 503,\n \"min\": 343,\n \"max\": 1335,\n \"num_unique_values\": 3,\n \"samples\": [\n 992,\n 343,\n 1335\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"neutral\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 442,\n \"min\": 249,\n \"max\": 1109,\n \"num_unique_values\": 3,\n \"samples\": [\n 249,\n 860,\n 1109\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"positive\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8884,\n \"min\": 4037,\n \"max\": 21042,\n \"num_unique_values\": 3,\n \"samples\": [\n 4037,\n 17005,\n 21042\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"All\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 9368,\n \"min\": 5278,\n \"max\": 23486,\n \"num_unique_values\": 3,\n \"samples\": [\n 5278,\n 18208,\n 23486\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\n", "VADER Sentiment Distribution:\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "vader_sentiment\n", "positive 21042\n", "negative 1335\n", "neutral 1109\n", "Name: count, dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
vader_sentiment
positive21042
negative1335
neutral1109
\n", "

" ] }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\n", "Average Rating per VADER Sentiment:\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " vader_sentiment Rating\n", "0 negative 2.69\n", "1 neutral 4.20\n", "2 positive 4.29" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
vader_sentimentRating
0negative2.69
1neutral4.20
2positive4.29
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "vader_rating", "summary": "{\n \"name\": \"vader_rating\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"vader_sentiment\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"negative\",\n \"neutral\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.8989067434018578,\n \"min\": 2.69,\n \"max\": 4.29,\n \"num_unique_values\": 3,\n \"samples\": [\n 2.69,\n 4.2,\n 4.29\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "\n", "Saved enriched dataset with VADER sentiment.\n" ] } ] }, { "cell_type": "markdown", "source": [ "# Automation Opportunities\n", "\n", "## Automation 1 — Review Sentiment Alert\n", "Automatically flag clothing classes when their average rating drops below 3.5.\n", "\n", "## Automation 2 — Product Improvement Suggestions\n", "Use negative review keywords to automatically suggest:\n", "- sizing guide improvements\n", "- fabric description clarification\n", "- fit recommendations\n", "- photo quality updates\n", "\n", "## Automation 3 — Future Return Risk Dashboard\n", "Combine real reviews with synthetic future risk signals to monitor:\n", "- high-risk customer segments\n", "- classes with repeated size complaints\n", "- products likely to receive negative reviews next season" ], "metadata": { "id": "fmUnLL36pT_z" } } ] }