{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "KpnKlKtP2pnS"
}
},
{
"cell_type": "markdown",
"source": [
"# Step 1: Data Preparation and Cleaning"
],
"metadata": {
"id": "3GX4xb43H-Sp"
}
},
{
"cell_type": "markdown",
"source": [
"## 1.1 Environment setup & data loading"
],
"metadata": {
"id": "OpjCtFymIB9w"
}
},
{
"cell_type": "code",
"source": [
"# ==================================================\n",
"# UNIVERSAL SETUP CELL\n",
"# Works in BOTH Google Colab and Hugging Face Spaces\n",
"# ==================================================\n",
"# Resolves BASE_PATH for the current environment, creates the output\n",
"# folders, locates the two raw input CSVs by filename keyword and loads\n",
"# them into `reviews_df`, `returns_df` and the working frame `df`.\n",
"\n",
"import os\n",
"import random\n",
"import warnings\n",
"from pathlib import Path\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# NOTE: this silences every warning for the rest of the session.\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"# Reproducibility\n",
"random.seed(42)\n",
"np.random.seed(42)\n",
"\n",
"# Detect environment automatically\n",
"if Path(\"/app\").exists():\n",
"    BASE_PATH = Path(\"/app\")  # Hugging Face Space\n",
"elif Path(\"/content\").exists():\n",
"    BASE_PATH = Path(\"/content\")  # Google Colab\n",
"else:\n",
"    BASE_PATH = Path.cwd()  # Local fallback\n",
"\n",
"DATA_PROCESSED = BASE_PATH / \"data_processed\"\n",
"OUTPUTS = BASE_PATH / \"outputs\"\n",
"\n",
"DATA_PROCESSED.mkdir(exist_ok=True)\n",
"OUTPUTS.mkdir(exist_ok=True)\n",
"\n",
"print(\"Environment ready.\")\n",
"print(\"Using BASE_PATH:\", BASE_PATH)\n",
"\n",
"# Folders this notebook writes into. They are excluded from the input search\n",
"# because files saved there later (\"synthetic_return_risk.csv\",\n",
"# \"returns_input.csv\") also contain \"return\" in their name and would otherwise\n",
"# be mistaken for the raw returns file when the notebook is re-run.\n",
"GENERATED_DIRS = (DATA_PROCESSED, OUTPUTS)\n",
"\n",
"# Find raw CSV files anywhere under BASE_PATH.\n",
"# sorted(): rglob() yields paths in filesystem order, so sorting keeps the\n",
"# `[0]` picks below deterministic when several files match.\n",
"csv_paths = sorted(\n",
"    p for p in BASE_PATH.rglob(\"*.csv\")\n",
"    if \"sample_data\" not in str(p)\n",
"    and not any(generated in p.parents for generated in GENERATED_DIRS)\n",
")\n",
"\n",
"print(\"Found CSV files:\")\n",
"for p in csv_paths:\n",
"    print(\"-\", p)\n",
"\n",
"# Locate required files by a keyword in the file name\n",
"reviews_matches = [\n",
"    p for p in csv_paths\n",
"    if \"clothing\" in p.name.lower()\n",
"]\n",
"\n",
"returns_matches = [\n",
"    p for p in csv_paths\n",
"    if \"return\" in p.name.lower()\n",
"]\n",
"\n",
"if not reviews_matches:\n",
"    raise FileNotFoundError(\n",
"        \"Could not find the Womens Clothing E-Commerce Reviews CSV. \"\n",
"        \"Upload it to the Colab runtime OR put it in the same folder as app.py on Hugging Face.\"\n",
"    )\n",
"\n",
"if not returns_matches:\n",
"    raise FileNotFoundError(\n",
"        \"Could not find the ecommerce returns CSV. \"\n",
"        \"Upload it to the Colab runtime OR put it in the same folder as app.py on Hugging Face.\"\n",
"    )\n",
"\n",
"# First match (in sorted order) wins.\n",
"reviews_path = reviews_matches[0]\n",
"returns_path = returns_matches[0]\n",
"\n",
"print(\"Using reviews file:\", reviews_path)\n",
"print(\"Using returns file:\", returns_path)\n",
"\n",
"reviews_df = pd.read_csv(reviews_path)\n",
"returns_df = pd.read_csv(returns_path)\n",
"\n",
"# Main dataframe used by the rest of this notebook\n",
"df = reviews_df.copy()\n",
"\n",
"print(\"Loaded successfully.\")\n",
"print(\"Reviews shape:\", reviews_df.shape)\n",
"print(\"Returns shape:\", returns_df.shape)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bsHVIP13nWFe",
"outputId": "e91ab6ca-f560-49f7-db56-0929a55488a4"
},
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Environment ready.\n",
"Using BASE_PATH: /content\n",
"Found CSV files:\n",
"- /content/Womens Clothing E-Commerce Reviews.csv\n",
"- /content/ecommerce_returns_cleaned.csv\n",
"Using reviews file: /content/Womens Clothing E-Commerce Reviews.csv\n",
"Using returns file: /content/ecommerce_returns_cleaned.csv\n",
"Loaded successfully.\n",
"Reviews shape: (23486, 11)\n",
"Returns shape: (113314, 29)\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## 1.2 Missing value analysis"
],
"metadata": {
"id": "7TKoOsszIJBX"
}
},
{
"cell_type": "code",
"source": [
"# Per-column missing-value report, columns with the most gaps first.\n",
"null_mask = df.isna()\n",
"\n",
"missing_summary = (\n",
"    pd.DataFrame(\n",
"        {\n",
"            \"column\": df.columns,\n",
"            \"missing_count\": null_mask.sum().values,\n",
"            \"missing_pct\": (null_mask.mean().values * 100).round(2),\n",
"        }\n",
"    )\n",
"    .sort_values(by=\"missing_pct\", ascending=False)\n",
")\n",
"\n",
"display(missing_summary)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 394
},
"id": "3qu3XSzfnV4-",
"outputId": "9db9967a-19ff-4533-9ac1-485ac363bf26"
},
"execution_count": 11,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
" column missing_count missing_pct\n",
"3 Title 3810 16.22\n",
"4 Review Text 845 3.60\n",
"9 Department Name 14 0.06\n",
"10 Class Name 14 0.06\n",
"8 Division Name 14 0.06\n",
"0 Unnamed: 0 0 0.00\n",
"1 Clothing ID 0 0.00\n",
"2 Age 0 0.00\n",
"5 Rating 0 0.00\n",
"6 Recommended IND 0 0.00\n",
"7 Positive Feedback Count 0 0.00"
],
"text/html": [
"\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" column | \n",
" missing_count | \n",
" missing_pct | \n",
"
\n",
" \n",
" \n",
" \n",
" | 3 | \n",
" Title | \n",
" 3810 | \n",
" 16.22 | \n",
"
\n",
" \n",
" | 4 | \n",
" Review Text | \n",
" 845 | \n",
" 3.60 | \n",
"
\n",
" \n",
" | 9 | \n",
" Department Name | \n",
" 14 | \n",
" 0.06 | \n",
"
\n",
" \n",
" | 10 | \n",
" Class Name | \n",
" 14 | \n",
" 0.06 | \n",
"
\n",
" \n",
" | 8 | \n",
" Division Name | \n",
" 14 | \n",
" 0.06 | \n",
"
\n",
" \n",
" | 0 | \n",
" Unnamed: 0 | \n",
" 0 | \n",
" 0.00 | \n",
"
\n",
" \n",
" | 1 | \n",
" Clothing ID | \n",
" 0 | \n",
" 0.00 | \n",
"
\n",
" \n",
" | 2 | \n",
" Age | \n",
" 0 | \n",
" 0.00 | \n",
"
\n",
" \n",
" | 5 | \n",
" Rating | \n",
" 0 | \n",
" 0.00 | \n",
"
\n",
" \n",
" | 6 | \n",
" Recommended IND | \n",
" 0 | \n",
" 0.00 | \n",
"
\n",
" \n",
" | 7 | \n",
" Positive Feedback Count | \n",
" 0 | \n",
" 0.00 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "missing_summary",
"summary": "{\n \"name\": \"missing_summary\",\n \"rows\": 11,\n \"fields\": [\n {\n \"column\": \"column\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 11,\n \"samples\": [\n \"Unnamed: 0\",\n \"Title\",\n \"Recommended IND\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"missing_count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1150,\n \"min\": 0,\n \"max\": 3810,\n \"num_unique_values\": 4,\n \"samples\": [\n 845,\n 0,\n 3810\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"missing_pct\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 4.895871358975474,\n \"min\": 0.0,\n \"max\": 16.22,\n \"num_unique_values\": 4,\n \"samples\": [\n 3.6,\n 0.0,\n 16.22\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"source": [
"## 1.3 Data cleaning & feature engineering"
],
"metadata": {
"id": "EfQ7CQguINjr"
}
},
{
"cell_type": "code",
"source": [
"# Work on a copy so the raw frame `df` is left untouched; the exported\n",
"# index column is dropped when present (errors=\"ignore\" tolerates its absence).\n",
"df_clean = df.copy().drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n",
"\n",
"# Placeholder substituted for missing values, per column:\n",
"# free-text columns first, then the category columns.\n",
"placeholders = {\n",
"    \"Title\": \"No Title\",\n",
"    \"Review Text\": \"No Review\",\n",
"    \"Department Name\": \"Unknown\",\n",
"    \"Class Name\": \"Unknown\",\n",
"    \"Division Name\": \"Unknown\",\n",
"}\n",
"for column_name, placeholder in placeholders.items():\n",
"    df_clean[column_name] = df_clean[column_name].fillna(placeholder)\n",
"\n",
"# Rule-based sentiment label: ratings of 4 and above count as positive,\n",
"# everything else (including 3) as negative.\n",
"df_clean[\"sentiment\"] = (\n",
"    df_clean[\"Rating\"]\n",
"    .ge(4)\n",
"    .map({True: \"positive\", False: \"negative\"})\n",
")\n",
"\n",
"# Engagement score = positive feedback count + recommendation flag.\n",
"df_clean[\"engagement_score\"] = df_clean[\"Positive Feedback Count\"].add(\n",
"    df_clean[\"Recommended IND\"]\n",
")\n",
"\n",
"print(\"Cleaned shape:\", df_clean.shape)\n",
"display(df_clean.head())\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 643
},
"id": "da3lANHOnV13",
"outputId": "c76e93fa-6555-4e73-a99e-866c9f076640"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Cleaned shape: (23486, 12)\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" Clothing ID Age Title \\\n",
"0 767 33 No Title \n",
"1 1080 34 No Title \n",
"2 1077 60 Some major design flaws \n",
"3 1049 50 My favorite buy! \n",
"4 847 47 Flattering shirt \n",
"\n",
" Review Text Rating Recommended IND \\\n",
"0 Absolutely wonderful - silky and sexy and comf... 4 1 \n",
"1 Love this dress! it's sooo pretty. i happene... 5 1 \n",
"2 I had such high hopes for this dress and reall... 3 0 \n",
"3 I love, love, love this jumpsuit. it's fun, fl... 5 1 \n",
"4 This shirt is very flattering to all due to th... 5 1 \n",
"\n",
" Positive Feedback Count Division Name Department Name Class Name \\\n",
"0 0 Initmates Intimate Intimates \n",
"1 4 General Dresses Dresses \n",
"2 0 General Dresses Dresses \n",
"3 0 General Petite Bottoms Pants \n",
"4 6 General Tops Blouses \n",
"\n",
" sentiment engagement_score \n",
"0 positive 1 \n",
"1 positive 5 \n",
"2 negative 0 \n",
"3 positive 1 \n",
"4 positive 7 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Clothing ID | \n",
" Age | \n",
" Title | \n",
" Review Text | \n",
" Rating | \n",
" Recommended IND | \n",
" Positive Feedback Count | \n",
" Division Name | \n",
" Department Name | \n",
" Class Name | \n",
" sentiment | \n",
" engagement_score | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 767 | \n",
" 33 | \n",
" No Title | \n",
" Absolutely wonderful - silky and sexy and comf... | \n",
" 4 | \n",
" 1 | \n",
" 0 | \n",
" Initmates | \n",
" Intimate | \n",
" Intimates | \n",
" positive | \n",
" 1 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1080 | \n",
" 34 | \n",
" No Title | \n",
" Love this dress! it's sooo pretty. i happene... | \n",
" 5 | \n",
" 1 | \n",
" 4 | \n",
" General | \n",
" Dresses | \n",
" Dresses | \n",
" positive | \n",
" 5 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1077 | \n",
" 60 | \n",
" Some major design flaws | \n",
" I had such high hopes for this dress and reall... | \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" General | \n",
" Dresses | \n",
" Dresses | \n",
" negative | \n",
" 0 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1049 | \n",
" 50 | \n",
" My favorite buy! | \n",
" I love, love, love this jumpsuit. it's fun, fl... | \n",
" 5 | \n",
" 1 | \n",
" 0 | \n",
" General Petite | \n",
" Bottoms | \n",
" Pants | \n",
" positive | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 847 | \n",
" 47 | \n",
" Flattering shirt | \n",
" This shirt is very flattering to all due to th... | \n",
" 5 | \n",
" 1 | \n",
" 6 | \n",
" General | \n",
" Tops | \n",
" Blouses | \n",
" positive | \n",
" 7 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"display(df_clean\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"Clothing ID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 146,\n \"min\": 767,\n \"max\": 1080,\n \"num_unique_values\": 5,\n \"samples\": [\n 1080,\n 847,\n 1077\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 11,\n \"min\": 33,\n \"max\": 60,\n \"num_unique_values\": 5,\n \"samples\": [\n 34,\n 47,\n 60\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Title\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Some major design flaws\",\n \"Flattering shirt\",\n \"No Title\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Review Text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Love this dress! it's sooo pretty. i happened to find it in a store, and i'm glad i did bc i never would have ordered it online bc it's petite. i bought a petite and am 5'8\\\". i love the length on me- hits just a little below the knee. would definitely be a true midi on someone who is truly petite.\",\n \"This shirt is very flattering to all due to the adjustable front tie. it is the perfect length to wear with leggings and it is sleeveless so it pairs well with any cardigan. love this shirt!!!\",\n \"I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i could not zip it up! i reordered it in petite medium, which was just ok. overall, the top half was comfortable and fit nicely, but the bottom half had a very tight under layer and several somewhat cheap (net) over layers. 
imo, a major design flaw was the net over layer sewn directly into the zipper - it c\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 3,\n \"max\": 5,\n \"num_unique_values\": 3,\n \"samples\": [\n 4,\n 5,\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Recommended IND\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 0,\n 1\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Positive Feedback Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2,\n \"min\": 0,\n \"max\": 6,\n \"num_unique_values\": 3,\n \"samples\": [\n 0,\n 4\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Division Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Initmates\",\n \"General\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Department Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Dresses\",\n \"Tops\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Class Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"Dresses\",\n \"Blouses\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"sentiment\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"negative\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"engagement_score\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3,\n \"min\": 0,\n \"max\": 7,\n \"num_unique_values\": 4,\n \"samples\": [\n 5,\n 7\n ],\n \"semantic_type\": \"\",\n 
\"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"source": [
"## 1.4 Product category analysis"
],
"metadata": {
"id": "gf5CqL9IITIu"
}
},
{
"cell_type": "code",
"source": [
"# Review volume and quality metrics per product class, restricted to classes\n",
"# with at least 100 reviews and ordered from best to worst average rating.\n",
"class_summary = (\n",
"    df_clean\n",
"    .groupby(\"Class Name\", as_index=False)\n",
"    .agg(\n",
"        reviews=(\"Rating\", \"count\"),\n",
"        avg_rating=(\"Rating\", \"mean\"),\n",
"        recommendation_rate=(\"Recommended IND\", \"mean\"),\n",
"        avg_feedback=(\"Positive Feedback Count\", \"mean\"),\n",
"    )\n",
"    .loc[lambda summary: summary[\"reviews\"] >= 100]\n",
"    .sort_values(by=\"avg_rating\", ascending=False)\n",
")\n",
"\n",
"# Best-rated and worst-rated classes (the two views overlap when fewer\n",
"# than 20 classes pass the filter).\n",
"display(class_summary.head(10), class_summary.tail(10))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 709
},
"id": "MAW6wxDCnVzA",
"outputId": "7cfc9d0b-783a-41f5-ddbc-8d7b0284b7bd"
},
"execution_count": 13,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
" Class Name reviews avg_rating recommendation_rate avg_feedback\n",
"9 Layering 146 4.376712 0.883562 1.315068\n",
"7 Jeans 1147 4.360942 0.881430 1.759372\n",
"11 Lounge 691 4.301013 0.859624 2.321274\n",
"6 Jackets 704 4.295455 0.845170 2.826705\n",
"16 Sleep 228 4.285088 0.855263 1.750000\n",
"5 Intimates 154 4.279221 0.857143 0.779221\n",
"10 Legwear 165 4.278788 0.860606 1.272727\n",
"13 Pants 1388 4.265850 0.832853 2.396974\n",
"4 Fine gauge 1100 4.260909 0.837273 2.013636\n",
"14 Shorts 317 4.255521 0.839117 1.675079"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Class Name | \n",
" reviews | \n",
" avg_rating | \n",
" recommendation_rate | \n",
" avg_feedback | \n",
"
\n",
" \n",
" \n",
" \n",
" | 9 | \n",
" Layering | \n",
" 146 | \n",
" 4.376712 | \n",
" 0.883562 | \n",
" 1.315068 | \n",
"
\n",
" \n",
" | 7 | \n",
" Jeans | \n",
" 1147 | \n",
" 4.360942 | \n",
" 0.881430 | \n",
" 1.759372 | \n",
"
\n",
" \n",
" | 11 | \n",
" Lounge | \n",
" 691 | \n",
" 4.301013 | \n",
" 0.859624 | \n",
" 2.321274 | \n",
"
\n",
" \n",
" | 6 | \n",
" Jackets | \n",
" 704 | \n",
" 4.295455 | \n",
" 0.845170 | \n",
" 2.826705 | \n",
"
\n",
" \n",
" | 16 | \n",
" Sleep | \n",
" 228 | \n",
" 4.285088 | \n",
" 0.855263 | \n",
" 1.750000 | \n",
"
\n",
" \n",
" | 5 | \n",
" Intimates | \n",
" 154 | \n",
" 4.279221 | \n",
" 0.857143 | \n",
" 0.779221 | \n",
"
\n",
" \n",
" | 10 | \n",
" Legwear | \n",
" 165 | \n",
" 4.278788 | \n",
" 0.860606 | \n",
" 1.272727 | \n",
"
\n",
" \n",
" | 13 | \n",
" Pants | \n",
" 1388 | \n",
" 4.265850 | \n",
" 0.832853 | \n",
" 2.396974 | \n",
"
\n",
" \n",
" | 4 | \n",
" Fine gauge | \n",
" 1100 | \n",
" 4.260909 | \n",
" 0.837273 | \n",
" 2.013636 | \n",
"
\n",
" \n",
" | 14 | \n",
" Shorts | \n",
" 317 | \n",
" 4.255521 | \n",
" 0.839117 | \n",
" 1.675079 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"display(class_summary\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"Class Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Fine gauge\",\n \"Jeans\",\n \"Intimates\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reviews\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 471,\n \"min\": 146,\n \"max\": 1388,\n \"num_unique_values\": 10,\n \"samples\": [\n 1100,\n 1147,\n 154\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"avg_rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.041140875708169405,\n \"min\": 4.255520504731861,\n \"max\": 4.376712328767123,\n \"num_unique_values\": 10,\n \"samples\": [\n 4.260909090909091,\n 4.360941586748038,\n 4.279220779220779\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"recommendation_rate\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.017400503275162123,\n \"min\": 0.8328530259365994,\n \"max\": 0.8835616438356164,\n \"num_unique_values\": 10,\n \"samples\": [\n 0.8372727272727273,\n 0.8814298169136879,\n 0.8571428571428571\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"avg_feedback\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.6056747210865839,\n \"min\": 0.7792207792207793,\n \"max\": 2.8267045454545454,\n \"num_unique_values\": 10,\n \"samples\": [\n 2.0136363636363637,\n 1.7593722755013077,\n 0.7792207792207793\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" Class Name reviews avg_rating recommendation_rate avg_feedback\n",
"4 Fine gauge 1100 4.260909 0.837273 2.013636\n",
"14 Shorts 317 4.255521 0.839117 1.675079\n",
"15 Skirts 945 4.245503 0.845503 2.293122\n",
"12 Outerwear 328 4.198171 0.817073 2.823171\n",
"18 Swim 350 4.197143 0.805714 2.142857\n",
"17 Sweaters 1428 4.179272 0.800420 2.208683\n",
"8 Knits 4843 4.161677 0.817675 2.394797\n",
"0 Blouses 3097 4.154020 0.810139 2.725218\n",
"3 Dresses 6319 4.150815 0.808197 3.087514\n",
"19 Trend 119 3.815126 0.739496 3.369748"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Class Name | \n",
" reviews | \n",
" avg_rating | \n",
" recommendation_rate | \n",
" avg_feedback | \n",
"
\n",
" \n",
" \n",
" \n",
" | 4 | \n",
" Fine gauge | \n",
" 1100 | \n",
" 4.260909 | \n",
" 0.837273 | \n",
" 2.013636 | \n",
"
\n",
" \n",
" | 14 | \n",
" Shorts | \n",
" 317 | \n",
" 4.255521 | \n",
" 0.839117 | \n",
" 1.675079 | \n",
"
\n",
" \n",
" | 15 | \n",
" Skirts | \n",
" 945 | \n",
" 4.245503 | \n",
" 0.845503 | \n",
" 2.293122 | \n",
"
\n",
" \n",
" | 12 | \n",
" Outerwear | \n",
" 328 | \n",
" 4.198171 | \n",
" 0.817073 | \n",
" 2.823171 | \n",
"
\n",
" \n",
" | 18 | \n",
" Swim | \n",
" 350 | \n",
" 4.197143 | \n",
" 0.805714 | \n",
" 2.142857 | \n",
"
\n",
" \n",
" | 17 | \n",
" Sweaters | \n",
" 1428 | \n",
" 4.179272 | \n",
" 0.800420 | \n",
" 2.208683 | \n",
"
\n",
" \n",
" | 8 | \n",
" Knits | \n",
" 4843 | \n",
" 4.161677 | \n",
" 0.817675 | \n",
" 2.394797 | \n",
"
\n",
" \n",
" | 0 | \n",
" Blouses | \n",
" 3097 | \n",
" 4.154020 | \n",
" 0.810139 | \n",
" 2.725218 | \n",
"
\n",
" \n",
" | 3 | \n",
" Dresses | \n",
" 6319 | \n",
" 4.150815 | \n",
" 0.808197 | \n",
" 3.087514 | \n",
"
\n",
" \n",
" | 19 | \n",
" Trend | \n",
" 119 | \n",
" 3.815126 | \n",
" 0.739496 | \n",
" 3.369748 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"display(class_summary\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"Class Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Dresses\",\n \"Shorts\",\n \"Sweaters\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reviews\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2158,\n \"min\": 119,\n \"max\": 6319,\n \"num_unique_values\": 10,\n \"samples\": [\n 6319,\n 317,\n 1428\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"avg_rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.12860513401132947,\n \"min\": 3.8151260504201683,\n \"max\": 4.260909090909091,\n \"num_unique_values\": 10,\n \"samples\": [\n 4.150815002373793,\n 4.255520504731861,\n 4.179271708683474\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"recommendation_rate\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.02981935453944708,\n \"min\": 0.7394957983193278,\n \"max\": 0.8455026455026455,\n \"num_unique_values\": 10,\n \"samples\": [\n 0.8081974996043678,\n 0.8391167192429022,\n 0.8004201680672269\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"avg_feedback\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5199400215099952,\n \"min\": 1.6750788643533123,\n \"max\": 3.369747899159664,\n \"num_unique_values\": 10,\n \"samples\": [\n 3.08751384712771,\n 1.6750788643533123,\n 2.208683473389356\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"source": [
"## 1.5 Negative review text analysis"
],
"metadata": {
"id": "0r3IC4d4IZBj"
}
},
{
"cell_type": "code",
"source": [
"# Word frequencies across the reviews labelled \"negative\" by the rating rule.\n",
"negative_reviews = df_clean[df_clean[\"sentiment\"] == \"negative\"].copy()\n",
"\n",
"print(\"Negative reviews:\", negative_reviews.shape)\n",
"\n",
"negative_reviews[\"Review Text\"] = negative_reviews[\"Review Text\"].astype(str)\n",
"\n",
"# Skip the \"No Review\" placeholder that the cleaning step substitutes for\n",
"# missing text; otherwise \"no\" and \"review\" are counted as if customers\n",
"# had written them.\n",
"written_reviews = negative_reviews.loc[\n",
"    negative_reviews[\"Review Text\"] != \"No Review\",\n",
"    \"Review Text\",\n",
"]\n",
"\n",
"# NOTE: tokens are split on whitespace only, so punctuation stays attached\n",
"# to words and stop words are not filtered out.\n",
"common_words = (\n",
"    written_reviews\n",
"    .str.lower()\n",
"    .str.split()\n",
"    .explode()  # one row per token, without building a wide intermediate frame\n",
"    .value_counts()\n",
"    .rename_axis(None)  # keep the index unnamed, as the export step expects\n",
"    .head(30)\n",
")\n",
"\n",
"display(common_words)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "C6TafBY7nVv3",
"outputId": "f45d6bed-bd34-4a3a-e185-35132ffdc3e3"
},
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Negative reviews: (5278, 12)\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"the 20383\n",
"i 13685\n",
"and 10365\n",
"it 9411\n",
"a 8957\n",
"is 6315\n",
"this 5886\n",
"to 5825\n",
"was 4965\n",
"in 4754\n",
"but 4483\n",
"on 3695\n",
"of 3609\n",
"for 3067\n",
"not 2993\n",
"so 2563\n",
"my 2405\n",
"that 2155\n",
"like 2117\n",
"have 2041\n",
"very 2005\n",
"with 1924\n",
"dress 1913\n",
"too 1764\n",
"would 1687\n",
"be 1609\n",
"just 1581\n",
"as 1519\n",
"are 1484\n",
"top 1418\n",
"Name: count, dtype: int64"
],
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
"
\n",
" \n",
" \n",
" \n",
" | the | \n",
" 20383 | \n",
"
\n",
" \n",
" | i | \n",
" 13685 | \n",
"
\n",
" \n",
" | and | \n",
" 10365 | \n",
"
\n",
" \n",
" | it | \n",
" 9411 | \n",
"
\n",
" \n",
" | a | \n",
" 8957 | \n",
"
\n",
" \n",
" | is | \n",
" 6315 | \n",
"
\n",
" \n",
" | this | \n",
" 5886 | \n",
"
\n",
" \n",
" | to | \n",
" 5825 | \n",
"
\n",
" \n",
" | was | \n",
" 4965 | \n",
"
\n",
" \n",
" | in | \n",
" 4754 | \n",
"
\n",
" \n",
" | but | \n",
" 4483 | \n",
"
\n",
" \n",
" | on | \n",
" 3695 | \n",
"
\n",
" \n",
" | of | \n",
" 3609 | \n",
"
\n",
" \n",
" | for | \n",
" 3067 | \n",
"
\n",
" \n",
" | not | \n",
" 2993 | \n",
"
\n",
" \n",
" | so | \n",
" 2563 | \n",
"
\n",
" \n",
" | my | \n",
" 2405 | \n",
"
\n",
" \n",
" | that | \n",
" 2155 | \n",
"
\n",
" \n",
" | like | \n",
" 2117 | \n",
"
\n",
" \n",
" | have | \n",
" 2041 | \n",
"
\n",
" \n",
" | very | \n",
" 2005 | \n",
"
\n",
" \n",
" | with | \n",
" 1924 | \n",
"
\n",
" \n",
" | dress | \n",
" 1913 | \n",
"
\n",
" \n",
" | too | \n",
" 1764 | \n",
"
\n",
" \n",
" | would | \n",
" 1687 | \n",
"
\n",
" \n",
" | be | \n",
" 1609 | \n",
"
\n",
" \n",
" | just | \n",
" 1581 | \n",
"
\n",
" \n",
" | as | \n",
" 1519 | \n",
"
\n",
" \n",
" | are | \n",
" 1484 | \n",
"
\n",
" \n",
" | top | \n",
" 1418 | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"metadata": {}
}
]
},
{
"cell_type": "markdown",
"source": [
"## 1.6 Synthetic dataset generation"
],
"metadata": {
"id": "RrZfwc7SIdmx"
}
},
{
"cell_type": "code",
"source": [
"# Synthetic placeholder data: every \"predicted_*\" column below is an\n",
"# independent random draw, not the output of a model. numpy and pandas come\n",
"# from the setup cell.\n",
"N_SYNTHETIC_CUSTOMERS = 500\n",
"\n",
"# Re-seed so this cell produces the same rows regardless of what ran before it.\n",
"np.random.seed(42)\n",
"\n",
"synthetic_df = pd.DataFrame({\n",
"    \"customer_id\": range(1, N_SYNTHETIC_CUSTOMERS + 1),\n",
"    \"predicted_return_risk\": np.random.choice(\n",
"        [\"low\", \"medium\", \"high\"],\n",
"        size=N_SYNTHETIC_CUSTOMERS,\n",
"        p=[0.5, 0.3, 0.2]\n",
"    ),\n",
"    \"predicted_size_issue\": np.random.choice(\n",
"        [\"yes\", \"no\"],\n",
"        size=N_SYNTHETIC_CUSTOMERS,\n",
"        p=[0.25, 0.75]\n",
"    ),\n",
"    # randint's upper bound is exclusive, so scores range from 1 to 5\n",
"    \"predicted_satisfaction_next_purchase\": np.random.randint(\n",
"        1, 6, N_SYNTHETIC_CUSTOMERS\n",
"    )\n",
"})\n",
"\n",
"print(synthetic_df.shape)\n",
"display(synthetic_df.head())\n",
"\n",
"# Save outputs for the Hugging Face app / next analysis notebook\n",
"df_clean.to_csv(DATA_PROCESSED / \"reviews_cleaned.csv\", index=False)\n",
"class_summary.to_csv(DATA_PROCESSED / \"class_summary.csv\", index=False)\n",
"synthetic_df.to_csv(DATA_PROCESSED / \"synthetic_return_risk.csv\", index=False)\n",
"returns_df.to_csv(DATA_PROCESSED / \"returns_input.csv\", index=False)\n",
"\n",
"# Name the two output columns explicitly instead of renaming whatever labels\n",
"# reset_index() happens to produce, which differ between pandas versions.\n",
"(\n",
"    common_words\n",
"    .rename_axis(\"word\")\n",
"    .reset_index(name=\"count\")\n",
"    .to_csv(DATA_PROCESSED / \"common_negative_words.csv\", index=False)\n",
")\n",
"\n",
"print(\"Saved processed files to:\", DATA_PROCESSED)\n",
"print([p.name for p in DATA_PROCESSED.glob(\"*.csv\")])\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 280
},
"id": "gmcDmANBnVsP",
"outputId": "f831e19b-889c-4610-c348-a50ff7025ac8"
},
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"(500, 4)\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" customer_id predicted_return_risk predicted_size_issue \\\n",
"0 1 low no \n",
"1 2 high no \n",
"2 3 medium no \n",
"3 4 medium no \n",
"4 5 low no \n",
"\n",
" predicted_satisfaction_next_purchase \n",
"0 4 \n",
"1 3 \n",
"2 5 \n",
"3 1 \n",
"4 5 "
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" customer_id | \n",
" predicted_return_risk | \n",
" predicted_size_issue | \n",
" predicted_satisfaction_next_purchase | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" low | \n",
" no | \n",
" 4 | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" high | \n",
" no | \n",
" 3 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" medium | \n",
" no | \n",
" 5 | \n",
"
\n",
" \n",
" | 3 | \n",
" 4 | \n",
" medium | \n",
" no | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 5 | \n",
" low | \n",
" no | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"summary": "{\n \"name\": \"print([p\",\n \"rows\": 5,\n \"fields\": [\n {\n \"column\": \"customer_id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 5,\n \"samples\": [\n 2,\n 5,\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_return_risk\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"low\",\n \"high\",\n \"medium\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_size_issue\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"no\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"predicted_satisfaction_next_purchase\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 1,\n \"max\": 5,\n \"num_unique_values\": 4,\n \"samples\": [\n 3\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Saved processed files to: /content/data_processed\n",
"['synthetic_return_risk.csv', 'common_negative_words.csv', 'reviews_cleaned.csv', 'class_summary.csv', 'returns_input.csv']\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"## 1.7 Sentiment analysis with VADER"
],
"metadata": {
"id": "jv1UG6gdIhu2"
}
},
{
"cell_type": "code",
"source": [
"# ==================================================\n",
"# SENTIMENT ANALYSIS WITH VADER\n",
"# ==================================================\n",
"\n",
"# %pip (rather than !pip) installs into the environment of the running kernel;\n",
"# the version is pinned so every re-run uses the same release.\n",
"%pip install --quiet vaderSentiment==3.3.2\n",
"\n",
"from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer\n",
"\n",
"analyzer = SentimentIntensityAnalyzer()\n",
"\n",
"# Cut-offs applied to VADER's compound score.\n",
"VADER_POSITIVE_MIN = 0.05\n",
"VADER_NEGATIVE_MAX = -0.05\n",
"\n",
"\n",
"def get_vader_sentiment(text):\n",
"    \"\"\"Label one review as \"positive\", \"negative\" or \"neutral\" using VADER.\n",
"\n",
"    Non-string values and the \"No Review\" placeholder are returned as\n",
"    \"neutral\" without being scored. Otherwise the compound score decides:\n",
"    >= 0.05 is positive, <= -0.05 is negative, anything between is neutral.\n",
"    \"\"\"\n",
"    # Type check first so the string comparison only ever sees strings.\n",
"    if not isinstance(text, str) or text == \"No Review\":\n",
"        return \"neutral\"\n",
"    score = analyzer.polarity_scores(text)[\"compound\"]\n",
"    if score >= VADER_POSITIVE_MIN:\n",
"        return \"positive\"\n",
"    if score <= VADER_NEGATIVE_MAX:\n",
"        return \"negative\"\n",
"    return \"neutral\"\n",
"\n",
"\n",
"df_clean[\"vader_sentiment\"] = df_clean[\"Review Text\"].apply(get_vader_sentiment)\n",
"\n",
"# Compare rule-based vs NLP-based sentiment\n",
"comparison = pd.crosstab(\n",
"    df_clean[\"sentiment\"],\n",
"    df_clean[\"vader_sentiment\"],\n",
"    margins=True\n",
")\n",
"print(\"Rule-based vs VADER Sentiment Comparison:\")\n",
"display(comparison)\n",
"\n",
"# Distribution of VADER sentiment\n",
"vader_dist = df_clean[\"vader_sentiment\"].value_counts()\n",
"print(\"\\nVADER Sentiment Distribution:\")\n",
"display(vader_dist)\n",
"\n",
"# Average rating per VADER sentiment (validation check)\n",
"vader_rating = (\n",
"    df_clean.groupby(\"vader_sentiment\")[\"Rating\"]\n",
"    .mean()\n",
"    .round(2)\n",
"    .reset_index()\n",
")\n",
"print(\"\\nAverage Rating per VADER Sentiment:\")\n",
"display(vader_rating)\n",
"\n",
"# Save enriched dataset (overwrites the reviews_cleaned.csv written earlier,\n",
"# now including the vader_sentiment column)\n",
"df_clean.to_csv(DATA_PROCESSED / \"reviews_cleaned.csv\", index=False)\n",
"print(\"\\nSaved enriched dataset with VADER sentiment.\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 619
},
"id": "og5QuNju33HT",
"outputId": "a29606b2-0ef4-44d5-f325-0da6c1ff79af"
},
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Rule-based vs VADER Sentiment Comparison:\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"vader_sentiment negative neutral positive All\n",
"sentiment \n",
"negative 992 249 4037 5278\n",
"positive 343 860 17005 18208\n",
"All 1335 1109 21042 23486"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | vader_sentiment | \n",
" negative | \n",
" neutral | \n",
" positive | \n",
" All | \n",
"
\n",
" \n",
" | sentiment | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | negative | \n",
" 992 | \n",
" 249 | \n",
" 4037 | \n",
" 5278 | \n",
"
\n",
" \n",
" | positive | \n",
" 343 | \n",
" 860 | \n",
" 17005 | \n",
" 18208 | \n",
"
\n",
" \n",
" | All | \n",
" 1335 | \n",
" 1109 | \n",
" 21042 | \n",
" 23486 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "comparison",
"summary": "{\n \"name\": \"comparison\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"sentiment\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"negative\",\n \"positive\",\n \"All\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"negative\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 503,\n \"min\": 343,\n \"max\": 1335,\n \"num_unique_values\": 3,\n \"samples\": [\n 992,\n 343,\n 1335\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"neutral\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 442,\n \"min\": 249,\n \"max\": 1109,\n \"num_unique_values\": 3,\n \"samples\": [\n 249,\n 860,\n 1109\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"positive\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 8884,\n \"min\": 4037,\n \"max\": 21042,\n \"num_unique_values\": 3,\n \"samples\": [\n 4037,\n 17005,\n 21042\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"All\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 9368,\n \"min\": 5278,\n \"max\": 23486,\n \"num_unique_values\": 3,\n \"samples\": [\n 5278,\n 18208,\n 23486\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"VADER Sentiment Distribution:\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"vader_sentiment\n",
"positive 21042\n",
"negative 1335\n",
"neutral 1109\n",
"Name: count, dtype: int64"
],
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" count | \n",
"
\n",
" \n",
" | vader_sentiment | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" | positive | \n",
" 21042 | \n",
"
\n",
" \n",
" | negative | \n",
" 1335 | \n",
"
\n",
" \n",
" | neutral | \n",
" 1109 | \n",
"
\n",
" \n",
"
\n",
"
"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Average Rating per VADER Sentiment:\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
" vader_sentiment Rating\n",
"0 negative 2.69\n",
"1 neutral 4.20\n",
"2 positive 4.29"
],
"text/html": [
"\n",
" \n",
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" vader_sentiment | \n",
" Rating | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" negative | \n",
" 2.69 | \n",
"
\n",
" \n",
" | 1 | \n",
" neutral | \n",
" 4.20 | \n",
"
\n",
" \n",
" | 2 | \n",
" positive | \n",
" 4.29 | \n",
"
\n",
" \n",
"
\n",
"
\n",
"
\n",
"
\n"
],
"application/vnd.google.colaboratory.intrinsic+json": {
"type": "dataframe",
"variable_name": "vader_rating",
"summary": "{\n \"name\": \"vader_rating\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"vader_sentiment\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"negative\",\n \"neutral\",\n \"positive\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Rating\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.8989067434018578,\n \"min\": 2.69,\n \"max\": 4.29,\n \"num_unique_values\": 3,\n \"samples\": [\n 2.69,\n 4.2,\n 4.29\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Saved enriched dataset with VADER sentiment.\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Automation Opportunities\n",
"\n",
"## Automation 1 — Review Sentiment Alert\n",
"Automatically flag clothing classes whose average rating drops below 3.5.\n",
"\n",
"## Automation 2 — Product Improvement Suggestions\n",
"Use negative review keywords to automatically suggest:\n",
"- sizing guide improvements\n",
"- fabric description clarification\n",
"- fit recommendations\n",
"- photo quality updates\n",
"\n",
"## Automation 3 — Future Return Risk Dashboard\n",
"Combine real reviews with synthetic future risk signals to monitor:\n",
"- high-risk customer segments\n",
"- classes with repeated size complaints\n",
"- products likely to receive negative reviews next season"
],
"metadata": {
"id": "fmUnLL36pT_z"
}
}
]
}