{ "cells": [ { "cell_type": "markdown", "id": "85361b58", "metadata": { "id": "85361b58" }, "source": [ "# Step 2 — Python Analysis / Modeling\n", "\n", "Clean version for the Hugging Face SE21 app template. It creates dashboard artifacts." ] }, { "cell_type": "markdown", "source": [ "2.1 Environment setup & modeling" ], "metadata": { "id": "WP6akOm_IupL" }, "id": "WP6akOm_IupL" }, { "cell_type": "code", "execution_count": 2, "id": "c88b847c", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "c88b847c", "outputId": "ee60c93f-29b8-4dcc-9fc4-a5b6aa4b9398" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Environment ready.\n", "BASE_PATH: /content\n", "CSV files found:\n", "- /content/Womens Clothing E-Commerce Reviews.csv\n", "- /content/ecommerce_returns_cleaned.csv\n", "Using reviews file: /content/Womens Clothing E-Commerce Reviews.csv\n", "Using returns file: /content/ecommerce_returns_cleaned.csv\n", "Reviews shape: (23486, 10)\n", "Returns shape: (113314, 29)\n", "Reviews columns: ['Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name']\n", "Returns columns: ['order_id', 'order_item_id', 'product_id', 'seller_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_delivered_customer_date', 'order_estimated_delivery_date', 'review_score', 'review_comment_title', 'review_comment_message', 'price', 'freight_value', 'total_cost', 'product_category_name', 'product_name_lenght', 'product_description_lenght', 'product_photos_qty', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm', 'has_review_text', 'review_text_length', 'delivery_delay_days', 'negative_keyword_flag', 'synthetic_return_risk', 'likely_return']\n", "Data loaded and cleaned.\n" ] } ], "source": [ "# ==================================================\n", "# STEP 2: UNIVERSAL ANALYSIS SETUP\n", "# Works in BOTH Hugging Face Spaces and Google Colab\n", "# ==================================================\n", "\n", "import os\n", "import json\n", "import random\n", "import warnings\n", "from pathlib import Path\n", "\n", "os.environ.setdefault(\"MPLCONFIGDIR\", \"/tmp/matplotlib\")\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "random.seed(42)\n", "np.random.seed(42)\n", "\n", "# Pick the correct runtime folder automatically.\n", "# Hugging Face Space uses /app. Colab uses /content.\n", "candidate_roots = [Path(\"/app\"), Path(\"/content\"), Path.cwd(), Path(\"/mnt/data\")]\n", "BASE_PATH = None\n", "\n", "for root in candidate_roots:\n", " if root.exists():\n", " csvs = []\n", " for p in root.rglob(\"*.csv\"):\n", " parts = {part.lower() for part in p.parts}\n", " if \"sample_data\" in parts:\n", " continue\n", " if \"outputs\" in parts or \"figures\" in parts or \"tables\" in parts or \"artifacts\" in parts:\n", " continue\n", " csvs.append(p)\n", " if csvs:\n", " BASE_PATH = root\n", " break\n", "\n", "if BASE_PATH is None:\n", " if Path(\"/app\").exists():\n", " BASE_PATH = Path(\"/app\")\n", " elif Path(\"/content\").exists():\n", " BASE_PATH = Path(\"/content\")\n", " else:\n", " BASE_PATH = Path.cwd()\n", "\n", "DATA_PROCESSED = BASE_PATH / \"data_processed\"\n", "\n", "OUTPUTS = BASE_PATH / \"outputs\"\n", "FIGURES = BASE_PATH / \"figures\"\n", "TABLES = BASE_PATH / \"tables\"\n", "ARTIFACTS = BASE_PATH / \"artifacts\"\n", "\n", "# Extra folders because different templates check different places\n", "OUTPUT_FIGURES = OUTPUTS / \"figures\"\n", "OUTPUT_TABLES = OUTPUTS / \"tables\"\n", "ARTIFACT_FIGURES = ARTIFACTS / \"figures\"\n", "ARTIFACT_TABLES = ARTIFACTS / \"tables\"\n", "\n", "ALL_OUTPUT_DIRS = [\n", " DATA_PROCESSED,\n", " OUTPUTS,\n", " FIGURES,\n", " TABLES,\n", " ARTIFACTS,\n", " OUTPUT_FIGURES,\n", " OUTPUT_TABLES,\n", " ARTIFACT_FIGURES,\n", " ARTIFACT_TABLES,\n", "]\n", "\n", "for folder in ALL_OUTPUT_DIRS:\n", " folder.mkdir(parents=True, exist_ok=True)\n", "\n", "print(\"Environment ready.\")\n", "print(\"BASE_PATH:\", BASE_PATH)\n", "\n", "# Load data created by Step 1 if available.\n", "csv_paths = []\n", "for p in BASE_PATH.rglob(\"*.csv\"):\n", " parts = {part.lower() for part in p.parts}\n", " if \"sample_data\" in parts:\n", " continue\n", " if \"outputs\" in parts or \"figures\" in parts or \"tables\" in parts or \"artifacts\" in parts:\n", " continue\n", " csv_paths.append(p)\n", "\n", "print(\"CSV files found:\")\n", "for p in csv_paths:\n", " print(\"-\", p)\n", "\n", "def first_existing(paths):\n", " for p in paths:\n", " if Path(p).exists():\n", " return Path(p)\n", " return None\n", "\n", "reviews_path = first_existing([\n", " DATA_PROCESSED / \"reviews_cleaned.csv\",\n", " DATA_PROCESSED / \"womens_reviews_cleaned.csv\",\n", " BASE_PATH / \"Womens Clothing E-Commerce Reviews.csv\",\n", "])\n", "\n", "returns_path = first_existing([\n", " DATA_PROCESSED / \"returns_input.csv\",\n", " DATA_PROCESSED / \"returns_cleaned.csv\",\n", " BASE_PATH / \"ecommerce_returns_cleaned.csv\",\n", " DATA_PROCESSED / \"synthetic_return_risk.csv\",\n", "])\n", "\n", "# Fallback search.\n", "if reviews_path is None:\n", " review_matches = [\n", " p for p in csv_paths\n", " if (\"clothing\" in p.name.lower()) or (\"review\" in p.name.lower() and \"return\" not in p.name.lower())\n", " ]\n", " reviews_path = review_matches[0] if review_matches else None\n", "\n", "if returns_path is None:\n", " return_matches = [\n", " p for p in csv_paths\n", " if \"return\" in p.name.lower()\n", " ]\n", " returns_path = return_matches[0] if return_matches else None\n", "\n", "\n", "if returns_path is None:\n", " raise FileNotFoundError(\"Step 2 could not find the ecommerce returns CSV.\")\n", "\n", "print(\"Using reviews file:\", reviews_path)\n", "print(\"Using returns file:\", returns_path)\n", "\n", "reviews_df = pd.read_csv(reviews_path).drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n", "returns_df = pd.read_csv(returns_path).drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n", "\n", "print(\"Reviews shape:\", reviews_df.shape)\n", "print(\"Returns shape:\", returns_df.shape)\n", "print(\"Reviews columns:\", reviews_df.columns.tolist())\n", "print(\"Returns columns:\", returns_df.columns.tolist())\n", "\n", "# Basic cleanup / type safety\n", "for col in [\"Age\", \"Rating\", \"Recommended IND\", \"Positive Feedback Count\"]:\n", " if col in reviews_df.columns:\n", " reviews_df[col] = pd.to_numeric(reviews_df[col], errors=\"coerce\")\n", "\n", "if \"Review Text\" in reviews_df.columns:\n", " reviews_df[\"Review Text\"] = reviews_df[\"Review Text\"].fillna(\"\").astype(str)\n", "\n", "if \"Class Name\" in reviews_df.columns:\n", " reviews_df[\"Class Name\"] = reviews_df[\"Class Name\"].fillna(\"Unknown\").astype(str)\n", "\n", "for col in [\"review_score\", \"likely_return\", \"price\", \"freight_value\", \"delivery_delay_days\", \"synthetic_return_risk\"]:\n", " if col in returns_df.columns:\n", " returns_df[col] = pd.to_numeric(returns_df[col], errors=\"coerce\")\n", "\n", "print(\"Data loaded and cleaned.\")" ] }, { "cell_type": "markdown", "source": [ "2.2 Dashboard visualization" ], "metadata": { "id": "KAj-RspZI2EI" }, "id": "KAj-RspZI2EI" }, { "cell_type": "code", "execution_count": 3, "id": "f9eb3801", "metadata": { "id": "f9eb3801" }, "outputs": [], "source": [ "# ==================================================\n", "# HELPERS: save artifacts where the app can find them\n", "# ==================================================\n", "# ==================================================\n", "# HELPERS: save artifacts everywhere the app may check\n", "# ==================================================\n", "\n", "def safe_write_csv(df, path):\n", " try:\n", " df.to_csv(path)\n", " return True\n", " except Exception as e:\n", " print(f\"Could not save {path}: {e}\")\n", " return False\n", "\n", "\n", "def safe_savefig(path):\n", " try:\n", " plt.savefig(path, dpi=150, bbox_inches=\"tight\")\n", " return True\n", " except Exception as e:\n", " print(f\"Could not save {path}: {e}\")\n", " return False\n", "\n", "\n", "def safe_write_text(text, path):\n", " try:\n", " path.write_text(text, encoding=\"utf-8\")\n", " return True\n", " except Exception as e:\n", " print(f\"Could not save {path}: {e}\")\n", " return False\n", "\n", "\n", "def save_table(df, name):\n", " if isinstance(df, pd.Series):\n", " df = df.to_frame()\n", "\n", " table_folders = [\n", " TABLES,\n", " OUTPUT_TABLES,\n", " OUTPUTS,\n", " ARTIFACT_TABLES,\n", " ARTIFACTS,\n", " ]\n", "\n", " saved_anywhere = False\n", "\n", " for folder in table_folders:\n", " folder.mkdir(parents=True, exist_ok=True)\n", " path = folder / f\"{name}.csv\"\n", " saved_anywhere = safe_write_csv(df, path) or saved_anywhere\n", "\n", " if saved_anywhere:\n", " print(f\"Saved table everywhere: {name}.csv\")\n", " else:\n", " raise RuntimeError(f\"Could not save table {name}.csv\")\n", "\n", "\n", "def save_figure(name):\n", " figure_folders = [\n", " FIGURES,\n", " OUTPUT_FIGURES,\n", " OUTPUTS,\n", " ARTIFACT_FIGURES,\n", " ARTIFACTS,\n", " ]\n", "\n", " saved_anywhere = False\n", "\n", " for folder in figure_folders:\n", " folder.mkdir(parents=True, exist_ok=True)\n", " path = folder / f\"{name}.png\"\n", " saved_anywhere = safe_savefig(path) or saved_anywhere\n", "\n", " if saved_anywhere:\n", " print(f\"Saved figure everywhere: {name}.png\")\n", " else:\n", " raise RuntimeError(f\"Could not save figure {name}.png\")\n", "\n", "\n", "def save_text(text, name):\n", " text_folders = [\n", " TABLES,\n", " OUTPUT_TABLES,\n", " OUTPUTS,\n", " ARTIFACT_TABLES,\n", " ARTIFACTS,\n", " ]\n", "\n", " saved_anywhere = False\n", "\n", " for folder in text_folders:\n", " folder.mkdir(parents=True, exist_ok=True)\n", " path = folder / f\"{name}.txt\"\n", " saved_anywhere = safe_write_text(text, path) or saved_anywhere\n", "\n", " if saved_anywhere:\n", " print(f\"Saved text everywhere: {name}.txt\")\n", " else:\n", " raise RuntimeError(f\"Could not save text {name}.txt\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "a99949ac", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "a99949ac", "outputId": "012f8a9b-e72b-4288-d471-de548e9d35e0" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Saved table everywhere: rating_distribution.csv\n", "Saved figure everywhere: rating_distribution.png\n", "Saved table everywhere: recommendation_by_class.csv\n", "Saved figure everywhere: recommendation_by_class.png\n", "Saved table everywhere: average_rating_by_age.csv\n", "Saved figure everywhere: average_rating_by_age.png\n", "Saved table everywhere: negative_keyword_counts.csv\n", "Saved figure everywhere: negative_keyword_counts.png\n", "Saved table everywhere: category_return_rate.csv\n", "Saved figure everywhere: category_return_rate.png\n", "Saved table everywhere: monthly_return_rate.csv\n", "Saved figure everywhere: monthly_return_rate.png\n", "Saved table everywhere: feature_importance.csv\n", "Saved figure everywhere: feature_importance.png\n", "Saved text everywhere: classification_report.txt\n", "Artifact creation section finished.\n" ] } ], "source": [ "# ==================================================\n", "# CREATE DASHBOARD ARTIFACTS\n", "# ==================================================\n", "\n", "created_figures = []\n", "created_tables = []\n", "\n", "# 1) Rating distribution\n", "if \"Rating\" in reviews_df.columns:\n", " rating_distribution = reviews_df[\"Rating\"].dropna().value_counts().sort_index().to_frame(\"count\")\n", " save_table(rating_distribution, \"rating_distribution\")\n", " created_tables.append(\"rating_distribution.csv\")\n", "\n", " plt.figure(figsize=(7, 4))\n", " plt.bar(rating_distribution.index.astype(str), rating_distribution[\"count\"])\n", " plt.title(\"Distribution of Customer Ratings\")\n", " plt.xlabel(\"Rating\")\n", " plt.ylabel(\"Number of Reviews\")\n", " plt.tight_layout()\n", " save_figure(\"rating_distribution\")\n", " created_figures.append(\"rating_distribution.png\")\n", " plt.close()\n", "\n", "# 2) Recommendation rate by clothing class\n", "if {\"Class Name\", \"Recommended IND\"}.issubset(reviews_df.columns):\n", " recommendation_by_class = (\n", " reviews_df.groupby(\"Class Name\")[\"Recommended IND\"]\n", " .mean()\n", " .sort_values(ascending=False)\n", " .head(10)\n", " .to_frame(\"recommendation_rate\")\n", " )\n", " save_table(recommendation_by_class, \"recommendation_by_class\")\n", " created_tables.append(\"recommendation_by_class.csv\")\n", "\n", " plt.figure(figsize=(10, 5))\n", " plt.bar(recommendation_by_class.index.astype(str), recommendation_by_class[\"recommendation_rate\"])\n", " plt.title(\"Top 10 Most Recommended Clothing Classes\")\n", " plt.xlabel(\"Class Name\")\n", " plt.ylabel(\"Recommendation Rate\")\n", " plt.xticks(rotation=75)\n", " plt.tight_layout()\n", " save_figure(\"recommendation_by_class\")\n", " created_figures.append(\"recommendation_by_class.png\")\n", " plt.close()\n", "\n", "# 3) Average rating by age\n", "if {\"Age\", \"Rating\"}.issubset(reviews_df.columns):\n", " average_rating_by_age = (\n", " reviews_df.groupby(\"Age\")[\"Rating\"]\n", " .mean()\n", " .dropna()\n", " .to_frame(\"average_rating\")\n", " )\n", " save_table(average_rating_by_age, \"average_rating_by_age\")\n", " created_tables.append(\"average_rating_by_age.csv\")\n", "\n", " plt.figure(figsize=(10, 4))\n", " plt.plot(average_rating_by_age.index, average_rating_by_age[\"average_rating\"])\n", " plt.title(\"Average Rating by Customer Age\")\n", " plt.xlabel(\"Age\")\n", " plt.ylabel(\"Average Rating\")\n", " plt.tight_layout()\n", " save_figure(\"average_rating_by_age\")\n", " created_figures.append(\"average_rating_by_age.png\")\n", " plt.close()\n", "\n", "# 4) Complaint / return-risk keyword counts\n", "review_text_column = None\n", "for candidate in [\"Review Text\", \"review_text\", \"review_comment_message\"]:\n", " if candidate in reviews_df.columns:\n", " review_text_column = candidate\n", " break\n", "\n", "if review_text_column is not None:\n", " keywords = [\n", " \"bad\", \"poor\", \"cheap\", \"small\", \"large\", \"tight\", \"loose\",\n", " \"scratchy\", \"thin\", \"return\", \"returned\", \"disappointed\",\n", " \"quality\", \"fit\", \"sizing\", \"fabric\", \"uncomfortable\"\n", " ]\n", " text_series = reviews_df[review_text_column].fillna(\"\").astype(str).str.lower()\n", " keyword_counts = {}\n", " for word in keywords:\n", " keyword_counts[word] = int(text_series.str.contains(word, regex=False).sum())\n", "\n", " negative_keyword_counts = (\n", " pd.DataFrame(keyword_counts.items(), columns=[\"keyword\", \"review_count\"])\n", " .sort_values(\"review_count\", ascending=False)\n", " .set_index(\"keyword\")\n", " )\n", " save_table(negative_keyword_counts, \"negative_keyword_counts\")\n", " created_tables.append(\"negative_keyword_counts.csv\")\n", "\n", " top_keywords = negative_keyword_counts.head(10)\n", " plt.figure(figsize=(9, 4))\n", " plt.bar(top_keywords.index.astype(str), top_keywords[\"review_count\"])\n", " plt.title(\"Most Common Return-Risk Keywords in Reviews\")\n", " plt.xlabel(\"Keyword\")\n", " plt.ylabel(\"Number of Reviews\")\n", " plt.xticks(rotation=45)\n", " plt.tight_layout()\n", " save_figure(\"negative_keyword_counts\")\n", " created_figures.append(\"negative_keyword_counts.png\")\n", " plt.close()\n", "\n", "# 5) Product category return rate\n", "if {\"product_category_name\", \"likely_return\"}.issubset(returns_df.columns):\n", " category_return_rate = (\n", " returns_df.groupby(\"product_category_name\")[\"likely_return\"]\n", " .mean()\n", " .sort_values(ascending=False)\n", " .head(15)\n", " .to_frame(\"return_rate\")\n", " )\n", " save_table(category_return_rate, \"category_return_rate\")\n", " created_tables.append(\"category_return_rate.csv\")\n", "\n", " plt.figure(figsize=(11, 5))\n", " plt.bar(category_return_rate.index.astype(str), category_return_rate[\"return_rate\"])\n", " plt.title(\"Top Product Categories by Estimated Return Rate\")\n", " plt.xlabel(\"Product Category\")\n", " plt.ylabel(\"Return Rate\")\n", " plt.xticks(rotation=75)\n", " plt.tight_layout()\n", " save_figure(\"category_return_rate\")\n", " created_figures.append(\"category_return_rate.png\")\n", " plt.close()\n", "\n", "# 6) Monthly return rate\n", "if {\"order_purchase_timestamp\", \"likely_return\"}.issubset(returns_df.columns):\n", " monthly_df = returns_df.copy()\n", " monthly_df[\"order_purchase_timestamp\"] = pd.to_datetime(monthly_df[\"order_purchase_timestamp\"], errors=\"coerce\")\n", " monthly_df = monthly_df.dropna(subset=[\"order_purchase_timestamp\"])\n", "\n", " if len(monthly_df) > 0:\n", " monthly_return_rate = (\n", " monthly_df.set_index(\"order_purchase_timestamp\")\n", " .resample(\"M\")[\"likely_return\"]\n", " .mean()\n", " .dropna()\n", " .to_frame(\"return_rate\")\n", " )\n", " save_table(monthly_return_rate, \"monthly_return_rate\")\n", " created_tables.append(\"monthly_return_rate.csv\")\n", "\n", " plt.figure(figsize=(10, 4))\n", " plt.plot(monthly_return_rate.index, monthly_return_rate[\"return_rate\"])\n", " plt.title(\"Monthly Estimated Return Rate\")\n", " plt.xlabel(\"Month\")\n", " plt.ylabel(\"Return Rate\")\n", " plt.tight_layout()\n", " save_figure(\"monthly_return_rate\")\n", " created_figures.append(\"monthly_return_rate.png\")\n", " plt.close()\n", "\n", "# 7) Simple feature importance if sklearn is available\n", "try:\n", " from sklearn.ensemble import RandomForestClassifier\n", " from sklearn.model_selection import train_test_split\n", " from sklearn.metrics import accuracy_score, classification_report\n", "\n", " feature_columns = [c for c in [\"Age\", \"Rating\", \"Positive Feedback Count\"] if c in reviews_df.columns]\n", " if \"Recommended IND\" in reviews_df.columns and len(feature_columns) > 0:\n", " model_df = reviews_df[feature_columns + [\"Recommended IND\"]].dropna().copy()\n", " if model_df[\"Recommended IND\"].nunique() >= 2:\n", " X = model_df[feature_columns]\n", " y = model_df[\"Recommended IND\"].astype(int)\n", " X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=0.2, random_state=42, stratify=y\n", " )\n", "\n", " clf = RandomForestClassifier(n_estimators=100, random_state=42)\n", " clf.fit(X_train, y_train)\n", " predictions = clf.predict(X_test)\n", " accuracy = accuracy_score(y_test, predictions)\n", "\n", " feature_importance = (\n", " pd.Series(clf.feature_importances_, index=feature_columns)\n", " .sort_values(ascending=False)\n", " .to_frame(\"importance\")\n", " )\n", " save_table(feature_importance, \"feature_importance\")\n", " created_tables.append(\"feature_importance.csv\")\n", "\n", " plt.figure(figsize=(7, 4))\n", " plt.bar(feature_importance.index.astype(str), feature_importance[\"importance\"])\n", " plt.title(\"Feature Importance for Recommendation Prediction\")\n", " plt.xlabel(\"Feature\")\n", " plt.ylabel(\"Importance\")\n", " plt.tight_layout()\n", " save_figure(\"feature_importance\")\n", " created_figures.append(\"feature_importance.png\")\n", " plt.close()\n", "\n", " report = \"Model accuracy: {:.4f}\\n\\n{}\".format(\n", " accuracy,\n", " classification_report(y_test, predictions)\n", " )\n", " save_text(report, \"classification_report\")\n", "except Exception as e:\n", " print(\"ML section skipped:\", repr(e))\n", "\n", "print(\"Artifact creation section finished.\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "c4bbc916", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "c4bbc916", "outputId": "1b70021e-6125-457a-bcd8-be0c896dfa58" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Saved table everywhere: dashboard_summary.csv\n", "Saved text everywhere: business_insights_report.txt\n", "STEP 2 COMPLETE.\n", "Figures: ['average_rating_by_age.png', 'category_return_rate.png', 'feature_importance.png', 'monthly_return_rate.png', 'negative_keyword_counts.png', 'rating_distribution.png', 'recommendation_by_class.png']\n", "Tables: ['average_rating_by_age.csv', 'category_return_rate.csv', 'dashboard_summary.csv', 'feature_importance.csv', 'monthly_return_rate.csv', 'negative_keyword_counts.csv', 'rating_distribution.csv', 'recommendation_by_class.csv']\n", "Outputs: ['average_rating_by_age.csv', 'average_rating_by_age.png', 'business_insights_report.txt', 'category_return_rate.csv', 'category_return_rate.png', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'feature_importance.png', 'monthly_return_rate.csv', 'monthly_return_rate.png', 'negative_keyword_counts.csv', 'negative_keyword_counts.png', 'rating_distribution.csv', 'rating_distribution.png', 'recommendation_by_class.csv', 'recommendation_by_class.png']\n" ] } ], "source": [ "# ==================================================\n", "# FINAL REPORT + MANIFEST\n", "# ==================================================\n", "\n", "summary_rows = [\n", " {\"metric\": \"reviews_rows\", \"value\": int(len(reviews_df))},\n", " {\"metric\": \"returns_rows\", \"value\": int(len(returns_df))},\n", " {\"metric\": \"figures_created\", \"value\": int(len(list(FIGURES.glob(\"*.png\"))))},\n", " {\"metric\": \"tables_created\", \"value\": int(len(list(TABLES.glob(\"*.csv\"))))},\n", "]\n", "\n", "summary_df = pd.DataFrame(summary_rows).set_index(\"metric\")\n", "save_table(summary_df, \"dashboard_summary\")\n", "\n", "insights = \"\"\"\n", "FINAL BUSINESS INSIGHTS\n", "=======================\n", "\n", "This analysis supports an e-commerce return prediction and review intelligence assistant.\n", "\n", "Main findings:\n", "- Customer ratings and recommendation behavior are useful signals for product satisfaction.\n", "- Review text reveals return-risk themes such as fit, sizing, fabric, quality, and discomfort.\n", "- Product categories with higher estimated return rates should be prioritized for improvement.\n", "- Monthly return-rate tracking can help the business monitor operational or seasonal changes.\n", "\n", "Recommended automations:\n", "1. Automatically scan new reviews for return-risk keywords.\n", "2. Automatically rank products and categories by estimated return risk.\n", "3. Automatically generate business recommendations for product pages, sizing guidance, and quality control.\n", "\"\"\"\n", "\n", "save_text(insights, \"business_insights_report\")\n", "\n", "manifest = {\n", " \"base_path\": str(BASE_PATH),\n", " \"figures\": sorted([p.name for p in FIGURES.glob(\"*.png\")]),\n", " \"tables\": sorted([p.name for p in TABLES.glob(\"*.csv\")]),\n", " \"outputs\": sorted([p.name for p in OUTPUTS.iterdir() if p.is_file()]),\n", "}\n", "\n", "for folder in [OUTPUTS, ARTIFACTS, TABLES]:\n", " try:\n", " with open(folder / \"artifacts_manifest.json\", \"w\", encoding=\"utf-8\") as f:\n", " json.dump(manifest, f, indent=2)\n", " except Exception as e:\n", " print(f\"Could not save manifest in {folder}: {e}\")\n", "\n", "print(\"STEP 2 COMPLETE.\")\n", "print(\"Figures:\", manifest[\"figures\"])\n", "print(\"Tables:\", manifest[\"tables\"])\n", "print(\"Outputs:\", manifest[\"outputs\"])" ] }, { "cell_type": "code", "source": [ "print(\"\\nFINAL ARTIFACT CHECK\")\n", "\n", "check_dirs = {\n", " \"FIGURES\": FIGURES,\n", " \"TABLES\": TABLES,\n", " \"OUTPUTS\": OUTPUTS,\n", " \"OUTPUT_FIGURES\": OUTPUT_FIGURES,\n", " \"OUTPUT_TABLES\": OUTPUT_TABLES,\n", " \"ARTIFACTS\": ARTIFACTS,\n", " \"ARTIFACT_FIGURES\": ARTIFACT_FIGURES,\n", " \"ARTIFACT_TABLES\": ARTIFACT_TABLES,\n", "}\n", "\n", "for label, folder in check_dirs.items():\n", " files = sorted([p.name for p in folder.iterdir() if p.is_file()])\n", " print(label, \"=\", files)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "fexa62gDM2c7", "outputId": "92a75649-5c5b-4202-ba1c-839682d28eee" }, "id": "fexa62gDM2c7", "execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "FINAL ARTIFACT CHECK\n", "FIGURES = ['average_rating_by_age.png', 'category_return_rate.png', 'feature_importance.png', 'monthly_return_rate.png', 'negative_keyword_counts.png', 'rating_distribution.png', 'recommendation_by_class.png']\n", "TABLES = ['artifacts_manifest.json', 'average_rating_by_age.csv', 'business_insights_report.txt', 'category_return_rate.csv', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'monthly_return_rate.csv', 'negative_keyword_counts.csv', 'rating_distribution.csv', 'recommendation_by_class.csv']\n", "OUTPUTS = ['artifacts_manifest.json', 'average_rating_by_age.csv', 'average_rating_by_age.png', 'business_insights_report.txt', 'category_return_rate.csv', 'category_return_rate.png', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'feature_importance.png', 'monthly_return_rate.csv', 'monthly_return_rate.png', 'negative_keyword_counts.csv', 'negative_keyword_counts.png', 'rating_distribution.csv', 'rating_distribution.png', 'recommendation_by_class.csv', 'recommendation_by_class.png']\n", "OUTPUT_FIGURES = ['average_rating_by_age.png', 'category_return_rate.png', 'feature_importance.png', 'monthly_return_rate.png', 'negative_keyword_counts.png', 'rating_distribution.png', 'recommendation_by_class.png']\n", "OUTPUT_TABLES = ['average_rating_by_age.csv', 'business_insights_report.txt', 'category_return_rate.csv', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'monthly_return_rate.csv', 'negative_keyword_counts.csv', 'rating_distribution.csv', 'recommendation_by_class.csv']\n", "ARTIFACTS = ['artifacts_manifest.json', 'average_rating_by_age.csv', 'average_rating_by_age.png', 'business_insights_report.txt', 'category_return_rate.csv', 'category_return_rate.png', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'feature_importance.png', 'monthly_return_rate.csv', 'monthly_return_rate.png', 'negative_keyword_counts.csv', 'negative_keyword_counts.png', 'rating_distribution.csv', 'rating_distribution.png', 'recommendation_by_class.csv', 'recommendation_by_class.png']\n", "ARTIFACT_FIGURES = ['average_rating_by_age.png', 'category_return_rate.png', 'feature_importance.png', 'monthly_return_rate.png', 'negative_keyword_counts.png', 'rating_distribution.png', 'recommendation_by_class.png']\n", "ARTIFACT_TABLES = ['average_rating_by_age.csv', 'business_insights_report.txt', 'category_return_rate.csv', 'classification_report.txt', 'dashboard_summary.csv', 'feature_importance.csv', 'monthly_return_rate.csv', 'negative_keyword_counts.csv', 'rating_distribution.csv', 'recommendation_by_class.csv']\n" ] } ] }, { "cell_type": "code", "source": [ "# ==================================================\n", "# FORCE DASHBOARD ARTIFACTS FOR SE21 HUGGING FACE APP\n", "# Put this as the VERY LAST CELL of pythonanalysis.ipynb\n", "# ==================================================\n", "\n", "import os\n", "import json\n", "from pathlib import Path\n", "\n", "import pandas as pd\n", "import numpy as np\n", "\n", "import matplotlib\n", "matplotlib.use(\"Agg\")\n", "import matplotlib.pyplot as plt\n", "\n", "# Detect runtime\n", "if Path(\"/app\").exists():\n", " BASE_PATH = Path(\"/app\")\n", "elif Path(\"/content\").exists():\n", " BASE_PATH = Path(\"/content\")\n", "else:\n", " BASE_PATH = Path.cwd()\n", "\n", "# THESE ARE THE EXACT FOLDERS app.py READS\n", "PY_FIG_DIR = BASE_PATH / \"artifacts\" / \"py\" / \"figures\"\n", "PY_TAB_DIR = BASE_PATH / \"artifacts\" / \"py\" / \"tables\"\n", "\n", "PY_FIG_DIR.mkdir(parents=True, exist_ok=True)\n", "PY_TAB_DIR.mkdir(parents=True, exist_ok=True)\n", "\n", "print(\"Saving dashboard artifacts to:\")\n", "print(\"Figures:\", PY_FIG_DIR)\n", "print(\"Tables:\", PY_TAB_DIR)\n", "\n", "# Find CSV files\n", "csv_paths = [\n", " p for p in BASE_PATH.rglob(\"*.csv\")\n", " if \"sample_data\" not in str(p)\n", " and \"artifacts\" not in str(p)\n", " and \"outputs\" not in str(p)\n", " and \"figures\" not in str(p)\n", " and \"tables\" not in str(p)\n", "]\n", "\n", "print(\"CSV files found:\")\n", "for p in csv_paths:\n", " print(\"-\", p)\n", "\n", "# Find reviews dataset\n", "reviews_candidates = [\n", " BASE_PATH / \"data_processed\" / \"reviews_cleaned.csv\",\n", " BASE_PATH / \"Womens Clothing E-Commerce Reviews.csv\",\n", "]\n", "\n", "reviews_path = next((p for p in reviews_candidates if p.exists()), None)\n", "\n", "if reviews_path is None:\n", " matches = [\n", " p for p in csv_paths\n", " if \"clothing\" in p.name.lower() or \"review\" in p.name.lower()\n", " ]\n", " reviews_path = matches[0] if matches else None\n", "\n", "# Find returns dataset\n", "returns_candidates = [\n", " BASE_PATH / \"data_processed\" / \"returns_input.csv\",\n", " BASE_PATH / \"data_processed\" / \"returns_cleaned.csv\",\n", " BASE_PATH / \"ecommerce_returns_cleaned.csv\",\n", " BASE_PATH / \"data_processed\" / \"synthetic_return_risk.csv\",\n", "]\n", "\n", "returns_path = next((p for p in returns_candidates if p.exists()), None)\n", "\n", "if returns_path is None:\n", " matches = [\n", " p for p in csv_paths\n", " if \"return\" in p.name.lower()\n", " ]\n", " returns_path = matches[0] if matches else None\n", "\n", "if reviews_path is None:\n", " raise FileNotFoundError(\"Could not find reviews CSV.\")\n", "\n", "if returns_path is None:\n", " raise FileNotFoundError(\"Could not find returns CSV.\")\n", "\n", "print(\"Using reviews:\", reviews_path)\n", "print(\"Using returns:\", returns_path)\n", "\n", "reviews_df = pd.read_csv(reviews_path).drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n", "returns_df = pd.read_csv(returns_path).drop(columns=[\"Unnamed: 0\"], errors=\"ignore\")\n", "\n", "print(\"Reviews shape:\", reviews_df.shape)\n", "print(\"Returns shape:\", returns_df.shape)\n", "\n", "# --------------------------------------------------\n", "# 1. Rating distribution\n", "# --------------------------------------------------\n", "if \"Rating\" in reviews_df.columns:\n", " rating_distribution = (\n", " reviews_df[\"Rating\"]\n", " .dropna()\n", " .value_counts()\n", " .sort_index()\n", " .reset_index()\n", " )\n", " rating_distribution.columns = [\"rating\", \"count\"]\n", "\n", " rating_distribution.to_csv(PY_TAB_DIR / \"rating_distribution.csv\", index=False)\n", "\n", " plt.figure(figsize=(7, 4))\n", " plt.bar(rating_distribution[\"rating\"].astype(str), rating_distribution[\"count\"])\n", " plt.title(\"Distribution of Customer Ratings\")\n", " plt.xlabel(\"Rating\")\n", " plt.ylabel(\"Number of Reviews\")\n", " plt.tight_layout()\n", " plt.savefig(PY_FIG_DIR / \"rating_distribution.png\", dpi=150, bbox_inches=\"tight\")\n", " plt.close()\n", "\n", "# --------------------------------------------------\n", "# 2. Sentiment counts for app's sentiment chart\n", "# The app specifically looks for sentiment_counts_sampled.csv\n", "# --------------------------------------------------\n", "if \"Rating\" in reviews_df.columns:\n", " temp = reviews_df.copy()\n", "\n", " def rating_to_sentiment(r):\n", " try:\n", " r = float(r)\n", " if r <= 2:\n", " return \"negative\"\n", " elif r == 3:\n", " return \"neutral\"\n", " else:\n", " return \"positive\"\n", " except:\n", " return \"neutral\"\n", "\n", " temp[\"sentiment\"] = temp[\"Rating\"].apply(rating_to_sentiment)\n", "\n", " group_col = \"Class Name\" if \"Class Name\" in temp.columns else None\n", "\n", " if group_col:\n", " sentiment_counts = (\n", " temp.groupby([group_col, \"sentiment\"])\n", " .size()\n", " .unstack(fill_value=0)\n", " .reset_index()\n", " .head(15)\n", " )\n", " sentiment_counts = sentiment_counts.rename(columns={group_col: \"title\"})\n", " else:\n", " sentiment_counts = (\n", " temp[\"sentiment\"]\n", " .value_counts()\n", " .to_frame()\n", " .T\n", " .reset_index(drop=True)\n", " )\n", " sentiment_counts.insert(0, \"title\", \"All Reviews\")\n", "\n", " for col in [\"negative\", \"neutral\", \"positive\"]:\n", " if col not in sentiment_counts.columns:\n", " sentiment_counts[col] = 0\n", "\n", " sentiment_counts[[\"title\", \"negative\", \"neutral\", \"positive\"]].to_csv(\n", " PY_TAB_DIR / \"sentiment_counts_sampled.csv\",\n", " index=False\n", " )\n", "\n", " # Also save a normal figure\n", " sentiment_total = temp[\"sentiment\"].value_counts().reindex(\n", " [\"negative\", \"neutral\", \"positive\"],\n", " fill_value=0\n", " )\n", "\n", " plt.figure(figsize=(7, 4))\n", " plt.bar(sentiment_total.index, sentiment_total.values)\n", " plt.title(\"Review Sentiment Distribution\")\n", " plt.xlabel(\"Sentiment\")\n", " plt.ylabel(\"Number of Reviews\")\n", " plt.tight_layout()\n", " plt.savefig(PY_FIG_DIR / \"sentiment_distribution.png\", dpi=150, bbox_inches=\"tight\")\n", " plt.close()\n", "\n", "# --------------------------------------------------\n", "# 3. Category return rate\n", "# --------------------------------------------------\n", "return_col = None\n", "for candidate in [\"likely_return\", \"synthetic_return_risk\", \"returned\", \"return_flag\"]:\n", " if candidate in returns_df.columns:\n", " return_col = candidate\n", " break\n", "\n", "category_col = None\n", "for candidate in [\"product_category_name\", \"category\", \"Class Name\", \"product_id\"]:\n", " if candidate in returns_df.columns:\n", " category_col = candidate\n", " break\n", "\n", "if return_col is not None:\n", " returns_df[return_col] = pd.to_numeric(returns_df[return_col], errors=\"coerce\")\n", "\n", "if return_col is not None and category_col is not None:\n", " category_return_rate = (\n", " returns_df.groupby(category_col)[return_col]\n", " .mean()\n", " .sort_values(ascending=False)\n", " .head(15)\n", " .reset_index()\n", " )\n", " category_return_rate.columns = [\"category\", \"return_rate\"]\n", "\n", " category_return_rate.to_csv(PY_TAB_DIR / \"category_return_rate.csv\", index=False)\n", "\n", " plt.figure(figsize=(11, 5))\n", " plt.bar(category_return_rate[\"category\"].astype(str), category_return_rate[\"return_rate\"])\n", " plt.title(\"Highest Return-Rate Categories\")\n", " plt.xlabel(\"Category\")\n", " plt.ylabel(\"Return Rate\")\n", " plt.xticks(rotation=75)\n", " plt.tight_layout()\n", " plt.savefig(PY_FIG_DIR / \"category_return_rate.png\", dpi=150, bbox_inches=\"tight\")\n", " plt.close()\n", "\n", " # The template's AI fallback weirdly expects this filename for \"top\" questions.\n", " # We reuse it to show highest return-risk categories.\n", " top_titles_by_units_sold = category_return_rate.copy()\n", " top_titles_by_units_sold.columns = [\"title\", \"units_sold\"]\n", " top_titles_by_units_sold.to_csv(PY_TAB_DIR / \"top_titles_by_units_sold.csv\", index=False)\n", "\n", "# --------------------------------------------------\n", "# 4. Dashboard time-series file\n", "# The app's dashboard chart specifically looks for df_dashboard.csv\n", "# --------------------------------------------------\n", "if \"order_purchase_timestamp\" in returns_df.columns and return_col is not None:\n", " ts = returns_df.copy()\n", " ts[\"order_purchase_timestamp\"] = pd.to_datetime(\n", " ts[\"order_purchase_timestamp\"],\n", " errors=\"coerce\"\n", " )\n", " ts = ts.dropna(subset=[\"order_purchase_timestamp\"])\n", "\n", " if not ts.empty:\n", " dashboard_df = (\n", " ts.set_index(\"order_purchase_timestamp\")\n", " .resample(\"M\")\n", " .agg(\n", " return_rate=(return_col, \"mean\"),\n", " orders=(return_col, \"count\")\n", " )\n", " .reset_index()\n", " )\n", " dashboard_df = dashboard_df.rename(columns={\"order_purchase_timestamp\": \"month\"})\n", " else:\n", " dashboard_df = pd.DataFrame({\n", " \"month\": pd.date_range(\"2024-01-01\", periods=3, freq=\"M\"),\n", " \"return_rate\": [0, 0, 0],\n", " \"orders\": [0, 0, 0],\n", " })\n", "else:\n", " dashboard_df = pd.DataFrame({\n", " \"month\": pd.date_range(\"2024-01-01\", periods=3, freq=\"M\"),\n", " \"return_rate\": [0, 0, 0],\n", " \"orders\": [0, 0, 0],\n", " })\n", "\n", "dashboard_df.to_csv(PY_TAB_DIR / \"df_dashboard.csv\", index=False)\n", "\n", "plt.figure(figsize=(9, 4))\n", "plt.plot(pd.to_datetime(dashboard_df[\"month\"]), dashboard_df[\"return_rate\"], marker=\"o\")\n", "plt.title(\"Monthly Estimated Return Rate\")\n", "plt.xlabel(\"Month\")\n", "plt.ylabel(\"Return Rate\")\n", "plt.tight_layout()\n", "plt.savefig(PY_FIG_DIR / \"monthly_return_rate.png\", dpi=150, bbox_inches=\"tight\")\n", "plt.close()\n", "\n", "# --------------------------------------------------\n", "# 5. KPIs\n", "# --------------------------------------------------\n", "kpis = {\n", " \"reviews_rows\": int(len(reviews_df)),\n", " \"returns_rows\": int(len(returns_df)),\n", " \"n_titles\": int(reviews_df[\"Clothing ID\"].nunique()) if \"Clothing ID\" in reviews_df.columns else int(len(reviews_df)),\n", " \"n_months\": int(len(dashboard_df)),\n", " \"total_units_sold\": int(len(returns_df)),\n", " \"estimated_return_rate\": float(returns_df[return_col].mean()) if return_col is not None else None,\n", "}\n", "\n", "with open(PY_TAB_DIR / \"kpis.json\", \"w\", encoding=\"utf-8\") as f:\n", " json.dump(kpis, f, indent=2)\n", "\n", "# --------------------------------------------------\n", "# Final verification\n", "# --------------------------------------------------\n", "print(\"\\nFORCE ARTIFACT CELL RAN SUCCESSFULLY\")\n", "print(\"Figures now in app-readable folder:\")\n", "print(sorted([p.name for p in PY_FIG_DIR.glob(\"*\")]))\n", "\n", "print(\"Tables now in app-readable folder:\")\n", "print(sorted([p.name for p in PY_TAB_DIR.glob(\"*\")]))" ], "metadata": { "id": "G-jXRriWP1TW", "outputId": "69ac490c-cb7b-479f-b5b9-d85e02b0ba1d", "colab": { "base_uri": "https://localhost:8080/" } }, "id": "G-jXRriWP1TW", "execution_count": 7, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Saving dashboard artifacts to:\n", "Figures: /content/artifacts/py/figures\n", "Tables: /content/artifacts/py/tables\n", "CSV files found:\n", "- /content/Womens Clothing E-Commerce Reviews.csv\n", "- /content/ecommerce_returns_cleaned.csv\n", "Using reviews: /content/Womens Clothing E-Commerce Reviews.csv\n", "Using returns: /content/ecommerce_returns_cleaned.csv\n", "Reviews shape: (23486, 10)\n", "Returns shape: (113314, 29)\n", "\n", "FORCE ARTIFACT CELL RAN SUCCESSFULLY\n", "Figures now in app-readable folder:\n", "['category_return_rate.png', 'monthly_return_rate.png', 'rating_distribution.png', 'sentiment_distribution.png']\n", "Tables now in app-readable folder:\n", "['category_return_rate.csv', 'df_dashboard.csv', 'kpis.json', 'rating_distribution.csv', 'sentiment_counts_sampled.csv', 'top_titles_by_units_sold.csv']\n" ] } ] }, { "cell_type": "markdown", "source": [ "2.3 ARIMA Pricfe Forecasting" ], "metadata": { "id": "WztEFgicJAfE" }, "id": "WztEFgicJAfE" }, { "cell_type": "code", "source": [ "# ==================================================\n", "# ARIMA PRICE FORECASTING\n", "# ==================================================\n", "\n", "!pip install statsmodels --quiet\n", "\n", "from statsmodels.tsa.arima.model import ARIMA\n", "from statsmodels.tsa.stattools import adfuller\n", "import matplotlib.pyplot as plt\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "# Build a monthly average price time series from returns data\n", "if \"price\" in returns_df.columns and \"order_purchase_timestamp\" in returns_df.columns:\n", " ts_price = returns_df.copy()\n", " ts_price[\"order_purchase_timestamp\"] = pd.to_datetime(\n", " ts_price[\"order_purchase_timestamp\"], errors=\"coerce\"\n", " )\n", " ts_price = ts_price.dropna(subset=[\"order_purchase_timestamp\", \"price\"])\n", "\n", " monthly_price = (\n", " ts_price.set_index(\"order_purchase_timestamp\")\n", " .resample(\"M\")[\"price\"]\n", " .mean()\n", " .dropna()\n", " )\n", "\n", " print(\"Monthly price series shape:\", monthly_price.shape)\n", " print(monthly_price.tail())\n", "\n", " # --- Stationarity check ---\n", " adf_result = adfuller(monthly_price)\n", " print(f\"\\nADF Statistic: {adf_result[0]:.4f}\")\n", " print(f\"p-value: {adf_result[1]:.4f}\")\n", " if adf_result[1] < 0.05:\n", " print(\"Series is stationary — good for ARIMA.\")\n", " else:\n", " print(\"Series is NOT stationary — differencing will be applied (d=1).\")\n", "\n", " # --- Fit ARIMA ---\n", " # p=2 (autoregression), d=1 (differencing), q=1 (moving average)\n", " model = ARIMA(monthly_price, order=(2, 1, 1))\n", " model_fit = model.fit()\n", " print(\"\\nARIMA Model Summary:\")\n", " print(model_fit.summary())\n", "\n", " # --- Forecast next 6 months ---\n", " forecast_steps = 6\n", " forecast = model_fit.forecast(steps=forecast_steps)\n", " forecast_index = pd.date_range(\n", " start=monthly_price.index[-1] + pd.DateOffset(months=1),\n", " periods=forecast_steps,\n", " freq=\"M\"\n", " )\n", " forecast_series = pd.Series(forecast.values, index=forecast_index)\n", "\n", " # --- Plot ---\n", " plt.figure(figsize=(12, 5))\n", " plt.plot(monthly_price, label=\"Historical Price\", marker=\"o\")\n", " plt.plot(forecast_series, label=\"ARIMA Forecast (6 months)\",\n", " marker=\"o\", linestyle=\"--\", color=\"orange\")\n", " plt.axvline(x=monthly_price.index[-1], color=\"gray\",\n", " linestyle=\":\", label=\"Forecast Start\")\n", " plt.title(\"Monthly Average Price — ARIMA Forecast\")\n", " plt.xlabel(\"Month\")\n", " plt.ylabel(\"Average Price (€)\")\n", " plt.legend()\n", " plt.tight_layout()\n", " plt.savefig(PY_FIG_DIR / \"arima_price_forecast.png\", dpi=150, bbox_inches=\"tight\")\n", " plt.show()\n", " plt.close()\n", "\n", " # --- Save forecast table ---\n", " forecast_df = pd.DataFrame({\n", " \"month\": forecast_series.index,\n", " \"forecasted_price\": forecast_series.values.round(2)\n", " })\n", " forecast_df.to_csv(PY_TAB_DIR / \"arima_price_forecast.csv\", index=False)\n", "\n", " print(\"\\nForecast for next 6 months:\")\n", " display(forecast_df)\n", " print(\"Saved forecast to arima_price_forecast.csv\")\n", "\n", "else:\n", " print(\"Required columns 'price' or 'order_purchase_timestamp' not found in returns_df.\")\n", " print(\"Available columns:\", returns_df.columns.tolist())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "3gv4S_a-Gyt8", "outputId": "63b8278e-772d-41ed-9c05-acfd5b0e3a3c" }, "id": "3gv4S_a-Gyt8", "execution_count": 8, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Monthly price series shape: (24,)\n", "order_purchase_timestamp\n", "2018-05-31 125.654273\n", "2018-06-30 122.241141\n", "2018-07-31 126.141411\n", "2018-08-31 117.897993\n", "2018-09-30 145.000000\n", "Name: price, dtype: float64\n", "\n", "ADF Statistic: -6.4329\n", "p-value: 0.0000\n", "Series is stationary — good for ARIMA.\n", "\n", "ARIMA Model Summary:\n", " SARIMAX Results \n", "==============================================================================\n", "Dep. Variable: price No. Observations: 24\n", "Model: ARIMA(2, 1, 1) Log Likelihood -107.606\n", "Date: Mon, 27 Apr 2026 AIC 223.212\n", "Time: 18:02:47 BIC 227.754\n", "Sample: 0 HQIC 224.354\n", " - 24 \n", "Covariance Type: opg \n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "ar.L1 -0.2139 0.785 -0.273 0.785 -1.752 1.324\n", "ar.L2 0.4311 0.684 0.630 0.529 -0.910 1.772\n", "ma.L1 -0.8209 0.849 -0.966 0.334 -2.486 0.844\n", "sigma2 636.5748 198.429 3.208 0.001 247.661 1025.488\n", "===================================================================================\n", "Ljung-Box (L1) (Q): 0.10 Jarque-Bera (JB): 19.72\n", "Prob(Q): 0.75 Prob(JB): 0.00\n", "Heteroskedasticity (H): 0.06 Skew: 1.13\n", "Prob(H) (two-sided): 0.00 Kurtosis: 6.94\n", "===================================================================================\n", "\n", "Warnings:\n", "[1] Covariance matrix calculated using the outer product of gradients (complex-step).\n", "\n", "Forecast for next 6 months:\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ " month forecasted_price\n", "0 2018-10-31 118.22\n", "1 2018-11-30 135.63\n", "2 2018-12-31 120.36\n", "3 2019-01-31 131.13\n", "4 2019-02-28 122.25\n", "5 2019-03-31 128.79" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
monthforecasted_price
02018-10-31118.22
12018-11-30135.63
22018-12-31120.36
32019-01-31131.13
42019-02-28122.25
52019-03-31128.79
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "forecast_df", "summary": "{\n \"name\": \"forecast_df\",\n \"rows\": 6,\n \"fields\": [\n {\n \"column\": \"month\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2018-10-31 00:00:00\",\n \"max\": \"2019-03-31 00:00:00\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"2018-10-31 00:00:00\",\n \"2018-11-30 00:00:00\",\n \"2019-03-31 00:00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"forecasted_price\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 6.829558306850204,\n \"min\": 118.22,\n \"max\": 135.63,\n \"num_unique_values\": 6,\n \"samples\": [\n 118.22,\n 135.63,\n 128.79\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {} }, { "output_type": "stream", "name": "stdout", "text": [ "Saved forecast to arima_price_forecast.csv\n" ] } ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.10" }, "colab": { "provenance": [] } }, "nbformat": 4, "nbformat_minor": 5 }