{ "cells": [ { "cell_type": "markdown", "id": "85356ae0-b730-462e-aab0-4491cbf1b1d4", "metadata": {}, "source": [ "# Imports" ] }, { "cell_type": "code", "execution_count": 68, "id": "0881c073-a413-4082-b7d6-9096f2ad6b1e", "metadata": {}, "outputs": [], "source": [ "import os\n", "from pathlib import Path\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from scipy import stats\n", "\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler, PolynomialFeatures\n", "from sklearn.linear_model import LinearRegression, Ridge, LassoCV\n", "from sklearn.feature_selection import SelectFromModel\n", "from sklearn.model_selection import KFold\n", "from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score\n", "\n", "import statsmodels.api as sm\n", "import pickle" ] }, { "cell_type": "markdown", "id": "e02b3464-1bc8-4521-8fad-35a65a82aa6a", "metadata": {}, "source": [ "# Configuration" ] }, { "cell_type": "code", "execution_count": 69, "id": "21128113-5b27-4e46-ab40-9a24627580f2", "metadata": {}, "outputs": [], "source": [ "# Reproducibility and cross-validation settings\n", "RANDOM_STATE = 42\n", "N_SPLITS = 5\n", "\n", "# Target column and columns left out of the model inputs\n", "TARGET_COL = \"AimoScore\"\n", "# EstimatedScore is excluded from input features as mentioned in slide 16 of lecture 2\n", "DROP_FEATURES = [\"EstimatedScore\"]\n", "\n", "# Input data lives in the Datasets_all folder under the parent of the working directory\n", "REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), \"..\"))\n", "DATA_DIR = os.path.join(REPO_ROOT, \"Datasets_all\")\n", "TRAIN_CSV, TEST_CSV = (os.path.join(DATA_DIR, csv_name) for csv_name in (\"train.csv\", \"test.csv\"))\n", "\n", "# Output directory, relative to the working directory; created if it does not exist yet\n", "OUT_DIR = Path(\"models\")\n", "OUT_DIR.mkdir(exist_ok=True)" ] }, { "cell_type": "markdown", "id": "12ad047b-82f5-49a5-8768-ac8144f4e619", "metadata": {}, "source": [ "# Load data" ] }, { "cell_type": "code", "execution_count": 70, "id": "856adeb4-1bf2-4828-bff1-8ee344e650b8", "metadata": {}, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ "Train shape: (1599, 42)\n", "Test shape : (400, 42)\n" ] } ], "source": [ "# Read both splits from the CSV paths defined as TRAIN_CSV / TEST_CSV\n", "train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))\n", "\n", "# Print the (rows, columns) shape of each split\n", "print(\"Train shape:\", train_df.shape)\n", "print(\"Test shape :\", test_df.shape)" ] }, { "cell_type": "code", "execution_count": 71, "id": "7d4a9dfb-ab3e-4195-9cd1-564e0f691c74", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Index(['AimoScore', 'No_1_Angle_Deviation', 'No_2_Angle_Deviation',\n", " 'No_3_Angle_Deviation', 'No_4_Angle_Deviation', 'No_5_Angle_Deviation',\n", " 'No_6_Angle_Deviation', 'No_7_Angle_Deviation', 'No_8_Angle_Deviation',\n", " 'No_9_Angle_Deviation', 'No_10_Angle_Deviation',\n", " 'No_11_Angle_Deviation', 'No_12_Angle_Deviation',\n", " 'No_13_Angle_Deviation', 'No_1_NASM_Deviation', 'No_2_NASM_Deviation',\n", " 'No_3_NASM_Deviation', 'No_4_NASM_Deviation', 'No_5_NASM_Deviation',\n", " 'No_6_NASM_Deviation', 'No_7_NASM_Deviation', 'No_8_NASM_Deviation',\n", " 'No_9_NASM_Deviation', 'No_10_NASM_Deviation', 'No_11_NASM_Deviation',\n", " 'No_12_NASM_Deviation', 'No_13_NASM_Deviation', 'No_14_NASM_Deviation',\n", " 'No_15_NASM_Deviation', 'No_16_NASM_Deviation', 'No_17_NASM_Deviation',\n", " 'No_18_NASM_Deviation', 'No_19_NASM_Deviation', 'No_20_NASM_Deviation',\n", " 'No_21_NASM_Deviation', 'No_22_NASM_Deviation', 'No_23_NASM_Deviation',\n", " 'No_24_NASM_Deviation', 'No_25_NASM_Deviation', 'No_1_Time_Deviation',\n", " 'No_2_Time_Deviation', 'EstimatedScore'],\n", " dtype='object')\n" ] }, { "data": { "text/html": [ "
| \n", " | AimoScore | \n", "No_1_Angle_Deviation | \n", "No_2_Angle_Deviation | \n", "No_3_Angle_Deviation | \n", "No_4_Angle_Deviation | \n", "No_5_Angle_Deviation | \n", "No_6_Angle_Deviation | \n", "No_7_Angle_Deviation | \n", "No_8_Angle_Deviation | \n", "No_9_Angle_Deviation | \n", "... | \n", "No_19_NASM_Deviation | \n", "No_20_NASM_Deviation | \n", "No_21_NASM_Deviation | \n", "No_22_NASM_Deviation | \n", "No_23_NASM_Deviation | \n", "No_24_NASM_Deviation | \n", "No_25_NASM_Deviation | \n", "No_1_Time_Deviation | \n", "No_2_Time_Deviation | \n", "EstimatedScore | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "0.965848 | \n", "0.816834 | \n", "0.250120 | \n", "0.346724 | \n", "0.188905 | \n", "0.163080 | \n", "0.140124 | \n", "0.194165 | \n", "0.521760 | \n", "0.156385 | \n", "... | \n", "0.670971 | \n", "0.656624 | \n", "0.642276 | \n", "0.552846 | \n", "0.648972 | \n", "0.578192 | \n", "0.308943 | \n", "0.148733 | \n", "0.151124 | \n", "0.009087 | \n", "
| 1 | \n", "0.407282 | \n", "0.279770 | \n", "0.139168 | \n", "0.346724 | \n", "0.303682 | \n", "0.928264 | \n", "0.659971 | \n", "0.918221 | \n", "0.521760 | \n", "0.561932 | \n", "... | \n", "0.670971 | \n", "0.656624 | \n", "0.788140 | \n", "0.908656 | \n", "0.648972 | \n", "0.578192 | \n", "0.891918 | \n", "0.684840 | \n", "0.711621 | \n", "0.837877 | \n", "
| 2 | \n", "0.810337 | \n", "0.279770 | \n", "0.092300 | \n", "0.392157 | \n", "0.799617 | \n", "0.722621 | \n", "0.753228 | \n", "0.730273 | \n", "0.521760 | \n", "0.156385 | \n", "... | \n", "0.670971 | \n", "0.656624 | \n", "0.642276 | \n", "0.896222 | \n", "0.648972 | \n", "0.578192 | \n", "0.308943 | \n", "0.148733 | \n", "0.186514 | \n", "0.424199 | \n", "
| 3 | \n", "0.603826 | \n", "0.906743 | \n", "0.494978 | \n", "0.571019 | \n", "0.735533 | \n", "0.706839 | \n", "0.140124 | \n", "0.670971 | \n", "0.521760 | \n", "0.156385 | \n", "... | \n", "0.670971 | \n", "0.764228 | \n", "0.666188 | \n", "0.552846 | \n", "0.648972 | \n", "0.578192 | \n", "0.552367 | \n", "0.811573 | \n", "0.820182 | \n", "0.550933 | \n", "
| 4 | \n", "0.141338 | \n", "0.672884 | \n", "0.583931 | \n", "0.346724 | \n", "0.861310 | \n", "0.555715 | \n", "0.859876 | \n", "0.469632 | \n", "0.767097 | \n", "0.358680 | \n", "... | \n", "0.899570 | \n", "0.656624 | \n", "0.642276 | \n", "0.552846 | \n", "0.882353 | \n", "0.816356 | \n", "0.308943 | \n", "0.982783 | \n", "0.982783 | \n", "0.741750 | \n", "
5 rows × 42 columns
\n", "