# Streamlit app: explore a student-stress dataset with PCA and KMeans.
#
# Flow: load the CSV, collect options through a sidebar form, stop until the
# form is submitted, then run either the PCA-based analysis or the analysis on
# the standardized original features.

# start by importing the necessary packages
# standard
import numpy as np
import pandas as pd
# plt packages
import seaborn as sns
import altair as alt
import matplotlib.pyplot as plt
# streamlit
import streamlit as st
# sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

st.set_page_config(page_title="StressedOUT – Cached/DB", page_icon=":skull:", layout="wide")
st.title("StressedOOUT – Looking into a dataset of stressed students (Cached)")
st.caption("Reads .csv files. Uses Streamlit caching and a form submit gate.")

BASE_DIR = "StressLevelDataset.csv"

# One color per possible cluster: the k slider below goes up to 10, so the
# palette must hold at least 10 entries. The first five are the colors the
# demo has always used.
CLUSTER_COLORS = [
    'blue', 'green', 'red', 'purple', 'orange',
    'brown', 'pink', 'gray', 'olive', 'cyan',
]


@st.cache_data
def load_data(path):
    """Read the CSV at ``path`` into a DataFrame.

    Cached with ``st.cache_data`` so the file is not re-read on every
    Streamlit rerun (the page caption advertises this).

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(path)


# Columns excluded from every analysis below.
data = load_data(BASE_DIR).drop(
    columns=['future_career_concerns', 'anxiety_level', 'depression', 'bullying', 'peer_pressure']
)

with st.sidebar:
    st.header("Filters")
    # Widgets inside a form only take effect when "Apply" is pressed.
    with st.form("filters"):
        analysis = st.radio(
            "Select your dataset",
            ('PCA reduced', 'No dimensionality reduction'),
            captions=('PCA reduced', 'No dimensionality reduction')
        )
        k = st.slider("Select number of clusters (k)", 2, 10, 4, step=1)
        iterations = st.slider("Select number of iterations to show", 1, 10, 5, step=1)
        seed = st.number_input("Random seed", min_value=0, max_value=100, value=42, step=1)
        st.write("For no dimensionality reduction, the first two features will be used for visualization.")
        feature_x = st.selectbox("Select X-axis feature", data.columns, index=0)
        feature_y = st.selectbox("Select Y-axis feature", data.columns, index=1)
        submitted = st.form_submit_button("Apply")

# Submit gate: nothing below runs until the form has been applied.
if not submitted:
    st.info("Adjust filters and click **Apply**.")
    st.stop()


def kmeans_iteration_demo(X, k, max_iters=iterations):
    """Plot a hand-rolled KMeans run, one panel per iteration.

    Panel 0 shows the randomly chosen starting centers. Each later panel
    colors the points by their nearest current center, marks those centers,
    and (from iteration 2 on) draws an arrow from each current center to its
    updated position. The figure is rendered with ``st.pyplot``.

    The random initialization is seeded from the module-level ``seed`` chosen
    in the sidebar form.

    Args:
        X: 2-D NumPy array of points. Distances use every column; only
            columns 0 and 1 are plotted.
        k: Number of clusters. Must not exceed ``len(X)`` because the initial
            centers are sampled without replacement.
        max_iters: Number of assignment/update steps to show. Defaults to the
            sidebar "iterations" value at the time the function is defined.
    """
    # Initialize centers randomly
    np.random.seed(seed)
    centers = X[np.random.choice(len(X), k, replace=False)]

    fig, axes = plt.subplots(1, max_iters + 1, figsize=(20, 4))

    for iteration in range(max_iters + 1):
        ax = axes[iteration]
        if iteration == 0:
            # Show initial random centers
            ax.scatter(X[:, 0], X[:, 1], c='lightgray', alpha=0.6, s=30)
            ax.scatter(centers[:, 0], centers[:, 1], c='red', s=200,
                       marker='X', edgecolors='black', linewidths=2)
            ax.set_title(f'Iteration {iteration}\n(Random Initialization)')
        else:
            # Assign points to nearest center; distances has shape (k, n_points)
            distances = np.sqrt(((X - centers[:, np.newaxis])**2).sum(axis=2))
            labels = np.argmin(distances, axis=0)

            # Plot current clustering
            for j in range(k):
                mask = labels == j
                ax.scatter(X[mask, 0], X[mask, 1],
                           c=CLUSTER_COLORS[j % len(CLUSTER_COLORS)],
                           alpha=0.6, s=30, label=f'Cluster {j+1}')
            ax.scatter(centers[:, 0], centers[:, 1], c='black', s=200,
                       marker='X', edgecolors='white', linewidths=2)
            ax.set_title(f'Iteration {iteration}')

            # Update centers. A cluster that received no points keeps its
            # previous center: the mean of an empty slice is NaN, and a NaN
            # center would then win every argmin in later iterations.
            new_centers = np.array([
                X[labels == j].mean(axis=0) if np.any(labels == j) else centers[j]
                for j in range(k)
            ])

            # Show center movement with arrows
            if iteration > 1:
                for j in range(k):
                    ax.annotate('', xy=new_centers[j], xytext=centers[j],
                                arrowprops=dict(arrowstyle='->', lw=2, color='red', alpha=0.7))
            centers = new_centers

        ax.set_xlabel('PC1')
        ax.set_ylabel('PC2')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    st.pyplot(fig)
    # Release the figure; otherwise pyplot keeps one alive per rerun.
    plt.close(fig)


if analysis == 'PCA reduced':
    data_scaled = StandardScaler().fit_transform(data)
    data_reduced_df = pd.DataFrame(data_scaled, columns=data.columns)
    st.write('You selected PCA reduced')

    # Full PCA (all components) on the standardized features.
    pca = PCA()
    pca_data = pca.fit_transform(data_reduced_df)
    pca_data_pd = pd.DataFrame(pca_data, columns=[f'PC{i+1}' for i in range(pca_data.shape[1])])
    st.write('The PCA reduced data is shown below')
    st.dataframe(pca_data_pd.head(10))

    explained_variance = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance)

    st.write("Explained Variance by Component:")
    for i, ratio in enumerate(explained_variance[:10]):
        st.write(f"PC{i+1}: {ratio:.3f} ({ratio*100:.1f}%)")

    # NOTE(review): the fixed indices below assume at least 5 components,
    # i.e. at least 5 remaining feature columns — confirm against the dataset.
    st.write(f"\nFirst 3 components explain {cumulative_variance[2]*100:.1f}% of total variance")
    st.write(f"First 5 components explain {cumulative_variance[4]*100:.1f}% of total variance")

    # visualizations
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # scree plot
    ax1.plot(range(1, len(explained_variance) + 1), explained_variance, marker='o', linestyle='--')
    ax1.set_title('Scree Plot')
    ax1.set_xlabel('Principal Component')
    ax1.set_ylabel('Variance Explained')
    ax1.axvline(x=3, color='r', linestyle='--', label='3 components')
    ax1.axvline(x=5, color='g', linestyle='--', label='5 components')
    ax1.legend()
    ax1.grid()

    # cumulative variance plot
    ax2.plot(range(1, len(cumulative_variance) + 1), cumulative_variance,
             marker='o', linestyle='--', color='orange')
    ax2.set_title('Cumulative Variance Explained')
    ax2.set_xlabel('Number of Principal Components')
    ax2.set_ylabel('Cumulative Variance Explained')
    ax2.axhline(y=0.9, color='r', linestyle='--', label='90% variance')
    ax2.axhline(y=0.95, color='g', linestyle='--', label='95% variance')
    ax2.legend()
    ax2.grid()
    st.pyplot(fig)
    plt.close(fig)

    components_df = pd.DataFrame(
        pca.components_[:5].T,  # First 5 components
        columns=[f'PC{i+1}' for i in range(5)],
        index=data_reduced_df.columns
    )
    st.write("PCA Component Loadings (first 5 components):")
    st.dataframe(components_df)

    # Visualize component loadings for interpretation
    fig, axes = plt.subplots(3, 2, figsize=(16, 12))

    # Loadings of PC1-PC3, each sorted by absolute value (largest first).
    # color=None lets matplotlib pick its default bar color for PC1.
    loading_panels = [
        (0, axes[0, 0], None),
        (1, axes[0, 1], 'orange'),
        (2, axes[1, 0], 'green'),
    ]
    for idx, ax, color in loading_panels:
        name = f'PC{idx+1}'
        loadings = components_df[name].sort_values(key=abs, ascending=False)
        positions = range(len(loadings))
        ax.barh(positions, loadings.values, color=color)
        ax.set_yticks(positions)
        ax.set_yticklabels(loadings.index, fontsize=9)
        ax.set_title(f'{name} Loadings (Explains {explained_variance[idx]*100:.1f}% of variance)')
        ax.axvline(x=0, color='black', linestyle='-', alpha=0.3)

    # Pairwise scatter plots of the students in the first three PCs.
    scatter_panels = [
        (0, 1, axes[1, 1]),
        (0, 2, axes[2, 0]),
        (1, 2, axes[2, 1]),
    ]
    for a, b, ax in scatter_panels:
        ax.scatter(pca_data[:, a], pca_data[:, b], alpha=0.6)
        ax.set_xlabel(f'PC{a+1}')
        ax.set_ylabel(f'PC{b+1}')
        ax.set_title(f'Students in PC{a+1}-PC{b+1} Space')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    st.pyplot(fig)
    plt.close(fig)

    # KMeans clustering on PCA reduced data. Uses the sidebar seed, like the
    # no-reduction branch, so the "Random seed" input affects both analyses.
    kmeans = KMeans(n_clusters=k, random_state=seed)
    cluster_labels = kmeans.fit_predict(pca_data[:, :5])  # Using first 5 PCs
    silhouette_avg = silhouette_score(pca_data[:, :5], cluster_labels)
    st.write(f"Silhouette Score for k={k}: {silhouette_avg:.3f}")
    pca_data_pd['Cluster'] = cluster_labels

    # Project the component scores to 2-D for the iteration demo.
    pca = PCA(n_components=2, random_state=seed)
    pca_2d = pca.fit_transform(pca_data_pd.drop(columns=['Cluster']))
    pca_2d_df = pd.DataFrame(pca_2d, columns=['PC1', 'PC2'])
    pca_2d_df['Cluster'] = cluster_labels

    st.write("2D PCA plot with KMeans clusters:")
    kmeans_iteration_demo(pca_2d_df[['PC1', 'PC2']].values, k)
else:
    st.write('You selected No dimensionality reduction')
    st.write('The original data is shown below')
    st.dataframe(data.head(10))

    # Standardize the data
    data_scaled = StandardScaler().fit_transform(data)
    data_scaled_df = pd.DataFrame(data_scaled, columns=data.columns)
    st.dataframe(data_scaled_df.head(10))

    # KMeans clustering on original scaled data
    kmeans = KMeans(n_clusters=k, random_state=seed)
    cluster_labels = kmeans.fit_predict(data_scaled_df)
    silhouette_avg = silhouette_score(data_scaled_df, cluster_labels)
    st.write(f"Silhouette Score for k={k}: {silhouette_avg:.3f}")

    # Add cluster labels for plotting (after scoring, so the label column is
    # not treated as a feature)
    data_scaled_df['Cluster'] = cluster_labels

    # 2D scatter plot using two original features for visualization
    fig, ax = plt.subplots(figsize=(8, 6))
    scatter = ax.scatter(
        data_scaled_df[feature_x],
        data_scaled_df[feature_y],
        c=cluster_labels,
        cmap='tab10',
        alpha=0.7,
        s=50
    )
    ax.set_xlabel(feature_x)
    ax.set_ylabel(feature_y)
    ax.set_title('KMeans Clusters (Original Scaled Features)')
    plt.colorbar(scatter, ax=ax, label='Cluster')
    st.pyplot(fig)
    plt.close(fig)