EnYa32 committed on
Commit
40c63dd
·
verified ·
1 Parent(s): 7bcf463

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +158 -37
src/streamlit_app.py CHANGED
@@ -1,40 +1,161 @@
1
- import altair as alt
2
- import numpy as np
3
  import pandas as pd
 
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
1
  import pandas as pd
2
+ import numpy as np
3
  import streamlit as st
4
+ import joblib
5
+ from pathlib import Path
6
+
7
# -------------------------
# Page config
# -------------------------
# NOTE: st.set_page_config must be the first Streamlit call in the script;
# it sets the browser tab title/icon and the page layout.
st.set_page_config(
    page_title='Clustering Predictor (KMeans / GMM)',
    page_icon='🧩',
    layout='centered'
)

# Page header and a short usage hint for the user.
st.title('🧩 Clustering Predictor (KMeans / GMM)')
st.write(
    'Upload a CSV file and get cluster predictions using your saved preprocessing (Scaler + PCA) and model.'
)
20
+
21
+ # -------------------------
22
+ # Paths (HF friendly)
23
+ # -------------------------
24
+ BASE_DIR = Path(__file__).resolve().parent
25
+
26
+ FEATURES_PATH = BASE_DIR / 'feature_names.pkl'
27
+ SCALER_PATH = BASE_DIR / 'scaler.pkl'
28
+ PCA_PATH = BASE_DIR / 'pca.pkl'
29
+
30
+ KMEANS_PATH = BASE_DIR / 'kmeans_model_k9.pkl'
31
+ GMM_PATH = BASE_DIR / 'gmm_model_k9.pkl'
32
+
33
# -------------------------
# Load assets
# -------------------------
@st.cache_resource
def load_assets():
    """Load preprocessing artifacts and clustering models from disk.

    Cached with ``st.cache_resource`` so the pickles are read only once
    per process, not on every Streamlit rerun.

    Returns:
        tuple: ``(feature_names, scaler, pca, models)`` where ``models``
        maps a display name to a fitted model exposing ``predict``.

    Raises:
        FileNotFoundError: if any required preprocessing artifact is
            missing, or if no model file is found at all.
    """
    # All three preprocessing artifacts are mandatory; collect every
    # missing one so the user sees the full list in a single error.
    missing = [p.name for p in (FEATURES_PATH, SCALER_PATH, PCA_PATH) if not p.exists()]
    if missing:
        raise FileNotFoundError(
            f'Missing required files in repo root: {missing}. '
            'Please upload them next to streamlit_app.py.'
        )

    feature_names = joblib.load(FEATURES_PATH)
    scaler = joblib.load(SCALER_PATH)
    pca = joblib.load(PCA_PATH)

    # Each model is optional individually, but at least one must exist.
    models = {}
    if KMEANS_PATH.exists():
        models['KMeans (k=9)'] = joblib.load(KMEANS_PATH)
    if GMM_PATH.exists():
        models['GMM (k=9)'] = joblib.load(GMM_PATH)

    if not models:
        raise FileNotFoundError(
            "No model files found. Upload 'kmeans_model_k9.pkl' and/or 'gmm_model_k9.pkl' next to streamlit_app.py."
        )

    return feature_names, scaler, pca, models
65
+
66
# Fail fast: surface any load error in the UI and halt the script so the
# code below never runs with undefined globals.
try:
    feature_names, scaler, pca, models = load_assets()
except Exception as e:
    st.error(str(e))
    st.stop()
71
+
72
# -------------------------
# Model selector
# -------------------------
# Let the user pick one of the loaded models; `model` is used by
# predict_clusters below.
model_name = st.selectbox('Select model', list(models.keys()))
model = models[model_name]

# Show the expected input schema so users can fix their CSV before uploading.
st.caption('Expected input columns:')
with st.expander('Show feature columns'):
    st.write(feature_names)

# -------------------------
# Upload CSV
# -------------------------
# `uploaded` is None until the user provides a file.
uploaded = st.file_uploader('Upload CSV', type=['csv'])
86
+
87
+ def preprocess_df(df_in: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series | None]:
88
+ # Accept either 'id' or 'Id' columns (optional)
89
+ id_col = None
90
+ if 'id' in df_in.columns:
91
+ id_col = 'id'
92
+ elif 'Id' in df_in.columns:
93
+ id_col = 'Id'
94
+
95
+ ids = df_in[id_col].copy() if id_col else None
96
+
97
+ X = df_in.drop(columns=[id_col], errors='ignore').copy()
98
+
99
+ # Validate columns
100
+ missing_cols = [c for c in feature_names if c not in X.columns]
101
+ extra_cols = [c for c in X.columns if c not in feature_names]
102
+
103
+ if missing_cols:
104
+ raise ValueError(f'Missing required columns: {missing_cols}')
105
+
106
+ # Keep only expected columns + correct order
107
+ X = X[feature_names]
108
+
109
+ # Convert to numeric if possible (safety)
110
+ for c in X.columns:
111
+ X[c] = pd.to_numeric(X[c], errors='coerce')
112
+
113
+ if X.isna().any().any():
114
+ # You can choose a strategy; here we fail fast so user fixes the input.
115
+ bad_cols = X.columns[X.isna().any()].tolist()
116
+ raise ValueError(
117
+ f'Found NaNs after converting to numeric. Check these columns: {bad_cols}. '
118
+ 'Make sure your CSV has valid numeric values.'
119
+ )
120
+
121
+ return X, ids
122
+
123
def predict_clusters(X: pd.DataFrame, *, scaler_override=None, pca_override=None, model_override=None) -> np.ndarray:
    """Run the saved preprocessing pipeline and predict cluster labels.

    Args:
        X: Feature dataframe (columns already validated/ordered by
            ``preprocess_df``).
        scaler_override: Optional scaler to use instead of the module-level
            ``scaler`` (keyword-only; mainly for testing).
        pca_override: Optional PCA transformer to use instead of the
            module-level ``pca``.
        model_override: Optional model to use instead of the currently
            selected module-level ``model``.

    Returns:
        np.ndarray: Predicted cluster label per row of ``X``.
    """
    # Fall back to the module-level objects so existing callers
    # (predict_clusters(X_up)) behave exactly as before.
    s = scaler_override if scaler_override is not None else scaler
    p = pca_override if pca_override is not None else pca
    m = model_override if model_override is not None else model

    X_scaled = s.transform(X)
    X_pca = p.transform(X_scaled)

    # KMeans and GMM both expose predict()
    return m.predict(X_pca)
130
+
131
# Main flow: once a CSV is uploaded, validate it, predict, and offer the
# results for download. Any validation/prediction error is shown in the UI
# and the script is halted.
if uploaded is not None:
    try:
        df_up = pd.read_csv(uploaded)
        X_up, ids = preprocess_df(df_up)
        preds = predict_clusters(X_up)

        # Assemble the output table, with the original Id column (if any)
        # restored as the first column.
        out = pd.DataFrame({'Predicted': preds})
        if ids is not None:
            out.insert(0, 'Id', ids)

        st.success('✅ Predictions created successfully.')
        # Preview only the first 30 rows; the full table is downloadable below.
        st.dataframe(out.head(30), use_container_width=True)

        st.download_button(
            'Download predictions as CSV',
            data=out.to_csv(index=False).encode('utf-8'),
            file_name='predictions.csv',
            mime='text/csv'
        )

        # Quick info: how many rows fell into each cluster.
        st.subheader('Cluster distribution')
        dist = pd.Series(preds).value_counts().sort_index()
        st.write(dist)

    except Exception as e:
        st.error(str(e))
        st.stop()

else:
    st.info('Upload a CSV file to generate cluster predictions.')