import pandas as pd
import numpy as np
import streamlit as st
import joblib
from pathlib import Path
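
# Streamlit front-end for a saved clustering pipeline: it predicts the GMM cluster
# for a single, manually entered row using the StandardScaler and PCA fitted at training time.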
st.set_page_config(page_title='Clustering Predictor (GMM)', page_icon='🧩', layout='centered')

st.title('🧩 Clustering Predictor (GMM)')
st.write('Single-row cluster prediction using saved preprocessing: StandardScaler → PCA → GaussianMixture.')

BASE_DIR = Path(__file__).resolve().parent
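
# The pickles are assumed to be exported from the same training run, roughly:
#   joblib.dump(feature_names, 'feature_names.pkl')
#   joblib.dump(scaler, 'scaler.pkl')
#   joblib.dump(pca, 'pca.pkl')
#   joblib.dump(gmm, 'gmm_model.pkl')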
FEATURES_PATH = BASE_DIR / 'feature_names.pkl'
SCALER_PATH = BASE_DIR / 'scaler.pkl'
PCA_PATH = BASE_DIR / 'pca.pkl'
GMM_PATH = BASE_DIR / 'gmm_model.pkl'
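
# Load everything once per process; st.cache_resource keeps the unpickled objects in
# memory across reruns so they are not re-read on every widget interaction.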
@st.cache_resource
def load_assets():
    missing = [p.name for p in [FEATURES_PATH, SCALER_PATH, PCA_PATH, GMM_PATH] if not p.exists()]
    if missing:
        raise FileNotFoundError(f'Missing files in repo root: {missing}. Put them next to app.py.')

    feature_names = joblib.load(FEATURES_PATH)
    scaler = joblib.load(SCALER_PATH)
    pca = joblib.load(PCA_PATH)
    model = joblib.load(GMM_PATH)
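
    # Guard against mixing artifacts from different training runs: the saved feature
    # list must match the number of features the PCA was fitted on.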
    if hasattr(pca, 'n_features_in_') and len(feature_names) != int(pca.n_features_in_):
        raise ValueError(
            f'Feature mismatch: feature_names has {len(feature_names)} features, '
            f'but PCA expects {int(pca.n_features_in_)}. '
            'Re-export feature_names.pkl and pca.pkl from the same training run.'
        )

    return feature_names, scaler, pca, model

try:
    feature_names, scaler, pca, model = load_assets()
except Exception as e:
    st.error(str(e))
    st.stop()
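
# Build a one-row DataFrame in the training column order, coerce values to numeric,
# then apply the saved pipeline: StandardScaler -> PCA -> GaussianMixture.predict.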
def predict_cluster(values_dict: dict) -> int:
    df_one = pd.DataFrame([values_dict], columns=feature_names)

    for c in df_one.columns:
        df_one[c] = pd.to_numeric(df_one[c], errors='coerce')

    if df_one.isna().any().any():
        bad = df_one.columns[df_one.isna().any()].tolist()
        raise ValueError(f'NaN values found in columns: {bad}. Please provide valid numeric values.')
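
    # Same order as training: standardize first, then project onto the PCA components.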
    X_scaled = scaler.transform(df_one)
    X_pca = pca.transform(X_scaled)
    pred = model.predict(X_pca)[0]
    return int(pred)

st.subheader('🧮 Single Prediction')
st.caption('Tip: Use a real row from your dataset for realistic values (all zeros may be unrealistic).')
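
# Collect one value per feature, alternating between two columns to keep the form compact.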
with st.form('single_pred_form'):
    cols = st.columns(2)
    values = {}

    for i, feat in enumerate(feature_names):
        values[feat] = cols[i % 2].number_input(feat, value=0.0)

    submitted = st.form_submit_button('Predict cluster')
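
# Run the prediction only after the form is submitted; any error raised by
# predict_cluster is shown as an error box rather than a traceback.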
if submitted:
    try:
        pred = predict_cluster(values)
        st.success(f'✅ Predicted cluster: **{pred}**')
    except Exception as e:
        st.error(str(e))

with st.expander('Show expected feature columns'):
    st.write(feature_names)
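
# Quick diagnostics to check that the number of UI inputs lines up with what the saved PCA expects.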
with st.expander('Debug shapes (advanced)'):
    st.write('Number of input features:', len(feature_names))
    st.write('PCA expects n_features_in_:', getattr(pca, 'n_features_in_', 'NA'))
    st.write('PCA output components:', getattr(pca, 'n_components_', 'NA'))