# Uploaded by: EnYa32
# Commit message: Update src/streamlit_app.py
# Commit: fef5b3a (verified)
import pandas as pd
import numpy as np
import streamlit as st
import joblib
from pathlib import Path
# Page-level configuration; this is the first Streamlit call issued by the script.
st.set_page_config(page_title='Clustering Predictor (GMM)', page_icon='🧩', layout='centered')
st.title('🧩 Clustering Predictor (GMM)')
st.write('Single-row cluster prediction using saved preprocessing: StandardScaler → PCA → GaussianMixture.')
# All serialized artifacts are resolved relative to the directory that
# contains this script, independent of the current working directory.
BASE_DIR = Path(__file__).resolve().parent
# Names of the input features; its length is compared with the PCA input width at load time.
FEATURES_PATH = BASE_DIR / 'feature_names.pkl'
# Scaler applied to the raw input row before PCA.
SCALER_PATH = BASE_DIR / 'scaler.pkl'
# PCA applied to the scaled row.
PCA_PATH = BASE_DIR / 'pca.pkl'
# Model whose predict() yields the cluster label for the PCA-transformed row.
GMM_PATH = BASE_DIR / 'gmm_model.pkl'
@st.cache_resource
def load_assets():
    """Load the feature list, scaler, PCA and clustering model from disk.

    The four pickles are read from ``BASE_DIR`` (the folder that contains
    this script). The function is wrapped in ``st.cache_resource``.

    Returns:
        tuple: ``(feature_names, scaler, pca, model)`` exactly as returned
        by ``joblib.load`` for each artifact.

    Raises:
        FileNotFoundError: If any of the artifact files is absent.
        ValueError: If the artifacts disagree on dimensionality, i.e. the
            feature list does not match the scaler/PCA input width, or the
            PCA output width does not match what the model expects.
    """
    artifact_paths = [FEATURES_PATH, SCALER_PATH, PCA_PATH, GMM_PATH]
    missing = [p.name for p in artifact_paths if not p.exists()]
    if missing:
        # Report the directory actually searched and the real script name so
        # the hint is actionable regardless of the repository layout.
        raise FileNotFoundError(
            f'Missing files in {BASE_DIR}: {missing}. '
            f'Put them next to {Path(__file__).name}.'
        )

    # NOTE(security): joblib.load unpickles its input, which can execute
    # arbitrary code. Only ship artifacts produced by a trusted training run.
    feature_names = joblib.load(FEATURES_PATH)
    scaler = joblib.load(SCALER_PATH)
    pca = joblib.load(PCA_PATH)
    model = joblib.load(GMM_PATH)

    # Hard safety checks: the raw row is fed to the scaler and its output to
    # the PCA, so both must have been fitted on len(feature_names) columns.
    n_features = len(feature_names)
    input_stages = (
        ('StandardScaler', 'scaler.pkl', scaler),
        ('PCA', 'pca.pkl', pca),
    )
    for label, filename, estimator in input_stages:
        expected = getattr(estimator, 'n_features_in_', None)
        if expected is not None and n_features != int(expected):
            raise ValueError(
                f'Feature mismatch: feature_names has {n_features} features, '
                f'but {label} expects {int(expected)}. '
                f'Re-export feature_names.pkl and {filename} from the same training run.'
            )

    # The PCA output is what the model predicts on; verify that hand-off too
    # when both estimators expose the relevant attributes.
    pca_out = getattr(pca, 'n_components_', None)
    model_in = getattr(model, 'n_features_in_', None)
    if pca_out is not None and model_in is not None and int(pca_out) != int(model_in):
        raise ValueError(
            f'Dimension mismatch: PCA outputs {int(pca_out)} components, '
            f'but the GMM expects {int(model_in)} features. '
            'Re-export pca.pkl and gmm_model.pkl from the same training run.'
        )

    return feature_names, scaler, pca, model
# Load every artifact up front. On any failure, show the message in the page
# and stop the script so the widgets below are never reached without them.
try:
    feature_names, scaler, pca, model = load_assets()
except Exception as load_error:
    st.error(str(load_error))
    st.stop()
def predict_cluster(values_dict: dict) -> int:
    """Return the cluster label predicted for one row of feature values.

    Args:
        values_dict: Mapping from feature name to the value entered for it.
            The row is built with the columns listed in ``feature_names``.

    Returns:
        The predicted cluster label as a plain ``int``.

    Raises:
        ValueError: If any column is NaN after numeric coercion (for
            example because a value is missing or not numeric).
    """
    row = pd.DataFrame([values_dict], columns=feature_names)

    # Coerce each column to a numeric dtype; anything unparsable becomes NaN.
    for column in row.columns:
        row[column] = pd.to_numeric(row[column], errors='coerce')

    nan_columns = row.isna().any()
    if nan_columns.any():
        bad = row.columns[nan_columns].tolist()
        raise ValueError(f'NaN values found in columns: {bad}. Please provide valid numeric values.')

    # Same chain as the page description: scale, project with PCA, predict.
    projected = pca.transform(scaler.transform(row))
    return int(model.predict(projected)[0])
st.subheader('🧮 Single Prediction')
st.caption('Tip: Use a real row from your dataset for realistic values (all zeros may be unrealistic).')

# One numeric input per expected feature, alternating between two columns:
# even positions land in the left column, odd positions in the right one.
with st.form('single_pred_form'):
    cols = st.columns(2)
    values = {
        feat: cols[idx % 2].number_input(feat, value=0.0)
        for idx, feat in enumerate(feature_names)
    }
    submitted = st.form_submit_button('Predict cluster')

# Run the prediction when the form is submitted; any failure is shown
# in the page as an error message.
if submitted:
    try:
        pred = predict_cluster(values)
        st.success(f'✅ Predicted cluster: **{pred}**')
    except Exception as exc:
        st.error(str(exc))
# Reference panel: the feature columns the loaded artifacts expect.
with st.expander('Show expected feature columns'):
    st.write(feature_names)

# Diagnostics panel: dimensions reported by the loaded artifacts, with 'NA'
# shown when the PCA object does not expose the attribute.
with st.expander('Debug shapes (advanced)'):
    shape_report = (
        ('Number of input features:', len(feature_names)),
        ('PCA expects n_features_in_:', getattr(pca, 'n_features_in_', 'NA')),
        ('PCA output components:', getattr(pca, 'n_components_', 'NA')),
    )
    for caption, reported in shape_report:
        st.write(caption, reported)