Spaces:
Running
Running
File size: 11,157 Bytes
b72652e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 | import streamlit as st
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from src.models.explain import generate_shap_explanation
from src.features.build_features import calculate_risk_score
st.set_page_config(page_title="FairValue Transfer Cap Estimator", layout="wide")
# ── Currency Config ───────────────────────────────────────────────────────────
EUR_TO_GBP = 0.85 # Approximate conversion — review quarterly
st.title("FairValue Transfer Cap Estimator")
st.markdown(
"A rigorous, data-driven 'Transfer Ceiling Calculator' grounded in ML "
"and Hedonic Pricing Theory."
)
# ── Data Loading ─────────────────────────────────────────────────────────────
# Fixed: was @st.cache_resource — mutations persisted across user sessions.
# @st.cache_data correctly serialises DataFrames per session.
@st.cache_data
def load_player_data():
"""Loads processed player features CSV. Returns None if file not found."""
df_path = "data/processed/app_features.csv"
if not os.path.exists(df_path):
return None
df = pd.read_csv(df_path)
mv_rename_map = {
col: 'market_value_in_eur'
for col in df.columns
if 'market' in col.lower() and 'value' in col.lower()
}
if mv_rename_map:
df.rename(columns=mv_rename_map, inplace=True)
df = df.loc[:, ~df.columns.duplicated()].copy()
return df
# ── Model Loading ─────────────────────────────────────────────────────────────
# @st.cache_resource is correct here — the model object is shared, not copied.
@st.cache_resource
def load_model():
"""Loads XGBoost model from disk. Returns (model, size_bytes, path)."""
xgb_model = xgb.XGBRegressor()
for path in ["fairvalue_xgboost.json", "FairValue_xgboost.json"]:
if os.path.exists(path):
size = os.path.getsize(path)
xgb_model.load_model(path)
return xgb_model, size, path
return None, 0, None
df = load_player_data()
model, model_size, model_path = load_model()
if df is None:
st.error("⚠️ Data file not found: `data/processed/app_features.csv`. Re-run the pipeline.")
st.stop()
if model is None:
st.error("⚠️ MODEL_FILE_NOT_FOUND — ensure `fairvalue_xgboost.json` is in the project root.")
st.stop()
if model_size < 1000:
st.error(
f"❌ MODEL CORRUPTED: `{model_path}` is only {model_size:,} bytes "
"(expected ~400 KB). Please re-upload the correct model file."
)
st.stop() # Fixed: was missing — execution continued and produced silent zero predictions
# ── Sidebar ───────────────────────────────────────────────────────────────────
st.sidebar.header("Player Transfer Profile")
input_mode = st.sidebar.radio("Input Mode", ["Select Existing Player", "Create Custom Player"])
name_col = next(
(c for c in ['name', 'name_x', 'Player_Name', 'Name'] if c in df.columns), None
)
selected_name = ""
if input_mode == "Select Existing Player":
if name_col is None:
st.error("No player name column found in data. Please re-run the pipeline.")
st.stop()
player_list = sorted(df[name_col].astype(str).unique().tolist())
selected_name = st.sidebar.selectbox("Target Database Player", player_list)
player_data = df[df[name_col].astype(str) == selected_name].iloc[0:1].copy()
contract_years = st.sidebar.slider(
"Contract Years Remaining", 0.5, 6.0,
float(player_data['Contract_Years_Left'].iloc[0]), 0.5
)
age = st.sidebar.slider("Age", 16, 40, int(player_data['Age'].iloc[0]))
inj_col = next(
(c for c in ['Injury_Days_Total_24m', 'Injury_Days'] if c in player_data.columns), None
)
injuries = st.sidebar.number_input(
"Injury missed days (24m)", 0, 500,
int(player_data[inj_col].iloc[0]) if inj_col else 10
)
else:
player_data = df.median(numeric_only=True).to_frame().T
contract_years = st.sidebar.slider("Contract Years Remaining", 0.5, 6.0, 2.0, 0.5)
age = st.sidebar.slider("Age", 16, 40, 24)
injuries = st.sidebar.number_input("Injury missed days (24m)", 0, 500, 10)
if 'market_value_in_eur' in player_data.columns:
m_val = st.sidebar.number_input("Current Market Value Estimation (£m)", 1.0, 200.0, 20.0)
player_data['market_value_in_eur'] = (m_val * 1_000_000) / EUR_TO_GBP
asking_price = st.sidebar.number_input("Selling Club Asking Price (£m)", 1.0, 300.0, 45.0)
# ── Hype Factor Integration (consumed from Page 3 Live Player Intel) ──────────
# Fixed: was never read — hard_cap calculation ignored session_state entirely.
hype_all = st.session_state.get('player_hype_metrics', {})
hype_entry = hype_all.get(selected_name.lower(), {}) if selected_name else {}
hype_premium_pct = hype_entry.get('hype_premium_percent', 0.0)
if hype_premium_pct != 0.0:
sign = "+" if hype_premium_pct > 0 else ""
st.sidebar.info(
f"💡 **Hype Factor Active:** {sign}{hype_premium_pct:.1f}% \n"
"*Run Page 3 → Live Player Intel to refresh.*"
)
else:
st.sidebar.caption("No Hype Factor loaded. Run Page 3 to enrich this estimate.")
# ── Build Inference Vector ─────────────────────────────────────────────────────
expected_cols = model.feature_names_in_
player_data = player_data.copy()
player_data['Contract_Years_Left'] = contract_years
player_data['Age'] = age
if 'Injury_Days_Total_24m' in player_data.columns:
player_data['Injury_Days_Total_24m'] = injuries
# Recompute derived risk flags so they reflect the slider values, not stale DB data.
# Without this, Risk_Contract / Risk_Age / Risk_Injury don't update when sliders move.
player_data = calculate_risk_score(
player_data,
contract_col='Contract_Years_Left',
age_col='Age',
injury_col='Injury_Days_Total_24m'
)
X_infer = player_data.reindex(columns=expected_cols, fill_value=0)
if st.button("Calculate Prediction", type="primary"):
raw_preds = model.predict(X_infer)
log_pv = raw_preds[0]
baseline_pv = max(float(np.expm1(log_pv)), 0.0)
baseline_pv_m = baseline_pv / 1_000_000
conservative_bound = baseline_pv * 0.85
# Internal rule-based risk discount (transparent to users)
risk_pct = (
(0.20 if contract_years < 1.5 else 0.0) +
(0.15 if age > 30 else 0.0) +
(0.10 if injuries > 60 else 0.0)
)
# External hype multiplier from Page 3 NLP sentiment
hype_multiplier = 1.0 + (hype_premium_pct / 100.0)
hard_cap = conservative_bound * (1.0 - risk_pct) * hype_multiplier
hard_cap_m = hard_cap / 1_000_000
col1, col2 = st.columns([2, 1])
with col1:
st.subheader("Price Range Recommendation")
fig = go.Figure(go.Indicator(
mode="gauge+number+delta",
value=asking_price,
delta={'reference': hard_cap_m, 'increasing': {'color': "red"}},
title={'text': "Asking Price vs Hard Cap (£m)"},
gauge={
'axis': {'range': [0, max(asking_price * 1.2, 100)]},
'threshold': {'line': {'color': "white", 'width': 4}, 'value': hard_cap_m}
}
))
st.plotly_chart(fig, use_container_width=True)
with col2:
st.subheader("Metrics Breakdown")
st.metric("Predicted Market Value (Baseline)", f"£{baseline_pv_m:.1f}m")
st.metric("Risk-Adjusted Hard Cap", f"£{hard_cap_m:.1f}m")
if hype_premium_pct != 0.0:
sign = "+" if hype_premium_pct > 0 else ""
st.metric("Hype / Form Adjustment", f"{sign}{hype_premium_pct:.1f}%")
if asking_price > hard_cap_m:
overpay = asking_price - hard_cap_m
st.error(f"⚠️ Winner's Curse Risk: £{overpay:.1f}m Overpay")
else:
st.success("✅ Asking price is within Fair Value bounds.")
# ── SHAP Explainability Panel ─────────────────────────────────────────────
# Fixed: shap was imported but never used. Now wired to generate_shap_explanation.
st.markdown("---")
st.subheader("🔬 XAI Explainability — What Drives This Valuation?")
with st.spinner("Computing SHAP feature contributions..."):
try:
_, explanation_df = generate_shap_explanation(model, X_infer)
top_shap = explanation_df.head(10).copy()
top_shap['Label'] = top_shap['Feature'].str.replace('_', ' ').str.title()
fig_shap = go.Figure(go.Bar(
x=top_shap['Contribution_to_LogPrice'],
y=top_shap['Label'],
orientation='h',
marker_color=[
'#e74c3c' if v < 0 else '#2ecc71'
for v in top_shap['Contribution_to_LogPrice']
],
text=[f"{v:+.3f}" for v in top_shap['Contribution_to_LogPrice']],
textposition='outside',
))
fig_shap.update_layout(
title="Top 10 Feature Contributions to Transfer Fee (Log-Price Scale)",
xaxis_title="SHAP Value (Additive impact on log transfer fee)",
yaxis={'categoryorder': 'total ascending'},
height=420,
margin=dict(l=10, r=10, t=50, b=10),
)
st.plotly_chart(fig_shap, use_container_width=True)
st.caption(
"🟢 Green = boosts transfer value | 🔴 Red = depresses transfer value | "
"Sorted by absolute magnitude."
)
except Exception as shap_err:
st.warning(f"SHAP panel unavailable: {shap_err}")
with st.expander("🛠️ Technical Deep Dive (Internal Metrics)"):
st.write(f"**Model:** `{model_path}` ({model_size:,} bytes)")
st.write(f"**Raw Log-Scale Prediction:** `{log_pv:.4f}`")
mv_val = (
X_infer['market_value_in_eur'].iloc[0]
if 'market_value_in_eur' in X_infer.columns else "MISSING"
)
if isinstance(mv_val, (int, float)):
st.write(f"**Market Value Input (EUR):** `{mv_val:,.0f}`")
else:
st.write(f"**Market Value Input:** `{mv_val}`")
st.write("**Full Feature Vector:**", X_infer)
st.markdown("---")
st.subheader("Model Performance Assessment")
st.markdown(f"**Training Set:** `{len(df):,}` transfers | **Engine:** XGBoost + RandomizedSearchCV")
st.markdown("**Validation MAE:** `~£23,980,000` | **Status:** Production Ready")
|