"""Gradio dashboard comparing a real-data credit model with a synthetic-data one.

At import time the real train/test CSVs are loaded and preprocessed and the
real-data model is run once. Each click of the dashboard button generates a
fresh synthetic training set, runs the synthetic-data model against the same
real test set, and refreshes every plot.
"""
import gradio as gr
import pandas as pd

from data_generation import generate_synthetic_training_data, evaluate_synthetic_data
from data_preprocessing import preprocess_real_data, preprocess_synthetic_data
from credit_models import real_data_credit_model, synthetic_data_credit_model
from visualization import (
    plot_feature_distributions,
    plot_comparative_credit_score_distribution,
    plot_comparison_table,
    plot_comparative_confusion_matrices,
    plot_comparative_credit_score_distribution_by_actual_class,
    plot_evaluation_table,
    get_metrics_df,
)

COLOR_MAP = {
    'Good': '#28B463',
    'Standard': '#F1C40F',
    'Poor': '#E74C3C',
}
LABEL_ORDER = ['Good', 'Standard', 'Poor']
TARGET = 'Credit_Score'

# Load and preprocess real data once at startup
real_train = pd.read_csv('data/processed/v4/real_train_data.csv')
real_test = pd.read_csv('data/processed/v4/real_test_data.csv')
X_real_train, y_real_train, X_real_test, y_real_test = preprocess_real_data(
    real_train, real_test, TARGET
)

# Train real-data model once at startup
real_scores, real_classification = real_data_credit_model(
    X_real_train, y_real_train, X_real_test
)


def run_analysis():
    """Generate new synthetic data, run the synthetic model, and build all comparison outputs.

    Returns:
        A 7-tuple ``(fig_feature_dist, summary, fig_score_dist,
        fig_score_by_class, fig_metrics, fig_cm, metrics_df)``: six figures
        followed by the metrics table rounded to 4 decimals.
    """
    # NOTE(review): this requests one third of the real training row count.
    # The previous comment claimed "same number of samples as real training
    # data", which only holds if `n` is a per-class count (there are three
    # classes) -- confirm against generate_synthetic_training_data.
    synthetic_data = generate_synthetic_training_data(n=len(X_real_train) // 3)
    X_synth_train, y_synth_train = preprocess_synthetic_data(synthetic_data, TARGET)

    # Column groups are derived from the real training frame's dtypes.
    categorical_cols = [
        col for col in X_real_train.columns
        if X_real_train[col].dtype in ["object", "bool", "uint8"]
    ]
    numeric_cols = [
        col for col in X_real_train.columns
        if X_real_train[col].dtype in ["int64", "float64"]
    ]

    fig_feature_dist = plot_feature_distributions(X_real_train, X_synth_train)

    # Evaluate synthetic-vs-real similarity separately for each target class.
    summary_rows = []
    for cls in LABEL_ORDER:
        real_cls = X_real_train[y_real_train == cls]
        synth_cls = X_synth_train[y_synth_train == cls]
        ks_pass_rate, mean_ks, chi_pass_rate, mean_corr_diff = evaluate_synthetic_data(
            real_cls, synth_cls, categorical_cols, numeric_cols
        )
        summary_rows.append({
            # A check counts as passed only when its pass rate is exactly 1.0.
            "ks_passed": ks_pass_rate == 1.0,
            "mean_ks_stat": round(mean_ks, 4),
            # chi_pass_rate may be None; keep None rather than reporting False.
            "chi_passed": chi_pass_rate == 1.0 if chi_pass_rate is not None else None,
            "mean_corr_diff": round(mean_corr_diff, 4),
        })
    summary_df = pd.DataFrame(summary_rows, index=LABEL_ORDER)
    summary = plot_evaluation_table(summary_df)

    # Both models are scored on the same real test set.
    synth_scores, synth_classification = synthetic_data_credit_model(
        X_synth_train, y_synth_train, X_real_test
    )

    fig_score_dist = plot_comparative_credit_score_distribution(
        real_scores, synth_scores
    )
    fig_score_by_class = plot_comparative_credit_score_distribution_by_actual_class(
        y_real_test,
        real_scores,
        synth_scores,
        color_map=COLOR_MAP,
        label_order=LABEL_ORDER,
    )
    fig_metrics = plot_comparison_table(
        y_real_test, real_classification, synth_classification
    )
    fig_cm = plot_comparative_confusion_matrices(
        y_real_test,
        real_classification,
        synth_classification,
        labels=LABEL_ORDER,
    )

    metrics_df = get_metrics_df(y_real_test, real_classification, synth_classification)
    metrics_df = metrics_df.round(4)

    return (
        fig_feature_dist,
        summary,
        fig_score_dist,
        fig_score_by_class,
        fig_metrics,
        fig_cm,
        metrics_df,
    )


def _run_analysis_figures():
    """Return only the six figures from run_analysis, in dashboard output order.

    run_analysis returns seven values, but the button below is wired to six
    plot components; this adapter keeps the returned values and the
    ``outputs`` list the same length.
    """
    (
        fig_feature_dist,
        summary,
        fig_score_dist,
        fig_score_by_class,
        fig_metrics,
        fig_cm,
        _metrics_df,
    ) = run_analysis()
    return (
        fig_feature_dist,
        summary,
        fig_score_dist,
        fig_score_by_class,
        fig_metrics,
        fig_cm,
    )


with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Credit Score Model Dashboard
        Compare a **Real-Data Model** vs a **Synthetic-Data Model** trained with CTGAN-generated data.
        Click the button to regenerate synthetic data and retrain the synthetic model.
        """
    )

    run_btn = gr.Button(
        "Generate New Synthetic Data & Analyze", variant="primary", size="lg"
    )

    gr.Markdown(
        """
        ## Feature Distribution Comparison
        Below are the distributions of the features in the real vs synthetic training datasets.
        """
    )
    with gr.Row():
        plot_feature_dist = gr.Plot(label='')

    gr.Markdown(
        """
        ## Generated Data Quality Summary
        Below is a summary of the data quality evaluation comparing the synthetic training data to the real training data across multiple metrics.
        """
    )
    with gr.Row():
        plot_summary = gr.Plot(label='')

    gr.Markdown(
        """
        ## Credit Models Metrics
        Below are the metrics for the real-data and synthetic-data models.
        """
    )
    with gr.Row():
        plot_metrics = gr.Plot(label='')

    gr.Markdown(
        """
        ## Credit Score Distribution Comparison
        Below are the distributions of the predicted credit scores for the real-data and synthetic-data models.
        """
    )
    with gr.Row():
        plot_score_dist = gr.Plot(label='')

    gr.Markdown(
        """
        ## Credit Score Distribution by Actual Class
        Below are the distributions of the predicted credit scores for each actual class (Good, Standard, Poor) for both models.
        """
    )
    with gr.Row():
        plot_score_by_class = gr.Plot(label='')

    gr.Markdown(
        """
        ## Confusion Matrix Comparison
        Below are the confusion matrices for the real-data and synthetic-data models.
        """
    )
    with gr.Row():
        plot_cm = gr.Plot(label='')

    # The outputs list follows the order of the tuple returned by
    # _run_analysis_figures, not the on-page order of the components.
    run_btn.click(
        fn=_run_analysis_figures,
        inputs=[],
        outputs=[
            plot_feature_dist,
            plot_summary,
            plot_score_dist,
            plot_score_by_class,
            plot_metrics,
            plot_cm,
        ],
    )

demo.launch()