"""Generate realistic synthetic PFAS-SBEAD reactor experimental data (100+ records)."""

from __future__ import annotations

import numpy as np
import pandas as pd

# Module-wide random generator with a fixed seed (42). Every draw in
# generate_sbead_dataset comes from this single stream, so the first call in a
# fresh process is reproducible, while later calls in the same process
# continue the stream and therefore return different values.
RNG = np.random.default_rng(42)


def generate_sbead_dataset(n: int = 120) -> pd.DataFrame:
    """
    Generate n experimental records for PFAS degradation in SBEAD reactor.
    Parameters follow ranges from the Final PFAS-SBEAD AI Pipeline document.
    """
    olr = RNG.uniform(1.0, 6.0, n)
    hrt = RNG.uniform(10.0, 30.0, n)
    ph = RNG.uniform(6.5, 8.0, n)
    temperature = RNG.uniform(30.0, 42.0, n)
    cod = RNG.uniform(2000.0, 8000.0, n)
    scod = cod * RNG.uniform(0.55, 0.85, n)
    vfa = RNG.uniform(100.0, 1500.0, n)
    alkalinity = RNG.uniform(1500.0, 5000.0, n)

    voltage = RNG.uniform(0.2, 1.2, n)
    current = voltage * RNG.uniform(0.5, 3.0, n)
    electrode_area = RNG.uniform(0.5, 2.0, n)
    current_density = current / electrode_area
    conductivity = RNG.uniform(1.0, 8.0, n)
    electrode_spacing = RNG.uniform(1.0, 5.0, n)

    initial_pfas = RNG.uniform(50.0, 500.0, n)

    base_degradation = (
        0.12
        + 0.08 * (voltage / 1.2)
        + 0.06 * (current_density / current_density.max())
        + 0.05 * ((ph - 6.5) / 1.5)
        + 0.04 * (hrt / 30.0)
        + 0.03 * (olr / 6.0)
        + 0.02 * ((temperature - 30.0) / 12.0)
    )
    noise = RNG.normal(0, 0.04, n)
    pfas_degradation_pct = np.clip((base_degradation + noise) * 100, 5.0, 65.0)

    final_pfas = initial_pfas * (1 - pfas_degradation_pct / 100)
    pfas_adsorbed_sludge = initial_pfas * RNG.uniform(0.02, 0.12, n)
    pfas_adsorbed_electrode = initial_pfas * RNG.uniform(0.01, 0.08, n)

    fluoride_release = (
        pfas_degradation_pct * RNG.uniform(0.3, 0.7, n)
        + voltage * RNG.uniform(2.0, 8.0, n)
    )
    defluorination_pct = np.clip(fluoride_release / (initial_pfas * 0.15) * 100, 2.0, 55.0)

    short_chain_risk_base = (
        0.3
        - 0.15 * (pfas_degradation_pct / 65.0)
        + 0.1 * (vfa / 1500.0)
        - 0.05 * (voltage / 1.2)
    )
    short_chain_formation = np.clip(
        short_chain_risk_base + RNG.normal(0, 0.05, n), 0.05, 0.6
    )

    ph_drop = np.clip(
        RNG.uniform(0.1, 0.8, n) + 0.2 * (olr / 6.0) - 0.1 * (alkalinity / 5000.0),
        0.0, 1.5,
    )
    vfa_accumulation = np.clip(
        vfa * (1.0 - pfas_degradation_pct / 100 * 0.3) + RNG.normal(0, 50, n),
        50.0, 2000.0,
    )
    orp_drift = RNG.uniform(-50.0, 50.0, n)
    current_instability = np.clip(
        RNG.uniform(0.0, 0.3, n) + 0.1 * (olr / 6.0),
        0.0, 0.5,
    )

    energy_input = voltage * current * 24 / 1000
    ai_score = (
        0.40 * (pfas_degradation_pct / 65.0)
        + 0.30 * (defluorination_pct / 55.0)
        - 0.15 * short_chain_formation
        - 0.10 * (energy_input / energy_input.max())
        - 0.05 * current_instability
    )
    ai_score = np.clip(ai_score, 0, 1)

    instability_flag = (
        (ph_drop > 0.8) | (vfa_accumulation > 1200) | (current_instability > 0.35)
    ).astype(int)

    experiment_id = np.arange(1, n + 1)

    df = pd.DataFrame({
        "experiment_id": experiment_id,
        "OLR_kg_m3_d": np.round(olr, 3),
        "HRT_days": np.round(hrt, 1),
        "pH": np.round(ph, 2),
        "temperature_C": np.round(temperature, 1),
        "COD_mg_L": np.round(cod, 0),
        "SCOD_mg_L": np.round(scod, 0),
        "VFA_mg_L": np.round(vfa, 0),
        "alkalinity_mg_CaCO3_L": np.round(alkalinity, 0),
        "voltage_V": np.round(voltage, 3),
        "current_A": np.round(current, 3),
        "current_density_A_m2": np.round(current_density, 2),
        "conductivity_mS_cm": np.round(conductivity, 2),
        "electrode_area_m2": np.round(electrode_area, 3),
        "electrode_spacing_cm": np.round(electrode_spacing, 2),
        "initial_PFAS_ug_L": np.round(initial_pfas, 1),
        "final_PFAS_ug_L": np.round(final_pfas, 1),
        "PFAS_degradation_pct": np.round(pfas_degradation_pct, 2),
        "PFAS_adsorbed_sludge_ug_L": np.round(pfas_adsorbed_sludge, 2),
        "PFAS_adsorbed_electrode_ug_L": np.round(pfas_adsorbed_electrode, 2),
        "fluoride_release_mg_L": np.round(fluoride_release, 2),
        "defluorination_pct": np.round(defluorination_pct, 2),
        "short_chain_formation_ratio": np.round(short_chain_formation, 4),
        "pH_drop": np.round(ph_drop, 3),
        "VFA_accumulation_mg_L": np.round(vfa_accumulation, 0),
        "ORP_drift_mV": np.round(orp_drift, 1),
        "current_instability_index": np.round(current_instability, 4),
        "energy_input_kWh_d": np.round(energy_input, 4),
        "AI_score": np.round(ai_score, 4),
        "instability_flag": instability_flag,
    })
    return df


def generate_mass_balance_data(df: pd.DataFrame) -> pd.DataFrame:
    """Build a per-experiment PFAS mass balance table.

    The initial PFAS of each row is split into four measured/derived
    compartments (still in water, adsorbed on sludge, adsorbed on the
    electrode, converted to short-chain products); whatever is left over is
    reported as mineralized, floored at zero.

    Args:
        df: Frame providing the columns ``experiment_id``,
            ``initial_PFAS_ug_L``, ``final_PFAS_ug_L``,
            ``PFAS_adsorbed_sludge_ug_L``, ``PFAS_adsorbed_electrode_ug_L``
            and ``short_chain_formation_ratio``.

    Returns:
        A new DataFrame with the compartment amounts rounded to two decimals
        and ``mass_balance_closure_pct``, the sum of all compartments as a
        percentage of the initial PFAS.
    """
    initial = df["initial_PFAS_ug_L"]
    in_water = df["final_PFAS_ug_L"]
    on_sludge = df["PFAS_adsorbed_sludge_ug_L"]
    on_electrode = df["PFAS_adsorbed_electrode_ug_L"]
    short_chain = initial * df["short_chain_formation_ratio"]

    # Mineralized PFAS is the unexplained remainder. A negative remainder
    # (compartments already exceed the initial amount) is floored at zero.
    leftover = initial - in_water - on_sludge - on_electrode - short_chain
    mineralized = leftover.clip(lower=0)

    # Closure is exactly 100 % unless the floor above kicked in, in which case
    # the accounted total exceeds the initial amount.
    accounted = in_water + on_sludge + on_electrode + short_chain + mineralized
    closure_pct = accounted / initial * 100

    balance = pd.DataFrame({
        "experiment_id": df["experiment_id"],
        "initial_PFAS_ug_L": initial,
    })
    balance["remaining_in_water_ug_L"] = in_water.round(2)
    balance["adsorbed_sludge_ug_L"] = on_sludge.round(2)
    balance["adsorbed_electrode_ug_L"] = on_electrode.round(2)
    balance["short_chain_products_ug_L"] = short_chain.round(2)
    balance["mineralized_PFAS_ug_L"] = mineralized.round(2)
    balance["mass_balance_closure_pct"] = closure_pct.round(2)
    return balance


if __name__ == "__main__":
    # Script entry point: generate the default 120-record dataset plus its
    # mass balance table and write both as CSV files under ./data.
    from pathlib import Path

    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the output folder exists before writing.
    out_dir = Path("data")
    out_dir.mkdir(parents=True, exist_ok=True)

    df = generate_sbead_dataset(120)
    df.to_csv(out_dir / "sbead_experiments.csv", index=False)
    mb = generate_mass_balance_data(df)
    mb.to_csv(out_dir / "mass_balance.csv", index=False)
    print(f"Generated {len(df)} experiments and mass balance data.")