"""Generate realistic synthetic PFAS-SBEAD reactor experimental data (100+ records)."""

from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd

# Shared generator with a fixed seed.  It is stateful: every call that falls
# back to it advances the stream, so only the first default call after import
# reproduces the seed-42 sequence.  Pass an explicit ``rng`` for full control.
RNG = np.random.default_rng(42)


def generate_sbead_dataset(
    n: int = 120,
    rng: np.random.Generator | None = None,
) -> pd.DataFrame:
    """
    Generate n experimental records for PFAS degradation in SBEAD reactor.

    Parameters follow ranges from the Final PFAS-SBEAD AI Pipeline document.

    Parameters
    ----------
    n:
        Number of records (rows) to generate. Must be at least 1.
    rng:
        Random generator to draw from. When omitted, the module-level ``RNG``
        is used, which keeps the historical behaviour of this function.

    Returns
    -------
    pd.DataFrame
        One row per experiment with 30 columns: ``experiment_id`` (1..n), the
        sampled operating conditions, the derived PFAS outcome columns,
        ``AI_score`` (clipped to [0, 1]) and ``instability_flag`` (0 or 1).

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1. Several columns are normalised by their
        own maximum, which is undefined for an empty sample.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    if rng is None:
        rng = RNG

    # NOTE: the order of the draws below defines the random stream; reordering
    # them changes the generated data for a given seed.

    # --- Operating conditions -------------------------------------------
    olr = rng.uniform(1.0, 6.0, n)
    hrt = rng.uniform(10.0, 30.0, n)
    ph = rng.uniform(6.5, 8.0, n)
    temperature = rng.uniform(30.0, 42.0, n)
    cod = rng.uniform(2000.0, 8000.0, n)
    scod = cod * rng.uniform(0.55, 0.85, n)  # soluble part: 55-85 % of COD
    vfa = rng.uniform(100.0, 1500.0, n)
    alkalinity = rng.uniform(1500.0, 5000.0, n)

    # --- Electrical set-up ----------------------------------------------
    voltage = rng.uniform(0.2, 1.2, n)
    current = voltage * rng.uniform(0.5, 3.0, n)
    electrode_area = rng.uniform(0.5, 2.0, n)
    current_density = current / electrode_area
    conductivity = rng.uniform(1.0, 8.0, n)
    electrode_spacing = rng.uniform(1.0, 5.0, n)

    # --- PFAS degradation -----------------------------------------------
    initial_pfas = rng.uniform(50.0, 500.0, n)
    # Weighted sum of drivers, each scaled by the upper end of its sampling
    # range (current density by the sample maximum), plus a 0.12 baseline.
    base_degradation = (
        0.12
        + 0.08 * (voltage / 1.2)
        + 0.06 * (current_density / current_density.max())
        + 0.05 * ((ph - 6.5) / 1.5)
        + 0.04 * (hrt / 30.0)
        + 0.03 * (olr / 6.0)
        + 0.02 * ((temperature - 30.0) / 12.0)
    )
    noise = rng.normal(0, 0.04, n)
    pfas_degradation_pct = np.clip((base_degradation + noise) * 100, 5.0, 65.0)
    final_pfas = initial_pfas * (1 - pfas_degradation_pct / 100)

    pfas_adsorbed_sludge = initial_pfas * rng.uniform(0.02, 0.12, n)
    pfas_adsorbed_electrode = initial_pfas * rng.uniform(0.01, 0.08, n)

    # --- Fluoride release and defluorination ----------------------------
    fluoride_release = (
        pfas_degradation_pct * rng.uniform(0.3, 0.7, n)
        + voltage * rng.uniform(2.0, 8.0, n)
    )
    defluorination_pct = np.clip(
        fluoride_release / (initial_pfas * 0.15) * 100, 2.0, 55.0
    )

    # --- Short-chain by-product formation -------------------------------
    short_chain_risk_base = (
        0.3
        - 0.15 * (pfas_degradation_pct / 65.0)
        + 0.1 * (vfa / 1500.0)
        - 0.05 * (voltage / 1.2)
    )
    short_chain_formation = np.clip(
        short_chain_risk_base + rng.normal(0, 0.05, n), 0.05, 0.6
    )

    # --- Process-stability indicators -----------------------------------
    ph_drop = np.clip(
        rng.uniform(0.1, 0.8, n) + 0.2 * (olr / 6.0) - 0.1 * (alkalinity / 5000.0),
        0.0,
        1.5,
    )
    vfa_accumulation = np.clip(
        vfa * (1.0 - pfas_degradation_pct / 100 * 0.3) + rng.normal(0, 50, n),
        50.0,
        2000.0,
    )
    orp_drift = rng.uniform(-50.0, 50.0, n)
    current_instability = np.clip(
        rng.uniform(0.0, 0.3, n) + 0.1 * (olr / 6.0),
        0.0,
        0.5,
    )

    # voltage * current * 24 / 1000; exported as "energy_input_kWh_d".
    energy_input = voltage * current * 24 / 1000

    # --- Composite score and instability flag ---------------------------
    # Rewards degradation and defluorination; penalises short-chain
    # formation, relative energy use and current instability.
    ai_score = (
        0.40 * (pfas_degradation_pct / 65.0)
        + 0.30 * (defluorination_pct / 55.0)
        - 0.15 * short_chain_formation
        - 0.10 * (energy_input / energy_input.max())
        - 0.05 * current_instability
    )
    ai_score = np.clip(ai_score, 0, 1)

    # Flag a run as soon as any single indicator crosses its threshold.
    instability_flag = (
        (ph_drop > 0.8)
        | (vfa_accumulation > 1200)
        | (current_instability > 0.35)
    ).astype(int)

    experiment_id = np.arange(1, n + 1)

    df = pd.DataFrame({
        "experiment_id": experiment_id,
        "OLR_kg_m3_d": np.round(olr, 3),
        "HRT_days": np.round(hrt, 1),
        "pH": np.round(ph, 2),
        "temperature_C": np.round(temperature, 1),
        "COD_mg_L": np.round(cod, 0),
        "SCOD_mg_L": np.round(scod, 0),
        "VFA_mg_L": np.round(vfa, 0),
        "alkalinity_mg_CaCO3_L": np.round(alkalinity, 0),
        "voltage_V": np.round(voltage, 3),
        "current_A": np.round(current, 3),
        "current_density_A_m2": np.round(current_density, 2),
        "conductivity_mS_cm": np.round(conductivity, 2),
        "electrode_area_m2": np.round(electrode_area, 3),
        "electrode_spacing_cm": np.round(electrode_spacing, 2),
        "initial_PFAS_ug_L": np.round(initial_pfas, 1),
        "final_PFAS_ug_L": np.round(final_pfas, 1),
        "PFAS_degradation_pct": np.round(pfas_degradation_pct, 2),
        "PFAS_adsorbed_sludge_ug_L": np.round(pfas_adsorbed_sludge, 2),
        "PFAS_adsorbed_electrode_ug_L": np.round(pfas_adsorbed_electrode, 2),
        "fluoride_release_mg_L": np.round(fluoride_release, 2),
        "defluorination_pct": np.round(defluorination_pct, 2),
        "short_chain_formation_ratio": np.round(short_chain_formation, 4),
        "pH_drop": np.round(ph_drop, 3),
        "VFA_accumulation_mg_L": np.round(vfa_accumulation, 0),
        "ORP_drift_mV": np.round(orp_drift, 1),
        "current_instability_index": np.round(current_instability, 4),
        "energy_input_kWh_d": np.round(energy_input, 4),
        "AI_score": np.round(ai_score, 4),
        "instability_flag": instability_flag,
    })
    return df


def generate_mass_balance_data(df: pd.DataFrame) -> pd.DataFrame:
    """Compute PFAS mass balance for each experiment.

    Parameters
    ----------
    df:
        Frame providing the columns ``experiment_id``, ``initial_PFAS_ug_L``,
        ``final_PFAS_ug_L``, ``PFAS_adsorbed_sludge_ug_L``,
        ``PFAS_adsorbed_electrode_ug_L`` and ``short_chain_formation_ratio``
        (for example the output of ``generate_sbead_dataset``).

    Returns
    -------
    pd.DataFrame
        One row per input row with the individual PFAS fractions (rounded to
        two decimals) and ``mass_balance_closure_pct``, the sum of all
        fractions as a percentage of the initial PFAS. Because the mineralized
        fraction is floored at 0, closure is above 100 whenever the other
        fractions already add up to more than the initial amount.
    """
    initial = df["initial_PFAS_ug_L"]
    remaining = df["final_PFAS_ug_L"]
    adsorbed_sludge = df["PFAS_adsorbed_sludge_ug_L"]
    adsorbed_electrode = df["PFAS_adsorbed_electrode_ug_L"]
    short_chain_products = initial * df["short_chain_formation_ratio"]

    # Mineralized PFAS is whatever is left after all other sinks; a negative
    # remainder is floored at zero.
    accounted_for = (
        remaining + adsorbed_sludge + adsorbed_electrode + short_chain_products
    )
    mineralized = (
        initial
        - remaining
        - adsorbed_sludge
        - adsorbed_electrode
        - short_chain_products
    ).clip(lower=0)

    closure_pct = (accounted_for + mineralized) / initial * 100

    return pd.DataFrame({
        "experiment_id": df["experiment_id"],
        "initial_PFAS_ug_L": initial,
        "remaining_in_water_ug_L": remaining.round(2),
        "adsorbed_sludge_ug_L": adsorbed_sludge.round(2),
        "adsorbed_electrode_ug_L": adsorbed_electrode.round(2),
        "short_chain_products_ug_L": short_chain_products.round(2),
        "mineralized_PFAS_ug_L": mineralized.round(2),
        "mass_balance_closure_pct": closure_pct.round(2),
    })


def main() -> None:
    """Generate 120 experiments plus their mass balance and write both as CSV."""
    output_dir = Path("data")
    # DataFrame.to_csv does not create missing parent directories.
    output_dir.mkdir(parents=True, exist_ok=True)

    df = generate_sbead_dataset(120)
    df.to_csv(output_dir / "sbead_experiments.csv", index=False)

    mb = generate_mass_balance_data(df)
    mb.to_csv(output_dir / "mass_balance.csv", index=False)

    print(f"Generated {len(df)} experiments and mass balance data.")


if __name__ == "__main__":
    main()