# Source: pfas-sbead-optimization/utils/data_generator.py
# Uploaded by shrut27 using huggingface_hub (commit bcb2d6c, verified).
"""Generate realistic synthetic PFAS-SBEAD reactor experimental data (100+ records)."""
from __future__ import annotations
import numpy as np
import pandas as pd
# Shared module-level generator with a fixed seed: the sequence of draws is the
# same on every fresh interpreter run, but it advances with every call that
# uses it, so results depend on how many draws were made beforehand.
RNG = np.random.default_rng(seed=42)
def generate_sbead_dataset(n: int = 120) -> pd.DataFrame:
"""
Generate n experimental records for PFAS degradation in SBEAD reactor.
Parameters follow ranges from the Final PFAS-SBEAD AI Pipeline document.
"""
olr = RNG.uniform(1.0, 6.0, n)
hrt = RNG.uniform(10.0, 30.0, n)
ph = RNG.uniform(6.5, 8.0, n)
temperature = RNG.uniform(30.0, 42.0, n)
cod = RNG.uniform(2000.0, 8000.0, n)
scod = cod * RNG.uniform(0.55, 0.85, n)
vfa = RNG.uniform(100.0, 1500.0, n)
alkalinity = RNG.uniform(1500.0, 5000.0, n)
voltage = RNG.uniform(0.2, 1.2, n)
current = voltage * RNG.uniform(0.5, 3.0, n)
electrode_area = RNG.uniform(0.5, 2.0, n)
current_density = current / electrode_area
conductivity = RNG.uniform(1.0, 8.0, n)
electrode_spacing = RNG.uniform(1.0, 5.0, n)
initial_pfas = RNG.uniform(50.0, 500.0, n)
base_degradation = (
0.12
+ 0.08 * (voltage / 1.2)
+ 0.06 * (current_density / current_density.max())
+ 0.05 * ((ph - 6.5) / 1.5)
+ 0.04 * (hrt / 30.0)
+ 0.03 * (olr / 6.0)
+ 0.02 * ((temperature - 30.0) / 12.0)
)
noise = RNG.normal(0, 0.04, n)
pfas_degradation_pct = np.clip((base_degradation + noise) * 100, 5.0, 65.0)
final_pfas = initial_pfas * (1 - pfas_degradation_pct / 100)
pfas_adsorbed_sludge = initial_pfas * RNG.uniform(0.02, 0.12, n)
pfas_adsorbed_electrode = initial_pfas * RNG.uniform(0.01, 0.08, n)
fluoride_release = (
pfas_degradation_pct * RNG.uniform(0.3, 0.7, n)
+ voltage * RNG.uniform(2.0, 8.0, n)
)
defluorination_pct = np.clip(fluoride_release / (initial_pfas * 0.15) * 100, 2.0, 55.0)
short_chain_risk_base = (
0.3
- 0.15 * (pfas_degradation_pct / 65.0)
+ 0.1 * (vfa / 1500.0)
- 0.05 * (voltage / 1.2)
)
short_chain_formation = np.clip(
short_chain_risk_base + RNG.normal(0, 0.05, n), 0.05, 0.6
)
ph_drop = np.clip(
RNG.uniform(0.1, 0.8, n) + 0.2 * (olr / 6.0) - 0.1 * (alkalinity / 5000.0),
0.0, 1.5,
)
vfa_accumulation = np.clip(
vfa * (1.0 - pfas_degradation_pct / 100 * 0.3) + RNG.normal(0, 50, n),
50.0, 2000.0,
)
orp_drift = RNG.uniform(-50.0, 50.0, n)
current_instability = np.clip(
RNG.uniform(0.0, 0.3, n) + 0.1 * (olr / 6.0),
0.0, 0.5,
)
energy_input = voltage * current * 24 / 1000
ai_score = (
0.40 * (pfas_degradation_pct / 65.0)
+ 0.30 * (defluorination_pct / 55.0)
- 0.15 * short_chain_formation
- 0.10 * (energy_input / energy_input.max())
- 0.05 * current_instability
)
ai_score = np.clip(ai_score, 0, 1)
instability_flag = (
(ph_drop > 0.8) | (vfa_accumulation > 1200) | (current_instability > 0.35)
).astype(int)
experiment_id = np.arange(1, n + 1)
df = pd.DataFrame({
"experiment_id": experiment_id,
"OLR_kg_m3_d": np.round(olr, 3),
"HRT_days": np.round(hrt, 1),
"pH": np.round(ph, 2),
"temperature_C": np.round(temperature, 1),
"COD_mg_L": np.round(cod, 0),
"SCOD_mg_L": np.round(scod, 0),
"VFA_mg_L": np.round(vfa, 0),
"alkalinity_mg_CaCO3_L": np.round(alkalinity, 0),
"voltage_V": np.round(voltage, 3),
"current_A": np.round(current, 3),
"current_density_A_m2": np.round(current_density, 2),
"conductivity_mS_cm": np.round(conductivity, 2),
"electrode_area_m2": np.round(electrode_area, 3),
"electrode_spacing_cm": np.round(electrode_spacing, 2),
"initial_PFAS_ug_L": np.round(initial_pfas, 1),
"final_PFAS_ug_L": np.round(final_pfas, 1),
"PFAS_degradation_pct": np.round(pfas_degradation_pct, 2),
"PFAS_adsorbed_sludge_ug_L": np.round(pfas_adsorbed_sludge, 2),
"PFAS_adsorbed_electrode_ug_L": np.round(pfas_adsorbed_electrode, 2),
"fluoride_release_mg_L": np.round(fluoride_release, 2),
"defluorination_pct": np.round(defluorination_pct, 2),
"short_chain_formation_ratio": np.round(short_chain_formation, 4),
"pH_drop": np.round(ph_drop, 3),
"VFA_accumulation_mg_L": np.round(vfa_accumulation, 0),
"ORP_drift_mV": np.round(orp_drift, 1),
"current_instability_index": np.round(current_instability, 4),
"energy_input_kWh_d": np.round(energy_input, 4),
"AI_score": np.round(ai_score, 4),
"instability_flag": instability_flag,
})
return df
def generate_mass_balance_data(df: pd.DataFrame) -> pd.DataFrame:
    """Build a per-experiment PFAS mass balance table from an experiments frame.

    Reads the ``experiment_id``, ``initial_PFAS_ug_L``, ``final_PFAS_ug_L``,
    ``PFAS_adsorbed_sludge_ug_L``, ``PFAS_adsorbed_electrode_ug_L`` and
    ``short_chain_formation_ratio`` columns of *df*. The mineralized amount is
    whatever is left of the initial PFAS after subtracting the other four
    pools, floored at zero; the closure percentage is the sum of all five
    pools relative to the initial amount.
    """
    initial = df["initial_PFAS_ug_L"]
    in_water = df["final_PFAS_ug_L"]
    on_sludge = df["PFAS_adsorbed_sludge_ug_L"]
    on_electrode = df["PFAS_adsorbed_electrode_ug_L"]
    byproducts = initial * df["short_chain_formation_ratio"]

    # Residual of the balance; negative residuals are floored at zero, which
    # is why the closure figure below can exceed 100 %.
    destroyed = (initial - in_water - on_sludge - on_electrode - byproducts).clip(
        lower=0
    )

    accounted = in_water + on_sludge + on_electrode + byproducts + destroyed
    closure_pct = (accounted / initial * 100).round(2)

    columns = {
        "experiment_id": df["experiment_id"],
        "initial_PFAS_ug_L": initial,
        "remaining_in_water_ug_L": in_water.round(2),
        "adsorbed_sludge_ug_L": on_sludge.round(2),
        "adsorbed_electrode_ug_L": on_electrode.round(2),
        "short_chain_products_ug_L": byproducts.round(2),
        "mineralized_PFAS_ug_L": destroyed.round(2),
        "mass_balance_closure_pct": closure_pct,
    }
    return pd.DataFrame(columns)
if __name__ == "__main__":
    # Script entry point: generate the default 120-experiment dataset and its
    # mass balance table, and write both as CSV files under ./data.
    # Imported locally because only the script path needs it.
    from pathlib import Path

    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the output directory exists before writing.
    output_dir = Path("data")
    output_dir.mkdir(parents=True, exist_ok=True)

    df = generate_sbead_dataset(120)
    df.to_csv(output_dir / "sbead_experiments.csv", index=False)
    mb = generate_mass_balance_data(df)
    mb.to_csv(output_dir / "mass_balance.csv", index=False)
    print(f"Generated {len(df)} experiments and mass balance data.")