Spaces:

WellmatixGenAI
/

pfas-sbead-optimization

Running

App Files Files Community

pfas-sbead-optimization / utils /data_generator.py

shrut27

Upload folder using huggingface_hub

bcb2d6c verified about 16 hours ago

raw

history blame contribute delete

6.27 kB

	"""Generate realistic synthetic PFAS-SBEAD reactor experimental data (100+ records)."""

	from __future__ import annotations

	import numpy as np
	import pandas as pd

	# Module-level random generator with a fixed seed (42). It is the source of
	# every random draw in generate_sbead_dataset; because it is shared and
	# stateful, repeated calls within one process yield different data.
	RNG = np.random.default_rng(42)


	def generate_sbead_dataset(n: int = 120, seed: int | None = None) -> pd.DataFrame:
	    """Generate ``n`` synthetic PFAS degradation records for an SBEAD reactor.

	    Operating conditions are drawn from uniform ranges (per the original
	    author, these follow the "Final PFAS-SBEAD AI Pipeline" document). The
	    response variables are weighted combinations of those draws plus random
	    noise, clipped to fixed bounds.

	    Args:
	        n: Number of records to generate. Must be at least 1, because several
	            terms are normalised by the maximum of a generated column.
	        seed: Optional seed. When given, a fresh generator seeded with it is
	            used, so the call is reproducible on its own. When ``None`` (the
	            default), the shared module-level ``RNG`` is used and the result
	            depends on how many values were drawn from it earlier.

	    Returns:
	        A DataFrame with ``n`` rows and one column per simulated quantity
	        (units are encoded in the column names); values are rounded.

	    Raises:
	        ValueError: If ``n`` is less than 1.
	    """
	    if n < 1:
	        raise ValueError(f"n must be at least 1, got {n}")

	    # The order of the draws below is kept fixed so that a given generator
	    # state always produces the same dataset.
	    rng = RNG if seed is None else np.random.default_rng(seed)

	    # --- Digester operating conditions ---
	    olr = rng.uniform(1.0, 6.0, n)
	    hrt = rng.uniform(10.0, 30.0, n)
	    ph = rng.uniform(6.5, 8.0, n)
	    temperature = rng.uniform(30.0, 42.0, n)
	    cod = rng.uniform(2000.0, 8000.0, n)
	    # Soluble COD is modelled as a 55-85 % fraction of total COD.
	    scod = cod * rng.uniform(0.55, 0.85, n)
	    vfa = rng.uniform(100.0, 1500.0, n)
	    alkalinity = rng.uniform(1500.0, 5000.0, n)

	    # --- Electrochemical settings ---
	    voltage = rng.uniform(0.2, 1.2, n)
	    # Current scales with voltage through a random factor.
	    current = voltage * rng.uniform(0.5, 3.0, n)
	    electrode_area = rng.uniform(0.5, 2.0, n)
	    current_density = current / electrode_area
	    conductivity = rng.uniform(1.0, 8.0, n)
	    electrode_spacing = rng.uniform(1.0, 5.0, n)

	    initial_pfas = rng.uniform(50.0, 500.0, n)

	    # --- PFAS degradation: weighted sum of normalised drivers plus noise ---
	    base_degradation = (
	        0.12
	        + 0.08 * (voltage / 1.2)
	        + 0.06 * (current_density / current_density.max())
	        + 0.05 * ((ph - 6.5) / 1.5)
	        + 0.04 * (hrt / 30.0)
	        + 0.03 * (olr / 6.0)
	        + 0.02 * ((temperature - 30.0) / 12.0)
	    )
	    noise = rng.normal(0, 0.04, n)
	    pfas_degradation_pct = np.clip((base_degradation + noise) * 100, 5.0, 65.0)

	    final_pfas = initial_pfas * (1 - pfas_degradation_pct / 100)
	    pfas_adsorbed_sludge = initial_pfas * rng.uniform(0.02, 0.12, n)
	    pfas_adsorbed_electrode = initial_pfas * rng.uniform(0.01, 0.08, n)

	    # --- Fluoride release and defluorination ---
	    fluoride_release = (
	        pfas_degradation_pct * rng.uniform(0.3, 0.7, n)
	        + voltage * rng.uniform(2.0, 8.0, n)
	    )
	    defluorination_pct = np.clip(
	        fluoride_release / (initial_pfas * 0.15) * 100, 2.0, 55.0
	    )

	    # --- Short-chain by-product ratio ---
	    # Decreases with degradation and voltage, increases with VFA.
	    short_chain_risk_base = (
	        0.3
	        - 0.15 * (pfas_degradation_pct / 65.0)
	        + 0.1 * (vfa / 1500.0)
	        - 0.05 * (voltage / 1.2)
	    )
	    short_chain_formation = np.clip(
	        short_chain_risk_base + rng.normal(0, 0.05, n), 0.05, 0.6
	    )

	    # --- Process stability indicators ---
	    ph_drop = np.clip(
	        rng.uniform(0.1, 0.8, n) + 0.2 * (olr / 6.0) - 0.1 * (alkalinity / 5000.0),
	        0.0,
	        1.5,
	    )
	    vfa_accumulation = np.clip(
	        vfa * (1.0 - pfas_degradation_pct / 100 * 0.3) + rng.normal(0, 50, n),
	        50.0,
	        2000.0,
	    )
	    orp_drift = rng.uniform(-50.0, 50.0, n)
	    current_instability = np.clip(
	        rng.uniform(0.0, 0.3, n) + 0.1 * (olr / 6.0),
	        0.0,
	        0.5,
	    )

	    # --- Energy use and composite score ---
	    # V * A gives watts; * 24 h / 1000 converts to kWh per day.
	    energy_input = voltage * current * 24 / 1000
	    # Rewards degradation and defluorination; penalises short-chain
	    # formation, relative energy use and current instability.
	    ai_score = (
	        0.40 * (pfas_degradation_pct / 65.0)
	        + 0.30 * (defluorination_pct / 55.0)
	        - 0.15 * short_chain_formation
	        - 0.10 * (energy_input / energy_input.max())
	        - 0.05 * current_instability
	    )
	    ai_score = np.clip(ai_score, 0, 1)

	    # A record is flagged as unstable when any one indicator crosses its
	    # threshold (element-wise OR on the unrounded values).
	    instability_flag = (
	        (ph_drop > 0.8) | (vfa_accumulation > 1200) | (current_instability > 0.35)
	    ).astype(int)

	    experiment_id = np.arange(1, n + 1)

	    df = pd.DataFrame({
	        "experiment_id": experiment_id,
	        "OLR_kg_m3_d": np.round(olr, 3),
	        "HRT_days": np.round(hrt, 1),
	        "pH": np.round(ph, 2),
	        "temperature_C": np.round(temperature, 1),
	        "COD_mg_L": np.round(cod, 0),
	        "SCOD_mg_L": np.round(scod, 0),
	        "VFA_mg_L": np.round(vfa, 0),
	        "alkalinity_mg_CaCO3_L": np.round(alkalinity, 0),
	        "voltage_V": np.round(voltage, 3),
	        "current_A": np.round(current, 3),
	        "current_density_A_m2": np.round(current_density, 2),
	        "conductivity_mS_cm": np.round(conductivity, 2),
	        "electrode_area_m2": np.round(electrode_area, 3),
	        "electrode_spacing_cm": np.round(electrode_spacing, 2),
	        "initial_PFAS_ug_L": np.round(initial_pfas, 1),
	        "final_PFAS_ug_L": np.round(final_pfas, 1),
	        "PFAS_degradation_pct": np.round(pfas_degradation_pct, 2),
	        "PFAS_adsorbed_sludge_ug_L": np.round(pfas_adsorbed_sludge, 2),
	        "PFAS_adsorbed_electrode_ug_L": np.round(pfas_adsorbed_electrode, 2),
	        "fluoride_release_mg_L": np.round(fluoride_release, 2),
	        "defluorination_pct": np.round(defluorination_pct, 2),
	        "short_chain_formation_ratio": np.round(short_chain_formation, 4),
	        "pH_drop": np.round(ph_drop, 3),
	        "VFA_accumulation_mg_L": np.round(vfa_accumulation, 0),
	        "ORP_drift_mV": np.round(orp_drift, 1),
	        "current_instability_index": np.round(current_instability, 4),
	        "energy_input_kWh_d": np.round(energy_input, 4),
	        "AI_score": np.round(ai_score, 4),
	        "instability_flag": instability_flag,
	    })
	    return df


	def generate_mass_balance_data(df: pd.DataFrame) -> pd.DataFrame:
	    """Build a per-experiment PFAS mass balance table from an experiments frame.

	    Reads the initial, final, sludge-adsorbed and electrode-adsorbed PFAS
	    columns together with the short-chain formation ratio. The mineralised
	    amount is whatever remains of the initial PFAS after subtracting the
	    other four pools, floored at zero. The closure percentage is the sum of
	    all five pools relative to the initial amount.
	    """
	    initial = df["initial_PFAS_ug_L"]
	    in_water = df["final_PFAS_ug_L"]
	    on_sludge = df["PFAS_adsorbed_sludge_ug_L"]
	    on_electrode = df["PFAS_adsorbed_electrode_ug_L"]
	    short_chain = initial * df["short_chain_formation_ratio"]

	    # Residual after the four other pools; negative residuals are set to 0.
	    residual = initial - in_water - on_sludge - on_electrode - short_chain
	    mineralized = residual.clip(lower=0)

	    accounted = in_water + on_sludge + on_electrode + short_chain + mineralized
	    closure_pct = accounted / initial * 100

	    columns = {
	        "experiment_id": df["experiment_id"],
	        "initial_PFAS_ug_L": initial,
	        "remaining_in_water_ug_L": in_water.round(2),
	        "adsorbed_sludge_ug_L": on_sludge.round(2),
	        "adsorbed_electrode_ug_L": on_electrode.round(2),
	        "short_chain_products_ug_L": short_chain.round(2),
	        "mineralized_PFAS_ug_L": mineralized.round(2),
	        "mass_balance_closure_pct": closure_pct.round(2),
	    }
	    return pd.DataFrame(columns)


	if __name__ == "__main__":
	    from pathlib import Path

	    # DataFrame.to_csv does not create missing parent directories, so make
	    # sure the output folder exists before writing.
	    output_dir = Path("data")
	    output_dir.mkdir(parents=True, exist_ok=True)

	    # Write the experiments table and the mass balance derived from it.
	    df = generate_sbead_dataset(120)
	    df.to_csv(output_dir / "sbead_experiments.csv", index=False)
	    mb = generate_mass_balance_data(df)
	    mb.to_csv(output_dir / "mass_balance.csv", index=False)
	    print(f"Generated {len(df)} experiments and mass balance data.")