# File size: 2,283 Bytes
from math import isnan, sqrt

import pandas as pd
def summarize_metrics(skip, csv_path: str, save_path: str | None = None) -> pd.DataFrame:
"""
Compute mean and standard deviation for all columns except the first
(assumed non-numeric identifier like 'Peptide Sequence').
Returns a DataFrame with rows = column names and columns = ['mean','std','count'].
Uses sample std (ddof=1). Non-numeric cells are coerced to NaN.
"""
df = pd.read_csv(csv_path)
vals = df.iloc[:, skip:].apply(pd.to_numeric, errors='coerce') # columns 2..end
stats = vals.agg(['mean', 'std', 'count']).T # shape: (num_metrics, 3)
if save_path:
stats.to_csv(save_path, index=True)
return stats
def summarize_list(xs, ddof=1):
    """
    Compute mean, standard deviation and count of the numeric items in ``xs``.

    Items that are None, the empty string, not convertible with ``float()``,
    or NaN are skipped, so a missing value cannot turn the whole summary
    into NaN.

    Parameters:
        xs: iterable of values (numbers, numeric strings, None, ...).
        ddof: delta degrees of freedom; the variance divisor is
            ``count - ddof`` (1 gives the sample standard deviation).

    Returns:
        dict with keys "mean", "std" and "count", computed over the values
        that were kept.

    Raises:
        ValueError: if no numeric value remains, or if the number of kept
            values is not greater than ``ddof``.
    """
    # Clean & coerce to float
    vals = []
    for x in xs:
        if x is None or x == "":
            continue
        try:
            v = float(x)
        except (TypeError, ValueError):
            continue
        # float() happily returns NaN (e.g. for "nan" or a missing pandas
        # cell); a single NaN would make every statistic below NaN.
        if isnan(v):
            continue
        vals.append(v)
    n = len(vals)
    if n == 0:
        raise ValueError("No numeric values found.")
    if n <= ddof:
        raise ValueError(f"Need at least {ddof + 1} numeric values; got {n}.")
    # Welford's algorithm (one pass, stable)
    mean = 0.0
    M2 = 0.0  # running sum of squared deviations from the current mean
    count = 0
    for v in vals:
        count += 1
        delta = v - mean
        mean += delta / count
        # Uses the deviation from the old mean times the deviation from the
        # updated mean.
        M2 += delta * (v - mean)
    var = M2 / (count - ddof)
    std = sqrt(var)
    return {"mean": mean, "std": std, "count": count}
def csv_column_to_list(path: str, column: str, *, dropna: bool = True):
    """
    Load one column of a CSV file as a plain Python list.

    Raises KeyError, listing the available columns, when ``column`` is not a
    header of the file. Missing values are removed unless ``dropna`` is False.
    """
    frame = pd.read_csv(path)
    available = frame.columns
    if column not in available:
        raise KeyError(f"Column '{column}' not found. Available: {list(available)}")
    series = frame[column]
    return (series.dropna() if dropna else series).tolist()
def main():
    """
    Summarize the generation results of one run and print the table.

    Reads ``{path}/{prot_name}_generation_results.csv``, leaving out its first
    column, writes the summary to ``{path}/results_summary.csv`` and prints
    it. The run directory and protein name are hard-coded below.
    """
    path = "/scratch/pranamlab/sophtang/home/TR2-D2/tr2d2-pep/results/tfr_resample10_buffer20_numiter10_children50_20260326_183626"
    prot_name = "tfr"
    stats = summarize_metrics(
        skip=1,
        csv_path=f"{path}/{prot_name}_generation_results.csv",
        save_path=f"{path}/results_summary.csv",
    )
    print(stats)
# Run the summary only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()