File size: 12,687 Bytes
5eace46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea61d54
5eace46
 
 
 
ea61d54
5eace46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea61d54
5eace46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea61d54
5eace46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea61d54
5eace46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
893f11c
5eace46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
893f11c
5eace46
 
 
 
 
 
 
 
 
 
 
 
893f11c
5eace46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
893f11c
5eace46
 
 
893f11c
5eace46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
893f11c
5eace46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea61d54
5eace46
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
# Shared UI: formatting, tables, analysis bullets, exports: used on several sidebar pages
import html as _html
from typing import Dict, List, Tuple, Optional

from utils.analyze import compute_properties

def predicted_confidence(row: Dict) -> Optional[float]:
    # Confidence of the class that was actually predicted.
    # The row's "Confidence" value is read as the AMP probability: it is returned
    # unchanged when "Prediction" is "AMP" and as its complement for any other label.
    # Returns None for an empty/missing row or a missing / non-numeric "Confidence".
    if not row:
        return None
    try:
        # float(None) raises TypeError, so a missing value lands in the handler too.
        amp_prob = float(row.get("Confidence"))
    except (TypeError, ValueError):
        return None
    predicted_amp = row.get("Prediction") == "AMP"
    return amp_prob if predicted_amp else 1.0 - amp_prob


def format_conf_percent(conf_prob: float, digits: int = 1) -> str:
    # Render a probability in [0, 1] as a percent string for UI / exports.
    # The value is scaled by 100 and rounded to `digits` places; trailing zeros
    # are not padded (the rounded number is printed as-is).
    percent = round(conf_prob * 100, digits)
    return str(percent) + "%"


def heuristic_reason_for_profile(charge: float, hydro_fraction: float) -> str:
    # Pick a one-line justification for a candidate from its charge / hydrophobic profile.
    # Charge takes priority; the hydrophobic band is only consulted when charge is not > 2.
    if not charge > 2:
        balanced = 0.3 <= hydro_fraction <= 0.6
        return "Balanced hydrophobicity" if balanced else "Favorable predicted profile"
    return "High positive charge supports membrane disruption"


def choose_top_candidate(predictions: List[Dict]) -> Optional[Dict]:
    # Pick the single best prediction row and annotate it with a short profile-based reason.
    # Returns None when there are no rows, no row has a usable confidence, or the
    # winning row has no "Sequence".
    if not predictions:
        return None

    # Rows labelled "AMP" form the candidate pool when any exist; otherwise every row competes.
    amp_only = [row for row in predictions if row.get("Prediction") == "AMP"]
    pool = amp_only or predictions

    # Linear scan keeping the first row with the strictly highest predicted-class confidence.
    winner = None
    winner_conf = -1.0
    for candidate in pool:
        conf = predicted_confidence(candidate)
        if conf is not None and conf > winner_conf:
            winner, winner_conf = candidate, conf

    if winner is None:
        return None

    sequence = winner.get("Sequence", "")
    if not sequence:
        return None

    # Properties come from the shared analyzer; missing keys default to 0.
    properties = compute_properties(sequence)
    net_charge = properties.get("Net Charge (approx.)", 0)
    hydro_fraction = properties.get("Hydrophobic Fraction", 0)

    return {
        "Sequence": sequence,
        "Prediction": winner.get("Prediction"),
        "predicted_confidence": winner_conf,
        "Reason": heuristic_reason_for_profile(net_charge, hydro_fraction),
        "Charge": net_charge,
        "Hydrophobic Fraction": hydro_fraction,
    }


def mutation_heatmap_html(original: str, final: str) -> str:
    # Build an HTML snippet that shows the final sequence with changed positions in bold red.
    # Positions are compared index by index; where `final` is shorter, the original residue
    # is shown (and counts as changed). None inputs are treated as empty strings.
    before = original or ""
    after = final or ""

    # Monospace container so residues line up position by position.
    pieces: List[str] = [
        "<div style='font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, \"Liberation Mono\", monospace; white-space: pre-wrap;'>"
    ]
    for pos in range(max(len(before), len(after))):
        # One-character slices yield "" past the end of the shorter string.
        old_res = before[pos:pos + 1]
        new_res = after[pos:pos + 1]
        shown = new_res or old_res
        safe = _html.escape(shown)
        if not shown:
            pieces.append("&nbsp;")
        elif old_res != new_res:
            pieces.append(f"<span style='color:#d62728; font-weight:700;'>{safe}</span>")
        else:
            pieces.append(safe)
    pieces.append("</div>")
    return "".join(pieces)


def mutation_diff_table(original: str, final: str) -> List[Dict]:
    # Per-position comparison rows (1-based "Position") for the optimizer diff expander.
    # The shorter sequence is padded with "" so every index of the longer one gets a row.
    before = original or ""
    after = final or ""
    width = max(len(before), len(after))
    # One-character slices give "" once an index runs past the end of a string.
    pairs = [(before[i:i + 1], after[i:i + 1]) for i in range(width)]
    return [
        {
            "Position": number,
            "Original": old_res,
            "Final": new_res,
            "Changed": "No" if old_res == new_res else "Yes",
        }
        for number, (old_res, new_res) in enumerate(pairs, start=1)
    ]


def _ideal_distance_to_interval(value: float, low: float, high: float) -> float:
    # Distance from `value` to the closed interval [low, high]: 0.0 when inside,
    # otherwise the gap to whichever bound is nearer.
    inside = low <= value <= high
    if not inside:
        return (low - value) if value < low else (value - high)
    return 0.0


def optimization_summary(orig_seq: str, orig_conf: float, final_seq: str, final_conf: float) -> Dict:
    # Summarize what changed between the original and optimized sequence:
    # confidence delta (percentage points), net-charge direction, and whether the
    # hydrophobic fraction moved toward or away from the 0.4-0.5 band.
    def trend(new, old, if_higher: str, if_lower: str) -> str:
        # Three-way comparison label; anything that is neither higher nor lower is "Same".
        if new > old:
            return if_higher
        if new < old:
            return if_lower
        return "Same"

    orig_seq = orig_seq or ""
    final_seq = final_seq or ""

    # Empty sequences skip the analyzer and fall back to the 0 defaults below.
    before = compute_properties(orig_seq) if orig_seq else {}
    after = compute_properties(final_seq) if final_seq else {}

    # Charge may be stored under either key name; prefer the "(approx.)" one.
    charge_orig = before.get("Net Charge (approx.)", before.get("Net charge", 0))
    charge_final = after.get("Net Charge (approx.)", after.get("Net charge", 0))

    hydro_orig = before.get("Hydrophobic Fraction", 0)
    hydro_final = after.get("Hydrophobic Fraction", 0)

    delta_conf_pct = (float(final_conf) - float(orig_conf)) * 100.0

    charge_change = trend(charge_final, charge_orig, "Increased", "Decreased")

    # Hydrophobic balance is judged by distance to the band, so a larger distance is worse.
    band_low, band_high = 0.4, 0.5
    gap_orig = _ideal_distance_to_interval(float(hydro_orig), band_low, band_high)
    gap_final = _ideal_distance_to_interval(float(hydro_final), band_low, band_high)
    hydro_change = trend(gap_final, gap_orig, "Less optimal", "Improved balance")

    return {
        "delta_conf_pct": delta_conf_pct,
        "charge_orig": charge_orig,
        "charge_final": charge_final,
        "charge_change": charge_change,
        "hydro_orig": hydro_orig,
        "hydro_final": hydro_final,
        "hydro_change": hydro_change,
    }


def sequence_length_warning(seq: str) -> Optional[str]:
    # Advisory message when the sequence length is under 8 or over 50 residues;
    # None for empty input or lengths in between. Purely a soft guardrail.
    if not seq:
        return None
    residues = len(seq)
    if residues < 8:
        return "Too short for typical AMP"
    return "Unusually long sequence" if residues > 50 else None


def sequence_health_label(conf_prob: float, charge: float, hydro_fraction: float) -> Tuple[str, str]:
    # Map confidence plus charge / hydrophobic profile to a (label, hex color) pair
    # for the Analyze page status display.
    green, orange, red = "#2ca02c", "#ff9800", "#d62728"

    # Confidence >= 0.9 is "strong" on its own; above 0.75 it also needs
    # charge >= 2 and a hydrophobic fraction within 0.3-0.6.
    if conf_prob >= 0.9 or (
        conf_prob > 0.75 and charge >= 2 and 0.3 <= hydro_fraction <= 0.6
    ):
        return "Strong AMP candidate", green
    if conf_prob > 0.5:
        return "Moderate potential", orange
    return "Unlikely AMP", red


# Plain-language bullets for the Analyze page: rules of thumb, not a second model.
def build_analysis_insights(
    label: str,
    conf: float,
    comp: Dict[str, float],
    length: int,
    hydro: float,
    charge: float,
) -> List[str]:
    # Assemble short heuristic bullets for the Analyze page.
    #   label  - predicted class; "AMP" and "Non-AMP" are the values tested for
    #   conf   - AMP probability (complemented for the headline when label is not "AMP")
    #   comp   - per-residue fractions keyed by one-letter code; missing letters count as 0
    #   length / hydro / charge - sequence length, hydrophobic fraction, net charge
    # Returns at most 14 unique lines: a headline first, then property-based notes.
    is_amp = label == "AMP"
    is_non_amp = label == "Non-AMP"

    def residue_share(letters: str) -> float:
        # Summed fraction of the given one-letter residues.
        return sum(float(comp.get(letter, 0.0)) for letter in letters)

    # Confidence in the predicted class, as a percent rounded to one decimal.
    p_amp = float(conf)
    pred_conf = round((p_amp if is_amp else 1 - p_amp) * 100, 1)

    # Headline tier: 0 = high (>= 80), 1 = moderate (>= 60), 2 = low.
    tier = 0 if pred_conf >= 80 else (1 if pred_conf >= 60 else 2)
    if is_amp:
        headlines = (
            f"Model: **AMP** with high confidence ({pred_conf}% on this prediction). Profile below explains typical mechanisms.",
            f"Model: **AMP** with moderate confidence ({pred_conf}%); cross-check chemistry bullets before treating it as a strong hit.",
            f"Model: **AMP** but low confidence ({pred_conf}%); the mechanistic notes below matter more than the label alone.",
        )
    else:
        headlines = (
            f"Model: **Non-AMP** with high confidence ({pred_conf}% on this prediction). Below are common reasons a sequence may not behave like a classic AMP.",
            f"Model: **Non-AMP** with moderate confidence ({pred_conf}%); reasons below are typical but not exhaustive.",
            f"Model: **Non-AMP** with low confidence ({pred_conf}%); treat the label as tentative and read the property-based notes.",
        )
    bullets: List[str] = [headlines[tier]]

    polar_frac = residue_share("STNQYC")
    basic_frac = residue_share("KRH")

    # Explain weaknesses for Non-AMP calls and for AMP calls below 65% confidence.
    if is_non_amp or (is_amp and pred_conf < 65):
        weak_checks = (
            (
                charge <= 0,
                "Weak or absent **positive net charge**: many AMPs rely on cationic residues to bind **anionic bacterial surfaces** (e.g. LPS, teichoic acids); near-neutral or negative peptides often lack that first electrostatic hook.",
            ),
            (
                hydro < 0.28,
                "Low **hydrophobic** content: membrane insertion, pore formation, or lipid disruption is harder without a hydrophobic face or core to partition into the bilayer.",
            ),
            (
                hydro > 0.65,
                "Very high **hydrophobic** content: risk of aggregation or poor **aqueous solubility** before the peptide can reach bacteria, delivery and effective concentration suffer.",
            ),
            (
                polar_frac < 0.12,
                "Few **polar / H-bonding** residues (S, T, N, Q, Y, C): weaker interfacial interactions with lipids and water at the membrane. Many AMP mechanisms benefit from polar positioning at the interface.",
            ),
            (
                basic_frac < 0.06 and charge < 2,
                "Sparse **basic** residues (K, R, H): a hallmark of many AMPs is concentrated positive charge for initial **bacterial association**; this sequence is thin on that axis.",
            ),
            (
                length < 8,
                "Very **short** length: may be too small to form a stable membrane-active structure or to span a bilayer meaningfully.",
            ),
            (
                # Only considered when the "short" check above did not fire.
                not length < 8 and length > 50,
                "Unusually **long** chain: folding, proteolysis, and synthesis cost can diverge from small cationic AMP archetypes.",
            ),
            (
                # Properties look AMP-like but the model disagrees: call that out.
                is_non_amp and charge >= 2 and 0.28 <= hydro <= 0.58,
                "**Note:** Charge and hydrophobic balance still look somewhat AMP-like; the model says Non-AMP. Treat this as a **disagreement** worth validating experimentally, not proof either way.",
            ),
        )
        bullets.extend(text for flagged, text in weak_checks if flagged)

    # Supporting notes for AMP calls at 65% confidence or above.
    if is_amp and pred_conf >= 65:
        if charge >= 2 and 0.28 <= hydro <= 0.58:
            bullets.append(
                "**Positive charge** plus **moderate hydrophobic fraction** aligns with membrane-targeting motifs common in AMP literature."
            )
        if polar_frac >= 0.12:
            bullets.append(
                "Adequate **polar** residues can help **interfacial** placement and H-bonding at the membrane."
            )

    # Composition notes that apply regardless of the predicted label.
    if sum(comp.get(letter, 0) for letter in ("K", "R", "H")) >= 0.18:
        bullets.append(
            "Higher **basic** residue fraction supports **electrostatic** attraction to anionic bacterial components."
        )
    if sum(comp.get(letter, 0) for letter in ("C", "W")) >= 0.08:
        bullets.append(
            "**Cysteine / tryptophan** can contribute to membrane insertion, stacking, or oxidative chemistry depending on context."
        )

    # dict keys keep first-seen order, which drops duplicates without reordering.
    return list(dict.fromkeys(bullets))[:14]


def build_analysis_summary_text(
    sequence: str,
    prediction: str,
    confidence_display: str,
    props: Dict,
    analysis_lines: List[str],
) -> str:
    # Plain-text report for the Analyze page TXT download: six "Key: value" lines,
    # a blank line, then a "Summary:" section with one "- " bullet per analysis line.
    # Property values fall back to alternate key names, then to len(sequence) / 0.
    seq_length = props.get("Length", len(sequence))
    net_charge = props.get("Net Charge (approx.)", props.get("Net charge", 0))
    hydro_fraction = props.get("Hydrophobic Fraction", props.get("Hydrophobic", 0))

    bullets = ["- " + str(item) for item in (analysis_lines or [])]

    report_lines = [
        f"Sequence: {sequence}",
        f"Prediction: {prediction}",
        f"Confidence: {confidence_display}",
        f"Length: {seq_length}",
        f"Net Charge (approx.): {net_charge}",
        f"Hydrophobic Fraction: {hydro_fraction}",
        "",
        "Summary:",
        "\n".join(bullets),
        "",  # yields the trailing newline after the summary block
    ]
    return "\n".join(report_lines)