svincoff committed
Commit fad71ca · 1 Parent(s): 1927f8f

ananya updates

dpacman/classifier/model/clustering_data.py ADDED
@@ -0,0 +1,383 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import numpy as np
4
+ import pandas as pd
5
+ from pathlib import Path
6
+ import random
7
+ import sys
8
+ import subprocess
9
+ from collections import defaultdict
10
+
11
+ # ─────────────────────────────────────────────────────────────────────────
12
+ # Original helpers (kept; some lightly edited/commented where needed)
13
+ # ─────────────────────────────────────────────────────────────────────────
14
+
15
+ def read_ids_file(p):
16
+ p = Path(p)
17
+ if not p.exists():
18
+ raise FileNotFoundError(f"IDs file not found: {p}")
19
+ return [line.strip() for line in p.open() if line.strip()]
20
+
21
+ def split_embeddings(emb_path, ids_path, out_dir, prefix):
22
+ out_dir = Path(out_dir)
23
+ out_dir.mkdir(parents=True, exist_ok=True)
24
+
25
+ if not Path(emb_path).exists():
26
+ raise FileNotFoundError(f"Embedding file not found: {emb_path}")
27
+ if not Path(ids_path).exists():
28
+ raise FileNotFoundError(f"IDs file not found: {ids_path}")
29
+
30
+ if emb_path.endswith(".npz"):
31
+ data = np.load(emb_path, allow_pickle=True)
32
+ if "embeddings" in data:
33
+ emb = data["embeddings"]
34
+ else:
35
+ raise ValueError(f"{emb_path} missing 'embeddings' key")
36
+ else:
37
+ emb = np.load(emb_path)
38
+
39
+ ids = read_ids_file(ids_path)
40
+ if len(ids) != emb.shape[0]:
41
+ print(f"[WARN] length mismatch: {len(ids)} ids vs {emb.shape[0]} embeddings in {emb_path}", file=sys.stderr)
42
+
43
+ mapping = {}
44
+ for i, ident in enumerate(ids):
45
+ if i >= emb.shape[0]:
46
+ print(f"[WARN] skipping {ident}: no embedding at index {i}", file=sys.stderr)
47
+ continue
48
+ arr = emb[i]
49
+ out_file = out_dir / f"{prefix}_{ident}.npy"
50
+ np.save(out_file, arr)
51
+ mapping[ident] = str(out_file)
52
+ return mapping
53
+
54
+ def extract_symbol_from_tf_id(full_id: str) -> str:
55
+ """
56
+ Given a TF embedding ID like 'sp|O15062|ZBTB5_HUMAN' or 'ZBTB5_HUMAN',
57
+ return the gene symbol uppercase (e.g., 'ZBTB5').
58
+ """
59
+ if "|" in full_id:
60
+ try:
61
+ # format sp|Accession|SYMBOL_HUMAN
62
+ genepart = full_id.split("|")[2]
63
+ except IndexError:
64
+ genepart = full_id
65
+ else:
66
+ genepart = full_id
67
+ symbol = genepart.split("_")[0]
68
+ return symbol.upper()
69
+
70
+ def build_tf_symbol_map(tf_map):
71
+ """
72
+ Build mapping gene_symbol -> list of embedding paths.
73
+ """
74
+ symbol_map = {}
75
+ for full_id, path in tf_map.items():
76
+ symbol = extract_symbol_from_tf_id(full_id)
77
+ symbol_map.setdefault(symbol, []).append(path)
78
+ return symbol_map
79
+
80
+ def tf_key_from_path(path: str) -> str:
81
+ """
82
+ Given a path like .../tf_sp|O15062|ZBTB5_HUMAN.npy, extract normalized symbol 'ZBTB5'.
83
+ """
84
+ stem = Path(path).stem # e.g., tf_sp|O15062|ZBTB5_HUMAN
85
+ # remove leading prefix if present (tf_)
86
+ if "_" in stem:
87
+ _, rest = stem.split("_", 1)
88
+ else:
89
+ rest = stem
90
+ return extract_symbol_from_tf_id(rest)
91
+
92
+ def dna_key_from_path(path: str) -> str:
93
+ """
94
+ Given .../dna_peak42.npy -> 'peak42'
95
+ """
96
+ stem = Path(path).stem
97
+ if "_" in stem:
98
+ _, rest = stem.split("_", 1)
99
+ else:
100
+ rest = stem
101
+ return rest
102
+
103
+ # ─────────────────────────────────────────────────────────────────────────
104
+ # New helpers for MMseqs clustering & cluster-level splitting
105
+ # ─────────────────────────────────────────────────────────────────────────
106
+
107
+ def write_dna_fasta(df: pd.DataFrame, out_fasta: Path) -> None:
108
+ """
109
+ Write unique DNA sequences to FASTA using dna_id as header.
110
+ Requires df with columns: dna_id, dna_sequence
111
+ """
112
+ uniq = df[["dna_id", "dna_sequence"]].drop_duplicates()
113
+ with open(out_fasta, "w") as f:
114
+ for _, row in uniq.iterrows():
115
+ did = row["dna_id"]
116
+ seq = str(row["dna_sequence"]).upper().replace(" ", "").replace("\n", "")
117
+ f.write(f">{did}\n{seq}\n")
118
+
119
+ def run_mmseqs_easy_cluster(
120
+ mmseqs_bin: str,
121
+ fasta: Path,
122
+ out_prefix: Path,
123
+ tmp_dir: Path,
124
+ min_seq_id: float,
125
+ coverage: float,
126
+ cov_mode: int,
127
+ ) -> Path:
128
+ """
129
+ Runs mmseqs easy-cluster on nucleotide sequences.
130
+ Returns the path to a clusters TSV file (creating it if the default one isn't present).
131
+ """
132
+ tmp_dir.mkdir(parents=True, exist_ok=True)
133
+ out_prefix.parent.mkdir(parents=True, exist_ok=True)
134
+
135
+ cmd = [
136
+ mmseqs_bin, "easy-cluster",
137
+ str(fasta), str(out_prefix), str(tmp_dir),
138
+ "--min-seq-id", str(min_seq_id),
139
+ "-c", str(coverage),
140
+ "--cov-mode", str(cov_mode),
141
+ # You can add performance flags here if needed, e.g.:
142
+ # "--threads", "8"
143
+ ]
144
+ print("[i] Running:", " ".join(cmd), flush=True)
145
+ subprocess.run(cmd, check=True)
146
+
147
+ # MMseqs easy-cluster typically writes <out_prefix>_cluster.tsv
148
+ default_tsv = Path(str(out_prefix) + "_cluster.tsv")
149
+ if default_tsv.exists():
150
+ print(f"[i] Found cluster TSV: {default_tsv}")
151
+ return default_tsv
152
+
153
+ # Fallback: try createtsv if default is missing
154
+ # This requires the internal DBs. easy-cluster creates DBs alongside out_prefix.
155
+ # We'll try to locate them and emit a TSV.
156
+ in_db = Path(str(out_prefix) + "_query")
157
+ cl_db = Path(str(out_prefix) + "_cluster")
158
+ out_tsv = Path(str(out_prefix) + "_fallback_cluster.tsv")
159
+ if in_db.exists() and cl_db.exists():
160
+ cmd2 = [mmseqs_bin, "createtsv", str(in_db), str(in_db), str(cl_db), str(out_tsv)]
161
+ print("[i] Creating TSV via createtsv:", " ".join(cmd2), flush=True)
162
+ subprocess.run(cmd2, check=True)
163
+ if out_tsv.exists():
164
+ return out_tsv
165
+
166
 + raise FileNotFoundError("Could not locate clusters TSV from mmseqs. "
168
 + f"Expected {default_tsv} or createtsv fallback.")
168
+
169
+ def parse_mmseqs_clusters(tsv_path: Path) -> dict:
170
+ """
171
+ Parse MMseqs cluster TSV (rep \t member). Returns dna_id -> cluster_rep_id
172
+ """
173
+ mapping = {}
174
+ with open(tsv_path) as f:
175
+ for line in f:
176
+ parts = line.rstrip("\n").split("\t")
177
+ if len(parts) < 2:
178
+ continue
179
+ rep, member = parts[0], parts[1]
180
+ mapping[member] = rep
181
+ # Some TSVs include rep->rep; if not, ensure rep is mapped to itself:
182
+ if rep not in mapping:
183
+ mapping[rep] = rep
184
+ return mapping
185
+
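 + # Illustrative sketch (not part of the pipeline output): given TSV lines of the form
 + #   rep<TAB>member, e.g.
 + #     peak0   peak0
 + #     peak0   peak7
 + #     peak3   peak3
 + # parse_mmseqs_clusters returns {"peak0": "peak0", "peak7": "peak0", "peak3": "peak3"},
 + # i.e. every member (including the representative itself) maps to its cluster representative.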
186
+ def assign_clusters_to_splits(cluster_rep_to_members: dict,
187
+ val_frac: float,
188
+ test_frac: float,
189
+ seed: int = 42):
190
+ """
191
+ cluster_rep_to_members: dict[rep] = [members...]
192
+ Returns: dict with keys 'train','val','test' mapping to sets of dna_id.
193
+ Ensures all members of a cluster go to the same split.
194
+ """
195
+ rng = random.Random(seed)
196
+ reps = list(cluster_rep_to_members.keys())
197
+ rng.shuffle(reps)
198
+
199
+ # Greedy-ish fill by total member counts to match desired fractions.
200
+ total = sum(len(cluster_rep_to_members[r]) for r in reps)
201
+ target_val = int(round(total * val_frac))
202
+ target_test = int(round(total * test_frac))
203
+ cur_val = cur_test = 0
204
+
205
+ val_ids, test_ids, train_ids = set(), set(), set()
206
+ for rep in reps:
207
+ members = cluster_rep_to_members[rep]
208
+ c = len(members)
209
+ # Fill val first, then test, then train
210
+ if cur_val + c <= target_val:
211
+ val_ids.update(members); cur_val += c
212
+ elif cur_test + c <= target_test:
213
+ test_ids.update(members); cur_test += c
214
+ else:
215
+ train_ids.update(members)
216
+
217
+ return {"train": train_ids, "val": val_ids, "test": test_ids}
218
+
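 + # Worked sketch (hypothetical counts): with 100 members total, val_frac=0.10 and test_frac=0.10,
 + # clusters are visited in shuffled order; whole clusters are placed in val until ~10 members are
 + # reached, then in test until ~10 more, and every remaining cluster goes to train. Because the
 + # assignment is per cluster, near-identical sequences (members of one MMseqs cluster) never
 + # straddle train and val/test.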
219
+ # ─────────────────────────────────────────────────────────────────────────
220
+ # Main
221
+ # ─────────────────────────────────────────────────────────────────────────
222
+
223
+ def main():
224
+ parser = argparse.ArgumentParser(
225
+ description="Build TF-DNA pair lists with MMseqs clustering on DNA to prevent split leakage."
226
+ )
227
+ parser.add_argument("--final_csv", required=True, help="final.csv with TF_id and dna_sequence")
228
+ parser.add_argument("--dna_embed_npz", required=True, help="DNA embedding file (.npy or .npz)")
229
+ parser.add_argument("--dna_ids", required=True, help="IDs file for DNA embeddings (peak*.ids)")
230
+ parser.add_argument("--tf_embed_npy", required=True, help="TF embedding file (.npy or .npz)")
231
+ parser.add_argument("--tf_ids", required=True, help="IDs file for TF embeddings (sp|... ids)")
232
+ parser.add_argument("--out_dir", required=True, help="Output directory")
233
+ parser.add_argument("--seed", type=int, default=42)
234
+
235
+ # NEW: MMseqs options & split fractions
236
+ parser.add_argument("--mmseqs_bin", default="mmseqs", help="Path to mmseqs binary")
237
+ parser.add_argument("--min_seq_id", type=float, default=0.9, help="MMseqs --min-seq-id")
238
+ parser.add_argument("--cov", type=float, default=0.8, help="MMseqs -c coverage fraction")
239
+ parser.add_argument("--cov_mode", type=int, default=1, help="MMseqs --cov-mode (1 = coverage of target)")
240
+ parser.add_argument("--val_frac", type=float, default=0.10)
241
+ parser.add_argument("--test_frac", type=float, default=0.10)
242
+ parser.add_argument("--tmp_dir", default=None, help="MMseqs tmp dir (defaults to out_dir/tmp)")
243
+ args = parser.parse_args()
244
+
245
+ random.seed(args.seed)
246
+ out_dir = Path(args.out_dir)
247
+ out_dir.mkdir(parents=True, exist_ok=True)
248
+
249
+ # Load final.csv
250
+ df = pd.read_csv(args.final_csv, dtype=str)
251
+ if "TF_id" not in df.columns or "dna_sequence" not in df.columns:
252
+ raise RuntimeError("final.csv must have columns TF_id and dna_sequence")
253
+
254
+ # Assign dna_id (unique per dna_sequence)
255
+ unique_seqs = df["dna_sequence"].drop_duplicates().tolist()
256
+ seq_to_id = {seq: f"peak{i}" for i, seq in enumerate(unique_seqs)}
257
+ df["dna_id"] = df["dna_sequence"].map(seq_to_id)
258
+ enriched_csv = out_dir / "final_with_dna_id.csv"
259
+ df.to_csv(enriched_csv, index=False)
260
+ print(f"[i] Wrote augmented final.csv with dna_id to {enriched_csv}")
261
+
262
+ # Split embeddings into per-item files (unchanged)
263
+ print(f"[i] Splitting DNA embeddings from {args.dna_embed_npz} with ids {args.dna_ids}")
264
+ dna_map = split_embeddings(args.dna_embed_npz, args.dna_ids, out_dir / "dna_single", "dna")
265
+ print(f"[i] DNA embeddings available: {len(dna_map)} (sample: {list(dna_map.keys())[:10]})")
266
+ print(f"[i] Splitting TF embeddings from {args.tf_embed_npy} with ids {args.tf_ids}")
267
+ tf_map = split_embeddings(args.tf_embed_npy, args.tf_ids, out_dir / "tf_single", "tf")
268
+ print(f"[i] TF embeddings available: {len(tf_map)} (sample: {list(tf_map.keys())[:10]})")
269
+
270
+ # Build gene-symbol normalized map
271
+ tf_symbol_map = build_tf_symbol_map(tf_map)
272
+ print(f"[i] TF symbol map keys (sample): {list(tf_symbol_map.keys())[:30]}")
273
+
274
+ # Diagnostic overlaps
275
+ norm_tf_in_final = set(t.split("_seq")[0].upper() for t in df["TF_id"].unique())
276
+ available_tf_symbols = set(tf_symbol_map.keys())
277
+ intersect_tf = norm_tf_in_final & available_tf_symbols
278
+ print(f"[i] Unique normalized TF symbols in final.csv: {len(norm_tf_in_final)}")
279
+ print(f"[i] Available TF embedding symbols: {len(available_tf_symbols)}")
280
+ print(f"[i] Intersection count: {len(intersect_tf)}")
281
+ if len(intersect_tf) == 0:
282
+ print("[ERROR] No overlap between normalized TF_id and TF embedding symbols.", file=sys.stderr)
283
+ print("Sample normalized TFs from final.csv:", sorted(list(norm_tf_in_final))[:30], file=sys.stderr)
284
+ print("Sample available TF symbols:", sorted(list(available_tf_symbols))[:30], file=sys.stderr)
285
+ sys.exit(1)
286
+
287
+ dna_ids_final = set(df["dna_id"].unique())
288
+ available_dna_ids = set(dna_map.keys())
289
+ intersect_dna = dna_ids_final & available_dna_ids
290
+ print(f"[i] Unique dna_id in final.csv: {len(dna_ids_final)}. Available DNA ids: {len(available_dna_ids)}. Intersection: {len(intersect_dna)}")
291
+ if len(intersect_dna) == 0:
292
+ print("[ERROR] No overlap on DNA ids.", file=sys.stderr)
293
+ sys.exit(1)
294
+
295
+ # ── NEW: MMseqs clustering on DNA sequences ───────────────────────────
296
+ fasta_path = out_dir / "dna_unique.fasta"
297
+ write_dna_fasta(df, fasta_path)
298
 + print(f"[i] Wrote FASTA with {df['dna_id'].nunique()} unique sequences → {fasta_path}")
299
+
300
+ tmp_dir = Path(args.tmp_dir) if args.tmp_dir else (out_dir / "mmseqs_tmp")
301
+ cluster_prefix = out_dir / "mmseqs_dna_clusters"
302
+ clusters_tsv = run_mmseqs_easy_cluster(
303
+ mmseqs_bin=args.mmseqs_bin,
304
+ fasta=fasta_path,
305
+ out_prefix=cluster_prefix,
306
+ tmp_dir=tmp_dir,
307
+ min_seq_id=args.min_seq_id,
308
+ coverage=args.cov,
309
+ cov_mode=args.cov_mode,
310
+ )
311
+
312
+ # Parse clusters
313
+ member_to_rep = parse_mmseqs_clusters(clusters_tsv) # dna_id -> rep_id
314
+ # Build rep -> members list
315
+ rep_to_members = defaultdict(list)
316
+ for member, rep in member_to_rep.items():
317
+ rep_to_members[rep].append(member)
318
+
319
+ print(f"[i] Parsed {len(rep_to_members)} clusters from {clusters_tsv}")
320
+ clusters_table = []
321
+ for rep, members in rep_to_members.items():
322
+ for m in members:
323
+ clusters_table.append((m, rep))
324
+ clusters_df = pd.DataFrame(clusters_table, columns=["dna_id", "cluster_id"])
325
+ clusters_df.to_csv(out_dir / "clusters.tsv", sep="\t", index=False)
326
 + print(f"[i] Wrote clusters mapping → {out_dir / 'clusters.tsv'}")
327
+
328
+ # Attach cluster_id back to final df
329
+ df = df.merge(clusters_df, on="dna_id", how="left")
330
+ df.to_csv(out_dir / "final_with_dna_id_and_cluster.csv", index=False)
331
+ print(f"[i] Wrote {out_dir / 'final_with_dna_id_and_cluster.csv'}")
332
+
333
+ # Assign entire clusters to splits
334
+ splits = assign_clusters_to_splits(rep_to_members,
335
+ val_frac=args.val_frac,
336
+ test_frac=args.test_frac,
337
+ seed=args.seed)
338
+ for k in ["train", "val", "test"]:
339
+ print(f"[i] {k}: {len(splits[k])} dna_ids")
340
+
341
+ # ── Build positive pairs only, per split (NO negatives) ───────────────
342
+ positives_by_split = {"train": [], "val": [], "test": []}
343
+ # Build a quick dna_id -> embedding path map
344
+ dnaid_to_path = {did: path for did, path in dna_map.items()}
345
+
346
+ pos_count = 0
347
+ for _, row in df.iterrows():
348
+ tf_raw = row["TF_id"]
349
+ tf_symbol = tf_raw.split("_seq")[0].upper()
350
+ dnaid = row["dna_id"]
351
+ if (tf_symbol not in tf_symbol_map) or (dnaid not in dnaid_to_path):
352
+ continue
353
+ tf_embedding_path = tf_symbol_map[tf_symbol][0] # first embedding per symbol
354
+
355
+ # decide split by dna_id cluster assignment
356
+ if dnaid in splits["train"]:
357
+ positives_by_split["train"].append((tf_embedding_path, dnaid_to_path[dnaid], 1))
358
+ elif dnaid in splits["val"]:
359
+ positives_by_split["val"].append((tf_embedding_path, dnaid_to_path[dnaid], 1))
360
+ elif dnaid in splits["test"]:
361
+ positives_by_split["test"].append((tf_embedding_path, dnaid_to_path[dnaid], 1))
362
+ pos_count += 1
363
+
364
+ print(f"[i] Constructed positives across splits (rows in final.csv iterated: {len(df)})")
365
+ for k in ["train", "val", "test"]:
366
+ print(f"[i] positives[{k}] = {len(positives_by_split[k])}")
367
+
368
+ # # OLD: negatives (kept commented)
369
+ # negatives = []
370
+ # print(f"[i] Sampled {len(negatives)} negatives (neg_per_positive not used)")
371
+
372
+ # Emit split-specific pair lists
373
+ for split in ["train", "val", "test"]:
374
+ out_tsv = out_dir / f"pair_list_{split}.tsv"
375
+ with open(out_tsv, "w") as f:
376
+ for binder_path, glm_path, label in positives_by_split[split]: # + negatives if you add later
377
+ f.write(f"{binder_path}\t{glm_path}\t{label}\n")
378
+ print(f"[i] Wrote {len(positives_by_split[split])} examples to {out_tsv}")
379
+
380
 + print("✅ Done. Cluster-aware splits ready.")
381
+
382
+ if __name__ == "__main__":
383
+ main()
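 +
 + # Example invocation (a sketch; the file paths below are placeholders, not files in this repo):
 + #   python clustering_data.py \
 + #     --final_csv final.csv \
 + #     --dna_embed_npz embeddings/peaks_caduceus.npy --dna_ids embeddings/peaks_caduceus.ids \
 + #     --tf_embed_npy embeddings/tf_esm-dbp.npy --tf_ids embeddings/tf_esm-dbp.ids \
 + #     --out_dir pairs_out --min_seq_id 0.9 --cov 0.8 --cov_mode 1
 + # Outputs pair_list_train.tsv, pair_list_val.tsv and pair_list_test.tsv under --out_dir.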
dpacman/classifier/model/compute_embeddings.py CHANGED
@@ -25,6 +25,9 @@ from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, pipelin
25
  import esm
26
  from Bio import SeqIO
27
  import time
 
 
 
28
 
29
  # ---- model wrappers ----
30
 
@@ -67,7 +70,7 @@ class CaduceusEmbedder:
67
 
68
  # return np.stack(outputs, axis=0) # (N, L, D)
69
  outputs = []
70
- for seq in seqs:
71
  toks = self.tokenizer(
72
  seq,
73
  return_tensors="pt",
@@ -471,34 +474,45 @@ def embed_and_save(seqs, ids, embedder, out_path):
471
  if __name__=="__main__":
472
 
473
  p = argparse.ArgumentParser()
474
- p.add_argument("--peak-fasta", default="binding_peaks_unique.fa", help="FASTA of deduplicated binding peak sequences; if present this is used for DNA embedding instead of genome JSONs")
475
  p.add_argument("--genome-json-dir", default=None, help="(fallback) directory of UCSC JSONs for full chromosome embedding if peak FASTA is missing or you explicitly want chromosomes")
476
  p.add_argument("--skip-dna", action="store_true", help="if set, skip the chromosome embedding step") #if glm embeddings successful but not plm embeddings
477
  p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
478
  p.add_argument("--chrom-model", default="caduceus")
479
  p.add_argument("--tf-model", default="esm-dbp")
480
- p.add_argument("--out-dir", default="data_files/processed/tfclust/hg38_tf/embeddings")
481
  p.add_argument("--device", default="cpu")
482
  args = p.parse_args()
483
 
484
  os.makedirs(args.out_dir, exist_ok=True)
485
  device = args.device
 
486
 
487
  if not args.skip_dna:
488
- peak_fasta = Path(args.peak_fasta)
489
- if peak_fasta.exists():
490
- # Load peak sequences from FASTA
491
- from Bio import SeqIO
492
-
493
- peak_seqs = []
494
- peak_ids = []
495
- for rec in SeqIO.parse(peak_fasta, "fasta"):
496
- peak_ids.append(rec.id)
497
- peak_seqs.append(str(rec.seq))
498
- print(f"Embedding {len(peak_seqs)} binding peak sequences from {peak_fasta}", flush=True)
499
  dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
500
  out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
501
  embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
502
  elif args.genome_json_dir:
503
  # Legacy: load full chromosomes from JSONs (chr1–22, X, Y, M)
504
  genome_dir = Path(args.genome_json_dir)
 
25
  import esm
26
  from Bio import SeqIO
27
  import time
28
+ import pandas as pd
29
+ from tqdm.auto import tqdm
30
+ import logging, math
31
 
32
  # ---- model wrappers ----
33
 
 
70
 
71
  # return np.stack(outputs, axis=0) # (N, L, D)
72
  outputs = []
73
+ for seq in tqdm(seqs, total=len(seqs), desc="DNA: Caduceus", dynamic_ncols=True):
74
  toks = self.tokenizer(
75
  seq,
76
  return_tensors="pt",
 
474
  if __name__=="__main__":
475
 
476
  p = argparse.ArgumentParser()
477
+ #p.add_argument("--peak_fasta", default="binding_peaks_unique.fa", help="FASTA of deduplicated binding peak sequences; if present this is used for DNA embedding instead of genome JSONs")
478
  p.add_argument("--genome-json-dir", default=None, help="(fallback) directory of UCSC JSONs for full chromosome embedding if peak FASTA is missing or you explicitly want chromosomes")
479
  p.add_argument("--skip-dna", action="store_true", help="if set, skip the chromosome embedding step") #if glm embeddings successful but not plm embeddings
480
  p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
481
  p.add_argument("--chrom-model", default="caduceus")
482
  p.add_argument("--tf-model", default="esm-dbp")
483
+ p.add_argument("--out-dir", default="dpacman/model/embeddings")
484
  p.add_argument("--device", default="cpu")
485
  args = p.parse_args()
486
 
487
  os.makedirs(args.out_dir, exist_ok=True)
488
  device = args.device
489
+ print(device)
490
 
491
  if not args.skip_dna:
492
 + if args.genome_json_dir is None:
493
+ dna_df = pd.read_parquet('/home/a03-akrishna/DPACMAN/dpacman/model/remap2022_crm_fimo_output_q_processed.parquet', engine='pyarrow')
494
+ #df.to_csv('/home/a03-akrishna/DPACMAN/dpacman/model/remap2022_crm_fimo_output_q_processed.csv', index=False)
495
+ peak_seqs = dna_df["dna_sequence"]
496
+ peak_ids = dna_df["ID"]
497
+ print(f"Embedding {len(peak_seqs)} binding peak sequences from processed remap data", flush=True)
 
 
 
 
 
498
  dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
499
  out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
500
  embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
501
+
502
+ # peak_fasta = Path(args.peak_fasta)
503
+ # if peak_fasta.exists():
504
+ # # Load peak sequences from FASTA
505
+ # from Bio import SeqIO
506
+
507
+ # peak_seqs = []
508
+ # peak_ids = []
509
+ # for rec in SeqIO.parse(peak_fasta, "fasta"):
510
+ # peak_ids.append(rec.id)
511
+ # peak_seqs.append(str(rec.seq))
512
+ # print(f"Embedding {len(peak_seqs)} binding peak sequences from {peak_fasta}", flush=True)
513
+ # dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
514
+ # out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
515
+ # embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
516
  elif args.genome_json_dir:
517
  # Legacy: load full chromosomes from JSONs (chr1–22, X, Y, M)
518
  genome_dir = Path(args.genome_json_dir)
dpacman/classifier/model/model.py CHANGED
@@ -39,7 +39,7 @@ class CrossModalBlock(nn.Module):
39
  def forward(self, binder: torch.Tensor, glm: torch.Tensor):
40
  """
41
  binder: (batch, Lb, dim)
42
- glm: (batch, Lg, dim) -- has passed through its local CNN beforehand
43
  returns: updated binder representation (batch, Lb, dim)
44
  """
45
  # binder self-attn + ffn
@@ -63,16 +63,50 @@ class CrossModalBlock(nn.Module):
63
  c = self.ln_c2(c + c_ff)
64
  return c # (batch, Lb, dim)
65
 
66
  class BindPredictor(nn.Module):
67
  def __init__(self,
68
- input_dim: int = 256,
 
 
 
69
  hidden_dim: int = 256,
70
  heads: int = 8,
71
  num_layers: int = 4,
72
  use_local_cnn_on_glm: bool = True):
73
  super().__init__()
74
- self.proj_binder = nn.Linear(input_dim, hidden_dim)
75
- self.proj_glm = nn.Linear(input_dim, hidden_dim)
76
  self.use_local_cnn = use_local_cnn_on_glm
77
  self.local_cnn = LocalCNN(hidden_dim) if use_local_cnn_on_glm else nn.Identity()
78
 
@@ -81,23 +115,28 @@ class BindPredictor(nn.Module):
81
  ])
82
 
83
  self.ln_out = nn.LayerNorm(hidden_dim)
84
- self.head = nn.Sequential(
85
- nn.Linear(hidden_dim, 1),
86
- nn.Sigmoid()
87
- )
88
 
89
  def forward(self, binder_emb, glm_emb):
90
  """
91
- binder_emb, glm_emb: (batch, L, input_dim)
 
 
92
  """
93
- b = self.proj_binder(binder_emb) # (B, Lb, hidden_dim)
94
- g = self.proj_glm(glm_emb) # (B, Lg, hidden_dim)
 
 
 
 
95
  if self.use_local_cnn:
96
- g = self.local_cnn(g) # local context injected
97
 
 
98
  for layer in self.layers:
99
- b = layer(b, g) # update binder with cross-modal info
100
 
101
- pooled = b.mean(dim=1) # (B, hidden_dim)
102
- out = self.ln_out(pooled)
103
- return self.head(out).squeeze(-1) # (B,)
 
39
  def forward(self, binder: torch.Tensor, glm: torch.Tensor):
40
  """
41
  binder: (batch, Lb, dim)
42
+ glm: (batch, Lg, dim) -- has passed through its local CNN beforehand
43
  returns: updated binder representation (batch, Lb, dim)
44
  """
45
  # binder self-attn + ffn
 
63
  c = self.ln_c2(c + c_ff)
64
  return c # (batch, Lb, dim)
65
 
66
+ class DimCompressor(nn.Module):
67
+ """
68
+ Learnable per-token compressor: maps any in_dim >= out_dim to out_dim (default 256).
69
+ If in_dim == out_dim, behaves as identity.
70
+ """
71
+ def __init__(self, in_dim: int, out_dim: int = 256):
72
+ super().__init__()
73
+ if in_dim == out_dim:
74
+ self.net = nn.Identity()
75
+ else:
76
+ hidden = max(out_dim * 2, (in_dim + out_dim) // 2)
77
+ self.net = nn.Sequential(
78
+ nn.LayerNorm(in_dim),
79
+ nn.Linear(in_dim, hidden),
80
+ nn.GELU(),
81
+ nn.Linear(hidden, out_dim),
82
+ )
83
+
84
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
85
+ # x: (B, L, in_dim)
86
+ return self.net(x)
87
+
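 + # Shape sketch (dims assumed from the defaults below): DimCompressor(1280, 256) maps
 + # (B, L, 1280) -> (B, L, 256) via LayerNorm -> Linear -> GELU -> Linear, while
 + # DimCompressor(256, 256) reduces to nn.Identity(), so 256-d inputs pass through unchanged.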
88
  class BindPredictor(nn.Module):
89
  def __init__(self,
90
+ # input_dim: int = 256, # OLD: single input dim
91
+ binder_input_dim: int = 1280, # NEW: TF (binder) original dim (e.g., 1280)
92
+ glm_input_dim: int = 256, # NEW: DNA/GLM original dim (e.g., 256)
93
+ compressed_dim: int = 256, # NEW: learnable compressed dim
94
  hidden_dim: int = 256,
95
  heads: int = 8,
96
  num_layers: int = 4,
97
  use_local_cnn_on_glm: bool = True):
98
  super().__init__()
99
+ # OLD:
100
+ # self.proj_binder = nn.Linear(input_dim, hidden_dim)
101
+ # self.proj_glm = nn.Linear(input_dim, hidden_dim)
102
+
103
 + # NEW: learnable compressor for binder → 256, then project to hidden
104
+ self.binder_compress = DimCompressor(binder_input_dim, out_dim=compressed_dim)
105
+ self.proj_binder = nn.Linear(compressed_dim, hidden_dim)
106
+
107
 + # GLM side stays 256 → hidden
108
+ self.proj_glm = nn.Linear(glm_input_dim, hidden_dim)
109
+
110
  self.use_local_cnn = use_local_cnn_on_glm
111
  self.local_cnn = LocalCNN(hidden_dim) if use_local_cnn_on_glm else nn.Identity()
112
 
 
115
  ])
116
 
117
  self.ln_out = nn.LayerNorm(hidden_dim)
118
+ # self.head = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid()) # OLD: returned probabilities
119
+ self.head = nn.Linear(hidden_dim, 1) # NEW: return logits (safe for AMP)
 
 
120
 
121
  def forward(self, binder_emb, glm_emb):
122
  """
123
+ binder_emb: (B, Lb, binder_input_dim)
124
+ glm_emb: (B, Lg, glm_input_dim)
125
+ Returns per-nucleotide logits for the GLM sequence: (B, Lg)
126
  """
127
 + # Binder: learnable compression → 256 → hidden
128
+ b = self.binder_compress(binder_emb) # (B, Lb, 256)
129
+ b = self.proj_binder(b) # (B, Lb, hidden_dim)
130
+
131
 + # GLM: project → hidden, add local CNN context
132
+ g = self.proj_glm(glm_emb) # (B, Lg, hidden_dim)
133
  if self.use_local_cnn:
134
+ g = self.local_cnn(g)
135
 
136
+ # Cross-modal blocks: update binder states using GLM
137
  for layer in self.layers:
138
+ b = layer(b, g) # (B, Lb, hidden_dim)
139
 
140
+ # Predict per-nucleotide logits on the GLM tokens:
141
+ # return self.head(g).squeeze(-1) # OLD: probabilities (with Sigmoid in head)
142
+ return self.head(g).squeeze(-1) # NEW: logits (apply sigmoid only in loss/metrics)
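 +
 + # Minimal shape check (a sketch, using the dims train.py passes in):
 + #   model = BindPredictor(binder_input_dim=1280, glm_input_dim=256, compressed_dim=256,
 + #                         hidden_dim=256, heads=8, num_layers=4, use_local_cnn_on_glm=True)
 + #   logits = model(torch.randn(2, 700, 1280), torch.randn(2, 300, 256))   # -> (2, 300)
 + #   probs  = torch.sigmoid(logits)   # sigmoid is applied only in the loss/metrics, not in the head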
dpacman/classifier/model/train.py CHANGED
@@ -1,262 +1,383 @@
1
- #!/usr/bin/env python3
2
- import argparse
 
3
  import numpy as np
 
4
  import torch
5
  from torch import nn
6
- from torch.utils.data import Dataset, DataLoader
7
- from model import BindPredictor
8
- from pathlib import Path
9
- from collections import Counter
10
- from sklearn.metrics import roc_auc_score, average_precision_score
11
- from sklearn.decomposition import TruncatedSVD
12
- import random
13
- import sys
14
 
15
- # ---- dataset ---------------------------------------------------------
16
- class PairDataset(Dataset):
17
- def __init__(self, binder_paths, glm_paths, labels, tf_compressed_cache):
18
- """
19
- tf_compressed_cache: dict mapping binder_path -> compressed (256-d) tensor/array
20
- """
21
- assert len(binder_paths) == len(glm_paths) == len(labels)
22
- self.binder_paths = binder_paths
23
- self.glm_paths = glm_paths
24
- self.labels = labels
25
- self.tf_cache = tf_compressed_cache # already reduced to 256
26
-
27
- def __len__(self):
28
- return len(self.labels)
29
-
30
- def __getitem__(self, idx):
31
- # binder = TF embedding (possibly reduced)
32
- b = self.tf_cache[self.binder_paths[idx]] # numpy array shape (L, 256) or (256,)
33
- g = np.load(self.glm_paths[idx]) # glm (DNA) embedding
34
-
35
- if b.ndim == 1:
36
- b = b[None, :]
37
- if g.ndim == 1:
38
- g = g[None, :]
39
-
40
- b_tensor = torch.from_numpy(b).float()
41
- g_tensor = torch.from_numpy(g).float()
42
- y = torch.tensor(self.labels[idx]).float()
43
- return b_tensor, g_tensor, y
44
 
45
- def collate_fn(batch):
46
- binders, glms, labels = zip(*batch)
47
- binder_lens = [b.shape[0] for b in binders]
48
- glm_lens = [g.shape[0] for g in glms]
49
- max_b = max(binder_lens)
50
- max_g = max(glm_lens)
51
-
52
- def pad_seq(seq, target_len):
53
- L, D = seq.shape
54
- if L < target_len:
55
- pad = torch.zeros((target_len - L, D), dtype=seq.dtype, device=seq.device)
56
- return torch.cat([seq, pad], dim=0)
57
- return seq
58
-
59
- b_padded = torch.stack([pad_seq(b, max_b) for b in binders]) # (B, Lb, D)
60
- g_padded = torch.stack([pad_seq(g, max_g) for g in glms]) # (B, Lg, D)
61
- y = torch.stack(labels)
62
- return b_padded, g_padded, y
63
-
64
- # ---- utilities -------------------------------------------------------
65
- def parse_pair_list(pair_list_path):
66
- binder_paths, glm_paths, labels = [], [], []
67
- with open(pair_list_path) as f:
68
- for lineno, line in enumerate(f, start=1):
69
- if not line.strip():
70
- continue
71
  parts = line.strip().split()
72
- if len(parts) != 3:
73
- print(f"[WARN] skipping malformed line {lineno}: {line.strip()}", file=sys.stderr)
74
- continue
75
- b, g, l = parts
76
- try:
77
- lab = int(l)
78
- except ValueError:
79
- print(f"[WARN] invalid label on line {lineno}: {l}", file=sys.stderr)
80
- continue
81
- binder_paths.append(b)
82
- glm_paths.append(g)
83
- labels.append(lab)
84
- return binder_paths, glm_paths, labels
85
-
86
- def build_tf_compressed_cache(binder_paths, target_dim=256):
87
  """
88
- Load all unique TF (binder) embeddings, fit reduction if needed, and return dict mapping path->(L, target_dim) array.
89
  """
90
- unique_paths = sorted(set(binder_paths))
91
- print(f"[i] Found {len(unique_paths)} unique TF embedding files to compress.", flush=True)
92
- # Load all embeddings to determine dimensionality
93
- samples = []
94
- for p in unique_paths:
95
- arr = np.load(p)
96
- samples.append(arr)
97
- # Determine if reduction needed: assume all have same embedding width
98
- first = samples[0]
99
- orig_dim = first.shape[1] if first.ndim == 2 else 1
100
- reduction_needed = (orig_dim != target_dim)
101
- tf_cache = {}
102
-
103
- if reduction_needed:
104
- # Build matrix to fit SVD: we need a 2D matrix per embedding; if lengths vary we can't directly stack.
105
- # We'll do reduction per sequence individually using TruncatedSVD on concatenated flattened features:
106
- # Simplest: for variable lengths, reduce each embedding separately with a learned linear projection.
107
- # Here we fit a single TruncatedSVD on the concatenation of all sequence tokens (flattened) by padding/truncating to a fixed length.
108
- # To avoid complexity, use PCA-like linear projection learned via SVD on mean-pooled vectors:
109
- pooled = []
110
- for arr in samples:
111
- if arr.ndim == 2:
112
- pooled.append(arr.mean(axis=0)) # (orig_dim,)
113
- else:
114
- pooled.append(arr) # degenerate
115
- pooled_mat = np.stack(pooled, axis=0) # (N, orig_dim)
116
- print(f"[i] Fitting TruncatedSVD on TF pooled embeddings: {pooled_mat.shape} -> {target_dim}", flush=True)
117
- svd = TruncatedSVD(n_components=target_dim, random_state=42)
118
- reduced_pooled = svd.fit_transform(pooled_mat) # (N, target_dim)
119
-
120
- # For each original embedding, project token-level vectors by multiplying token vector with svd.components_.T
121
- # svd.components_: (target_dim, orig_dim) so projection matrix is (orig_dim, target_dim)
122
- proj_mat = svd.components_.T # (orig_dim, target_dim)
123
- for i, p in enumerate(unique_paths):
124
- arr = samples[i] # shape (L, orig_dim)
125
- if arr.ndim == 1:
126
- arr2 = arr @ proj_mat # (target_dim,)
127
- else:
128
- # project each token: (L, orig_dim) @ (orig_dim, target_dim) -> (L, target_dim)
129
- arr2 = arr @ proj_mat
130
- tf_cache[p] = arr2 # reduced per-token representation
131
- print("[i] Completed compression of TF embeddings.", flush=True)
132
- else:
133
- # already correct dim: just cache originals
134
- print(f"[i] TF embeddings already {target_dim}-dimensional; skipping reduction.", flush=True)
135
- for i, p in enumerate(unique_paths):
136
- arr = samples[i]
137
- tf_cache[p] = arr
138
- return tf_cache
139
-
140
- def evaluate(model, dl, device):
141
  model.eval()
142
- all_labels = []
143
- all_preds = []
144
  with torch.no_grad():
145
- for b, g, y in dl:
146
- b = b.to(device)
147
- g = g.to(device)
148
- y = y.to(device)
149
- pred = model(b, g)
150
- all_labels.append(y.cpu())
151
- all_preds.append(pred.cpu())
152
- if not all_labels:
153
- return 0.0, 0.0
154
- y_true = torch.cat(all_labels).numpy()
155
- y_score = torch.cat(all_preds).numpy()
156
- try:
157
- auc = roc_auc_score(y_true, y_score)
158
- except Exception:
159
- auc = 0.0
160
- try:
161
- ap = average_precision_score(y_true, y_score)
162
- except Exception:
163
- ap = 0.0
164
- return auc, ap
165
-
166
- # ---- main ------------------------------------------------------------
167
  def main():
168
- parser = argparse.ArgumentParser()
169
- parser.add_argument("--pair_list", type=str, required=True,
170
- help="TSV: binder_path glm_path label")
171
- parser.add_argument("--out_dir", type=str, required=True)
172
- parser.add_argument("--epochs", type=int, default=10)
173
- parser.add_argument("--batch_size", type=int, default=32)
174
- parser.add_argument("--lr", type=float, default=1e-4)
175
- parser.add_argument("--device", type=str, default="cuda")
176
- parser.add_argument("--seed", type=int, default=42)
177
- args = parser.parse_args()
178
-
179
180
  random.seed(args.seed)
181
  np.random.seed(args.seed)
182
  torch.manual_seed(args.seed)
183
-
184
- print("DEBUG: starting training script with in-line TF compression", flush=True)
185
- print(f"[i] pair_list: {args.pair_list}", flush=True)
186
- print(f"[i] output dir: {args.out_dir}", flush=True)
187
  device = torch.device(args.device if torch.cuda.is_available() else "cpu")
188
- binder_paths, glm_paths, labels = parse_pair_list(args.pair_list)
189
 
190
- if len(labels) == 0:
191
- print("[ERROR] No valid pairs parsed. Exiting.", file=sys.stderr)
192
- sys.exit(1)
 
 
 
193
 
194
- label_counts = Counter(labels)
195
- print(f"[i] Total examples parsed: {len(labels)}. Label distribution: {label_counts}", flush=True)
 
 
196
 
197
- # build compressed TF cache (reduces to 256 if needed)
198
- tf_compressed_cache = build_tf_compressed_cache(binder_paths, target_dim=256)
 
 
 
 
 
199
 
200
- # simple split: 80/10/10
201
- n = len(labels)
202
- idxs = np.arange(n)
203
- np.random.shuffle(idxs)
204
- train_i = idxs[: int(0.8 * n)]
205
- val_i = idxs[int(0.8 * n): int(0.9 * n)]
206
- test_i = idxs[int(0.9 * n):]
207
 
208
- def subset(idxs):
209
- return [binder_paths[i] for i in idxs], [glm_paths[i] for i in idxs], [labels[i] for i in idxs]
 
 
 
210
 
211
- train_ds = PairDataset(*subset(train_i), tf_compressed_cache=tf_compressed_cache)
212
- val_ds = PairDataset(*subset(val_i), tf_compressed_cache=tf_compressed_cache)
213
- test_ds = PairDataset(*subset(test_i), tf_compressed_cache=tf_compressed_cache)
214
 
215
- print(f"[i] Train/Val/Test sizes: {len(train_ds)}/{len(val_ds)}/{len(test_ds)}", flush=True)
216
- if len(train_ds) == 0 or len(val_ds) == 0:
217
- print("[ERROR] Train or validation split is empty; cannot proceed.", file=sys.stderr)
218
- sys.exit(1)
219
 
220
- train_dl = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)
221
- val_dl = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)
222
- test_dl = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)
223
 
224
- model = BindPredictor(input_dim=256, hidden_dim=256, heads=8, num_layers=3, use_local_cnn_on_glm=True)
225
- model = model.to(device)
226
- optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=1e-3)
227
- loss_fn = nn.BCELoss()
228
 
229
- best_val = -float("inf")
230
- os_out = Path(args.out_dir)
231
- os_out.mkdir(exist_ok=True, parents=True)
 
232
 
233
- for epoch in range(1, args.epochs + 1):
234
- print(f"[Epoch {epoch}] starting...", flush=True)
235
  model.train()
236
- running_loss = 0.0
237
- for b, g, y in train_dl:
238
- b = b.to(device)
239
- g = g.to(device)
240
- y = y.to(device)
241
- pred = model(b, g)
242
- loss = loss_fn(pred, y)
243
- optimizer.zero_grad()
244
- loss.backward()
245
- optimizer.step()
246
- running_loss += loss.item() * b.size(0)
247
- train_loss = running_loss / len(train_ds)
248
- val_auc, val_ap = evaluate(model, val_dl, device)
249
- print(f"[Epoch {epoch}] train_loss={train_loss:.4f} val_auc={val_auc:.4f} val_ap={val_ap:.4f}", flush=True)
250
-
251
- if val_auc > best_val:
252
- best_val = val_auc
253
- torch.save(model.state_dict(), os_out / "best_model.pt")
254
- print(f"[Epoch {epoch}] Saved new best model with val_auc={val_auc:.4f}", flush=True)
255
-
256
- torch.save(model.state_dict(), os_out / "last_model.pt")
257
- test_auc, test_ap = evaluate(model, test_dl, device)
258
- print(f"FINAL TEST: AUC={test_auc:.4f} AP={test_ap:.4f}", flush=True)
259
- print(f"[i] Models written to {os_out}/best_model.pt and last_model.pt", flush=True)
260
-
261
- if __name__ == "__main__":
262
  main()
 
1
+ import argparse, random, sys
2
+ from pathlib import Path
3
+
4
  import numpy as np
5
+ import pandas as pd
6
  import torch
7
  from torch import nn
8
+ from torch.utils.data import Dataset, DataLoader, Sampler
9
+ # from sklearn.random_projection import GaussianRandomProjection # OLD (kept): projection was removed earlier
10
+ import matplotlib.pyplot as plt
 
 
 
 
 
11
 
12
+ import torch.amp as amp
13
+ from torch.nn import functional as F
14
+ from model import BindPredictor
15
 
16
+ # ─────────────── utilities ────────────────────────────────────────────────
17
+ def parse_pair_list(path):
18
+ binders, glms = [], []
19
+ with open(path) as f:
20
+ for ln, line in enumerate(f,1):
21
  parts = line.strip().split()
22
+ if len(parts) < 2: continue
23
+ b,g = parts[0], parts[1]
24
+ binders.append(b); glms.append(g)
25
+ return binders, glms
26
+
27
+ class ListBatchSampler(Sampler):
28
+ def __init__(self, batches): self.batches = batches
29
+ def __iter__(self): return iter(self.batches)
30
+ def __len__(self): return len(self.batches)
31
+
32
+ def make_buckets(idxs, glm_paths, batch_size, n_buckets=10, seed=42):
33
+ rng = random.Random(seed)
34
+ lengths = [(i, np.load(glm_paths[i]).shape[0]) for i in idxs]
35
+ lengths.sort(key=lambda x: x[1])
36
+ size = max(1, int(np.ceil(len(lengths)/n_buckets)))
37
+ buckets = [lengths[i:i+size] for i in range(0,len(lengths),size)]
38
+ batches = []
39
+ for bucket in buckets:
40
+ ids = [i for i,_ in bucket]
41
+ rng.shuffle(ids)
42
+ for i in range(0,len(ids),batch_size):
43
+ batches.append(ids[i:i+batch_size])
44
+ rng.shuffle(batches)
45
+ return batches
46
+
47
+ def dna_key_from_path(path: str) -> str:
48
+ """.../dna_peak42.npy -> 'peak42'"""
49
+ stem = Path(path).stem
50
+ if "_" in stem:
51
+ _, rest = stem.split("_", 1)
52
+ else:
53
+ rest = stem
54
+ return rest
55
+
56
+ def build_tf_cache(tf_paths, target_dim=256):
57
  """
58
+ Load raw TF embeddings without projecting; compression is learnable in the model.
59
  """
60
+ unique = sorted(set(tf_paths))
61
+ print(f"[i] (Learnable) Preparing {len(unique)} TF files; target {target_dim}d inside the model", flush=True)
62
+
63
+ pools, raw = [], []
64
+ for p in unique:
65
+ arr = np.load(p) # (L, D) or (D,)
66
+ raw.append(arr)
67
+ pools.append(arr.mean(axis=0) if arr.ndim==2 else arr)
68
+ M = np.stack(pools,0)
69
+ orig_dim = M.shape[1]
70
 + print(f"[i] Pooled shape → {M.shape} (orig_dim={orig_dim})", flush=True)
71
+
72
+ cache = {}
73
+ for i,p in enumerate(unique):
74
+ arr = raw[i]
75
+ # OLD: projection here (removed)
76
+ cache[p] = arr
77
+ print("[i] TF cache ready (raw); compression will be learned.", flush=True)
78
+ return cache
79
+
80
+ # ─────────────── Dataset & Collation ─────────────────────────────────────
81
+ class PairDataset(Dataset):
82
+ def __init__(self, tf_paths, dna_paths, final_df, tf_cache):
83
+ self.tf_paths, self.dna_paths = tf_paths, dna_paths
84
+ self.tf_cache = tf_cache
85
+ self.targets = {}
86
+ for _, row in final_df.iterrows():
87
+ dna_id = row["dna_id"]
88
+ vec = np.array(list(map(float, row["score_sig_r2"].split(","))), dtype=np.float32)
89
+ self.targets[dna_id] = vec
90
+
91
+ def __len__(self): return len(self.tf_paths)
92
+
93
+ def __getitem__(self, i):
94
+ b = self.tf_cache[self.tf_paths[i]] # (L_b, D_b) or (D_b,)
95
+ if b.ndim==1: b = b[None,:]
96
+ g = np.load(self.dna_paths[i]) # (L_g, 256) or (256,)
97
+ if g.ndim==1: g = g[None,:]
98
+
99
+ stem = Path(self.dna_paths[i]).stem
100
+ dna_id = stem.replace("dna_","")
101
+ t = self.targets.get(dna_id, np.zeros(g.shape[0],dtype=np.float32))
102
+
103
+ return torch.from_numpy(b).float(), \
104
+ torch.from_numpy(g).float(), \
105
+ torch.from_numpy(t).float()
106
+
107
+ def collate_fn(batch):
108
+ Bs = [b.shape[0] for b,_,_ in batch]
109
+ Gs = [g.shape[0] for _,g,_ in batch]
110
+ maxB, maxG = max(Bs), max(Gs)
111
+
112
+ def pad_seq(x, L):
113
+ if x.shape[0] < L:
114
+ pad = torch.zeros((L-x.shape[0], x.shape[1]), dtype=x.dtype, device=x.device)
115
+ return torch.cat([x, pad], dim=0)
116
+ return x
117
+
118
+ def pad_t(y, L):
119
+ if y.shape[0] < L:
120
+ pad = torch.zeros((L-y.shape[0],), dtype=y.dtype, device=y.device)
121
+ return torch.cat([y, pad], dim=0)
122
+ return y
123
+
124
+ b_stack = torch.stack([pad_seq(b, maxB) for b,_,_ in batch])
125
+ g_stack = torch.stack([pad_seq(g, maxG) for _,g,_ in batch])
126
+ t_stack = torch.stack([pad_t(t, maxG) for *_,t in batch])
127
+ return b_stack, g_stack, t_stack
128
+
129
+ # ─────────────── losses, metrics ─────────────────────────────────────────
130
+ def combined_loss_components(logits, targets, peak_thresh=0.5, eps=1e-8):
131
+ probs = torch.sigmoid(logits)
132
+ labels = (targets >= peak_thresh).float()
133
+ non_peak_mask = (labels == 0).float()
134
+ peak_mask = (labels == 1).float()
135
+
136
+ bce_all = F.binary_cross_entropy_with_logits(logits, labels, reduction='none')
137
+ bce_non = (bce_all * non_peak_mask)
138
+ bce_non = bce_non.sum() / (non_peak_mask.sum() + eps)
139
+
140
+ mse_peaks = F.mse_loss(probs * peak_mask, targets * peak_mask, reduction='sum') \
141
+ / (peak_mask.sum() + eps)
142
+
143
+ t_dist = (targets + eps)
144
+ p_dist = (probs + eps)
145
+ t_dist = t_dist / t_dist.sum(dim=1, keepdim=True)
146
+ p_dist = p_dist / p_dist.sum(dim=1, keepdim=True)
147
+ kl = (t_dist * (t_dist.clamp(min=eps).log() - p_dist.clamp(min=eps).log())).sum(dim=1).mean()
148
+
149
+ return bce_non, kl, mse_peaks, probs
150
+
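 + # Loss sketch: positions with target >= peak_thresh count as peaks. bce_non penalizes predicted
 + # signal on non-peak positions, mse_peaks fits the continuous scores at peak positions, and kl
 + # matches the normalized per-sequence score distribution; the training loop below combines them
 + # as alpha*bce_non + beta*kl + gamma*mse_peaks.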
151
+ def accuracy_percentage(logits, targets, peak_thresh=0.5):
152
+ probs = torch.sigmoid(logits)
153
+ preds_bin = (probs >= 0.5).float()
154
+ labels = (targets >= peak_thresh).float()
155
+ correct = (preds_bin == labels).float().sum()
156
+ total = torch.numel(labels)
157
+ return (correct / max(1, total)).item() * 100.0
158
+
159
+ def evaluate(model, dl, device, alpha, beta, gamma, peak_thresh, eps=1e-8):
160
  model.eval()
161
+ tot_loss, tot_acc = 0.0, 0.0
162
+ n_batches = 0
163
  with torch.no_grad():
164
+ for b,g,t in dl:
165
+ b,g,t = b.to(device), g.to(device), t.to(device)
166
+ logits = model(b,g)
167
+ bce_non, kl, mse_peaks, _ = combined_loss_components(logits, t, peak_thresh=peak_thresh, eps=eps)
168
+ loss = alpha*bce_non + beta*kl + gamma*mse_peaks
169
+ acc = accuracy_percentage(logits, t, peak_thresh=peak_thresh)
170
+ tot_loss += loss.item(); tot_acc += acc; n_batches += 1
171
+ if n_batches == 0: return float("nan"), float("nan")
172
+ return tot_loss / n_batches, tot_acc / n_batches
173
+
174
+ # ─────────────── cluster-aware splitting ──────────────────────────────────
175
+ def assign_clusters_to_splits(cluster_to_indices, val_frac=0.10, test_frac=0.10, seed=42):
176
+ """
177
+ cluster_to_indices: dict[cluster_id] -> list of example indices (from pair_list) in that cluster
178
+ We greedily pack whole clusters into val/test until hitting targets (#examples), rest to train.
179
+ """
180
+ rng = random.Random(seed)
181
+ clusters = list(cluster_to_indices.items())
182
+ rng.shuffle(clusters)
183
+
184
+ total = sum(len(ixs) for _, ixs in clusters)
185
+ target_val = int(round(total * val_frac))
186
+ target_test = int(round(total * test_frac))
187
+ cur_val = cur_test = 0
188
+
189
+ tr_ix, va_ix, te_ix = [], [], []
190
+ for cid, ixs in clusters:
191
+ c = len(ixs)
192
+ if cur_val + c <= target_val:
193
+ va_ix.extend(ixs); cur_val += c
194
+ elif cur_test + c <= target_test:
195
+ te_ix.extend(ixs); cur_test += c
196
+ else:
197
+ tr_ix.extend(ixs)
198
+ return tr_ix, va_ix, te_ix
199
+
200
+ # ─────────────── train & main ────────────────────────────────────────────
201
  def main():
202
+ p = argparse.ArgumentParser()
203
+ p.add_argument("--pair_list", required=True)
204
+ p.add_argument("--final_csv", required=True)
205
+ p.add_argument("--out_dir", required=True)
206
+ p.add_argument("--epochs", type=int, default=10)
207
+ p.add_argument("--batch_size", type=int, default=16)
208
+ p.add_argument("--accum_steps", type=int, default=4)
209
+ p.add_argument("--lr", type=float, default=1e-4)
210
+ p.add_argument("--device", default="cuda")
211
+ p.add_argument("--seed", type=int, default=42)
212
+ p.add_argument("--alpha", type=float, default=0.5)
213
+ p.add_argument("--beta", type=float, default=0.6)
214
+ p.add_argument("--gamma", type=float, default=0.6)
215
+ p.add_argument("--peak_thresh", type=float, default=0.5)
216
+ # NEW: fractions for cluster-aware split (used only if cluster_id present)
217
+ p.add_argument("--val_frac", type=float, default=0.10)
218
+ p.add_argument("--test_frac", type=float, default=0.10)
219
+ args = p.parse_args()
220
+
221
  random.seed(args.seed)
222
  np.random.seed(args.seed)
223
  torch.manual_seed(args.seed)
 
 
 
 
224
  device = torch.device(args.device if torch.cuda.is_available() else "cpu")
 
225
 
226
+ # 1) load pair list & final.csv (now may include cluster_id)
227
+ tf_paths, dna_paths = parse_pair_list(args.pair_list)
228
+ final_df = pd.read_csv(args.final_csv, dtype=str)
229
+ print(f"[i] Loaded {len(tf_paths)} pairs", flush=True)
230
+
231
+ tf_cache = build_tf_cache(tf_paths, target_dim=256)
232
 
233
+ # detect binder/DNA dims
234
+ sample_tf = tf_cache[tf_paths[0]]
235
+ binder_input_dim = sample_tf.shape[1] if sample_tf.ndim == 2 else sample_tf.shape[0]
236
+ glm_input_dim = 256
237
 
238
+ # 2) cluster-aware split if possible
239
+ use_cluster_split = ("cluster_id" in final_df.columns)
240
+ if use_cluster_split:
241
+ print("[i] Cluster column detected in final_csv; performing cluster-aware split.", flush=True)
242
+ # build dna_id -> cluster_id map
243
+ cid_map = (final_df[["dna_id","cluster_id"]].dropna().drop_duplicates()
244
+ .set_index("dna_id")["cluster_id"].to_dict())
245
 
246
+ # map each example (by index) to its dna_id and cluster
247
+ example_dna_ids = [dna_key_from_path(p) for p in dna_paths]
248
+ example_clusters = []
249
+ missing = 0
250
+ for did in example_dna_ids:
251
+ if did in cid_map:
252
+ example_clusters.append(cid_map[did])
253
+ else:
254
+ # fallback: treat singleton cluster
255
+ example_clusters.append(f"singleton::{did}")
256
+ missing += 1
257
+ if missing:
258
+ print(f"[WARN] {missing} dna_ids from pair_list not found in cluster map; treating as singleton clusters.", flush=True)
259
+
260
+ # build cluster -> indices
261
+ cluster_to_indices = {}
262
+ for i, cid in enumerate(example_clusters):
263
+ cluster_to_indices.setdefault(cid, []).append(i)
264
 
265
+ tr_idx, va_idx, te_idx = assign_clusters_to_splits(
266
+ cluster_to_indices,
267
+ val_frac=args.val_frac, test_frac=args.test_frac, seed=args.seed
268
+ )
269
+ print(f"[i] Cluster split sizes (examples): train={len(tr_idx)} val={len(va_idx)} test={len(te_idx)}", flush=True)
270
 
271
+ # helper to subset paths
272
+ def subset_by_indices(ixs):
273
+ return [tf_paths[i] for i in ixs], [dna_paths[i] for i in ixs]
274
 
275
+ tr_t, tr_d = subset_by_indices(tr_idx)
276
+ va_t, va_d = subset_by_indices(va_idx)
277
+ te_t, te_d = subset_by_indices(te_idx)
278
+
279
+ else:
280
+ print("[i] No cluster_id in final_csv; using random 80/10/10 split (OLD behavior).", flush=True)
281
+ # OLD random split (kept, now under else)
282
+ N = len(tf_paths)
283
+ idxs = list(range(N)); random.shuffle(idxs)
284
+ n_tr = int(0.8*N); n_va = int(0.1*N)
285
+ tr, va, te = idxs[:n_tr], idxs[n_tr:n_tr+n_va], idxs[n_tr+n_va:]
286
 
287
+ def subset(idxs_):
288
+ return [tf_paths[i] for i in idxs_], [dna_paths[i] for i in idxs_]
 
289
 
290
+ tr_t, tr_d = subset(tr)
291
+ va_t, va_d = subset(va)
292
+ te_t, te_d = subset(te)
 
293
 
294
+ # 3) bucketed samplers (unchanged, but now use the cluster-aware subsets when available)
295
+ tr_bs = make_buckets(list(range(len(tr_t))), tr_d, args.batch_size, n_buckets=10, seed=args.seed)
296
+ va_bs = make_buckets(list(range(len(va_t))), va_d, args.batch_size, n_buckets=5, seed=args.seed+1)
297
+ te_bs = make_buckets(list(range(len(te_t))), te_d, args.batch_size, n_buckets=5, seed=args.seed+2)
298
 
299
+ tr_dl = DataLoader(PairDataset(tr_t, tr_d, final_df, tf_cache),
300
+ batch_sampler=ListBatchSampler(tr_bs),
301
+ collate_fn=collate_fn)
302
+ va_dl = DataLoader(PairDataset(va_t, va_d, final_df, tf_cache),
303
+ batch_sampler=ListBatchSampler(va_bs),
304
+ collate_fn=collate_fn)
305
+ te_dl = DataLoader(PairDataset(te_t, te_d, final_df, tf_cache),
306
+ batch_sampler=ListBatchSampler(te_bs),
307
+ collate_fn=collate_fn)
308
+
309
+ # 4) model, optimizer, scaler
310
+ model = BindPredictor(binder_input_dim=binder_input_dim,
311
+ glm_input_dim=glm_input_dim,
312
+ compressed_dim=256,
313
+ hidden_dim=256,
314
+ heads=8, num_layers=4,
315
+ use_local_cnn_on_glm=True).to(device)
316
+ optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)
317
+ scaler = amp.GradScaler('cuda')
318
+
319
+ history, best_val = {"train": [], "val": []}, float("inf")
320
+ od = Path(args.out_dir); od.mkdir(exist_ok=True, parents=True)
321
+
322
+ for ep in range(1, args.epochs+1):
323
 + print(f"┌─[Epoch {ep}]────────────────────────", flush=True)
324
  model.train()
325
+ optimizer.zero_grad()
326
+ acc_loss_sum, acc_acc_sum, n_train_batches = 0.0, 0.0, 0
327
+
328
+ for i, (b, g, t) in enumerate(tr_dl):
329
+ b, g, t = b.to(device), g.to(device), t.to(device)
330
+ with amp.autocast('cuda'):
331
+ logits = model(b, g)
332
+ bce_non, kl, mse_peaks, probs = combined_loss_components(
333
+ logits, t, peak_thresh=args.peak_thresh
334
+ )
335
+ loss = args.alpha*bce_non + args.beta*kl + args.gamma*mse_peaks
336
+ loss = loss / args.accum_steps
337
+
338
+ scaler.scale(loss).backward()
339
+
340
+ if (i + 1) % args.accum_steps == 0:
341
+ scaler.step(optimizer)
342
+ scaler.update()
343
+ optimizer.zero_grad()
344
+
345
+ with torch.no_grad():
346
+ acc_loss_sum += (loss.item() * args.accum_steps)
347
+ acc_acc_sum += accuracy_percentage(logits, t, peak_thresh=args.peak_thresh)
348
+ n_train_batches += 1
349
+
350
+ del b, g, t, logits, probs, loss, bce_non, kl, mse_peaks
351
+ torch.cuda.empty_cache()
352
+
353
+ # finalize if leftovers
354
+ if n_train_batches % args.accum_steps != 0:
355
+ scaler.step(optimizer); scaler.update(); optimizer.zero_grad()
356
+
357
+ train_loss = acc_loss_sum / max(1, n_train_batches)
358
+ train_acc = acc_acc_sum / max(1, n_train_batches)
359
+
360
+ val_loss, val_acc = evaluate(model, va_dl, device,
361
+ alpha=args.alpha, beta=args.beta, gamma=args.gamma,
362
+ peak_thresh=args.peak_thresh)
363
+ print(f"[Epoch {ep}] train_loss={train_loss:.4f} train_acc={train_acc:.2f}% "
364
+ f"val_loss={val_loss:.4f} val_acc={val_acc:.2f}%", flush=True)
365
+
366
+ history["train"].append(train_loss)
367
+ history["val"].append(val_loss)
368
+ if val_loss < best_val:
369
+ best_val = val_loss
370
+ torch.save(model.state_dict(), od/"best_model.pt")
371
+ print(f" Saved new best_model.pt (val_loss={val_loss:.4f}, val_acc={val_acc:.2f}%)", flush=True)
372
+
373
+ torch.save(model.state_dict(), od/"last_model.pt")
374
+
375
+ fig, ax = plt.subplots()
376
+ ax.plot(history["train"], label="train")
377
+ ax.plot(history["val"], label="val")
378
+ ax.set_xlabel("epoch"); ax.set_ylabel("combined loss"); ax.legend()
379
+ fig.savefig(od/"loss_curve.png")
380
 + print(f"✅ Done → outputs in {od}", flush=True)
381
+
382
+ if __name__=="__main__":
383
  main()