svincoff committed
Commit 121a325 · 1 Parent(s): 4c4b1fc

baseline compare

.gitignore CHANGED
@@ -35,4 +35,7 @@ dpacman/combine.log
 dpacman/loss_sim.py
 dpacman/loss_temp.py
 dpacman/peak_examples/
-dpacman/__pycache__/
+dpacman/__pycache__/
+log.log
+log2.log
+dpacman/delay.log
configs/model/baseline.yaml ADDED
@@ -0,0 +1,10 @@
+_target_: dpacman.classifier.baseline.BaselineBindPredictor
+
+lr: 1e-4
+alpha: 20
+gamma: 20
+weight_decay: 0.01
+
+glm_input_dim: 256
+compressed_dim: 256
+hidden_dim: 128
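For reference, a minimal usage sketch (not part of this commit) of how a Hydra-style config like the one added above is typically turned into the model object. The `_target_` path and hyperparameter names come from the YAML; the surrounding script, and the assumption that hydra-core/omegaconf are installed, are mine.

# Hypothetical usage sketch for configs/model/baseline.yaml
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/model/baseline.yaml")  # holds _target_ plus the hyperparameters above
model = instantiate(cfg)  # -> dpacman.classifier.baseline.BaselineBindPredictor(lr=1e-4, alpha=20, ...)
print(type(model).__name__, model.hparams.lr, model.hparams.hidden_dim)

Because BaselineBindPredictor calls save_hyperparameters(), the YAML values are then visible under model.hparams.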
configs/model/pooling/truncatedsvd.yaml DELETED
@@ -1,7 +0,0 @@
-n_components: 2
-algorithm: randomized
-n_iter: 5
-n_oversamples: 10
-poewr_iteration_normalizer: auto
-random_state: 42
-tol: 0
 
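For context, the keys of the deleted pooling config line up with the constructor arguments of scikit-learn's TruncatedSVD (including the misspelled power_iteration_normalizer). A rough reconstruction of how such a config would have been consumed, assuming a recent scikit-learn, is:

# Hypothetical sketch of the pooling the removed YAML parameterized
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(
    n_components=2,
    algorithm="randomized",
    n_iter=5,
    n_oversamples=10,
    power_iteration_normalizer="auto",  # spelled "poewr_..." in the deleted file
    random_state=42,
    tol=0.0,
)
# svd.fit_transform(X) reduces an (n_samples, n_features) embedding matrix to 2 components.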
dpacman/classifier/{model_w_rca.py → baseline.py} RENAMED
@@ -1,177 +1,17 @@
1
  """
2
- Lightning Module for the binding model.
3
  """
4
 
5
- import torch
6
- from torch import nn
7
  from lightning import LightningModule
8
- from dpacman.utils.models import set_seed
9
- from .loss import calculate_loss
10
-
11
- set_seed()
12
-
13
-
14
- class LocalCNN(nn.Module):
15
- def __init__(self, dim: int = 256, kernel_size: int = 3):
16
- super().__init__()
17
- padding = kernel_size // 2
18
- self.conv = nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=padding)
19
- self.act = nn.GELU()
20
- self.ln = nn.LayerNorm(dim)
21
-
22
- def forward(self, x: torch.Tensor):
23
- # x: (batch, L, dim)
24
- out = self.conv(x.transpose(1, 2)) # → (batch, dim, L)
25
- out = self.act(out)
26
- out = out.transpose(1, 2) # → (batch, L, dim)
27
- return self.ln(out + x) # residual
28
-
29
-
30
- # class CrossModalBlock(nn.Module):
31
- # def __init__(self, dim: int = 256, heads: int = 8):
32
- # super().__init__()
33
- # # self-attention for both sides
34
- # self.sa_binder = nn.MultiheadAttention(dim, heads, batch_first=True)
35
- # self.sa_glm = nn.MultiheadAttention(dim, heads, batch_first=True)
36
- # self.ln_b1 = nn.LayerNorm(dim)
37
- # self.ln_g1 = nn.LayerNorm(dim)
38
-
39
- # self.ffn_b = nn.Sequential(
40
- # nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
41
- # )
42
- # self.ffn_g = nn.Sequential(
43
- # nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
44
- # )
45
- # self.ln_b2 = nn.LayerNorm(dim)
46
- # self.ln_g2 = nn.LayerNorm(dim)
47
-
48
- # # cross attention (binder queries, glm keys/values)
49
- # # so the NDA path is updated by the transcriptoin factors
50
- # self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
51
- # self.ln_c1 = nn.LayerNorm(dim)
52
- # self.ffn_c = nn.Sequential(
53
- # nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
54
- # )
55
- # self.ln_c2 = nn.LayerNorm(dim)
56
-
57
- # def forward(self, binder: torch.Tensor, glm: torch.Tensor):
58
- # """
59
- # binder: (batch, Lb, dim)
60
- # glm: (batch, Lg, dim) -- has passed through its local CNN beforehand
61
- # returns: updated binder representation (batch, Lb, dim)
62
- # """
63
- # # binder: self-attn + ffn
64
- # b = binder
65
- # b_sa, _ = self.sa_binder(b, b, b)
66
- # b = self.ln_b1(b + b_sa)
67
- # b_ff = self.ffn_b(b)
68
- # b = self.ln_b2(b + b_ff)
69
-
70
- # # glm: self-attn + ffn
71
- # g = glm
72
- # g_sa, _ = self.sa_glm(g, g, g)
73
- # g = self.ln_g1(g + g_sa)
74
- # g_ff = self.ffn_g(g)
75
- # g = self.ln_g2(g + g_ff)
76
-
77
- # # cross-attention: glm queries binder and glm embeddings are updated
78
- # g_to_b_ca, _ = self.cross_attn(g, b, b)
79
- # g = self.ln_c1(g + g_to_b_ca)
80
- # g_ff = self.ffn_c(g)
81
- # g = self.ln_c2(g + g_ff)
82
- # return g # (batch, Lb, dim)
83
-
84
- class CrossModalBlock(nn.Module):
85
- def __init__(self, dim: int = 256, heads: int = 8, dropout: float = 0.0):
86
- super().__init__()
87
- # 1) self-attn on each stream
88
- self.sa_binder = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
89
- self.sa_glm = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
90
- self.ln_b1 = nn.LayerNorm(dim)
91
- self.ln_g1 = nn.LayerNorm(dim)
92
- self.ffn_b = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
93
- self.ffn_g = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
94
- self.ln_b2 = nn.LayerNorm(dim)
95
- self.ln_g2 = nn.LayerNorm(dim)
96
-
97
- # 2) reciprocal cross-attn: g<-b and b<-g
98
- # DNA/GLM updated by attending to Binder
99
- self.cross_g2b = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
100
- self.ln_g_ca1 = nn.LayerNorm(dim)
101
- self.ffn_g_ca = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
102
- self.ln_g_ca2 = nn.LayerNorm(dim)
103
-
104
- # Binder updated by attending to DNA/GLM
105
- self.cross_b2g = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
106
- self.ln_b_ca1 = nn.LayerNorm(dim)
107
- self.ffn_b_ca = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
108
- self.ln_b_ca2 = nn.LayerNorm(dim)
109
-
110
- def forward(
111
- self,
112
- binder: torch.Tensor, # (B, Lb, D)
113
- glm: torch.Tensor, # (B, Lg, D)
114
- binder_mask: torch.Tensor | None = None, # (B, Lb) True = keep
115
- glm_mask: torch.Tensor | None = None, # (B, Lg) True = keep
116
- ):
117
- # 1) self-attn+FFN on each stream
118
- b, g = binder, glm
119
-
120
- b_sa, _ = self.sa_binder(b, b, b, key_padding_mask=None)
121
- b = self.ln_b1(b + b_sa)
122
- b = self.ln_b2(b + self.ffn_b(b))
123
-
124
- g_sa, _ = self.sa_glm(g, g, g, key_padding_mask=None)
125
- g = self.ln_g1(g + g_sa)
126
- g = self.ln_g2(g + self.ffn_g(g))
127
-
128
- # 2a) DNA/GLM updated by attending to Binder (Q=g, K=b, V=b)
129
- g_ca, _ = self.cross_g2b(
130
- g, b, b,
131
- # torch MultiheadAttention expects key_padding_mask=True for PADs;
132
- # invert if your mask is True=keep:
133
- # key_padding_mask=(~binder_mask.bool()) if binder_mask is not None else None
134
- )
135
- g = self.ln_g_ca1(g + g_ca)
136
- g = self.ln_g_ca2(g + self.ffn_g_ca(g))
137
-
138
- # 2b) Binder updated by attending to DNA/GLM (Q=b, K=g, V=g)
139
- b_ca, _ = self.cross_b2g(
140
- b, g, g,
141
- # key_padding_mask=(~glm_mask.bool()) if glm_mask is not None else None
142
- )
143
- b = self.ln_b_ca1(b + b_ca)
144
- b = self.ln_b_ca2(b + self.ffn_b_ca(b))
145
-
146
- return b, g
147
-
148
-
149
 
150
- class DimCompressor(nn.Module):
151
  """
152
- Learnable per-token compressor: maps any in_dim >= out_dim to out_dim (default 256).
153
- If in_dim == out_dim, behaves as identity.
154
  """
155
-
156
- def __init__(self, in_dim: int, out_dim: int = 256):
157
- super().__init__()
158
- if in_dim == out_dim:
159
- self.net = nn.Identity()
160
- else:
161
- hidden = max(out_dim * 2, (in_dim + out_dim) // 2)
162
- self.net = nn.Sequential(
163
- nn.LayerNorm(in_dim),
164
- nn.Linear(in_dim, hidden),
165
- nn.GELU(),
166
- nn.Linear(hidden, out_dim),
167
- )
168
-
169
- def forward(self, x: torch.Tensor) -> torch.Tensor:
170
- # x: (B, L, in_dim)
171
- return self.net(x)
172
-
173
-
174
- class BindPredictor(LightningModule):
175
  def __init__(
176
  self,
177
  # input_dim: int = 256, # OLD: single input dim
@@ -179,77 +19,64 @@ class BindPredictor(LightningModule):
179
  glm_input_dim: int = 256, # NEW: DNA/GLM original dim (e.g., 256)
180
  compressed_dim: int = 256, # NEW: learnable compressed dim
181
  hidden_dim: int = 256,
182
- heads: int = 8,
183
- num_layers: int = 4,
184
  lr: float = 1e-4,
185
  alpha: float = 20,
186
  gamma: float = 20,
187
- use_local_cnn_on_glm: bool = True,
188
  weight_decay: float = 0.01,
189
  ):
190
  # Init
191
- super(BindPredictor, self).__init__()
192
  self.save_hyperparameters()
193
 
194
  # Learnable compressor for binder -> 256, then project to hidden
195
  self.binder_compress = DimCompressor(binder_input_dim, out_dim=compressed_dim)
196
- self.proj_binder = nn.Linear(compressed_dim, hidden_dim)
197
-
198
- # GLM side stays 256 -> hidden
199
- self.proj_glm = nn.Linear(glm_input_dim, hidden_dim)
200
-
201
- self.use_local_cnn = use_local_cnn_on_glm
202
- self.local_cnn = LocalCNN(hidden_dim) if use_local_cnn_on_glm else nn.Identity()
203
-
204
- self.layers = nn.ModuleList(
205
- [CrossModalBlock(hidden_dim, heads) for _ in range(num_layers)]
206
  )
207
-
208
- self.ln_out = nn.LayerNorm(hidden_dim)
209
- # self.head = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid()) # OLD: returned probabilities
210
- self.head = nn.Linear(hidden_dim, 1) # NEW: return logits (safe for AMP)
211
-
212
- def forward(self, binder_emb, glm_emb):
213
  """
214
  binder_emb: (B, Lb, binder_input_dim)
215
  glm_emb: (B, Lg, glm_input_dim)
216
  Returns per-nucleotide logits for the GLM sequence: (B, Lg)
217
  """
218
- # Binder: learnable compression → 256 → hidden
219
- b = self.binder_compress(binder_emb) # (B, Lb, 256)
220
- b = self.proj_binder(b) # (B, Lb, hidden_dim)
221
-
222
- # GLM: project → hidden, add local CNN context
223
- g = self.proj_glm(glm_emb) # (B, Lg, hidden_dim)
224
- if self.use_local_cnn:
225
- g = self.local_cnn(g)
226
-
227
- # Cross-modal blocks: update binder states using GLM
228
- for layer in self.layers:
229
- b, g = layer(b, g) # (B, Lb, hidden_dim)
230
-
231
- # Predict per-nucleotide logits on the GLM tokens:
232
- # return self.head(g).squeeze(-1) # OLD: probabilities (with Sigmoid in head)
233
- return self.head(g).squeeze(
234
  -1
235
- ) # NEW: logits (apply sigmoid only in loss/metrics)
236
-
 
237
  # ----- Lightning hooks -----
238
  def training_step(self, batch, batch_idx):
239
  """
240
  Training step taken by PyTorch-Lightning trainer. Uses batch returned by data collator.
241
Collator returns a dictionary with:
242
  "binder_emb" # [B, Lb_max, Db]
243
- "binder_mask" # [B, Lb_max]
244
  "glm_emb" # [B, Lg_max, Dg]
245
- "glm_mask" # [B, Lg_max]
246
  "labels" # [B, Lg_max]
247
  "ID"
248
  "tr_sequence"
249
  "dna_sequence"
250
  }
251
  """
252
- logits = self.forward(batch["binder_emb"], batch["glm_emb"])
253
  loss = calculate_loss(
254
  logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
255
  )
@@ -261,10 +88,30 @@ class BindPredictor(LightningModule):
261
  prog_bar=True,
262
  batch_size=logits.size(0),
263
  )
264
  return loss
265
 
266
  def validation_step(self, batch, batch_idx):
267
- logits = self.forward(batch["binder_emb"], batch["glm_emb"])
268
  loss = calculate_loss(
269
  logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
270
  )
@@ -276,17 +123,65 @@ class BindPredictor(LightningModule):
276
  prog_bar=True,
277
  batch_size=logits.size(0),
278
  )
279
  return loss
280
 
281
  def test_step(self, batch, batch_idx):
282
- logits = self.forward(batch["binder_emb"], batch["glm_emb"])
283
  loss = calculate_loss(
284
  logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
285
  )
286
  self.log(
287
  "test/loss", loss, on_step=False, on_epoch=True, batch_size=logits.size(0)
288
  )
289
  return loss
290
 
291
  def on_train_epoch_end(self):
292
  if False:
@@ -320,4 +215,4 @@ class BindPredictor(LightningModule):
320
  return {
321
  "optimizer": opt,
322
  "lr_scheduler": {"scheduler": sch, "interval": "epoch"},
323
- }
 
1
  """
2
+ Code for baseline model to compare the classifier to
3
  """
4
 
 
 
5
  from lightning import LightningModule
6
+ import torch
7
+ import torch.nn as nn
8
+ from .loss import calculate_loss, auprc_zeros_vs_ones_from_logits, auroc_zeros_vs_ones_from_logits
9
+ from .model import DimCompressor
10
 
11
+ class BaselineBindPredictor(LightningModule):
12
  """
13
+ Baseline predictor: simple MLP that just concatenates the embeddings and outputs per-token predictions.
 
14
  """
15
  def __init__(
16
  self,
17
  # input_dim: int = 256, # OLD: single input dim
 
19
  glm_input_dim: int = 256, # NEW: DNA/GLM original dim (e.g., 256)
20
  compressed_dim: int = 256, # NEW: learnable compressed dim
21
  hidden_dim: int = 256,
 
 
22
  lr: float = 1e-4,
23
  alpha: float = 20,
24
  gamma: float = 20,
25
+ dropout: float = 0,
26
  weight_decay: float = 0.01,
27
  ):
28
  # Init
29
+ super(BaselineBindPredictor, self).__init__()
30
  self.save_hyperparameters()
31
 
32
  # Learnable compressor for binder -> 256, then project to hidden
33
  self.binder_compress = DimCompressor(binder_input_dim, out_dim=compressed_dim)
34
+
35
+ self.mlp = torch.nn.Sequential(
36
+ torch.nn.Linear(compressed_dim, hidden_dim),
37
+ torch.nn.ReLU(),
38
+ torch.nn.Linear(hidden_dim, 1),
39
+ torch.nn.ReLU(),
40
  )
41
+
42
+ def forward(self, binder_emb, glm_emb, binder_mask, glm_mask):
43
  """
44
  binder_emb: (B, Lb, binder_input_dim)
45
  glm_emb: (B, Lg, glm_input_dim)
46
  Returns per-nucleotide logits for the GLM sequence: (B, Lg)
47
  """
48
+ # Binder: learnable compression → glm_input_dim
49
+ b = self.binder_compress(binder_emb) # (B, Lb, glm_input_dim)
50
+
51
+ # Concatenate target and binder. Concatenate on the length dimension
52
+ lg = glm_emb.shape[1]
53
+ concat_embeddings = torch.concat((glm_emb,b), dim=1) # (B, Lb + Lg, glm_input_dim)
54
+
55
+ # Run concatenated embeddings through MLP
56
+ logits = self.mlp(concat_embeddings) # (B, Lb + Lg, 1)
57
+
58
+ # Get only the DNA logits.
59
+ logits = logits[:,0:lg,:].squeeze(
60
  -1
61
+ )
62
+ return logits
63
+
64
  # ----- Lightning hooks -----
65
  def training_step(self, batch, batch_idx):
66
  """
67
  Training step taken by PyTorch-Lightning trainer. Uses batch returned by data collator.
68
  Colator returns a dictionary with:
69
  "binder_emb" # [B, Lb_max, Db]
70
+ "binder_kpm" # [B, Lb_max]
71
  "glm_emb" # [B, Lg_max, Dg]
72
+ "glm_kpm" # [B, Lg_max]
73
  "labels" # [B, Lg_max]
74
  "ID"
75
  "tr_sequence"
76
  "dna_sequence"
77
  }
78
  """
79
+ logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
80
  loss = calculate_loss(
81
  logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
82
  )
 
88
  prog_bar=True,
89
  batch_size=logits.size(0),
90
  )
91
+
92
+ # ---- AUPRC and AUROC on labels in {0, >0.99} only ----
93
+ ap, n_pos, n_neg, precision, recall, thresholds = auprc_zeros_vs_ones_from_logits(
94
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
95
+ )
96
+ auc, n_pos, n_neg, tpr, fpr, thresholds, tp, fp = auroc_zeros_vs_ones_from_logits(
97
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
98
+ )
99
+ # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
100
+ self.log("train/auprc_0v1",
101
+ ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
102
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
103
+ self.log("train/auroc_0v1",
104
+ auc if torch.isfinite(auc) else torch.tensor(0.0, device=auc.device),
105
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
106
+
107
+ # (optional) also log class counts so you can sanity-check balance
108
+ self.log("train/n_pos_0v1", float(n_pos), on_step=False, on_epoch=True, sync_dist=True)
109
+ self.log("train/n_neg_0v1", float(n_neg), on_step=False, on_epoch=True, sync_dist=True)
110
+
111
  return loss
112
 
113
  def validation_step(self, batch, batch_idx):
114
+ logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
115
  loss = calculate_loss(
116
  logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
117
  )
 
123
  prog_bar=True,
124
  batch_size=logits.size(0),
125
  )
126
+
127
+ # ---- AUPRC and AUROC on labels in {0, >0.99} only ----
128
+ ap, n_pos, n_neg, precision, recall, thresholds = auprc_zeros_vs_ones_from_logits(
129
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
130
+ )
131
+ auc, n_pos, n_neg, tpr, fpr, thresholds, tp, fp = auroc_zeros_vs_ones_from_logits(
132
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
133
+ )
134
+ # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
135
+ self.log("val/auprc_0v1",
136
+ ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
137
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
138
+ self.log("val/auroc_0v1",
139
+ auc if torch.isfinite(auc) else torch.tensor(0.0, device=auc.device),
140
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
141
  return loss
142
 
143
  def test_step(self, batch, batch_idx):
144
+ logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
145
  loss = calculate_loss(
146
  logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
147
  )
148
  self.log(
149
  "test/loss", loss, on_step=False, on_epoch=True, batch_size=logits.size(0)
150
  )
151
+
152
+ # ---- AUPRC and AUROC on labels in {0, >0.99} only ----
153
+ ap, n_pos, n_neg, precision, recall, thresholds = auprc_zeros_vs_ones_from_logits(
154
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
155
+ )
156
+ auc, n_pos, n_neg, tpr, fpr, thresholds, tp, fp = auroc_zeros_vs_ones_from_logits(
157
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
158
+ )
159
+ # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
160
+ self.log("test/auprc_0v1",
161
+ ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
162
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
163
+ self.log("test/auroc_0v1",
164
+ auc if torch.isfinite(auc) else torch.tensor(0.0, device=auc.device),
165
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
166
  return loss
167
+
168
+ def on_before_optimizer_step(self, optimizer):
169
+ # Compute global L2 norm of all parameter gradients (ignores None grads)
170
+ grads = []
171
+ for p in self.parameters():
172
+ if p.grad is not None:
173
+ # .detach() avoids autograd tracking; .float() avoids fp16 overflow in norms
174
+ grads.append(p.grad.detach().float().norm(2))
175
+ if grads:
176
+ total_norm = torch.norm(torch.stack(grads), p=2)
177
+ self.log("train/grad_norm", total_norm, on_step=True, prog_bar=False, logger=True)
178
+
179
+ def on_after_backward(self):
180
+ grads = [p.grad.detach().float().norm(2)
181
+ for p in self.parameters() if p.grad is not None]
182
+ if grads:
183
+ total_norm = torch.norm(torch.stack(grads), p=2)
184
+ self.log("train/grad_norm_back", total_norm, on_step=True, prog_bar=False)
185
 
186
  def on_train_epoch_end(self):
187
  if False:
 
215
  return {
216
  "optimizer": opt,
217
  "lr_scheduler": {"scheduler": sch, "interval": "epoch"},
218
+ }
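For intuition, a self-contained sketch (toy sizes and tensors are my assumptions, not the committed module) of the baseline's concatenate-then-MLP flow: binder and GLM tokens are stacked along the length axis, scored per token, and only the GLM positions are returned.

# Hypothetical shape walk-through of BaselineBindPredictor.forward
import torch
import torch.nn as nn

B, Lb, Lg, D, H = 2, 7, 11, 256, 128          # assumed toy sizes
glm_emb = torch.randn(B, Lg, D)               # DNA/GLM tokens
binder_emb = torch.randn(B, Lb, D)            # binder tokens, already compressed to D here

mlp = nn.Sequential(nn.Linear(D, H), nn.ReLU(), nn.Linear(H, 1), nn.ReLU())

concat = torch.concat((glm_emb, binder_emb), dim=1)  # (B, Lg + Lb, D)
scores = mlp(concat)                                 # (B, Lg + Lb, 1), one score per token
logits = scores[:, :Lg, :].squeeze(-1)               # keep only the GLM positions -> (B, Lg)
assert logits.shape == (B, Lg)
# Note: the trailing ReLU in the committed mlp clamps these "logits" to be >= 0.

The binder_mask/glm_mask arguments are accepted by the committed forward but not referenced inside it; padding is handled downstream in the loss and metric helpers.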
dpacman/classifier/loss.py CHANGED
@@ -4,6 +4,9 @@ Define loss functions needed for training the model — padding safe (-1 sentine
4
 
5
  import torch
6
  import torch.nn.functional as F
 
 
 
7
 
8
  def _expand_like(mask: torch.Tensor, like: torch.Tensor):
9
  # Make mask broadcastable to logits/targets (handles (B,L) vs (B,L,1))
@@ -62,59 +65,89 @@ def calculate_loss(
62
 
63
  return alpha * bce_nonpeak + gamma * mse_peak
64
 
65
- import torch
66
-
67
  @torch.no_grad()
68
- def auprc_zeros_vs_ones_from_logits(
69
  logits: torch.Tensor, # (B, L)
70
  labels: torch.Tensor, # (B, L)
71
- glm_kpm: torch.Tensor | None, # (B, L) True=PAD; pass None if not available
72
  pos_thresh: float = 0.99,
73
- ) -> tuple[torch.Tensor, int, int]:
74
  """
75
- Returns (ap, n_pos, n_neg). AP is Average Precision (area under PR).
76
- Uses only positions with labels == 0.0 or > pos_thresh. Ignores PADs via glm_kpm.
77
- Computation stays on the same device as logits.
 
 
 
78
  """
79
- probs = torch.sigmoid(logits)
80
 
81
- # Valid positions: not padded
82
- if glm_kpm is not None:
83
- valid = ~glm_kpm
84
- else:
85
- valid = torch.ones_like(labels, dtype=torch.bool, device=labels.device)
86
 
87
- # Keep only exact zeros and near-ones
88
- pos = labels > pos_thresh
89
- neg = labels == 0.0
90
- keep = valid & (pos | neg)
91
92
  if keep.sum() == 0:
93
- return torch.tensor(float('nan'), device=logits.device), 0, 0
 
94
 
95
- y = pos[keep].to(probs.dtype) # 1 for >0.99, 0 for 0.0
96
- s = probs[keep].to(probs.dtype)
97
 
98
- n = y.numel()
99
  n_pos = int(y.sum().item())
100
- n_neg = n - n_pos
101
- if n_pos == 0: # no positives → AP = 0 by convention
102
- return torch.tensor(0.0, device=logits.device), 0, n_neg
103
-
104
- # Sort by score descending
105
- order = torch.argsort(s, descending=True)
106
- y_sorted = y[order]
107
-
108
- # CumTP and precision/recall
109
- tp = torch.cumsum(y_sorted, dim=0)
110
- ranks = torch.arange(1, n + 1, device=logits.device, dtype=probs.dtype)
111
- precision = tp / ranks
112
- recall = tp / n_pos
113
-
114
- # AP = sum( precision * Δrecall )
115
- recall_prev = torch.cat([torch.zeros(1, device=logits.device, dtype=probs.dtype), recall[:-1]])
116
- ap = (precision * (recall - recall_prev)).sum()
117
- return ap, n_pos, n_neg
118
 
119
  def accuracy_percentage(
120
  logits,
 
4
 
5
  import torch
6
  import torch.nn.functional as F
7
+ from torchmetrics.functional.classification import (
8
+ auroc, average_precision, roc, precision_recall_curve
9
+ )
10
 
11
  def _expand_like(mask: torch.Tensor, like: torch.Tensor):
12
  # Make mask broadcastable to logits/targets (handles (B,L) vs (B,L,1))
 
65
 
66
  return alpha * bce_nonpeak + gamma * mse_peak
67
 
 
 
68
  @torch.no_grad()
69
+ def auroc_zeros_vs_ones_from_logits(
70
  logits: torch.Tensor, # (B, L)
71
  labels: torch.Tensor, # (B, L)
72
+ glm_kpm: torch.Tensor | None = None, # (B, L) True=PAD
73
  pos_thresh: float = 0.99,
74
+ ):
75
  """
76
+ Returns:
77
+ auc: scalar tensor (AUROC)
78
+ n_pos, n_neg: ints
79
+ tpr, fpr: tensors of shape (T,)
80
+ thresholds: tensor of shape (T,)
81
+ tp, fp: integer counts per threshold (shape (T,))
82
  """
83
+ device = logits.device
84
+ valid = ~glm_kpm if glm_kpm is not None else torch.ones_like(labels, dtype=torch.bool, device=device)
85
+ keep = valid & ((labels > pos_thresh) | (labels == 0.0))
86
+ if keep.sum() == 0:
87
+ return (torch.tensor(float('nan'), device=device), 0, 0,
88
+ torch.empty(0, device=device), torch.empty(0, device=device),
89
+ torch.empty(0, device=device), torch.empty(0, device=device), torch.empty(0, device=device))
90
+
91
+ y = (labels[keep] > pos_thresh).to(torch.int)
92
+ s = logits[keep]
93
+
94
+ n_pos = int(y.sum().item())
95
+ n_neg = y.numel() - n_pos
96
+ if n_pos == 0 or n_neg == 0:
97
+ return (torch.tensor(float('nan'), device=device), n_pos, n_neg,
98
+ torch.empty(0, device=device), torch.empty(0, device=device),
99
+ torch.empty(0, device=device), torch.empty(0, device=device), torch.empty(0, device=device))
100
+
101
+ # Full ROC curve
102
+ fpr, tpr, thresholds = roc(s, y, task="binary")
103
+ # AUROC (TM handles logits)
104
+ auc = auroc(s, y, task="binary")
105
 
106
+ # Convert rates to counts (round to nearest to avoid float off-by-one)
107
+ tp = (tpr * n_pos).round().to(torch.long)
108
+ fp = (fpr * n_neg).round().to(torch.long)
 
 
109
 
110
+ return auc.to(device), n_pos, n_neg, tpr.to(device), fpr.to(device), thresholds.to(device), tp.to(device), fp.to(device)
 
 
 
111
 
112
+
113
+ @torch.no_grad()
114
+ def auprc_zeros_vs_ones_from_logits(
115
+ logits: torch.Tensor, # (B, L)
116
+ labels: torch.Tensor, # (B, L)
117
+ glm_kpm: torch.Tensor | None = None, # (B, L) True=PAD
118
+ pos_thresh: float = 0.99,
119
+ ):
120
+ """
121
+ Returns:
122
+ ap: scalar tensor (Average Precision / AUPRC)
123
+ n_pos, n_neg: ints
124
+ precision: (T,)
125
+ recall: (T,)
126
+ thresholds: (T,)
127
+ """
128
+ device = logits.device
129
+ valid = ~glm_kpm if glm_kpm is not None else torch.ones_like(labels, dtype=torch.bool, device=device)
130
+ keep = valid & ((labels > pos_thresh) | (labels == 0.0))
131
  if keep.sum() == 0:
132
+ return (torch.tensor(float('nan'), device=device), 0, 0,
133
+ torch.empty(0, device=device), torch.empty(0, device=device), torch.empty(0, device=device))
134
 
135
+ y = (labels[keep] > pos_thresh).to(torch.int)
136
+ s = logits[keep]
137
 
 
138
  n_pos = int(y.sum().item())
139
+ n_neg = y.numel() - n_pos
140
+ if n_pos == 0:
141
+ # By convention, AP=0 when there are no positives
142
+ return (torch.tensor(0.0, device=device), 0, n_neg,
143
+ torch.empty(0, device=device), torch.empty(0, device=device), torch.empty(0, device=device))
144
+
145
+ # Full PR curve
146
+ precision, recall, thresholds = precision_recall_curve(s, y, task="binary")
147
+ # Average Precision / AUPRC
148
+ ap = average_precision(s, y, task="binary")
149
+
150
+ return ap.to(device), n_pos, n_neg, precision.to(device), recall.to(device), thresholds.to(device)
151
 
152
  def accuracy_percentage(
153
  logits,
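As a sanity check on the new torchmetrics-based helpers, a small self-contained sketch (toy tensors and values are mine) of the same zeros-vs-ones filtering followed by the binary curve/score calls used above:

# Hypothetical toy run of the 0-vs-1 metric path
import torch
from torchmetrics.functional.classification import (
    auroc, average_precision, roc, precision_recall_curve
)

logits = torch.tensor([[2.0, -1.0, 0.5, -3.0]])
labels = torch.tensor([[1.0, 0.0, 0.995, 0.4]])        # 0.4 is neither 0 nor >0.99, so it is dropped
glm_kpm = torch.tensor([[False, False, False, False]])  # True would mark padded positions

keep = (~glm_kpm) & ((labels > 0.99) | (labels == 0.0))
y = (labels[keep] > 0.99).to(torch.int)   # binary targets
s = logits[keep]                          # raw logits; torchmetrics applies sigmoid for task="binary"

ap = average_precision(s, y, task="binary")
auc = auroc(s, y, task="binary")
precision, recall, pr_thresholds = precision_recall_curve(s, y, task="binary")
fpr, tpr, roc_thresholds = roc(s, y, task="binary")
print(float(ap), float(auc))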
dpacman/classifier/model.py CHANGED
@@ -6,7 +6,7 @@ import torch
6
  from torch import nn
7
  from lightning import LightningModule
8
  from dpacman.utils.models import set_seed
9
- from .loss import calculate_loss, auprc_zeros_vs_ones_from_logits
10
 
11
  set_seed()
12
 
@@ -211,9 +211,9 @@ class BindPredictor(LightningModule):
211
  Training step taken by PyTorch-Lightning trainer. Uses batch returned by data collator.
212
  Colator returns a dictionary with:
213
  "binder_emb" # [B, Lb_max, Db]
214
- "binder_mask" # [B, Lb_max]
215
  "glm_emb" # [B, Lg_max, Dg]
216
- "glm_mask" # [B, Lg_max]
217
  "labels" # [B, Lg_max]
218
  "ID"
219
  "tr_sequence"
@@ -233,14 +233,20 @@ class BindPredictor(LightningModule):
233
  batch_size=logits.size(0),
234
  )
235
 
236
- # ---- AUPRC on labels in {0, >0.99} only ----
237
- ap, n_pos, n_neg = auprc_zeros_vs_ones_from_logits(
 
 
 
238
  logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
239
  )
240
  # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
241
  self.log("train/auprc_0v1",
242
  ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
243
  on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
 
 
 
244
  # (optional) also log class counts so you can sanity-check balance
245
  self.log("train/n_pos_0v1", float(n_pos), on_step=False, on_epoch=True, sync_dist=True)
246
  self.log("train/n_neg_0v1", float(n_neg), on_step=False, on_epoch=True, sync_dist=True)
@@ -260,6 +266,22 @@ class BindPredictor(LightningModule):
260
  prog_bar=True,
261
  batch_size=logits.size(0),
262
  )
263
  return loss
264
 
265
  def test_step(self, batch, batch_idx):
@@ -270,6 +292,21 @@ class BindPredictor(LightningModule):
270
  self.log(
271
  "test/loss", loss, on_step=False, on_epoch=True, batch_size=logits.size(0)
272
  )
273
  return loss
274
 
275
  def on_before_optimizer_step(self, optimizer):
 
6
  from torch import nn
7
  from lightning import LightningModule
8
  from dpacman.utils.models import set_seed
9
+ from .loss import calculate_loss, auprc_zeros_vs_ones_from_logits, auroc_zeros_vs_ones_from_logits
10
 
11
  set_seed()
12
 
 
211
  Training step taken by PyTorch-Lightning trainer. Uses batch returned by data collator.
212
Collator returns a dictionary with:
213
  "binder_emb" # [B, Lb_max, Db]
214
+ "binder_kpm" # [B, Lb_max]
215
  "glm_emb" # [B, Lg_max, Dg]
216
+ "glm_kpm" # [B, Lg_max]
217
  "labels" # [B, Lg_max]
218
  "ID"
219
  "tr_sequence"
 
233
  batch_size=logits.size(0),
234
  )
235
 
236
+ # ---- AUPRC and AUROC on labels in {0, >0.99} only ----
237
+ ap, n_pos, n_neg, precision, recall, thresholds = auprc_zeros_vs_ones_from_logits(
238
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
239
+ )
240
+ auc, n_pos, n_neg, tpr, fpr, thresholds, tp, fp = auroc_zeros_vs_ones_from_logits(
241
  logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
242
  )
243
  # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
244
  self.log("train/auprc_0v1",
245
  ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
246
  on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
247
+ self.log("train/auroc_0v1",
248
+ auc if torch.isfinite(auc) else torch.tensor(0.0, device=auc.device),
249
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
250
  # (optional) also log class counts so you can sanity-check balance
251
  self.log("train/n_pos_0v1", float(n_pos), on_step=False, on_epoch=True, sync_dist=True)
252
  self.log("train/n_neg_0v1", float(n_neg), on_step=False, on_epoch=True, sync_dist=True)
 
266
  prog_bar=True,
267
  batch_size=logits.size(0),
268
  )
269
+
270
+ # ---- AUPRC and AUROC on labels in {0, >0.99} only ----
271
+ ap, n_pos, n_neg, precision, recall, thresholds = auprc_zeros_vs_ones_from_logits(
272
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
273
+ )
274
+ auc, n_pos, n_neg, tpr, fpr, thresholds, tp, fp = auroc_zeros_vs_ones_from_logits(
275
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
276
+ )
277
+ # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
278
+ self.log("val/auprc_0v1",
279
+ ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
280
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
281
+ self.log("val/auroc_0v1",
282
+ auc if torch.isfinite(auc) else torch.tensor(0.0, device=auc.device),
283
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
284
+
285
  return loss
286
 
287
  def test_step(self, batch, batch_idx):
 
292
  self.log(
293
  "test/loss", loss, on_step=False, on_epoch=True, batch_size=logits.size(0)
294
  )
295
+
296
+ # ---- AUPRC and AUROC on labels in {0, >0.99} only ----
297
+ ap, n_pos, n_neg, precision, recall, thresholds = auprc_zeros_vs_ones_from_logits(
298
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
299
+ )
300
+ auc, n_pos, n_neg, tpr, fpr, thresholds, tp, fp = auroc_zeros_vs_ones_from_logits(
301
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
302
+ )
303
+ # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
304
+ self.log("test/auprc_0v1",
305
+ ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
306
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
307
+ self.log("test/auroc_0v1",
308
+ auc if torch.isfinite(auc) else torch.tensor(0.0, device=auc.device),
309
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
310
  return loss
311
 
312
  def on_before_optimizer_step(self, optimizer):
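The on_before_optimizer_step / on_after_backward hooks added in baseline.py (and kept in model.py's trailing context) log a global gradient norm. A standalone sketch of that computation (plain PyTorch; the model and loss below are placeholders, not project code):

# Hypothetical standalone version of the gradient-norm logging
import torch
import torch.nn as nn

model = nn.Linear(4, 1)
loss = model(torch.randn(8, 4)).pow(2).mean()
loss.backward()

grads = [p.grad.detach().float().norm(2) for p in model.parameters() if p.grad is not None]
total_norm = torch.norm(torch.stack(grads), p=2) if grads else torch.tensor(0.0)
print(float(total_norm))  # inside the LightningModule this value goes to self.log("train/grad_norm", ...)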
dpacman/classifier/model_tmp/__init__.py DELETED
File without changes
dpacman/classifier/model_tmp/clustering_data.py DELETED
@@ -1,475 +0,0 @@
1
- #!/usr/bin/env python3
2
- import argparse
3
- import numpy as np
4
- import pandas as pd
5
- from pathlib import Path
6
- import random
7
- import sys
8
- import subprocess
9
- from collections import defaultdict
10
-
11
- # ─────────────────────────────────────────────────────────────────────────
12
- # Original helpers (kept; some lightly edited/commented where needed)
13
- # ─────────────────────────────────────────────────────────────────────────
14
-
15
-
16
- def read_ids_file(p):
17
- p = Path(p)
18
- if not p.exists():
19
- raise FileNotFoundError(f"IDs file not found: {p}")
20
- return [line.strip() for line in p.open() if line.strip()]
21
-
22
-
23
- def split_embeddings(emb_path, ids_path, out_dir, prefix):
24
- out_dir = Path(out_dir)
25
- out_dir.mkdir(parents=True, exist_ok=True)
26
-
27
- if not Path(emb_path).exists():
28
- raise FileNotFoundError(f"Embedding file not found: {emb_path}")
29
- if not Path(ids_path).exists():
30
- raise FileNotFoundError(f"IDs file not found: {ids_path}")
31
-
32
- if emb_path.endswith(".npz"):
33
- data = np.load(emb_path, allow_pickle=True)
34
- if "embeddings" in data:
35
- emb = data["embeddings"]
36
- else:
37
- raise ValueError(f"{emb_path} missing 'embeddings' key")
38
- else:
39
- emb = np.load(emb_path)
40
-
41
- ids = read_ids_file(ids_path)
42
- if len(ids) != emb.shape[0]:
43
- print(
44
- f"[WARN] length mismatch: {len(ids)} ids vs {emb.shape[0]} embeddings in {emb_path}",
45
- file=sys.stderr,
46
- )
47
-
48
- mapping = {}
49
- for i, ident in enumerate(ids):
50
- if i >= emb.shape[0]:
51
- print(
52
- f"[WARN] skipping {ident}: no embedding at index {i}", file=sys.stderr
53
- )
54
- continue
55
- arr = emb[i]
56
- out_file = out_dir / f"{prefix}_{ident}.npy"
57
- np.save(out_file, arr)
58
- mapping[ident] = str(out_file)
59
- return mapping
60
-
61
-
62
- def extract_symbol_from_tf_id(full_id: str) -> str:
63
- """
64
- Given a TF embedding ID like 'sp|O15062|ZBTB5_HUMAN' or 'ZBTB5_HUMAN',
65
- return the gene symbol uppercase (e.g., 'ZBTB5').
66
- """
67
- if "|" in full_id:
68
- try:
69
- # format sp|Accession|SYMBOL_HUMAN
70
- genepart = full_id.split("|")[2]
71
- except IndexError:
72
- genepart = full_id
73
- else:
74
- genepart = full_id
75
- symbol = genepart.split("_")[0]
76
- return symbol.upper()
77
-
78
-
79
- def build_tf_symbol_map(tf_map):
80
- """
81
- Build mapping gene_symbol -> list of embedding paths.
82
- """
83
- symbol_map = {}
84
- for full_id, path in tf_map.items():
85
- symbol = extract_symbol_from_tf_id(full_id)
86
- symbol_map.setdefault(symbol, []).append(path)
87
- return symbol_map
88
-
89
-
90
- def tf_key_from_path(path: str) -> str:
91
- """
92
- Given a path like .../tf_sp|O15062|ZBTB5_HUMAN.npy, extract normalized symbol 'ZBTB5'.
93
- """
94
- stem = Path(path).stem # e.g., tf_sp|O15062|ZBTB5_HUMAN
95
- # remove leading prefix if present (tf_)
96
- if "_" in stem:
97
- _, rest = stem.split("_", 1)
98
- else:
99
- rest = stem
100
- return extract_symbol_from_tf_id(rest)
101
-
102
-
103
- def dna_key_from_path(path: str) -> str:
104
- """
105
- Given .../dna_peak42.npy -> 'peak42'
106
- """
107
- stem = Path(path).stem
108
- if "_" in stem:
109
- _, rest = stem.split("_", 1)
110
- else:
111
- rest = stem
112
- return rest
113
-
114
-
115
- # ─────────────────────────────────────────────────────────────────────────
116
- # New helpers for MMseqs clustering & cluster-level splitting
117
- # ─────────────────────────────────────────────────────────────────────────
118
-
119
-
120
- def write_dna_fasta(df: pd.DataFrame, out_fasta: Path) -> None:
121
- """
122
- Write unique DNA sequences to FASTA using dna_id as header.
123
- Requires df with columns: dna_id, dna_sequence
124
- """
125
- uniq = df[["dna_id", "dna_sequence"]].drop_duplicates()
126
- with open(out_fasta, "w") as f:
127
- for _, row in uniq.iterrows():
128
- did = row["dna_id"]
129
- seq = str(row["dna_sequence"]).upper().replace(" ", "").replace("\n", "")
130
- f.write(f">{did}\n{seq}\n")
131
-
132
-
133
- def run_mmseqs_easy_cluster(
134
- mmseqs_bin: str,
135
- fasta: Path,
136
- out_prefix: Path,
137
- tmp_dir: Path,
138
- min_seq_id: float,
139
- coverage: float,
140
- cov_mode: int,
141
- ) -> Path:
142
- """
143
- Runs mmseqs easy-cluster on nucleotide sequences.
144
- Returns the path to a clusters TSV file (creating it if the default one isn't present).
145
- """
146
- tmp_dir.mkdir(parents=True, exist_ok=True)
147
- out_prefix.parent.mkdir(parents=True, exist_ok=True)
148
-
149
- cmd = [
150
- mmseqs_bin,
151
- "easy-cluster",
152
- str(fasta),
153
- str(out_prefix),
154
- str(tmp_dir),
155
- "--min-seq-id",
156
- str(min_seq_id),
157
- "-c",
158
- str(coverage),
159
- "--cov-mode",
160
- str(cov_mode),
161
- # You can add performance flags here if needed, e.g.:
162
- # "--threads", "8"
163
- ]
164
- print("[i] Running:", " ".join(cmd), flush=True)
165
- subprocess.run(cmd, check=True)
166
-
167
- # MMseqs easy-cluster typically writes <out_prefix>_cluster.tsv
168
- default_tsv = Path(str(out_prefix) + "_cluster.tsv")
169
- if default_tsv.exists():
170
- print(f"[i] Found cluster TSV: {default_tsv}")
171
- return default_tsv
172
-
173
- # Fallback: try createtsv if default is missing
174
- # This requires the internal DBs. easy-cluster creates DBs alongside out_prefix.
175
- # We'll try to locate them and emit a TSV.
176
- in_db = Path(str(out_prefix) + "_query")
177
- cl_db = Path(str(out_prefix) + "_cluster")
178
- out_tsv = Path(str(out_prefix) + "_fallback_cluster.tsv")
179
- if in_db.exists() and cl_db.exists():
180
- cmd2 = [
181
- mmseqs_bin,
182
- "createtsv",
183
- str(in_db),
184
- str(in_db),
185
- str(cl_db),
186
- str(out_tsv),
187
- ]
188
- print("[i] Creating TSV via createtsv:", " ".join(cmd2), flush=True)
189
- subprocess.run(cmd2, check=True)
190
- if out_tsv.exists():
191
- return out_tsv
192
-
193
- raise FileNotFoundError(
194
- "Could not locate clusters TSV from mmseqs. "
195
- "Expected {default_tsv} or createtsv fallback."
196
- )
197
-
198
-
199
- def parse_mmseqs_clusters(tsv_path: Path) -> dict:
200
- """
201
- Parse MMseqs cluster TSV (rep \t member). Returns dna_id -> cluster_rep_id
202
- """
203
- mapping = {}
204
- with open(tsv_path) as f:
205
- for line in f:
206
- parts = line.rstrip("\n").split("\t")
207
- if len(parts) < 2:
208
- continue
209
- rep, member = parts[0], parts[1]
210
- mapping[member] = rep
211
- # Some TSVs include rep->rep; if not, ensure rep is mapped to itself:
212
- if rep not in mapping:
213
- mapping[rep] = rep
214
- return mapping
215
-
216
-
217
- def assign_clusters_to_splits(
218
- cluster_rep_to_members: dict, val_frac: float, test_frac: float, seed: int = 42
219
- ):
220
- """
221
- cluster_rep_to_members: dict[rep] = [members...]
222
- Returns: dict with keys 'train','val','test' mapping to sets of dna_id.
223
- Ensures all members of a cluster go to the same split.
224
- """
225
- rng = random.Random(seed)
226
- reps = list(cluster_rep_to_members.keys())
227
- rng.shuffle(reps)
228
-
229
- # Greedy-ish fill by total member counts to match desired fractions.
230
- total = sum(len(cluster_rep_to_members[r]) for r in reps)
231
- target_val = int(round(total * val_frac))
232
- target_test = int(round(total * test_frac))
233
- cur_val = cur_test = 0
234
-
235
- val_ids, test_ids, train_ids = set(), set(), set()
236
- for rep in reps:
237
- members = cluster_rep_to_members[rep]
238
- c = len(members)
239
- # Fill val first, then test, then train
240
- if cur_val + c <= target_val:
241
- val_ids.update(members)
242
- cur_val += c
243
- elif cur_test + c <= target_test:
244
- test_ids.update(members)
245
- cur_test += c
246
- else:
247
- train_ids.update(members)
248
-
249
- return {"train": train_ids, "val": val_ids, "test": test_ids}
250
-
251
-
252
- # ─────────────────────────────────────────────────────────────────────────
253
- # Main
254
- # ─────────────────────────────────────────────────────────────────────────
255
-
256
-
257
- def main():
258
- parser = argparse.ArgumentParser(
259
- description="Build TF-DNA pair lists with MMseqs clustering on DNA to prevent split leakage."
260
- )
261
- parser.add_argument(
262
- "--final_csv", required=True, help="final.csv with TF_id and dna_sequence"
263
- )
264
- parser.add_argument(
265
- "--dna_embed_npz", required=True, help="DNA embedding file (.npy or .npz)"
266
- )
267
- parser.add_argument(
268
- "--dna_ids", required=True, help="IDs file for DNA embeddings (peak*.ids)"
269
- )
270
- parser.add_argument(
271
- "--tf_embed_npy", required=True, help="TF embedding file (.npy or .npz)"
272
- )
273
- parser.add_argument(
274
- "--tf_ids", required=True, help="IDs file for TF embeddings (sp|... ids)"
275
- )
276
- parser.add_argument("--out_dir", required=True, help="Output directory")
277
- parser.add_argument("--seed", type=int, default=42)
278
-
279
- # NEW: MMseqs options & split fractions
280
- parser.add_argument("--mmseqs_bin", default="mmseqs", help="Path to mmseqs binary")
281
- parser.add_argument(
282
- "--min_seq_id", type=float, default=0.9, help="MMseqs --min-seq-id"
283
- )
284
- parser.add_argument(
285
- "--cov", type=float, default=0.8, help="MMseqs -c coverage fraction"
286
- )
287
- parser.add_argument(
288
- "--cov_mode",
289
- type=int,
290
- default=1,
291
- help="MMseqs --cov-mode (1 = coverage of target)",
292
- )
293
- parser.add_argument("--val_frac", type=float, default=0.10)
294
- parser.add_argument("--test_frac", type=float, default=0.10)
295
- parser.add_argument(
296
- "--tmp_dir", default=None, help="MMseqs tmp dir (defaults to out_dir/tmp)"
297
- )
298
- args = parser.parse_args()
299
-
300
- random.seed(args.seed)
301
- out_dir = Path(args.out_dir)
302
- out_dir.mkdir(parents=True, exist_ok=True)
303
-
304
- # Load final.csv
305
- df = pd.read_csv(args.final_csv, dtype=str)
306
- if "TF_id" not in df.columns or "dna_sequence" not in df.columns:
307
- raise RuntimeError("final.csv must have columns TF_id and dna_sequence")
308
-
309
- # Assign dna_id (unique per dna_sequence)
310
- unique_seqs = df["dna_sequence"].drop_duplicates().tolist()
311
- seq_to_id = {seq: f"peak{i}" for i, seq in enumerate(unique_seqs)}
312
- df["dna_id"] = df["dna_sequence"].map(seq_to_id)
313
- enriched_csv = out_dir / "final_with_dna_id.csv"
314
- df.to_csv(enriched_csv, index=False)
315
- print(f"[i] Wrote augmented final.csv with dna_id to {enriched_csv}")
316
-
317
- # Split embeddings into per-item files (unchanged)
318
- print(
319
- f"[i] Splitting DNA embeddings from {args.dna_embed_npz} with ids {args.dna_ids}"
320
- )
321
- dna_map = split_embeddings(
322
- args.dna_embed_npz, args.dna_ids, out_dir / "dna_single", "dna"
323
- )
324
- print(
325
- f"[i] DNA embeddings available: {len(dna_map)} (sample: {list(dna_map.keys())[:10]})"
326
- )
327
- print(
328
- f"[i] Splitting TF embeddings from {args.tf_embed_npy} with ids {args.tf_ids}"
329
- )
330
- tf_map = split_embeddings(
331
- args.tf_embed_npy, args.tf_ids, out_dir / "tf_single", "tf"
332
- )
333
- print(
334
- f"[i] TF embeddings available: {len(tf_map)} (sample: {list(tf_map.keys())[:10]})"
335
- )
336
-
337
- # Build gene-symbol normalized map
338
- tf_symbol_map = build_tf_symbol_map(tf_map)
339
- print(f"[i] TF symbol map keys (sample): {list(tf_symbol_map.keys())[:30]}")
340
-
341
- # Diagnostic overlaps
342
- norm_tf_in_final = set(t.split("_seq")[0].upper() for t in df["TF_id"].unique())
343
- available_tf_symbols = set(tf_symbol_map.keys())
344
- intersect_tf = norm_tf_in_final & available_tf_symbols
345
- print(f"[i] Unique normalized TF symbols in final.csv: {len(norm_tf_in_final)}")
346
- print(f"[i] Available TF embedding symbols: {len(available_tf_symbols)}")
347
- print(f"[i] Intersection count: {len(intersect_tf)}")
348
- if len(intersect_tf) == 0:
349
- print(
350
- "[ERROR] No overlap between normalized TF_id and TF embedding symbols.",
351
- file=sys.stderr,
352
- )
353
- print(
354
- "Sample normalized TFs from final.csv:",
355
- sorted(list(norm_tf_in_final))[:30],
356
- file=sys.stderr,
357
- )
358
- print(
359
- "Sample available TF symbols:",
360
- sorted(list(available_tf_symbols))[:30],
361
- file=sys.stderr,
362
- )
363
- sys.exit(1)
364
-
365
- dna_ids_final = set(df["dna_id"].unique())
366
- available_dna_ids = set(dna_map.keys())
367
- intersect_dna = dna_ids_final & available_dna_ids
368
- print(
369
- f"[i] Unique dna_id in final.csv: {len(dna_ids_final)}. Available DNA ids: {len(available_dna_ids)}. Intersection: {len(intersect_dna)}"
370
- )
371
- if len(intersect_dna) == 0:
372
- print("[ERROR] No overlap on DNA ids.", file=sys.stderr)
373
- sys.exit(1)
374
-
375
- # ── NEW: MMseqs clustering on DNA sequences ───────────────────────────
376
- fasta_path = out_dir / "dna_unique.fasta"
377
- write_dna_fasta(df, fasta_path)
378
- print(
379
- f"[i] Wrote FASTA with {df['dna_id'].nunique()} unique sequences → {fasta_path}"
380
- )
381
-
382
- tmp_dir = Path(args.tmp_dir) if args.tmp_dir else (out_dir / "mmseqs_tmp")
383
- cluster_prefix = out_dir / "mmseqs_dna_clusters"
384
- clusters_tsv = run_mmseqs_easy_cluster(
385
- mmseqs_bin=args.mmseqs_bin,
386
- fasta=fasta_path,
387
- out_prefix=cluster_prefix,
388
- tmp_dir=tmp_dir,
389
- min_seq_id=args.min_seq_id,
390
- coverage=args.cov,
391
- cov_mode=args.cov_mode,
392
- )
393
-
394
- # Parse clusters
395
- member_to_rep = parse_mmseqs_clusters(clusters_tsv) # dna_id -> rep_id
396
- # Build rep -> members list
397
- rep_to_members = defaultdict(list)
398
- for member, rep in member_to_rep.items():
399
- rep_to_members[rep].append(member)
400
-
401
- print(f"[i] Parsed {len(rep_to_members)} clusters from {clusters_tsv}")
402
- clusters_table = []
403
- for rep, members in rep_to_members.items():
404
- for m in members:
405
- clusters_table.append((m, rep))
406
- clusters_df = pd.DataFrame(clusters_table, columns=["dna_id", "cluster_id"])
407
- clusters_df.to_csv(out_dir / "clusters.tsv", sep="\t", index=False)
408
- print(f"[i] Wrote clusters mapping → {out_dir / 'clusters.tsv'}")
409
-
410
- # Attach cluster_id back to final df
411
- df = df.merge(clusters_df, on="dna_id", how="left")
412
- df.to_csv(out_dir / "final_with_dna_id_and_cluster.csv", index=False)
413
- print(f"[i] Wrote {out_dir / 'final_with_dna_id_and_cluster.csv'}")
414
-
415
- # Assign entire clusters to splits
416
- splits = assign_clusters_to_splits(
417
- rep_to_members, val_frac=args.val_frac, test_frac=args.test_frac, seed=args.seed
418
- )
419
- for k in ["train", "val", "test"]:
420
- print(f"[i] {k}: {len(splits[k])} dna_ids")
421
-
422
- # ── Build positive pairs only, per split (NO negatives) ───────────────
423
- positives_by_split = {"train": [], "val": [], "test": []}
424
- # Build a quick dna_id -> embedding path map
425
- dnaid_to_path = {did: path for did, path in dna_map.items()}
426
-
427
- pos_count = 0
428
- for _, row in df.iterrows():
429
- tf_raw = row["TF_id"]
430
- tf_symbol = tf_raw.split("_seq")[0].upper()
431
- dnaid = row["dna_id"]
432
- if (tf_symbol not in tf_symbol_map) or (dnaid not in dnaid_to_path):
433
- continue
434
- tf_embedding_path = tf_symbol_map[tf_symbol][0] # first embedding per symbol
435
-
436
- # decide split by dna_id cluster assignment
437
- if dnaid in splits["train"]:
438
- positives_by_split["train"].append(
439
- (tf_embedding_path, dnaid_to_path[dnaid], 1)
440
- )
441
- elif dnaid in splits["val"]:
442
- positives_by_split["val"].append(
443
- (tf_embedding_path, dnaid_to_path[dnaid], 1)
444
- )
445
- elif dnaid in splits["test"]:
446
- positives_by_split["test"].append(
447
- (tf_embedding_path, dnaid_to_path[dnaid], 1)
448
- )
449
- pos_count += 1
450
-
451
- print(
452
- f"[i] Constructed positives across splits (rows in final.csv iterated: {len(df)})"
453
- )
454
- for k in ["train", "val", "test"]:
455
- print(f"[i] positives[{k}] = {len(positives_by_split[k])}")
456
-
457
- # # OLD: negatives (kept commented)
458
- # negatives = []
459
- # print(f"[i] Sampled {len(negatives)} negatives (neg_per_positive not used)")
460
-
461
- # Emit split-specific pair lists
462
- for split in ["train", "val", "test"]:
463
- out_tsv = out_dir / f"pair_list_{split}.tsv"
464
- with open(out_tsv, "w") as f:
465
- for binder_path, glm_path, label in positives_by_split[
466
- split
467
- ]: # + negatives if you add later
468
- f.write(f"{binder_path}\t{glm_path}\t{label}\n")
469
- print(f"[i] Wrote {len(positives_by_split[split])} examples to {out_tsv}")
470
-
471
- print("✅ Done. Cluster-aware splits ready.")
472
-
473
-
474
- if __name__ == "__main__":
475
- main()
 
 
dpacman/classifier/model_tmp/compress_embeddings.py DELETED
@@ -1,62 +0,0 @@
1
- # compress_embeddings.py
2
- # USAGE: python compress_embeddings.py --input_glob "/path/to/esm_embeddings/*.npy" --output_dir "/path/to/compressed_embeddings" --esm_dim 1280 --out_dim 256
3
- # --------------
4
- import os
5
- import glob
6
- import numpy as np
7
- import torch
8
- from torch import nn
9
-
10
-
11
- class EmbeddingCompressor(nn.Module):
12
- def __init__(self, input_dim: int = 1280, output_dim: int = 256):
13
- super().__init__()
14
- self.fc = nn.Linear(input_dim, output_dim)
15
-
16
- def forward(self, x: torch.Tensor) -> torch.Tensor:
17
- """
18
- x: (batch, L, input_dim) or (L, input_dim)
19
- returns: (batch, output_dim) or (output_dim,)
20
- """
21
- if x.dim() == 2:
22
- # single example: mean over tokens
23
- x = x.mean(dim=0, keepdim=True) # → (1, input_dim)
24
- else:
25
- # batch: mean over tokens
26
- x = x.mean(dim=1) # → (batch, input_dim)
27
- return self.fc(x) # → (batch, output_dim)
28
-
29
-
30
- def compress_file(in_path: str, out_path: str, model: EmbeddingCompressor):
31
- arr = np.load(in_path) # shape (L, D) or (batch, L, D)
32
- tensor = torch.from_numpy(arr).float()
33
- with torch.no_grad():
34
- compressed = model(tensor) # → (batch, 256)
35
- out = compressed.cpu().numpy()
36
- np.save(out_path, out)
37
- print(f"Saved {out_path}")
38
-
39
-
40
- if __name__ == "__main__":
41
- import argparse
42
-
43
- parser = argparse.ArgumentParser(description="Compress ESM embeddings to 256­d")
44
- parser.add_argument(
45
- "--input_glob",
46
- type=str,
47
- required=True,
48
- help="Glob for your .npy ESM embeddings (e.g. data/esm_*.npy)",
49
- )
50
- parser.add_argument("--output_dir", type=str, required=True)
51
- parser.add_argument("--esm_dim", type=int, default=1280)
52
- parser.add_argument("--out_dim", type=int, default=256)
53
- args = parser.parse_args()
54
-
55
- os.makedirs(args.output_dir, exist_ok=True)
56
- compressor = EmbeddingCompressor(args.esm_dim, args.out_dim)
57
- compressor.eval()
58
-
59
- for fn in glob.glob(args.input_glob):
60
- base = os.path.basename(fn).replace(".npy", "_256.npy")
61
- out_path = os.path.join(args.output_dir, base)
62
- compress_file(fn, out_path, compressor)
 
dpacman/classifier/model_tmp/compute_embeddings.py DELETED
@@ -1,612 +0,0 @@
1
- """
2
- Plug-and-play embedding extraction for:
3
- • Chromosome sequences (from raw UCSC JSON)
4
- • TF sequences (transcription_factors.fasta)
5
-
6
- Usage example (DNA + protein in one go):
7
- module load miniconda/24.7.1
8
- conda activate dpacman
9
- python dpacman/data/compute_embeddings.py \
10
- --genome-json-dir ../data_files/raw/genomes/hg38 \
11
- --tf-fasta ../data_files/processed/tfclust/hg38_tf/transcription_factors.fasta \
12
- --chrom-model caduceus \
13
- --tf-model esm-dbp \
14
- --out-dir ../data_files/processed/tfclust/hg38_tf/embeddings \
15
- --device cuda
16
- """
17
-
18
- import os
19
- import re
20
- import argparse
21
- import json
22
- import numpy as np
23
- from pathlib import Path
24
- import torch
25
- from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, pipeline
26
- import esm
27
- from Bio import SeqIO
28
- import time
29
-
30
- # ---- model wrappers ----
31
-
32
-
33
- class CaduceusEmbedder:
34
- def __init__(self, device, chunk_size=131_072, overlap=0):
35
- """
36
- device: 'cpu' or 'cuda'
37
- chunk_size: max bases (and thus tokens) to send in one forward pass
38
- overlap: how many bases each window overlaps the previous; 0 = no overlap
39
- """
40
- model_name = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"
41
- self.tokenizer = AutoTokenizer.from_pretrained(
42
- model_name, trust_remote_code=True
43
- )
44
- self.model = (
45
- AutoModel.from_pretrained(model_name, trust_remote_code=True)
46
- .to(device)
47
- .eval()
48
- )
49
- self.device = device
50
- self.chunk_size = chunk_size
51
- self.step = chunk_size - overlap
52
-
53
- def embed(self, seqs):
54
- """
55
- seqs: List[str] of DNA sequences (each <= chunk_size for this test)
56
- returns: np.ndarray of shape (N, L, D), raw per‐token embeddings
57
- """
58
- # outputs = []
59
- # for seq in seqs:
60
- # # --- new: raw per‐token embeddings in one shot ---
61
- # toks = self.tokenizer(
62
- # seq,
63
- # return_tensors="pt",
64
- # padding=False,
65
- # truncation=True,
66
- # max_length=self.chunk_size
67
- # ).to(self.device)
68
- # with torch.no_grad():
69
- # out = self.model(**toks).last_hidden_state # (1, L, D)
70
- # outputs.append(out.cpu().numpy()[0]) # (L, D)
71
-
72
- # return np.stack(outputs, axis=0) # (N, L, D)
73
- outputs = []
74
- for seq in seqs:
75
- toks = self.tokenizer(
76
- seq,
77
- return_tensors="pt",
78
- padding=False,
79
- truncation=True,
80
- max_length=self.chunk_size,
81
- ).to(self.device)
82
- with torch.no_grad():
83
- out = self.model(**toks).last_hidden_state # (1, L, D)
84
- outputs.append(out.cpu().numpy()[0]) # (L, D)
85
- return outputs # list of variable-length (L_i, D) arrays
86
-
87
- def benchmark(self, lengths=None):
88
- """
89
- Time embedding on single-sequence of various lengths.
90
- By default tests [5K,10K,50K,100K,chunk_size].
91
- """
92
- tests = lengths or [5_000, 10_000, 50_000, 100_000, self.chunk_size]
93
- print(f"→ Benchmarking Caduceus on device={self.device}")
94
- for sz in tests:
95
- seq = "A" * sz
96
- # Warm-up
97
- _ = self.embed([seq])
98
- if self.device != "cpu":
99
- torch.cuda.synchronize()
100
- t0 = time.perf_counter()
101
- _ = self.embed([seq])
102
- if self.device != "cpu":
103
- torch.cuda.synchronize()
104
- t1 = time.perf_counter()
105
- print(f" length={sz:6,d} time={(t1-t0)*1000:7.1f} ms")
106
-
107
-
108
- class SegmentNTEmbedder:
109
- def __init__(self, device):
110
- self.tokenizer = AutoTokenizer.from_pretrained(
111
- "InstaDeepAI/segment_nt", trust_remote_code=True
112
- )
113
- self.model = (
114
- AutoModel.from_pretrained("InstaDeepAI/segment_nt", trust_remote_code=True)
115
- .to(device)
116
- .eval()
117
- )
118
- self.device = device
119
-
120
- def _adjust_length(self, input_ids):
121
- bs, L = input_ids.shape
122
- excl = L - 1
123
- remainder = (excl) % 4
124
- if remainder != 0:
125
- pad_needed = 4 - remainder
126
- pad_tensor = torch.full(
127
- (bs, pad_needed),
128
- self.tokenizer.pad_token_id,
129
- dtype=input_ids.dtype,
130
- device=input_ids.device,
131
- )
132
- input_ids = torch.cat([input_ids, pad_tensor], dim=1)
133
- return input_ids
134
-
135
- def embed(self, seqs, batch_size=16):
136
- """
137
- seqs: List[str]
138
- Returns: np.ndarray of shape (N, D)
139
- """
140
- all_embeddings = []
141
- for i in range(0, len(seqs), batch_size):
142
- batch_seqs = seqs[i : i + batch_size]
143
- encoded = self.tokenizer.batch_encode_plus(
144
- batch_seqs,
145
- return_tensors="pt",
146
- padding=True,
147
- truncation=True,
148
- )
149
- input_ids = encoded["input_ids"].to(self.device) # (B, L)
150
- attention_mask = input_ids != self.tokenizer.pad_token_id
151
-
152
- input_ids = self._adjust_length(input_ids)
153
- attention_mask = input_ids != self.tokenizer.pad_token_id
154
-
155
- with torch.no_grad():
156
- outs = self.model(
157
- input_ids,
158
- attention_mask=attention_mask,
159
- output_hidden_states=True,
160
- return_dict=True,
161
- )
162
- if hasattr(outs, "hidden_states") and outs.hidden_states is not None:
163
- last_hidden = outs.hidden_states[-1] # (B, L, D)
164
- else:
165
- last_hidden = outs.last_hidden_state # fallback
166
-
167
- # Exclude CLS token if present (assume first token) and pool
168
- pooled = last_hidden[:, 1:, :].mean(dim=1) # (B, D)
169
- all_embeddings.append(pooled.cpu().numpy())
170
-
171
- # release fragmentation
172
- torch.cuda.empty_cache()
173
-
174
- return np.vstack(all_embeddings) # (N, D)
175
-
176
-
177
- class DNABertEmbedder:
178
- def __init__(self, device):
179
- self.tokenizer = AutoTokenizer.from_pretrained(
180
- "zhihan1996/DNA_bert_6", trust_remote_code=True
181
- )
182
- self.model = AutoModel.from_pretrained(
183
- "zhihan1996/DNA_bert_6", trust_remote_code=True
184
- ).to(device)
185
- self.device = device
186
-
187
- def embed(self, seqs):
188
- embs = []
189
- for s in seqs:
190
- tokens = self.tokenizer(s, return_tensors="pt", padding=True)[
191
- "input_ids"
192
- ].to(self.device)
193
- with torch.no_grad():
194
- out = self.model(tokens).last_hidden_state.mean(1)
195
- embs.append(out.cpu().numpy())
196
- return np.vstack(embs)
197
-
198
-
199
- class NucleotideTransformerEmbedder:
200
- def __init__(self, device):
201
- # HF “feature-extraction” returns a list of (L, D) arrays for each input
202
- # device: “cpu” or “cuda”
203
- self.pipe = pipeline(
204
- "feature-extraction",
205
- model="InstaDeepAI/nucleotide-transformer-500m-1000g",
206
- device=(
207
- -1 if device == "cpu" else 0
208
- ),  # HF uses -1 for CPU, 0 for GPU
209
- )
210
-
211
- def embed(self, seqs):
212
- """
213
- seqs: List[str] of raw DNA sequences
214
- returns: (N, D) array, one D-dim vector per sequence
215
- """
216
- all_embeddings = self.pipe(seqs, truncation=True, padding=True)
217
- # all_embeddings is a List of shape (L, D) arrays
218
- pooled = [np.mean(x, axis=0) for x in all_embeddings]
219
- return np.vstack(pooled)
220
-
221
-
222
- # class ESMEmbedder:
223
- # def __init__(self, device):
224
- # self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
225
- # self.batch_converter = self.alphabet.get_batch_converter()
226
- # self.model.to(device).eval()
227
- # self.device = device
228
-
229
- # def embed(self, seqs):
230
- # batch = [(str(i), seq) for i, seq in enumerate(seqs)]
231
- # _, _, toks = self.batch_converter(batch)
232
- # toks = toks.to(self.device)
233
- # with torch.no_grad():
234
- # results = self.model(toks, repr_layers=[33], return_contacts=False)
235
- # reps = results["representations"][33]
236
- # return reps[:, 1:-1].mean(1).cpu().numpy()
237
-
238
-
239
- class ESMEmbedder:
240
- def __init__(self, device, model_name="esm2_t33_650M_UR50D"):
241
- # Try to load the specified ESM-2 model; fallback to esm1b if missing
242
- self.device = device
243
- try:
244
- self.model, self.alphabet = getattr(esm.pretrained, model_name)()
245
- self.is_esm2 = model_name.lower().startswith("esm2")
246
- except AttributeError:
247
- # fallback to ESM-1b
248
- self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
249
- self.is_esm2 = False
250
- self.batch_converter = self.alphabet.get_batch_converter()
251
- self.model.to(device).eval()
252
- # determine max length: esm2 models vary; use default 1024 for esm1b
253
- self.max_len = (
254
- 4096 if self.is_esm2 else 1024
255
- ) # adjust if your esm2 variant has explicit limit
256
- # for chunking: reserve 2 tokens if model uses BOS/EOS
257
- self.chunk_size = self.max_len - 2
258
- self.overlap = self.chunk_size // 4 # 25% overlap to smooth boundaries
259
-
260
- def _chunk_sequence(self, seq):
261
- """
262
- Return list of possibly overlapping chunks of seq, each <= chunk_size.
263
- """
264
- if len(seq) <= self.chunk_size:
265
- return [seq]
266
- step = self.chunk_size - self.overlap
267
- chunks = []
268
- for i in range(0, len(seq), step):
269
- chunk = seq[i : i + self.chunk_size]
270
- if not chunk:
271
- break
272
- chunks.append(chunk)
273
- return chunks
274
-
275
- def embed(self, seqs):
276
- """
277
- seqs: List[str] of protein sequences.
278
- Returns: np.ndarray of shape (N, D) pooled per-sequence embeddings.
279
- """
280
- all_embeddings = []
281
- for i, seq in enumerate(seqs):
282
- chunks = self._chunk_sequence(seq)
283
- chunk_vecs = []
284
- # process chunks in batch if small number, else sequentially
285
- for chunk in chunks:
286
- batch = [(str(i), chunk)]
287
- _, _, toks = self.batch_converter(batch)
288
- toks = toks.to(self.device)
289
- with torch.no_grad():
290
- results = self.model(toks, repr_layers=[33], return_contacts=False)
291
- reps = results["representations"][33] # (1, L, D)
292
- # remove BOS/EOS if present: take 1:-1 if length permits
293
- if reps.size(1) > 2:
294
- rep = reps[:, 1:-1].mean(1) # (1, D)
295
- else:
296
- rep = reps.mean(1) # fallback
297
- chunk_vecs.append(rep.squeeze(0)) # (D,)
298
- if len(chunk_vecs) == 1:
299
- seq_vec = chunk_vecs[0]
300
- else:
301
- # average chunk vectors
302
- stacked = torch.stack(chunk_vecs, dim=0) # (num_chunks, D)
303
- seq_vec = stacked.mean(0)
304
- all_embeddings.append(seq_vec.cpu().numpy())
305
- return np.vstack(all_embeddings) # (N, D)
306
-
307
-
308
- # class ESMDBPEmbedder:
309
- # def __init__(self, device):
310
- # base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
311
- # model_path = (
312
- # Path(__file__).resolve().parent.parent
313
- # / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
314
- # )
315
- # checkpoint = torch.load(model_path, map_location="cpu")
316
- # clean_sd = {}
317
- # for k, v in checkpoint.items():
318
- # clean_sd[k.replace("module.", "")] = v
319
- # result = base_model.load_state_dict(clean_sd, strict=False)
320
- # if result.missing_keys:
321
- # print(f"[ESMDBP] missing keys: {result.missing_keys}")
322
- # if result.unexpected_keys:
323
- # print(f"[ESMDBP] unexpected keys: {result.unexpected_keys}")
324
-
325
- # self.model = base_model.to(device).eval()
326
- # self.alphabet = alphabet
327
- # self.batch_converter = alphabet.get_batch_converter()
328
- # self.device = device
329
-
330
- # def embed(self, seqs):
331
- # batch = [(str(i), seq) for i, seq in enumerate(seqs)]
332
- # _, _, toks = self.batch_converter(batch)
333
- # toks = toks.to(self.device)
334
- # with torch.no_grad():
335
- # out = self.model(toks, repr_layers=[33], return_contacts=False)
336
- # reps = out["representations"][33]
337
- # # skip start/end tokens
338
- # return reps[:, 1:-1].mean(1).cpu().numpy()
339
-
340
-
341
- class ESMDBPEmbedder:
342
- def __init__(self, device):
343
- base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
344
- model_path = (
345
- Path(__file__).resolve().parent.parent
346
- / "pretrained"
347
- / "ESM-DBP"
348
- / "ESM-DBP.model"
349
- )
350
- checkpoint = torch.load(model_path, map_location="cpu")
351
- clean_sd = {}
352
- for k, v in checkpoint.items():
353
- clean_sd[k.replace("module.", "")] = v
354
- result = base_model.load_state_dict(clean_sd, strict=False)
355
- if result.missing_keys:
356
- print(f"[ESMDBP] missing keys: {result.missing_keys}")
357
- if result.unexpected_keys:
358
- print(f"[ESMDBP] unexpected keys: {result.unexpected_keys}")
359
-
360
- self.model = base_model.to(device).eval()
361
- self.alphabet = alphabet
362
- self.batch_converter = alphabet.get_batch_converter()
363
- self.device = device
364
- self.max_len = 1024 # same limit as esm1b
365
- self.chunk_size = self.max_len - 2
366
- self.overlap = self.chunk_size // 4
367
-
368
- def _chunk_sequence(self, seq):
369
- if len(seq) <= self.chunk_size:
370
- return [seq]
371
- step = self.chunk_size - self.overlap
372
- chunks = []
373
- for i in range(0, len(seq), step):
374
- chunk = seq[i : i + self.chunk_size]
375
- if not chunk:
376
- break
377
- chunks.append(chunk)
378
- return chunks
379
-
380
- def embed(self, seqs):
381
- all_embeddings = []
382
- for i, seq in enumerate(seqs):
383
- chunks = self._chunk_sequence(seq)
384
- chunk_vecs = []
385
- for chunk in chunks:
386
- batch = [(str(i), chunk)]
387
- _, _, toks = self.batch_converter(batch)
388
- toks = toks.to(self.device)
389
- with torch.no_grad():
390
- out = self.model(toks, repr_layers=[33], return_contacts=False)
391
- reps = out["representations"][33]
392
- if reps.size(1) > 2:
393
- rep = reps[:, 1:-1].mean(1)
394
- else:
395
- rep = reps.mean(1)
396
- chunk_vecs.append(rep.squeeze(0))
397
- if len(chunk_vecs) == 1:
398
- seq_vec = chunk_vecs[0]
399
- else:
400
- stacked = torch.stack(chunk_vecs, dim=0)
401
- seq_vec = stacked.mean(0)
402
- all_embeddings.append(seq_vec.cpu().numpy())
403
- return np.vstack(all_embeddings)
404
-
405
-
406
- class GPNEmbedder:
407
- def __init__(self, device):
408
- model_name = "songlab/gpn-msa-sapiens"
409
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
410
- self.model = AutoModelForMaskedLM.from_pretrained(model_name)
411
- self.model.to(device)
412
- self.model.eval()
413
- self.device = device
414
-
415
- def embed(self, seqs):
416
- inputs = self.tokenizer(
417
- seqs, return_tensors="pt", padding=True, truncation=True
418
- ).to(self.device)
419
-
420
- with torch.no_grad():
421
- last_hidden = self.model(**inputs, output_hidden_states=True).hidden_states[-1]  # masked-LM outputs expose hidden_states, not last_hidden_state
422
- return last_hidden.mean(dim=1).cpu().numpy()
423
-
424
-
425
- class ProGenEmbedder:
426
- def __init__(self, device):
427
- model_name = "jinyuan22/ProGen2-base"
428
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
429
- self.model = AutoModel.from_pretrained(model_name).to(device).eval()
430
- self.device = device
431
-
432
- def embed(self, seqs):
433
- inputs = self.tokenizer(
434
- seqs, return_tensors="pt", padding=True, truncation=True
435
- ).to(self.device)
436
- with torch.no_grad():
437
- last_hidden = self.model(**inputs).last_hidden_state
438
- return last_hidden.mean(dim=1).cpu().numpy()
439
-
440
-
441
- # ---- main pipeline ----
442
-
443
-
444
- def get_embedder(name, device, for_dna=True):
445
- name = name.lower()
446
- if for_dna:
447
- if name == "caduceus":
448
- return CaduceusEmbedder(device)
449
- if name == "dnabert":
450
- return DNABertEmbedder(device)
451
- if name == "nucleotide":
452
- return NucleotideTransformerEmbedder(device)
453
- if name == "gpn":
454
- return GPNEmbedder(device)
455
- if name == "segmentnt":
456
- return SegmentNTEmbedder(device)
457
- else:
458
- if name in ("esm",):
459
- return ESMEmbedder(device)
460
- if name in ("esm-dbp", "esm_dbp"):
461
- return ESMDBPEmbedder(device)
462
- if name == "progen":
463
- return ProGenEmbedder(device)
464
- raise ValueError(f"Unknown model {name} (for_dna={for_dna})")
465
-
466
-
467
- def pad_token_embeddings(list_of_arrays, pad_value=0.0):
468
- """
469
- list_of_arrays: list of (L_i, D) numpy arrays
470
- Returns:
471
- padded: (N, L_max, D) array
472
- mask: (N, L_max) boolean array where True = real token, False = padding
473
- """
474
- N = len(list_of_arrays)
475
- D = list_of_arrays[0].shape[1]
476
- L_max = max(arr.shape[0] for arr in list_of_arrays)
477
- padded = np.full((N, L_max, D), pad_value, dtype=list_of_arrays[0].dtype)
478
- mask = np.zeros((N, L_max), dtype=bool)
479
- for i, arr in enumerate(list_of_arrays):
480
- L = arr.shape[0]
481
- padded[i, :L] = arr
482
- mask[i, :L] = True
483
- return padded, mask
484
-
485
-
486
- def embed_and_save(seqs, ids, embedder, out_path):
487
- embs = embedder.embed(seqs)
488
-
489
- # Decide whether we got variable-length per-token outputs (list of (L, D))
490
- is_variable_token = (
491
- isinstance(embs, (list, tuple))
492
- and len(embs) > 0
493
- and hasattr(embs[0], "shape")
494
- and embs[0].ndim == 2
495
- )
496
-
497
- if is_variable_token:
498
- # pad to (N, L_max, D) + mask
499
- padded, mask = pad_token_embeddings(embs)
500
- # Save both embeddings and mask together in an .npz for convenience
501
- np.savez_compressed(
502
- out_path.with_suffix(".caduceus.npz"),
503
- embeddings=padded,
504
- mask=mask,
505
- ids=np.array(ids, dtype=object),
506
- )
507
- else:
508
- # fixed shape output, e.g., pooled (N, D)
509
- array = np.vstack(embs) if isinstance(embs, list) else embs
510
- np.save(out_path, array)
511
- with open(out_path.with_suffix(".ids"), "w") as f:
512
- f.write("\n".join(ids))
513
-
514
-
515
- if __name__ == "__main__":
516
-
517
- p = argparse.ArgumentParser()
518
- p.add_argument(
519
- "--peak-fasta",
520
- default="binding_peaks_unique.fa",
521
- help="FASTA of deduplicated binding peak sequences; if present this is used for DNA embedding instead of genome JSONs",
522
- )
523
- p.add_argument(
524
- "--genome-json-dir",
525
- default=None,
526
- help="(fallback) directory of UCSC JSONs for full chromosome embedding if peak FASTA is missing or you explicitly want chromosomes",
527
- )
528
- p.add_argument(
529
- "--skip-dna",
530
- action="store_true",
531
- help="if set, skip the chromosome embedding step",
532
- ) # if glm embeddings successful but not plm embeddings
533
- p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
534
- p.add_argument("--chrom-model", default="caduceus")
535
- p.add_argument("--tf-model", default="esm-dbp")
536
- p.add_argument(
537
- "--out-dir", default="data_files/processed/tfclust/hg38_tf/embeddings"
538
- )
539
- p.add_argument("--device", default="cpu")
540
- args = p.parse_args()
541
-
542
- os.makedirs(args.out_dir, exist_ok=True)
543
- device = args.device
544
-
545
- if not args.skip_dna:
546
- peak_fasta = Path(args.peak_fasta)
547
- if peak_fasta.exists():
548
- # Load peak sequences from FASTA
549
- from Bio import SeqIO
550
-
551
- peak_seqs = []
552
- peak_ids = []
553
- for rec in SeqIO.parse(peak_fasta, "fasta"):
554
- peak_ids.append(rec.id)
555
- peak_seqs.append(str(rec.seq))
556
- print(
557
- f"Embedding {len(peak_seqs)} binding peak sequences from {peak_fasta}",
558
- flush=True,
559
- )
560
- dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
561
- out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
562
- embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
563
- elif args.genome_json_dir:
564
- # Legacy: load full chromosomes from JSONs (chr1–22, X, Y, M)
565
- genome_dir = Path(args.genome_json_dir)
566
- chrom_seqs, chrom_ids = [], []
567
- primary_pattern = re.compile(
568
- r"^hg38_chr(?:[1-9]|1[0-9]|2[0-2]|X|Y|M)\.json$"
569
- )
570
- for j in sorted(genome_dir.iterdir()):
571
- if not primary_pattern.match(j.name):
572
- continue
573
- data = json.loads(j.read_text())
574
- seq = data.get("dna") or data.get("sequence")
575
- chrom = data.get("chrom") or j.stem.split("_")[-1]
576
- chrom_seqs.append(seq)
577
- chrom_ids.append(chrom)
578
- cutoff = CaduceusEmbedder(device).chunk_size
579
- long_chroms = [
580
- (chrom, len(seq))
581
- for chrom, seq in zip(chrom_ids, chrom_seqs)
582
- if len(seq) > cutoff
583
- ]
584
- if long_chroms:
585
- print(
586
- "⚠️ Chromosomes exceeding Caduceus max tokens ({}):".format(cutoff)
587
- )
588
- for chrom, L in long_chroms:
589
- print(f" {chrom}: {L} bases")
590
- else:
591
- print("All chromosomes ≤ Caduceus limit ({}).".format(cutoff))
592
-
593
- chrom_embedder = get_embedder(args.chrom_model, device, for_dna=True)
594
- out_chrom = Path(args.out_dir) / f"chrom_{args.chrom_model}.npy"
595
- embed_and_save(chrom_seqs, chrom_ids, chrom_embedder, out_chrom)
596
- else:
597
- raise ValueError(
598
- "No input for DNA embedding: provide a peak FASTA (default binding_peaks_unique.fa) or set --genome-json-dir for chromosome JSONs."
599
- )
600
-
601
- # Load TF sequences
602
- tf_seqs, tf_ids = [], []
603
- for record in SeqIO.parse(args.tf_fasta, "fasta"):
604
- tf_ids.append(record.id)
605
- tf_seqs.append(str(record.seq))
606
-
607
- # embed and save
608
- tf_embedder = get_embedder(args.tf_model, device, for_dna=False)
609
- out_tf = Path(args.out_dir) / f"tf_{args.tf_model}.npy"
610
- embed_and_save(tf_seqs, tf_ids, tf_embedder, out_tf)
611
-
612
- print("Done.")
dpacman/classifier/model_tmp/extract_tf_symbols.py DELETED
@@ -1,30 +0,0 @@
1
- #!/usr/bin/env python3
2
- import pandas as pd
3
- from pathlib import Path
4
-
5
- FINAL_CSV = Path("/home/a03-akrishna/DPACMAN/data_files/processed/final.csv")
6
- OUT_SYMBOLS = Path("tf_symbols.txt")
7
-
8
-
9
- def normalize_tf(tf_id: str) -> str:
10
- return tf_id.split("_seq")[0].upper()
11
-
12
-
13
- def main():
14
- df = pd.read_csv(FINAL_CSV, dtype=str)
15
- if "TF_id" not in df.columns:
16
- raise RuntimeError("final.csv missing TF_id column")
17
- tf_raw = df["TF_id"].dropna().unique().tolist()
18
- normalized = sorted({normalize_tf(t) for t in tf_raw})
19
- print(f"Unique raw TF_id count: {len(tf_raw)}")
20
- print(f"Unique normalized TF symbols: {len(normalized)}")
21
- with open(OUT_SYMBOLS, "w") as f:
22
- for s in normalized:
23
- f.write(s + "\n")
24
- print(f"Wrote normalized TF symbols to {OUT_SYMBOLS}")
25
- # Optional: show sample
26
- print("Sample symbols:", normalized[:50])
27
-
28
-
29
- if __name__ == "__main__":
30
- main()
dpacman/classifier/model_tmp/make_pair_list.py DELETED
@@ -1,282 +0,0 @@
1
- #!/usr/bin/env python3
2
- import argparse
3
- import numpy as np
4
- import pandas as pd
5
- from pathlib import Path
6
- import random
7
- import sys
8
-
9
-
10
- def read_ids_file(p):
11
- p = Path(p)
12
- if not p.exists():
13
- raise FileNotFoundError(f"IDs file not found: {p}")
14
- return [line.strip() for line in p.open() if line.strip()]
15
-
16
-
17
- def split_embeddings(emb_path, ids_path, out_dir, prefix):
18
- out_dir = Path(out_dir)
19
- out_dir.mkdir(parents=True, exist_ok=True)
20
-
21
- if not Path(emb_path).exists():
22
- raise FileNotFoundError(f"Embedding file not found: {emb_path}")
23
- if not Path(ids_path).exists():
24
- raise FileNotFoundError(f"IDs file not found: {ids_path}")
25
-
26
- if emb_path.endswith(".npz"):
27
- data = np.load(emb_path, allow_pickle=True)
28
- if "embeddings" in data:
29
- emb = data["embeddings"]
30
- else:
31
- raise ValueError(f"{emb_path} missing 'embeddings' key")
32
- else:
33
- emb = np.load(emb_path)
34
-
35
- ids = read_ids_file(ids_path)
36
- if len(ids) != emb.shape[0]:
37
- print(
38
- f"[WARN] length mismatch: {len(ids)} ids vs {emb.shape[0]} embeddings in {emb_path}",
39
- file=sys.stderr,
40
- )
41
-
42
- mapping = {}
43
- for i, ident in enumerate(ids):
44
- if i >= emb.shape[0]:
45
- print(
46
- f"[WARN] skipping {ident}: no embedding at index {i}", file=sys.stderr
47
- )
48
- continue
49
- arr = emb[i]
50
- out_file = out_dir / f"{prefix}_{ident}.npy"
51
- np.save(out_file, arr)
52
- mapping[ident] = str(out_file)
53
- return mapping
54
-
55
-
56
- def extract_symbol_from_tf_id(full_id: str) -> str:
57
- """
58
- Given a TF embedding ID like 'sp|O15062|ZBTB5_HUMAN' or 'ZBTB5_HUMAN',
59
- return the gene symbol uppercase (e.g., 'ZBTB5').
60
- """
61
- if "|" in full_id:
62
- try:
63
- # format sp|Accession|SYMBOL_HUMAN
64
- genepart = full_id.split("|")[2]
65
- except IndexError:
66
- genepart = full_id
67
- else:
68
- genepart = full_id
69
- symbol = genepart.split("_")[0]
70
- return symbol.upper()
71
-
72
-
73
- def build_tf_symbol_map(tf_map):
74
- """
75
- Build mapping gene_symbol -> list of embedding paths.
76
- """
77
- symbol_map = {}
78
- for full_id, path in tf_map.items():
79
- symbol = extract_symbol_from_tf_id(full_id)
80
- symbol_map.setdefault(symbol, []).append(path)
81
- return symbol_map
82
-
83
-
84
- def tf_key_from_path(path: str) -> str:
85
- """
86
- Given a path like .../tf_sp|O15062|ZBTB5_HUMAN.npy, extract normalized symbol 'ZBTB5'.
87
- """
88
- stem = Path(path).stem # e.g., tf_sp|O15062|ZBTB5_HUMAN
89
- # remove leading prefix if present (tf_)
90
- if "_" in stem:
91
- _, rest = stem.split("_", 1)
92
- else:
93
- rest = stem
94
- return extract_symbol_from_tf_id(rest)
95
-
96
-
97
- def dna_key_from_path(path: str) -> str:
98
- """
99
- Given .../dna_peak42.npy -> 'peak42'
100
- """
101
- stem = Path(path).stem
102
- if "_" in stem:
103
- _, rest = stem.split("_", 1)
104
- else:
105
- rest = stem
106
- return rest
107
-
108
-
109
- def main():
110
- parser = argparse.ArgumentParser(
111
- description="Build TF-DNA pair list from final.csv with gene-symbol normalization for TFs."
112
- )
113
- parser.add_argument(
114
- "--final_csv", required=True, help="final.csv with TF_id and dna_sequence"
115
- )
116
- parser.add_argument(
117
- "--dna_embed_npz", required=True, help="DNA embedding file (.npy or .npz)"
118
- )
119
- parser.add_argument(
120
- "--dna_ids", required=True, help="IDs file for DNA embeddings (e.g., peak*.ids)"
121
- )
122
- parser.add_argument(
123
- "--tf_embed_npy", required=True, help="TF embedding file (.npy or .npz)"
124
- )
125
- parser.add_argument(
126
- "--tf_ids",
127
- required=True,
128
- help="IDs file for TF embeddings (e.g., sp|...|... ids)",
129
- )
130
- parser.add_argument("--out_dir", required=True, help="Output directory")
131
- parser.add_argument(
132
- "--neg_per_positive",
133
- type=int,
134
- default=2,
135
- help="Negatives per positive (half same-TF, half same-DNA)",
136
- )
137
- parser.add_argument("--seed", type=int, default=42)
138
- args = parser.parse_args()
139
-
140
- random.seed(args.seed)
141
- out_dir = Path(args.out_dir)
142
- out_dir.mkdir(parents=True, exist_ok=True)
143
-
144
- # Load final.csv
145
- df = pd.read_csv(args.final_csv, dtype=str)
146
- if "TF_id" not in df.columns or "dna_sequence" not in df.columns:
147
- raise RuntimeError("final.csv must have columns TF_id and dna_sequence")
148
-
149
- # Assign dna_id (unique per dna_sequence)
150
- unique_seqs = df["dna_sequence"].drop_duplicates().tolist()
151
- seq_to_id = {seq: f"peak{i}" for i, seq in enumerate(unique_seqs)}
152
- df["dna_id"] = df["dna_sequence"].map(seq_to_id)
153
- enriched_csv = out_dir / "final_with_dna_id.csv"
154
- df.to_csv(enriched_csv, index=False)
155
- print(f"[i] Wrote augmented final.csv with dna_id to {enriched_csv}")
156
-
157
- # Split embeddings into per-item files
158
- print(
159
- f"[i] Splitting DNA embeddings from {args.dna_embed_npz} with ids {args.dna_ids}"
160
- )
161
- dna_map = split_embeddings(
162
- args.dna_embed_npz, args.dna_ids, out_dir / "dna_single", "dna"
163
- )
164
- print(
165
- f"[i] DNA embeddings available: {len(dna_map)} (sample: {list(dna_map.keys())[:10]})"
166
- )
167
- print(
168
- f"[i] Splitting TF embeddings from {args.tf_embed_npy} with ids {args.tf_ids}"
169
- )
170
- tf_map = split_embeddings(
171
- args.tf_embed_npy, args.tf_ids, out_dir / "tf_single", "tf"
172
- )
173
- print(
174
- f"[i] TF embeddings available: {len(tf_map)} (sample: {list(tf_map.keys())[:10]})"
175
- )
176
-
177
- # Build gene-symbol normalized map
178
- tf_symbol_map = build_tf_symbol_map(tf_map)
179
- print(f"[i] TF symbol map keys (sample): {list(tf_symbol_map.keys())[:30]}")
180
-
181
- # Diagnostic overlaps
182
- norm_tf_in_final = set(t.split("_seq")[0].upper() for t in df["TF_id"].unique())
183
- available_tf_symbols = set(tf_symbol_map.keys())
184
- intersect_tf = norm_tf_in_final & available_tf_symbols
185
- print(f"[i] Unique normalized TF symbols in final.csv: {len(norm_tf_in_final)}")
186
- print(f"[i] Available TF embedding symbols: {len(available_tf_symbols)}")
187
- print(f"[i] Intersection count: {len(intersect_tf)}")
188
- if len(intersect_tf) == 0:
189
- print(
190
- "[ERROR] No overlap between normalized TF_id and TF embedding symbols.",
191
- file=sys.stderr,
192
- )
193
- print(
194
- "Sample normalized TFs from final.csv:",
195
- sorted(list(norm_tf_in_final))[:30],
196
- file=sys.stderr,
197
- )
198
- print(
199
- "Sample available TF symbols:",
200
- sorted(list(available_tf_symbols))[:30],
201
- file=sys.stderr,
202
- )
203
- sys.exit(1)
204
-
205
- dna_ids_final = set(df["dna_id"].unique())
206
- available_dna_ids = set(dna_map.keys())
207
- intersect_dna = dna_ids_final & available_dna_ids
208
- print(
209
- f"[i] Unique dna_id in final.csv: {len(dna_ids_final)}. Available DNA ids: {len(available_dna_ids)}. Intersection: {len(intersect_dna)}"
210
- )
211
- if len(intersect_dna) == 0:
212
- print("[ERROR] No overlap on DNA ids.", file=sys.stderr)
213
- sys.exit(1)
214
-
215
- # Build positive pairs
216
- positives = []
217
- for _, row in df.iterrows():
218
- tf_raw = row["TF_id"]
219
- tf_symbol = tf_raw.split("_seq")[0].upper()
220
- dnaid = row["dna_id"]
221
- if tf_symbol not in tf_symbol_map:
222
- continue
223
- if dnaid not in dna_map:
224
- continue
225
- # pick the first embedding for that symbol
226
- tf_embedding_path = tf_symbol_map[tf_symbol][0]
227
- positives.append((tf_embedding_path, dna_map[dnaid], 1))
228
- print(f"[i] Constructed {len(positives)} positive pairs after TF symbol resolution")
229
-
230
- if len(positives) == 0:
231
- print(
232
- "[ERROR] No positive pairs could be constructed; aborting.", file=sys.stderr
233
- )
234
- sys.exit(1)
235
-
236
- # Build negative samples
237
- all_tf_symbols = sorted(tf_symbol_map.keys())
238
- all_dnaids = sorted(dna_map.keys())
239
- positive_set = set()
240
- for tf_path, dna_path, _ in positives:
241
- tf_key = tf_key_from_path(tf_path)
242
- dna_key = dna_key_from_path(dna_path)
243
- positive_set.add((tf_key, dna_key))
244
-
245
- negatives = []
246
- half = args.neg_per_positive // 2
247
- for tf_path, dna_path, _ in positives:
248
- tf_key = tf_key_from_path(tf_path)
249
- dna_key = dna_key_from_path(dna_path)
250
- # same TF, different DNA
251
- for _ in range(half):
252
- candidate_dna = random.choice(all_dnaids)
253
- if candidate_dna == dna_key or (tf_key, candidate_dna) in positive_set:
254
- continue
255
- negatives.append((tf_path, dna_map[candidate_dna], 0))
256
- # same DNA, different TF
257
- for _ in range(half):
258
- candidate_tf_symbol = random.choice(all_tf_symbols)
259
- if (
260
- candidate_tf_symbol == tf_key
261
- or (candidate_tf_symbol, dna_key) in positive_set
262
- ):
263
- continue
264
- # pick its first embedding
265
- candidate_tf_path = tf_symbol_map[candidate_tf_symbol][0]
266
- negatives.append((candidate_tf_path, dna_path, 0))  # keep the current pair's DNA (dnaid was a stale variable from the loop above)
267
-
268
- print(
269
- f"[i] Sampled {len(negatives)} negatives (neg_per_positive={args.neg_per_positive})"
270
- )
271
-
272
- # Write pair list
273
- pair_list_path = out_dir / "pair_list.tsv"
274
- with open(pair_list_path, "w") as f:
275
- for binder_path, glm_path, label in positives + negatives:
276
- # binder=TF, glm=DNA
277
- f.write(f"{binder_path}\t{glm_path}\t{label}\n")
278
- print(f"[i] Wrote {len(positives)+len(negatives)} examples to {pair_list_path}")
279
-
280
-
281
- if __name__ == "__main__":
282
- main()
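A condensed restatement (not an import from the deleted module) of the gene-symbol normalization this script leaned on, with a couple of worked examples:

```python
def extract_symbol(full_id: str) -> str:
    # 'sp|O15062|ZBTB5_HUMAN' -> 'ZBTB5'; 'ZBTB5_HUMAN' -> 'ZBTB5'; 'zbtb5_seq3' -> 'ZBTB5'
    part = full_id.split("|")[2] if full_id.count("|") >= 2 else full_id
    return part.split("_")[0].upper()

assert extract_symbol("sp|O15062|ZBTB5_HUMAN") == "ZBTB5"
assert extract_symbol("ZBTB5_HUMAN") == "ZBTB5"
assert extract_symbol("zbtb5_seq3") == "ZBTB5"
```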
dpacman/classifier/model_tmp/make_peak_fasta.py DELETED
@@ -1,15 +0,0 @@
1
- import pandas as pd
2
- from pathlib import Path
3
-
4
- df = pd.read_csv(
5
- "/home/a03-akrishna/DPACMAN/data_files/processed/final.csv", dtype=str
6
- ) # adjust path if needed
7
- # get unique sequences
8
- uniq = df[["dna_sequence"]].drop_duplicates().reset_index(drop=True)
9
- # make headers: e.g., peak0, peak1, ...
10
- out_fa = Path("binding_peaks_unique.fa")
11
- with open(out_fa, "w") as f:
12
- for i, seq in enumerate(uniq["dna_sequence"]):
13
- header = f">peak{i}"
14
- f.write(f"{header}\n{seq}\n")
15
- print(f"Wrote {len(uniq)} unique binding sequences to {out_fa}")
dpacman/classifier/model_tmp/model.py DELETED
@@ -1,111 +0,0 @@
1
- import torch
2
- from torch import nn
3
-
4
-
5
- class LocalCNN(nn.Module):
6
- def __init__(self, dim: int = 256, kernel_size: int = 3):
7
- super().__init__()
8
- padding = kernel_size // 2
9
- self.conv = nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=padding)
10
- self.act = nn.GELU()
11
- self.ln = nn.LayerNorm(dim)
12
-
13
- def forward(self, x: torch.Tensor):
14
- # x: (batch, L, dim)
15
- out = self.conv(x.transpose(1, 2)) # → (batch, dim, L)
16
- out = self.act(out)
17
- out = out.transpose(1, 2) # → (batch, L, dim)
18
- return self.ln(out + x) # residual
19
-
20
-
21
- class CrossModalBlock(nn.Module):
22
- def __init__(self, dim: int = 256, heads: int = 8):
23
- super().__init__()
24
- # self-attention for both sides
25
- self.sa_binder = nn.MultiheadAttention(dim, heads, batch_first=True)
26
- self.sa_glm = nn.MultiheadAttention(dim, heads, batch_first=True)
27
- self.ln_b1 = nn.LayerNorm(dim)
28
- self.ln_g1 = nn.LayerNorm(dim)
29
-
30
- self.ffn_b = nn.Sequential(
31
- nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
32
- )
33
- self.ffn_g = nn.Sequential(
34
- nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
35
- )
36
- self.ln_b2 = nn.LayerNorm(dim)
37
- self.ln_g2 = nn.LayerNorm(dim)
38
-
39
- # cross attention (binder queries, glm keys/values)
40
- self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
41
- self.ln_c1 = nn.LayerNorm(dim)
42
- self.ffn_c = nn.Sequential(
43
- nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
44
- )
45
- self.ln_c2 = nn.LayerNorm(dim)
46
-
47
- def forward(self, binder: torch.Tensor, glm: torch.Tensor):
48
- """
49
- binder: (batch, Lb, dim)
50
- glm: (batch, Lg, dim) -- has passed through its local CNN beforehand
51
- returns: updated binder representation (batch, Lb, dim)
52
- """
53
- # binder self-attn + ffn
54
- b = binder
55
- b_sa, _ = self.sa_binder(b, b, b)
56
- b = self.ln_b1(b + b_sa)
57
- b_ff = self.ffn_b(b)
58
- b = self.ln_b2(b + b_ff)
59
-
60
- # glm self-attn + ffn
61
- g = glm
62
- g_sa, _ = self.sa_glm(g, g, g)
63
- g = self.ln_g1(g + g_sa)
64
- g_ff = self.ffn_g(g)
65
- g = self.ln_g2(g + g_ff)
66
-
67
- # cross-attention: binder queries glm
68
- c_sa, _ = self.cross_attn(b, g, g)
69
- c = self.ln_c1(b + c_sa)
70
- c_ff = self.ffn_c(c)
71
- c = self.ln_c2(c + c_ff)
72
- return c # (batch, Lb, dim)
73
-
74
-
75
- class BindPredictor(nn.Module):
76
- def __init__(
77
- self,
78
- input_dim: int = 256,
79
- hidden_dim: int = 256,
80
- heads: int = 8,
81
- num_layers: int = 4,
82
- use_local_cnn_on_glm: bool = True,
83
- ):
84
- super().__init__()
85
- self.proj_binder = nn.Linear(input_dim, hidden_dim)
86
- self.proj_glm = nn.Linear(input_dim, hidden_dim)
87
- self.use_local_cnn = use_local_cnn_on_glm
88
- self.local_cnn = LocalCNN(hidden_dim) if use_local_cnn_on_glm else nn.Identity()
89
-
90
- self.layers = nn.ModuleList(
91
- [CrossModalBlock(hidden_dim, heads) for _ in range(num_layers)]
92
- )
93
-
94
- self.ln_out = nn.LayerNorm(hidden_dim)
95
- self.head = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid())
96
-
97
- def forward(self, binder_emb, glm_emb):
98
- """
99
- binder_emb, glm_emb: (batch, L, input_dim)
100
- """
101
- b = self.proj_binder(binder_emb) # (B, Lb, hidden_dim)
102
- g = self.proj_glm(glm_emb) # (B, Lg, hidden_dim)
103
- if self.use_local_cnn:
104
- g = self.local_cnn(g) # local context injected
105
-
106
- for layer in self.layers:
107
- b = layer(b, g) # update binder with cross-modal info
108
-
109
- pooled = b.mean(dim=1) # (B, hidden_dim)
110
- out = self.ln_out(pooled)
111
- return self.head(out).squeeze(-1) # (B,)
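A minimal shape check for the `BindPredictor` deleted above, assuming the module is still importable locally as `model`; the batch and sequence lengths are arbitrary toy values.

```python
import torch
from model import BindPredictor  # the module deleted above, if restored locally

# Toy batch: 2 pairs, binder length 64, DNA length 128, 256-d input embeddings.
net = BindPredictor(input_dim=256, hidden_dim=256, heads=8, num_layers=2)
binder = torch.randn(2, 64, 256)
glm = torch.randn(2, 128, 256)

with torch.no_grad():
    scores = net(binder, glm)  # per-pair sigmoid scores

print(scores.shape)  # torch.Size([2])
```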
dpacman/classifier/model_tmp/prep_splits.py DELETED
@@ -1,157 +0,0 @@
1
- import numpy as np
2
- import pandas as pd
3
- import sys
4
- import json
5
- from sklearn.decomposition import TruncatedSVD
6
- from sklearn.model_selection import train_test_split
7
- from collections import Counter
8
-
9
-
10
- def parse_pair_list(pair_list_path):
11
- binder_paths, glm_paths, labels = [], [], []
12
- with open(pair_list_path) as f:
13
- for lineno, line in enumerate(f, start=1):
14
- if not line.strip():
15
- continue
16
- parts = line.strip().split()
17
- if len(parts) != 3:
18
- print(
19
- f"[WARN] skipping malformed line {lineno}: {line.strip()}",
20
- file=sys.stderr,
21
- )
22
- continue
23
- b, g, l = parts
24
- try:
25
- lab = int(l)
26
- except ValueError:
27
- print(f"[WARN] invalid label on line {lineno}: {l}", file=sys.stderr)
28
- continue
29
- binder_paths.append(b)
30
- glm_paths.append(g)
31
- labels.append(lab)
32
- return binder_paths, glm_paths, labels
33
-
34
-
35
- def build_tf_compressed_cache(binder_paths, target_dim=256):
36
- """
37
- Load all unique TF (binder) embeddings, fit reduction if needed, and return dict mapping path->(L, target_dim) array.
38
- """
39
- unique_paths = sorted(set(binder_paths))
40
- print(
41
- f"[i] Found {len(unique_paths)} unique TF embedding files to compress.",
42
- flush=True,
43
- )
44
- # Load all embeddings to determine dimensionality
45
- samples = []
46
- for p in unique_paths:
47
- arr = np.load(p)
48
- samples.append(arr)
49
- # Determine if reduction needed: assume all have same embedding width
50
- first = samples[0]
51
- orig_dim = first.shape[1] if first.ndim == 2 else 1
52
- reduction_needed = orig_dim != target_dim
53
- tf_cache = {}
54
-
55
- if reduction_needed:
56
- # Build matrix to fit SVD: we need a 2D matrix per embedding; if lengths vary we can't directly stack.
57
- # We'll do reduction per sequence individually using TruncatedSVD on concatenated flattened features:
58
- # Simplest: for variable lengths, reduce each embedding separately with a learned linear projection.
59
- # Here we fit a single TruncatedSVD on the concatenation of all sequence tokens (flattened) by padding/truncating to a fixed length.
60
- # To avoid complexity, use PCA-like linear projection learned via SVD on mean-pooled vectors:
61
- pooled = []
62
- for arr in samples:
63
- if arr.ndim == 2:
64
- pooled.append(arr.mean(axis=0)) # (orig_dim,)
65
- else:
66
- pooled.append(arr) # degenerate
67
- pooled_mat = np.stack(pooled, axis=0) # (N, orig_dim)
68
- print(
69
- f"[i] Fitting TruncatedSVD on TF pooled embeddings: {pooled_mat.shape} -> {target_dim}",
70
- flush=True,
71
- )
72
- svd = TruncatedSVD(n_components=target_dim, random_state=42)
73
- reduced_pooled = svd.fit_transform(pooled_mat) # (N, target_dim)
74
-
75
- # For each original embedding, project token-level vectors by multiplying token vector with svd.components_.T
76
- # svd.components_: (target_dim, orig_dim) so projection matrix is (orig_dim, target_dim)
77
- proj_mat = svd.components_.T # (orig_dim, target_dim)
78
- for i, p in enumerate(unique_paths):
79
- arr = samples[i] # shape (L, orig_dim)
80
- if arr.ndim == 1:
81
- arr2 = arr @ proj_mat # (target_dim,)
82
- else:
83
- # project each token: (L, orig_dim) @ (orig_dim, target_dim) -> (L, target_dim)
84
- arr2 = arr @ proj_mat
85
- tf_cache[p] = arr2 # reduced per-token representation
86
- print("[i] Completed compression of TF embeddings.", flush=True)
87
- else:
88
- # already correct dim: just cache originals
89
- print(
90
- f"[i] TF embeddings already {target_dim}-dimensional; skipping reduction.",
91
- flush=True,
92
- )
93
- for i, p in enumerate(unique_paths):
94
- arr = samples[i]
95
- tf_cache[p] = arr
96
- return tf_cache
97
-
98
-
99
- def main():
100
- # df = pd.read_csv("../data_files/processed/fimo/ananya_aug4_2025_final.csv")
101
-
102
- binder_paths, glm_paths, labels = parse_pair_list(
103
- "../data_files/processed/fimo/ananya_aug4_2025_pair_list.tsv"
104
- )
105
-
106
- if len(labels) == 0:
107
- print("[ERROR] No valid pairs parsed. Exiting.", file=sys.stderr)
108
- sys.exit(1)
109
-
110
- label_counts = Counter(labels)
111
- print(
112
- f"[i] Total examples parsed: {len(labels)}. Label distribution: {label_counts}",
113
- flush=True,
114
- )
115
-
116
- # build compressed TF cache (reduces to 256 if needed)
117
- # tf_compressed_cache = build_tf_compressed_cache(binder_paths, target_dim=256)
118
-
119
- # Combine all data into one structure for easy splitting
120
- data = list(zip(binder_paths, glm_paths, labels))
121
-
122
- # First split: train vs temp (val+test)
123
- train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)
124
-
125
- # Second split: val vs test (50% of 20% → 10% each)
126
- val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
127
-
128
- # Unpack for dataset construction
129
- def unpack(data):
130
- binders, glms, labels = zip(*data)
131
- return list(binders), list(glms), list(labels)
132
-
133
- def save_split(binder_paths, glm_paths, labels, out_path):
134
- df = pd.DataFrame(
135
- {
136
- "binder_path": binder_paths,
137
- "glm_path": glm_paths,
138
- "label": labels,
139
- }
140
- )
141
- df.to_csv(out_path, index=False)
142
-
143
- # Unpack data for saving
144
- train_binders, train_glms, train_labels = unpack(train_data)
145
- val_binders, val_glms, val_labels = unpack(val_data)
146
- test_binders, test_glms, test_labels = unpack(test_data)
147
-
148
- # Save each split
149
- save_split(
150
- train_binders, train_glms, train_labels, "../data_files/splits/train.csv"
151
- )
152
- save_split(val_binders, val_glms, val_labels, "../data_files/splits/val.csv")
153
- save_split(test_binders, test_glms, test_labels, "../data_files/splits/test.csv")
154
-
155
-
156
- if __name__ == "__main__":
157
- main()
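`build_tf_compressed_cache` above fits a TruncatedSVD on mean-pooled TF vectors and then reuses the learned components to project every token-level embedding. A scaled-down sketch of that projection (dimensions shrunk for illustration; the script targets 256 output dimensions and much wider ESM-style inputs):

```python
import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.default_rng(0)
# 40 variable-length "TF embeddings", each (L_i, 64); the real embeddings are much wider.
seqs = [rng.normal(size=(int(L), 64)).astype(np.float32) for L in rng.integers(4, 12, size=40)]

pooled = np.stack([s.mean(axis=0) for s in seqs])          # (40, 64) mean-pooled vectors
svd = TruncatedSVD(n_components=16, random_state=42).fit(pooled)
proj = svd.components_.T                                    # (64, 16) projection matrix

reduced = [s @ proj for s in seqs]                           # each sequence becomes (L_i, 16)
print(reduced[0].shape)
```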
dpacman/classifier/model_tmp/train.py DELETED
@@ -1,217 +0,0 @@
1
- #!/usr/bin/env python3
2
- import argparse
3
- import numpy as np
4
- import torch
5
- from torch import nn
6
- from model import BindPredictor
7
- from pathlib import Path
8
- from collections import Counter
9
- from sklearn.metrics import roc_auc_score, average_precision_score
10
- from sklearn.decomposition import TruncatedSVD
11
- import sys
12
-
13
- from dpacman.utils.models import set_seed
14
-
15
-
16
- def build_tf_compressed_cache(binder_paths, target_dim=256):
17
- """
18
- Load all unique TF (binder) embeddings, fit reduction if needed, and return dict mapping path->(L, target_dim) array.
19
- """
20
- unique_paths = sorted(set(binder_paths))
21
- print(
22
- f"[i] Found {len(unique_paths)} unique TF embedding files to compress.",
23
- flush=True,
24
- )
25
- # Load all embeddings to determine dimensionality
26
- samples = []
27
- for p in unique_paths:
28
- arr = np.load(p)
29
- samples.append(arr)
30
- # Determine if reduction needed: assume all have same embedding width
31
- first = samples[0]
32
- orig_dim = first.shape[1] if first.ndim == 2 else 1
33
- reduction_needed = orig_dim != target_dim
34
- tf_cache = {}
35
-
36
- if reduction_needed:
37
- # Build matrix to fit SVD: we need a 2D matrix per embedding; if lengths vary we can't directly stack.
38
- # We'll do reduction per sequence individually using TruncatedSVD on concatenated flattened features:
39
- # Simplest: for variable lengths, reduce each embedding separately with a learned linear projection.
40
- # Here we fit a single TruncatedSVD on the concatenation of all sequence tokens (flattened) by padding/truncating to a fixed length.
41
- # To avoid complexity, use PCA-like linear projection learned via SVD on mean-pooled vectors:
42
- pooled = []
43
- for arr in samples:
44
- if arr.ndim == 2:
45
- pooled.append(arr.mean(axis=0)) # (orig_dim,)
46
- else:
47
- pooled.append(arr) # degenerate
48
- pooled_mat = np.stack(pooled, axis=0) # (N, orig_dim)
49
- print(
50
- f"[i] Fitting TruncatedSVD on TF pooled embeddings: {pooled_mat.shape} -> {target_dim}",
51
- flush=True,
52
- )
53
- svd = TruncatedSVD(n_components=target_dim, random_state=42)
54
- reduced_pooled = svd.fit_transform(pooled_mat) # (N, target_dim)
55
-
56
- # For each original embedding, project token-level vectors by multiplying token vector with svd.components_.T
57
- # svd.components_: (target_dim, orig_dim) so projection matrix is (orig_dim, target_dim)
58
- proj_mat = svd.components_.T # (orig_dim, target_dim)
59
- for i, p in enumerate(unique_paths):
60
- arr = samples[i] # shape (L, orig_dim)
61
- if arr.ndim == 1:
62
- arr2 = arr @ proj_mat # (target_dim,)
63
- else:
64
- # project each token: (L, orig_dim) @ (orig_dim, target_dim) -> (L, target_dim)
65
- arr2 = arr @ proj_mat
66
- tf_cache[p] = arr2 # reduced per-token representation
67
- print("[i] Completed compression of TF embeddings.", flush=True)
68
- else:
69
- # already correct dim: just cache originals
70
- print(
71
- f"[i] TF embeddings already {target_dim}-dimensional; skipping reduction.",
72
- flush=True,
73
- )
74
- for i, p in enumerate(unique_paths):
75
- arr = samples[i]
76
- tf_cache[p] = arr
77
- return tf_cache
78
-
79
-
80
- def evaluate(model, dl, device):
81
- model.eval()
82
- all_labels = []
83
- all_preds = []
84
- with torch.no_grad():
85
- for b, g, y in dl:
86
- b = b.to(device)
87
- g = g.to(device)
88
- y = y.to(device)
89
- pred = model(b, g)
90
- all_labels.append(y.cpu())
91
- all_preds.append(pred.cpu())
92
- if not all_labels:
93
- return 0.0, 0.0
94
- y_true = torch.cat(all_labels).numpy()
95
- y_score = torch.cat(all_preds).numpy()
96
- try:
97
- auc = roc_auc_score(y_true, y_score)
98
- except Exception:
99
- auc = 0.0
100
- try:
101
- ap = average_precision_score(y_true, y_score)
102
- except Exception:
103
- ap = 0.0
104
- return auc, ap
105
-
106
-
107
- def unpack(data):
108
- binders, glms, labels = zip(*data)
109
- return list(binders), list(glms), list(labels)
110
-
111
-
112
- # ---- main ------------------------------------------------------------
113
- def main():
114
- parser = argparse.ArgumentParser()
115
- parser.add_argument("--pair_list", type=str, required=True)
116
-
117
- parser.add_argument("--out_dir", type=str, required=True)
118
- parser.add_argument("--epochs", type=int, default=10)
119
- parser.add_argument("--batch_size", type=int, default=32)
120
- parser.add_argument("--lr", type=float, default=1e-4)
121
- parser.add_argument("--device", type=str, default="cuda")
122
- parser.add_argument("--seed", type=int, default=42)
123
- args = parser.parse_args()
124
-
125
- #
126
- print("DEBUG: starting training script with in-line TF compression", flush=True)
127
- device = torch.device(args.device if torch.cuda.is_available() else "cpu")
128
- set_seed(args.seed)  # set seed for reproducibility
- binder_paths, glm_paths, labels = parse_pair_list(args.pair_list)
129
-
130
- if len(labels) == 0:
131
- print("[ERROR] No valid pairs parsed. Exiting.", file=sys.stderr)
132
- sys.exit(1)
133
-
134
- label_counts = Counter(labels)
135
- print(
136
- f"[i] Total examples parsed: {len(labels)}. Label distribution: {label_counts}",
137
- flush=True,
138
- )
139
-
140
- # build compressed TF cache (reduces to 256 if needed)
141
- tf_compressed_cache = build_tf_compressed_cache(binder_paths, target_dim=256)
142
-
143
- # load training data splits
144
-
145
- train_ds = PairDataset(None, tf_compressed_cache=tf_compressed_cache)
146
- val_ds = PairDataset(*subset(val_i), tf_compressed_cache=tf_compressed_cache)
147
- test_ds = PairDataset(*subset(test_i), tf_compressed_cache=tf_compressed_cache)
148
-
149
- print(
150
- f"[i] Train/Val/Test sizes: {len(train_ds)}/{len(val_ds)}/{len(test_ds)}",
151
- flush=True,
152
- )
153
- if len(train_ds) == 0 or len(val_ds) == 0:
154
- print(
155
- "[ERROR] Train or validation split is empty; cannot proceed.",
156
- file=sys.stderr,
157
- )
158
- sys.exit(1)
159
-
160
- train_dl = DataLoader(
161
- train_ds, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn
162
- )
163
- val_dl = DataLoader(
164
- val_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn
165
- )
166
- test_dl = DataLoader(
167
- test_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn
168
- )
169
-
170
- model = BindPredictor(
171
- input_dim=256, hidden_dim=256, heads=8, num_layers=3, use_local_cnn_on_glm=True
172
- )
173
- model = model.to(device)
174
- optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=1e-3)
175
- loss_fn = nn.BCELoss()
176
-
177
- best_val = -float("inf")
178
- os_out = Path(args.out_dir)
179
- os_out.mkdir(exist_ok=True, parents=True)
180
-
181
- for epoch in range(1, args.epochs + 1):
182
- print(f"[Epoch {epoch}] starting...", flush=True)
183
- model.train()
184
- running_loss = 0.0
185
- for b, g, y in train_dl:
186
- b = b.to(device)
187
- g = g.to(device)
188
- y = y.to(device)
189
- pred = model(b, g)
190
- loss = loss_fn(pred, y)
191
- optimizer.zero_grad()
192
- loss.backward()
193
- optimizer.step()
194
- running_loss += loss.item() * b.size(0)
195
- train_loss = running_loss / len(train_ds)
196
- val_auc, val_ap = evaluate(model, val_dl, device)
197
- print(
198
- f"[Epoch {epoch}] train_loss={train_loss:.4f} val_auc={val_auc:.4f} val_ap={val_ap:.4f}",
199
- flush=True,
200
- )
201
-
202
- if val_auc > best_val:
203
- best_val = val_auc
204
- torch.save(model.state_dict(), os_out / "best_model.pt")
205
- print(
206
- f"[Epoch {epoch}] Saved new best model with val_auc={val_auc:.4f}",
207
- flush=True,
208
- )
209
-
210
- torch.save(model.state_dict(), os_out / "last_model.pt")
211
- test_auc, test_ap = evaluate(model, test_dl, device)
212
- print(f"FINAL TEST: AUC={test_auc:.4f} AP={test_ap:.4f}", flush=True)
213
- print(f"[i] Models written to {os_out}/best_model.pt and last_model.pt", flush=True)
214
-
215
-
216
- if __name__ == "__main__":
217
- main()
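A small sketch of reloading the `best_model.pt` checkpoint this loop writes (path and hyperparameters are illustrative and must match those used at training time):

```python
import torch
from model import BindPredictor  # deleted module above, if restored locally

net = BindPredictor(input_dim=256, hidden_dim=256, heads=8, num_layers=3, use_local_cnn_on_glm=True)
state = torch.load("out_dir/best_model.pt", map_location="cpu")
net.load_state_dict(state)
net.eval()
```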
dpacman/classifier/old_train.py DELETED
@@ -1,486 +0,0 @@
1
- import argparse, random, sys
2
- from pathlib import Path
3
-
4
- import numpy as np
5
- import pandas as pd
6
- import torch
7
- from torch import nn
8
- from torch.utils.data import Dataset, DataLoader, Sampler
9
-
10
- # from sklearn.random_projection import GaussianRandomProjection # OLD (kept): projection was removed earlier
11
- import matplotlib.pyplot as plt
12
-
13
- import torch.amp as amp
14
- from torch.nn import functional as F
15
- from model import BindPredictor
16
-
17
-
18
- # ─────────────── utilities ────────────────────────────────────────────────
19
- def parse_pair_list(path):
20
- binders, glms = [], []
21
- with open(path) as f:
22
- for ln, line in enumerate(f, 1):
23
- parts = line.strip().split()
24
- if len(parts) < 2:
25
- continue
26
- b, g = parts[0], parts[1]
27
- binders.append(b)
28
- glms.append(g)
29
- return binders, glms
30
-
31
-
32
- class ListBatchSampler(Sampler):
33
- def __init__(self, batches):
34
- self.batches = batches
35
-
36
- def __iter__(self):
37
- return iter(self.batches)
38
-
39
- def __len__(self):
40
- return len(self.batches)
41
-
42
-
43
- def make_buckets(idxs, glm_paths, batch_size, n_buckets=10, seed=42):
44
- rng = random.Random(seed)
45
- lengths = [(i, np.load(glm_paths[i]).shape[0]) for i in idxs]
46
- lengths.sort(key=lambda x: x[1])
47
- size = max(1, int(np.ceil(len(lengths) / n_buckets)))
48
- buckets = [lengths[i : i + size] for i in range(0, len(lengths), size)]
49
- batches = []
50
- for bucket in buckets:
51
- ids = [i for i, _ in bucket]
52
- rng.shuffle(ids)
53
- for i in range(0, len(ids), batch_size):
54
- batches.append(ids[i : i + batch_size])
55
- rng.shuffle(batches)
56
- return batches
57
-
58
-
59
- def dna_key_from_path(path: str) -> str:
60
- """.../dna_peak42.npy -> 'peak42'"""
61
- stem = Path(path).stem
62
- if "_" in stem:
63
- _, rest = stem.split("_", 1)
64
- else:
65
- rest = stem
66
- return rest
67
-
68
-
69
- def build_tf_cache(tf_paths, target_dim=256):
70
- """
71
- Load raw TF embeddings without projecting; compression is learnable in the model.
72
- """
73
- unique = sorted(set(tf_paths))
74
- print(
75
- f"[i] (Learnable) Preparing {len(unique)} TF files; target {target_dim}d inside the model",
76
- flush=True,
77
- )
78
-
79
- pools, raw = [], []
80
- for p in unique:
81
- arr = np.load(p) # (L, D) or (D,)
82
- raw.append(arr)
83
- pools.append(arr.mean(axis=0) if arr.ndim == 2 else arr)
84
- M = np.stack(pools, 0)
85
- orig_dim = M.shape[1]
86
- print(f"[i] Pooled shape → {M.shape} (orig_dim={orig_dim})", flush=True)
87
-
88
- cache = {}
89
- for i, p in enumerate(unique):
90
- arr = raw[i]
91
- # OLD: projection here (removed)
92
- cache[p] = arr
93
- print("[i] TF cache ready (raw); compression will be learned.", flush=True)
94
- return cache
95
-
96
-
97
- # ─────────────── Dataset & Collation ─────────────────────────────────────
98
- class PairDataset(Dataset):
99
- def __init__(self, tf_paths, dna_paths, final_df, tf_cache):
100
- self.tf_paths, self.dna_paths = tf_paths, dna_paths
101
- self.tf_cache = tf_cache
102
- self.targets = {}
103
- for _, row in final_df.iterrows():
104
- dna_id = row["dna_id"]
105
- vec = np.array(
106
- list(map(float, row["score_sig_r2"].split(","))), dtype=np.float32
107
- )
108
- self.targets[dna_id] = vec
109
-
110
- def __len__(self):
111
- return len(self.tf_paths)
112
-
113
- def __getitem__(self, i):
114
- b = self.tf_cache[self.tf_paths[i]] # (L_b, D_b) or (D_b,)
115
- if b.ndim == 1:
116
- b = b[None, :]
117
- g = np.load(self.dna_paths[i]) # (L_g, 256) or (256,)
118
- if g.ndim == 1:
119
- g = g[None, :]
120
-
121
- stem = Path(self.dna_paths[i]).stem
122
- dna_id = stem.replace("dna_", "")
123
- t = self.targets.get(dna_id, np.zeros(g.shape[0], dtype=np.float32))
124
-
125
- return (
126
- torch.from_numpy(b).float(),
127
- torch.from_numpy(g).float(),
128
- torch.from_numpy(t).float(),
129
- )
130
-
131
-
132
- def collate_fn(batch):
133
- Bs = [b.shape[0] for b, _, _ in batch]
134
- Gs = [g.shape[0] for _, g, _ in batch]
135
- maxB, maxG = max(Bs), max(Gs)
136
-
137
- def pad_seq(x, L):
138
- if x.shape[0] < L:
139
- pad = torch.zeros(
140
- (L - x.shape[0], x.shape[1]), dtype=x.dtype, device=x.device
141
- )
142
- return torch.cat([x, pad], dim=0)
143
- return x
144
-
145
- def pad_t(y, L):
146
- if y.shape[0] < L:
147
- pad = torch.zeros((L - y.shape[0],), dtype=y.dtype, device=y.device)
148
- return torch.cat([y, pad], dim=0)
149
- return y
150
-
151
- b_stack = torch.stack([pad_seq(b, maxB) for b, _, _ in batch])
152
- g_stack = torch.stack([pad_seq(g, maxG) for _, g, _ in batch])
153
- t_stack = torch.stack([pad_t(t, maxG) for *_, t in batch])
154
- return b_stack, g_stack, t_stack
155
-
156
-
157
- # ──���──────────── losses, metrics ─────────────────────────────────────────
158
- def combined_loss_components(logits, targets, peak_thresh=0.5, eps=1e-8):
159
- probs = torch.sigmoid(logits)
160
- labels = (targets >= peak_thresh).float()
161
- non_peak_mask = (labels == 0).float()
162
- peak_mask = (labels == 1).float()
163
-
164
- bce_all = F.binary_cross_entropy_with_logits(logits, labels, reduction="none")
165
- bce_non = bce_all * non_peak_mask
166
- bce_non = bce_non.sum() / (non_peak_mask.sum() + eps)
167
-
168
- mse_peaks = F.mse_loss(probs * peak_mask, targets * peak_mask, reduction="sum") / (
169
- peak_mask.sum() + eps
170
- )
171
- mse_global = F.mse_loss(probs, targets, reduction="mean")
172
-
173
- t_dist = targets + eps
174
- p_dist = probs + eps
175
- t_dist = t_dist / t_dist.sum(dim=1, keepdim=True)
176
- p_dist = p_dist / p_dist.sum(dim=1, keepdim=True)
177
- kl = (
178
- (t_dist * (t_dist.clamp(min=eps).log() - p_dist.clamp(min=eps).log()))
179
- .sum(dim=1)
180
- .mean()
181
- )
182
-
183
- return bce_non, kl, mse_global, probs
184
-
185
-
186
- def accuracy_percentage(logits, targets, peak_thresh=0.5):
187
- probs = torch.sigmoid(logits)
188
- preds_bin = (probs >= 0.5).float()
189
- labels = (targets >= peak_thresh).float()
190
- correct = (preds_bin == labels).float().sum()
191
- total = torch.numel(labels)
192
- return (correct / max(1, total)).item() * 100.0
193
-
194
-
195
- def evaluate(model, dl, device, alpha, beta, gamma, peak_thresh, eps=1e-8):
196
- model.eval()
197
- tot_loss, tot_acc = 0.0, 0.0
198
- n_batches = 0
199
- with torch.no_grad():
200
- for b, g, t in dl:
201
- b, g, t = b.to(device), g.to(device), t.to(device)
202
- logits = model(b, g)
203
- bce_non, kl, mse_global, _ = combined_loss_components(
204
- logits, t, peak_thresh=peak_thresh, eps=eps
205
- )
206
- loss = alpha * bce_non + beta * kl + gamma * mse_global
207
- acc = accuracy_percentage(logits, t, peak_thresh=peak_thresh)
208
- tot_loss += loss.item()
209
- tot_acc += acc
210
- n_batches += 1
211
- if n_batches == 0:
212
- return float("nan"), float("nan")
213
- return tot_loss / n_batches, tot_acc / n_batches
214
-
215
-
216
- # ─────────────── cluster-aware splitting ──────────────────────────────────
217
- def assign_clusters_to_splits(
218
- cluster_to_indices, val_frac=0.10, test_frac=0.10, seed=42
219
- ):
220
- """
221
- cluster_to_indices: dict[cluster_id] -> list of example indices (from pair_list) in that cluster
222
- We greedily pack whole clusters into val/test until hitting targets (#examples), rest to train.
223
- """
224
- rng = random.Random(seed)
225
- clusters = list(cluster_to_indices.items())
226
- rng.shuffle(clusters)
227
-
228
- total = sum(len(ixs) for _, ixs in clusters)
229
- target_val = int(round(total * val_frac))
230
- target_test = int(round(total * test_frac))
231
- cur_val = cur_test = 0
232
-
233
- tr_ix, va_ix, te_ix = [], [], []
234
- for cid, ixs in clusters:
235
- c = len(ixs)
236
- if cur_val + c <= target_val:
237
- va_ix.extend(ixs)
238
- cur_val += c
239
- elif cur_test + c <= target_test:
240
- te_ix.extend(ixs)
241
- cur_test += c
242
- else:
243
- tr_ix.extend(ixs)
244
- return tr_ix, va_ix, te_ix
245
-
246
-
247
- # ─────────────── train & main ────────────────────────────────────────────
248
- def main():
249
- p = argparse.ArgumentParser()
250
- p.add_argument("--pair_list", required=True)
251
- p.add_argument("--final_csv", required=True)
252
- p.add_argument("--out_dir", required=True)
253
- p.add_argument("--epochs", type=int, default=10)
254
- p.add_argument("--batch_size", type=int, default=16)
255
- p.add_argument("--accum_steps", type=int, default=4)
256
- p.add_argument("--lr", type=float, default=1e-4)
257
- p.add_argument("--device", default="cuda")
258
- p.add_argument("--seed", type=int, default=42)
259
- p.add_argument("--alpha", type=float, default=1)
260
- p.add_argument("--beta", type=float, default=0)
261
- p.add_argument("--gamma", type=float, default=1)
262
- p.add_argument("--peak_thresh", type=float, default=0.5)
263
- # NEW: fractions for cluster-aware split (used only if cluster_id present)
264
- p.add_argument("--val_frac", type=float, default=0.10)
265
- p.add_argument("--test_frac", type=float, default=0.10)
266
- args = p.parse_args()
267
-
268
- random.seed(args.seed)
269
- np.random.seed(args.seed)
270
- torch.manual_seed(args.seed)
271
- device = torch.device(args.device if torch.cuda.is_available() else "cpu")
272
-
273
- # 1) load pair list & final.csv (now may include cluster_id)
274
- tf_paths, dna_paths = parse_pair_list(args.pair_list)
275
- final_df = pd.read_csv(args.final_csv, dtype=str)
276
- print(f"[i] Loaded {len(tf_paths)} pairs", flush=True)
277
-
278
- tf_cache = build_tf_cache(tf_paths, target_dim=256)
279
-
280
- # detect binder/DNA dims
281
- sample_tf = tf_cache[tf_paths[0]]
282
- binder_input_dim = sample_tf.shape[1] if sample_tf.ndim == 2 else sample_tf.shape[0]
283
- glm_input_dim = 256
284
-
285
- # 2) cluster-aware split if possible
286
- use_cluster_split = "cluster_id" in final_df.columns
287
- if use_cluster_split:
288
- print(
289
- "[i] Cluster column detected in final_csv; performing cluster-aware split.",
290
- flush=True,
291
- )
292
- # build dna_id -> cluster_id map
293
- cid_map = (
294
- final_df[["dna_id", "cluster_id"]]
295
- .dropna()
296
- .drop_duplicates()
297
- .set_index("dna_id")["cluster_id"]
298
- .to_dict()
299
- )
300
-
301
- # map each example (by index) to its dna_id and cluster
302
- example_dna_ids = [dna_key_from_path(p) for p in dna_paths]
303
- example_clusters = []
304
- missing = 0
305
- for did in example_dna_ids:
306
- if did in cid_map:
307
- example_clusters.append(cid_map[did])
308
- else:
309
- # fallback: treat singleton cluster
310
- example_clusters.append(f"singleton::{did}")
311
- missing += 1
312
- if missing:
313
- print(
314
- f"[WARN] {missing} dna_ids from pair_list not found in cluster map; treating as singleton clusters.",
315
- flush=True,
316
- )
317
-
318
- # build cluster -> indices
319
- cluster_to_indices = {}
320
- for i, cid in enumerate(example_clusters):
321
- cluster_to_indices.setdefault(cid, []).append(i)
322
-
323
- tr_idx, va_idx, te_idx = assign_clusters_to_splits(
324
- cluster_to_indices,
325
- val_frac=args.val_frac,
326
- test_frac=args.test_frac,
327
- seed=args.seed,
328
- )
329
- print(
330
- f"[i] Cluster split sizes (examples): train={len(tr_idx)} val={len(va_idx)} test={len(te_idx)}",
331
- flush=True,
332
- )
333
-
334
- # helper to subset paths
335
- def subset_by_indices(ixs):
336
- return [tf_paths[i] for i in ixs], [dna_paths[i] for i in ixs]
337
-
338
- tr_t, tr_d = subset_by_indices(tr_idx)
339
- va_t, va_d = subset_by_indices(va_idx)
340
- te_t, te_d = subset_by_indices(te_idx)
341
-
342
- else:
343
- print(
344
- "[i] No cluster_id in final_csv; using random 80/10/10 split (OLD behavior).",
345
- flush=True,
346
- )
347
- # OLD random split (kept, now under else)
348
- N = len(tf_paths)
349
- idxs = list(range(N))
350
- random.shuffle(idxs)
351
- n_tr = int(0.8 * N)
352
- n_va = int(0.1 * N)
353
- tr, va, te = idxs[:n_tr], idxs[n_tr : n_tr + n_va], idxs[n_tr + n_va :]
354
-
355
- def subset(idxs_):
356
- return [tf_paths[i] for i in idxs_], [dna_paths[i] for i in idxs_]
357
-
358
- tr_t, tr_d = subset(tr)
359
- va_t, va_d = subset(va)
360
- te_t, te_d = subset(te)
361
-
362
- # 3) bucketed samplers (unchanged, but now use the cluster-aware subsets when available)
363
- tr_bs = make_buckets(
364
- list(range(len(tr_t))), tr_d, args.batch_size, n_buckets=10, seed=args.seed
365
- )
366
- va_bs = make_buckets(
367
- list(range(len(va_t))), va_d, args.batch_size, n_buckets=5, seed=args.seed + 1
368
- )
369
- te_bs = make_buckets(
370
- list(range(len(te_t))), te_d, args.batch_size, n_buckets=5, seed=args.seed + 2
371
- )
372
-
373
- tr_dl = DataLoader(
374
- PairDataset(tr_t, tr_d, final_df, tf_cache),
375
- batch_sampler=ListBatchSampler(tr_bs),
376
- collate_fn=collate_fn,
377
- )
378
- va_dl = DataLoader(
379
- PairDataset(va_t, va_d, final_df, tf_cache),
380
- batch_sampler=ListBatchSampler(va_bs),
381
- collate_fn=collate_fn,
382
- )
383
- te_dl = DataLoader(
384
- PairDataset(te_t, te_d, final_df, tf_cache),
385
- batch_sampler=ListBatchSampler(te_bs),
386
- collate_fn=collate_fn,
387
- )
388
-
389
- # 4) model, optimizer, scaler
390
- model = BindPredictor(
391
- binder_input_dim=binder_input_dim,
392
- glm_input_dim=glm_input_dim,
393
- compressed_dim=256,
394
- hidden_dim=256,
395
- heads=8,
396
- num_layers=4,
397
- use_local_cnn_on_glm=True,
398
- ).to(device)
399
- optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)
400
- scaler = amp.GradScaler("cuda")
401
-
402
- history, best_val = {"train": [], "val": []}, float("inf")
403
- od = Path(args.out_dir)
404
- od.mkdir(exist_ok=True, parents=True)
405
-
406
- for ep in range(1, args.epochs + 1):
407
- print(f"┌─[Epoch {ep}]────────────────────────", flush=True)
408
- model.train()
409
- optimizer.zero_grad()
410
- acc_loss_sum, acc_acc_sum, n_train_batches = 0.0, 0.0, 0
411
-
412
- for i, (b, g, t) in enumerate(tr_dl):
413
- b, g, t = b.to(device), g.to(device), t.to(device)
414
- with amp.autocast("cuda"):
415
- logits = model(b, g)
416
- bce_non, kl, mse_global, probs = combined_loss_components(
417
- logits, t, peak_thresh=args.peak_thresh
418
- )
419
- loss = args.alpha * bce_non + args.beta * kl + args.gamma * mse_global
420
- loss = loss / args.accum_steps
421
-
422
- scaler.scale(loss).backward()
423
-
424
- if (i + 1) % args.accum_steps == 0:
425
- scaler.step(optimizer)
426
- scaler.update()
427
- optimizer.zero_grad()
428
-
429
- with torch.no_grad():
430
- acc_loss_sum += loss.item() * args.accum_steps
431
- acc_acc_sum += accuracy_percentage(
432
- logits, t, peak_thresh=args.peak_thresh
433
- )
434
- n_train_batches += 1
435
-
436
- del b, g, t, logits, probs, loss, bce_non, kl, mse_global
437
- torch.cuda.empty_cache()
438
-
439
- # finalize if leftovers
440
- if n_train_batches % args.accum_steps != 0:
441
- scaler.step(optimizer)
442
- scaler.update()
443
- optimizer.zero_grad()
444
-
445
- train_loss = acc_loss_sum / max(1, n_train_batches)
446
- train_acc = acc_acc_sum / max(1, n_train_batches)
447
-
448
- val_loss, val_acc = evaluate(
449
- model,
450
- va_dl,
451
- device,
452
- alpha=args.alpha,
453
- beta=args.beta,
454
- gamma=args.gamma,
455
- peak_thresh=args.peak_thresh,
456
- )
457
- print(
458
- f"[Epoch {ep}] train_loss={train_loss:.4f} train_acc={train_acc:.2f}% "
459
- f"val_loss={val_loss:.4f} val_acc={val_acc:.2f}%",
460
- flush=True,
461
- )
462
-
463
- history["train"].append(train_loss)
464
- history["val"].append(val_loss)
465
- if val_loss < best_val:
466
- best_val = val_loss
467
- torch.save(model.state_dict(), od / "best_model.pt")
468
- print(
469
- f" Saved new best_model.pt (val_loss={val_loss:.4f}, val_acc={val_acc:.2f}%)",
470
- flush=True,
471
- )
472
-
473
- torch.save(model.state_dict(), od / "last_model.pt")
474
-
475
- fig, ax = plt.subplots()
476
- ax.plot(history["train"], label="train")
477
- ax.plot(history["val"], label="val")
478
- ax.set_xlabel("epoch")
479
- ax.set_ylabel("combined loss")
480
- ax.legend()
481
- fig.savefig(od / "loss_curve.png")
482
- print(f"✅ Done → outputs in {od}", flush=True)
483
-
484
-
485
- if __name__ == "__main__":
486
- main()
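
Editor's note: the trainer removed above splits examples by whole DNA clusters rather than at random, so near-duplicate sequences cannot leak between train, validation, and test. A minimal, self-contained sketch of that greedy packing idea on toy data (illustrative only; the function name and cluster ids are made up and are not part of this commit):

import random

def greedy_cluster_split(cluster_to_indices, val_frac=0.2, test_frac=0.2, seed=0):
    # Shuffle clusters, then send each *whole* cluster to val or test until the
    # example-count targets are met; everything left over goes to train.
    rng = random.Random(seed)
    clusters = list(cluster_to_indices.items())
    rng.shuffle(clusters)
    total = sum(len(ix) for _, ix in clusters)
    target_val, target_test = round(total * val_frac), round(total * test_frac)
    train, val, test = [], [], []
    for _, ix in clusters:
        if len(val) + len(ix) <= target_val:
            val.extend(ix)
        elif len(test) + len(ix) <= target_test:
            test.extend(ix)
        else:
            train.extend(ix)
    return train, val, test

# Three toy clusters of related DNA examples: sizes 2, 2, and 6.
splits = greedy_cluster_split({"c1": [0, 1], "c2": [2, 3], "c3": [4, 5, 6, 7, 8, 9]})
print([len(s) for s in splits])  # [6, 2, 2] -- each cluster lands entirely in one split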
dpacman/classifier/torch_model.py DELETED
@@ -1,157 +0,0 @@
1
- import torch
2
- from torch import nn
3
-
4
-
5
- class LocalCNN(nn.Module):
6
- def __init__(self, dim: int = 256, kernel_size: int = 3):
7
- super().__init__()
8
- padding = kernel_size // 2
9
- self.conv = nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=padding)
10
- self.act = nn.GELU()
11
- self.ln = nn.LayerNorm(dim)
12
-
13
- def forward(self, x: torch.Tensor):
14
- # x: (batch, L, dim)
15
- out = self.conv(x.transpose(1, 2)) # → (batch, dim, L)
16
- out = self.act(out)
17
- out = out.transpose(1, 2) # → (batch, L, dim)
18
- return self.ln(out + x) # residual
19
-
20
-
21
- class CrossModalBlock(nn.Module):
22
- def __init__(self, dim: int = 256, heads: int = 8):
23
- super().__init__()
24
- # self-attention for both sides
25
- self.sa_binder = nn.MultiheadAttention(dim, heads, batch_first=True)
26
- self.sa_glm = nn.MultiheadAttention(dim, heads, batch_first=True)
27
- self.ln_b1 = nn.LayerNorm(dim)
28
- self.ln_g1 = nn.LayerNorm(dim)
29
-
30
- self.ffn_b = nn.Sequential(
31
- nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
32
- )
33
- self.ffn_g = nn.Sequential(
34
- nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
35
- )
36
- self.ln_b2 = nn.LayerNorm(dim)
37
- self.ln_g2 = nn.LayerNorm(dim)
38
-
39
- # cross attention (binder queries, glm keys/values)
40
- self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
41
- self.ln_c1 = nn.LayerNorm(dim)
42
- self.ffn_c = nn.Sequential(
43
- nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
44
- )
45
- self.ln_c2 = nn.LayerNorm(dim)
46
-
47
- def forward(self, binder: torch.Tensor, glm: torch.Tensor):
48
- """
49
- binder: (batch, Lb, dim)
50
- glm: (batch, Lg, dim) -- has passed through its local CNN beforehand
51
- returns: updated binder representation (batch, Lb, dim)
52
- """
53
- # binder self-attn + ffn
54
- b = binder
55
- b_sa, _ = self.sa_binder(b, b, b)
56
- b = self.ln_b1(b + b_sa)
57
- b_ff = self.ffn_b(b)
58
- b = self.ln_b2(b + b_ff)
59
-
60
- # glm self-attn + ffn
61
- g = glm
62
- g_sa, _ = self.sa_glm(g, g, g)
63
- g = self.ln_g1(g + g_sa)
64
- g_ff = self.ffn_g(g)
65
- g = self.ln_g2(g + g_ff)
66
-
67
- # cross-attention: binder queries glm
68
- c_sa, _ = self.cross_attn(b, g, g)
69
- c = self.ln_c1(b + c_sa)
70
- c_ff = self.ffn_c(c)
71
- c = self.ln_c2(c + c_ff)
72
- return c # (batch, Lb, dim)
73
-
74
-
75
- class DimCompressor(nn.Module):
76
- """
77
- Learnable per-token compressor: maps any in_dim >= out_dim to out_dim (default 256).
78
- If in_dim == out_dim, behaves as identity.
79
- """
80
-
81
- def __init__(self, in_dim: int, out_dim: int = 256):
82
- super().__init__()
83
- if in_dim == out_dim:
84
- self.net = nn.Identity()
85
- else:
86
- hidden = max(out_dim * 2, (in_dim + out_dim) // 2)
87
- self.net = nn.Sequential(
88
- nn.LayerNorm(in_dim),
89
- nn.Linear(in_dim, hidden),
90
- nn.GELU(),
91
- nn.Linear(hidden, out_dim),
92
- )
93
-
94
- def forward(self, x: torch.Tensor) -> torch.Tensor:
95
- # x: (B, L, in_dim)
96
- return self.net(x)
97
-
98
-
99
- class BindPredictor(nn.Module):
100
- def __init__(
101
- self,
102
- # input_dim: int = 256, # OLD: single input dim
103
- binder_input_dim: int = 1280, # NEW: TF (binder) original dim (e.g., 1280)
104
- glm_input_dim: int = 256, # NEW: DNA/GLM original dim (e.g., 256)
105
- compressed_dim: int = 256, # NEW: learnable compressed dim
106
- hidden_dim: int = 256,
107
- heads: int = 8,
108
- num_layers: int = 4,
109
- use_local_cnn_on_glm: bool = True,
110
- ):
111
- super().__init__()
112
- # OLD:
113
- # self.proj_binder = nn.Linear(input_dim, hidden_dim)
114
- # self.proj_glm = nn.Linear(input_dim, hidden_dim)
115
-
116
- # NEW: learnable compressor for binder → 256, then project to hidden
117
- self.binder_compress = DimCompressor(binder_input_dim, out_dim=compressed_dim)
118
- self.proj_binder = nn.Linear(compressed_dim, hidden_dim)
119
-
120
- # GLM side stays 256 → hidden
121
- self.proj_glm = nn.Linear(glm_input_dim, hidden_dim)
122
-
123
- self.use_local_cnn = use_local_cnn_on_glm
124
- self.local_cnn = LocalCNN(hidden_dim) if use_local_cnn_on_glm else nn.Identity()
125
-
126
- self.layers = nn.ModuleList(
127
- [CrossModalBlock(hidden_dim, heads) for _ in range(num_layers)]
128
- )
129
-
130
- self.ln_out = nn.LayerNorm(hidden_dim)
131
- # self.head = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid()) # OLD: returned probabilities
132
- self.head = nn.Linear(hidden_dim, 1) # NEW: return logits (safe for AMP)
133
-
134
- def forward(self, binder_emb, glm_emb):
135
- """
136
- binder_emb: (B, Lb, binder_input_dim)
137
- glm_emb: (B, Lg, glm_input_dim)
138
- Returns per-nucleotide logits for the GLM sequence: (B, Lg)
139
- """
140
- # Binder: learnable compression → 256 → hidden
141
- b = self.binder_compress(binder_emb) # (B, Lb, 256)
142
- b = self.proj_binder(b) # (B, Lb, hidden_dim)
143
-
144
- # GLM: project → hidden, add local CNN context
145
- g = self.proj_glm(glm_emb) # (B, Lg, hidden_dim)
146
- if self.use_local_cnn:
147
- g = self.local_cnn(g)
148
-
149
- # Cross-modal blocks: update binder states using GLM
150
- for layer in self.layers:
151
- b = layer(b, g) # (B, Lb, hidden_dim)
152
-
153
- # Predict per-nucleotide logits on the GLM tokens:
154
- # return self.head(g).squeeze(-1) # OLD: probabilities (with Sigmoid in head)
155
- return self.head(g).squeeze(
156
- -1
157
- ) # NEW: logits (apply sigmoid only in loss/metrics)
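
Editor's note: for orientation, the BindPredictor removed here consumed a per-residue TF embedding (width binder_input_dim, default 1280) and a per-nucleotide DNA/GLM embedding (width glm_input_dim, default 256) and returned one logit per nucleotide. A minimal shape check, assuming the class as defined above is in scope; the batch size, sequence lengths, and layer count below are arbitrary choices, not values from the commit:

import torch

model = BindPredictor(binder_input_dim=1280, glm_input_dim=256,
                      compressed_dim=256, hidden_dim=256, heads=8, num_layers=2)
binder = torch.randn(2, 35, 1280)   # (batch, TF length, binder width)
dna    = torch.randn(2, 500, 256)   # (batch, DNA length, GLM width)
with torch.no_grad():
    logits = model(binder, dna)
print(logits.shape)                  # torch.Size([2, 500]) -- one logit per nucleotide
probs = torch.sigmoid(logits)        # head returns logits; sigmoid belongs in loss/metrics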
dpacman/classifier/train.py DELETED
@@ -1,220 +0,0 @@
1
- #!/usr/bin/env python3
2
- import argparse
3
- import numpy as np
4
- import torch
5
- from torch import nn
6
- from model import BindPredictor
7
- from pathlib import Path
8
- from collections import Counter
9
- from sklearn.metrics import roc_auc_score, average_precision_score
10
- from sklearn.decomposition import TruncatedSVD
11
- import sys
12
-
13
- from dpacman.utils.models import set_seed
14
-
15
-
16
- def build_tf_compressed_cache(binder_paths, target_dim=256):
17
- """
18
- Load all unique TF (binder) embeddings, fit reduction if needed, and return dict mapping path->(L, target_dim) array.
19
- """
20
- unique_paths = sorted(set(binder_paths))
21
- print(
22
- f"[i] Found {len(unique_paths)} unique TF embedding files to compress.",
23
- flush=True,
24
- )
25
- # Load all embeddings to determine dimensionality
26
- samples = []
27
- for p in unique_paths:
28
- arr = np.load(p)
29
- samples.append(arr)
30
- # Determine if reduction needed: assume all have same embedding width
31
- first = samples[0]
32
- orig_dim = first.shape[1] if first.ndim == 2 else 1
33
- reduction_needed = orig_dim != target_dim
34
- tf_cache = {}
35
-
36
- if reduction_needed:
37
- # Build matrix to fit SVD: we need a 2D matrix per embedding; if lengths vary we can't directly stack.
38
- # We'll do reduction per sequence individually using TruncatedSVD on concatenated flattened features:
39
- # Simplest: for variable lengths, reduce each embedding separately with a learned linear projection.
40
- # Here we fit a single TruncatedSVD on the concatenation of all sequence tokens (flattened) by padding/truncating to a fixed length.
41
- # To avoid complexity, use PCA-like linear projection learned via SVD on mean-pooled vectors:
42
- pooled = []
43
- for arr in samples:
44
- if arr.ndim == 2:
45
- pooled.append(arr.mean(axis=0)) # (orig_dim,)
46
- else:
47
- pooled.append(arr) # degenerate
48
- pooled_mat = np.stack(pooled, axis=0) # (N, orig_dim)
49
- print(
50
- f"[i] Fitting TruncatedSVD on TF pooled embeddings: {pooled_mat.shape} -> {target_dim}",
51
- flush=True,
52
- )
53
- svd = TruncatedSVD(n_components=target_dim, random_state=42)
54
- reduced_pooled = svd.fit_transform(pooled_mat) # (N, target_dim)
55
-
56
- # For each original embedding, project token-level vectors by multiplying token vector with svd.components_.T
57
- # svd.components_: (target_dim, orig_dim) so projection matrix is (orig_dim, target_dim)
58
- proj_mat = svd.components_.T # (orig_dim, target_dim)
59
- for i, p in enumerate(unique_paths):
60
- arr = samples[i] # shape (L, orig_dim)
61
- if arr.ndim == 1:
62
- arr2 = arr @ proj_mat # (target_dim,)
63
- else:
64
- # project each token: (L, orig_dim) @ (orig_dim, target_dim) -> (L, target_dim)
65
- arr2 = arr @ proj_mat
66
- tf_cache[p] = arr2 # reduced per-token representation
67
- print("[i] Completed compression of TF embeddings.", flush=True)
68
- else:
69
- # already correct dim: just cache originals
70
- print(
71
- f"[i] TF embeddings already {target_dim}-dimensional; skipping reduction.",
72
- flush=True,
73
- )
74
- for i, p in enumerate(unique_paths):
75
- arr = samples[i]
76
- tf_cache[p] = arr
77
- return tf_cache
78
-
79
-
80
- def evaluate(model, dl, device):
81
- model.eval()
82
- all_labels = []
83
- all_preds = []
84
- with torch.no_grad():
85
- for b, g, y in dl:
86
- b = b.to(device)
87
- g = g.to(device)
88
- y = y.to(device)
89
- pred = model(b, g)
90
- all_labels.append(y.cpu())
91
- all_preds.append(pred.cpu())
92
- if not all_labels:
93
- return 0.0, 0.0
94
- y_true = torch.cat(all_labels).numpy()
95
- y_score = torch.cat(all_preds).numpy()
96
- try:
97
- auc = roc_auc_score(y_true, y_score)
98
- except Exception:
99
- auc = 0.0
100
- try:
101
- ap = average_precision_score(y_true, y_score)
102
- except Exception:
103
- ap = 0.0
104
- return auc, ap
105
-
106
-
107
- def unpack(data):
108
- binders, glms, labels = zip(*data)
109
- return list(binders), list(glms), list(labels)
110
-
111
-
112
- # ---- main ------------------------------------------------------------
113
- def main(cfg):
114
- """
115
- Main method, used to train the model.
116
- """
117
- # Set seed for reproducibility
118
- set_seed(cfg.seed)
119
-
120
- parser.add_argument("--out_dir", type=str, required=True)
121
- parser.add_argument("--epochs", type=int, default=10)
122
- parser.add_argument("--batch_size", type=int, default=32)
123
- parser.add_argument("--lr", type=float, default=1e-4)
124
- parser.add_argument("--device", type=str, default="cuda")
125
- parser.add_argument("--seed", type=int, default=42)
126
- args = parser.parse_args()
127
-
128
- #
129
- print("DEBUG: starting training script with in-line TF compression", flush=True)
130
- device = torch.device(args.device if torch.cuda.is_available() else "cpu")
131
- binder_paths, glm_paths, labels = parse_pair_list(cfg.pair_list)
132
-
133
- if len(labels) == 0:
134
- print("[ERROR] No valid pairs parsed. Exiting.", file=sys.stderr)
135
- sys.exit(1)
136
-
137
- label_counts = Counter(labels)
138
- print(
139
- f"[i] Total examples parsed: {len(labels)}. Label distribution: {label_counts}",
140
- flush=True,
141
- )
142
-
143
- # build compressed TF cache (reduces to 256 if needed)
144
- tf_compressed_cache = build_tf_compressed_cache(binder_paths, target_dim=256)
145
-
146
- # load training data
147
-
148
- train_ds = PairDataset(None, tf_compressed_cache=tf_compressed_cache)
149
- val_ds = PairDataset(*subset(val_i), tf_compressed_cache=tf_compressed_cache)
150
- test_ds = PairDataset(*subset(test_i), tf_compressed_cache=tf_compressed_cache)
151
-
152
- print(
153
- f"[i] Train/Val/Test sizes: {len(train_ds)}/{len(val_ds)}/{len(test_ds)}",
154
- flush=True,
155
- )
156
- if len(train_ds) == 0 or len(val_ds) == 0:
157
- print(
158
- "[ERROR] Train or validation split is empty; cannot proceed.",
159
- file=sys.stderr,
160
- )
161
- sys.exit(1)
162
-
163
- train_dl = DataLoader(
164
- train_ds, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn
165
- )
166
- val_dl = DataLoader(
167
- val_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn
168
- )
169
- test_dl = DataLoader(
170
- test_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn
171
- )
172
-
173
- model = BindPredictor(
174
- input_dim=256, hidden_dim=256, heads=8, num_layers=3, use_local_cnn_on_glm=True
175
- )
176
- model = model.to(device)
177
- optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=1e-3)
178
- loss_fn = nn.BCELoss()
179
-
180
- best_val = -float("inf")
181
- os_out = Path(args.out_dir)
182
- os_out.mkdir(exist_ok=True, parents=True)
183
-
184
- for epoch in range(1, args.epochs + 1):
185
- print(f"[Epoch {epoch}] starting...", flush=True)
186
- model.train()
187
- running_loss = 0.0
188
- for b, g, y in train_dl:
189
- b = b.to(device)
190
- g = g.to(device)
191
- y = y.to(device)
192
- pred = model(b, g)
193
- loss = loss_fn(pred, y)
194
- optimizer.zero_grad()
195
- loss.backward()
196
- optimizer.step()
197
- running_loss += loss.item() * b.size(0)
198
- train_loss = running_loss / len(train_ds)
199
- val_auc, val_ap = evaluate(model, val_dl, device)
200
- print(
201
- f"[Epoch {epoch}] train_loss={train_loss:.4f} val_auc={val_auc:.4f} val_ap={val_ap:.4f}",
202
- flush=True,
203
- )
204
-
205
- if val_auc > best_val:
206
- best_val = val_auc
207
- torch.save(model.state_dict(), os_out / "best_model.pt")
208
- print(
209
- f"[Epoch {epoch}] Saved new best model with val_auc={val_auc:.4f}",
210
- flush=True,
211
- )
212
-
213
- torch.save(model.state_dict(), os_out / "last_model.pt")
214
- test_auc, test_ap = evaluate(model, test_dl, device)
215
- print(f"FINAL TEST: AUC={test_auc:.4f} AP={test_ap:.4f}", flush=True)
216
- print(f"[i] Models written to {os_out}/best_model.pt and last_model.pt", flush=True)
217
-
218
-
219
- if __name__ == "__main__":
220
- main()
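
Editor's note: the deleted script above compresses TF embeddings of arbitrary length by fitting TruncatedSVD on mean-pooled vectors (one row per TF) and then reusing the fitted components to project every token. A minimal sketch of that trick on random data (illustrative only; the real pipeline reduced 1280 to 256 dimensions, the smaller sizes here just keep the toy example fast):

import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.default_rng(0)
# Forty fake TF embeddings with different lengths but a shared 64-dim width.
embeddings = [rng.standard_normal((int(rng.integers(20, 60)), 64)) for _ in range(40)]

# 1) Fit SVD on the mean-pooled (N, width) matrix, one row per TF.
pooled = np.stack([e.mean(axis=0) for e in embeddings])    # (40, 64)
svd = TruncatedSVD(n_components=16, random_state=42).fit(pooled)

# 2) Reuse the fitted components to project each token: (L, 64) @ (64, 16) -> (L, 16).
proj = svd.components_.T
compressed = [e @ proj for e in embeddings]
print(compressed[0].shape)                                  # (L, 16)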
dpacman/scripts/delay_run.sh CHANGED
@@ -4,7 +4,7 @@ set -euo pipefail
4
  # Usage: ./stagger.sh <first_script.sh> <second_script.sh>
5
  # Optional: override waits via env vars WAIT1 / WAIT2 (seconds). Defaults: 3 hours each.
6
 
7
- WAIT1=${WAIT1:-10800} # 3 hours in seconds
8
  WAIT2=${WAIT2:-10800}
9
 
10
  SCRIPT1="${1:?usage: $0 <first_script.sh> <second_script.sh>}"
 
4
  # Usage: ./stagger.sh <first_script.sh> <second_script.sh>
5
  # Optional: override waits via env vars WAIT1 / WAIT2 (seconds). Defaults: 3 hours each.
6
 
7
+ WAIT1=${WAIT1:-3600} # 1 hour in seconds
8
  WAIT2=${WAIT2:-10800}
9
 
10
  SCRIPT1="${1:?usage: $0 <first_script.sh> <second_script.sh>}"
dpacman/scripts/run_train.sh CHANGED
@@ -22,7 +22,7 @@ CUDA_VISIBLE_DEVICES=0,1 nohup python -u -m scripts.train \
22
  +trainer.gradient_clip_algorithm="norm" \
23
  hydra.run.dir="${run_dir}" \
24
  trainer.devices=2 \
25
- trainer.max_epochs=10 \
26
  data_module.train_file="data_files/processed/splits/by_dna/train.csv" \
27
  data_module.val_file="data_files/processed/splits/by_dna/val.csv" \
28
  data_module.test_file="data_files/processed/splits/by_dna/test.csv" \
@@ -31,8 +31,8 @@ CUDA_VISIBLE_DEVICES=0,1 nohup python -u -m scripts.train \
31
  data_module.batch_size=16 \
32
  model.glm_input_dim=256 \
33
  model.compressed_dim=256 \
34
- model.hidden_dim=256 \
35
- model.lr=5e-6 \
36
  > "${run_dir}/run.log" 2>&1 &
37
 
38
  echo $! > "${run_dir}/pid.txt"
 
22
  +trainer.gradient_clip_algorithm="norm" \
23
  hydra.run.dir="${run_dir}" \
24
  trainer.devices=2 \
25
+ trainer.max_epochs=20 \
26
  data_module.train_file="data_files/processed/splits/by_dna/train.csv" \
27
  data_module.val_file="data_files/processed/splits/by_dna/val.csv" \
28
  data_module.test_file="data_files/processed/splits/by_dna/test.csv" \
 
31
  data_module.batch_size=16 \
32
  model.glm_input_dim=256 \
33
  model.compressed_dim=256 \
34
+ model.hidden_dim=128 \
35
+ model.lr=1e-5 \
36
  > "${run_dir}/run.log" 2>&1 &
37
 
38
  echo $! > "${run_dir}/pid.txt"
dpacman/scripts/run_train_baseline.sh ADDED
@@ -0,0 +1,39 @@
1
+ #!/bin/bash
2
+
3
+ # Manually specify values used in the config
4
+ main_task="train"
5
+ model_type="baseline"
6
+ timestamp=$(date "+%Y-%m-%d_%H-%M-%S")
7
+
8
+ run_dir="$HOME/DPACMAN/logs/${main_task}/${model_type}/runs/${timestamp}"
9
+ mkdir -p "$run_dir"
10
+
11
+ if [ -z "$WANDB_API_KEY" ]; then
12
+ read -s -p "Enter your WANDB API key: " wandb_key
13
+ echo
14
+ export WANDB_API_KEY="$wandb_key"
15
+ fi
16
+
17
+ CUDA_VISIBLE_DEVICES=0,1 nohup python -u -m scripts.train \
18
+ +trainer.strategy=ddp \
19
+ +trainer.use_distributed_sampler="false" \
20
+ +trainer.detect_anomaly="false" \
21
+ +trainer.gradient_clip_val=0.5 \
22
+ +trainer.gradient_clip_algorithm="norm" \
23
+ hydra.run.dir="${run_dir}" \
24
+ trainer.devices=2 \
25
+ trainer.max_epochs=10 \
26
+ data_module.train_file="data_files/processed/splits/by_dna/train.csv" \
27
+ data_module.val_file="data_files/processed/splits/by_dna/val.csv" \
28
+ data_module.test_file="data_files/processed/splits/by_dna/test.csv" \
29
+ data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
30
+ data_module.dna_shelf_path="data_files/processed/embeddings/fimo_hits_only/peaks_caduceus.shelf" \
31
+ data_module.batch_size=16 \
32
+ model=baseline \
33
+ model.glm_input_dim=256 \
34
+ model.compressed_dim=256 \
35
+ model.hidden_dim=128 \
36
+ model.lr=1e-5 \
37
+ > "${run_dir}/run.log" 2>&1 &
38
+
39
+ echo $! > "${run_dir}/pid.txt"