svincoff committed on
Commit 41553a7 · 1 Parent(s): e42f54a

bug fixes

.gitignore CHANGED
@@ -34,4 +34,5 @@ dpacman/combine_shards.py
  dpacman/combine.log
  dpacman/loss_sim.py
  dpacman/loss_temp.py
- dpacman/peak_examples/
+ dpacman/peak_examples/
+ dpacman/__pycache__/
configs/model/classifier.yaml CHANGED
@@ -6,4 +6,5 @@ gamma: 20
  weight_decay: 0.01

  glm_input_dim: 1029
- compressed_dim: 1029
+ compressed_dim: 1029
+ hidden_dim: 256
dpacman.egg-info/SOURCES.txt CHANGED
@@ -1,6 +1,44 @@
  README.md
  setup.py
+ dpacman/__init__.py
+ dpacman/combine_shards.py
+ dpacman/loss_sim.py
+ dpacman/loss_temp.py
+ dpacman/temp.py
+ dpacman/temp2.py
  dpacman.egg-info/PKG-INFO
  dpacman.egg-info/SOURCES.txt
  dpacman.egg-info/dependency_links.txt
- dpacman.egg-info/top_level.txt
+ dpacman.egg-info/top_level.txt
+ dpacman/classifier/__init__.py
+ dpacman/classifier/loss.py
+ dpacman/classifier/model.py
+ dpacman/classifier/old_train.py
+ dpacman/classifier/torch_model.py
+ dpacman/classifier/train.py
+ dpacman/classifier/model_tmp/__init__.py
+ dpacman/classifier/model_tmp/clustering_data.py
+ dpacman/classifier/model_tmp/compress_embeddings.py
+ dpacman/classifier/model_tmp/compute_embeddings.py
+ dpacman/classifier/model_tmp/extract_tf_symbols.py
+ dpacman/classifier/model_tmp/make_pair_list.py
+ dpacman/classifier/model_tmp/make_peak_fasta.py
+ dpacman/classifier/model_tmp/model.py
+ dpacman/classifier/model_tmp/prep_splits.py
+ dpacman/classifier/model_tmp/train.py
+ dpacman/data_modules/__init__.py
+ dpacman/data_modules/pair.py
+ dpacman/scripts/__init__.py
+ dpacman/scripts/eval.py
+ dpacman/scripts/preprocess.py
+ dpacman/scripts/train.py
+ dpacman/utils/__init__.py
+ dpacman/utils/clustering.py
+ dpacman/utils/instantiators.py
+ dpacman/utils/logging_utils.py
+ dpacman/utils/models.py
+ dpacman/utils/plotting_utils.py
+ dpacman/utils/pylogger.py
+ dpacman/utils/rich_utils.py
+ dpacman/utils/splitting.py
+ dpacman/utils/utils.py
dpacman.egg-info/top_level.txt CHANGED
@@ -1 +1 @@
-
+ dpacman
dpacman/classifier/model.py CHANGED
@@ -28,59 +28,93 @@ class LocalCNN(nn.Module):


  class CrossModalBlock(nn.Module):
-     def __init__(self, dim: int = 256, heads: int = 8):
+     def __init__(self, dim: int = 256, heads: int = 8, dropout: float = 0.0):
          super().__init__()
          # self-attention for both sides
          self.sa_binder = nn.MultiheadAttention(dim, heads, batch_first=True)
          self.sa_glm = nn.MultiheadAttention(dim, heads, batch_first=True)
+         # first layer norms
          self.ln_b1 = nn.LayerNorm(dim)
          self.ln_g1 = nn.LayerNorm(dim)
-
-         self.ffn_b = nn.Sequential(
+         # first feed forward networks
+         self.ffn_b1 = nn.Sequential(
              nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
          )
-         self.ffn_g = nn.Sequential(
+         self.ffn_g1 = nn.Sequential(
              nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
          )
          self.ln_b2 = nn.LayerNorm(dim)
          self.ln_g2 = nn.LayerNorm(dim)
+
+         # 2) reciprocal cross-attn: g<-b and b<-g
+         # DNA/GLM updated by attending to Binder
+         self.cross_g2b_1_RCA = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
+         self.ln_g3_RCA = nn.LayerNorm(dim)
+         self.ffn_g2_RCA = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
+         self.ln_g4_RCA = nn.LayerNorm(dim)
+
+         # Binder updated by attending to DNA/GLM
+         self.cross_b2g_1_RCA = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
+         self.ln_b3_RCA = nn.LayerNorm(dim)
+         self.ffn_b2_RCA = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
+         self.ln_b4_RCA = nn.LayerNorm(dim)

          # cross attention (binder queries, glm keys/values)
          # so the DNA path is updated by the transcription factors
-         self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
-         self.ln_c1 = nn.LayerNorm(dim)
-         self.ffn_c = nn.Sequential(
+         self.cross_g2b_2 = nn.MultiheadAttention(dim, heads, batch_first=True)
+         self.ln_g5 = nn.LayerNorm(dim)
+         self.ffn_g3 = nn.Sequential(
              nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
          )
-         self.ln_c2 = nn.LayerNorm(dim)
+         self.ln_g6 = nn.LayerNorm(dim)

-     def forward(self, binder: torch.Tensor, glm: torch.Tensor):
+     def forward(self, binder: torch.Tensor, glm: torch.Tensor, binder_kpm_mask=None, glm_kpm_mask=None):
          """
          binder: (batch, Lb, dim)
          glm: (batch, Lg, dim) -- has passed through its local CNN beforehand
          returns: updated binder representation (batch, Lb, dim)
          """
+         # 1) Self-attention and feed-forward networks for binder and DNA
          # binder: self-attn + ffn
          b = binder
-         b_sa, _ = self.sa_binder(b, b, b)
+         b_sa, _ = self.sa_binder(b, b, b, key_padding_mask=binder_kpm_mask)
          b = self.ln_b1(b + b_sa)
-         b_ff = self.ffn_b(b)
+         b_ff = self.ffn_b1(b)
          b = self.ln_b2(b + b_ff)

          # glm: self-attn + ffn
          g = glm
-         g_sa, _ = self.sa_glm(g, g, g)
+         g_sa, _ = self.sa_glm(g, g, g, key_padding_mask=glm_kpm_mask)
          g = self.ln_g1(g + g_sa)
-         g_ff = self.ffn_g(g)
+         g_ff = self.ffn_g1(g)
          g = self.ln_g2(g + g_ff)
+
+         # 2a) Reciprocal Cross-Attention:
+         # DNA updated by attending to Binder (Q=g, K=b, V=b)
+         # Binder updated by attending to DNA (Q=b, K=g, V=g)
+         g_ca, _ = self.cross_g2b_1_RCA(
+             g, b, b, key_padding_mask=binder_kpm_mask
+             # torch MultiheadAttention expects key_padding_mask=True for PADs;
+             # invert if your mask is True=keep:
+             # key_padding_mask=(~binder_mask.bool()) if binder_mask is not None else None
+         )
+         g = self.ln_g3_RCA(g + g_ca)
+         g = self.ln_g4_RCA(g + self.ffn_g2_RCA(g))

+         # 2b) Binder updated by attending to DNA/GLM (Q=b, K=g, V=g)
+         b_ca, _ = self.cross_b2g_1_RCA(
+             b, g, g, key_padding_mask=glm_kpm_mask
+             # key_padding_mask=(~glm_mask.bool()) if glm_mask is not None else None
+         )
+         b = self.ln_b3_RCA(b + b_ca)
+         b = self.ln_b4_RCA(b + self.ffn_b2_RCA(b))

-         # cross-attention: glm queries binder and glm embeddings are updated
-         g_to_b_ca, _ = self.cross_attn(g, b, b)
-         g = self.ln_c1(g + g_to_b_ca)
-         g_ff = self.ffn_c(g)
-         g = self.ln_c2(g + g_ff)
-         return g  # (batch, Lb, dim)
+         # cross-attention: glm queries binder and glm embeddings are updated
+         g_to_b_ca, _ = self.cross_g2b_2(g, b, b, key_padding_mask=binder_kpm_mask)
+         g = self.ln_g5(g + g_to_b_ca)
+         g_ff = self.ffn_g3(g)
+         g = self.ln_g6(g + g_ff)
+         return b, g  # (batch, Lb, dim)

-
  class DimCompressor(nn.Module):
      """
@@ -144,7 +178,7 @@ class BindPredictor(LightningModule):
          # self.head = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid())  # OLD: returned probabilities
          self.head = nn.Linear(hidden_dim, 1)  # NEW: return logits (safe for AMP)

-     def forward(self, binder_emb, glm_emb):
+     def forward(self, binder_emb, glm_emb, binder_mask, glm_mask):
          """
          binder_emb: (B, Lb, binder_input_dim)
          glm_emb: (B, Lg, glm_input_dim)
@@ -161,14 +195,15 @@ class BindPredictor(LightningModule):

          # Cross-modal blocks: update binder states using GLM
          for layer in self.layers:
-             g = layer(b, g)  # (B, Lb, hidden_dim)
+             b, g = layer(b, g, binder_mask, glm_mask)  # (B, Lb, hidden_dim)

          # Predict per-nucleotide logits on the GLM tokens:
          # return self.head(g).squeeze(-1)  # OLD: probabilities (with Sigmoid in head)
-         return self.head(g).squeeze(
+         logits = self.head(g).squeeze(
              -1
-         )  # NEW: logits (apply sigmoid only in loss/metrics)
-
+         )
+         return logits
+
      # ----- Lightning hooks -----
      def training_step(self, batch, batch_idx):
          """
@@ -184,7 +219,7 @@ class BindPredictor(LightningModule):
              "dna_sequence"
          }
          """
-         logits = self.forward(batch["binder_emb"], batch["glm_emb"])
+         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
          loss = calculate_loss(
              logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
          )
@@ -199,7 +234,7 @@ class BindPredictor(LightningModule):
          return loss

      def validation_step(self, batch, batch_idx):
-         logits = self.forward(batch["binder_emb"], batch["glm_emb"])
+         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
          loss = calculate_loss(
              logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
          )
@@ -214,7 +249,7 @@ class BindPredictor(LightningModule):
          return loss

      def test_step(self, batch, batch_idx):
-         logits = self.forward(batch["binder_emb"], batch["glm_emb"])
+         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
          loss = calculate_loss(
              logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
          )
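
Note on the masks threaded through CrossModalBlock above: nn.MultiheadAttention treats key_padding_mask=True as padding to ignore, which is why the collator change further down builds binder_kpm/glm_kpm by inverting the True=keep masks. A minimal sanity check of that convention is sketched here; the tensor sizes and variable names are illustrative only, not part of the repo.

import torch
from torch import nn

# Toy check: True entries in key_padding_mask are ignored by attention.
mha = nn.MultiheadAttention(embed_dim=16, num_heads=4, batch_first=True)
q = torch.randn(2, 5, 16)                       # queries      (B, Lq, D)
kv = torch.randn(2, 7, 16)                      # keys/values  (B, Lk, D)
keep = torch.ones(2, 7, dtype=torch.bool)       # True = keep (the collator's mask convention)
keep[:, 5:] = False                             # last two key positions are padding
out, attn = mha(q, kv, kv, key_padding_mask=~keep)  # invert so that True = PAD
print(out.shape)                                # torch.Size([2, 5, 16])
print(attn[..., 5:].sum().item())               # 0.0 -- padded keys receive no attention weight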
dpacman/classifier/model_w_rca.py ADDED
@@ -0,0 +1,323 @@
+ """
+ Lightning Module for the binding model.
+ """
+
+ import torch
+ from torch import nn
+ from lightning import LightningModule
+ from dpacman.utils.models import set_seed
+ from .loss import calculate_loss
+
+ set_seed()
+
+
+ class LocalCNN(nn.Module):
+     def __init__(self, dim: int = 256, kernel_size: int = 3):
+         super().__init__()
+         padding = kernel_size // 2
+         self.conv = nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=padding)
+         self.act = nn.GELU()
+         self.ln = nn.LayerNorm(dim)
+
+     def forward(self, x: torch.Tensor):
+         # x: (batch, L, dim)
+         out = self.conv(x.transpose(1, 2))  # → (batch, dim, L)
+         out = self.act(out)
+         out = out.transpose(1, 2)  # → (batch, L, dim)
+         return self.ln(out + x)  # residual
+
+
+ # class CrossModalBlock(nn.Module):
+ #     def __init__(self, dim: int = 256, heads: int = 8):
+ #         super().__init__()
+ #         # self-attention for both sides
+ #         self.sa_binder = nn.MultiheadAttention(dim, heads, batch_first=True)
+ #         self.sa_glm = nn.MultiheadAttention(dim, heads, batch_first=True)
+ #         self.ln_b1 = nn.LayerNorm(dim)
+ #         self.ln_g1 = nn.LayerNorm(dim)
+
+ #         self.ffn_b = nn.Sequential(
+ #             nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
+ #         )
+ #         self.ffn_g = nn.Sequential(
+ #             nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
+ #         )
+ #         self.ln_b2 = nn.LayerNorm(dim)
+ #         self.ln_g2 = nn.LayerNorm(dim)
+
+ #         # cross attention (binder queries, glm keys/values)
+ #         # so the DNA path is updated by the transcription factors
+ #         self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+ #         self.ln_c1 = nn.LayerNorm(dim)
+ #         self.ffn_c = nn.Sequential(
+ #             nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
+ #         )
+ #         self.ln_c2 = nn.LayerNorm(dim)
+
+ #     def forward(self, binder: torch.Tensor, glm: torch.Tensor):
+ #         """
+ #         binder: (batch, Lb, dim)
+ #         glm: (batch, Lg, dim) -- has passed through its local CNN beforehand
+ #         returns: updated binder representation (batch, Lb, dim)
+ #         """
+ #         # binder: self-attn + ffn
+ #         b = binder
+ #         b_sa, _ = self.sa_binder(b, b, b)
+ #         b = self.ln_b1(b + b_sa)
+ #         b_ff = self.ffn_b(b)
+ #         b = self.ln_b2(b + b_ff)
+
+ #         # glm: self-attn + ffn
+ #         g = glm
+ #         g_sa, _ = self.sa_glm(g, g, g)
+ #         g = self.ln_g1(g + g_sa)
+ #         g_ff = self.ffn_g(g)
+ #         g = self.ln_g2(g + g_ff)
+
+ #         # cross-attention: glm queries binder and glm embeddings are updated
+ #         g_to_b_ca, _ = self.cross_attn(g, b, b)
+ #         g = self.ln_c1(g + g_to_b_ca)
+ #         g_ff = self.ffn_c(g)
+ #         g = self.ln_c2(g + g_ff)
+ #         return g  # (batch, Lb, dim)
+
+ class CrossModalBlock(nn.Module):
+     def __init__(self, dim: int = 256, heads: int = 8, dropout: float = 0.0):
+         super().__init__()
+         # 1) self-attn on each stream
+         self.sa_binder = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
+         self.sa_glm = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
+         self.ln_b1 = nn.LayerNorm(dim)
+         self.ln_g1 = nn.LayerNorm(dim)
+         self.ffn_b = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
+         self.ffn_g = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
+         self.ln_b2 = nn.LayerNorm(dim)
+         self.ln_g2 = nn.LayerNorm(dim)
+
+         # 2) reciprocal cross-attn: g<-b and b<-g
+         # DNA/GLM updated by attending to Binder
+         self.cross_g2b = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
+         self.ln_g_ca1 = nn.LayerNorm(dim)
+         self.ffn_g_ca = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
+         self.ln_g_ca2 = nn.LayerNorm(dim)
+
+         # Binder updated by attending to DNA/GLM
+         self.cross_b2g = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
+         self.ln_b_ca1 = nn.LayerNorm(dim)
+         self.ffn_b_ca = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
+         self.ln_b_ca2 = nn.LayerNorm(dim)
+
+     def forward(
+         self,
+         binder: torch.Tensor,  # (B, Lb, D)
+         glm: torch.Tensor,  # (B, Lg, D)
+         binder_mask: torch.Tensor | None = None,  # (B, Lb) True = keep
+         glm_mask: torch.Tensor | None = None,  # (B, Lg) True = keep
+     ):
+         # 1) self-attn+FFN on each stream
+         b, g = binder, glm
+
+         b_sa, _ = self.sa_binder(b, b, b, key_padding_mask=None)
+         b = self.ln_b1(b + b_sa)
+         b = self.ln_b2(b + self.ffn_b(b))
+
+         g_sa, _ = self.sa_glm(g, g, g, key_padding_mask=None)
+         g = self.ln_g1(g + g_sa)
+         g = self.ln_g2(g + self.ffn_g(g))
+
+         # 2a) DNA/GLM updated by attending to Binder (Q=g, K=b, V=b)
+         g_ca, _ = self.cross_g2b(
+             g, b, b,
+             # torch MultiheadAttention expects key_padding_mask=True for PADs;
+             # invert if your mask is True=keep:
+             # key_padding_mask=(~binder_mask.bool()) if binder_mask is not None else None
+         )
+         g = self.ln_g_ca1(g + g_ca)
+         g = self.ln_g_ca2(g + self.ffn_g_ca(g))
+
+         # 2b) Binder updated by attending to DNA/GLM (Q=b, K=g, V=g)
+         b_ca, _ = self.cross_b2g(
+             b, g, g,
+             # key_padding_mask=(~glm_mask.bool()) if glm_mask is not None else None
+         )
+         b = self.ln_b_ca1(b + b_ca)
+         b = self.ln_b_ca2(b + self.ffn_b_ca(b))
+
+         return b, g
+
+
+
+ class DimCompressor(nn.Module):
+     """
+     Learnable per-token compressor: maps any in_dim >= out_dim to out_dim (default 256).
+     If in_dim == out_dim, behaves as identity.
+     """
+
+     def __init__(self, in_dim: int, out_dim: int = 256):
+         super().__init__()
+         if in_dim == out_dim:
+             self.net = nn.Identity()
+         else:
+             hidden = max(out_dim * 2, (in_dim + out_dim) // 2)
+             self.net = nn.Sequential(
+                 nn.LayerNorm(in_dim),
+                 nn.Linear(in_dim, hidden),
+                 nn.GELU(),
+                 nn.Linear(hidden, out_dim),
+             )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # x: (B, L, in_dim)
+         return self.net(x)
+
+
+ class BindPredictor(LightningModule):
+     def __init__(
+         self,
+         # input_dim: int = 256,  # OLD: single input dim
+         binder_input_dim: int = 1280,  # NEW: TF (binder) original dim (e.g., 1280)
+         glm_input_dim: int = 256,  # NEW: DNA/GLM original dim (e.g., 256)
+         compressed_dim: int = 256,  # NEW: learnable compressed dim
+         hidden_dim: int = 256,
+         heads: int = 8,
+         num_layers: int = 4,
+         lr: float = 1e-4,
+         alpha: float = 20,
+         gamma: float = 20,
+         use_local_cnn_on_glm: bool = True,
+         weight_decay: float = 0.01,
+     ):
+         # Init
+         super(BindPredictor, self).__init__()
+         self.save_hyperparameters()
+
+         # Learnable compressor for binder -> 256, then project to hidden
+         self.binder_compress = DimCompressor(binder_input_dim, out_dim=compressed_dim)
+         self.proj_binder = nn.Linear(compressed_dim, hidden_dim)
+
+         # GLM side stays 256 -> hidden
+         self.proj_glm = nn.Linear(glm_input_dim, hidden_dim)
+
+         self.use_local_cnn = use_local_cnn_on_glm
+         self.local_cnn = LocalCNN(hidden_dim) if use_local_cnn_on_glm else nn.Identity()
+
+         self.layers = nn.ModuleList(
+             [CrossModalBlock(hidden_dim, heads) for _ in range(num_layers)]
+         )
+
+         self.ln_out = nn.LayerNorm(hidden_dim)
+         # self.head = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid())  # OLD: returned probabilities
+         self.head = nn.Linear(hidden_dim, 1)  # NEW: return logits (safe for AMP)
+
+     def forward(self, binder_emb, glm_emb):
+         """
+         binder_emb: (B, Lb, binder_input_dim)
+         glm_emb: (B, Lg, glm_input_dim)
+         Returns per-nucleotide logits for the GLM sequence: (B, Lg)
+         """
+         # Binder: learnable compression → 256 → hidden
+         b = self.binder_compress(binder_emb)  # (B, Lb, 256)
+         b = self.proj_binder(b)  # (B, Lb, hidden_dim)
+
+         # GLM: project → hidden, add local CNN context
+         g = self.proj_glm(glm_emb)  # (B, Lg, hidden_dim)
+         if self.use_local_cnn:
+             g = self.local_cnn(g)
+
+         # Cross-modal blocks: update binder states using GLM
+         for layer in self.layers:
+             b, g = layer(b, g)  # (B, Lb, hidden_dim)
+
+         # Predict per-nucleotide logits on the GLM tokens:
+         # return self.head(g).squeeze(-1)  # OLD: probabilities (with Sigmoid in head)
+         return self.head(g).squeeze(
+             -1
+         )  # NEW: logits (apply sigmoid only in loss/metrics)
+
+     # ----- Lightning hooks -----
+     def training_step(self, batch, batch_idx):
+         """
+         Training step taken by PyTorch-Lightning trainer. Uses batch returned by data collator.
+         Collator returns a dictionary with:
+             "binder_emb"  # [B, Lb_max, Db]
+             "binder_mask"  # [B, Lb_max]
+             "glm_emb"  # [B, Lg_max, Dg]
+             "glm_mask"  # [B, Lg_max]
+             "labels"  # [B, Lg_max]
+             "ID"
+             "tr_sequence"
+             "dna_sequence"
+         }
+         """
+         logits = self.forward(batch["binder_emb"], batch["glm_emb"])
+         loss = calculate_loss(
+             logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
+         )
+         self.log(
+             "train/loss",
+             loss,
+             on_step=True,
+             on_epoch=True,
+             prog_bar=True,
+             batch_size=logits.size(0),
+         )
+         return loss
+
+     def validation_step(self, batch, batch_idx):
+         logits = self.forward(batch["binder_emb"], batch["glm_emb"])
+         loss = calculate_loss(
+             logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
+         )
+         self.log(
+             "val/loss",
+             loss,
+             on_step=False,
+             on_epoch=True,
+             prog_bar=True,
+             batch_size=logits.size(0),
+         )
+         return loss
+
+     def test_step(self, batch, batch_idx):
+         logits = self.forward(batch["binder_emb"], batch["glm_emb"])
+         loss = calculate_loss(
+             logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
+         )
+         self.log(
+             "test/loss", loss, on_step=False, on_epoch=True, batch_size=logits.size(0)
+         )
+         return loss
+
+     def on_train_epoch_end(self):
+         if False:
+             if self.train_auc.compute() is not None:
+                 self.log("train/auroc", self.train_auc.compute(), prog_bar=True)
+             self.train_auc.reset()
+
+     def on_validation_epoch_end(self):
+         if False:
+             if self.val_auc.compute() is not None:
+                 self.log("val/auroc", self.val_auc.compute(), prog_bar=True)
+             self.val_auc.reset()
+
+     def on_test_epoch_end(self):
+         if False:
+             if self.test_auc.compute() is not None:
+                 self.log("test/auroc", self.test_auc.compute(), prog_bar=True)
+             self.test_auc.reset()
+
+     def configure_optimizers(self):
+         # AdamW + cosine as a sensible default
+         opt = torch.optim.AdamW(
+             self.parameters(),
+             lr=self.hparams.lr,
+             weight_decay=self.hparams.weight_decay,
+         )
+         # Scheduler optional—comment out if you prefer fixed LR
+         sch = torch.optim.lr_scheduler.CosineAnnealingLR(
+             opt, T_max=max(self.trainer.max_epochs, 1)
+         )
+         return {
+             "optimizer": opt,
+             "lr_scheduler": {"scheduler": sch, "interval": "epoch"},
+         }
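
For orientation, a quick shape check of the RCA variant added in this file, assuming the package is installed so that dpacman.classifier.model_w_rca is importable; the batch and sequence sizes below are made up.

import torch
from dpacman.classifier.model_w_rca import BindPredictor

# Hypothetical sizes; binder_input_dim / glm_input_dim follow the defaults above.
model = BindPredictor(binder_input_dim=1280, glm_input_dim=256,
                      compressed_dim=256, hidden_dim=256, heads=8, num_layers=2)
binder = torch.randn(2, 100, 1280)   # (B, Lb, Db) TF / binder embeddings
glm = torch.randn(2, 500, 256)       # (B, Lg, Dg) DNA / GLM embeddings
logits = model(binder, glm)          # per-nucleotide logits on the GLM tokens
print(logits.shape)                  # torch.Size([2, 500])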
dpacman/data_modules/pair.py CHANGED
@@ -2,7 +2,8 @@
  import argparse
  import numpy as np
  import torch
- from torch.utils.data import Dataset, DataLoader, Sampler
+ from torch.utils.data import Dataset, DataLoader, Sampler, BatchSampler
+ import torch.distributed as dist
  from lightning import LightningDataModule
  from pathlib import Path
  from multiprocessing import cpu_count
@@ -14,11 +15,66 @@ from typing import List, Iterable, Sequence
  import sys
  import rootutils
  import logging
+ import math
  from dpacman.utils import pylogger

  root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
  logger = pylogger.RankedLogger(__name__, rank_zero_only=True)

+ class PreBatchedDistributedBatchSampler(BatchSampler):
+     """
+     Accepts a precomputed list of batches (list[list[int]]) and shards them across DDP ranks.
+     - shuffle_batch_order: shuffle order of batches each epoch (deterministic via set_epoch)
+     - drop_last: drop remainder so each rank gets same #steps
+     """
+     def __init__(self, batches, shuffle_batch_order=False, drop_last=False, seed: int = 0):
+         # super expects attributes batch_size, drop_last, sampler – but we don't need them.
+         # We only need to subclass BatchSampler to satisfy Lightning's check.
+         self.batches = [list(b) for b in batches]
+         self.shuffle = shuffle_batch_order
+         self.drop_last = drop_last
+         self.seed = int(seed)
+         self.epoch = 0
+
+         if dist.is_available() and dist.is_initialized():
+             self.world_size = dist.get_world_size()
+             self.rank = dist.get_rank()
+         else:
+             self.world_size = 1
+             self.rank = 0
+
+     def __iter__(self):
+         n_batches = len(self.batches)
+         order = list(range(n_batches))
+
+         if self.shuffle:
+             g = torch.Generator()
+             g.manual_seed(self.seed + self.epoch)
+             order = torch.randperm(n_batches, generator=g).tolist()
+
+         # make divisible across ranks
+         if self.drop_last:
+             total = (len(order) // self.world_size) * self.world_size
+             order = order[:total]
+         else:
+             pad = (-len(order)) % self.world_size
+             if pad:
+                 order = order + order[:pad]
+
+         # shard by rank
+         for i in order[self.rank::self.world_size]:
+             yield self.batches[i]
+
+     def __len__(self):
+         n = len(self.batches)
+         if self.drop_last:
+             return (n // self.world_size)
+         return math.ceil(n / self.world_size)
+
+     # Lightning will call this if present via its epoch hooks
+     def set_epoch(self, epoch: int):
+         self.epoch = int(epoch)
+
  class PreBatchedSampler(Sampler[List[int]]):
      """
      Yields precomputed batches of indices, e.g. [[3,7,9], [0,1,2], ...].
@@ -211,9 +267,11 @@ class PairDataModule(LightningDataModule):
              batch_size=self.batch_size,
              drop_last=self.drop_last,
          )
-         self.train_batch_sampler = PreBatchedSampler(
+         self.train_batch_sampler = PreBatchedDistributedBatchSampler(
              self.train_batches,
              shuffle_batch_order=self.shuffle_batch_order,
+             drop_last=self.drop_last,
+             seed=0,
          )

          if not hasattr(self, "val_dataset"):
@@ -225,8 +283,8 @@ class PairDataModule(LightningDataModule):
              batch_size=self.batch_size,
              drop_last=False,
          )
-         self.val_batch_sampler = PreBatchedSampler(
-             self.val_batches, shuffle_batch_order=False
+         self.val_batch_sampler = PreBatchedDistributedBatchSampler(
+             self.val_batches, shuffle_batch_order=False, drop_last=False, seed=0
          )

          # VALIDATE called standalone: ensure val is built
@@ -240,10 +298,10 @@ class PairDataModule(LightningDataModule):
              batch_size=self.batch_size,
              drop_last=False,
          )
-         self.val_batch_sampler = PreBatchedSampler(
-             self.val_batches, shuffle_batch_order=False
+         self.val_batch_sampler = PreBatchedDistributedBatchSampler(
+             self.test_batches, shuffle_batch_order=False, drop_last=False, seed=0
          )
-
+
          # TEST phase
          if stage in (None, "test"):
              if not hasattr(self, "test_dataset"):
@@ -393,19 +451,23 @@ class ShelfCollator:
              1
          )  # [B, Lg_max]

+         # True = PAD (what MHA expects)
+         binder_kpm = ~binder_mask
+         glm_kpm = ~glm_mask
+
          # 3) Collate labels for DNA and pad
          labels_list = [torch.tensor(s, dtype=torch.float32) for s in scores_list]
          labels = pad_sequence(
-             labels_list, batch_first=True, padding_value=0.0
+             labels_list, batch_first=True, padding_value=self.pad_value
          )  # [B, Lg_max]
-         # (Optional) ensure labels are zeroed beyond mask:
-         labels = labels * glm_mask.to(labels.dtype)

          return {
              "binder_emb": binder_emb,  # [B, Lb_max, Db]
              "binder_mask": binder_mask,  # [B, Lb_max]
+             "binder_kpm": binder_kpm.bool(),  # True = PAD  ← pass to MHA
              "glm_emb": glm_emb,  # [B, Lg_max, Dg]
              "glm_mask": glm_mask,  # [B, Lg_max]
+             "glm_kpm": glm_kpm.bool(),  # True = PAD  ← pass to MHA
              "labels": labels,  # [B, Lg_max]
              "ID": ids,
              "tr_sequence": tr_seqs,
@@ -451,9 +513,11 @@ def _peek_batches(dl, n_batches: int = 2, tag: str = "train"):

          logger.info(f"\n[{tag}] batch {i+1}")
          logger.info(f" binder_emb: {tuple(be.shape)} dtype={be.dtype}")
+         logger.info(f" binder_mask: {tuple(bm.shape)} dtype={bm.dtype}")
          logger.info(f" binder_mask true count: {bm.sum().item()} / {bm.numel()}")
          logger.info(f" glm_emb: {tuple(ge.shape)} dtype={ge.dtype}")
          logger.info(f" glm_mask true count: {gm.sum().item()} / {gm.numel()}")
+         logger.info(f" glm_mask: {tuple(gm.shape)} dtype={gm.dtype}")
          logger.info(
              f" labels: {tuple(y.shape)} min={y.min().item():.4f} max={y.max().item():.4f}"
          )
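
A small single-process sketch of how the new PreBatchedDistributedBatchSampler behaves, assuming the dpacman package is importable; without torch.distributed initialized it falls back to world_size=1, rank=0, so one rank yields every precomputed batch. The index lists below are made up.

from dpacman.data_modules.pair import PreBatchedDistributedBatchSampler

batches = [[0, 1, 2], [3, 4], [5, 6, 7], [8]]      # precomputed index batches
sampler = PreBatchedDistributedBatchSampler(batches, shuffle_batch_order=True, seed=0)
sampler.set_epoch(0)                               # makes the shuffle deterministic per epoch
print(list(sampler))                               # same batches, order permuted
print(len(sampler))                                # 4 == ceil(n_batches / world_size)

Under DDP, each rank would take every world_size-th batch from the (optionally padded or truncated) order, so all ranks step through the same number of batches per epoch.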
dpacman/scripts/run_train.sh CHANGED
@@ -14,16 +14,20 @@ if [ -z "$WANDB_API_KEY" ]; then
      export WANDB_API_KEY="$wandb_key"
  fi

- nohup python -u -m scripts.train \
+ CUDA_VISIBLE_DEVICES=0,1 nohup python -u -m scripts.train \
+     +trainer.strategy=ddp \
+     +trainer.use_distributed_sampler="false" \
      hydra.run.dir="${run_dir}" \
+     trainer.devices=2 \
      data_module.train_file="data_files/processed/splits/by_dna/train.csv" \
      data_module.val_file="data_files/processed/splits/by_dna/val.csv" \
      data_module.test_file="data_files/processed/splits/by_dna/test.csv" \
      data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
      data_module.dna_shelf_path="data_files/processed/embeddings/fimo_hits_only/peaks_caduceus.shelf" \
      model.glm_input_dim=256 \
+     model.compressed_dim=256 \
+     model.hidden_dim=256 \
      model.lr=1e-5 \
-     model.compressed_dim=1029 \
      > "${run_dir}/run.log" 2>&1 &

  echo $! > "${run_dir}/pid.txt"
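
The overrides added to the launch command correspond roughly to the Lightning Trainer settings sketched below (this assumes a Lightning 2.x Trainer built from the Hydra config; the sketch is illustrative, not how the repo actually constructs its Trainer). Setting use_distributed_sampler to False keeps Lightning from re-wrapping the dataloaders and replacing the custom PreBatchedDistributedBatchSampler.

from lightning import Trainer

# Sketch only: the actual Trainer is instantiated from the Hydra config.
trainer = Trainer(
    devices=2,                      # trainer.devices=2
    strategy="ddp",                 # +trainer.strategy=ddp
    use_distributed_sampler=False,  # +trainer.use_distributed_sampler="false"
)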