svincoff committed on
Commit
9da03b7
·
1 Parent(s): 7b33404

added dropout and overfit prevention

.gitignore CHANGED
@@ -40,4 +40,6 @@ log.log
40
  log2.log
41
  dpacman/delay.log
42
  dpacman/view_profiles.ipynb
43
- dpacman/find_wandb_run_dirs.py
 
 
 
40
  log2.log
41
  dpacman/delay.log
42
  dpacman/view_profiles.ipynb
43
+ dpacman/find_wandb_run_dirs.py
44
+ dpacman/delay_binary.log
45
+ dpacman/delay_mix.log
configs/data_module/pair.yaml CHANGED
@@ -6,6 +6,7 @@ test_file: data_files/processed/splits/by_dna/babytest.csv
6
 
7
  target_col: dna_sequence
8
  score_col: scores
 
9
 
10
  tr_shelf_path: data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf
11
  dna_shelf_path: data_files/processed/embeddings/fimo_hits_only/baby_peaks_segmentnt_pernuc_with_onehot.shelf
 
6
 
7
  target_col: dna_sequence
8
  score_col: scores
9
+ norm_value: 1333
10
 
11
  tr_shelf_path: data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf
12
  dna_shelf_path: data_files/processed/embeddings/fimo_hits_only/baby_peaks_segmentnt_pernuc_with_onehot.shelf
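The new norm_value entry sets the divisor used to rescale raw per-nucleotide scores into [0, 1] before they become labels. A minimal sketch of that normalization, assuming the comma-separated score format used by PairDataset later in this commit (the 1333 maximum and round_to=3 are illustrative):

def normalize_scores(score_str: str, norm_value: int = 1333, round_to: int = 3) -> list:
    # Split the comma-separated string and divide each raw score by norm_value.
    return [round(int(s) / norm_value, round_to) for s in score_str.split(",")]

print(normalize_scores("0,666,1333"))  # -> [0.0, 0.5, 1.0]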
configs/data_task/split/remap.yaml CHANGED
@@ -12,7 +12,9 @@ split_out_dir: dpacman/data_files/processed/splits
12
 
13
  dna_map_path: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/maps/dna_seqid_to_dna_sequence.json
14
 
15
- split_by: both # protein, dna, or both
 
 
16
  augment_rc: true
17
 
18
  test_ratio: 0.10
 
12
 
13
  dna_map_path: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/maps/dna_seqid_to_dna_sequence.json
14
 
15
+ split_by: dna # protein, dna, or both
16
+ test_trs: ["trseq23","trseq26","trseq17"]
17
+ test_dnas: null
18
  augment_rc: true
19
 
20
  test_ratio: 0.10
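With split_by: dna and a predefined test_trs list, the named TR sequence IDs are pinned to the test set and any DNA cluster they touch is withheld from train and val. A simplified sketch of that exclusion, mirroring split_with_predefined_test added later in this commit (the toy frame and IDs are made up; the real function also excludes the matching TR clusters):

import pandas as pd

# Hypothetical rows with the columns the splitter expects.
df = pd.DataFrame({
    "ID": ["a", "b", "c", "d"],
    "tr_seqid": ["trseq23", "trseq26", "trseq01", "trseq02"],
    "dna_cluster_rep": ["d1", "d1", "d2", "d3"],
})

test_trs = ["trseq23", "trseq26", "trseq17"]           # values from the config above
test = df[df["tr_seqid"].isin(test_trs)]
excluded = test["dna_cluster_rep"].unique()
train_val = df[~df["dna_cluster_rep"].isin(excluded)]  # drop DNA clusters seen in test
print(len(test), len(train_val))                       # 2 2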
configs/model/classifier.yaml CHANGED
@@ -4,6 +4,7 @@ lr: 1e-4
4
  alpha: 20
5
  gamma: 20
6
  weight_decay: 0.01
 
7
 
8
  glm_input_dim: 1029
9
  compressed_dim: 1029
 
4
  alpha: 20
5
  gamma: 20
6
  weight_decay: 0.01
7
+ dropout: 0.1
8
 
9
  glm_input_dim: 1029
10
  compressed_dim: 1029
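The new dropout entry is a model hyperparameter consumed by BindPredictor, which passes it through to LocalCNN and every CrossModalBlock (see the model.py diff below). A minimal sketch of reading the value from this config, assuming OmegaConf-style loading rather than the project's actual Hydra wiring:

from omegaconf import OmegaConf

cfg = OmegaConf.load("configs/model/classifier.yaml")   # path is illustrative
# model = BindPredictor(lr=cfg.lr, weight_decay=cfg.weight_decay, dropout=cfg.dropout, ...)
print(cfg.dropout)  # 0.1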
dpacman/benchmark/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ This folder is for benchmarking the trained classifier.
dpacman/benchmark/__init__.py ADDED
File without changes
dpacman/classifier/model.py CHANGED
@@ -11,82 +11,96 @@ from .loss import calculate_loss, auprc_zeros_vs_ones_from_logits, auroc_zeros_v
11
  set_seed()
12
 
13
  class LocalCNN(nn.Module):
14
- def __init__(self, dim: int = 256, kernel_size: int = 3):
15
  super().__init__()
16
  padding = kernel_size // 2
17
  self.conv = nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=padding)
18
  self.act = nn.GELU()
19
  self.ln = nn.LayerNorm(dim)
 
 
20
 
21
  def forward(self, x: torch.Tensor):
22
  # x: (batch, L, dim)
23
  out = self.conv(x.transpose(1, 2)) # → (batch, dim, L)
24
  out = self.act(out)
 
25
  out = out.transpose(1, 2) # → (batch, L, dim)
26
  return self.ln(out + x) # residual
27
 
28
 
29
  class CrossModalBlock(nn.Module):
30
- def __init__(self, dim: int = 256, heads: int = 8, dropout: float = 0.0):
31
  super().__init__()
32
  # self-attention for both sides
33
- self.sa_binder = nn.MultiheadAttention(dim, heads, batch_first=True)
34
- self.sa_glm = nn.MultiheadAttention(dim, heads, batch_first=True)
 
 
35
  # first layer norms
36
  self.ln_b1 = nn.LayerNorm(dim)
37
  self.ln_g1 = nn.LayerNorm(dim)
38
  # first feed forward networks
39
  self.ffn_b1 = nn.Sequential(
40
- nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
41
  )
42
  self.ffn_g1 = nn.Sequential(
43
- nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
44
  )
 
 
 
45
  self.ln_b2 = nn.LayerNorm(dim)
46
  self.ln_g2 = nn.LayerNorm(dim)
47
 
48
  # 2) reciprocal cross-attn: g<-b and b<-g
49
  # DNA/GLM updated by attending to Binder
50
  self.cross_g2b_1_RCA = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
 
51
  self.ln_g3_RCA = nn.LayerNorm(dim)
52
- self.ffn_g2_RCA = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
 
53
  self.ln_g4_RCA = nn.LayerNorm(dim)
54
 
55
  # Binder updated by attending to DNA/GLM
56
  self.cross_b2g_1_RCA = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
 
57
  self.ln_b3_RCA = nn.LayerNorm(dim)
58
- self.ffn_b2_RCA = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
 
59
  self.ln_b4_RCA = nn.LayerNorm(dim)
60
 
61
  # cross attention (binder queries, glm keys/values)
62
- # so the NDA path is updated by the transcriptoin factors
63
  self.cross_g2b_2 = nn.MultiheadAttention(dim, heads, batch_first=True)
 
64
  self.ln_g5 = nn.LayerNorm(dim)
65
  self.ffn_g3 = nn.Sequential(
66
- nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
67
  )
 
68
  self.ln_g6 = nn.LayerNorm(dim)
69
 
70
  def forward(self, binder: torch.Tensor, glm: torch.Tensor, binder_kpm_mask=None, glm_kpm_mask=None):
71
  """
72
  binder: (batch, Lb, dim)
73
  glm: (batch, Lg, dim) -- has passed through its local CNN beforehand
74
- returns: updated binder representation (batch, Lb, dim)
75
  """
76
- # 1) Self-attentino and feed-forward networks for binder and DNA
77
  # binder: self-attn + ffn
78
  b = binder
79
  b_sa, _ = self.sa_binder(b, b, b, key_padding_mask=binder_kpm_mask)
80
- b = self.ln_b1(b + b_sa)
81
  b_ff = self.ffn_b1(b)
82
- b = self.ln_b2(b + b_ff)
83
 
84
  # glm: self-attn + ffn
85
  g = glm
86
  g_sa, _ = self.sa_glm(g, g, g, key_padding_mask=glm_kpm_mask)
87
- g = self.ln_g1(g + g_sa)
88
  g_ff = self.ffn_g1(g)
89
- g = self.ln_g2(g + g_ff)
90
 
91
  # 2a) Reciprocal Cross-Attention:
92
  # DNA updated by attending to Binder (Q=g, K=b, V=b)
@@ -97,22 +111,22 @@ class CrossModalBlock(nn.Module):
97
  # invert if your mask is True=keep:
98
  # key_padding_mask=(~binder_mask.bool()) if binder_mask is not None else None
99
  )
100
- g = self.ln_g3_RCA(g + g_ca)
101
- g = self.ln_g4_RCA(g + self.ffn_g2_RCA(g))
102
 
103
  # 2b) Binder updated by attending to DNA/GLM (Q=b, K=g, V=g)
104
  b_ca, _ = self.cross_b2g_1_RCA(
105
  b, g, g, key_padding_mask=glm_kpm_mask
106
  # key_padding_mask=(~glm_mask.bool()) if glm_mask is not None else None
107
  )
108
- b = self.ln_b3_RCA(b + b_ca)
109
- b = self.ln_b4_RCA(b + self.ffn_b2_RCA(b))
110
 
111
  # cross-attention: glm queries binder and glm embeddings are updated
112
  g_to_b_ca, _ = self.cross_g2b_2(g, b, b, key_padding_mask=binder_kpm_mask)
113
- g = self.ln_g5(g + g_to_b_ca)
114
  g_ff = self.ffn_g3(g)
115
- g = self.ln_g6(g + g_ff)
116
  return b, g # (batch, Lb, dim)
117
 
118
  class DimCompressor(nn.Module):
@@ -164,12 +178,15 @@ class BindPredictor(LightningModule):
164
  # Learnable compressor for binder -> 256, then project to hidden
165
  self.binder_compress = DimCompressor(binder_input_dim, out_dim=compressed_dim)
166
  self.proj_binder = nn.Linear(compressed_dim, hidden_dim)
 
 
167
 
168
  # GLM side stays 256 -> hidden
169
  self.proj_glm = nn.Linear(glm_input_dim, hidden_dim)
170
-
 
171
  self.use_local_cnn = use_local_cnn_on_glm
172
- self.local_cnn = LocalCNN(hidden_dim) if use_local_cnn_on_glm else nn.Identity()
173
 
174
  self.layers = nn.ModuleList(
175
  [CrossModalBlock(hidden_dim, heads, self.hparams.dropout) for _ in range(num_layers)]
@@ -188,9 +205,11 @@ class BindPredictor(LightningModule):
188
  # Binder: learnable compression → 256 → hidden
189
  b = self.binder_compress(binder_emb) # (B, Lb, 256)
190
  b = self.proj_binder(b) # (B, Lb, hidden_dim)
 
191
 
192
  # GLM: project → hidden, add local CNN context
193
  g = self.proj_glm(glm_emb) # (B, Lg, hidden_dim)
 
194
  if self.use_local_cnn:
195
  g = self.local_cnn(g)
196
 
 
11
  set_seed()
12
 
13
  class LocalCNN(nn.Module):
14
+ def __init__(self, dim: int = 256, kernel_size: int = 3, dropout=0.1):
15
  super().__init__()
16
  padding = kernel_size // 2
17
  self.conv = nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=padding)
18
  self.act = nn.GELU()
19
  self.ln = nn.LayerNorm(dim)
20
+
21
+ self.dropout = nn.Dropout(dropout)
22
 
23
  def forward(self, x: torch.Tensor):
24
  # x: (batch, L, dim)
25
  out = self.conv(x.transpose(1, 2)) # → (batch, dim, L)
26
  out = self.act(out)
27
+ out = self.dropout(out) # dropout before the layer norm
28
  out = out.transpose(1, 2) # → (batch, L, dim)
29
  return self.ln(out + x) # residual
30
 
31
 
32
  class CrossModalBlock(nn.Module):
33
+ def __init__(self, dim: int = 256, heads: int = 8, dropout: float = 0.1):
34
  super().__init__()
35
  # self-attention for both sides
36
+ self.sa_binder = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
37
+ self.sa_glm = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
38
+ self.do_sa_b = nn.Dropout(dropout)
39
+ self.do_sa_g = nn.Dropout(dropout)
40
  # first layer norms
41
  self.ln_b1 = nn.LayerNorm(dim)
42
  self.ln_g1 = nn.LayerNorm(dim)
43
  # first feed forward networks
44
  self.ffn_b1 = nn.Sequential(
45
+ nn.Linear(dim, dim * 4), nn.GELU(), nn.Dropout(dropout), nn.Linear(dim * 4, dim)
46
  )
47
  self.ffn_g1 = nn.Sequential(
48
+ nn.Linear(dim, dim * 4), nn.GELU(), nn.Dropout(dropout), nn.Linear(dim * 4, dim)
49
  )
50
+ self.do_ffn_b1 = nn.Dropout(dropout)
51
+ self.do_ffn_g1 = nn.Dropout(dropout)
52
+
53
  self.ln_b2 = nn.LayerNorm(dim)
54
  self.ln_g2 = nn.LayerNorm(dim)
55
 
56
  # 2) reciprocal cross-attn: g<-b and b<-g
57
  # DNA/GLM updated by attending to Binder
58
  self.cross_g2b_1_RCA = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
59
+ self.do_rca_g = nn.Dropout(dropout)
60
  self.ln_g3_RCA = nn.LayerNorm(dim)
61
+ self.ffn_g2_RCA = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Dropout(dropout), nn.Linear(dim*4, dim))
62
+ self.do_ffn_g2 = nn.Dropout(dropout)
63
  self.ln_g4_RCA = nn.LayerNorm(dim)
64
 
65
  # Binder updated by attending to DNA/GLM
66
  self.cross_b2g_1_RCA = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
67
+ self.do_rca_b = nn.Dropout(dropout)
68
  self.ln_b3_RCA = nn.LayerNorm(dim)
69
+ self.ffn_b2_RCA = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Dropout(dropout), nn.Linear(dim*4, dim))
70
+ self.do_ffn_b2 = nn.Dropout(dropout)
71
  self.ln_b4_RCA = nn.LayerNorm(dim)
72
 
73
  # cross attention (binder queries, glm keys/values)
74
+ # so the DNA path is updated by the transcription factors
75
  self.cross_g2b_2 = nn.MultiheadAttention(dim, heads, batch_first=True)
76
+ self.do_g2b2 = nn.Dropout(dropout)
77
  self.ln_g5 = nn.LayerNorm(dim)
78
  self.ffn_g3 = nn.Sequential(
79
+ nn.Linear(dim, dim * 4), nn.GELU(), nn.Dropout(dropout), nn.Linear(dim * 4, dim)
80
  )
81
+ self.do_ffn_g3 = nn.Dropout(dropout)
82
  self.ln_g6 = nn.LayerNorm(dim)
83
 
84
  def forward(self, binder: torch.Tensor, glm: torch.Tensor, binder_kpm_mask=None, glm_kpm_mask=None):
85
  """
86
  binder: (batch, Lb, dim)
87
  glm: (batch, Lg, dim) -- has passed through its local CNN beforehand
88
+ returns: updated binder representation (batch, Lb, dim) and gLM representation
89
  """
90
+ # 1) Self-attention and feed-forward networks for binder and DNA
91
  # binder: self-attn + ffn
92
  b = binder
93
  b_sa, _ = self.sa_binder(b, b, b, key_padding_mask=binder_kpm_mask)
94
+ b = self.ln_b1(b + self.do_sa_b(b_sa))
95
  b_ff = self.ffn_b1(b)
96
+ b = self.ln_b2(b + self.do_ffn_b1(b_ff))
97
 
98
  # glm: self-attn + ffn
99
  g = glm
100
  g_sa, _ = self.sa_glm(g, g, g, key_padding_mask=glm_kpm_mask)
101
+ g = self.ln_g1(g + self.do_sa_g(g_sa))
102
  g_ff = self.ffn_g1(g)
103
+ g = self.ln_g2(g + self.do_ffn_g1(g_ff))
104
 
105
  # 2a) Reciprocal Cross-Attention:
106
  # DNA updated by attending to Binder (Q=g, K=b, V=b)
 
111
  # invert if your mask is True=keep:
112
  # key_padding_mask=(~binder_mask.bool()) if binder_mask is not None else None
113
  )
114
+ g = self.ln_g3_RCA(g + self.do_rca_g(g_ca))
115
+ g = self.ln_g4_RCA(g + self.do_ffn_g2(self.ffn_g2_RCA(g)))
116
 
117
  # 2b) Binder updated by attending to DNA/GLM (Q=b, K=g, V=g)
118
  b_ca, _ = self.cross_b2g_1_RCA(
119
  b, g, g, key_padding_mask=glm_kpm_mask
120
  # key_padding_mask=(~glm_mask.bool()) if glm_mask is not None else None
121
  )
122
+ b = self.ln_b3_RCA(b + self.do_rca_b(b_ca))
123
+ b = self.ln_b4_RCA(b + self.do_ffn_b2(self.ffn_b2_RCA(b)))
124
 
125
  # cross-attention: glm queries binder and glm embeddings are updated
126
  g_to_b_ca, _ = self.cross_g2b_2(g, b, b, key_padding_mask=binder_kpm_mask)
127
+ g = self.ln_g5(g + self.do_g2b2(g_to_b_ca))
128
  g_ff = self.ffn_g3(g)
129
+ g = self.ln_g6(g + self.do_ffn_g3(g_ff))
130
  return b, g # (batch, Lb, dim)
131
 
132
  class DimCompressor(nn.Module):
 
178
  # Learnable compressor for binder -> 256, then project to hidden
179
  self.binder_compress = DimCompressor(binder_input_dim, out_dim=compressed_dim)
180
  self.proj_binder = nn.Linear(compressed_dim, hidden_dim)
181
+ self.dropout_b1 = nn.Dropout(dropout)
182
+ self.act = nn.GELU()
183
 
184
  # GLM side stays 256 -> hidden
185
  self.proj_glm = nn.Linear(glm_input_dim, hidden_dim)
186
+ self.dropout_g1 = nn.Dropout(dropout)
187
+
188
  self.use_local_cnn = use_local_cnn_on_glm
189
+ self.local_cnn = LocalCNN(hidden_dim, dropout=self.hparams.dropout) if use_local_cnn_on_glm else nn.Identity()
190
 
191
  self.layers = nn.ModuleList(
192
  [CrossModalBlock(hidden_dim, heads, self.hparams.dropout) for _ in range(num_layers)]
 
205
  # Binder: learnable compression → 256 → hidden
206
  b = self.binder_compress(binder_emb) # (B, Lb, 256)
207
  b = self.proj_binder(b) # (B, Lb, hidden_dim)
208
+ b = self.dropout_b1(self.act(b))
209
 
210
  # GLM: project → hidden, add local CNN context
211
  g = self.proj_glm(glm_emb) # (B, Lg, hidden_dim)
212
+ g = self.dropout_g1(self.act(g))
213
  if self.use_local_cnn:
214
  g = self.local_cnn(g)
215
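Every sub-layer in the updated CrossModalBlock now follows the same overfit-prevention pattern: dropout is applied to the sub-layer output before the residual add and LayerNorm, with an extra Dropout inside each feed-forward network after the GELU. A standalone sketch of that pattern (dimensions and names are illustrative, not the project's API):

import torch
from torch import nn

class ResidualDropoutFFN(nn.Module):
    """Post-norm residual wrapper: x -> LayerNorm(x + Dropout(FFN(x)))."""
    def __init__(self, dim: int = 256, dropout: float = 0.1):
        super().__init__()
        self.ffn = nn.Sequential(
            nn.Linear(dim, dim * 4), nn.GELU(),
            nn.Dropout(dropout),                 # dropout inside the FFN
            nn.Linear(dim * 4, dim),
        )
        self.drop = nn.Dropout(dropout)          # dropout on the sub-layer output
        self.ln = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.ln(x + self.drop(self.ffn(x)))

x = torch.randn(2, 10, 256)
print(ResidualDropoutFFN()(x).shape)  # torch.Size([2, 10, 256])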
 
dpacman/classifier/old_model.py ADDED
@@ -0,0 +1,374 @@
1
+ """
2
+ Lightning Module for the binding model.
3
+ """
4
+
5
+ import torch
6
+ from torch import nn
7
+ from lightning import LightningModule
8
+ from dpacman.utils.models import set_seed
9
+ from .loss import calculate_loss, auprc_zeros_vs_ones_from_logits, auroc_zeros_vs_ones_from_logits
10
+
11
+ set_seed()
12
+
13
+ class LocalCNN(nn.Module):
14
+ def __init__(self, dim: int = 256, kernel_size: int = 3):
15
+ super().__init__()
16
+ padding = kernel_size // 2
17
+ self.conv = nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=padding)
18
+ self.act = nn.GELU()
19
+ self.ln = nn.LayerNorm(dim)
20
+
21
+ def forward(self, x: torch.Tensor):
22
+ # x: (batch, L, dim)
23
+ out = self.conv(x.transpose(1, 2)) # → (batch, dim, L)
24
+ out = self.act(out)
25
+ out = out.transpose(1, 2) # → (batch, L, dim)
26
+ return self.ln(out + x) # residual
27
+
28
+
29
+ class CrossModalBlock(nn.Module):
30
+ def __init__(self, dim: int = 256, heads: int = 8, dropout: float = 0.0):
31
+ super().__init__()
32
+ # self-attention for both sides
33
+ self.sa_binder = nn.MultiheadAttention(dim, heads, batch_first=True)
34
+ self.sa_glm = nn.MultiheadAttention(dim, heads, batch_first=True)
35
+ # first layer norms
36
+ self.ln_b1 = nn.LayerNorm(dim)
37
+ self.ln_g1 = nn.LayerNorm(dim)
38
+ # first feed forward networks
39
+ self.ffn_b1 = nn.Sequential(
40
+ nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
41
+ )
42
+ self.ffn_g1 = nn.Sequential(
43
+ nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
44
+ )
45
+ self.ln_b2 = nn.LayerNorm(dim)
46
+ self.ln_g2 = nn.LayerNorm(dim)
47
+
48
+ # 2) reciprocal cross-attn: g<-b and b<-g
49
+ # DNA/GLM updated by attending to Binder
50
+ self.cross_g2b_1_RCA = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
51
+ self.ln_g3_RCA = nn.LayerNorm(dim)
52
+ self.ffn_g2_RCA = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
53
+ self.ln_g4_RCA = nn.LayerNorm(dim)
54
+
55
+ # Binder updated by attending to DNA/GLM
56
+ self.cross_b2g_1_RCA = nn.MultiheadAttention(dim, heads, batch_first=True, dropout=dropout)
57
+ self.ln_b3_RCA = nn.LayerNorm(dim)
58
+ self.ffn_b2_RCA = nn.Sequential(nn.Linear(dim, dim*4), nn.GELU(), nn.Linear(dim*4, dim))
59
+ self.ln_b4_RCA = nn.LayerNorm(dim)
60
+
61
+ # cross attention (binder queries, glm keys/values)
62
+ # so the NDA path is updated by the transcriptoin factors
63
+ self.cross_g2b_2 = nn.MultiheadAttention(dim, heads, batch_first=True)
64
+ self.ln_g5 = nn.LayerNorm(dim)
65
+ self.ffn_g3 = nn.Sequential(
66
+ nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
67
+ )
68
+ self.ln_g6 = nn.LayerNorm(dim)
69
+
70
+ def forward(self, binder: torch.Tensor, glm: torch.Tensor, binder_kpm_mask=None, glm_kpm_mask=None):
71
+ """
72
+ binder: (batch, Lb, dim)
73
+ glm: (batch, Lg, dim) -- has passed through its local CNN beforehand
74
+ returns: updated binder representation (batch, Lb, dim)
75
+ """
76
+ # 1) Self-attentino and feed-forward networks for binder and DNA
77
+ # binder: self-attn + ffn
78
+ b = binder
79
+ b_sa, _ = self.sa_binder(b, b, b, key_padding_mask=binder_kpm_mask)
80
+ b = self.ln_b1(b + b_sa)
81
+ b_ff = self.ffn_b1(b)
82
+ b = self.ln_b2(b + b_ff)
83
+
84
+ # glm: self-attn + ffn
85
+ g = glm
86
+ g_sa, _ = self.sa_glm(g, g, g, key_padding_mask=glm_kpm_mask)
87
+ g = self.ln_g1(g + g_sa)
88
+ g_ff = self.ffn_g1(g)
89
+ g = self.ln_g2(g + g_ff)
90
+
91
+ # 2a) Reciprocal Cross-Attention:
92
+ # DNA updated by attending to Binder (Q=g, K=b, V=b)
93
+ # Binder updated by attending to DNA (Q=b, K=g, V=g)
94
+ g_ca, _ = self.cross_g2b_1_RCA(
95
+ g, b, b, key_padding_mask=binder_kpm_mask
96
+ # torch MultiheadAttention expects key_padding_mask=True for PADs;
97
+ # invert if your mask is True=keep:
98
+ # key_padding_mask=(~binder_mask.bool()) if binder_mask is not None else None
99
+ )
100
+ g = self.ln_g3_RCA(g + g_ca)
101
+ g = self.ln_g4_RCA(g + self.ffn_g2_RCA(g))
102
+
103
+ # 2b) Binder updated by attending to DNA/GLM (Q=b, K=g, V=g)
104
+ b_ca, _ = self.cross_b2g_1_RCA(
105
+ b, g, g, key_padding_mask=glm_kpm_mask
106
+ # key_padding_mask=(~glm_mask.bool()) if glm_mask is not None else None
107
+ )
108
+ b = self.ln_b3_RCA(b + b_ca)
109
+ b = self.ln_b4_RCA(b + self.ffn_b2_RCA(b))
110
+
111
+ # cross-attention: glm queries binder and glm embeddings are updated
112
+ g_to_b_ca, _ = self.cross_g2b_2(g, b, b, key_padding_mask=binder_kpm_mask)
113
+ g = self.ln_g5(g + g_to_b_ca)
114
+ g_ff = self.ffn_g3(g)
115
+ g = self.ln_g6(g + g_ff)
116
+ return b, g # (batch, Lb, dim)
117
+
118
+ class DimCompressor(nn.Module):
119
+ """
120
+ Learnable per-token compressor: maps any in_dim >= out_dim to out_dim (default 256).
121
+ If in_dim == out_dim, behaves as identity.
122
+ """
123
+
124
+ def __init__(self, in_dim: int, out_dim: int = 256):
125
+ super().__init__()
126
+ if in_dim == out_dim:
127
+ self.net = nn.Identity()
128
+ else:
129
+ hidden = max(out_dim * 2, (in_dim + out_dim) // 2)
130
+ self.net = nn.Sequential(
131
+ nn.LayerNorm(in_dim),
132
+ nn.Linear(in_dim, hidden),
133
+ nn.GELU(),
134
+ nn.Linear(hidden, out_dim),
135
+ )
136
+
137
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
138
+ # x: (B, L, in_dim)
139
+ return self.net(x)
140
+
141
+
142
+ class BindPredictor(LightningModule):
143
+ def __init__(
144
+ self,
145
+ # input_dim: int = 256, # OLD: single input dim
146
+ binder_input_dim: int = 1280, # NEW: TF (binder) original dim (e.g., 1280)
147
+ glm_input_dim: int = 256, # NEW: DNA/GLM original dim (e.g., 256)
148
+ compressed_dim: int = 256, # NEW: learnable compressed dim
149
+ hidden_dim: int = 256,
150
+ heads: int = 8,
151
+ num_layers: int = 4,
152
+ lr: float = 1e-4,
153
+ alpha: float = 20,
154
+ gamma: float = 20,
155
+ dropout: float = 0,
156
+ use_local_cnn_on_glm: bool = True,
157
+ weight_decay: float = 0.01,
158
+ loss_type = "mixed"
159
+ ):
160
+ # Init
161
+ super(BindPredictor, self).__init__()
162
+ self.save_hyperparameters()
163
+
164
+ # Learnable compressor for binder -> 256, then project to hidden
165
+ self.binder_compress = DimCompressor(binder_input_dim, out_dim=compressed_dim)
166
+ self.proj_binder = nn.Linear(compressed_dim, hidden_dim)
167
+
168
+ # GLM side stays 256 -> hidden
169
+ self.proj_glm = nn.Linear(glm_input_dim, hidden_dim)
170
+
171
+ self.use_local_cnn = use_local_cnn_on_glm
172
+ self.local_cnn = LocalCNN(hidden_dim) if use_local_cnn_on_glm else nn.Identity()
173
+
174
+ self.layers = nn.ModuleList(
175
+ [CrossModalBlock(hidden_dim, heads, self.hparams.dropout) for _ in range(num_layers)]
176
+ )
177
+
178
+ #self.ln_out = nn.LayerNorm(hidden_dim)
179
+ # self.head = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid()) # OLD: returned probabilities
180
+ self.head = nn.Linear(hidden_dim, 1) # NEW: return logits (safe for AMP)
181
+
182
+ def forward(self, binder_emb, glm_emb, binder_mask, glm_mask):
183
+ """
184
+ binder_emb: (B, Lb, binder_input_dim)
185
+ glm_emb: (B, Lg, glm_input_dim)
186
+ Returns per-nucleotide logits for the GLM sequence: (B, Lg)
187
+ """
188
+ # Binder: learnable compression → 256 → hidden
189
+ b = self.binder_compress(binder_emb) # (B, Lb, 256)
190
+ b = self.proj_binder(b) # (B, Lb, hidden_dim)
191
+
192
+ # GLM: project → hidden, add local CNN context
193
+ g = self.proj_glm(glm_emb) # (B, Lg, hidden_dim)
194
+ if self.use_local_cnn:
195
+ g = self.local_cnn(g)
196
+
197
+ # Cross-modal blocks: update binder states using GLM
198
+ for layer in self.layers:
199
+ b, g = layer(b, g, binder_mask, glm_mask) # (B, Lb, hidden_dim)
200
+
201
+ # Predict per-nucleotide logits on the GLM tokens:
202
+ # return self.head(g).squeeze(-1) # OLD: probabilities (with Sigmoid in head)
203
+ logits = self.head(g).squeeze(
204
+ -1
205
+ )
206
+ return logits
207
+
208
+ # ----- Lightning hooks -----
209
+ def training_step(self, batch, batch_idx):
210
+ """
211
+ Training step taken by PyTorch-Lightning trainer. Uses batch returned by data collator.
212
+ Colator returns a dictionary with:
213
+ "binder_emb" # [B, Lb_max, Db]
214
+ "binder_kpm" # [B, Lb_max]
215
+ "glm_emb" # [B, Lg_max, Dg]
216
+ "glm_kpm" # [B, Lg_max]
217
+ "labels" # [B, Lg_max]
218
+ "ID"
219
+ "tr_sequence"
220
+ "dna_sequence"
221
+ }
222
+ """
223
+ logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
224
+ loss = calculate_loss(
225
+ logits, batch["labels"], batch["binder_kpm"], batch["glm_kpm"], alpha=self.hparams.alpha, gamma=self.hparams.gamma, loss_type=self.hparams.loss_type
226
+ )
227
+ self.log(
228
+ "train/loss",
229
+ loss,
230
+ on_step=True,
231
+ on_epoch=True,
232
+ prog_bar=True,
233
+ batch_size=logits.size(0),
234
+ )
235
+
236
+ # ---- AUPRC and AUROC on labels in {0, >0.99} only ----
237
+ ap, n_pos, n_neg, precision, recall, thresholds = auprc_zeros_vs_ones_from_logits(
238
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
239
+ )
240
+ auc, n_pos, n_neg, tpr, fpr, thresolds, tp, fp = auroc_zeros_vs_ones_from_logits(
241
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
242
+ )
243
+ # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
244
+ self.log("train/auprc_0v1",
245
+ ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
246
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
247
+ self.log("train/auroc_0v1",
248
+ auc if torch.isfinite(auc) else torch.tensor(0.0, device=auc.device),
249
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
250
+ # (optional) also log class counts so you can sanity-check balance
251
+ self.log("train/n_pos_0v1", float(n_pos), on_step=False, on_epoch=True, sync_dist=True)
252
+ self.log("train/n_neg_0v1", float(n_neg), on_step=False, on_epoch=True, sync_dist=True)
253
+
254
+ return loss
255
+
256
+ def validation_step(self, batch, batch_idx):
257
+ logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
258
+ loss = calculate_loss(
259
+ logits, batch["labels"], batch["binder_kpm"], batch["glm_kpm"], alpha=self.hparams.alpha, gamma=self.hparams.gamma, loss_type=self.hparams.loss_type
260
+ )
261
+ self.log(
262
+ "val/loss",
263
+ loss,
264
+ on_step=False,
265
+ on_epoch=True,
266
+ prog_bar=True,
267
+ batch_size=logits.size(0),
268
+ )
269
+
270
+ # ---- AUPRC and AUROC on labels in {0, >0.99} only ----
271
+ ap, n_pos, n_neg, precision, recall, thresholds = auprc_zeros_vs_ones_from_logits(
272
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
273
+ )
274
+ auc, n_pos, n_neg, tpr, fpr, thresolds, tp, fp = auroc_zeros_vs_ones_from_logits(
275
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
276
+ )
277
+ # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
278
+ self.log("val/auprc_0v1",
279
+ ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
280
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
281
+ self.log("val/auroc_0v1",
282
+ auc if torch.isfinite(auc) else torch.tensor(0.0, device=auc.device),
283
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
284
+
285
+ return loss
286
+
287
+ def test_step(self, batch, batch_idx):
288
+ logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
289
+ loss = calculate_loss(
290
+ logits, batch["labels"], batch["binder_kpm"], batch["glm_kpm"], alpha=self.hparams.alpha, gamma=self.hparams.gamma, loss_type=self.hparams.loss_type
291
+ )
292
+ self.log(
293
+ "test/loss", loss, on_step=False, on_epoch=True, batch_size=logits.size(0)
294
+ )
295
+
296
+ # ---- AUPRC and AUROC on labels in {0, >0.99} only ----
297
+ ap, n_pos, n_neg, precision, recall, thresholds = auprc_zeros_vs_ones_from_logits(
298
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
299
+ )
300
+ auc, n_pos, n_neg, tpr, fpr, thresolds, tp, fp = auroc_zeros_vs_ones_from_logits(
301
+ logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
302
+ )
303
+ # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
304
+ self.log("test/auprc_0v1",
305
+ ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
306
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
307
+ self.log("test/auroc_0v1",
308
+ auc if torch.isfinite(auc) else torch.tensor(0.0, device=auc.device),
309
+ on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
310
+
311
+ return loss
312
+
313
+ def predict_step(self, batch, batch_idx, dataloader_idx=0):
314
+ logits = self.forward(batch["binder_emb"], batch["glm_emb"],
315
+ batch["binder_kpm"], batch["glm_kpm"]).squeeze(-1) # (B,L)
316
+ valid = ~batch["glm_kpm"] # (B,L)
317
+ return {
318
+ "ids": batch["ID"], # list[str]
319
+ "logits": logits.detach().cpu(), # (B,Lmax) padded
320
+ "valid": valid.detach().cpu(), # (B,Lmax) booleans
321
+ "labels": batch["labels"].detach().cpu(), # (B,Lmax) padded
322
+ }
323
+
324
+ def on_before_optimizer_step(self, optimizer):
325
+ # Compute global L2 norm of all parameter gradients (ignores None grads)
326
+ grads = []
327
+ for p in self.parameters():
328
+ if p.grad is not None:
329
+ # .detach() avoids autograd tracking; .float() avoids fp16 overflow in norms
330
+ grads.append(p.grad.detach().float().norm(2))
331
+ if grads:
332
+ total_norm = torch.norm(torch.stack(grads), p=2)
333
+ self.log("train/grad_norm", total_norm, on_step=True, prog_bar=False, logger=True)
334
+
335
+ def on_after_backward(self):
336
+ grads = [p.grad.detach().float().norm(2)
337
+ for p in self.parameters() if p.grad is not None]
338
+ if grads:
339
+ total_norm = torch.norm(torch.stack(grads), p=2)
340
+ self.log("train/grad_norm_back", total_norm, on_step=True, prog_bar=False)
341
+
342
+ def on_train_epoch_end(self):
343
+ if False:
344
+ if self.train_auc.compute() is not None:
345
+ self.log("train/auroc", self.train_auc.compute(), prog_bar=True)
346
+ self.train_auc.reset()
347
+
348
+ def on_validation_epoch_end(self):
349
+ if False:
350
+ if self.val_auc.compute() is not None:
351
+ self.log("val/auroc", self.val_auc.compute(), prog_bar=True)
352
+ self.val_auc.reset()
353
+
354
+ def on_test_epoch_end(self):
355
+ if False:
356
+ if self.test_auc.compute() is not None:
357
+ self.log("test/auroc", self.test_auc.compute(), prog_bar=True)
358
+ self.test_auc.reset()
359
+
360
+ def configure_optimizers(self):
361
+ # AdamW + cosine as a sensible default
362
+ opt = torch.optim.AdamW(
363
+ self.parameters(),
364
+ lr=self.hparams.lr,
365
+ weight_decay=self.hparams.weight_decay,
366
+ )
367
+ # Scheduler optional—comment out if you prefer fixed LR
368
+ sch = torch.optim.lr_scheduler.CosineAnnealingLR(
369
+ opt, T_max=max(self.trainer.max_epochs, 1)
370
+ )
371
+ return {
372
+ "optimizer": opt,
373
+ "lr_scheduler": {"scheduler": sch, "interval": "epoch"},
374
+ }
dpacman/data_modules/pair.py CHANGED
@@ -180,16 +180,16 @@ class PairDataset(Dataset):
180
  """
181
  if self.score_col not in dataset.columns:
182
  logger.info(f"Scores not provided. Adding placeholder scores where all positions are considered binding")
183
- dataset[self.score_col] = dataset["dna_sequence"].str.len()
184
  dataset[self.score_col] = dataset[self.score_col].apply(lambda x: ",".join([str(self.norm_value)]*x))
185
  self.fake_scores=True
186
  # split string into list of strings
187
  dataset[self.score_col] = dataset[self.score_col].apply(lambda x: x.split(","))
 
188
  # turn list of strings into list of normalized, rounded floats
189
  dataset[self.score_col] = dataset[self.score_col].apply(
190
  lambda x: [round(int(y) / self.norm_value, self.round_to) for y in x]
191
  )
192
-
193
  # convert to records for ease of loading
194
  dataset = dataset.to_dict(orient="records")
195
  return dataset
@@ -222,11 +222,13 @@ class PairDataModule(LightningDataModule):
222
  shuffle_train_batch_order: bool = True,
223
  score_col: str = "scores",
224
  target_col: str = "dna_sequence",
225
- binder_col: str = "tr_sequence"
 
226
  ):
227
  super().__init__()
228
  self.save_hyperparameters()
229
  self.debug_run = debug_run
 
230
 
231
  # Initialize the data files
232
  self.train_data_file = train_file
@@ -267,7 +269,7 @@ class PairDataModule(LightningDataModule):
267
  df = pd.read_csv(file_path)
268
  if lim is not None:
269
  df = df[:lim].reset_index(drop=True)
270
- return df[["ID", "dna_sequence", "tr_sequence", "scores"]]
271
  except:
272
  raise Exception(f"{file_path} is not a valid file")
273
 
@@ -278,7 +280,7 @@ class PairDataModule(LightningDataModule):
278
  if stage in (None, "fit"):
279
  if not hasattr(self, "train_dataset"):
280
  train_df = self.load_file(self.train_data_file, lim=lim)
281
- self.train_dataset = PairDataset(train_df, score_col = self.score_col, target_col = self.target_col, binder_col = self.binder_col)
282
  self.train_batches = make_length_batches(
283
  dataset_records=self.train_dataset.dataset,
284
  tr_shelf_path=str(self.hparams.tr_shelf_path),
@@ -294,7 +296,7 @@ class PairDataModule(LightningDataModule):
294
 
295
  if not hasattr(self, "val_dataset"):
296
  val_df = self.load_file(self.val_data_file, lim=lim)
297
- self.val_dataset = PairDataset(val_df, score_col = self.score_col, target_col = self.target_col, binder_col = self.binder_col)
298
  self.val_batches = make_length_batches(
299
  dataset_records=self.val_dataset.dataset,
300
  tr_shelf_path=str(self.hparams.tr_shelf_path),
@@ -309,7 +311,7 @@ class PairDataModule(LightningDataModule):
309
  if stage in (None, "validate"):
310
  if not hasattr(self, "val_dataset"):
311
  val_df = self.load_file(self.val_data_file, lim=lim)
312
- self.val_dataset = PairDataset(val_df, score_col = self.score_col, target_col = self.target_col, binder_col = self.binder_col)
313
  self.val_batches = make_length_batches(
314
  dataset_records=self.val_dataset.dataset,
315
  tr_shelf_path=str(self.hparams.tr_shelf_path),
@@ -324,7 +326,7 @@ class PairDataModule(LightningDataModule):
324
  if stage in (None, "test"):
325
  if not hasattr(self, "test_dataset"):
326
  test_df = self.load_file(self.test_data_file, lim=lim)
327
- self.test_dataset = PairDataset(test_df, score_col = self.score_col, target_col = self.target_col, binder_col = self.binder_col)
328
  self.test_batches = make_length_batches(
329
  dataset_records=self.test_dataset.dataset,
330
  tr_shelf_path=str(self.hparams.tr_shelf_path),
@@ -623,6 +625,8 @@ def main():
623
  debug_run=args.debug_run,
624
  shuffle_train_batch_order=args.shuffle_train_batch_order,
625
  pin_memory=False,
 
 
626
  )
627
 
628
  # ---- Train ----
 
180
  """
181
  if self.score_col not in dataset.columns:
182
  logger.info(f"Scores not provided. Adding placeholder scores where all positions are considered binding")
183
+ dataset[self.score_col] = dataset[self.target_col].str.len()
184
  dataset[self.score_col] = dataset[self.score_col].apply(lambda x: ",".join([str(self.norm_value)]*x))
185
  self.fake_scores=True
186
  # split string into list of strings
187
  dataset[self.score_col] = dataset[self.score_col].apply(lambda x: x.split(","))
188
+ dataset["copycol"] = dataset[self.score_col]
189
  # turn list of strings into list of normalized, rounded floats
190
  dataset[self.score_col] = dataset[self.score_col].apply(
191
  lambda x: [round(int(y) / self.norm_value, self.round_to) for y in x]
192
  )
 
193
  # convert to records for ease of loading
194
  dataset = dataset.to_dict(orient="records")
195
  return dataset
 
222
  shuffle_train_batch_order: bool = True,
223
  score_col: str = "scores",
224
  target_col: str = "dna_sequence",
225
+ binder_col: str = "tr_sequence",
226
+ norm_value: int = 1333
227
  ):
228
  super().__init__()
229
  self.save_hyperparameters()
230
  self.debug_run = debug_run
231
+ self.norm_value = norm_value
232
 
233
  # Initialize the data files
234
  self.train_data_file = train_file
 
269
  df = pd.read_csv(file_path)
270
  if lim is not None:
271
  df = df[:lim].reset_index(drop=True)
272
+ return df
273
  except:
274
  raise Exception(f"{file_path} is not a valid file")
275
 
 
280
  if stage in (None, "fit"):
281
  if not hasattr(self, "train_dataset"):
282
  train_df = self.load_file(self.train_data_file, lim=lim)
283
+ self.train_dataset = PairDataset(train_df, norm_value = self.norm_value, score_col = self.score_col, target_col = self.target_col, binder_col = self.binder_col)
284
  self.train_batches = make_length_batches(
285
  dataset_records=self.train_dataset.dataset,
286
  tr_shelf_path=str(self.hparams.tr_shelf_path),
 
296
 
297
  if not hasattr(self, "val_dataset"):
298
  val_df = self.load_file(self.val_data_file, lim=lim)
299
+ self.val_dataset = PairDataset(val_df, norm_value = self.norm_value, score_col = self.score_col, target_col = self.target_col, binder_col = self.binder_col)
300
  self.val_batches = make_length_batches(
301
  dataset_records=self.val_dataset.dataset,
302
  tr_shelf_path=str(self.hparams.tr_shelf_path),
 
311
  if stage in (None, "validate"):
312
  if not hasattr(self, "val_dataset"):
313
  val_df = self.load_file(self.val_data_file, lim=lim)
314
+ self.val_dataset = PairDataset(val_df, norm_value = self.norm_value, score_col = self.score_col, target_col = self.target_col, binder_col = self.binder_col)
315
  self.val_batches = make_length_batches(
316
  dataset_records=self.val_dataset.dataset,
317
  tr_shelf_path=str(self.hparams.tr_shelf_path),
 
326
  if stage in (None, "test"):
327
  if not hasattr(self, "test_dataset"):
328
  test_df = self.load_file(self.test_data_file, lim=lim)
329
+ self.test_dataset = PairDataset(test_df, norm_value = self.norm_value, score_col = self.score_col, target_col = self.target_col, binder_col = self.binder_col)
330
  self.test_batches = make_length_batches(
331
  dataset_records=self.test_dataset.dataset,
332
  tr_shelf_path=str(self.hparams.tr_shelf_path),
 
625
  debug_run=args.debug_run,
626
  shuffle_train_batch_order=args.shuffle_train_batch_order,
627
  pin_memory=False,
628
+ score_col="binary_scores",
629
+ norm_value=1
630
  )
631
 
632
  # ---- Train ----
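The main() change points the data module at a binary score column with norm_value=1, so the same division step leaves 0/1 labels untouched, whereas the continuous path divides by the dataset maximum (1333 in pair.yaml). A hedged sketch of the two label modes; the raw string is made up, and the binarization mirrors convert_scores from complex_remap.py added below:

raw = "0,500,1333,1333"                     # hypothetical per-nucleotide scores

# Continuous labels: divide by the dataset maximum (norm_value=1333).
continuous = [round(int(s) / 1333, 3) for s in raw.split(",")]

# Binary labels: keep only the max-scoring positions, then use norm_value=1.
max_s = max(int(s) for s in raw.split(","))
binary = [1 if int(s) == max_s else 0 for s in raw.split(",")]

print(continuous)  # [0.0, 0.375, 1.0, 1.0]
print(binary)      # [0, 0, 1, 1]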
dpacman/data_tasks/split/complex_remap.py ADDED
@@ -0,0 +1,736 @@
1
+ from collections import Counter, defaultdict
2
+ from ortools.linear_solver import pywraplp
3
+ import random
4
+ from omegaconf import DictConfig
5
+ import pandas as pd
6
+ from pathlib import Path
7
+ import os
8
+ import numpy as np
9
+ from sklearn.model_selection import train_test_split
10
+ from dpacman.data_tasks.fimo.post_fimo import get_reverse_complement
11
+ import json
12
+ import rootutils
13
+ from dpacman.utils import pylogger
14
+
15
+ root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
16
+ logger = pylogger.RankedLogger(__name__, rank_zero_only=True)
17
+
18
+ def split_with_predefined_test(
19
+ full_df = pd.DataFrame(),
20
+ split_names=("train", "val", "test"),
21
+ test_trs=None,
22
+ test_dnas=None,
23
+ ratios=(0.8, 0.1, 0.1),
24
+ ):
25
+ """
26
+ Method for splitting into train and val with a predefined test set.
27
+ The proteins in the test set, and the DNA clusters of the DNAs they're associated with, must be excluded from train and val.
28
+ The remaining rows for train and val are split to preserve 80/10/10 as best as possible.
29
+ """
31
+ test = full_df.copy(deep=True)
32
+ if test_trs is not None:
33
+ test = test.loc[test["tr_seqid"].isin(test_trs)].reset_index(drop=True)
34
+ if test_dnas is not None:
35
+ test = test.loc[test["dna_seqid"].isin(test_dnas)].reset_index(drop=True)
36
+
37
+ tr_clusters_to_exclude = test["tr_cluster_rep"].unique().tolist()
38
+ dna_clusters_to_exclude = test["dna_cluster_rep"].unique().tolist()
39
+
40
+ remaining = full_df.loc[
41
+ (~full_df["tr_cluster_rep"].isin(tr_clusters_to_exclude)) &
42
+ (~full_df["dna_cluster_rep"].isin(dna_clusters_to_exclude))
43
+ ].reset_index(drop=True)
44
+
45
+ test_ids = test["ID"].unique().tolist()
46
+ remaining_ids = remaining["ID"].unique().tolist()
47
+ remaining_clusters = remaining["dna_cluster_rep"].unique().tolist()
48
+ lost_rows = full_df.loc[
49
+ (~full_df["ID"].isin(test_ids)) &
50
+ (~full_df["ID"].isin(remaining_ids))
51
+ ]
52
+
53
+ logger.info(f"Rows in test: {len(test)}")
54
+ logger.info(f"Rows to be split between train and val: {len(remaining)}")
55
+ total_rows = len(test) + len(remaining)
56
+ logger.info(f"Total rows: {total_rows}. Test percentage: {100*len(test)/total_rows:.2f}%")
57
+ logger.info(f"Lost rows: {len(lost_rows)}")
58
+
59
+ train_ratio_from_remaining = round((0.8*total_rows)/len(remaining), 2)
60
+ # use sklearn
61
+ test_size_1 = 1 - train_ratio_from_remaining
62
+ logger.info(
63
+ f"\tPerforming first split: non-test clusters -> train clusters ({round(1-test_size_1,3)}) and val ({test_size_1})"
64
+ )
65
+ X = remaining_clusters
66
+ y = [0] * len(remaining_clusters)
67
+ X_train, X_val, y_train, y_val = train_test_split(
68
+ X, y, test_size=test_size_1, random_state=0
69
+ )
70
+
71
+ train = remaining.loc[remaining["dna_cluster_rep"].isin(X_train)]
72
+ val = remaining.loc[remaining["dna_cluster_rep"].isin(X_val)]
73
+ leaky_test = lost_rows
74
+
75
+ splits = {
76
+ "train": train,
77
+ "val": val,
78
+ "test": test,
79
+ "leaky_test": leaky_test
80
+ }
81
+ return splits
82
+
83
+ def split_bipartite_fast(
84
+ dna_clusters,
85
+ split_names=("train", "val", "test"),
86
+ ratios=(0.8, 0.1, 0.1),
87
+ ):
88
+ # use sklearn
89
+ test_size_1 = 0.2
90
+ test_size_2 = 0.5
91
+ logger.info(
92
+ f"\tPerforming first split: all clusters -> train clusters ({round(1-test_size_1,3)}) and other ({test_size_1})"
93
+ )
94
+ X = dna_clusters
95
+ y = [0] * len(dna_clusters)
96
+ X_train, X_test, y_train, y_test = train_test_split(
97
+ X, y, test_size=test_size_1, random_state=0
98
+ )
99
+ logger.info(
100
+ f"\tPerforming second split: other -> val clusters ({round(1-test_size_2,3)}) and test clusters ({test_size_2})"
101
+ )
102
+ X_val, X_test, y_val, y_test = train_test_split(
103
+ X_test, y_test, test_size=test_size_2, random_state=0
104
+ )
105
+
106
+ dna_assign = {}
107
+ for x in X_train:
108
+ dna_assign[x] = "train"
109
+ for x in X_val:
110
+ dna_assign[x] = "val"
111
+ for x in X_test:
112
+ dna_assign[x] = "test"
113
+
114
+ kept_by_split = {"train": len(X_train), "val": len(X_val), "test": len(X_test)}
115
+ return dna_assign, kept_by_split
116
+
117
+ def convert_scores(scores):
118
+ svec = [int(x) for x in scores.split(",")]
119
+ max_score = max(svec)
120
+ binary_svec = [0 if x<max_score else 1 for x in svec]
121
+ assert(svec.count(max_score)==binary_svec.count(1))
122
+ binary_svec = ",".join([str(x) for x in binary_svec])
123
+ return binary_svec
124
+
125
+ def split_bipartite_with_ratios_and_leaky(
126
+ edges,
127
+ split_names=("train", "val", "test"),
128
+ ratios=(0.8, 0.1, 0.1),
129
+ require_nonempty=False,
130
+ ratio_tolerance=None, # None = soft ratios only; 0.0 = exact band (use with care)
131
+ bigM=None,
132
+ shuffle_within_pair=False,
133
+ seed=0,
134
+ test_edges_must=None, # NEW: list of (tf,dna) with duplicates OR dict {(tf,dna): count}
135
+ ):
136
+ """
137
+ edges: list of (tf_cluster_id, dna_cluster_id). Duplicates allowed (-> weights).
138
+ test_edges_must: None, list of pairs, or dict {(tf,dna): required_count}.
139
+ - If a pair appears with required_count > 0, at least that many examples MUST be kept in TEST.
140
+ - This implicitly pins both clusters of that pair to TEST (cluster exclusivity).
141
+
142
+ Returns:
143
+ tf_assign: {tf_cluster -> split}
144
+ dna_assign: {dna_cluster -> split}
145
+ kept_by_split: {split -> kept_count} (train/val/test only)
146
+ total_kept: int
147
+ split_to_indices: {split -> [input indices]} including 'leaky_test'
148
+ split_to_edges: {split -> [(tf,dna), ...]} including 'leaky_test'
149
+ """
150
+ # Aggregate counts per pair
151
+ w = Counter(edges)
152
+ tfs = {t for (t, _) in w}
153
+ dnas = {d for (_, d) in w}
154
+ S = list(split_names)
155
+ rs = dict(zip(S, ratios))
156
+ N = sum(w.values())
157
+ if bigM is None:
158
+ bigM = 1000 * max(1, N)
159
+
160
+ # Index original edges so we can return a per-example split
161
+ pair_to_indices = defaultdict(list)
162
+ for idx, (c, d) in enumerate(edges):
163
+ pair_to_indices[(c, d)].append(idx)
164
+
165
+ if shuffle_within_pair:
166
+ rng = random.Random(seed)
167
+ for key in pair_to_indices:
168
+ rng.shuffle(pair_to_indices[key])
169
+
170
+ # Parse required test edges
171
+ req_test = Counter()
172
+ if test_edges_must:
173
+ if isinstance(test_edges_must, dict):
174
+ for k, v in test_edges_must.items():
175
+ if not isinstance(k, tuple) or len(k) != 2:
176
+ raise ValueError(
177
+ "test_edges_must dict keys must be (tf_cluster, dna_cluster)"
178
+ )
179
+ if v < 0:
180
+ raise ValueError("required_count must be non-negative")
181
+ if v:
182
+ req_test[k] += int(v)
183
+ else:
184
+ # assume iterable of pairs
185
+ req_test = Counter(test_edges_must)
186
+ # Validate against available counts
187
+ for pair, req in req_test.items():
188
+ if pair not in w:
189
+ raise ValueError(f"Required test pair {pair} not present in edges.")
190
+ if req > w[pair]:
191
+ raise ValueError(
192
+ f"Required count {req} for {pair} exceeds available {w[pair]}."
193
+ )
194
+
195
+ # Build solver
196
+ solver = pywraplp.Solver.CreateSolver("CBC")
197
+ if solver is None:
198
+ raise RuntimeError("Could not create CBC solver.")
199
+
200
+ # Binary cluster assignments
201
+ x = {(c, s): solver.BoolVar(f"x[{c},{s}]") for c in tfs for s in S}
202
+ y = {(d, s): solver.BoolVar(f"y[{d},{s}]") for d in dnas for s in S}
203
+
204
+ # Each cluster in exactly one split
205
+ for c in tfs:
206
+ solver.Add(sum(x[c, s] for s in S) == 1)
207
+ for d in dnas:
208
+ solver.Add(sum(y[d, s] for s in S) == 1)
209
+
210
+ # Integer kept counts per pair and split (allow partial within-pair)
211
+ k = {
212
+ ((c, d), s): solver.IntVar(0, w[(c, d)], f"k[{c},{d},{s}]")
213
+ for (c, d) in w
214
+ for s in S
215
+ }
216
+
217
+ # Only keep in split s if both endpoint clusters are assigned to s
218
+ for (c, d), wt in w.items():
219
+ for s in S:
220
+ solver.Add(k[((c, d), s)] <= wt * x[c, s])
221
+ solver.Add(k[((c, d), s)] <= wt * y[d, s])
222
+
223
+ # Enforce minimum kept counts in TEST for required pairs
224
+ for (c, d), req in req_test.items():
225
+ solver.Add(k[((c, d), "test")] >= req)
226
+
227
+ # Optional: ensure each split has at least one cluster (feasibility depends on counts)
228
+ if require_nonempty:
229
+ for s in S:
230
+ solver.Add(sum(x[c, s] for c in tfs) + sum(y[d, s] for d in dnas) >= 1)
231
+
232
+ # Kept counts per split and total
233
+ K = {s: solver.IntVar(0, N, f"K[{s}]") for s in S}
234
+ for s in S:
235
+ solver.Add(K[s] == sum(k[((c, d), s)] for (c, d) in w))
236
+ T = solver.IntVar(0, N, "T")
237
+ solver.Add(T == sum(K[s] for s in S))
238
+
239
+ # Ratio deviation: K_s - r_s * T = d+ - d-
240
+ dpos = {s: solver.NumVar(0, solver.infinity(), f"dpos[{s}]") for s in S}
241
+ dneg = {s: solver.NumVar(0, solver.infinity(), f"dneg[{s}]") for s in S}
242
+ for s in S:
243
+ solver.Add(K[s] - rs[s] * T == dpos[s] - dneg[s])
244
+
245
+ # Optional hard band around target ratios
246
+ if ratio_tolerance is not None:
247
+ eps = float(ratio_tolerance)
248
+ for s in S:
249
+ solver.Add(K[s] >= (rs[s] - eps) * T)
250
+ solver.Add(K[s] <= (rs[s] + eps) * T)
251
+
252
+ # Objective: maximize T then minimize total deviation
253
+ obj = solver.Objective()
254
+ obj.SetMaximization()
255
+ obj.SetCoefficient(T, float(bigM))
256
+ for s in S:
257
+ obj.SetCoefficient(dpos[s], -1.0)
258
+ obj.SetCoefficient(dneg[s], -1.0)
259
+
260
+ status = solver.Solve()
261
+ if status not in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE):
262
+ raise RuntimeError(
263
+ "No feasible solution (check ratio_tolerance vs. required test edges)."
264
+ )
265
+
266
+ # Read cluster assignments
267
+ tf_assign = {c: next(s for s in S if x[c, s].solution_value() > 0.5) for c in tfs}
268
+ dna_assign = {d: next(s for s in S if y[d, s].solution_value() > 0.5) for d in dnas}
269
+
270
+ # Kept counts per split
271
+ kept_by_split = {s: int(round(K[s].solution_value())) for s in S}
272
+ total_kept = int(round(T.solution_value()))
273
+
274
+ # ---- Build per-example split assignment (including 'leaky_test') ----
275
+ split_to_indices = {s: [] for s in S}
276
+ remaining_indices = {pair: list(pair_to_indices[pair]) for pair in pair_to_indices}
277
+
278
+ # Allocate the kept examples per split (train/val/test)
279
+ for (c, d), wt in w.items():
280
+ for s in S:
281
+ cnt = int(round(k[((c, d), s)].solution_value()))
282
+ if cnt > 0:
283
+ take = remaining_indices[(c, d)][:cnt]
284
+ split_to_indices[s].extend(take)
285
+ remaining_indices[(c, d)] = remaining_indices[(c, d)][cnt:]
286
+
287
+ # Everything left becomes leaky_test
288
+ leaky_indices = []
289
+ for pair, idxs in remaining_indices.items():
290
+ if idxs:
291
+ leaky_indices.extend(idxs)
292
+
293
+ split_to_indices["leaky_test"] = leaky_indices
294
+ split_to_edges = {
295
+ s: [edges[i] for i in split_to_indices[s]] for s in split_to_indices
296
+ }
297
+
298
+ return (
299
+ tf_assign,
300
+ dna_assign,
301
+ kept_by_split,
302
+ total_kept,
303
+ split_to_indices,
304
+ split_to_edges,
305
+ )
306
+
307
+
308
+ class DSU:
309
+ def __init__(self):
310
+ self.p = {}
311
+
312
+ def find(self, x):
313
+ if x not in self.p:
314
+ self.p[x] = x
315
+ while self.p[x] != x:
316
+ self.p[x] = self.p[self.p[x]]
317
+ x = self.p[x]
318
+ return x
319
+
320
+ def union(self, a, b):
321
+ ra, rb = self.find(a), self.find(b)
322
+ if ra != rb:
323
+ self.p[rb] = ra
324
+
325
+
326
+ def split_bipartite_by_components(
327
+ edges,
328
+ split_names=("train", "val", "test"),
329
+ ratios=(0.8, 0.1, 0.1),
330
+ seed=0,
331
+ require_nonempty=False,
332
+ test_edges_must=None, # None, list[(tf,dna)], or dict{(tf,dna): count}
333
+ ):
334
+ """
335
+ Guarantees exclusivity: each TF cluster and DNA cluster appears in at most one split.
336
+ Strategy: find connected components in the TF–DNA bipartite graph and assign components wholesale.
337
+ """
338
+ rng = random.Random(seed)
339
+ w = Counter(edges) # multiplicities per pair
340
+ if not w:
341
+ raise ValueError("No edges.")
342
+
343
+ # 1) Build components with Union-Find (prefix to keep TF/DNA namespaces disjoint)
344
+ dsu = DSU()
345
+ for tf, dna in w:
346
+ dsu.union(("T", tf), ("D", dna))
347
+ comp_pairs = defaultdict(list)
348
+ comp_weight = defaultdict(int)
349
+ for (tf, dna), cnt in w.items():
350
+ root = dsu.find(("T", tf)) # component id = root of TF endpoint
351
+ comp_pairs[root].append((tf, dna))
352
+ comp_weight[root] += cnt
353
+
354
+ comps = list(comp_pairs.keys())
355
+ C = len(comps)
356
+ S = list(split_names)
357
+ rs = dict(zip(S, ratios))
358
+ N = sum(comp_weight[c] for c in comps)
359
+ target = {s: int(round(rs[s] * N)) for s in S}
360
+
361
+ # 2) Pin components that contain required TEST pairs
362
+ pinned = {} # comp_root -> pinned_split ("test")
363
+ if test_edges_must:
364
+ req = (
365
+ Counter(test_edges_must)
366
+ if not isinstance(test_edges_must, dict)
367
+ else Counter(test_edges_must)
368
+ )
369
+ # Map each required pair to its component, ensure feasibility
370
+ for (tf, dna), r in req.items():
371
+ if (tf, dna) not in w:
372
+ raise ValueError(f"Required pair {(tf,dna)} not present.")
373
+ if r > w[(tf, dna)]:
374
+ raise ValueError(
375
+ f"Required count {r} for {(tf,dna)} exceeds available {w[(tf,dna)]}."
376
+ )
377
+ comp = dsu.find(("T", tf))
378
+ if comp in pinned and pinned[comp] != "test":
379
+ raise ValueError(
380
+ f"Component conflict: already pinned to {pinned[comp]}, but {(tf,dna)} demands test."
381
+ )
382
+ pinned[comp] = "test"
383
+ # NOTE: pinning a pair pins the WHOLE component to test (to keep exclusivity).
384
+ # If you only want some edges kept in test and discard the rest, handle below when materializing.
385
+
386
+ # 3) Assign components greedily by deficit
387
+ kept_by_split = {s: 0 for s in S}
388
+ comp_assign = {} # comp_root -> split
389
+
390
+ # First assign pinned comps
391
+ for comp, split in pinned.items():
392
+ comp_assign[comp] = split
393
+ kept_by_split[split] += comp_weight[comp]
394
+
395
+ # Sort remaining components by descending weight
396
+ remaining = [c for c in comps if c not in comp_assign]
397
+ remaining.sort(key=lambda c: comp_weight[c], reverse=True)
398
+
399
+ # Ensure nonempty splits if requested (seed with largest remaining comps)
400
+ if require_nonempty:
401
+ seeds = remaining[: min(len(S), len(remaining))]
402
+ for comp, s in zip(seeds, S):
403
+ comp_assign[comp] = s
404
+ kept_by_split[s] += comp_weight[comp]
405
+ remaining = [c for c in remaining if c not in comp_assign]
406
+
407
+ for comp in remaining:
408
+ # choose split with largest deficit (target - current)
409
+ deficits = {s: target[s] - kept_by_split[s] for s in S}
410
+ best = max(deficits, key=lambda s: deficits[s])
411
+ comp_assign[comp] = best
412
+ kept_by_split[best] += comp_weight[comp]
413
+
414
+ total_kept = sum(kept_by_split.values())
415
+
416
+ # 4) Materialize per-example indices (and verify exclusivity)
417
+ pair_to_indices = defaultdict(list)
418
+ for idx, pair in enumerate(edges):
419
+ pair_to_indices[pair].append(idx)
420
+
421
+ split_to_indices = {s: [] for s in S}
422
+ for comp, s in comp_assign.items():
423
+ for pair in comp_pairs[comp]:
424
+ split_to_indices[s].extend(pair_to_indices[pair])
425
+
426
+ # Optional: if you pinned a comp due to a small 'must-test' count but
427
+ # want to *discard* the rest instead of keeping them in test, uncomment:
428
+ # for comp, s in comp_assign.items():
429
+ # if s == "test" and test_edges_must:
430
+ # # Keep only the required counts; dump extras to 'leaky_test'
431
+ # ...
432
+ # (Left out for clarity; default is: keep the whole component in its split.)
433
+
434
+ # 5) Build edge lists and simple cluster assignments
435
+ split_to_edges = {
436
+ s: [edges[i] for i in split_to_indices[s]] for s in split_to_indices
437
+ }
438
+ tf_assign, dna_assign = {}, {}
439
+ for comp, s in comp_assign.items():
440
+ for tf, dna in comp_pairs[comp]:
441
+ tf_assign[tf] = s
442
+ dna_assign[dna] = s
443
+
444
+ # 6) Safety check: no DNA/TF appears in multiple splits
445
+ tf_in_split = defaultdict(set)
446
+ dna_in_split = defaultdict(set)
447
+ for s, elist in split_to_edges.items():
448
+ for tf, dna in elist:
449
+ tf_in_split[tf].add(s)
450
+ dna_in_split[dna].add(s)
451
+ dup_tf = {tf: ss for tf, ss in tf_in_split.items() if len(ss) > 1}
452
+ dup_dna = {dn: ss for dn, ss in dna_in_split.items() if len(ss) > 1}
453
+ assert not dup_tf and not dup_dna, f"Exclusivity violated: {dup_tf} {dup_dna}"
454
+
455
+ return (
456
+ tf_assign,
457
+ dna_assign,
458
+ kept_by_split,
459
+ total_kept,
460
+ split_to_indices,
461
+ split_to_edges,
462
+ )
463
+
464
+
465
+ def print_split_ratios(kept_by_split):
466
+ total = sum(kept_by_split.values())
467
+ train_pcnt = 100 * kept_by_split["train"] / total
468
+ val_pcnt = 100 * kept_by_split["val"] / total
469
+ test_pcnt = 100 * kept_by_split["test"] / total
470
+ logger.info(
471
+ f"Cluster distribution - Train: {train_pcnt:.2f}%, Val: {val_pcnt:.2f}%, Test: {test_pcnt:.2f}%"
472
+ )
473
+
474
+
475
+ def make_edges(
476
+ processed_fimo_path: str, protein_cluster_path: str, dna_cluster_path: str
477
+ ):
478
+ """
479
+ Make edges for input to the splitting algorithm. Edges consist of: (tr_cluster_rep)_(dna_cluster_rep) where the cluster rep is the sequence ID
480
+ """
481
+ # Read cluser data
482
+ protein_clusters = pd.read_csv(protein_cluster_path, header=None, sep="\t")
483
+ protein_clusters.columns = ["tr_cluster_rep", "tr_seqid"]
484
+
485
+ dna_clusters = pd.read_csv(dna_cluster_path, header=None, sep="\t")
486
+ dna_clusters.columns = ["dna_cluster_rep", "dna_seqid"]
487
+
488
+ # Read datapoints
489
+ edges = pd.read_parquet(processed_fimo_path)
490
+ edges = pd.merge(edges, dna_clusters, on="dna_seqid", how="left")
491
+ edges = pd.merge(edges, protein_clusters, on="tr_seqid", how="left")
492
+ edges["edge"] = edges.apply(
493
+ lambda row: (row["tr_cluster_rep"], row["dna_cluster_rep"]), axis=1
494
+ )
495
+
496
+ logger.info(f"Total unique edges: {len(edges['edge'].unique().tolist())}")
497
+ dup_edges = edges.loc[edges.duplicated("edge")]["edge"].unique().tolist()
498
+ logger.info(f"Total edges with >1 datapoint: {len(dup_edges)}")
499
+ logger.info(
500
+ f"Total datapoints belonging to a duplicate edge: {len(edges.loc[edges['edge'].isin(dup_edges)])}"
501
+ )
502
+ return edges
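As a hedged illustration of the edge construction (the sequence IDs below are made up, and pandas is assumed to be imported as pd, as elsewhere in this module):

# two hypothetical datapoints that fall into the same protein and DNA clusters
demo = pd.DataFrame({
    "tr_seqid": ["trseqA", "trseqB"],
    "dna_seqid": ["dnaseqX", "dnaseqY"],
    "tr_cluster_rep": ["trseqA", "trseqA"],        # same protein cluster rep
    "dna_cluster_rep": ["dnaseqX", "dnaseqX"],     # same DNA cluster rep
})
demo["edge"] = demo.apply(lambda r: (r["tr_cluster_rep"], r["dna_cluster_rep"]), axis=1)
# one unique edge, two datapoints -> this pair is what the duplicate-edge logging above counts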
503
+
504
+
505
+ def check_validity(train, val, test, split_by="both"):
506
+ """
507
+ Rigorous check for no overlap
508
+ Columns = ["ID","dna_sequence","tr_sequence","tr_cluster_rep","dna_cluster_rep", "scores","split"]
509
+ """
510
+ train_ids = set(train["ID"].unique().tolist())
511
+ val_ids = set(val["ID"].unique().tolist())
512
+ test_ids = set(test["ID"].unique().tolist())
513
+
514
+ assert len(train_ids.intersection(val_ids)) == 0
515
+ assert len(train_ids.intersection(test_ids)) == 0
516
+ assert len(val_ids.intersection(test_ids)) == 0
517
+ logger.info(f"Pass! No overlap in IDs")
518
+
519
+ if split_by != "dna":
520
+ train_tr_seqs = set(train["tr_sequence"].unique().tolist())
521
+ val_tr_seqs = set(val["tr_sequence"].unique().tolist())
522
+ test_tr_seqs = set(test["tr_sequence"].unique().tolist())
523
+
524
+ assert len(train_tr_seqs.intersection(val_tr_seqs)) == 0
525
+ assert len(train_tr_seqs.intersection(test_tr_seqs)) == 0
526
+ assert len(val_tr_seqs.intersection(test_tr_seqs)) == 0
527
+ logger.info(f"Pass! No overlap in TR sequences")
528
+
529
+ train_tr_reps = set(train["tr_cluster_rep"].unique().tolist())
530
+ val_tr_reps = set(val["tr_cluster_rep"].unique().tolist())
531
+ test_tr_reps = set(test["tr_cluster_rep"].unique().tolist())
532
+
533
+ assert len(train_tr_reps.intersection(val_tr_reps)) == 0
534
+ assert len(train_tr_reps.intersection(test_tr_reps)) == 0
535
+ assert len(val_tr_reps.intersection(test_tr_reps)) == 0
536
+ logger.info(f"Pass! No overlap in TR cluster reps")
537
+
538
+ if split_by != "protein":
539
+ train_dna_seqs = set(train["dna_sequence"].unique().tolist())
540
+ val_dna_seqs = set(val["dna_sequence"].unique().tolist())
541
+ test_dna_seqs = set(test["dna_sequence"].unique().tolist())
542
+
543
+ assert len(train_dna_seqs.intersection(val_dna_seqs)) == 0
544
+ assert len(train_dna_seqs.intersection(test_dna_seqs)) == 0
545
+ assert len(val_dna_seqs.intersection(test_dna_seqs)) == 0
546
+ logger.info(f"Pass! No overlap in DNA sequences")
547
+
548
+ train_dna_reps = set(train["dna_cluster_rep"].unique().tolist())
549
+ val_dna_reps = set(val["dna_cluster_rep"].unique().tolist())
550
+ test_dna_reps = set(test["dna_cluster_rep"].unique().tolist())
551
+
552
+ assert len(train_dna_reps.intersection(val_dna_reps)) == 0
553
+ assert len(train_dna_reps.intersection(test_dna_reps)) == 0
554
+ assert len(val_dna_reps.intersection(test_dna_reps)) == 0
555
+ logger.info(f"Pass! No overlap in DNA cluster reps")
556
+
557
+
558
+ def augment_rc(df):
559
+ """
560
+ Get the reverse complement and add it as a datapoint, effectively doubling the dataset.
561
+ Also flip the orientation of the scores
562
+
563
+ columns = ["ID","dna_sequence","tr_sequence","tr_cluster_rep","dna_cluster_rep", "scores","split"]
564
+ """
565
+ df_rc = df.copy(deep=True)
566
+
567
+ df_rc["dna_sequence"] = df_rc["dna_sequence"].apply(
568
+ lambda x: get_reverse_complement(x)
569
+ )
570
+ df_rc["ID"] = df_rc["ID"] + "_rc"
571
+ df_rc["scores"] = df_rc["scores"].apply(lambda s: ",".join(s.split(",")[::-1]))
572
+
573
+ final_df = pd.concat([df, df_rc]).reset_index(drop=True)
574
+
575
+ return final_df
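A minimal sketch of the augmentation on one made-up row (hypothetical 4-bp sequence and scores), showing why the score string is reversed along with the sequence:

# original row, scores listed 5'->3' per base
#   {"ID": "tr1_dna1",    "dna_sequence": "AACG", "scores": "0,0,5,5"}
# appended reverse-complement row: sequence reverse-complemented, scores reversed to stay per-base aligned
#   {"ID": "tr1_dna1_rc", "dna_sequence": "CGTT", "scores": "5,5,0,0"}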
576
+
577
+
578
+ def main(cfg: DictConfig):
579
+ """
580
+ Take a set of DNA clusters + protein clusters, and create the best possible splits into train/val/test.
581
+ """
582
+ # construct edges from training data
583
+ edge_df = make_edges(
584
+ processed_fimo_path=Path(root) / cfg.data_task.input_data_path,
585
+ protein_cluster_path=Path(root) / cfg.data_task.cluster_output_paths.protein,
586
+ dna_cluster_path=Path(root) / cfg.data_task.cluster_output_paths.dna,
587
+ )
588
+ edges = edge_df["edge"].unique().tolist()
589
+
590
+ # figure out if we actually even have a conflict
591
+ total_proteins = len(edge_df["tr_seqid"].unique().tolist())
592
+ total_protein_clusters = len(edge_df["tr_cluster_rep"].unique().tolist())
593
+
594
+ no_protein_overlap = (total_proteins) == (total_protein_clusters)
595
+ logger.info(f"All proteins are in their own clusters: {no_protein_overlap}")
596
+
597
+ if cfg.data_task.split_by == "dna":
598
+ if cfg.data_task.p_exclude:
599
+ return
600
+ else:
601
+ logger.info(f"Easy split: all proteins are in their own clusters.")
602
+ dna_clusters = edge_df["dna_cluster_rep"].unique().tolist()
603
+ results = split_bipartite_fast(
604
+ dna_clusters,
605
+ split_names=("train", "val", "test"),
606
+ ratios=(
607
+ cfg.data_task.train_ratio,
608
+ cfg.data_task.val_ratio,
609
+ cfg.data_task.test_ratio,
610
+ ),
611
+ )
612
+ dna_assign, kept_by_split = results
613
+
614
+ # assign datapoints to cluster by their DNA cluster rep
615
+ edge_df["split"] = edge_df["dna_cluster_rep"].map(dna_assign)
616
+ else:
617
+ results = split_bipartite_by_components(
618
+ edges,
619
+ split_names=("train", "val", "test"),
620
+ ratios=(
621
+ cfg.data_task.train_ratio,
622
+ cfg.data_task.val_ratio,
623
+ cfg.data_task.test_ratio,
624
+ ),
625
+ require_nonempty=cfg.data_task.require_nonempty,
626
+ seed=cfg.data_task.seed,
627
+ test_edges_must=None,
628
+ )
629
+
630
+ (
631
+ tf_assign,
632
+ dna_assign,
633
+ kept_by_split,
634
+ total_kept,
635
+ split_to_indices,
636
+ split_to_edges,
637
+ ) = results
638
+
639
+ # Map each sample to its split
640
+ print(tf_assign)
641
+ print(dna_assign)
642
+ edge_df["tr_split"] = edge_df["tr_cluster_rep"].map(tf_assign)
643
+ edge_df["dna_split"] = edge_df["dna_cluster_rep"].map(dna_assign)
644
+ edge_df["same_split"] = (
645
+ edge_df["tr_split"] == edge_df["dna_split"]
646
+ ) # should always be true if easy cluster
647
+ edge_df["split"] = edge_df["tr_split"]
648
+ print(edge_df)
649
+ edge_df["split"] = np.where(
650
+ edge_df["same_split"],
651
+ edge_df["split"], # keep existing split if same_split == True
652
+ "leak", # otherwise leak
653
+ )
654
+ print(edge_df)
655
+
656
+ # Print ratios: hopefully close to desired (e.g. 80/10/10)
657
+ print_split_ratios(kept_by_split)
658
+
659
+ # Make train, val, test sets
660
+ # make sure no ID is duplicate
661
+ assert len(edge_df["ID"].unique()) == len(edge_df)
662
+ split_cols = [
663
+ "ID",
664
+ "dna_sequence",
665
+ "tr_sequence",
666
+ "tr_cluster_rep",
667
+ "dna_cluster_rep",
668
+ "scores",
669
+ "split",
670
+ ]
671
+ train = edge_df.loc[edge_df["split"] == "train"].reset_index(drop=True)[split_cols]
672
+ val = edge_df.loc[edge_df["split"] == "val"].reset_index(drop=True)[split_cols]
673
+ test = edge_df.loc[edge_df["split"] == "test"].reset_index(drop=True)[split_cols]
674
+
675
+ # ensure there is no overlap
676
+ check_validity(train, val, test, split_by=cfg.data_task.split_by)
677
+
678
+ total = sum([len(train), len(val), len(test)])
679
+ logger.info(f"Length of train dataset: {len(train)} ({100*len(train)/total:.2f}%)")
680
+ logger.info(f"Length of val dataset: {len(val)} ({100*len(val)/total:.2f}%)")
681
+ logger.info(f"Length of test dataset: {len(test)} ({100*len(test)/total:.2f}%)")
682
+ logger.info(f"Total sequences = {total}. Same as edges size? {total==len(edge_df)}")
683
+
684
+ og_unique_dna = pd.concat([train, val, test])
685
+ og_unique_dna = len(og_unique_dna["dna_sequence"].unique())
686
+
687
+ ## Now do RC data augmentation if asked
688
+ if cfg.data_task.augment_rc:
689
+ train = augment_rc(train)
690
+ val = augment_rc(val)
691
+ test = augment_rc(test)
692
+
693
+ logger.info(f"Added reverse complement sequences to train, val, and test.")
694
+
695
+ check_validity(train, val, test, split_by=cfg.data_task.split_by)
696
+
697
+ total = sum([len(train), len(val), len(test)])
698
+ logger.info(
699
+ f"Length of train dataset: {len(train)} ({100*len(train)/total:.2f}%)"
700
+ )
701
+ logger.info(f"Length of val dataset: {len(val)} ({100*len(val)/total:.2f}%)")
702
+ logger.info(f"Length of test dataset: {len(test)} ({100*len(test)/total:.2f}%)")
703
+ logger.info(
704
+ f"Total sequences = {total}. Same as edges size? {total==len(edge_df)}"
705
+ )
706
+
707
+ # since we've added all these new DNA sequences, we do need a new mapping of seq id to dna sequence
708
+ all_data = pd.concat([train, val, test])
709
+ all_data["dna_seqid"] = all_data["ID"].str.split("_", n=1, expand=True)[1]
710
+ dna_dict = dict(zip(all_data["dna_seqid"], all_data["dna_sequence"]))
711
+ assert len(dna_dict) == len(all_data.drop_duplicates(["dna_sequence"]))
712
+ new_map_path = str(Path(root) / cfg.data_task.dna_map_path).replace(
713
+ ".json", "_with_rc.json"
714
+ )
715
+
716
+ with open(new_map_path, "w") as f:
717
+ json.dump(dna_dict, f, indent=2)
718
+ logger.info(
719
+ f"Saved DNA maps with reverse complements (len {len(dna_dict)}=2*original map of len {og_unique_dna}=={len(dna_dict)==2*og_unique_dna}) to {new_map_path}"
720
+ )
721
+
722
+ # create the output dir
723
+ split_out_dir = Path(root) / cfg.data_task.split_out_dir
724
+ os.makedirs(split_out_dir, exist_ok=True)
725
+
726
+ # add binary_scores to allow other training modes
727
+ train["fimo_binary_sores"] = train["scores"].apply(lambda x: convert_scores(x))
728
+ val["fimo_binary_sores"] = val["scores"].apply(lambda x: convert_scores(x))
729
+ test["fimo_binary_sores"] = test["scores"].apply(lambda x: convert_scores(x))
730
+
731
+ # select final cols and save
732
+ split_final_cols = ["ID", "dna_sequence", "tr_sequence", "scores", "fimo_binary_sores", "split"]
733
+ train[split_final_cols].to_csv(split_out_dir / "train.csv", index=False)
734
+ val[split_final_cols].to_csv(split_out_dir / "val.csv", index=False)
735
+ test[split_final_cols].to_csv(split_out_dir / "test.csv", index=False)
736
+ logger.info(f"Saved all splits to {split_out_dir}")
dpacman/data_tasks/split/remap.py CHANGED
@@ -15,6 +15,74 @@ from dpacman.utils import pylogger
15
  root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
16
  logger = pylogger.RankedLogger(__name__, rank_zero_only=True)
17
18
 
19
  def split_bipartite_fast(
20
  dna_clusters,
@@ -50,354 +118,22 @@ def split_bipartite_fast(
50
  kept_by_split = {"train": len(X_train), "val": len(X_val), "test": len(X_test)}
51
  return dna_assign, kept_by_split
52
 
53
- def convert_scores(scores):
 
 
 
 
54
  svec = [int(x) for x in scores.split(",")]
55
  max_score = max(svec)
56
- binary_svec = [0 if x<max_score else 1 for x in svec]
57
- assert(svec.count(max_score)==binary_svec.count(1))
 
 
 
 
58
  binary_svec = ",".join([str(x) for x in binary_svec])
59
  return binary_svec
60
-
61
- def split_bipartite_with_ratios_and_leaky(
62
- edges,
63
- split_names=("train", "val", "test"),
64
- ratios=(0.8, 0.1, 0.1),
65
- require_nonempty=False,
66
- ratio_tolerance=None, # None = soft ratios only; 0.0 = exact band (use with care)
67
- bigM=None,
68
- shuffle_within_pair=False,
69
- seed=0,
70
- test_edges_must=None, # NEW: list of (tf,dna) with duplicates OR dict {(tf,dna): count}
71
- ):
72
- """
73
- edges: list of (tf_cluster_id, dna_cluster_id). Duplicates allowed (-> weights).
74
- test_edges_must: None, list of pairs, or dict {(tf,dna): required_count}.
75
- - If a pair appears with required_count > 0, at least that many examples MUST be kept in TEST.
76
- - This implicitly pins both clusters of that pair to TEST (cluster exclusivity).
77
-
78
- Returns:
79
- tf_assign: {tf_cluster -> split}
80
- dna_assign: {dna_cluster -> split}
81
- kept_by_split: {split -> kept_count} (train/val/test only)
82
- total_kept: int
83
- split_to_indices: {split -> [input indices]} including 'leaky_test'
84
- split_to_edges: {split -> [(tf,dna), ...]} including 'leaky_test'
85
- """
86
- # Aggregate counts per pair
87
- w = Counter(edges)
88
- tfs = {t for (t, _) in w}
89
- dnas = {d for (_, d) in w}
90
- S = list(split_names)
91
- rs = dict(zip(S, ratios))
92
- N = sum(w.values())
93
- if bigM is None:
94
- bigM = 1000 * max(1, N)
95
-
96
- # Index original edges so we can return a per-example split
97
- pair_to_indices = defaultdict(list)
98
- for idx, (c, d) in enumerate(edges):
99
- pair_to_indices[(c, d)].append(idx)
100
-
101
- if shuffle_within_pair:
102
- rng = random.Random(seed)
103
- for key in pair_to_indices:
104
- rng.shuffle(pair_to_indices[key])
105
-
106
- # Parse required test edges
107
- req_test = Counter()
108
- if test_edges_must:
109
- if isinstance(test_edges_must, dict):
110
- for k, v in test_edges_must.items():
111
- if not isinstance(k, tuple) or len(k) != 2:
112
- raise ValueError(
113
- "test_edges_must dict keys must be (tf_cluster, dna_cluster)"
114
- )
115
- if v < 0:
116
- raise ValueError("required_count must be non-negative")
117
- if v:
118
- req_test[k] += int(v)
119
- else:
120
- # assume iterable of pairs
121
- req_test = Counter(test_edges_must)
122
- # Validate against available counts
123
- for pair, req in req_test.items():
124
- if pair not in w:
125
- raise ValueError(f"Required test pair {pair} not present in edges.")
126
- if req > w[pair]:
127
- raise ValueError(
128
- f"Required count {req} for {pair} exceeds available {w[pair]}."
129
- )
130
-
131
- # Build solver
132
- solver = pywraplp.Solver.CreateSolver("CBC")
133
- if solver is None:
134
- raise RuntimeError("Could not create CBC solver.")
135
-
136
- # Binary cluster assignments
137
- x = {(c, s): solver.BoolVar(f"x[{c},{s}]") for c in tfs for s in S}
138
- y = {(d, s): solver.BoolVar(f"y[{d},{s}]") for d in dnas for s in S}
139
-
140
- # Each cluster in exactly one split
141
- for c in tfs:
142
- solver.Add(sum(x[c, s] for s in S) == 1)
143
- for d in dnas:
144
- solver.Add(sum(y[d, s] for s in S) == 1)
145
-
146
- # Integer kept counts per pair and split (allow partial within-pair)
147
- k = {
148
- ((c, d), s): solver.IntVar(0, w[(c, d)], f"k[{c},{d},{s}]")
149
- for (c, d) in w
150
- for s in S
151
- }
152
-
153
- # Only keep in split s if both endpoint clusters are assigned to s
154
- for (c, d), wt in w.items():
155
- for s in S:
156
- solver.Add(k[((c, d), s)] <= wt * x[c, s])
157
- solver.Add(k[((c, d), s)] <= wt * y[d, s])
158
-
159
- # Enforce minimum kept counts in TEST for required pairs
160
- for (c, d), req in req_test.items():
161
- solver.Add(k[((c, d), "test")] >= req)
162
-
163
- # Optional: ensure each split has at least one cluster (feasibility depends on counts)
164
- if require_nonempty:
165
- for s in S:
166
- solver.Add(sum(x[c, s] for c in tfs) + sum(y[d, s] for d in dnas) >= 1)
167
-
168
- # Kept counts per split and total
169
- K = {s: solver.IntVar(0, N, f"K[{s}]") for s in S}
170
- for s in S:
171
- solver.Add(K[s] == sum(k[((c, d), s)] for (c, d) in w))
172
- T = solver.IntVar(0, N, "T")
173
- solver.Add(T == sum(K[s] for s in S))
174
-
175
- # Ratio deviation: K_s - r_s * T = d+ - d-
176
- dpos = {s: solver.NumVar(0, solver.infinity(), f"dpos[{s}]") for s in S}
177
- dneg = {s: solver.NumVar(0, solver.infinity(), f"dneg[{s}]") for s in S}
178
- for s in S:
179
- solver.Add(K[s] - rs[s] * T == dpos[s] - dneg[s])
180
-
181
- # Optional hard band around target ratios
182
- if ratio_tolerance is not None:
183
- eps = float(ratio_tolerance)
184
- for s in S:
185
- solver.Add(K[s] >= (rs[s] - eps) * T)
186
- solver.Add(K[s] <= (rs[s] + eps) * T)
187
-
188
- # Objective: maximize T then minimize total deviation
189
- obj = solver.Objective()
190
- obj.SetMaximization()
191
- obj.SetCoefficient(T, float(bigM))
192
- for s in S:
193
- obj.SetCoefficient(dpos[s], -1.0)
194
- obj.SetCoefficient(dneg[s], -1.0)
195
-
196
- status = solver.Solve()
197
- if status not in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE):
198
- raise RuntimeError(
199
- "No feasible solution (check ratio_tolerance vs. required test edges)."
200
- )
201
-
202
- # Read cluster assignments
203
- tf_assign = {c: next(s for s in S if x[c, s].solution_value() > 0.5) for c in tfs}
204
- dna_assign = {d: next(s for s in S if y[d, s].solution_value() > 0.5) for d in dnas}
205
-
206
- # Kept counts per split
207
- kept_by_split = {s: int(round(K[s].solution_value())) for s in S}
208
- total_kept = int(round(T.solution_value()))
209
-
210
- # ---- Build per-example split assignment (including 'leaky_test') ----
211
- split_to_indices = {s: [] for s in S}
212
- remaining_indices = {pair: list(pair_to_indices[pair]) for pair in pair_to_indices}
213
-
214
- # Allocate the kept examples per split (train/val/test)
215
- for (c, d), wt in w.items():
216
- for s in S:
217
- cnt = int(round(k[((c, d), s)].solution_value()))
218
- if cnt > 0:
219
- take = remaining_indices[(c, d)][:cnt]
220
- split_to_indices[s].extend(take)
221
- remaining_indices[(c, d)] = remaining_indices[(c, d)][cnt:]
222
-
223
- # Everything left becomes leaky_test
224
- leaky_indices = []
225
- for pair, idxs in remaining_indices.items():
226
- if idxs:
227
- leaky_indices.extend(idxs)
228
-
229
- split_to_indices["leaky_test"] = leaky_indices
230
- split_to_edges = {
231
- s: [edges[i] for i in split_to_indices[s]] for s in split_to_indices
232
- }
233
-
234
- return (
235
- tf_assign,
236
- dna_assign,
237
- kept_by_split,
238
- total_kept,
239
- split_to_indices,
240
- split_to_edges,
241
- )
242
-
243
-
244
- class DSU:
245
- def __init__(self):
246
- self.p = {}
247
-
248
- def find(self, x):
249
- if x not in self.p:
250
- self.p[x] = x
251
- while self.p[x] != x:
252
- self.p[x] = self.p[self.p[x]]
253
- x = self.p[x]
254
- return x
255
-
256
- def union(self, a, b):
257
- ra, rb = self.find(a), self.find(b)
258
- if ra != rb:
259
- self.p[rb] = ra
260
-
261
-
262
- def split_bipartite_by_components(
263
- edges,
264
- split_names=("train", "val", "test"),
265
- ratios=(0.8, 0.1, 0.1),
266
- seed=0,
267
- require_nonempty=False,
268
- test_edges_must=None, # None, list[(tf,dna)], or dict{(tf,dna): count}
269
- ):
270
- """
271
- Guarantees exclusivity: each TF cluster and DNA cluster appears in at most one split.
272
- Strategy: find connected components in the TF–DNA bipartite graph and assign components wholesale.
273
- """
274
- rng = random.Random(seed)
275
- w = Counter(edges) # multiplicities per pair
276
- if not w:
277
- raise ValueError("No edges.")
278
-
279
- # 1) Build components with Union-Find (prefix to keep TF/DNA namespaces disjoint)
280
- dsu = DSU()
281
- for tf, dna in w:
282
- dsu.union(("T", tf), ("D", dna))
283
- comp_pairs = defaultdict(list)
284
- comp_weight = defaultdict(int)
285
- for (tf, dna), cnt in w.items():
286
- root = dsu.find(("T", tf)) # component id = root of TF endpoint
287
- comp_pairs[root].append((tf, dna))
288
- comp_weight[root] += cnt
289
-
290
- comps = list(comp_pairs.keys())
291
- C = len(comps)
292
- S = list(split_names)
293
- rs = dict(zip(S, ratios))
294
- N = sum(comp_weight[c] for c in comps)
295
- target = {s: int(round(rs[s] * N)) for s in S}
296
-
297
- # 2) Pin components that contain required TEST pairs
298
- pinned = {} # comp_root -> pinned_split ("test")
299
- if test_edges_must:
300
- req = (
301
- Counter(test_edges_must)
302
- if not isinstance(test_edges_must, dict)
303
- else Counter(test_edges_must)
304
- )
305
- # Map each required pair to its component, ensure feasibility
306
- for (tf, dna), r in req.items():
307
- if (tf, dna) not in w:
308
- raise ValueError(f"Required pair {(tf,dna)} not present.")
309
- if r > w[(tf, dna)]:
310
- raise ValueError(
311
- f"Required count {r} for {(tf,dna)} exceeds available {w[(tf,dna)]}."
312
- )
313
- comp = dsu.find(("T", tf))
314
- if comp in pinned and pinned[comp] != "test":
315
- raise ValueError(
316
- f"Component conflict: already pinned to {pinned[comp]}, but {(tf,dna)} demands test."
317
- )
318
- pinned[comp] = "test"
319
- # NOTE: pinning a pair pins the WHOLE component to test (to keep exclusivity).
320
- # If you only want some edges kept in test and discard the rest, handle below when materializing.
321
-
322
- # 3) Assign components greedily by deficit
323
- kept_by_split = {s: 0 for s in S}
324
- comp_assign = {} # comp_root -> split
325
-
326
- # First assign pinned comps
327
- for comp, split in pinned.items():
328
- comp_assign[comp] = split
329
- kept_by_split[split] += comp_weight[comp]
330
-
331
- # Sort remaining components by descending weight
332
- remaining = [c for c in comps if c not in comp_assign]
333
- remaining.sort(key=lambda c: comp_weight[c], reverse=True)
334
-
335
- # Ensure nonempty splits if requested (seed with largest remaining comps)
336
- if require_nonempty:
337
- seeds = remaining[: min(len(S), len(remaining))]
338
- for comp, s in zip(seeds, S):
339
- comp_assign[comp] = s
340
- kept_by_split[s] += comp_weight[comp]
341
- remaining = [c for c in remaining if c not in comp_assign]
342
-
343
- for comp in remaining:
344
- # choose split with largest deficit (target - current)
345
- deficits = {s: target[s] - kept_by_split[s] for s in S}
346
- best = max(deficits, key=lambda s: deficits[s])
347
- comp_assign[comp] = best
348
- kept_by_split[best] += comp_weight[comp]
349
-
350
- total_kept = sum(kept_by_split.values())
351
-
352
- # 4) Materialize per-example indices (and verify exclusivity)
353
- pair_to_indices = defaultdict(list)
354
- for idx, pair in enumerate(edges):
355
- pair_to_indices[pair].append(idx)
356
-
357
- split_to_indices = {s: [] for s in S}
358
- for comp, s in comp_assign.items():
359
- for pair in comp_pairs[comp]:
360
- split_to_indices[s].extend(pair_to_indices[pair])
361
-
362
- # Optional: if you pinned a comp due to a small 'must-test' count but
363
- # want to *discard* the rest instead of keeping them in test, uncomment:
364
- # for comp, s in comp_assign.items():
365
- # if s == "test" and test_edges_must:
366
- # # Keep only the required counts; dump extras to 'leaky_test'
367
- # ...
368
- # (Left out for clarity; default is: keep the whole component in its split.)
369
-
370
- # 5) Build edge lists and simple cluster assignments
371
- split_to_edges = {
372
- s: [edges[i] for i in split_to_indices[s]] for s in split_to_indices
373
- }
374
- tf_assign, dna_assign = {}, {}
375
- for comp, s in comp_assign.items():
376
- for tf, dna in comp_pairs[comp]:
377
- tf_assign[tf] = s
378
- dna_assign[dna] = s
379
-
380
- # 6) Safety check: no DNA/TF appears in multiple splits
381
- tf_in_split = defaultdict(set)
382
- dna_in_split = defaultdict(set)
383
- for s, elist in split_to_edges.items():
384
- for tf, dna in elist:
385
- tf_in_split[tf].add(s)
386
- dna_in_split[dna].add(s)
387
- dup_tf = {tf: ss for tf, ss in tf_in_split.items() if len(ss) > 1}
388
- dup_dna = {dn: ss for dn, ss in dna_in_split.items() if len(ss) > 1}
389
- assert not dup_tf and not dup_dna, f"Exclusivity violated: {dup_tf} {dup_dna}"
390
-
391
- return (
392
- tf_assign,
393
- dna_assign,
394
- kept_by_split,
395
- total_kept,
396
- split_to_indices,
397
- split_to_edges,
398
- )
399
-
400
-
401
  def print_split_ratios(kept_by_split):
402
  total = sum(kept_by_split.values())
403
  train_pcnt = 100 * kept_by_split["train"] / total
@@ -452,39 +188,57 @@ def check_validity(train, val, test, split_by="both"):
452
  assert len(val_ids.intersection(test_ids)) == 0
453
  logger.info(f"Pass! No overlap in IDs")
454
 
455
- if split_by != "dna":
456
- train_tr_seqs = set(train["tr_sequence"].unique().tolist())
457
- val_tr_seqs = set(val["tr_sequence"].unique().tolist())
458
- test_tr_seqs = set(test["tr_sequence"].unique().tolist())
 
 
459
 
 
460
  assert len(train_tr_seqs.intersection(val_tr_seqs)) == 0
461
  assert len(train_tr_seqs.intersection(test_tr_seqs)) == 0
462
  assert len(val_tr_seqs.intersection(test_tr_seqs)) == 0
463
  logger.info(f"Pass! No overlap in TR sequences")
464
 
465
- train_tr_reps = set(train["tr_cluster_rep"].unique().tolist())
466
- val_tr_reps = set(val["tr_cluster_rep"].unique().tolist())
467
- test_tr_reps = set(test["tr_cluster_rep"].unique().tolist())
468
-
469
  assert len(train_tr_reps.intersection(val_tr_reps)) == 0
470
  assert len(train_tr_reps.intersection(test_tr_reps)) == 0
471
  assert len(val_tr_reps.intersection(test_tr_reps)) == 0
472
  logger.info(f"Pass! No overlap in TR cluster reps")
473
 
474
  if split_by != "protein":
475
- train_dna_seqs = set(train["dna_sequence"].unique().tolist())
476
- val_dna_seqs = set(val["dna_sequence"].unique().tolist())
477
- test_dna_seqs = set(test["dna_sequence"].unique().tolist())
478
-
479
  assert len(train_dna_seqs.intersection(val_dna_seqs)) == 0
480
  assert len(train_dna_seqs.intersection(test_dna_seqs)) == 0
481
  assert len(val_dna_seqs.intersection(test_dna_seqs)) == 0
482
  logger.info(f"Pass! No overlap in DNA sequences")
483
 
484
- train_dna_reps = set(train["dna_cluster_rep"].unique().tolist())
485
- val_dna_reps = set(val["dna_cluster_rep"].unique().tolist())
486
- test_dna_reps = set(test["dna_cluster_rep"].unique().tolist())
487
-
488
  assert len(train_dna_reps.intersection(val_dna_reps)) == 0
489
  assert len(train_dna_reps.intersection(test_dna_reps)) == 0
490
  assert len(val_dna_reps.intersection(test_dna_reps)) == 0
@@ -531,8 +285,23 @@ def main(cfg: DictConfig):
531
  logger.info(f"All proteins are in their own clusters: {no_protein_overlap}")
532
 
533
  if cfg.data_task.split_by == "dna":
534
- if cfg.data_task.p_exclude:
535
- return
 
536
  else:
537
  logger.info(f"Easy split: all proteins are in their own clusters.")
538
  dna_clusters = edge_df["dna_cluster_rep"].unique().tolist()
@@ -549,75 +318,42 @@ def main(cfg: DictConfig):
549
 
550
  # assign datapoints to cluster by their DNA cluster rep
551
  edge_df["split"] = edge_df["dna_cluster_rep"].map(dna_assign)
552
- else:
553
- results = split_bipartite_by_components(
554
- edges,
555
- split_names=("train", "val", "test"),
556
- ratios=(
557
- cfg.data_task.train_ratio,
558
- cfg.data_task.val_ratio,
559
- cfg.data_task.test_ratio,
560
- ),
561
- require_nonempty=cfg.data_task.require_nonempty,
562
- seed=cfg.data_task.seed,
563
- test_edges_must=None,
564
- )
565
-
566
- (
567
- tf_assign,
568
- dna_assign,
569
- kept_by_split,
570
- total_kept,
571
- split_to_indices,
572
- split_to_edges,
573
- ) = results
574
-
575
- # Map each sample to its split
576
- print(tf_assign)
577
- print(dna_assign)
578
- edge_df["tr_split"] = edge_df["tr_cluster_rep"].map(tf_assign)
579
- edge_df["dna_split"] = edge_df["dna_cluster_rep"].map(dna_assign)
580
- edge_df["same_split"] = (
581
- edge_df["tr_split"] == edge_df["dna_split"]
582
- ) # should always be true if easy cluster
583
- edge_df["split"] = edge_df["tr_split"]
584
- print(edge_df)
585
- edge_df["split"] = np.where(
586
- edge_df["same_split"],
587
- edge_df["split"], # keep existing split if same_split == True
588
- "leak", # otherwise leak
589
- )
590
- print(edge_df)
591
-
592
- # Print ratios: hopefully close to desired (e.g. 80/10/10)
593
- print_split_ratios(kept_by_split)
594
-
595
- # Make train, val, test sets
596
- # make sure no ID is duplicate
597
- assert len(edge_df["ID"].unique()) == len(edge_df)
598
- split_cols = [
599
- "ID",
600
- "dna_sequence",
601
- "tr_sequence",
602
- "tr_cluster_rep",
603
- "dna_cluster_rep",
604
- "scores",
605
- "split",
606
- ]
607
- train = edge_df.loc[edge_df["split"] == "train"].reset_index(drop=True)[split_cols]
608
- val = edge_df.loc[edge_df["split"] == "val"].reset_index(drop=True)[split_cols]
609
- test = edge_df.loc[edge_df["split"] == "test"].reset_index(drop=True)[split_cols]
610
 
611
  # ensure there is no overlap
612
  check_validity(train, val, test, split_by=cfg.data_task.split_by)
613
 
614
- total = sum([len(train), len(val), len(test)])
615
  logger.info(f"Length of train dataset: {len(train)} ({100*len(train)/total:.2f}%)")
616
  logger.info(f"Length of val dataset: {len(val)} ({100*len(val)/total:.2f}%)")
617
  logger.info(f"Length of test dataset: {len(test)} ({100*len(test)/total:.2f}%)")
 
618
  logger.info(f"Total sequences = {total}. Same as edges size? {total==len(edge_df)}")
619
 
620
- og_unique_dna = pd.concat([train, val, test])
621
  og_unique_dna = len(og_unique_dna["dna_sequence"].unique())
622
 
623
  ## Now do RC data augmentation if asked
@@ -625,23 +361,25 @@ def main(cfg: DictConfig):
625
  train = augment_rc(train)
626
  val = augment_rc(val)
627
  test = augment_rc(test)
 
628
 
629
- logger.info(f"Added reverse complement sequences to train, val, and test.")
630
 
631
  check_validity(train, val, test, split_by=cfg.data_task.split_by)
632
 
633
- total = sum([len(train), len(val), len(test)])
634
  logger.info(
635
  f"Length of train dataset: {len(train)} ({100*len(train)/total:.2f}%)"
636
  )
637
  logger.info(f"Length of val dataset: {len(val)} ({100*len(val)/total:.2f}%)")
638
  logger.info(f"Length of test dataset: {len(test)} ({100*len(test)/total:.2f}%)")
 
639
  logger.info(
640
  f"Total sequences = {total}. Same as edges size? {total==len(edge_df)}"
641
  )
642
 
643
- # since we've added all these new DNA sequences, we do need a new apping of seq id to dna sequence
644
- all_data = pd.concat([train, val, test])
645
  all_data["dna_seqid"] = all_data["ID"].str.split("_", n=1, expand=True)[1]
646
  dna_dict = dict(zip(all_data["dna_seqid"], all_data["dna_sequence"]))
647
  assert len(dna_dict) == len(all_data.drop_duplicates(["dna_sequence"]))
@@ -660,13 +398,15 @@ def main(cfg: DictConfig):
660
  os.makedirs(split_out_dir, exist_ok=True)
661
 
662
  # add binary_scores to allow other training modes
663
- train["fimo_binary_sores"] = train["scores"].apply(lambda x: convert_scores(x))
664
- val["fimo_binary_sores"] = val["scores"].apply(lambda x: convert_scores(x))
665
- test["fimo_binary_sores"] = test["scores"].apply(lambda x: convert_scores(x))
 
666
 
667
  # slect final cols and save
668
  split_final_cols = ["ID", "dna_sequence", "tr_sequence", "scores", "fimo_binary_sores", "split"]
669
  train[split_final_cols].to_csv(split_out_dir / "train.csv", index=False)
670
  val[split_final_cols].to_csv(split_out_dir / "val.csv", index=False)
671
  test[split_final_cols].to_csv(split_out_dir / "test.csv", index=False)
 
672
  logger.info(f"Saved all splits to {split_out_dir}")
 
15
  root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
16
  logger = pylogger.RankedLogger(__name__, rank_zero_only=True)
17
 
18
+ def split_with_predefined_test(
19
+ full_df = pd.DataFrame(),
20
+ split_names=("train", "val", "test"),
21
+ test_trs=None,
22
+ test_dnas=None,
23
+ ratios=(0.8, 0.1, 0.1),
24
+ ):
25
+ """
26
+ Method for splitting into train and val with a predefined test set.
27
+ The proteins in the test set, and the DNA clusters of the DNAs they're associated with, must be excluded from train and val.
28
+ The remaining rows for train and val are split to preserve 80/10/10 as best as possible.
29
+ """
30
+ test = full_df.copy(deep=True)
31
+ if test_trs is not None:
32
+ test = test.loc[test["tr_seqid"].isin(test_trs)].reset_index(drop=True)
33
+ if test_dnas is not None:
34
+ test = test.loc[test["dna_seqid"].isin(test_dnas)].reset_index(drop=True)
35
+
36
+ tr_clusters_to_exclude = test["tr_cluster_rep"].unique().tolist()
37
+ dna_clusters_to_exclude = test["dna_cluster_rep"].unique().tolist()
38
+
39
+ remaining = full_df.loc[
40
+ (~full_df["tr_cluster_rep"].isin(tr_clusters_to_exclude)) &
41
+ (~full_df["dna_cluster_rep"].isin(dna_clusters_to_exclude))
42
+ ].reset_index(drop=True)
43
+
44
+ test_ids = test["ID"].unique().tolist()
45
+ remaining_ids = remaining["ID"].unique().tolist()
46
+ remaining_clusters = remaining["dna_cluster_rep"].unique().tolist()
47
+ lost_rows = full_df.loc[
48
+ (~full_df["ID"].isin(test_ids)) &
49
+ (~full_df["ID"].isin(remaining_ids))
50
+ ]
51
+
52
+ logger.info(f"Rows in test: {len(test)}")
53
+ logger.info(f"Rows to be split between train and val: {len(remaining)}")
54
+ total_rows = len(test) + len(remaining)
55
+ logger.info(f"Total rows: {total_rows}. Test percentage: {100*len(test)/total_rows:.2f}%")
56
+ logger.info(f"Lost rows: {len(lost_rows)}")
57
+
58
+ train_ratio_from_remaining = round((0.8*total_rows)/len(remaining), 2)
59
+ # use sklearn
60
+ test_size_1 = 1 - train_ratio_from_remaining
61
+ logger.info(
62
+ f"\tPerforming first split: non-test clusters -> train clusters ({round(1-test_size_1,3)}) and val ({test_size_1})"
63
+ )
64
+ X = remaining_clusters
65
+ y = [0] * len(remaining_clusters)
66
+ X_train, X_val, y_train, y_val = train_test_split(
67
+ X, y, test_size=test_size_1, random_state=0
68
+ )
69
+
70
+ train = remaining.loc[remaining["dna_cluster_rep"].isin(X_train)]
71
+ val = remaining.loc[remaining["dna_cluster_rep"].isin(X_val)]
72
+ leaky_test = lost_rows
73
+
74
+ kept_by_split = {
75
+ "train": len(X_train),
76
+ "val": len(X_val),
77
+ "test": len(test["dna_cluster_rep"].unique())
78
+ }
79
+ splits = {
80
+ "train": train,
81
+ "val": val,
82
+ "test": test,
83
+ "leaky_test": leaky_test
84
+ }
85
+ return splits, kept_by_split
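A hedged usage sketch of the function above; the reserved IDs are placeholders, and in the pipeline they come from the data_task config rather than being hard-coded:

# hypothetical call: reserve specific TR clusters for test, split the rest into train/val
splits, kept_by_split = split_with_predefined_test(
    full_df=edge_df,                   # needs ID, tr_seqid, dna_seqid and *_cluster_rep columns
    test_trs=["trseqX", "trseqY"],     # placeholder IDs
    test_dnas=None,
    ratios=(0.8, 0.1, 0.1),
)
train, val, test = splits["train"], splits["val"], splits["test"]
leaky_test = splits["leaky_test"]      # rows dropped from train/val to keep test exclusive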
86
 
87
  def split_bipartite_fast(
88
  dna_clusters,
 
118
  kept_by_split = {"train": len(X_train), "val": len(X_val), "test": len(X_test)}
119
  return dna_assign, kept_by_split
120
 
121
+ # construct new labels
122
+ def convert_scores(scores, mode=1):
123
+ """
124
+ Two modes: mode=1 turns FIMO peak positions into 1; any other mode keeps the max score at those positions
125
+ """
126
  svec = [int(x) for x in scores.split(",")]
127
  max_score = max(svec)
128
+ if mode == 1:
129
+ binary_svec = [0 if x<max_score else 1 for x in svec]
130
+ assert(svec.count(max_score)==binary_svec.count(1))
131
+ else:
132
+ binary_svec = [0 if x<max_score else max_score for x in svec]
133
+ assert(svec.count(max_score)==binary_svec.count(max_score))
134
  binary_svec = ",".join([str(x) for x in binary_svec])
135
  return binary_svec
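A quick worked example of the two modes on a made-up score string:

# "2,7,7,3" has its max (7) at the middle two positions
convert_scores("2,7,7,3", mode=1)   # -> "0,1,1,0"  (peak positions become 1)
convert_scores("2,7,7,3", mode=0)   # -> "0,7,7,0"  (peak positions keep the max score)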
136
+
137
  def print_split_ratios(kept_by_split):
138
  total = sum(kept_by_split.values())
139
  train_pcnt = 100 * kept_by_split["train"] / total
 
188
  assert len(val_ids.intersection(test_ids)) == 0
189
  logger.info(f"Pass! No overlap in IDs")
190
 
191
+ # Investigate TR intersection. No assertions unless we are explicitly splitting on this.
192
+ train_tr_seqs = set(train["tr_sequence"].unique().tolist())
193
+ val_tr_seqs = set(val["tr_sequence"].unique().tolist())
194
+ test_tr_seqs = set(test["tr_sequence"].unique().tolist())
195
+
196
+ train_tr_reps = set(train["tr_cluster_rep"].unique().tolist())
197
+ val_tr_reps = set(val["tr_cluster_rep"].unique().tolist())
198
+ test_tr_reps = set(test["tr_cluster_rep"].unique().tolist())
199
+
200
+ logger.info(f"Train-Val TR intersection: {len(train_tr_seqs.intersection(val_tr_seqs))}")
201
+ logger.info(f"Train-Test TR intersection: {len(train_tr_seqs.intersection(test_tr_seqs))}")
202
+ logger.info(f"Val-Test TR intersection: {len(val_tr_seqs.intersection(test_tr_seqs))}")
203
+
204
+ logger.info(f"Train-Val TR Cluster Rep intersection: {len(train_tr_reps.intersection(val_tr_reps))}")
205
+ logger.info(f"Train-Test TR Cluster Rep intersection: {len(train_tr_reps.intersection(test_tr_reps))}")
206
+ logger.info(f"Val-Test TR Cluster Rep intersection: {len(val_tr_reps.intersection(test_tr_reps))}")
207
+
208
+ # Investigate DNA intersection. No assertions unless we are explicitly splitting on this.
209
+ train_dna_seqs = set(train["dna_sequence"].unique().tolist())
210
+ val_dna_seqs = set(val["dna_sequence"].unique().tolist())
211
+ test_dna_seqs = set(test["dna_sequence"].unique().tolist())
212
+
213
+ train_dna_reps = set(train["dna_cluster_rep"].unique().tolist())
214
+ val_dna_reps = set(val["dna_cluster_rep"].unique().tolist())
215
+ test_dna_reps = set(test["dna_cluster_rep"].unique().tolist())
216
+
217
+ logger.info(f"Train-Val DNA intersection: {len(train_dna_seqs.intersection(val_dna_seqs))}")
218
+ logger.info(f"Train-Test DNA intersection: {len(train_dna_seqs.intersection(test_dna_seqs))}")
219
+ logger.info(f"Val-Test DNA intersection: {len(val_dna_seqs.intersection(test_dna_seqs))}")
220
+
221
+ logger.info(f"Train-Val DNA Cluster Rep intersection: {len(train_dna_reps.intersection(val_dna_reps))}")
222
+ logger.info(f"Train-Test DNA Cluster Rep intersection: {len(train_dna_reps.intersection(test_dna_reps))}")
223
+ logger.info(f"Val-Test DNA Cluster Rep intersection: {len(val_dna_reps.intersection(test_dna_reps))}")
224
 
225
+ if split_by != "dna":
226
  assert len(train_tr_seqs.intersection(val_tr_seqs)) == 0
227
  assert len(train_tr_seqs.intersection(test_tr_seqs)) == 0
228
  assert len(val_tr_seqs.intersection(test_tr_seqs)) == 0
229
  logger.info(f"Pass! No overlap in TR sequences")
230
 
 
 
 
 
231
  assert len(train_tr_reps.intersection(val_tr_reps)) == 0
232
  assert len(train_tr_reps.intersection(test_tr_reps)) == 0
233
  assert len(val_tr_reps.intersection(test_tr_reps)) == 0
234
  logger.info(f"Pass! No overlap in TR cluster reps")
235
 
236
  if split_by != "protein":
 
 
 
 
237
  assert len(train_dna_seqs.intersection(val_dna_seqs)) == 0
238
  assert len(train_dna_seqs.intersection(test_dna_seqs)) == 0
239
  assert len(val_dna_seqs.intersection(test_dna_seqs)) == 0
240
  logger.info(f"Pass! No overlap in DNA sequences")
241
 
 
 
 
 
242
  assert len(train_dna_reps.intersection(val_dna_reps)) == 0
243
  assert len(train_dna_reps.intersection(test_dna_reps)) == 0
244
  assert len(val_dna_reps.intersection(test_dna_reps)) == 0
 
285
  logger.info(f"All proteins are in their own clusters: {no_protein_overlap}")
286
 
287
  if cfg.data_task.split_by == "dna":
288
+ if cfg.data_task.test_trs or cfg.data_task.test_dnas:
289
+ logger.info(f"Splitting with predefined trs/dnas reserved for test set")
290
+ splits, kept_by_split = split_with_predefined_test(
291
+ full_df=edge_df,
292
+ split_names=("train", "val", "test"),
293
+ test_trs=cfg.data_task.test_trs if cfg.data_task.test_trs else None,
294
+ test_dnas=cfg.data_task.test_dnas if cfg.data_task.test_dnas else None,
295
+ ratios=(0.8, 0.1, 0.1),
296
+ )
297
+ train = splits["train"]
298
+ train["split"]=["train"]*len(train)
299
+ val = splits["val"]
300
+ val["split"]=["val"]*len(val)
301
+ test = splits["test"]
302
+ test["split"]=["test"]*len(test)
303
+ leaky_test = splits["leaky_test"]
304
+ leaky_test["split"]=["leaky_test"]*len(leaky_test)
305
  else:
306
  logger.info(f"Easy split: all proteins are in their own clusters.")
307
  dna_clusters = edge_df["dna_cluster_rep"].unique().tolist()
 
318
 
319
  # assign datapoints to cluster by their DNA cluster rep
320
  edge_df["split"] = edge_df["dna_cluster_rep"].map(dna_assign)
321
+ train = edge_df.loc[edge_df["split"] == "train"].reset_index(drop=True)
322
+ val = edge_df.loc[edge_df["split"] == "val"].reset_index(drop=True)
323
+ test = edge_df.loc[edge_df["split"] == "test"].reset_index(drop=True)
324
+ leaky_test = pd.DataFrame(columns=edge_df.columns)
325
+
326
+ # Print ratios: hopefully close to desired (e.g. 80/10/10)
327
+ print_split_ratios(kept_by_split)
328
+
329
+ # Make train, val, test sets
330
+ # make sure no ID is duplicate
331
+ assert len(edge_df["ID"].unique()) == len(edge_df)
332
+ split_cols = [
333
+ "ID",
334
+ "dna_sequence",
335
+ "tr_sequence",
336
+ "tr_cluster_rep",
337
+ "dna_cluster_rep",
338
+ "scores",
339
+ "split",
340
+ ]
341
+ train = train[split_cols]
342
+ val = val[split_cols]
343
+ test = test[split_cols]
344
+ leaky_test = leaky_test[split_cols]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
 
346
  # ensure there is no overlap
347
  check_validity(train, val, test, split_by=cfg.data_task.split_by)
348
 
349
+ total = sum([len(train), len(val), len(test), len(leaky_test)])
350
  logger.info(f"Length of train dataset: {len(train)} ({100*len(train)/total:.2f}%)")
351
  logger.info(f"Length of val dataset: {len(val)} ({100*len(val)/total:.2f}%)")
352
  logger.info(f"Length of test dataset: {len(test)} ({100*len(test)/total:.2f}%)")
353
+ logger.info(f"Length of leaky_test dataset: {len(leaky_test)} ({100*len(leaky_test)/total:.2f}%)")
354
  logger.info(f"Total sequences = {total}. Same as edges size? {total==len(edge_df)}")
355
 
356
+ og_unique_dna = pd.concat([train, val, test, leaky_test])
357
  og_unique_dna = len(og_unique_dna["dna_sequence"].unique())
358
 
359
  ## Now do RC data augmentation if asked
 
361
  train = augment_rc(train)
362
  val = augment_rc(val)
363
  test = augment_rc(test)
364
+ leaky_test = augment_rc(leaky_test)
365
 
366
+ logger.info(f"Added reverse complement sequences to train, val, and test (and leaky test)")
367
 
368
  check_validity(train, val, test, split_by=cfg.data_task.split_by)
369
 
370
+ total = sum([len(train), len(val), len(test), len(leaky_test)])
371
  logger.info(
372
  f"Length of train dataset: {len(train)} ({100*len(train)/total:.2f}%)"
373
  )
374
  logger.info(f"Length of val dataset: {len(val)} ({100*len(val)/total:.2f}%)")
375
  logger.info(f"Length of test dataset: {len(test)} ({100*len(test)/total:.2f}%)")
376
+ logger.info(f"Length of leaky_test dataset: {len(leaky_test)} ({100*len(leaky_test)/total:.2f}%)")
377
  logger.info(
378
  f"Total sequences = {total}. Same as edges size? {total==len(edge_df)}"
379
  )
380
 
381
+ # since we've added all these new DNA sequences, we do need a new mapping of seq id to dna sequence
382
+ all_data = pd.concat([train, val, test, leaky_test])
383
  all_data["dna_seqid"] = all_data["ID"].str.split("_", n=1, expand=True)[1]
384
  dna_dict = dict(zip(all_data["dna_seqid"], all_data["dna_sequence"]))
385
  assert len(dna_dict) == len(all_data.drop_duplicates(["dna_sequence"]))
 
398
  os.makedirs(split_out_dir, exist_ok=True)
399
 
400
  # add binary_scores to allow other training modes
401
+ train["fimo_binary_sores"] = train["scores"].apply(lambda x: convert_scores(x, mode=1))
402
+ val["fimo_binary_sores"] = val["scores"].apply(lambda x: convert_scores(x, mode=1))
403
+ test["fimo_binary_sores"] = test["scores"].apply(lambda x: convert_scores(x, mode=1))
404
+ leaky_test["fimo_binary_sores"] = leaky_test["scores"].apply(lambda x: convert_scores(x, mode=1))
405
 
406
  # slect final cols and save
407
  split_final_cols = ["ID", "dna_sequence", "tr_sequence", "scores", "fimo_binary_sores", "split"]
408
  train[split_final_cols].to_csv(split_out_dir / "train.csv", index=False)
409
  val[split_final_cols].to_csv(split_out_dir / "val.csv", index=False)
410
  test[split_final_cols].to_csv(split_out_dir / "test.csv", index=False)
411
+ leaky_test[split_final_cols].to_csv(split_out_dir / "leaky_test.csv", index=False)
412
  logger.info(f"Saved all splits to {split_out_dir}")
dpacman/data_tasks/split/remap_handpick.py ADDED
@@ -0,0 +1,266 @@
1
+ """
2
+ Not neat, but this is what I did to make exclusive splits. Saving it here for now.
3
+ """
4
+
5
+ ## Full pipeline
6
+ import pandas as pd
7
+ protein_clusters = pd.read_csv("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/protein/mmseqs_cluster.tsv", sep="\t", header=None)
8
+ protein_clusters.columns=["tr_cluster_rep","tr_cluster_member"]
9
+ protein_clusters.head()
10
+
11
+ dna_clusters = pd.read_csv("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/dna_full/mmseqs_cluster.tsv", sep="\t", header=None)
12
+ dna_clusters.columns=["dna_cluster_rep","dna_cluster_member"]
13
+ dna_clusters.head()
14
+
15
+ all_data = pd.read_parquet("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed_seed0.parquet")
16
+ all_data
17
+
18
+ protein_cluster_map = dict(zip(protein_clusters["tr_cluster_member"],protein_clusters["tr_cluster_rep"]))
19
+ dna_cluster_map = dict(zip(dna_clusters["dna_cluster_member"],dna_clusters["dna_cluster_rep"]))
20
+ print(len(protein_cluster_map))
21
+ print(len(dna_cluster_map))
22
+ all_data["tr_cluster_rep"] = all_data["tr_seqid"].map(protein_cluster_map)
23
+ all_data["dna_cluster_rep"] = all_data["dna_seqid"].map(dna_cluster_map)
24
+ print(len(all_data[all_data["tr_cluster_rep"].isna()]))
25
+ print(len(all_data[all_data["dna_cluster_rep"].isna()]))
26
+ all_data.head()
27
+
28
+
29
+ ### handpick test
30
+ handpicked_test_trs = ["trseq23","trseq26","trseq17"]
31
+ handpicked_test = all_data.loc[
32
+ all_data["tr_cluster_rep"].isin(handpicked_test_trs)
33
+ ].reset_index(drop=True)
34
+
35
+ off_limits_dna_clusters = handpicked_test["dna_cluster_rep"].unique().tolist()
36
+ remaining = all_data.loc[
37
+ (~all_data["tr_cluster_rep"].isin(handpicked_test_trs)) &
38
+ (~all_data["dna_cluster_rep"].isin(off_limits_dna_clusters))
39
+ ].reset_index(drop=True)
40
+
41
+ test_ids = handpicked_test["ID"].unique().tolist()
42
+ remaining_ids = remaining["ID"].unique().tolist()
43
+ lost_rows = all_data.loc[
44
+ (~all_data["ID"].isin(test_ids)) &
45
+ (~all_data["ID"].isin(remaining_ids))
46
+ ]
47
+ print(f"Rows in test: {len(handpicked_test)}")
48
+ print(f"Rows to be split between train and val: {len(remaining)}")
49
+ total_rows = len(handpicked_test) + len(remaining)
50
+ print(f"Total rows: {total_rows}. Test percentage: {100*len(handpicked_test)/total_rows:.2f}%")
51
+ print(f"Lost rows: {len(lost_rows)}")
52
+
53
+ ### handpick val
54
+ handpicked_val_trs = ["trseq9", "trseq5", "trseq28"]
55
+
56
+ handpicked_val = remaining.loc[
57
+ remaining["tr_cluster_rep"].isin(handpicked_val_trs)
58
+ ].reset_index(drop=True)
59
+
60
+ off_limits_dna_clusters = handpicked_val["dna_cluster_rep"].unique().tolist()
61
+ train_remain = remaining.loc[
62
+ (~remaining["tr_cluster_rep"].isin(handpicked_val_trs)) &
63
+ (~remaining["dna_cluster_rep"].isin(off_limits_dna_clusters))
64
+ ].reset_index(drop=True)
65
+
66
+ val_ids = handpicked_val["ID"].unique().tolist()
67
+ train_remain_ids = train_remain["ID"].unique().tolist()
68
+ lost_rows = all_data.loc[
69
+ (~all_data["ID"].isin(test_ids)) &
70
+ (~all_data["ID"].isin(val_ids)) &
71
+ (~all_data["ID"].isin(train_remain_ids))
72
+ ]
73
+ print(f"Rows in val: {len(handpicked_val)}")
74
+ print(f"Rows left for train: {len(train_remain)}")
75
+ total_rows = len(handpicked_val) + len(train_remain)
76
+ print(f"Total rows: {total_rows}. Test percentage: {100*len(handpicked_val)/total_rows:.2f}%")
77
+ print(f"Lost rows: {len(lost_rows)}")
78
+
79
+ train_exclusive = all_data.loc[
80
+ all_data["ID"].isin(train_remain_ids)
81
+ ].reset_index(drop=True)
82
+
83
+ val_exclusive = all_data.loc[
84
+ all_data["ID"].isin(val_ids)
85
+ ].reset_index(drop=True)
86
+
87
+ test_exclusive = all_data.loc[
88
+ all_data["ID"].isin(test_ids)
89
+ ].reset_index(drop=True)
90
+
91
+ leaky_test = all_data.loc[
92
+ ~(all_data["ID"].isin(train_exclusive["ID"].tolist())) &
93
+ ~(all_data["ID"].isin(val_exclusive["ID"].tolist())) &
94
+ ~(all_data["ID"].isin(test_exclusive["ID"].tolist()))
95
+ ].reset_index(drop=True)
96
+
97
+ print(f"Original total: {len(all_data)}")
98
+ retained_total = len(train_exclusive)+len(val_exclusive)+len(test_exclusive)
99
+ print(f"New, exclusive total: {retained_total}")
100
+ print(f"Lost rows: {len(all_data)-retained_total}")
101
+ print(f"Length train: {len(train_exclusive)}/{retained_total} ({100*len(train_exclusive)/retained_total:.2f}%)")
102
+ print(f"Length val: {len(val_exclusive)}/{retained_total} ({100*len(val_exclusive)/retained_total:.2f}%)")
103
+ print(f"Length test: {len(test_exclusive)}/{retained_total} ({100*len(test_exclusive)/retained_total:.2f}%)")
104
+
105
+ def check_validity(train_exclusive, val_exclusive, test_exclusive):
106
+ train_exclusive_ids = set(train_exclusive["ID"].unique().tolist())
107
+ val_exclusive_ids = set(val_exclusive["ID"].unique().tolist())
108
+ test_exclusive_ids = set(test_exclusive["ID"].unique().tolist())
109
+
110
+ assert len(train_exclusive_ids.intersection(val_exclusive_ids)) == 0
111
+ assert len(train_exclusive_ids.intersection(test_exclusive_ids)) == 0
112
+ assert len(val_exclusive_ids.intersection(test_exclusive_ids)) == 0
113
+ print(f"Pass! No overlap in IDs")
114
+
115
+ # Investigate TR intersection. No assertions unless we are explicitly splitting on this.
116
+ train_exclusive_tr_seqs = set(train_exclusive["tr_sequence"].unique().tolist())
117
+ val_exclusive_tr_seqs = set(val_exclusive["tr_sequence"].unique().tolist())
118
+ test_exclusive_tr_seqs = set(test_exclusive["tr_sequence"].unique().tolist())
119
+
120
+ train_exclusive_tr_reps = set(train_exclusive["tr_cluster_rep"].unique().tolist())
121
+ val_exclusive_tr_reps = set(val_exclusive["tr_cluster_rep"].unique().tolist())
122
+ test_exclusive_tr_reps = set(test_exclusive["tr_cluster_rep"].unique().tolist())
123
+
124
+ print(f"Train-Val TR intersection: {len(train_exclusive_tr_seqs.intersection(val_exclusive_tr_seqs))}")
125
+ print(f"Train-Test TR intersection: {len(train_exclusive_tr_seqs.intersection(test_exclusive_tr_seqs))}")
126
+ print(f"Val-Test TR intersection: {len(val_exclusive_tr_seqs.intersection(test_exclusive_tr_seqs))}")
127
+
128
+ print(f"Train-Val TR Cluster Rep intersection: {len(train_exclusive_tr_reps.intersection(val_exclusive_tr_reps))}")
129
+ print(f"Train-Test TR Cluster Rep intersection: {len(train_exclusive_tr_reps.intersection(test_exclusive_tr_reps))}")
130
+ print(f"Val-Test TR Cluster Rep intersection: {len(val_exclusive_tr_reps.intersection(test_exclusive_tr_reps))}")
131
+
132
+ # Investigate DNA intersection. No assertions unless we are explicitly splitting on this.
133
+ train_exclusive_dna_seqs = set(train_exclusive["dna_sequence"].unique().tolist())
134
+ val_exclusive_dna_seqs = set(val_exclusive["dna_sequence"].unique().tolist())
135
+ test_exclusive_dna_seqs = set(test_exclusive["dna_sequence"].unique().tolist())
136
+
137
+ train_exclusive_dna_reps = set(train_exclusive["dna_cluster_rep"].unique().tolist())
138
+ val_exclusive_dna_reps = set(val_exclusive["dna_cluster_rep"].unique().tolist())
139
+ test_exclusive_dna_reps = set(test_exclusive["dna_cluster_rep"].unique().tolist())
140
+
141
+ print(f"Train-Val DNA intersection: {len(train_exclusive_dna_seqs.intersection(val_exclusive_dna_seqs))}")
142
+ print(f"Train-Test DNA intersection: {len(train_exclusive_dna_seqs.intersection(test_exclusive_dna_seqs))}")
143
+ print(f"Val-Test DNA intersection: {len(val_exclusive_dna_seqs.intersection(test_exclusive_dna_seqs))}")
144
+
145
+ print(f"Train-Val DNA Cluster Rep intersection: {len(train_exclusive_dna_reps.intersection(val_exclusive_dna_reps))}")
146
+ print(f"Train-Test DNA Cluster Rep intersection: {len(train_exclusive_dna_reps.intersection(test_exclusive_dna_reps))}")
147
+ print(f"Val-Test DNA Cluster Rep intersection: {len(val_exclusive_dna_reps.intersection(test_exclusive_dna_reps))}")
148
+
149
+ def get_reverse_complement(s):
150
+ """
151
+ Returns 5' to 3' sequence of the reverse complement
152
+ """
153
+ chars = list(s)
154
+ recon = []
155
+ rev_map = {
156
+ "a": "t",
157
+ "c": "g",
158
+ "t": "a",
159
+ "g": "c",
160
+ "A": "T",
161
+ "C": "G",
162
+ "T": "A",
163
+ "G": "C",
164
+ "n": "n",
165
+ "N": "N",
166
+ }
167
+ for c in chars:
168
+ recon += [rev_map[c]]
169
+
170
+ recon = "".join(recon)
171
+ return recon[::-1]
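Sanity check on a short made-up sequence:

# complement of "ACGTn" is "TGCAn"; reversing it gives the 5'->3' reverse complement
assert get_reverse_complement("ACGTn") == "nACGT"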
172
+
173
+ # now make reverse complements
174
+ def augment_rc(df):
175
+ """
176
+ Get the reverse complement and add it as a datapoint, effectively doubling the dataset.
177
+ Also flip the orientation of the scores
178
+
179
+ columns = ["ID","dna_sequence","tr_sequence","tr_cluster_rep","dna_cluster_rep", "scores","split"]
180
+ """
181
+ df_rc = df.copy(deep=True)
182
+
183
+ df_rc["dna_sequence"] = df_rc["dna_sequence"].apply(
184
+ lambda x: get_reverse_complement(x)
185
+ )
186
+ df_rc["ID"] = df_rc["ID"] + "_rc"
187
+ df_rc["scores"] = df_rc["scores"].apply(lambda s: ",".join(s.split(",")[::-1]))
188
+
189
+ final_df = pd.concat([df, df_rc]).reset_index(drop=True)
190
+
191
+ return final_df
192
+
193
+ def convert_scores(scores, mode=1):
194
+ """
195
+ Two modes: mode=1 turns FIMO peak positions into 1; any other mode keeps the max score at those positions
196
+ """
197
+ svec = [int(x) for x in scores.split(",")]
198
+ max_score = max(svec)
199
+ if mode == 1:
200
+ binary_svec = [0 if x<max_score else 1 for x in svec]
201
+ assert(svec.count(max_score)==binary_svec.count(1))
202
+ else:
203
+ binary_svec = [0 if x<max_score else max_score for x in svec]
204
+ assert(svec.count(max_score)==binary_svec.count(max_score))
205
+ binary_svec = ",".join([str(x) for x in binary_svec])
206
+ return binary_svec
207
+
208
+ check_validity(train_exclusive, val_exclusive, test_exclusive)
209
+
210
+ train_exclusive = augment_rc(train_exclusive)
211
+ val_exclusive = augment_rc(val_exclusive)
212
+ test_exclusive = augment_rc(test_exclusive)
213
+ leaky_test = augment_rc(leaky_test)
214
+
215
+ print(f"Added reverse complement sequences to train_exclusive, val_exclusive, and test_exclusive (and leaky test_exclusive)")
216
+
217
+ check_validity(train_exclusive, val_exclusive, test_exclusive)
218
+
219
+ total = sum([len(train_exclusive), len(val_exclusive), len(test_exclusive), len(leaky_test)])
220
+ print(
221
+ f"Length of train_exclusive dataset: {len(train_exclusive)} ({100*len(train_exclusive)/total:.2f}%)"
222
+ )
223
+ print(f"Length of val_exclusive dataset: {len(val_exclusive)} ({100*len(val_exclusive)/total:.2f}%)")
224
+ print(f"Length of test_exclusive dataset: {len(test_exclusive)} ({100*len(test_exclusive)/total:.2f}%)")
225
+ print(f"Length of leaky_test dataset: {len(leaky_test)} ({100*len(leaky_test)/total:.2f}%)")
226
+ print(
227
+ f"Total sequences = {total}. Same as edges size*2? {total==len(all_data)*2}"
228
+ )
229
+
230
+ # since we've added all these new DNA sequences, we do need a new mapping of seq id to dna sequence
231
+ all_data = pd.concat([train_exclusive, val_exclusive, test_exclusive, leaky_test])
232
+ all_data["dna_seqid"] = all_data["ID"].str.split("_", n=1, expand=True)[1]
233
+ dna_dict = dict(zip(all_data["dna_seqid"], all_data["dna_sequence"]))
234
+ assert len(dna_dict) == len(all_data.drop_duplicates(["dna_sequence"]))
235
+
236
+ # create the output dir
237
+ import os
238
+ from pathlib import Path
239
+ split_out_dir = Path("/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/splits/handpicked_val_test")
240
+ os.makedirs(split_out_dir, exist_ok=True)
241
+
242
+ # add binary_scores to allow other training modes
243
+ train_exclusive["binary_sores"] = train_exclusive["scores"].apply(lambda x: convert_scores(x, mode=1))
244
+ val_exclusive["binary_sores"] = val_exclusive["scores"].apply(lambda x: convert_scores(x, mode=1))
245
+ test_exclusive["binary_sores"] = test_exclusive["scores"].apply(lambda x: convert_scores(x, mode=1))
246
+ leaky_test["binary_sores"] = leaky_test["scores"].apply(lambda x: convert_scores(x, mode=1))
247
+
248
+ train_exclusive["split"] = ["train"]*len(train_exclusive)
249
+ val_exclusive["split"] = ["val"]*len(val_exclusive)
250
+ test_exclusive["split"] = ["test"]*len(test_exclusive)
251
+ leaky_test["split"] = ["leakytest"]*len(leaky_test)
252
+
253
+ # select final cols and save
254
+ split_final_cols = ["ID", "dna_sequence", "tr_sequence", "scores", "binary_sores", "split"]
255
+ train_exclusive[split_final_cols].to_csv(split_out_dir / "train.csv", index=False)
256
+ val_exclusive[split_final_cols].to_csv(split_out_dir / "val.csv", index=False)
257
+ test_exclusive[split_final_cols].to_csv(split_out_dir / "test.csv", index=False)
258
+ leaky_test[split_final_cols].to_csv(split_out_dir / "leakytest.csv", index=False)
259
+ print(f"Saved all splits to {split_out_dir}")
260
+
261
+ # make baby versions too
262
+ train_exclusive[split_final_cols].sample(400, random_state=42).to_csv(split_out_dir / "babytrain.csv", index=False)
263
+ val_exclusive[split_final_cols].sample(50, random_state=42).to_csv(split_out_dir / "babyval.csv", index=False)
264
+ test_exclusive[split_final_cols].sample(50, random_state=42).to_csv(split_out_dir / "babytest.csv", index=False)
265
+ leaky_test[split_final_cols].sample(50, random_state=42).to_csv(split_out_dir / "babyleakytest.csv", index=False)
266
+
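A minimal sketch of the binarization rule implemented by the convert_scores helper above, applied to a made-up score string rather than project data:

    svec = [int(x) for x in "3,7,7,2".split(",")]
    max_score = max(svec)
    print([0 if x < max_score else 1 for x in svec])          # mode == 1 -> [0, 1, 1, 0]
    print([0 if x < max_score else max_score for x in svec])  # any other mode -> [0, 7, 7, 0]

Only the positions carrying the peak score stay positive, which is what the binary score column and binary loss mode in the run scripts consume downstream.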
dpacman/find_wandb_run_name.py ADDED
@@ -0,0 +1,67 @@
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import argparse
4
+ from typing import Optional
5
+
6
+ SENTINEL = "View project at"
7
+ SKIP_DIRS = {".git", ".hg", ".svn", "__pycache__", ".mamba", ".conda"}
8
+
9
+ def extract_run_name(log_path: str) -> Optional[str]:
10
+ """
11
+ Return the run name if we find a line that:
12
+ - starts with 'wandb:' (after leading whitespace is stripped)
13
+ - whose second-to-last word is 'run' and the last word is the run name
14
+ and this occurs before the first line containing SENTINEL.
15
+ Otherwise return None.
16
+ """
17
+ try:
18
+ with open(log_path, "r", errors="ignore") as f:
19
+ for raw in f:
20
+ if SENTINEL in raw:
21
+ return None
22
+ line = raw.strip()
23
+ if not line.startswith("wandb: "):
24
+ continue
25
+ toks = line.split()
26
+ if len(toks) >= 2 and toks[-2] == "run":
27
+ return toks[-1] # the run name (last token)
28
+ except OSError:
29
+ return None
30
+ return None
31
+
32
+ def list_runs(root: str, followlinks: bool = False, do_rename=False):
33
+ for dirpath, dirs, files in os.walk(root, followlinks=followlinks):
34
+ # prune junk dirs
35
+ dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
36
+ if "run.log" in files:
37
+ log_path = os.path.join(dirpath, "run.log")
38
+ name = extract_run_name(log_path)
39
+ new_dir_path = dirpath
40
+ if name:
41
+ if name not in dirpath:
42
+ new_dir_path = f"{dirpath}-{name}"
43
+ print(f"{name if name else '<unknown>'}\t{new_dir_path}")
44
+
45
+ if do_rename and new_dir_path != dirpath:
46
+ parent = os.path.dirname(dirpath)
47
+ # resolve absolute path for safety
48
+ abs_old = os.path.abspath(dirpath)
49
+ abs_new = os.path.abspath(new_dir_path)
50
+
51
+ if os.path.exists(abs_new):
52
+ print(f"⚠️ Target {abs_new} already exists, skipping rename.")
53
+ else:
54
+ print(f"Renaming {abs_old} → {abs_new}")
55
+ os.rename(abs_old, abs_new)
56
+
57
+ def main():
58
+ ap = argparse.ArgumentParser(
59
+ description="List W&B run names by parsing lines that start with 'wandb:' and end with 'run <name>'."
60
+ )
61
+ ap.add_argument("root", help="Root directory to search")
62
+ ap.add_argument("--followlinks", action="store_true", help="Follow symlinks while walking")
63
+ args = ap.parse_args()
64
+ list_runs(args.root, followlinks=args.followlinks)
65
+
66
+ if __name__ == "__main__":
67
+ main()
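A minimal sketch of the log-line rule extract_run_name applies, run on a made-up line in the format the parser expects (not an excerpt from an actual run.log):

    line = "wandb: Syncing run stoic-snowball-99"
    toks = line.strip().split()
    if line.startswith("wandb: ") and len(toks) >= 2 and toks[-2] == "run":
        print(toks[-1])  # -> stoic-snowball-99

list_runs appends this same name to the run directory when its do_rename flag is set.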
dpacman/make_splits.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
dpacman/manual_scan_chroms.ipynb ADDED
@@ -0,0 +1,147 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "f6c01484",
6
+ "metadata": {},
7
+ "source": [
8
+ "Temporary notebook for manually scanning chromosomes for sequences of interest"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "0608f91e",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "seq_of_interest = \"GCAGATCTGCACATC\""
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 2,
24
+ "id": "3c245151",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "genome_dir = \"/home/a03-svincoff/DPACMAN/dpacman/data_files/raw/genomes/hg38\""
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 7,
34
+ "id": "682098b6",
35
+ "metadata": {},
36
+ "outputs": [
37
+ {
38
+ "name": "stdout",
39
+ "output_type": "stream",
40
+ "text": [
41
+ "dict_keys(['chr12', 'chr5', 'chr17', 'chr2', 'chr21', 'chr1', 'chrM', 'chr22', 'chr20', 'chr16', 'chr9', 'chr8', 'chr19', 'chr7', 'chr11', 'chr3', 'chr4', 'chr14', 'chr15', 'chr18', 'chrY', 'chr6', 'chrX', 'chr13', 'chr10'])\n"
42
+ ]
43
+ }
44
+ ],
45
+ "source": [
46
+ "import json\n",
47
+ "import os\n",
48
+ "chrom_cache = {}\n",
49
+ "for chrom_file in os.listdir(genome_dir):\n",
50
+ " chrom = chrom_file.split(\"hg38_\")[1].split(\".json\")[0]\n",
51
+ " with open(f\"{genome_dir}/{chrom_file}\", \"r\") as f:\n",
52
+ " chrom_cache[chrom] = json.load(f)\n",
53
+ "\n",
54
+ "print(chrom_cache.keys())"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "id": "fd6cca79",
61
+ "metadata": {},
62
+ "outputs": [
63
+ {
64
+ "name": "stdout",
65
+ "output_type": "stream",
66
+ "text": [
67
+ "Testing sequence A: TAGCAGGATGTGT\n",
68
+ "Testing sequence B: GCAGATCTGCACATC\n",
69
+ "Testing sequence C: CGACACCTGACGCG\n",
70
+ "Testing sequence D: CGCTATCCAGAGCG\n",
71
+ "Testing sequence E: CGCGATGCTTCTCG\n",
72
+ "Testing sequence F: CGGCTGGATTACCG\n",
73
+ "Testing sequence G: CGAGAACATAGTCG\n",
74
+ "Testing sequence H: CGGGGAAACGCCCG\n",
75
+ "Testing sequence I: CGCCCAAAGCCGCG\n",
76
+ "Testing sequence J: CGGAGGTAATGACG\n",
77
+ "Testing sequence K: CGCACCGACTCACG\n",
78
+ "Testing sequence L: CGGCCCTTTGCGCG\n",
79
+ "Testing sequence M: CGCCGTTAGTGTCG\n"
80
+ ]
81
+ }
82
+ ],
83
+ "source": [
84
+ "baker_sequences = {\n",
85
+ " \"A\": \"TAGCAGGATGTGT\",\n",
86
+ " \"B\": \"GCAGATCTGCACATC\",\n",
87
+ " \"C\": \"CGACACCTGACGCG\",\n",
88
+ " \"D\": \"CGCTATCCAGAGCG\",\n",
89
+ " \"E\": \"CGCGATGCTTCTCG\",\n",
90
+ " \"F\": \"CGGCTGGATTACCG\",\n",
91
+ " \"G\": \"CGAGAACATAGTCG\",\n",
92
+ " \"H\": \"CGGGGAAACGCCCG\",\n",
93
+ " \"I\": \"CGCCCAAAGCCGCG\",\n",
94
+ " \"J\": \"CGGAGGTAATGACG\",\n",
95
+ " \"K\": \"CGCACCGACTCACG\",\n",
96
+ " \"L\": \"CGGCCCTTTGCGCG\",\n",
97
+ " \"M\": \"CGCCGTTAGTGTCG\"\n",
98
+ "}\n",
99
+ "sorted_chroms = list(chrom_cache.keys())\n",
100
+ "sorted_chroms = sorted(sorted_chroms, key = lambda x: int(x.split(\"chr\")[1]) if x.split(\"chr\")[1] not in [\"M\",\"X\",\"Y\"] else 0)\n",
101
+ "\n",
102
+ "for seq_letter, seq in baker_sequences.items():\n",
103
+ " print(f\"Testing sequence {seq_letter}: {seq}\")\n",
104
+ " for chrom in sorted_chroms:\n",
105
+ " chrom_dna = chrom_cache[chrom][\"dna\"].upper()\n",
106
+ " match_chroms=[]\n",
107
+ " try:\n",
108
+ " print(f\"\\tChrom {chrom} index of sequence {seq_letter} ({seq}): {chrom.index(seq)}\")\n",
109
+ " match_chroms+=[chrom]\n",
110
+ " except:\n",
111
+ " match_chroms = match_chroms\n",
112
+ " print(f\"\\tChrom {chrom} does not have sequence {seq_letter}({seq})\")"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": null,
118
+ "id": "99078ac6",
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": [
122
+ "#[m.start() for m in re.finditer(chr1_dna, 'GCAGATCTGCACATC')]"
123
+ ]
124
+ }
125
+ ],
126
+ "metadata": {
127
+ "kernelspec": {
128
+ "display_name": "dnabind2",
129
+ "language": "python",
130
+ "name": "python3"
131
+ },
132
+ "language_info": {
133
+ "codemirror_mode": {
134
+ "name": "ipython",
135
+ "version": 3
136
+ },
137
+ "file_extension": ".py",
138
+ "mimetype": "text/x-python",
139
+ "name": "python",
140
+ "nbconvert_exporter": "python",
141
+ "pygments_lexer": "ipython3",
142
+ "version": "3.10.14"
143
+ }
144
+ },
145
+ "nbformat": 4,
146
+ "nbformat_minor": 5
147
+ }
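A minimal sketch of the finditer idea the last cell leaves commented out, on a toy chromosome string (re.finditer takes the pattern first, then the string to search; the sequence below is made up, not hg38 data):

    import re
    chrom_dna = "AAGCAGATCTGCACATCTTGCAGATCTGCACATC"
    positions = [m.start() for m in re.finditer("GCAGATCTGCACATC", chrom_dna)]
    print(positions)  # -> [2, 19]

Unlike str.index, this reports every occurrence rather than only the first.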
dpacman/scripts/delay_run.sh CHANGED
@@ -1,10 +1,10 @@
1
  #!/usr/bin/env bash
2
  set -euo pipefail
3
 
4
- # Usage: ./stagger.sh <first_script.sh> <second_script.sh>
5
  # Optional: override waits via env vars WAIT1 / WAIT2 (seconds). Defaults: 3 hours each.
6
 
7
- WAIT1=${WAIT1:-3600} # 3 hours in seconds
8
  WAIT2=${WAIT2:-10800}
9
 
10
  SCRIPT1="${1:?usage: $0 <first_script.sh> <second_script.sh>}"
 
1
  #!/usr/bin/env bash
2
  set -euo pipefail
3
 
4
+ # Usage: nohup bash scripts/delay_run.sh scripts/run_train.sh scripts/run_train_2.sh > delay.log 2>&1 &
5
  # Optional: override waits via env vars WAIT1 / WAIT2 (seconds). Defaults: 3 hours each.
6
 
7
+ WAIT1=${WAIT1:-10800} # 3 hours in seconds
8
  WAIT2=${WAIT2:-10800}
9
 
10
  SCRIPT1="${1:?usage: $0 <first_script.sh> <second_script.sh>}"
dpacman/scripts/run_eval.sh CHANGED
@@ -14,7 +14,7 @@ if [ -z "$WANDB_API_KEY" ]; then
14
  export WANDB_API_KEY="$wandb_key"
15
  fi
16
 
17
- CUDA_VISIBLE_DEVICES=3 nohup python -u -m scripts.eval \
18
  hydra.run.dir="${run_dir}" \
19
  data_module.test_file="data_files/processed/splits/by_dna/test.csv" \
20
  data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
@@ -23,8 +23,10 @@ CUDA_VISIBLE_DEVICES=3 nohup python -u -m scripts.eval \
23
  model.glm_input_dim=256 \
24
  model.compressed_dim=256 \
25
  model.hidden_dim=256 \
26
- ckpt_path="/home/a03-svincoff/DPACMAN/logs/train/classifier/runs/2025-08-27_18-52-25/checkpoints/epoch_009.ckpt" \
27
- model.lr=1e-5 \
 
 
28
  > "${run_dir}/run.log" 2>&1 &
29
 
30
  echo $! > "${run_dir}/pid.txt"
 
14
  export WANDB_API_KEY="$wandb_key"
15
  fi
16
 
17
+ CUDA_VISIBLE_DEVICES=2 nohup python -u -m scripts.eval \
18
  hydra.run.dir="${run_dir}" \
19
  data_module.test_file="data_files/processed/splits/by_dna/test.csv" \
20
  data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
 
23
  model.glm_input_dim=256 \
24
  model.compressed_dim=256 \
25
  model.hidden_dim=256 \
26
+ data_module.score_col="binary_scores" \
27
+ data_module.norm_value=1 \
28
+ model.loss_type="binary" \
29
+ ckpt_path="/home/a03-svincoff/DPACMAN/logs/train/classifier/runs/2025-08-28_04-37-58-stoic-snowball-99/checkpoints/epoch_009.ckpt" \
30
  > "${run_dir}/run.log" 2>&1 &
31
 
32
  echo $! > "${run_dir}/pid.txt"
dpacman/scripts/run_split.sh CHANGED
@@ -10,13 +10,14 @@ mkdir -p "$run_dir"
10
 
11
  nohup python -u -m scripts.preprocess \
12
  hydra.run.dir="${run_dir}" \
13
- +data_task.p_exclude="true" \
14
  data_task="${data_task_type}/remap" \
 
 
15
  data_task.split_by=dna \
16
  data_task.train_ratio=0.8 \
17
  data_task.val_ratio=0.1 \
18
  data_task.test_ratio=0.1 \
19
- data_task.split_out_dir=dpacman/data_files/processed/splits/by_both \
20
  > "${run_dir}/run.log" 2>&1 &
21
 
22
  echo $! > "${run_dir}/pid.txt"
 
10
 
11
  nohup python -u -m scripts.preprocess \
12
  hydra.run.dir="${run_dir}" \
 
13
  data_task="${data_task_type}/remap" \
14
+ data_task.test_trs=["trseq23","trseq26","trseq17"] \
15
+ data_task.test_dnas=null \
16
  data_task.split_by=dna \
17
  data_task.train_ratio=0.8 \
18
  data_task.val_ratio=0.1 \
19
  data_task.test_ratio=0.1 \
20
+ data_task.split_out_dir=dpacman/data_files/processed/splits/handpicked_test \
21
  > "${run_dir}/run.log" 2>&1 &
22
 
23
  echo $! > "${run_dir}/pid.txt"
dpacman/scripts/run_train.sh CHANGED
@@ -23,17 +23,19 @@ CUDA_VISIBLE_DEVICES=0,1 nohup python -u -m scripts.train \
23
  hydra.run.dir="${run_dir}" \
24
  trainer.devices=2 \
25
  trainer.max_epochs=10 \
26
- data_module.train_file="data_files/processed/splits/by_dna/train.csv" \
27
- data_module.val_file="data_files/processed/splits/by_dna/val.csv" \
28
- data_module.test_file="data_files/processed/splits/by_dna/test.csv" \
29
  data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
30
  data_module.dna_shelf_path="data_files/processed/embeddings/fimo_hits_only/peaks_caduceus.shelf" \
31
  data_module.batch_size=16 \
32
  data_module.score_col="binary_scores" \
 
33
  model.loss_type="binary" \
34
  model.glm_input_dim=256 \
35
  model.compressed_dim=256 \
36
  model.hidden_dim=256 \
 
37
  model.lr=1e-5 \
38
  > "${run_dir}/run.log" 2>&1 &
39
 
 
23
  hydra.run.dir="${run_dir}" \
24
  trainer.devices=2 \
25
  trainer.max_epochs=10 \
26
+ data_module.train_file="/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/splits/handpicked_val_test_cropTR4/train.csv" \
27
+ data_module.val_file="/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/splits/handpicked_val_test_cropTR4/val.csv" \
28
+ data_module.test_file="/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/splits/handpicked_val_test_cropTR4/test.csv" \
29
  data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
30
  data_module.dna_shelf_path="data_files/processed/embeddings/fimo_hits_only/peaks_caduceus.shelf" \
31
  data_module.batch_size=16 \
32
  data_module.score_col="binary_scores" \
33
+ data_module.norm_value=1 \
34
  model.loss_type="binary" \
35
  model.glm_input_dim=256 \
36
  model.compressed_dim=256 \
37
  model.hidden_dim=256 \
38
+ model.dropout=0.2 \
39
  model.lr=1e-5 \
40
  > "${run_dir}/run.log" 2>&1 &
41
 
dpacman/scripts/run_train_baseline.sh CHANGED
@@ -31,6 +31,7 @@ CUDA_VISIBLE_DEVICES=2,3 nohup python -u -m scripts.train \
31
  data_module.batch_size=16 \
32
  data_module.score_col="binary_scores" \
33
  model.loss_type="binary" \
 
34
  model=baseline \
35
  model.glm_input_dim=256 \
36
  model.compressed_dim=256 \
 
31
  data_module.batch_size=16 \
32
  data_module.score_col="binary_scores" \
33
  model.loss_type="binary" \
34
+ data_module.norm_value=1 \
35
  model=baseline \
36
  model.glm_input_dim=256 \
37
  model.compressed_dim=256 \