eval mode, fixed, full binary mode

Browse files

Files changed (13) hide show

.gitignore +3 -1
configs/data_module/pair.yaml +3 -0
configs/eval.yaml +42 -0
configs/model/baseline.yaml +3 -1
configs/model/classifier.yaml +3 -1
dpacman/classifier/baseline.py +4 -3
dpacman/classifier/loss.py +37 -6
dpacman/classifier/model.py +17 -5
dpacman/data_modules/pair.py +79 -54
dpacman/scripts/eval.py +197 -0
dpacman/scripts/run_eval.sh +30 -0
dpacman/scripts/run_train.sh +4 -2
dpacman/scripts/run_train_baseline.sh +3 -1

.gitignore CHANGED Viewed

@@ -38,4 +38,6 @@ dpacman/peak_examples/
 dpacman/__pycache__/
 log.log
 log2.log
-dpacman/delay.log

 dpacman/__pycache__/
 log.log
 log2.log
+dpacman/delay.log
+dpacman/view_profiles.ipynb
+dpacman/find_wandb_run_dirs.py

configs/data_module/pair.yaml CHANGED Viewed

@@ -4,6 +4,9 @@ train_file: data_files/processed/splits/by_dna/babytrain.csv
 val_file: data_files/processed/splits/by_dna/babyval.csv
 test_file: data_files/processed/splits/by_dna/babytest.csv
 tr_shelf_path: data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf
 dna_shelf_path: data_files/processed/embeddings/fimo_hits_only/baby_peaks_segmentnt_pernuc_with_onehot.shelf

 val_file: data_files/processed/splits/by_dna/babyval.csv
 test_file: data_files/processed/splits/by_dna/babytest.csv
+target_col: dna_sequence
+score_col: scores
 tr_shelf_path: data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf
 dna_shelf_path: data_files/processed/embeddings/fimo_hits_only/baby_peaks_segmentnt_pernuc_with_onehot.shelf

configs/eval.yaml ADDED Viewed

	@@ -0,0 +1,42 @@

+defaults:
+  - paths: default
+  - hydra: default  # ← tells Hydra to use the logging/output config
+  - data_module: pair
+  - model: classifier
+  - trainer: gpu
+  - extras: default
+  - logger: wandb
+  - callbacks: default
+  - _self_
+  # experiment configs allow for version control of specific hyperparameters
+  # e.g. best hyperparameters for given model and datamodule
+  - experiment: null
+  # config for hyperparameter optimization
+  - hparams_search: null
+  # debugging config (enable through command line, e.g. `python train.py debug=default`)
+  - debug: null
+task_name: eval/${model}
+# tags to help you identify your experiments
+# you can overwrite this in experiment configs
+# overwrite from command line with `python train.py tags="[first_tag, second_tag]"`
+tags: ["dev"]
+# evaluate on test set, using best model weights achieved during training
+# lightning chooses best weights based on the metric specified in checkpoint callback
+test: True
+# checkpoint path of the trained model to evaluate
+ckpt_path: /home/a03-svincoff/DPACMAN/logs/train/classifier/runs/2025-08-25_18-08-13/checkpoints/epoch_009.ckpt
+# seed for random number generators in pytorch, numpy and python.random
+seed: 42
+data_module:
+  train_file: null
+  val_file: null
+  test_file: data_files/processed/splits/by_dna/test.csv

configs/model/baseline.yaml CHANGED Viewed

@@ -7,4 +7,6 @@ weight_decay: 0.01
 glm_input_dim: 256
 compressed_dim: 256
-hidden_dim: 128

 glm_input_dim: 256
 compressed_dim: 256
+hidden_dim: 128
+loss_type: mixed

configs/model/classifier.yaml CHANGED Viewed

@@ -7,4 +7,6 @@ weight_decay: 0.01
 glm_input_dim: 1029
 compressed_dim: 1029
-hidden_dim: 256

 glm_input_dim: 1029
 compressed_dim: 1029
+hidden_dim: 256
+loss_type: mixed

dpacman/classifier/baseline.py CHANGED Viewed

@@ -24,6 +24,7 @@ class BaselineBindPredictor(LightningModule):
         gamma: float = 20,
         dropout: float = 0,
         weight_decay: float = 0.01,
     ):
         # Init
         super(BaselineBindPredictor, self).__init__()
@@ -78,7 +79,7 @@ class BaselineBindPredictor(LightningModule):
         """
         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
         loss = calculate_loss(
-            logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
         )
         self.log(
             "train/loss",
@@ -113,7 +114,7 @@ class BaselineBindPredictor(LightningModule):
     def validation_step(self, batch, batch_idx):
         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
         loss = calculate_loss(
-            logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
         )
         self.log(
             "val/loss",
@@ -143,7 +144,7 @@ class BaselineBindPredictor(LightningModule):
     def test_step(self, batch, batch_idx):
         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
         loss = calculate_loss(
-            logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
         )
         self.log(
             "test/loss", loss, on_step=False, on_epoch=True, batch_size=logits.size(0)

         gamma: float = 20,
         dropout: float = 0,
         weight_decay: float = 0.01,
+        loss_type: str = "mixed"
     ):
         # Init
         super(BaselineBindPredictor, self).__init__()
         """
         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
         loss = calculate_loss(
+            logits, batch["labels"], batch["binder_kpm"], batch["glm_kpm"], alpha=self.hparams.alpha, gamma=self.hparams.gamma, loss_type=self.hparams.loss_type
         )
         self.log(
             "train/loss",
     def validation_step(self, batch, batch_idx):
         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
         loss = calculate_loss(
+            logits, batch["labels"], batch["binder_kpm"], batch["glm_kpm"], alpha=self.hparams.alpha, gamma=self.hparams.gamma, loss_type=self.hparams.loss_type
         )
         self.log(
             "val/loss",
     def test_step(self, batch, batch_idx):
         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
         loss = calculate_loss(
+            logits, batch["labels"], batch["binder_kpm"], batch["glm_kpm"], alpha=self.hparams.alpha, gamma=self.hparams.gamma, loss_type=self.hparams.loss_type
         )
         self.log(
             "test/loss", loss, on_step=False, on_epoch=True, batch_size=logits.size(0)

dpacman/classifier/loss.py CHANGED Viewed

@@ -7,6 +7,12 @@ import torch.nn.functional as F
 from torchmetrics.functional.classification import (
     auroc, average_precision, roc, precision_recall_curve
 )
 def _expand_like(mask: torch.Tensor, like: torch.Tensor):
     # Make mask broadcastable to logits/targets (handles (B,L) vs (B,L,1))
@@ -14,7 +20,7 @@ def _expand_like(mask: torch.Tensor, like: torch.Tensor):
         mask = mask.unsqueeze(-1)
     return mask.expand_as(like)
-def bce_loss_masked(logits, targets, nonpeak_mask, pos_weight=None, eps=1e-8):
     """
     Compute masked BCE with logits over non-peak positions only.
     Expects nonpeak_mask already broadcastable to logits.
@@ -24,7 +30,7 @@ def bce_loss_masked(logits, targets, nonpeak_mask, pos_weight=None, eps=1e-8):
     loss = F.binary_cross_entropy_with_logits(
         logits, t, reduction="none", pos_weight=pos_weight
     )
-    m = _expand_like(nonpeak_mask, loss).to(loss.dtype)
     denom = m.sum().clamp_min(eps)
     return (loss * m).sum() / denom
@@ -41,17 +47,32 @@ def mse_peaks_only(logits, targets, peak_mask, eps=1e-8):
 def calculate_loss(
     logits,
     targets,
     eps: float = 1e-8,
     alpha: float = 1.0,
     gamma: float = 1.0,
     pos_weight=None,
     pad_value: float = -1.0,
 ):
     """
     Combine masked-BCE (non-peak) + masked-MSE on probs (peak), ignoring padding.
     Assumes targets == -1 are pads; non-peak = 0; peak > 0.
     """
     valid = (targets != pad_value)
     # Peak / non-peak masks that exclude pads
     nonpeak_mask = valid & (targets == 0)
@@ -60,10 +81,18 @@ def calculate_loss(
     # For safety, zero-out targets at pad positions so they never feed into BCE/MSE
     targets_safe = torch.where(valid, targets, torch.zeros_like(targets))
-    bce_nonpeak = bce_loss_masked(logits, targets_safe, nonpeak_mask, pos_weight=pos_weight, eps=eps)
-    mse_peak    = mse_peaks_only(logits, targets_safe, peak_mask, eps=eps)
-    return alpha * bce_nonpeak + gamma * mse_peak
 @torch.no_grad()
 def auroc_zeros_vs_ones_from_logits(
@@ -81,6 +110,7 @@ def auroc_zeros_vs_ones_from_logits(
       tp, fp:         integer counts per threshold (shape (T,))
     """
     device = logits.device
     valid = ~glm_kpm if glm_kpm is not None else torch.ones_like(labels, dtype=torch.bool, device=device)
     keep = valid & ((labels > pos_thresh) | (labels == 0.0))
     if keep.sum() == 0:
@@ -126,6 +156,7 @@ def auprc_zeros_vs_ones_from_logits(
       thresholds:     (T,)
     """
     device = logits.device
     valid = ~glm_kpm if glm_kpm is not None else torch.ones_like(labels, dtype=torch.bool, device=device)
     keep = valid & ((labels > pos_thresh) | (labels == 0.0))
     if keep.sum() == 0:

 from torchmetrics.functional.classification import (
     auroc, average_precision, roc, precision_recall_curve
 )
+import rootutils
+from dpacman.utils import pylogger
+root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
+logger = pylogger.RankedLogger(__name__, rank_zero_only=True)
 def _expand_like(mask: torch.Tensor, like: torch.Tensor):
     # Make mask broadcastable to logits/targets (handles (B,L) vs (B,L,1))
         mask = mask.unsqueeze(-1)
     return mask.expand_as(like)
+def bce_loss_masked(logits, targets, mask, pos_weight=None, eps=1e-8):
     """
     Compute masked BCE with logits over the positions selected by mask.
     Expects mask already broadcastable to logits.
     loss = F.binary_cross_entropy_with_logits(
         logits, t, reduction="none", pos_weight=pos_weight
     )
+    m = _expand_like(mask, loss).to(loss.dtype)
     denom = m.sum().clamp_min(eps)
     return (loss * m).sum() / denom
 def calculate_loss(
     logits,
     targets,
+    binder_kpm,
+    glm_kpm,
     eps: float = 1e-8,
     alpha: float = 1.0,
     gamma: float = 1.0,
     pos_weight=None,
     pad_value: float = -1.0,
+    loss_type="mixed"
 ):
     """
     Combine masked-BCE (non-peak) + masked-MSE on probs (peak), ignoring padding.
     Assumes targets == -1 are pads; non-peak = 0; peak > 0.
+    binder_kpm is 1 at PAD positions, 0 elsewhere
+    glm_kpm is 1 at PAD positions, 0 elsewhere
+    if loss_type is mixed, we're doing binary cross entropy off the peaks and MSE on the peaks.
+    if loss_type is binary, we're doing binary cross entropy everywhere because the labels are binary.
     """
+    # calculate validity in two ways; these should be the same.
+    # targets are padded to -1 where there is not really a DNA sequence there
     valid = (targets != pad_value)
+    if glm_kpm is not None:
+        nvalid = torch.sum(valid).item()
+        nvalid_2 = torch.sum(~glm_kpm).item()
+        assert nvalid==nvalid_2
     # Peak / non-peak masks that exclude pads
     nonpeak_mask = valid & (targets == 0)
     # For safety, zero-out targets at pad positions so they never feed into BCE/MSE
     targets_safe = torch.where(valid, targets, torch.zeros_like(targets))
+    if loss_type=="mixed":
+        bce_nonpeak = bce_loss_masked(logits, targets_safe, nonpeak_mask, pos_weight=pos_weight, eps=eps)
+        mse_peak    = mse_peaks_only(logits, targets_safe, peak_mask, eps=eps)
+        return alpha * bce_nonpeak + gamma * mse_peak
+    else:
+        # we're expecting all binary labels. make sure.
+        all_binary = ((targets_safe==1) | (targets_safe==0)).all().item()
+        if not(all_binary):
+            logger.info(f"WARNING: expecting all binary labels for loss_type={loss_type}. Did not get all binary labels.")
+        # bce over all valid positions
+        bce_all = bce_loss_masked(logits, targets_safe, valid, pos_weight=pos_weight, eps=eps)
+        return alpha*bce_all
 @torch.no_grad()
 def auroc_zeros_vs_ones_from_logits(
       tp, fp:         integer counts per threshold (shape (T,))
     """
     device = logits.device
+    # glm_kpm is 1 where there's a pad, so ~glm_kpm is valid positions
     valid = ~glm_kpm if glm_kpm is not None else torch.ones_like(labels, dtype=torch.bool, device=device)
     keep = valid & ((labels > pos_thresh) | (labels == 0.0))
     if keep.sum() == 0:
       thresholds:     (T,)
     """
     device = logits.device
+    # glm_kpm is 1 where there's a pad, so ~glm_kpm is valid
     valid = ~glm_kpm if glm_kpm is not None else torch.ones_like(labels, dtype=torch.bool, device=device)
     keep = valid & ((labels > pos_thresh) | (labels == 0.0))
     if keep.sum() == 0:

dpacman/classifier/model.py CHANGED Viewed

@@ -10,7 +10,6 @@ from .loss import calculate_loss, auprc_zeros_vs_ones_from_logits, auroc_zeros_v
 set_seed()
 class LocalCNN(nn.Module):
     def __init__(self, dim: int = 256, kernel_size: int = 3):
         super().__init__()
@@ -156,6 +155,7 @@ class BindPredictor(LightningModule):
         dropout: float = 0,
         use_local_cnn_on_glm: bool = True,
         weight_decay: float = 0.01,
     ):
         # Init
         super(BindPredictor, self).__init__()
@@ -222,7 +222,7 @@ class BindPredictor(LightningModule):
         """
         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
         loss = calculate_loss(
-            logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
         )
         self.log(
             "train/loss",
@@ -256,7 +256,7 @@ class BindPredictor(LightningModule):
     def validation_step(self, batch, batch_idx):
         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
         loss = calculate_loss(
-            logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
         )
         self.log(
             "val/loss",
@@ -287,7 +287,7 @@ class BindPredictor(LightningModule):
     def test_step(self, batch, batch_idx):
         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
         loss = calculate_loss(
-            logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
         )
         self.log(
             "test/loss", loss, on_step=False, on_epoch=True, batch_size=logits.size(0)
@@ -307,8 +307,20 @@ class BindPredictor(LightningModule):
         self.log("test/auroc_0v1",
                 auc if torch.isfinite(auc) else torch.tensor(0.0, device=auc.device),
                 on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
         return loss
     def on_before_optimizer_step(self, optimizer):
         # Compute global L2 norm of all parameter gradients (ignores None grads)
         grads = []

 set_seed()
 class LocalCNN(nn.Module):
     def __init__(self, dim: int = 256, kernel_size: int = 3):
         super().__init__()
         dropout: float = 0,
         use_local_cnn_on_glm: bool = True,
         weight_decay: float = 0.01,
+        loss_type = "mixed"
     ):
         # Init
         super(BindPredictor, self).__init__()
         """
         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
         loss = calculate_loss(
+            logits, batch["labels"], batch["binder_kpm"], batch["glm_kpm"], alpha=self.hparams.alpha, gamma=self.hparams.gamma, loss_type=self.hparams.loss_type
         )
         self.log(
             "train/loss",
     def validation_step(self, batch, batch_idx):
         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
         loss = calculate_loss(
+            logits, batch["labels"], batch["binder_kpm"], batch["glm_kpm"], alpha=self.hparams.alpha, gamma=self.hparams.gamma, loss_type=self.hparams.loss_type
         )
         self.log(
             "val/loss",
     def test_step(self, batch, batch_idx):
         logits = self.forward(batch["binder_emb"], batch["glm_emb"], batch["binder_kpm"], batch["glm_kpm"])
         loss = calculate_loss(
+            logits, batch["labels"], batch["binder_kpm"], batch["glm_kpm"], alpha=self.hparams.alpha, gamma=self.hparams.gamma, loss_type=self.hparams.loss_type
         )
         self.log(
             "test/loss", loss, on_step=False, on_epoch=True, batch_size=logits.size(0)
         self.log("test/auroc_0v1",
                 auc if torch.isfinite(auc) else torch.tensor(0.0, device=auc.device),
                 on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
         return loss
+    def predict_step(self, batch, batch_idx, dataloader_idx=0):
+        logits = self.forward(batch["binder_emb"], batch["glm_emb"],
+                            batch["binder_kpm"], batch["glm_kpm"]).squeeze(-1)  # (B,L)
+        valid = ~batch["glm_kpm"]   # (B,L)
+        return {
+            "ids": batch["ID"],                           # list[str]
+            "logits": logits.detach().cpu(),              # (B,Lmax) padded
+            "valid": valid.detach().cpu(),                # (B,Lmax) booleans
+            "labels": batch["labels"].detach().cpu(),     # (B,Lmax) padded
+        }
     def on_before_optimizer_step(self, optimizer):
         # Compute global L2 norm of all parameter gradients (ignores None grads)
         grads = []

dpacman/data_modules/pair.py CHANGED Viewed

@@ -157,7 +157,7 @@ def make_length_batches(
 # ---- dataset ---------------------------------------------------------
 class PairDataset(Dataset):
     def __init__(
-        self, dataset: pd.DataFrame, norm_value: int = 1333, round_to: int = 4
     ):
         """
         Args:
@@ -165,21 +165,29 @@ class PairDataset(Dataset):
             - norm_value: max score, which we'll use to divide all the integer scores in "scores"
             - round_to: how many decimal places for the numerical score values
         """
-        self.dataset = self._load_and_normalize(dataset, norm_value, round_to)
-        self.norm_value = (
-            norm_value  # what to divide everything in labels by to make it a float
-        )
-    def _load_and_normalize(self, dataset, norm_value: int, round_to: int):
         """
         Labels come in looking like "0,0,0,100,100,133,133,100,100,0,0,"
         This method turns the labels from strings into floats out to 4 decimal places
         """
         # split string into list of strings
-        dataset["scores"] = dataset["scores"].apply(lambda x: x.split(","))
         # turn list of strings into list of normalized, rounded floats
-        dataset["scores"] = dataset["scores"].apply(
-            lambda x: [round(int(y) / norm_value, round_to) for y in x]
         )
         # convert to records for ease of loading
@@ -212,6 +220,9 @@ class PairDataModule(LightningDataModule):
         debug_run: bool = False,
         pin_memory: bool = False,
         shuffle_train_batch_order: bool = True,
     ):
         super().__init__()
         self.save_hyperparameters()
@@ -221,6 +232,9 @@ class PairDataModule(LightningDataModule):
         self.train_data_file = train_file
         self.val_data_file = val_file
         self.test_data_file = test_file
         # Initialize hyperparameters like batch size
         self.batch_size = batch_size
@@ -232,10 +246,12 @@ class PairDataModule(LightningDataModule):
         self.collate = ShelfCollator(
             tr_shelf_path=str(tr_shelf_path),
             dna_shelf_path=str(dna_shelf_path),
-            tr_key="tr_sequence",
-            dna_key="dna_sequence",
             dtype=torch.float32,
-            pad_value=0.0,
         )
         self.drop_last = False  # or True, your choice
         self.shuffle_batch_order = shuffle_train_batch_order  # False keep batches deterministic per epoch; set True if you want to shuffle batch order
@@ -247,11 +263,13 @@ class PairDataModule(LightningDataModule):
         """
         Load and unpack an input csv whose columns are binder_path,glm_path,label
         """
-        df = pd.read_csv(file_path)
-        if lim is not None:
-            df = df[:lim].reset_index(drop=True)
-        return df[["ID", "dna_sequence", "tr_sequence", "scores"]]
     def setup(self, stage: str | None = None):
         lim = 5 if self.debug_run else None
@@ -260,7 +278,7 @@ class PairDataModule(LightningDataModule):
         if stage in (None, "fit"):
             if not hasattr(self, "train_dataset"):
                 train_df = self.load_file(self.train_data_file, lim=lim)
-                self.train_dataset = PairDataset(train_df)
                 self.train_batches = make_length_batches(
                     dataset_records=self.train_dataset.dataset,
                     tr_shelf_path=str(self.hparams.tr_shelf_path),
@@ -276,7 +294,7 @@ class PairDataModule(LightningDataModule):
             if not hasattr(self, "val_dataset"):
                 val_df = self.load_file(self.val_data_file, lim=lim)
-                self.val_dataset = PairDataset(val_df)
                 self.val_batches = make_length_batches(
                     dataset_records=self.val_dataset.dataset,
                     tr_shelf_path=str(self.hparams.tr_shelf_path),
@@ -291,7 +309,7 @@ class PairDataModule(LightningDataModule):
         if stage in (None, "validate"):
             if not hasattr(self, "val_dataset"):
                 val_df = self.load_file(self.val_data_file, lim=lim)
-                self.val_dataset = PairDataset(val_df)
                 self.val_batches = make_length_batches(
                     dataset_records=self.val_dataset.dataset,
                     tr_shelf_path=str(self.hparams.tr_shelf_path),
@@ -306,7 +324,7 @@ class PairDataModule(LightningDataModule):
         if stage in (None, "test"):
             if not hasattr(self, "test_dataset"):
                 test_df = self.load_file(self.test_data_file, lim=lim)
-                self.test_dataset = PairDataset(test_df)
                 self.test_batches = make_length_batches(
                     dataset_records=self.test_dataset.dataset,
                     tr_shelf_path=str(self.hparams.tr_shelf_path),
@@ -346,6 +364,17 @@ class PairDataModule(LightningDataModule):
             persistent_workers=(self.num_workers > 0),
             pin_memory=self.hparams.pin_memory,
         )
 class ShelfCollator:
@@ -373,13 +402,17 @@ class ShelfCollator:
         dna_key: str = "dna_sequence",
         dtype: torch.dtype = torch.float32,
         pad_value: float = -1.0,
     ):
         self.tr_path = tr_shelf_path
         self.dna_path = dna_shelf_path
         self.tr_key = tr_key
         self.dna_key = dna_key
         self.dtype = dtype
         self.pad_value = pad_value
         # opened lazily per worker:
         self._tr_db = None
@@ -400,7 +433,7 @@ class ShelfCollator:
         ids = [b.get("ID", None) for b in batch]
         tr_seqs = [b[self.tr_key] for b in batch]
         dna_seqs = [b[self.dna_key] for b in batch]
-        scores_list = [b["scores"] for b in batch]
         # 1) Fetch embeddings lazily from shelves
         binder_list = []
@@ -438,10 +471,10 @@ class ShelfCollator:
         glm_emb = pad_sequence(
             glm_list, batch_first=True, padding_value=self.pad_value
         )  # [B, Lg_max, Dg]
         binder_lens = torch.as_tensor(binder_lens, dtype=torch.int64)
         glm_lens = torch.as_tensor(glm_lens, dtype=torch.int64)
         binder_mask = torch.arange(binder_emb.size(1)).unsqueeze(
             0
         ) < binder_lens.unsqueeze(
@@ -460,6 +493,24 @@ class ShelfCollator:
         labels = pad_sequence(
             labels_list, batch_first=True, padding_value=self.pad_value
         )  # [B, Lg_max]
         return {
             "binder_emb": binder_emb,  # [B, Lb_max, Db]
@@ -474,32 +525,6 @@ class ShelfCollator:
             "dna_sequence": dna_seqs,
         }
-def collate_fn(batch, tr_shelf_path, dna_shelf_path):
-    Bs = [b.shape[0] for b, _, _ in batch]
-    Gs = [g.shape[0] for _, g, _ in batch]
-    maxB, maxG = max(Bs), max(Gs)
-    def pad_seq(x, L):
-        if x.shape[0] < L:
-            pad = torch.zeros(
-                (L - x.shape[0], x.shape[1]), dtype=x.dtype, device=x.device
-            )
-            return torch.cat([x, pad], dim=0)
-        return x
-    def pad_t(y, L):
-        if y.shape[0] < L:
-            pad = torch.zeros((L - y.shape[0],), dtype=y.dtype, device=y.device)
-            return torch.cat([y, pad], dim=0)
-        return y
-    b_stack = torch.stack([pad_seq(b, maxB) for b, _, _ in batch])
-    g_stack = torch.stack([pad_seq(g, maxG) for _, g, _ in batch])
-    t_stack = torch.stack([pad_t(t, maxG) for *_, t in batch])
-    return b_stack, g_stack, t_stack
 # ------------------------ Helpers for main method debugging only ------------------------------------------#
 def _peek_batches(dl, n_batches: int = 2, tag: str = "train"):
     logger.info(f"\n=== Peek {n_batches} batch(es) from {tag} loader ===")
@@ -519,13 +544,13 @@ def _peek_batches(dl, n_batches: int = 2, tag: str = "train"):
         logger.info(f"  glm_mask  true count: {gm.sum().item()} / {gm.numel()}")
         logger.info(f"  glm_mask: {tuple(gm.shape)}  dtype={gm.dtype}")
         logger.info(
-            f"  labels:     {tuple(y.shape)}  min={y.min().item():.4f} max={y.max().item():.4f}"
         )
         logger.info(f"  IDs (first 5): {ids[:5]}")
         if i + 1 >= n_batches:
             break
 def _warn_on_paths(args):
     import os
@@ -577,7 +602,7 @@ def main():
     parser.add_argument("--batch_size", type=int, default=4)
     parser.add_argument("--num_workers", type=int, default=4)
     parser.add_argument(
-        "--debug_run", action="store_true", help="limit dataset to a few rows"
     )
     parser.add_argument(
         "--n_batches", type=int, default=2, help="how many batches to print per split"

 # ---- dataset ---------------------------------------------------------
 class PairDataset(Dataset):
     def __init__(
+        self, dataset: pd.DataFrame, norm_value: int = 1333, round_to: int = 4, score_col: str = "scores", target_col: str = "dna_sequence", binder_col: str = "tr_sequence"
     ):
         """
+        Store the column configuration, then parse and normalize the score column of `dataset`.
+
         Args:
+            - dataset: input table; handed to _load_and_normalize, which rewrites its score column
             - norm_value: max score, which we'll use to divide all the integer scores in "scores"
             - round_to: how many decimal places for the numerical score values
+            - score_col: name of the column holding the comma-separated score string
+            - target_col: name of the DNA sequence column
+            - binder_col: name of the binder sequence column
         """
+        # flipped to True by _load_and_normalize when the score column is missing and placeholder scores are generated
+        self.fake_scores=False
+        self.score_col = score_col
+        # NOTE(review): _load_and_normalize sizes placeholder scores from the literal "dna_sequence" column,
+        # not from target_col -- confirm that is intended when target_col is overridden
+        self.target_col = target_col
+        self.binder_col = binder_col
+        self.norm_value = norm_value
+        self.round_to = round_to
+        # must run last: _load_and_normalize reads score_col and norm_value/round_to set above
+        self.dataset = self._load_and_normalize(dataset)
+    def _load_and_normalize(self, dataset):
         """
         Labels come in looking like "0,0,0,100,100,133,133,100,100,0,0,"
         This method turns the labels from strings into floats out to 4 decimal places
         """
+        if self.score_col not in dataset.columns:
+            logger.info(f"Scores not provided. Adding placeholder scores where all positions are considered binding")
+            dataset[self.score_col] = dataset["dna_sequence"].str.len()
+            dataset[self.score_col] = dataset[self.score_col].apply(lambda x: ",".join([str(self.norm_value)]*x))
+            self.fake_scores=True
         # split string into list of strings
+        dataset[self.score_col] = dataset[self.score_col].apply(lambda x: x.split(","))
         # turn list of strings into list of normalized, rounded floats
+        dataset[self.score_col] = dataset[self.score_col].apply(
+            lambda x: [round(int(y) / self.norm_value, self.round_to) for y in x]
         )
         # convert to records for ease of loading
         debug_run: bool = False,
         pin_memory: bool = False,
         shuffle_train_batch_order: bool = True,
+        score_col: str = "scores",
+        target_col: str = "dna_sequence",
+        binder_col: str = "tr_sequence"
     ):
         super().__init__()
         self.save_hyperparameters()
         self.train_data_file = train_file
         self.val_data_file = val_file
         self.test_data_file = test_file
+        self.target_col = target_col
+        self.binder_col = binder_col
+        self.score_col = score_col
         # Initialize hyperparameters like batch size
         self.batch_size = batch_size
         self.collate = ShelfCollator(
             tr_shelf_path=str(tr_shelf_path),
             dna_shelf_path=str(dna_shelf_path),
+            tr_key=self.binder_col,
+            dna_key=self.target_col,
             dtype=torch.float32,
+            pad_value=-1.0,
+            debug_run =self.debug_run,
+            score_col = self.score_col
         )
         self.drop_last = False  # or True, your choice
         self.shuffle_batch_order = shuffle_train_batch_order  # False keep batches deterministic per epoch; set True if you want to shuffle batch order
         """
         Load and unpack an input csv whose columns are binder_path,glm_path,label
         """
+        try:
+            df = pd.read_csv(file_path)
+            if lim is not None:
+                df = df[:lim].reset_index(drop=True)
+            return df[["ID", "dna_sequence", "tr_sequence", "scores"]]
+        except:
+            raise Exception(f"{file_path} is not a valid file")
     def setup(self, stage: str | None = None):
         lim = 5 if self.debug_run else None
         if stage in (None, "fit"):
             if not hasattr(self, "train_dataset"):
                 train_df = self.load_file(self.train_data_file, lim=lim)
+                self.train_dataset = PairDataset(train_df, score_col = self.score_col, target_col = self.target_col, binder_col = self.binder_col)
                 self.train_batches = make_length_batches(
                     dataset_records=self.train_dataset.dataset,
                     tr_shelf_path=str(self.hparams.tr_shelf_path),
             if not hasattr(self, "val_dataset"):
                 val_df = self.load_file(self.val_data_file, lim=lim)
+                self.val_dataset = PairDataset(val_df, score_col = self.score_col, target_col = self.target_col, binder_col = self.binder_col)
                 self.val_batches = make_length_batches(
                     dataset_records=self.val_dataset.dataset,
                     tr_shelf_path=str(self.hparams.tr_shelf_path),
         if stage in (None, "validate"):
             if not hasattr(self, "val_dataset"):
                 val_df = self.load_file(self.val_data_file, lim=lim)
+                self.val_dataset = PairDataset(val_df, score_col = self.score_col, target_col = self.target_col, binder_col = self.binder_col)
                 self.val_batches = make_length_batches(
                     dataset_records=self.val_dataset.dataset,
                     tr_shelf_path=str(self.hparams.tr_shelf_path),
         if stage in (None, "test"):
             if not hasattr(self, "test_dataset"):
                 test_df = self.load_file(self.test_data_file, lim=lim)
+                self.test_dataset = PairDataset(test_df, score_col = self.score_col, target_col = self.target_col, binder_col = self.binder_col)
                 self.test_batches = make_length_batches(
                     dataset_records=self.test_dataset.dataset,
                     tr_shelf_path=str(self.hparams.tr_shelf_path),
             persistent_workers=(self.num_workers > 0),
             pin_memory=self.hparams.pin_memory,
         )
+    def predict_dataloader(self):
+        """Return the DataLoader used for prediction.
+
+        Prediction iterates the test split: it uses the test dataset and the
+        test batch sampler together with the shared collate function.
+        """
+        predict_source = self.test_dataset
+        worker_count = self.num_workers
+        loader_options = {
+            "batch_sampler": self.test_batch_sampler,
+            "collate_fn": self.collate,
+            "num_workers": worker_count,
+            # workers are only kept alive between epochs when there are any
+            "persistent_workers": worker_count > 0,
+            "pin_memory": self.hparams.pin_memory,
+        }
+        return DataLoader(predict_source, **loader_options)
 class ShelfCollator:
         dna_key: str = "dna_sequence",
         dtype: torch.dtype = torch.float32,
         pad_value: float = -1.0,
+        debug_run: bool = False,
+        score_col = "scores"
     ):
         self.tr_path = tr_shelf_path
         self.dna_path = dna_shelf_path
+        self.score_col = score_col
         self.tr_key = tr_key
         self.dna_key = dna_key
         self.dtype = dtype
         self.pad_value = pad_value
+        self.debug_run = debug_run
         # opened lazily per worker:
         self._tr_db = None
         ids = [b.get("ID", None) for b in batch]
         tr_seqs = [b[self.tr_key] for b in batch]
         dna_seqs = [b[self.dna_key] for b in batch]
+        scores_list = [b[self.score_col] for b in batch]
         # 1) Fetch embeddings lazily from shelves
         binder_list = []
         glm_emb = pad_sequence(
             glm_list, batch_first=True, padding_value=self.pad_value
         )  # [B, Lg_max, Dg]
         binder_lens = torch.as_tensor(binder_lens, dtype=torch.int64)
         glm_lens = torch.as_tensor(glm_lens, dtype=torch.int64)
         binder_mask = torch.arange(binder_emb.size(1)).unsqueeze(
             0
         ) < binder_lens.unsqueeze(
         labels = pad_sequence(
             labels_list, batch_first=True, padding_value=self.pad_value
         )  # [B, Lg_max]
+        if self.debug_run:
+            max_binder_len = max(binder_lens)
+            max_glm_len = max(glm_lens)
+            binder_expected_false = sum(max_binder_len-binder_lens).item()
+            binder_expected_true = sum(binder_lens)
+            binder_expected_total = binder_expected_true + binder_expected_false
+            glm_expected_false = sum(max_glm_len-glm_lens).item()
+            glm_expected_true = sum(glm_lens).item()
+            glm_expected_total = glm_expected_true + glm_expected_false
+            labels_neg1 = sum(sum(labels==-1)).item()
+            expected_labels_neg1 = glm_expected_false
+            logger.info(f"  Max binder length: {max_binder_len}, original lengths: {binder_lens}, ultimate dimensions: {binder_emb.shape}")
+            logger.info(f"  Binder expect: true/total = {binder_expected_true}/{binder_expected_total}")
+            logger.info(f"  Max DNA length: {max_glm_len}, original lengths: {glm_lens}, ultimate dimensions: {glm_emb.shape}")
+            logger.info(f"  DNA expect: true/total = {glm_expected_true}/{glm_expected_total}")
+            logger.info(f"  Labels expect -1: -1/total = {expected_labels_neg1}/{glm_expected_total}. True: {labels_neg1}/{labels.numel()}")
         return {
             "binder_emb": binder_emb,  # [B, Lb_max, Db]
             "dna_sequence": dna_seqs,
         }
 # ------------------------ Helpers for main method debugging only ------------------------------------------#
 def _peek_batches(dl, n_batches: int = 2, tag: str = "train"):
     logger.info(f"\n=== Peek {n_batches} batch(es) from {tag} loader ===")
         logger.info(f"  glm_mask  true count: {gm.sum().item()} / {gm.numel()}")
         logger.info(f"  glm_mask: {tuple(gm.shape)}  dtype={gm.dtype}")
         logger.info(
+            f"  labels:     {tuple(y.shape)}  min={y.min().item():.4f} max={y.max().item():.4f}, total -1 = {sum(sum(y==-1)).item()}"
         )
         logger.info(f"  IDs (first 5): {ids[:5]}")
+        # should make sure that the number of labels that are -1 equals the number of padding tokens
         if i + 1 >= n_batches:
             break
 def _warn_on_paths(args):
     import os
     parser.add_argument("--batch_size", type=int, default=4)
     parser.add_argument("--num_workers", type=int, default=4)
     parser.add_argument(
+        "--debug_run", default=True, action="store_true", help="limit dataset to a few rows"
     )
     parser.add_argument(
         "--n_batches", type=int, default=2, help="how many batches to print per split"

dpacman/scripts/eval.py CHANGED Viewed

	@@ -0,0 +1,197 @@

+"""
+Script for using the model just for inference.
+"""
+from typing import Any, Dict, List, Optional, Tuple
+import hydra
+from hydra.core.hydra_config import HydraConfig
+import torch
+import rootutils
+import lightning as L
+from lightning import Callback, LightningDataModule, LightningModule, Trainer
+from lightning.pytorch.loggers import Logger
+from omegaconf import DictConfig
+from pathlib import Path
+import pandas as pd
+from dpacman.classifier.loss import calculate_loss, auprc_zeros_vs_ones_from_logits, auroc_zeros_vs_ones_from_logits
+import pickle
+root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
+from dpacman.utils import (
+    RankedLogger,
+    extras,
+    get_metric_value,
+    instantiate_callbacks,
+    instantiate_loggers,
+    log_hyperparameters,
+    task_wrapper,
+)
+log = RankedLogger(__name__, rank_zero_only=True)
+def h100_settings():
+    """Enable TensorFloat-32 (TF32) math for float32 matmuls and cuDNN ops.
+
+    Gives a big speedup with a tiny accuracy tradeoff on float32 work.
+    """
+    # "high" allows TF32 for float32 matmuls; "medium" would be faster still
+    torch.set_float32_matmul_precision("high")
+    # older PyTorch toggles for the same thing, set explicitly as well (optional)
+    for backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
+        backend.allow_tf32 = True
+def flatten_preds(pred_batches):
+    """
+    Flatten per-batch prediction dicts into one record per example.
+
+    Each element of ``pred_batches`` must provide:
+            "ids":    the batch IDs, one per row            # list[str] or list
+            "logits": logits.detach().cpu()                 # (B, Lmax) padded
+            "valid":  valid.detach().cpu()                  # (B, Lmax) booleans
+            "labels": labels, indexed the same way as logits
+
+    Padding is stripped by keeping the first ``valid[i].sum()`` positions of each
+    row, so the valid positions are assumed to form a prefix of the row.
+
+    :param pred_batches: iterable of batch dicts as described above. ``None`` or an
+        empty iterable yields an empty list.
+    :return: list of ``{"ID", "logits", "labels"}`` dicts; the two arrays are NumPy
+        arrays trimmed to the example's valid length.
+    """
+    out = []
+    # a caller may pass None when no predictions came back; treat that as "no batches"
+    for b in pred_batches or []:
+        ids, logits, valid, labels = b["ids"], b["logits"], b["valid"], b["labels"]
+        for i, id_ in enumerate(ids):
+            # called n_valid (not L) so the module-level `lightning as L` alias is not shadowed
+            n_valid = int(valid[i].sum().item())  # strip padding
+            out.append(
+                {
+                    "ID": id_,
+                    "logits": logits[i, :n_valid].numpy(),
+                    "labels": labels[i, :n_valid].numpy(),
+                }
+            )
+    return out
+@task_wrapper
+def predict(cfg: DictConfig) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    """Evaluates a checkpointed model on the datamodule's test set and saves its predictions.
+
+    When ``cfg.test`` is truthy this runs ``trainer.test`` and ``trainer.predict`` from
+    ``cfg.ckpt_path``, pickles the unpadded per-example predictions to ``predictions.pkl`` in the
+    Hydra run dir and, if the test set carries real (non-placeholder) scores, writes per-example
+    loss / AUPRC / AUROC to ``summary.csv``.
+
+    This method is wrapped in optional @task_wrapper decorator, that controls the behavior during
+    failure. Useful for multiruns, saving info about the crash, etc.
+
+    :param cfg: DictConfig configuration composed by Hydra.
+    :return: Tuple[dict, dict] with metrics and dict with all instantiated objects. The metrics
+        dict is empty when testing was requested without a checkpoint path.
+    """
+    # set seed for random number generators in pytorch, numpy and python.random
+    if cfg.get("seed"):
+        L.seed_everything(cfg.seed, workers=True)
+    log.info(f"Instantiating datamodule <{cfg.data_module._target_}>")
+    datamodule: LightningDataModule = hydra.utils.instantiate(cfg.data_module)
+    log.info(f"Instantiating model <{cfg.model._target_}>")
+    model: LightningModule = hydra.utils.instantiate(cfg.model)
+    log.info("Instantiating callbacks...")
+    callbacks: List[Callback] = instantiate_callbacks(cfg.get("callbacks"))
+    log.info("Instantiating loggers...")
+    logger: List[Logger] = instantiate_loggers(cfg.get("logger"))
+    log.info(f"Instantiating trainer <{cfg.trainer._target_}>")
+    trainer: Trainer = hydra.utils.instantiate(
+        cfg.trainer, callbacks=callbacks, logger=logger
+    )
+    object_dict = {
+        "cfg": cfg,
+        "datamodule": datamodule,
+        "model": model,
+        "callbacks": callbacks,
+        "logger": logger,
+        "trainer": trainer,
+    }
+    if logger:
+        log.info("Logging hyperparameters!")
+        log_hyperparameters(object_dict)
+    if cfg.get("test"):
+        log.info("Starting testing!")
+        # .get() so that a null or absent ckpt_path is handled like an empty one
+        ckpt_path = cfg.get("ckpt_path")
+        if not ckpt_path:
+            log.warning("No ckpt path was passed! Cannot continue")
+            # keep the (metrics, objects) shape: the caller unpacks the result into two names
+            return {}, object_dict
+        trainer.test(model=model, datamodule=datamodule, ckpt_path=ckpt_path)
+        pred_batches = trainer.predict(model, datamodule=datamodule, ckpt_path=ckpt_path, return_predictions=True)
+        out = flatten_preds(pred_batches)
+        # predictions go into the Hydra run dir
+        output_dir = Path(HydraConfig.get().run.dir)
+        save_path = output_dir / "predictions.pkl"
+        with open(save_path, "wb") as f:
+            pickle.dump(out, f)
+        # iterate through out and recalculate AUC, AUPRC, loss - only if there are labels
+        # only if the user actually passed scores; otherwise don't bother
+        # NOTE(review): the metrics below are added after predictions.pkl was written, so the
+        # curves (precision/recall/tpr/fpr/thresholds) are not persisted; only summary.csv is
+        if not datamodule.test_dataset.fake_scores:
+            for d in out:
+                loss = calculate_loss(
+                    torch.tensor(d["logits"]), torch.tensor(d["labels"]), None, None, alpha=cfg.model.alpha, gamma=cfg.model.gamma
+                )
+                # ---- AUPRC and AUROC on labels in {0, >0.99} only ----
+                # the third argument is an all-False boolean mask shaped like the labels
+                ap, _, _, precision, recall, ap_thresholds = auprc_zeros_vs_ones_from_logits(
+                    torch.tensor(d["logits"]), torch.tensor(d["labels"]), torch.zeros(d["labels"].shape, dtype=torch.bool), pos_thresh=0.99
+                )
+                # the n_pos / n_neg that get reported are the ones returned by the AUROC call
+                auc, n_pos, n_neg, tpr, fpr, auc_thresholds, _tp, _fp = auroc_zeros_vs_ones_from_logits(
+                    torch.tensor(d["logits"]), torch.tensor(d["labels"]), torch.zeros(d["labels"].shape, dtype=torch.bool), pos_thresh=0.99
+                )
+                d["loss"] = loss.item() if loss.numel() > 0 else None
+                d["auprc"] = ap.item() if ap.numel() > 0 else None
+                d["auroc"] = auc.item() if auc.numel() > 0 else None
+                d["n_pos"] = n_pos
+                d["n_neg"] = n_neg
+                d["precision"] = precision.numpy() if precision.numel() > 0 else None
+                d["recall"] = recall.numpy() if recall.numel() > 0 else None
+                d["auprc_thresholds"] = ap_thresholds.numpy() if ap_thresholds.numel() > 0 else None
+                d["auc_thresholds"] = auc_thresholds.numpy() if auc_thresholds.numel() > 0 else None
+                d["tpr"] = tpr
+                d["fpr"] = fpr
+            # Summary CSV (no big arrays inside)
+            summary_keys = ("loss", "auprc", "auroc", "n_pos", "n_neg")
+            summary_rows = [
+                {"ID": d["ID"], **{key: d.get(key) for key in summary_keys}} for d in out
+            ]
+            summary_path = output_dir / "summary.csv"
+            pd.DataFrame(summary_rows).to_csv(summary_path, index=False)
+            log.info(f"Saved eval/predict results to {summary_path}")
+    test_metrics = trainer.callback_metrics
+    # copy the trainer's callback metrics into a plain dict
+    metric_dict = {**test_metrics}
+    return metric_dict, object_dict
+@hydra.main(
+    version_base="1.3", config_path=str(root / "configs"), config_name="eval.yaml"
+)
+def main(cfg: DictConfig) -> Optional[float]:
+    """Main entry point for evaluation.
+
+    :param cfg: DictConfig configuration composed by Hydra.
+    :return: The value ``get_metric_value`` finds for ``cfg.optimized_metric``
+        (presumably None when no metric is configured -- confirm in dpacman.utils).
+    """
+    # apply extra utilities
+    # (e.g. ask for tags if none are provided in cfg, print cfg tree, etc.)
+    extras(cfg)
+    h100_settings()  # enable the TF32 settings for faster evaluation
+    # evaluate the model and collect its metrics
+    metric_dict, _ = predict(cfg)
+    # safely retrieve metric value for hydra-based hyperparameter optimization
+    metric_value = get_metric_value(
+        metric_dict=metric_dict, metric_name=cfg.get("optimized_metric")
+    )
+    # return optimized metric
+    return metric_value
+if __name__ == "__main__":
+    main()

dpacman/scripts/run_eval.sh ADDED Viewed

	@@ -0,0 +1,30 @@

+#!/bin/bash
+# Launch a detached evaluation run of the classifier (scripts.eval) and record its PID.
+# Manually specify values used in the config
+main_task="eval"
+model_type="classifier"
+# every launch gets its own timestamped run directory
+timestamp=$(date "+%Y-%m-%d_%H-%M-%S")
+run_dir="$HOME/DPACMAN/logs/${main_task}/${model_type}/runs/${timestamp}"
+mkdir -p "$run_dir"
+# prompt for the W&B key (input hidden) only when it is not already in the environment
+if [ -z "$WANDB_API_KEY" ]; then
+    read -s -p "Enter your WANDB API key: " wandb_key
+    echo
+    export WANDB_API_KEY="$wandb_key"
+fi
+# run in the background on GPU 3, immune to hangups; stdout and stderr both go to run.log
+# NOTE(review): ckpt_path below is a user-specific absolute path, unlike run_dir which uses $HOME
+CUDA_VISIBLE_DEVICES=3 nohup python -u -m scripts.eval \
+  hydra.run.dir="${run_dir}" \
+  data_module.test_file="data_files/processed/splits/by_dna/test.csv" \
+  data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
+  data_module.dna_shelf_path="data_files/processed/embeddings/fimo_hits_only/peaks_caduceus.shelf" \
+  data_module.batch_size=16 \
+  model.glm_input_dim=256 \
+  model.compressed_dim=256 \
+  model.hidden_dim=256 \
+  ckpt_path="/home/a03-svincoff/DPACMAN/logs/train/classifier/runs/2025-08-27_18-52-25/checkpoints/epoch_009.ckpt" \
+  model.lr=1e-5 \
+  > "${run_dir}/run.log" 2>&1 &
+# store the PID of the background job next to its log
+echo $! > "${run_dir}/pid.txt"

dpacman/scripts/run_train.sh CHANGED Viewed

@@ -22,16 +22,18 @@ CUDA_VISIBLE_DEVICES=0,1 nohup python -u -m scripts.train \
   +trainer.gradient_clip_algorithm="norm" \
   hydra.run.dir="${run_dir}" \
   trainer.devices=2 \
-  trainer.max_epochs=20 \
   data_module.train_file="data_files/processed/splits/by_dna/train.csv" \
   data_module.val_file="data_files/processed/splits/by_dna/val.csv" \
   data_module.test_file="data_files/processed/splits/by_dna/test.csv" \
   data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
   data_module.dna_shelf_path="data_files/processed/embeddings/fimo_hits_only/peaks_caduceus.shelf" \
   data_module.batch_size=16 \
   model.glm_input_dim=256 \
   model.compressed_dim=256 \
-  model.hidden_dim=128 \
   model.lr=1e-5 \
   > "${run_dir}/run.log" 2>&1 &

   +trainer.gradient_clip_algorithm="norm" \
   hydra.run.dir="${run_dir}" \
   trainer.devices=2 \
+  trainer.max_epochs=10 \
   data_module.train_file="data_files/processed/splits/by_dna/train.csv" \
   data_module.val_file="data_files/processed/splits/by_dna/val.csv" \
   data_module.test_file="data_files/processed/splits/by_dna/test.csv" \
   data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
   data_module.dna_shelf_path="data_files/processed/embeddings/fimo_hits_only/peaks_caduceus.shelf" \
   data_module.batch_size=16 \
+  data_module.score_col="binary_scores" \
+  model.loss_type="binary" \
   model.glm_input_dim=256 \
   model.compressed_dim=256 \
+  model.hidden_dim=256 \
   model.lr=1e-5 \
   > "${run_dir}/run.log" 2>&1 &

dpacman/scripts/run_train_baseline.sh CHANGED Viewed

@@ -14,7 +14,7 @@ if [ -z "$WANDB_API_KEY" ]; then
     export WANDB_API_KEY="$wandb_key"
 fi
-CUDA_VISIBLE_DEVICES=0,1 nohup python -u -m scripts.train \
   +trainer.strategy=ddp \
   +trainer.use_distributed_sampler="false" \
   +trainer.detect_anomaly="false" \
@@ -29,6 +29,8 @@ CUDA_VISIBLE_DEVICES=0,1 nohup python -u -m scripts.train \
   data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
   data_module.dna_shelf_path="data_files/processed/embeddings/fimo_hits_only/peaks_caduceus.shelf" \
   data_module.batch_size=16 \
   model=baseline \
   model.glm_input_dim=256 \
   model.compressed_dim=256 \

     export WANDB_API_KEY="$wandb_key"
 fi
+CUDA_VISIBLE_DEVICES=2,3 nohup python -u -m scripts.train \
   +trainer.strategy=ddp \
   +trainer.use_distributed_sampler="false" \
   +trainer.detect_anomaly="false" \
   data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
   data_module.dna_shelf_path="data_files/processed/embeddings/fimo_hits_only/peaks_caduceus.shelf" \
   data_module.batch_size=16 \
+  data_module.score_col="binary_scores" \
+  model.loss_type="binary" \
   model=baseline \
   model.glm_input_dim=256 \
   model.compressed_dim=256 \