fixes

Changed files:
- configs/logger/wandb.yaml (+1 -0)
- dpacman/classifier/loss.py (+54 -0)
- dpacman/classifier/model.py (+16 -2)
- dpacman/scripts/run_train.sh (+4 -3)
configs/logger/wandb.yaml

@@ -8,6 +8,7 @@ wandb:
   id: null # pass correct id to resume experiment!
   anonymous: null # enable anonymous logging
   project: "dnabind"
+  entity: "sophia-vincoff-team"
   log_model: False # upload lightning ckpts
   prefix: "" # a string to put at the beginning of metric keys
   # entity: "" # set to name of your wandb team
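The added entity key routes runs to the team workspace instead of the default personal namespace. A minimal sketch of how this config typically reaches the logger, assuming the repo builds Lightning's WandbLogger from the Hydra config (the instantiation path itself is not shown in this commit):

# Sketch only: keys mirror the YAML above; extra kwargs such as entity
# are forwarded by WandbLogger to wandb.init.
from lightning.pytorch.loggers import WandbLogger

logger = WandbLogger(
    project="dnabind",
    entity="sophia-vincoff-team",  # team namespace for the runs
    id=None,                       # pass a real run id to resume an experiment
    anonymous=None,
    log_model=False,               # don't upload Lightning checkpoints
    prefix="",
)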
dpacman/classifier/loss.py

@@ -62,6 +62,60 @@ def calculate_loss(
 
     return alpha * bce_nonpeak + gamma * mse_peak
 
+import torch
+
+@torch.no_grad()
+def auprc_zeros_vs_ones_from_logits(
+    logits: torch.Tensor,          # (B, L)
+    labels: torch.Tensor,          # (B, L)
+    glm_kpm: torch.Tensor | None,  # (B, L) True=PAD; pass None if not available
+    pos_thresh: float = 0.99,
+) -> tuple[torch.Tensor, int, int]:
+    """
+    Returns (ap, n_pos, n_neg). AP is Average Precision (area under PR).
+    Uses only positions with labels == 0.0 or > pos_thresh. Ignores PADs via glm_kpm.
+    Computation stays on the same device as logits.
+    """
+    probs = torch.sigmoid(logits)
+
+    # Valid positions: not padded
+    if glm_kpm is not None:
+        valid = ~glm_kpm
+    else:
+        valid = torch.ones_like(labels, dtype=torch.bool, device=labels.device)
+
+    # Keep only exact zeros and near-ones
+    pos = labels > pos_thresh
+    neg = labels == 0.0
+    keep = valid & (pos | neg)
+
+    if keep.sum() == 0:
+        return torch.tensor(float('nan'), device=logits.device), 0, 0
+
+    y = pos[keep].to(probs.dtype)  # 1 for >0.99, 0 for 0.0
+    s = probs[keep].to(probs.dtype)
+
+    n = y.numel()
+    n_pos = int(y.sum().item())
+    n_neg = n - n_pos
+    if n_pos == 0:  # no positives → AP = 0 by convention
+        return torch.tensor(0.0, device=logits.device), 0, n_neg
+
+    # Sort by score descending
+    order = torch.argsort(s, descending=True)
+    y_sorted = y[order]
+
+    # CumTP and precision/recall
+    tp = torch.cumsum(y_sorted, dim=0)
+    ranks = torch.arange(1, n + 1, device=logits.device, dtype=probs.dtype)
+    precision = tp / ranks
+    recall = tp / n_pos
+
+    # AP = sum( precision * Δrecall )
+    recall_prev = torch.cat([torch.zeros(1, device=logits.device, dtype=probs.dtype), recall[:-1]])
+    ap = (precision * (recall - recall_prev)).sum()
+    return ap, n_pos, n_neg
+
 def accuracy_percentage(
     logits,
     targets,
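A quick sanity check of the new metric (illustrative, not part of the commit): labels strictly between 0.0 and pos_thresh are dropped, and padded positions are masked out via glm_kpm.

import torch
from dpacman.classifier.loss import auprc_zeros_vs_ones_from_logits

logits = torch.tensor([[2.0, -1.0, 0.5, 3.0]])         # (B=1, L=4)
labels = torch.tensor([[1.0, 0.0, 0.5, 1.0]])          # the 0.5 is ignored
glm_kpm = torch.tensor([[False, False, False, True]])  # last position is PAD

ap, n_pos, n_neg = auprc_zeros_vs_ones_from_logits(logits, labels, glm_kpm)
print(ap.item(), n_pos, n_neg)  # perfect ranking here → 1.0, 1, 1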
dpacman/classifier/model.py

@@ -6,7 +6,7 @@ import torch
 from torch import nn
 from lightning import LightningModule
 from dpacman.utils.models import set_seed
-from .loss import calculate_loss
+from .loss import calculate_loss, auprc_zeros_vs_ones_from_logits
 
 set_seed()
 
@@ -174,7 +174,7 @@ class BindPredictor(LightningModule):
             [CrossModalBlock(hidden_dim, heads) for _ in range(num_layers)]
         )
 
-        self.ln_out = nn.LayerNorm(hidden_dim)
+        # self.ln_out = nn.LayerNorm(hidden_dim)
         # self.head = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid()) # OLD: returned probabilities
         self.head = nn.Linear(hidden_dim, 1) # NEW: return logits (safe for AMP)
 
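For context (not part of the commit): the "safe for AMP" comment refers to the fact that under CUDA autocast, BCELoss on sigmoid outputs raises an error, while the fused binary_cross_entropy_with_logits is numerically stable and autocast-friendly. A minimal sketch of the intended usage pattern:

import torch
from torch import nn

head = nn.Linear(16, 1)  # mirrors self.head in the diff
x = torch.randn(4, 16)
target = torch.rand(4, 1)

logits = head(x)  # raw scores, no Sigmoid in the module
loss = nn.functional.binary_cross_entropy_with_logits(logits, target)
probs = torch.sigmoid(logits)  # apply sigmoid only where probabilities are needed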
@@ -231,6 +231,20 @@ class BindPredictor(LightningModule):
             prog_bar=True,
             batch_size=logits.size(0),
         )
+
+        # ---- AUPRC on labels in {0, >0.99} only ----
+        if False:
+            ap, n_pos, n_neg = auprc_zeros_vs_ones_from_logits(
+                logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
+            )
+            # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
+            self.log("train/auprc_0v1",
+                     ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
+                     on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
+            # (optional) also log class counts so you can sanity-check balance
+            self.log("train/n_pos_0v1", float(n_pos), on_step=False, on_epoch=True, sync_dist=True)
+            self.log("train/n_neg_0v1", float(n_neg), on_step=False, on_epoch=True, sync_dist=True)
+
         return loss
 
     def validation_step(self, batch, batch_idx):
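Note that the new metric block is committed behind if False:, so it is dead code for now. One way to make it switchable without editing training_step is a constructor flag; a sketch, where the log_auprc flag is hypothetical and not in this repo:

import torch
from lightning import LightningModule

class BindPredictorSketch(LightningModule):
    def __init__(self, log_auprc: bool = False):
        super().__init__()
        self.log_auprc = log_auprc  # could be exposed as +model.log_auprc=true

    def maybe_log_auprc(self, ap: torch.Tensor, batch_size: int) -> None:
        if not self.log_auprc:
            return
        # mirrors the logging call in the diff; sync_dist=True reduces across DDP ranks
        self.log("train/auprc_0v1",
                 ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
                 on_step=False, on_epoch=True, prog_bar=True, sync_dist=True,
                 batch_size=batch_size)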
dpacman/scripts/run_train.sh

@@ -17,11 +17,12 @@ fi
 CUDA_VISIBLE_DEVICES=0,1 nohup python -u -m scripts.train \
     +trainer.strategy=ddp \
     +trainer.use_distributed_sampler="false"\
+    +trainer.detect_anomaly="false"\
     hydra.run.dir="${run_dir}" \
     trainer.devices=2 \
-    data_module.train_file="data_files/processed/splits/by_dna/
-    data_module.val_file="data_files/processed/splits/by_dna/
-    data_module.test_file="data_files/processed/splits/by_dna/
+    data_module.train_file="data_files/processed/splits/by_dna/babytrain.csv" \
+    data_module.val_file="data_files/processed/splits/by_dna/babyval.csv" \
+    data_module.test_file="data_files/processed/splits/by_dna/babytest.csv" \
     data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
     data_module.dna_shelf_path="data_files/processed/embeddings/fimo_hits_only/peaks_caduceus.shelf" \
     model.glm_input_dim=256 \
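For context (not part of the commit): a leading + in a Hydra CLI override appends a key the base config does not define, so these lines assume trainer.strategy, trainer.use_distributed_sampler, and trainer.detect_anomaly are absent from the trainer YAML. Roughly what the new overrides amount to once the Lightning Trainer is built, sketched under the assumption of a stock Trainer instantiation:

from lightning import Trainer

trainer = Trainer(
    devices=2,
    strategy="ddp",                 # +trainer.strategy=ddp
    use_distributed_sampler=False,  # +trainer.use_distributed_sampler="false"
    detect_anomaly=False,           # +trainer.detect_anomaly="false"
)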