svincoff committed
Commit 4c4b1fc · 1 Parent(s): 1d43edf
configs/callbacks/wandb.yaml ADDED
(empty file)
configs/train.yaml CHANGED
@@ -37,7 +37,4 @@ test: True
 ckpt_path: null
 
 # seed for random number generators in pytorch, numpy and python.random
-seed: 42
-
-trainer:
-  max_epochs: 20
+seed: 42
configs/trainer/default.yaml CHANGED
@@ -8,6 +8,9 @@ max_epochs: 10
 accelerator: cpu
 devices: 1
 
+#gradient_clip_val: 1.0
+#gradient_clip_algorithm: "norm"
+
 # mixed precision for extra speed-up
 # precision: 16
 
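For context, these commented-out keys mirror Lightning's built-in gradient clipping. A minimal sketch of the equivalent Trainer call (assuming Lightning 2.x and its lightning.pytorch import path):

    from lightning.pytorch import Trainer

    # "norm" rescales gradients so their global L2 norm is at most
    # gradient_clip_val; "value" would clamp each gradient element instead.
    trainer = Trainer(gradient_clip_val=1.0, gradient_clip_algorithm="norm")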
 
dpacman/classifier/model.py CHANGED
@@ -153,6 +153,7 @@ class BindPredictor(LightningModule):
         lr: float = 1e-4,
         alpha: float = 20,
         gamma: float = 20,
+        dropout: float = 0,
         use_local_cnn_on_glm: bool = True,
         weight_decay: float = 0.01,
     ):
@@ -171,7 +172,7 @@ class BindPredictor(LightningModule):
         self.local_cnn = LocalCNN(hidden_dim) if use_local_cnn_on_glm else nn.Identity()
 
         self.layers = nn.ModuleList(
-            [CrossModalBlock(hidden_dim, heads) for _ in range(num_layers)]
+            [CrossModalBlock(hidden_dim, heads, self.hparams.dropout) for _ in range(num_layers)]
         )
 
         #self.ln_out = nn.LayerNorm(hidden_dim)
@@ -233,17 +234,16 @@ class BindPredictor(LightningModule):
         )
 
         # ---- AUPRC on labels in {0, >0.99} only ----
-        if False:
-            ap, n_pos, n_neg = auprc_zeros_vs_ones_from_logits(
-                logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
-            )
-            # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
-            self.log("train/auprc_0v1",
-                     ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
-                     on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
-            # (optional) also log class counts so you can sanity-check balance
-            self.log("train/n_pos_0v1", float(n_pos), on_step=False, on_epoch=True, sync_dist=True)
-            self.log("train/n_neg_0v1", float(n_neg), on_step=False, on_epoch=True, sync_dist=True)
+        ap, n_pos, n_neg = auprc_zeros_vs_ones_from_logits(
+            logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
+        )
+        # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
+        self.log("train/auprc_0v1",
+                 ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
+                 on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
+        # (optional) also log class counts so you can sanity-check balance
+        self.log("train/n_pos_0v1", float(n_pos), on_step=False, on_epoch=True, sync_dist=True)
+        self.log("train/n_neg_0v1", float(n_neg), on_step=False, on_epoch=True, sync_dist=True)
 
         return loss
 
@@ -271,6 +271,24 @@ class BindPredictor(LightningModule):
             "test/loss", loss, on_step=False, on_epoch=True, batch_size=logits.size(0)
         )
         return loss
+
+    def on_before_optimizer_step(self, optimizer):
+        # Compute global L2 norm of all parameter gradients (ignores None grads)
+        grads = []
+        for p in self.parameters():
+            if p.grad is not None:
+                # .detach() avoids autograd tracking; .float() avoids fp16 overflow in norms
+                grads.append(p.grad.detach().float().norm(2))
+        if grads:
+            total_norm = torch.norm(torch.stack(grads), p=2)
+            self.log("train/grad_norm", total_norm, on_step=True, prog_bar=False, logger=True)
+
+    def on_after_backward(self):
+        grads = [p.grad.detach().float().norm(2)
+                 for p in self.parameters() if p.grad is not None]
+        if grads:
+            total_norm = torch.norm(torch.stack(grads), p=2)
+            self.log("train/grad_norm_back", total_norm, on_step=True, prog_bar=False)
 
     def on_train_epoch_end(self):
         if False:
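Two notes on these hunks. First, passing self.hparams.dropout assumes save_hyperparameters() was already called and that CrossModalBlock's constructor now accepts a third dropout argument (its updated signature is not part of this commit). Second, the hand-rolled norm loops could be replaced by Lightning's grad_norm helper; a sketch, assuming Lightning 2.x:

    from lightning.pytorch.utilities import grad_norm

    def on_before_optimizer_step(self, optimizer):
        # Returns per-parameter 2-norms plus a "grad_2.0_norm_total" entry.
        self.log_dict(grad_norm(self, norm_type=2))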
dpacman/data_modules/pair.py CHANGED
@@ -299,7 +299,7 @@ class PairDataModule(LightningDataModule):
             drop_last=False,
         )
         self.val_batch_sampler = PreBatchedDistributedBatchSampler(
-            self.test_batches, shuffle_batch_order=False, drop_last=False, seed=0
+            self.val_batches, shuffle_batch_order=False, drop_last=False, seed=0
         )
 
         # TEST phase
dpacman/data_tasks/split/remap.py CHANGED
@@ -50,6 +50,13 @@ def split_bipartite_fast(
     kept_by_split = {"train": len(X_train), "val": len(X_val), "test": len(X_test)}
     return dna_assign, kept_by_split
 
+def convert_scores(scores):
+    svec = [int(x) for x in scores.split(",")]
+    max_score = max(svec)
+    binary_svec = [0 if x < max_score else 1 for x in svec]
+    assert svec.count(max_score) == binary_svec.count(1)
+    binary_svec = ",".join([str(x) for x in binary_svec])
+    return binary_svec
 
 def split_bipartite_with_ratios_and_leaky(
     edges,
@@ -524,21 +531,24 @@ def main(cfg: DictConfig):
     logger.info(f"All proteins are in their own clusters: {no_protein_overlap}")
 
     if cfg.data_task.split_by == "dna":
-        logger.info("Easy split: all proteins are in their own clusters.")
-        dna_clusters = edge_df["dna_cluster_rep"].unique().tolist()
-        results = split_bipartite_fast(
-            dna_clusters,
-            split_names=("train", "val", "test"),
-            ratios=(
-                cfg.data_task.train_ratio,
-                cfg.data_task.val_ratio,
-                cfg.data_task.test_ratio,
-            ),
-        )
-        dna_assign, kept_by_split = results
-
-        # assign datapoints to cluster by their DNA cluster rep
-        edge_df["split"] = edge_df["dna_cluster_rep"].map(dna_assign)
+        if cfg.data_task.p_exclude:
+            return
+        else:
+            logger.info("Easy split: all proteins are in their own clusters.")
+            dna_clusters = edge_df["dna_cluster_rep"].unique().tolist()
+            results = split_bipartite_fast(
+                dna_clusters,
+                split_names=("train", "val", "test"),
+                ratios=(
+                    cfg.data_task.train_ratio,
+                    cfg.data_task.val_ratio,
+                    cfg.data_task.test_ratio,
+                ),
+            )
+            dna_assign, kept_by_split = results
+
+            # assign datapoints to cluster by their DNA cluster rep
+            edge_df["split"] = edge_df["dna_cluster_rep"].map(dna_assign)
     else:
         results = split_bipartite_by_components(
             edges,
@@ -648,7 +658,14 @@ def main(cfg: DictConfig):
     # create the output dir
     split_out_dir = Path(root) / cfg.data_task.split_out_dir
     os.makedirs(split_out_dir, exist_ok=True)
-    split_final_cols = ["ID", "dna_sequence", "tr_sequence", "scores", "split"]
+
+    # add binary scores to allow other training modes
+    train["fimo_binary_scores"] = train["scores"].apply(convert_scores)
+    val["fimo_binary_scores"] = val["scores"].apply(convert_scores)
+    test["fimo_binary_scores"] = test["scores"].apply(convert_scores)
+
+    # select final cols and save
+    split_final_cols = ["ID", "dna_sequence", "tr_sequence", "scores", "fimo_binary_scores", "split"]
     train[split_final_cols].to_csv(split_out_dir / "train.csv", index=False)
     val[split_final_cols].to_csv(split_out_dir / "val.csv", index=False)
     test[split_final_cols].to_csv(split_out_dir / "test.csv", index=False)
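For intuition, convert_scores binarizes a comma-separated FIMO score vector so that only the positions tied for the maximum score become 1. A quick worked example:

    convert_scores("3,7,1,7")  # -> "0,1,0,1": both 7s are maxima, everything else maps to 0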
dpacman/scripts/delay_run.sh ADDED
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Usage: ./delay_run.sh <first_script.sh> <second_script.sh>
+# Optional: override waits via env vars WAIT1 / WAIT2 (seconds). Defaults: 3 hours each.
+
+WAIT1=${WAIT1:-10800}  # 3 hours in seconds
+WAIT2=${WAIT2:-10800}
+
+SCRIPT1="${1:?usage: $0 <first_script.sh> <second_script.sh>}"
+SCRIPT2="${2:?usage: $0 <first_script.sh> <second_script.sh>}"
+
+log() { echo "[$(date '+%F %T')] $*"; }
+
+log "Sleeping for $WAIT1 seconds..."
+sleep "$WAIT1"
+
+log "Running: $SCRIPT1"
+bash "$SCRIPT1"
+
+log "Sleeping for $WAIT2 seconds..."
+sleep "$WAIT2"
+
+log "Running: $SCRIPT2"
+bash "$SCRIPT2"
+
+log "Done."
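To smoke-test the stagger logic without the three-hour defaults (job_a.sh and job_b.sh are placeholder scripts):

    WAIT1=5 WAIT2=5 bash dpacman/scripts/delay_run.sh job_a.sh job_b.sh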
dpacman/scripts/run_split.sh CHANGED
@@ -10,12 +10,13 @@ mkdir -p "$run_dir"
 
 nohup python -u -m scripts.preprocess \
     hydra.run.dir="${run_dir}" \
+    +data_task.p_exclude="true" \
     data_task="${data_task_type}/remap" \
     data_task.split_by=dna \
     data_task.train_ratio=0.8 \
     data_task.val_ratio=0.1 \
     data_task.test_ratio=0.1 \
-    data_task.split_out_dir=dpacman/data_files/processed/splits/by_dna \
+    data_task.split_out_dir=dpacman/data_files/processed/splits/by_both \
    > "${run_dir}/run.log" 2>&1 &
 
 echo $! > "${run_dir}/pid.txt"
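Note the leading "+" on +data_task.p_exclude="true": in Hydra's override grammar, +key=value appends a key that is absent from the composed config, while a bare key=value overrides one that already exists. For example:

    python -m scripts.preprocess +data_task.p_exclude=true   # append: key not in the config
    python -m scripts.preprocess data_task.split_by=dna      # override: key already defined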
dpacman/scripts/run_train.sh CHANGED
@@ -16,19 +16,23 @@ fi
 
 CUDA_VISIBLE_DEVICES=0,1 nohup python -u -m scripts.train \
     +trainer.strategy=ddp \
-    +trainer.use_distributed_sampler="false"\
-    +trainer.detect_anomaly="false"\
+    +trainer.use_distributed_sampler="false" \
+    +trainer.detect_anomaly="false" \
+    +trainer.gradient_clip_val=0.5 \
+    +trainer.gradient_clip_algorithm="norm" \
     hydra.run.dir="${run_dir}" \
     trainer.devices=2 \
-    data_module.train_file="data_files/processed/splits/by_dna/babytrain.csv" \
-    data_module.val_file="data_files/processed/splits/by_dna/babyval.csv" \
-    data_module.test_file="data_files/processed/splits/by_dna/babytest.csv" \
+    trainer.max_epochs=10 \
+    data_module.train_file="data_files/processed/splits/by_dna/train.csv" \
+    data_module.val_file="data_files/processed/splits/by_dna/val.csv" \
+    data_module.test_file="data_files/processed/splits/by_dna/test.csv" \
     data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
     data_module.dna_shelf_path="data_files/processed/embeddings/fimo_hits_only/peaks_caduceus.shelf" \
+    data_module.batch_size=16 \
     model.glm_input_dim=256 \
     model.compressed_dim=256 \
     model.hidden_dim=256 \
-    model.lr=1e-5 \
+    model.lr=5e-6 \
    > "${run_dir}/run.log" 2>&1 &
 
-echo $! > "${run_dir}/pid.txt"
+echo $! > "${run_dir}/pid.txt"
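Because the run is detached with nohup, the log and PID files written above are the handles on it; with run_dir expanded to the actual run directory:

    tail -f "${run_dir}/run.log"                       # follow training output
    kill -0 "$(cat "${run_dir}/pid.txt")" && echo up   # check the job is still alive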