updates

Files changed:
- configs/callbacks/wandb.yaml      +0 -0
- configs/train.yaml                +1 -4
- configs/trainer/default.yaml      +3 -0
- dpacman/classifier/model.py       +30 -12
- dpacman/data_modules/pair.py      +1 -1
- dpacman/data_tasks/split/remap.py +33 -16
- dpacman/scripts/delay_run.sh      +27 -0
- dpacman/scripts/run_split.sh      +2 -1
- dpacman/scripts/run_train.sh      +11 -7
configs/callbacks/wandb.yaml
ADDED

(New empty file; no content to diff.)
configs/train.yaml
CHANGED

@@ -37,7 +37,4 @@ test: True
 ckpt_path: null
 
 # seed for random number generators in pytorch, numpy and python.random
-seed: 42
-
-trainer:
-  max_epochs: 20
+seed: 42
configs/trainer/default.yaml
CHANGED

@@ -8,6 +8,9 @@ max_epochs: 10
 accelerator: cpu
 devices: 1
 
+#gradient_clip_val: 1.0
+#gradient_clip_algorithm: "norm"
+
 # mixed precision for extra speed-up
 # precision: 16
 
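The two new keys are left commented out here; run_train.sh later in this commit enables clipping from the command line instead. They map directly onto Lightning Trainer arguments: when set, Lightning clips the global gradient norm between the backward pass and the optimizer step. A self-contained sketch of the equivalent manual call (illustration only, not project code):

import torch
import torch.nn as nn

model = nn.Linear(4, 1)
model(torch.randn(2, 4)).sum().backward()

# Roughly what the Trainer does before each optimizer step when
# gradient_clip_val=1.0 and gradient_clip_algorithm="norm" are set:
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)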
dpacman/classifier/model.py
CHANGED

@@ -153,6 +153,7 @@ class BindPredictor(LightningModule):
         lr: float = 1e-4,
         alpha: float = 20,
         gamma: float = 20,
+        dropout: float = 0,
         use_local_cnn_on_glm: bool = True,
         weight_decay: float = 0.01,
     ):

@@ -171,7 +172,7 @@ class BindPredictor(LightningModule):
         self.local_cnn = LocalCNN(hidden_dim) if use_local_cnn_on_glm else nn.Identity()
 
         self.layers = nn.ModuleList(
-            [CrossModalBlock(hidden_dim, heads) for _ in range(num_layers)]
+            [CrossModalBlock(hidden_dim, heads, self.hparams.dropout) for _ in range(num_layers)]
         )
 
         #self.ln_out = nn.LayerNorm(hidden_dim)
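The new dropout argument is read back as self.hparams.dropout in the hunk above, which only resolves if the module records its constructor arguments; a minimal sketch of the assumed pattern (BindPredictor's full __init__ is not shown in this diff, and the lightning>=2.0 import path is an assumption):

import torch.nn as nn
from lightning import LightningModule

class Demo(LightningModule):
    def __init__(self, hidden_dim: int = 256, dropout: float = 0.0):
        super().__init__()
        self.save_hyperparameters()  # exposes self.hparams.dropout
        self.drop = nn.Dropout(self.hparams.dropout)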
@@ -233,17 +234,16 @@ class BindPredictor(LightningModule):
         )
 
         # ---- AUPRC on labels in {0, >0.99} only ----
-
-
-
-
-
-
-
-
-
-
-        self.log("train/n_neg_0v1", float(n_neg), on_step=False, on_epoch=True, sync_dist=True)
+        ap, n_pos, n_neg = auprc_zeros_vs_ones_from_logits(
+            logits.detach(), batch["labels"], batch.get("glm_kpm"), pos_thresh=0.99
+        )
+        # per-batch AP (epoch-mean is a decent summary); sync across GPUs if using DDP
+        self.log("train/auprc_0v1",
+                 ap if torch.isfinite(ap) else torch.tensor(0.0, device=ap.device),
+                 on_step=False, on_epoch=True, prog_bar=True, sync_dist=True, batch_size=logits.size(0))
+        # (optional) also log class counts so you can sanity-check balance
+        self.log("train/n_pos_0v1", float(n_pos), on_step=False, on_epoch=True, sync_dist=True)
+        self.log("train/n_neg_0v1", float(n_neg), on_step=False, on_epoch=True, sync_dist=True)
 
         return loss
 
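auprc_zeros_vs_ones_from_logits is a project-internal helper that this diff does not show. A minimal sketch of the idea, assuming a recent torchmetrics and omitting the extra glm_kpm masking argument: keep only confident negatives (label == 0) and confident positives (label > 0.99), then compute average precision on that subset:

import torch
from torchmetrics.functional import average_precision

def auprc_zeros_vs_ones(logits, labels, pos_thresh=0.99):
    # labels strictly between 0 and pos_thresh are ambiguous and excluded
    mask = (labels == 0) | (labels > pos_thresh)
    preds = torch.sigmoid(logits[mask])
    target = (labels[mask] > pos_thresh).long()
    ap = average_precision(preds, target, task="binary")
    return ap, int(target.sum()), int((target == 0).sum())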
@@ -271,6 +271,24 @@ class BindPredictor(LightningModule):
             "test/loss", loss, on_step=False, on_epoch=True, batch_size=logits.size(0)
         )
         return loss
+
+    def on_before_optimizer_step(self, optimizer):
+        # Compute global L2 norm of all parameter gradients (ignores None grads)
+        grads = []
+        for p in self.parameters():
+            if p.grad is not None:
+                # .detach() avoids autograd tracking; .float() avoids fp16 overflow in norms
+                grads.append(p.grad.detach().float().norm(2))
+        if grads:
+            total_norm = torch.norm(torch.stack(grads), p=2)
+            self.log("train/grad_norm", total_norm, on_step=True, prog_bar=False, logger=True)
+
+    def on_after_backward(self):
+        grads = [p.grad.detach().float().norm(2)
+                 for p in self.parameters() if p.grad is not None]
+        if grads:
+            total_norm = torch.norm(torch.stack(grads), p=2)
+            self.log("train/grad_norm_back", total_norm, on_step=True, prog_bar=False)
 
     def on_train_epoch_end(self):
         if False:
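Both hooks compute the same global L2 norm; logging it from on_after_backward as well makes it easy to spot differences introduced by mixed-precision gradient scaling, since on_before_optimizer_step fires after gradients are unscaled. With lightning>=2.0 the manual loop could also be replaced by the bundled helper (a sketch, not what the commit uses):

from lightning.pytorch.utilities import grad_norm

def on_before_optimizer_step(self, optimizer):
    # returns a dict of per-layer 2-norms plus a total entry
    self.log_dict(grad_norm(self, norm_type=2))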
dpacman/data_modules/pair.py
CHANGED

@@ -299,7 +299,7 @@ class PairDataModule(LightningDataModule):
             drop_last=False,
         )
         self.val_batch_sampler = PreBatchedDistributedBatchSampler(
-            self.
+            self.val_batches, shuffle_batch_order=False, drop_last=False, seed=0
         )
 
         # TEST phase
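PreBatchedDistributedBatchSampler is project-internal; the call suggests it yields pre-assembled batches of indices sharded across ranks, with shuffle_batch_order=False keeping validation order deterministic. For reference, a batch sampler is just an iterable of index lists, which is how a plain DataLoader consumes it (a toy sketch, not the module's actual wiring):

from torch.utils.data import DataLoader, Dataset

class ToyDataset(Dataset):
    def __len__(self):
        return 8

    def __getitem__(self, i):
        return i

# each inner list is one complete, pre-built batch of dataset indices
prebatched = [[0, 1], [2, 3], [4, 5], [6, 7]]
loader = DataLoader(ToyDataset(), batch_sampler=prebatched)
print([b.tolist() for b in loader])  # [[0, 1], [2, 3], [4, 5], [6, 7]]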
dpacman/data_tasks/split/remap.py
CHANGED

@@ -50,6 +50,13 @@ def split_bipartite_fast(
     kept_by_split = {"train": len(X_train), "val": len(X_val), "test": len(X_test)}
     return dna_assign, kept_by_split
 
+def convert_scores(scores):
+    svec = [int(x) for x in scores.split(",")]
+    max_score = max(svec)
+    binary_svec = [0 if x < max_score else 1 for x in svec]
+    assert svec.count(max_score) == binary_svec.count(1)
+    binary_svec = ",".join([str(x) for x in binary_svec])
+    return binary_svec
 
 def split_bipartite_with_ratios_and_leaky(
     edges,
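convert_scores binarizes a comma-separated score string, marking only the maximum-scoring positions; the assert guards against a count mismatch between the two representations. For example:

print(convert_scores("3,7,7,1"))  # -> "0,1,1,0"
print(convert_scores("5"))        # -> "1" (a lone score is always the max)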
@@ -524,21 +531,24 @@ def main(cfg: DictConfig):
     logger.info(f"All proteins are in their own clusters: {no_protein_overlap}")
 
     if cfg.data_task.split_by == "dna":
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if cfg.data_task.p_exclude:
+            return
+        else:
+            logger.info("Easy split: all proteins are in their own clusters.")
+            dna_clusters = edge_df["dna_cluster_rep"].unique().tolist()
+            results = split_bipartite_fast(
+                dna_clusters,
+                split_names=("train", "val", "test"),
+                ratios=(
+                    cfg.data_task.train_ratio,
+                    cfg.data_task.val_ratio,
+                    cfg.data_task.test_ratio,
+                ),
+            )
+            dna_assign, kept_by_split = results
+
+            # assign datapoints to cluster by their DNA cluster rep
+            edge_df["split"] = edge_df["dna_cluster_rep"].map(dna_assign)
     else:
         results = split_bipartite_by_components(
             edges,
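Net effect of this hunk: when +data_task.p_exclude is set (as run_split.sh below now does), main() returns right after the clustering diagnostics; otherwise the unique dna_cluster_rep values are partitioned into train/val/test by the configured ratios, and each row inherits the split of its DNA cluster, so no cluster straddles two splits.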
@@ -648,7 +658,14 @@ def main(cfg: DictConfig):
     # create the output dir
     split_out_dir = Path(root) / cfg.data_task.split_out_dir
     os.makedirs(split_out_dir, exist_ok=True)
-
+
+    # add binary_scores to allow other training modes
+    train["fimo_binary_sores"] = train["scores"].apply(lambda x: convert_scores(x))
+    val["fimo_binary_sores"] = val["scores"].apply(lambda x: convert_scores(x))
+    test["fimo_binary_sores"] = test["scores"].apply(lambda x: convert_scores(x))
+
+    # select final cols and save
+    split_final_cols = ["ID", "dna_sequence", "tr_sequence", "scores", "fimo_binary_sores", "split"]
     train[split_final_cols].to_csv(split_out_dir / "train.csv", index=False)
     val[split_final_cols].to_csv(split_out_dir / "val.csv", index=False)
     test[split_final_cols].to_csv(split_out_dir / "test.csv", index=False)
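The lambda wrappers are equivalent to passing the function directly, e.g. train["scores"].apply(convert_scores). A quick check of the new column, assuming convert_scores from the first hunk is in scope:

import pandas as pd

train = pd.DataFrame({"scores": ["3,7,7,1", "5,5"]})
train["fimo_binary_sores"] = train["scores"].apply(convert_scores)
print(train["fimo_binary_sores"].tolist())  # ['0,1,1,0', '1,1']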
dpacman/scripts/delay_run.sh
ADDED

@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Usage: ./delay_run.sh <first_script.sh> <second_script.sh>
+# Optional: override waits via env vars WAIT1 / WAIT2 (seconds). Defaults: 3 hours each.
+
+WAIT1=${WAIT1:-10800}  # 3 hours in seconds
+WAIT2=${WAIT2:-10800}
+
+SCRIPT1="${1:?usage: $0 <first_script.sh> <second_script.sh>}"
+SCRIPT2="${2:?usage: $0 <first_script.sh> <second_script.sh>}"
+
+log() { echo "[$(date '+%F %T')] $*"; }
+
+log "Sleeping for $WAIT1 seconds..."
+sleep "$WAIT1"
+
+log "Running: $SCRIPT1"
+bash "$SCRIPT1"
+
+log "Sleeping for $WAIT2 seconds..."
+sleep "$WAIT2"
+
+log "Running: $SCRIPT2"
+bash "$SCRIPT2"
+
+log "Done."
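As the header comments note, both waits can be overridden per invocation: for example, WAIT1=60 WAIT2=60 bash dpacman/scripts/delay_run.sh first.sh second.sh waits one minute, runs the first script, waits another minute, then runs the second, instead of the default three-hour gaps.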
dpacman/scripts/run_split.sh
CHANGED

@@ -10,12 +10,13 @@ mkdir -p "$run_dir"
 
 nohup python -u -m scripts.preprocess \
     hydra.run.dir="${run_dir}" \
+    +data_task.p_exclude="true" \
     data_task="${data_task_type}/remap" \
     data_task.split_by=dna \
    data_task.train_ratio=0.8 \
     data_task.val_ratio=0.1 \
     data_task.test_ratio=0.1 \
-    data_task.split_out_dir=dpacman/data_files/processed/splits/ \
+    data_task.split_out_dir=dpacman/data_files/processed/splits/by_both \
     > "${run_dir}/run.log" 2>&1 &
 
 echo $! > "${run_dir}/pid.txt"
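The leading + in +data_task.p_exclude="true" is Hydra's append syntax: it adds a key that does not yet exist in the composed config, while plain overrides such as data_task.split_by=dna modify existing keys. Setting p_exclude makes this preprocessing run stop after the clustering diagnostics, per the remap.py change above.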
dpacman/scripts/run_train.sh
CHANGED

@@ -16,19 +16,23 @@ fi
 
 CUDA_VISIBLE_DEVICES=0,1 nohup python -u -m scripts.train \
     +trainer.strategy=ddp \
-    +trainer.use_distributed_sampler="false"\
-    +trainer.detect_anomaly="false"\
+    +trainer.use_distributed_sampler="false" \
+    +trainer.detect_anomaly="false" \
+    +trainer.gradient_clip_val=0.5 \
+    +trainer.gradient_clip_algorithm="norm" \
     hydra.run.dir="${run_dir}" \
     trainer.devices=2 \
-
-    data_module.
-    data_module.
+    trainer.max_epochs=10 \
+    data_module.train_file="data_files/processed/splits/by_dna/train.csv" \
+    data_module.val_file="data_files/processed/splits/by_dna/val.csv" \
+    data_module.test_file="data_files/processed/splits/by_dna/test.csv" \
     data_module.tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf" \
     data_module.dna_shelf_path="data_files/processed/embeddings/fimo_hits_only/peaks_caduceus.shelf" \
+    data_module.batch_size=16 \
     model.glm_input_dim=256 \
     model.compressed_dim=256 \
     model.hidden_dim=256 \
-    model.lr=
+    model.lr=5e-6 \
     > "${run_dir}/run.log" 2>&1 &
 
-echo $! > "${run_dir}/pid.txt"
+echo $! > "${run_dir}/pid.txt"
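The added flags tie the commit together: trainer.gradient_clip_val=0.5 with gradient_clip_algorithm="norm" turns on the clipping that configs/trainer/default.yaml only sketches in comments, and the train/grad_norm logging added to model.py shows how often that 0.5 threshold actually engages. +trainer.use_distributed_sampler="false" is presumably needed because PreBatchedDistributedBatchSampler already shards batches per rank, so Lightning must not wrap it in its own DistributedSampler.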