svincoff committed
Commit bc0d37c · 1 Parent(s): 80b6a2c

embeddings
.gitignore CHANGED
@@ -1,4 +1,4 @@
-dpacman/data_files
+dpacman/data_files/
 dpacman/preprocess/tfclust/*.log
 dpacman/preprocess/tfclust/temp.py
 bigBedToBed
@@ -25,4 +25,8 @@ dpacman/idmap_filt.csv
 dpacman/temp3.py
 dpacman/temp4.py
 dpacman/temp.ipynb
-dpacman/nohup.out
+dpacman/nohup.out
+dpacman/*/__pycache__/
+dpacman/data_tasks/split/__pycache__/
+dpacman/data_tasks/cluster/__pycache__/
+dpacman/data_tasks/embeddings/__pycache__/
configs/data_task/embeddings/dna.yaml ADDED
@@ -0,0 +1,9 @@
+name: dna
+type: embeddings
+
+genome_json_dir: null
+chrom_model: caduceus
+input_file: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/maps/dna_seqid_to_dna_sequence.json
+out_dir: dpacman/data_files/processed/embeddings/fimo_hits_only
+
+device: gpu
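
Note: the pipeline reads these keys through Hydra as cfg.data_task.* (see dna.py below). A minimal standalone sketch of that access pattern, assuming the file composes under the data_task key as the code expects; the direct OmegaConf load here is for illustration only, not how the real run works:

    from omegaconf import OmegaConf

    raw = OmegaConf.load("configs/data_task/embeddings/dna.yaml")
    cfg = OmegaConf.create({"data_task": raw})  # Hydra performs this composition in real runs
    print(cfg.data_task.chrom_model)  # caduceus
    print(cfg.data_task.out_dir)      # dpacman/data_files/processed/embeddings/fimo_hits_only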
configs/data_task/embeddings/protein.yaml ADDED
File without changes
dpacman/data_tasks/embeddings/__init__.py CHANGED
@@ -0,0 +1,24 @@
+from .embedders import (
+    CaduceusEmbedder,
+    DNABertEmbedder,
+    NucleotideTransformerEmbedder,
+    GPNEmbedder,
+    SegmentNTEmbedder,
+    ESMEmbedder,
+    ESMDBPEmbedder,
+    ProGenEmbedder
+)
+
+def get_embedder(name, device, for_dna=True):
+    name = name.lower()
+    if for_dna:
+        if name=="caduceus": return CaduceusEmbedder(device)
+        if name=="dnabert": return DNABertEmbedder(device)
+        if name=="nucleotide": return NucleotideTransformerEmbedder(device)
+        if name=="gpn": return GPNEmbedder(device)
+        if name=="segmentnt": return SegmentNTEmbedder(device)
+    else:
+        if name in ("esm",): return ESMEmbedder(device)
+        if name in ("esm-dbp","esm_dbp"): return ESMDBPEmbedder(device)
+        if name=="progen": return ProGenEmbedder(device)
+    raise ValueError(f"Unknown model {name} (for_dna={for_dna})")
dpacman/data_tasks/embeddings/dna.py ADDED
@@ -0,0 +1,52 @@
+from .utils import pad_token_embeddings, embed_and_save
+from dpacman.data_tasks.embeddings import get_embedder
+
+import logging
+import rootutils
+import os
+import torch
+import json
+import pandas as pd
+from pathlib import Path
+from omegaconf import DictConfig
+
+root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
+logger = logging.getLogger(__name__)
+
+def main(cfg: DictConfig):
+    logger.info(f"Making embeddings using {cfg.data_task.chrom_model} for dna sequences at {cfg.data_task.input_file}")
+    # make out dir if necessary
+    out_dir = Path(root) / cfg.data_task.out_dir
+    os.makedirs(out_dir, exist_ok=True)
+
+    # set device
+    device = "cpu"
+    if cfg.data_task.device=="gpu":
+        if torch.cuda.is_available():
+            device = "cuda"
+    logger.info(f"Using device: {device}")
+
+    # read the input file
+    input_file = Path(root) / cfg.data_task.input_file
+    if str(input_file).endswith(".json"):
+        # load the json and isolate the sequences and ids
+        with open(input_file, "r") as f:
+            d = json.load(f)
+
+        df = pd.DataFrame.from_dict(d, orient="index").reset_index()
+        df.columns = ["seq_id","sequence"]
+
+        # turn into list of sequences and IDs
+        peak_seqs = df["sequence"].tolist()
+        peak_ids = df["seq_id"].tolist()
+    logger.info(f"Embedding {len(peak_seqs)} binding peak sequences from processed remap data")
+
+    # Get the DNA embedder
+    dna_embedder = get_embedder(cfg.data_task.chrom_model, device, for_dna=True)
+    out_peaks = out_dir / f"peaks_{cfg.data_task.chrom_model}.npy"
+    embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
+
+    logger.info("Finished embedding DNA sequences.")
+
+if __name__=="__main__":
+    main()
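
Note: a small sketch of the input contract, with hypothetical IDs and sequences. dna_seqid_to_dna_sequence.json is assumed to map each DNA sequence ID to its sequence string, which main() flattens into (seq_id, sequence) columns exactly as above:

    import pandas as pd

    d = {"peak_0001": "ACGTGCAT", "peak_0002": "TTGACGTA"}  # stand-in for the real JSON
    df = pd.DataFrame.from_dict(d, orient="index").reset_index()
    df.columns = ["seq_id", "sequence"]
    # df["sequence"].tolist() and df["seq_id"].tolist() feed embed_and_save()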
dpacman/data_tasks/embeddings/embedders.py CHANGED
@@ -22,10 +22,10 @@ import numpy as np
 from pathlib import Path
 import torch
 from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, pipeline
-import esm
 from Bio import SeqIO
 import time
 import pandas as pd
+import esm
 from tqdm.auto import tqdm
 import logging, math
 
@@ -197,23 +197,6 @@ class NucleotideTransformerEmbedder:
         pooled = [ np.mean(x, axis=0) for x in all_embeddings ]
         return np.vstack(pooled)
 
-# class ESMEmbedder:
-#     def __init__(self, device):
-#         self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
-#         self.batch_converter = self.alphabet.get_batch_converter()
-#         self.model.to(device).eval()
-#         self.device = device
-
-#     def embed(self, seqs):
-#         batch = [(str(i), seq) for i, seq in enumerate(seqs)]
-#         _, _, toks = self.batch_converter(batch)
-#         toks = toks.to(self.device)
-#         with torch.no_grad():
-#             results = self.model(toks, repr_layers=[33], return_contacts=False)
-#             reps = results["representations"][33]
-#         return reps[:, 1:-1].mean(1).cpu().numpy()
-
-
 class ESMEmbedder:
     def __init__(self, device, model_name="esm2_t33_650M_UR50D"):
         # Try to load the specified ESM-2 model; fallback to esm1b if missing
@@ -280,39 +263,6 @@ class ESMEmbedder:
             all_embeddings.append(seq_vec.cpu().numpy())
         return np.vstack(all_embeddings)  # (N, D)
 
-
-# class ESMDBPEmbedder:
-#     def __init__(self, device):
-#         base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
-#         model_path = (
-#             Path(__file__).resolve().parent.parent
-#             / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
-#         )
-#         checkpoint = torch.load(model_path, map_location="cpu")
-#         clean_sd = {}
-#         for k, v in checkpoint.items():
-#             clean_sd[k.replace("module.", "")] = v
-#         result = base_model.load_state_dict(clean_sd, strict=False)
-#         if result.missing_keys:
-#             print(f"[ESMDBP] missing keys: {result.missing_keys}")
-#         if result.unexpected_keys:
-#             print(f"[ESMDBP] unexpected keys: {result.unexpected_keys}")
-
-#         self.model = base_model.to(device).eval()
-#         self.alphabet = alphabet
-#         self.batch_converter = alphabet.get_batch_converter()
-#         self.device = device
-
-#     def embed(self, seqs):
-#         batch = [(str(i), seq) for i, seq in enumerate(seqs)]
-#         _, _, toks = self.batch_converter(batch)
-#         toks = toks.to(self.device)
-#         with torch.no_grad():
-#             out = self.model(toks, repr_layers=[33], return_contacts=False)
-#             reps = out["representations"][33]
-#         # skip start/end tokens
-#         return reps[:, 1:-1].mean(1).cpu().numpy()
-
 class ESMDBPEmbedder:
     def __init__(self, device):
         base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
@@ -412,149 +362,4 @@ class ProGenEmbedder:
         ).to(self.device)
         with torch.no_grad():
             last_hidden = self.model(**inputs).last_hidden_state
-        return last_hidden.mean(dim=1).cpu().numpy()
-
-# ---- main pipeline ----
-
-def get_embedder(name, device, for_dna=True):
-    name = name.lower()
-    if for_dna:
-        if name=="caduceus": return CaduceusEmbedder(device)
-        if name=="dnabert": return DNABertEmbedder(device)
-        if name=="nucleotide": return NucleotideTransformerEmbedder(device)
-        if name=="gpn": return GPNEmbedder(device)
-        if name=="segmentnt": return SegmentNTEmbedder(device)
-    else:
-        if name in ("esm",): return ESMEmbedder(device)
-        if name in ("esm-dbp","esm_dbp"): return ESMDBPEmbedder(device)
-        if name=="progen": return ProGenEmbedder(device)
-    raise ValueError(f"Unknown model {name} (for_dna={for_dna})")
-
-
-def pad_token_embeddings(list_of_arrays, pad_value=0.0):
-    """
-    list_of_arrays: list of (L_i, D) numpy arrays
-    Returns:
-        padded: (N, L_max, D) array
-        mask: (N, L_max) boolean array where True = real token, False = padding
-    """
-    N = len(list_of_arrays)
-    D = list_of_arrays[0].shape[1]
-    L_max = max(arr.shape[0] for arr in list_of_arrays)
-    padded = np.full((N, L_max, D), pad_value, dtype=list_of_arrays[0].dtype)
-    mask = np.zeros((N, L_max), dtype=bool)
-    for i, arr in enumerate(list_of_arrays):
-        L = arr.shape[0]
-        padded[i, :L] = arr
-        mask[i, :L] = True
-    return padded, mask
-
-def embed_and_save(seqs, ids, embedder, out_path):
-    embs = embedder.embed(seqs)
-
-    # Decide whether we got variable-length per-token outputs (list of (L, D))
-    is_variable_token = isinstance(embs, (list, tuple)) and len(embs) > 0 and hasattr(embs[0], "shape") and embs[0].ndim == 2
-
-    if is_variable_token:
-        # pad to (N, L_max, D) + mask
-        padded, mask = pad_token_embeddings(embs)
-        # Save both embeddings and mask together in an .npz for convenience
-        np.savez_compressed(out_path.with_suffix(".caduceus.npz"),
-                            embeddings=padded,
-                            mask=mask,
-                            ids=np.array(ids, dtype=object))
-    else:
-        # fixed shape output, e.g., pooled (N, D)
-        array = np.vstack(embs) if isinstance(embs, list) else embs
-        np.save(out_path, array)
-        with open(out_path.with_suffix(".ids"), "w") as f:
-            f.write("\n".join(ids))
-
-
-if __name__=="__main__":
-
-    p = argparse.ArgumentParser()
-    #p.add_argument("--peak_fasta", default="binding_peaks_unique.fa", help="FASTA of deduplicated binding peak sequences; if present this is used for DNA embedding instead of genome JSONs")
-    p.add_argument("--genome-json-dir", default=None, help="(fallback) directory of UCSC JSONs for full chromosome embedding if peak FASTA is missing or you explicitly want chromosomes")
-    p.add_argument("--skip-dna", action="store_true", help="if set, skip the chromosome embedding step") # if glm embeddings successful but not plm embeddings
-    p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
-    p.add_argument("--chrom-model", default="caduceus")
-    p.add_argument("--tf-model", default="esm-dbp")
-    p.add_argument("--out-dir", default="dpacman/model/embeddings")
-    p.add_argument("--device", default="cpu")
-    args = p.parse_args()
-
-    os.makedirs(args.out_dir, exist_ok=True)
-    device = args.device
-    print(device)
-
-    if not args.skip_dna:
-        if args.genome_json_dir == None:
-            dna_df = pd.read_parquet('/home/a03-akrishna/DPACMAN/dpacman/model/remap2022_crm_fimo_output_q_processed.parquet', engine='pyarrow')
-            #df.to_csv('/home/a03-akrishna/DPACMAN/dpacman/model/remap2022_crm_fimo_output_q_processed.csv', index=False)
-            peak_seqs = dna_df["dna_sequence"]
-            peak_ids = dna_df["ID"]
-            print(f"Embedding {len(peak_seqs)} binding peak sequences from processed remap data", flush=True)
-            dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
-            out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
-            embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
-
-        # peak_fasta = Path(args.peak_fasta)
-        # if peak_fasta.exists():
-        #     # Load peak sequences from FASTA
-        #     from Bio import SeqIO
-
-        #     peak_seqs = []
-        #     peak_ids = []
-        #     for rec in SeqIO.parse(peak_fasta, "fasta"):
-        #         peak_ids.append(rec.id)
-        #         peak_seqs.append(str(rec.seq))
-        #     print(f"Embedding {len(peak_seqs)} binding peak sequences from {peak_fasta}", flush=True)
-        #     dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
-        #     out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
-        #     embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
-        elif args.genome_json_dir:
-            # Legacy: load full chromosomes from JSONs (chr1–22, X, Y, M)
-            genome_dir = Path(args.genome_json_dir)
-            chrom_seqs, chrom_ids = [], []
-            primary_pattern = re.compile(r"^hg38_chr(?:[1-9]|1[0-9]|2[0-2]|X|Y|M)\.json$")
-            for j in sorted(genome_dir.iterdir()):
-                if not primary_pattern.match(j.name):
-                    continue
-                data = json.loads(j.read_text())
-                seq = data.get("dna") or data.get("sequence")
-                chrom = data.get("chrom") or j.stem.split("_")[-1]
-                chrom_seqs.append(seq)
-                chrom_ids.append(chrom)
-            cutoff = CaduceusEmbedder(device).chunk_size
-            long_chroms = [
-                (chrom, len(seq))
-                for chrom, seq in zip(chrom_ids, chrom_seqs)
-                if len(seq) > cutoff
-            ]
-            if long_chroms:
-                print("⚠️ Chromosomes exceeding Caduceus max tokens ({}):".format(cutoff))
-                for chrom, L in long_chroms:
-                    print(f"  {chrom}: {L} bases")
-            else:
-                print("All chromosomes ≤ Caduceus limit ({}).".format(cutoff))
-
-            chrom_embedder = get_embedder(args.chrom_model, device, for_dna=True)
-            out_chrom = Path(args.out_dir) / f"chrom_{args.chrom_model}.npy"
-            embed_and_save(chrom_seqs, chrom_ids, chrom_embedder, out_chrom)
-        else:
-            raise ValueError("No input for DNA embedding: provide a peak FASTA (default binding_peaks_unique.fa) or set --genome-json-dir for chromosome JSONs.")
-
-    #Load TF sequences
-    tf_seqs, tf_ids = [], []
-    for record in SeqIO.parse(args.tf_fasta, "fasta"):
-        tf_ids.append(record.id)
-        tf_seqs.append(str(record.seq))
-
-    # embed and save
-    tf_embedder = get_embedder(args.tf_model, device, for_dna=False)
-    out_tf = Path(args.out_dir) / f"tf_{args.tf_model}.npy"
-    embed_and_save(tf_seqs, tf_ids, tf_embedder, out_tf)
-
-    print("Done.")
+        return last_hidden.mean(dim=1).cpu().numpy()
dpacman/data_tasks/embeddings/protein.py ADDED
File without changes
dpacman/data_tasks/embeddings/utils.py ADDED
@@ -0,0 +1,47 @@
+"""
+Utility functions related to creating embeddings
+"""
+import numpy as np
+
+def pad_token_embeddings(list_of_arrays, pad_value=0.0):
+    """
+    list_of_arrays: list of (L_i, D) numpy arrays
+    Returns:
+        padded: (N, L_max, D) array
+        mask: (N, L_max) boolean array where True = real token, False = padding
+    """
+    N = len(list_of_arrays)
+    D = list_of_arrays[0].shape[1]
+    L_max = max(arr.shape[0] for arr in list_of_arrays)
+    padded = np.full((N, L_max, D), pad_value, dtype=list_of_arrays[0].dtype)
+    mask = np.zeros((N, L_max), dtype=bool)
+    for i, arr in enumerate(list_of_arrays):
+        L = arr.shape[0]
+        padded[i, :L] = arr
+        mask[i, :L] = True
+    return padded, mask
+
+def embed_and_save(seqs, ids, embedder, out_path):
+    """
+    Using the passed embedder, make embeddings
+    """
+    embs = embedder.embed(seqs)
+
+    # Decide whether we got variable-length per-token outputs (list of (L, D))
+    is_variable_token = isinstance(embs, (list, tuple)) and len(embs) > 0 and hasattr(embs[0], "shape") and embs[0].ndim == 2
+
+    if is_variable_token:
+        # pad to (N, L_max, D) + mask
+        padded, mask = pad_token_embeddings(embs)
+        # Save both embeddings and mask together in an .npz for convenience
+        np.savez_compressed(out_path.with_suffix(".caduceus.npz"),
+                            embeddings=padded,
+                            mask=mask,
+                            ids=np.array(ids, dtype=object),
+                            seqs=np.array(seqs, dtype=object))
+    else:
+        # fixed shape output, e.g., pooled (N, D)
+        array = np.vstack(embs) if isinstance(embs, list) else embs
+        np.save(out_path, array)
+        with open(out_path.with_suffix(".ids"), "w") as f:
+            f.write("\n".join(ids))
dpacman/data_tasks/split/remap.py CHANGED
@@ -450,7 +450,8 @@ def main(cfg: DictConfig):
         )
         dna_assign, kept_by_split = results
 
-        edge_df["split"] = edge_df["dna_seqid"].map(dna_assign)
+        # assign datapoints to cluster by their DNA cluster rep
+        edge_df["split"] = edge_df["dna_cluster_rep"].map(dna_assign)
     else:
         results = split_bipartite_by_components(
             edges,
@@ -498,9 +499,11 @@ def main(cfg: DictConfig):
     # ensure there is no overlap
     check_validity(train, val, test, split_by=cfg.data_task.split_by)
 
-    logger.info(f"Length of train dataset: {len(train)} ({100*len(train)/sum([len(train),len(val),len(test)]):.2f}%)")
-    logger.info(f"Length of val dataset: {len(val)} ({100*len(val)/sum([len(train),len(val),len(test)]):.2f}%)")
-    logger.info(f"Length of test dataset: {len(test)} ({100*len(test)/sum([len(train),len(val),len(test)]):.2f}%)")
+    total = sum([len(train),len(val),len(test)])
+    logger.info(f"Length of train dataset: {len(train)} ({100*len(train)/total:.2f}%)")
+    logger.info(f"Length of val dataset: {len(val)} ({100*len(val)/total:.2f}%)")
+    logger.info(f"Length of test dataset: {len(test)} ({100*len(test)/total:.2f}%)")
+    logger.info(f"Total sequences = {total}. Same as edges size? {total==len(edge_df)}")
 
     # create the output dir
     split_out_dir = Path(root)/cfg.data_task.split_out_dir
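
Note: a toy illustration of the fix above, with hypothetical values. Every edge now inherits the split of its DNA cluster representative, so sequences in the same cluster can never land in different splits:

    import pandas as pd

    dna_assign = {"repA": "train", "repB": "test"}  # split per cluster representative
    edge_df = pd.DataFrame({
        "dna_seqid":       ["s1", "s2", "s3"],
        "dna_cluster_rep": ["repA", "repA", "repB"],
    })
    edge_df["split"] = edge_df["dna_cluster_rep"].map(dna_assign)
    # s1 and s2 share repA -> both "train"; s3 -> "test"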
dpacman/scripts/preprocess.py CHANGED
@@ -15,6 +15,7 @@ from dpacman.data_tasks.fimo.run_fimo import main as run_fimo_main
 from dpacman.data_tasks.fimo.post_fimo import main as post_fimo_main
 from dpacman.data_tasks.cluster.remap import main as cluster_remap_main
 from dpacman.data_tasks.split.remap import main as split_remap_main
+from dpacman.data_tasks.embeddings.dna import main as embed_dna_main
 
 @hydra.main(
     config_path=str(root / "configs"), config_name="preprocess", version_base="1.3"
@@ -59,12 +60,19 @@ def main(cfg: DictConfig):
         else:
             raise ValueError(f"No clean pipeline defined for: {task_name}")
 
+    # Split
     elif task_type == "split":
         if task_name == "remap":
             split_remap_main(cfg)
         else:
             raise ValueError(f"No clean pipeline defined for: {task_name}")
 
+    # Embed
+    elif task_type=="embeddings":
+        if task_name == "dna":
+            embed_dna_main(cfg)
+        else:
+            raise ValueError(f"No clean pipeline defined for: {task_name}")
     # Unknown - error
     else:
         raise ValueError(f"Unknown task type: {task_type}")
dpacman/scripts/run_embeddings.sh ADDED
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Manually specify values used in the config
+main_task="preprocess"
+data_task_type="embeddings"
+timestamp=$(date "+%Y-%m-%d_%H-%M-%S")
+
+run_dir="$HOME/DPACMAN/logs/${main_task}/${data_task_type}/runs/${timestamp}"
+mkdir -p "$run_dir"
+
+nohup python -u -m scripts.preprocess \
+    hydra.run.dir="${run_dir}" \
+    data_task="${data_task_type}/dna" \
+    > "${run_dir}/run.log" 2>&1 &
+
+echo $! > "${run_dir}/pid.txt"
dpacman/scripts/run_split.sh CHANGED
@@ -5,7 +5,7 @@ main_task="preprocess"
 data_task_type="split"
 timestamp=$(date "+%Y-%m-%d_%H-%M-%S")
 
-run_dir="/vast/projects/pranam/lab/sophie/DPACMAN/logs/${main_task}/${data_task_type}/runs/${timestamp}"
+run_dir="$HOME/DPACMAN/logs/${main_task}/${data_task_type}/runs/${timestamp}"
 mkdir -p "$run_dir"
 
 nohup python -u -m scripts.preprocess \
environment.yaml CHANGED
@@ -19,12 +19,12 @@ dependencies:
   - matplotlib=3.10.*
   - pip:
       # Pull GPU wheels (CUDA 12.8) from PyTorch's cu128 index; fall back to PyPI for others
-      - --index-url https://download.pytorch.org/whl/cu128
+      - --index-url https://download.pytorch.org/whl/cu129
       - --extra-index-url https://pypi.org/simple
 
-      # PyTorch + CUDA 12.8
-      - torch==2.7.1
-      - torchvision==0.22.1
+      # PyTorch + CUDA 12.9
+      - torch==2.8
+      - torchvision==0.23
       # - torchaudio==2.7.1 # optional, if you need it
 
       # Lightning (classic)
@@ -41,5 +41,6 @@ dependencies:
       - scikit-learn==1.7.1
      - biopython==1.85
       - ortools==9.14.6206
+      - esm==3.2.1.post1
       # Your package in editable mode
       - -e .