embeddings
- .gitignore +6 -2
- configs/data_task/embeddings/dna.yaml +9 -0
- configs/data_task/embeddings/protein.yaml +0 -0
- dpacman/data_tasks/embeddings/__init__.py +24 -0
- dpacman/data_tasks/embeddings/dna.py +52 -0
- dpacman/data_tasks/embeddings/embedders.py +2 -197
- dpacman/data_tasks/embeddings/protein.py +0 -0
- dpacman/data_tasks/embeddings/utils.py +47 -0
- dpacman/data_tasks/split/remap.py +7 -4
- dpacman/scripts/preprocess.py +8 -0
- dpacman/scripts/run_embeddings.sh +16 -0
- dpacman/scripts/run_split.sh +1 -1
- environment.yaml +5 -4
.gitignore
CHANGED

@@ -1,4 +1,4 @@
-dpacman/data_files
+dpacman/data_files/
 dpacman/preprocess/tfclust/*.log
 dpacman/preprocess/tfclust/temp.py
 bigBedToBed
@@ -25,4 +25,8 @@ dpacman/idmap_filt.csv
 dpacman/temp3.py
 dpacman/temp4.py
 dpacman/temp.ipynb
-dpacman/nohup.out
+dpacman/nohup.out
+dpacman/*/__pycache__/
+dpacman/data_tasks/split/__pycache__/
+dpacman/data_tasks/cluster/__pycache__/
+dpacman/data_tasks/embeddings/__pycache__/
configs/data_task/embeddings/dna.yaml
ADDED

@@ -0,0 +1,9 @@
+name: dna
+type: embeddings
+
+genome_json_dir: null
+chrom_model: caduceus
+input_file: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/maps/dna_seqid_to_dna_sequence.json
+out_dir: dpacman/data_files/processed/embeddings/fimo_hits_only
+
+device: gpu
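This config is composed through Hydra's data_task group at run time, but it can be sanity-checked standalone; a minimal sketch, assuming the repo root as the working directory (the field values asserted below come straight from the diff above):

from omegaconf import OmegaConf

# Load the new config directly (outside Hydra) and inspect the added fields
cfg = OmegaConf.load("configs/data_task/embeddings/dna.yaml")
assert cfg.type == "embeddings" and cfg.chrom_model == "caduceus"
print(cfg.input_file, cfg.out_dir, cfg.device)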
configs/data_task/embeddings/protein.yaml
ADDED

File without changes
dpacman/data_tasks/embeddings/__init__.py
CHANGED

@@ -0,0 +1,24 @@
+from .embedders import (
+    CaduceusEmbedder,
+    DNABertEmbedder,
+    NucleotideTransformerEmbedder,
+    GPNEmbedder,
+    SegmentNTEmbedder,
+    ESMEmbedder,
+    ESMDBPEmbedder,
+    ProGenEmbedder
+)
+
+def get_embedder(name, device, for_dna=True):
+    name = name.lower()
+    if for_dna:
+        if name=="caduceus": return CaduceusEmbedder(device)
+        if name=="dnabert": return DNABertEmbedder(device)
+        if name=="nucleotide": return NucleotideTransformerEmbedder(device)
+        if name=="gpn": return GPNEmbedder(device)
+        if name=="segmentnt": return SegmentNTEmbedder(device)
+    else:
+        if name in ("esm",): return ESMEmbedder(device)
+        if name in ("esm-dbp","esm_dbp"): return ESMDBPEmbedder(device)
+        if name=="progen": return ProGenEmbedder(device)
+    raise ValueError(f"Unknown model {name} (for_dna={for_dna})")
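For context, a minimal sketch of how the relocated get_embedder factory is meant to be called (the sequence literal is illustrative; the .embed(seqs) interface is the one embed_and_save relies on):

import torch
from dpacman.data_tasks.embeddings import get_embedder

device = "cuda" if torch.cuda.is_available() else "cpu"
dna_embedder = get_embedder("caduceus", device, for_dna=True)  # DNA branch
tf_embedder = get_embedder("esm-dbp", device, for_dna=False)   # protein branch
embs = dna_embedder.embed(["ACGTACGTACGT"])                    # toy sequence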
dpacman/data_tasks/embeddings/dna.py
ADDED

@@ -0,0 +1,52 @@
+from .utils import pad_token_embeddings, embed_and_save
+from dpacman.data_tasks.embeddings import get_embedder
+
+import logging
+import rootutils
+import os
+import torch
+import json
+import pandas as pd
+from pathlib import Path
+from omegaconf import DictConfig
+
+root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
+logger = logging.getLogger(__name__)
+
+def main(cfg: DictConfig):
+    logger.info(f"Making embeddings using {cfg.data_task.chrom_model} for dna sequences at {cfg.data_task.input_file}")
+    # make out dir if necessary
+    out_dir = Path(root) / cfg.data_task.out_dir
+    os.makedirs(out_dir, exist_ok=True)
+
+    # set device
+    device = "cpu"
+    if cfg.data_task.device=="gpu":
+        if torch.cuda.is_available():
+            device = "cuda"
+    logger.info(f"Using device: {device}")
+
+    # read the input file
+    input_file = Path(root) / cfg.data_task.input_file
+    if str(input_file).endswith(".json"):
+        # load the json and isolate the sequences and ids
+        with open(input_file, "r") as f:
+            d = json.load(f)
+
+        df = pd.DataFrame.from_dict(d, orient="index").reset_index()
+        df.columns = ["seq_id","sequence"]
+
+        # turn into list of sequences and IDs
+        peak_seqs = df["sequence"].tolist()
+        peak_ids = df["seq_id"].tolist()
+    logger.info(f"Embedding {len(peak_seqs)} binding peak sequences from processed remap data")
+
+    # Get the DNA embedder
+    dna_embedder = get_embedder(cfg.data_task.chrom_model, device, for_dna=True)
+    out_peaks = out_dir / f"peaks_{cfg.data_task.chrom_model}.npy"
+    embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
+
+    logger.info("Finished embedding DNA sequences.")
+
+if __name__=="__main__":
+    main()
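The JSON branch above assumes a flat {seq_id: sequence} mapping; a small sketch with made-up entries showing the DataFrame round-trip the script performs:

import pandas as pd

# Hypothetical contents of dna_seqid_to_dna_sequence.json
d = {"peak_0001": "ACGTACGT", "peak_0002": "TTGACAGT"}
df = pd.DataFrame.from_dict(d, orient="index").reset_index()
df.columns = ["seq_id", "sequence"]
peak_seqs = df["sequence"].tolist()  # ["ACGTACGT", "TTGACAGT"]
peak_ids = df["seq_id"].tolist()     # ["peak_0001", "peak_0002"]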
dpacman/data_tasks/embeddings/embedders.py
CHANGED

@@ -22,10 +22,10 @@ import numpy as np
 from pathlib import Path
 import torch
 from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, pipeline
-import esm
 from Bio import SeqIO
 import time
 import pandas as pd
+import esm
 from tqdm.auto import tqdm
 import logging, math
 
@@ -197,23 +197,6 @@ class NucleotideTransformerEmbedder:
         pooled = [ np.mean(x, axis=0) for x in all_embeddings ]
         return np.vstack(pooled)
 
-# class ESMEmbedder:
-#     def __init__(self, device):
-#         self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
-#         self.batch_converter = self.alphabet.get_batch_converter()
-#         self.model.to(device).eval()
-#         self.device = device
-
-#     def embed(self, seqs):
-#         batch = [(str(i), seq) for i, seq in enumerate(seqs)]
-#         _, _, toks = self.batch_converter(batch)
-#         toks = toks.to(self.device)
-#         with torch.no_grad():
-#             results = self.model(toks, repr_layers=[33], return_contacts=False)
-#         reps = results["representations"][33]
-#         return reps[:, 1:-1].mean(1).cpu().numpy()
-
-
 class ESMEmbedder:
     def __init__(self, device, model_name="esm2_t33_650M_UR50D"):
         # Try to load the specified ESM-2 model; fallback to esm1b if missing
@@ -280,39 +263,6 @@ class ESMEmbedder:
             all_embeddings.append(seq_vec.cpu().numpy())
         return np.vstack(all_embeddings)  # (N, D)
 
-
-# class ESMDBPEmbedder:
-#     def __init__(self, device):
-#         base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
-#         model_path = (
-#             Path(__file__).resolve().parent.parent
-#             / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
-#         )
-#         checkpoint = torch.load(model_path, map_location="cpu")
-#         clean_sd = {}
-#         for k, v in checkpoint.items():
-#             clean_sd[k.replace("module.", "")] = v
-#         result = base_model.load_state_dict(clean_sd, strict=False)
-#         if result.missing_keys:
-#             print(f"[ESMDBP] missing keys: {result.missing_keys}")
-#         if result.unexpected_keys:
-#             print(f"[ESMDBP] unexpected keys: {result.unexpected_keys}")
-
-#         self.model = base_model.to(device).eval()
-#         self.alphabet = alphabet
-#         self.batch_converter = alphabet.get_batch_converter()
-#         self.device = device
-
-#     def embed(self, seqs):
-#         batch = [(str(i), seq) for i, seq in enumerate(seqs)]
-#         _, _, toks = self.batch_converter(batch)
-#         toks = toks.to(self.device)
-#         with torch.no_grad():
-#             out = self.model(toks, repr_layers=[33], return_contacts=False)
-#         reps = out["representations"][33]
-#         # skip start/end tokens
-#         return reps[:, 1:-1].mean(1).cpu().numpy()
-
 class ESMDBPEmbedder:
     def __init__(self, device):
         base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
@@ -412,149 +362,4 @@ class ProGenEmbedder:
         ).to(self.device)
         with torch.no_grad():
             last_hidden = self.model(**inputs).last_hidden_state
-        return last_hidden.mean(dim=1).cpu().numpy()
-
-# ---- main pipeline ----
-
-def get_embedder(name, device, for_dna=True):
-    name = name.lower()
-    if for_dna:
-        if name=="caduceus": return CaduceusEmbedder(device)
-        if name=="dnabert": return DNABertEmbedder(device)
-        if name=="nucleotide": return NucleotideTransformerEmbedder(device)
-        if name=="gpn": return GPNEmbedder(device)
-        if name=="segmentnt": return SegmentNTEmbedder(device)
-    else:
-        if name in ("esm",): return ESMEmbedder(device)
-        if name in ("esm-dbp","esm_dbp"): return ESMDBPEmbedder(device)
-        if name=="progen": return ProGenEmbedder(device)
-    raise ValueError(f"Unknown model {name} (for_dna={for_dna})")
-
-
-def pad_token_embeddings(list_of_arrays, pad_value=0.0):
-    """
-    list_of_arrays: list of (L_i, D) numpy arrays
-    Returns:
-        padded: (N, L_max, D) array
-        mask: (N, L_max) boolean array where True = real token, False = padding
-    """
-    N = len(list_of_arrays)
-    D = list_of_arrays[0].shape[1]
-    L_max = max(arr.shape[0] for arr in list_of_arrays)
-    padded = np.full((N, L_max, D), pad_value, dtype=list_of_arrays[0].dtype)
-    mask = np.zeros((N, L_max), dtype=bool)
-    for i, arr in enumerate(list_of_arrays):
-        L = arr.shape[0]
-        padded[i, :L] = arr
-        mask[i, :L] = True
-    return padded, mask
-
-def embed_and_save(seqs, ids, embedder, out_path):
-    embs = embedder.embed(seqs)
-
-    # Decide whether we got variable-length per-token outputs (list of (L, D))
-    is_variable_token = isinstance(embs, (list, tuple)) and len(embs) > 0 and hasattr(embs[0], "shape") and embs[0].ndim == 2
-
-    if is_variable_token:
-        # pad to (N, L_max, D) + mask
-        padded, mask = pad_token_embeddings(embs)
-        # Save both embeddings and mask together in an .npz for convenience
-        np.savez_compressed(out_path.with_suffix(".caduceus.npz"),
-                            embeddings=padded,
-                            mask=mask,
-                            ids=np.array(ids, dtype=object))
-    else:
-        # fixed shape output, e.g., pooled (N, D)
-        array = np.vstack(embs) if isinstance(embs, list) else embs
-        np.save(out_path, array)
-        with open(out_path.with_suffix(".ids"), "w") as f:
-            f.write("\n".join(ids))
-
-
-if __name__=="__main__":
-
-    p = argparse.ArgumentParser()
-    #p.add_argument("--peak_fasta", default="binding_peaks_unique.fa", help="FASTA of deduplicated binding peak sequences; if present this is used for DNA embedding instead of genome JSONs")
-    p.add_argument("--genome-json-dir", default=None, help="(fallback) directory of UCSC JSONs for full chromosome embedding if peak FASTA is missing or you explicitly want chromosomes")
-    p.add_argument("--skip-dna", action="store_true", help="if set, skip the chromosome embedding step") #if glm embeddings successful but not plm embeddings
-    p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
-    p.add_argument("--chrom-model", default="caduceus")
-    p.add_argument("--tf-model", default="esm-dbp")
-    p.add_argument("--out-dir", default="dpacman/model/embeddings")
-    p.add_argument("--device", default="cpu")
-    args = p.parse_args()
-
-    os.makedirs(args.out_dir, exist_ok=True)
-    device = args.device
-    print(device)
-
-    if not args.skip_dna:
-        if args.genome_json_dir == None:
-            dna_df = pd.read_parquet('/home/a03-akrishna/DPACMAN/dpacman/model/remap2022_crm_fimo_output_q_processed.parquet', engine='pyarrow')
-            #df.to_csv('/home/a03-akrishna/DPACMAN/dpacman/model/remap2022_crm_fimo_output_q_processed.csv', index=False)
-            peak_seqs = dna_df["dna_sequence"]
-            peak_ids = dna_df["ID"]
-            print(f"Embedding {len(peak_seqs)} binding peak sequences from processed remap data", flush=True)
-            dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
-            out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
-            embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
-
-        # peak_fasta = Path(args.peak_fasta)
-        # if peak_fasta.exists():
-        #     # Load peak sequences from FASTA
-        #     from Bio import SeqIO
-
-        #     peak_seqs = []
-        #     peak_ids = []
-        #     for rec in SeqIO.parse(peak_fasta, "fasta"):
-        #         peak_ids.append(rec.id)
-        #         peak_seqs.append(str(rec.seq))
-        #     print(f"Embedding {len(peak_seqs)} binding peak sequences from {peak_fasta}", flush=True)
-        #     dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
-        #     out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
-        #     embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
-        elif args.genome_json_dir:
-            # Legacy: load full chromosomes from JSONs (chr1–22, X, Y, M)
-            genome_dir = Path(args.genome_json_dir)
-            chrom_seqs, chrom_ids = [], []
-            primary_pattern = re.compile(r"^hg38_chr(?:[1-9]|1[0-9]|2[0-2]|X|Y|M)\.json$")
-            for j in sorted(genome_dir.iterdir()):
-                if not primary_pattern.match(j.name):
-                    continue
-                data = json.loads(j.read_text())
-                seq = data.get("dna") or data.get("sequence")
-                chrom = data.get("chrom") or j.stem.split("_")[-1]
-                chrom_seqs.append(seq)
-                chrom_ids.append(chrom)
-            cutoff = CaduceusEmbedder(device).chunk_size
-            long_chroms = [
-                (chrom, len(seq))
-                for chrom, seq in zip(chrom_ids, chrom_seqs)
-                if len(seq) > cutoff
-            ]
-            if long_chroms:
-                print("⚠️ Chromosomes exceeding Caduceus max tokens ({}):".format(cutoff))
-                for chrom, L in long_chroms:
-                    print(f"   {chrom}: {L} bases")
-            else:
-                print("All chromosomes ≤ Caduceus limit ({}).".format(cutoff))
-
-            chrom_embedder = get_embedder(args.chrom_model, device, for_dna=True)
-            out_chrom = Path(args.out_dir) / f"chrom_{args.chrom_model}.npy"
-            embed_and_save(chrom_seqs, chrom_ids, chrom_embedder, out_chrom)
-        else:
-            raise ValueError("No input for DNA embedding: provide a peak FASTA (default binding_peaks_unique.fa) or set --genome-json-dir for chromosome JSONs.")
-
-
-    #Load TF sequences
-    tf_seqs, tf_ids = [], []
-    for record in SeqIO.parse(args.tf_fasta, "fasta"):
-        tf_ids.append(record.id)
-        tf_seqs.append(str(record.seq))
-
-    # embed and save
-    tf_embedder = get_embedder(args.tf_model, device, for_dna=False)
-    out_tf = Path(args.out_dir) / f"tf_{args.tf_model}.npy"
-    embed_and_save(tf_seqs, tf_ids, tf_embedder, out_tf)
-
-    print("Done.")
+        return last_hidden.mean(dim=1).cpu().numpy()
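The deleted helpers are not gone, only relocated; a sketch of the new import sites per the files added in this commit:

# get_embedder now lives in the package __init__, the save helpers in utils
from dpacman.data_tasks.embeddings import get_embedder
from dpacman.data_tasks.embeddings.utils import pad_token_embeddings, embed_and_save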
dpacman/data_tasks/embeddings/protein.py
ADDED

File without changes
dpacman/data_tasks/embeddings/utils.py
ADDED

@@ -0,0 +1,47 @@
+"""
+Utility functions related to creating embeddings
+"""
+import numpy as np
+
+def pad_token_embeddings(list_of_arrays, pad_value=0.0):
+    """
+    list_of_arrays: list of (L_i, D) numpy arrays
+    Returns:
+        padded: (N, L_max, D) array
+        mask: (N, L_max) boolean array where True = real token, False = padding
+    """
+    N = len(list_of_arrays)
+    D = list_of_arrays[0].shape[1]
+    L_max = max(arr.shape[0] for arr in list_of_arrays)
+    padded = np.full((N, L_max, D), pad_value, dtype=list_of_arrays[0].dtype)
+    mask = np.zeros((N, L_max), dtype=bool)
+    for i, arr in enumerate(list_of_arrays):
+        L = arr.shape[0]
+        padded[i, :L] = arr
+        mask[i, :L] = True
+    return padded, mask
+
+def embed_and_save(seqs, ids, embedder, out_path):
+    """
+    Using the passed embedder, make embeddings
+    """
+    embs = embedder.embed(seqs)
+
+    # Decide whether we got variable-length per-token outputs (list of (L, D))
+    is_variable_token = isinstance(embs, (list, tuple)) and len(embs) > 0 and hasattr(embs[0], "shape") and embs[0].ndim == 2
+
+    if is_variable_token:
+        # pad to (N, L_max, D) + mask
+        padded, mask = pad_token_embeddings(embs)
+        # Save both embeddings and mask together in an .npz for convenience
+        np.savez_compressed(out_path.with_suffix(".caduceus.npz"),
+                            embeddings=padded,
+                            mask=mask,
+                            ids=np.array(ids, dtype=object),
+                            seqs=np.array(seqs, dtype=object))
+    else:
+        # fixed shape output, e.g., pooled (N, D)
+        array = np.vstack(embs) if isinstance(embs, list) else embs
+        np.save(out_path, array)
+        with open(out_path.with_suffix(".ids"), "w") as f:
+            f.write("\n".join(ids))
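A sketch of reading back the two output layouts embed_and_save produces (the file names below are hypothetical; the real ones are derived from out_path, and the object-dtype ids/seqs arrays need allow_pickle):

import numpy as np

# Variable-length per-token case: .npz with embeddings, mask, ids, seqs
data = np.load("peaks_caduceus.caduceus.npz", allow_pickle=True)
padded, mask, ids = data["embeddings"], data["mask"], data["ids"]

# Pooled (N, D) case: .npy plus a sidecar .ids file
pooled = np.load("peaks_caduceus.npy")
with open("peaks_caduceus.ids") as f:
    pooled_ids = f.read().splitlines()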
dpacman/data_tasks/split/remap.py
CHANGED

@@ -450,7 +450,8 @@ def main(cfg: DictConfig):
         )
         dna_assign, kept_by_split = results
 
-
+        # assign datapoints to cluster by their DNA cluster rep
+        edge_df["split"] = edge_df["dna_cluster_rep"].map(dna_assign)
     else:
         results = split_bipartite_by_components(
             edges,
@@ -498,9 +499,11 @@ def main(cfg: DictConfig):
     # ensure there is no overlap
     check_validity(train, val, test, split_by=cfg.data_task.split_by)
 
-
-    logger.info(f"Length of
-    logger.info(f"Length of
+    total = sum([len(train),len(val),len(test)])
+    logger.info(f"Length of train dataset: {len(train)} ({100*len(train)/total:.2f}%)")
+    logger.info(f"Length of val dataset: {len(val)} ({100*len(val)/total:.2f}%)")
+    logger.info(f"Length of test dataset: {len(test)} ({100*len(test)/total:.2f}%)")
+    logger.info(f"Total sequences = {total}. Same as edges size? {total==len(edge_df)}")
 
     # create the output dir
     split_out_dir = Path(root)/cfg.data_task.split_out_dir
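The new split assignment is a plain dictionary lookup over cluster representatives; a toy illustration with made-up values:

import pandas as pd

edge_df = pd.DataFrame({"dna_cluster_rep": ["c1", "c2", "c1", "c3"]})
dna_assign = {"c1": "train", "c2": "val", "c3": "test"}
edge_df["split"] = edge_df["dna_cluster_rep"].map(dna_assign)
print(edge_df["split"].value_counts())  # train 2, val 1, test 1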
dpacman/scripts/preprocess.py
CHANGED

@@ -15,6 +15,7 @@ from dpacman.data_tasks.fimo.run_fimo import main as run_fimo_main
 from dpacman.data_tasks.fimo.post_fimo import main as post_fimo_main
 from dpacman.data_tasks.cluster.remap import main as cluster_remap_main
 from dpacman.data_tasks.split.remap import main as split_remap_main
+from dpacman.data_tasks.embeddings.dna import main as embed_dna_main
 
 @hydra.main(
     config_path=str(root / "configs"), config_name="preprocess", version_base="1.3"
@@ -59,12 +60,19 @@ def main(cfg: DictConfig):
         else:
             raise ValueError(f"No clean pipeline defined for: {task_name}")
 
+    # Split
     elif task_type == "split":
         if task_name == "remap":
             split_remap_main(cfg)
         else:
             raise ValueError(f"No clean pipeline defined for: {task_name}")
 
+    # Embed
+    elif task_type=="embeddings":
+        if task_name == "dna":
+            embed_dna_main(cfg)
+        else:
+            raise ValueError(f"No clean pipeline defined for: {task_name}")
     # Unknown - error
     else:
         raise ValueError(f"Unknown task type: {task_type}")
dpacman/scripts/run_embeddings.sh
ADDED

@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# Manually specify values used in the config
+main_task="preprocess"
+data_task_type="embeddings"
+timestamp=$(date "+%Y-%m-%d_%H-%M-%S")
+
+run_dir="$HOME/DPACMAN/logs/${main_task}/${data_task_type}/runs/${timestamp}"
+mkdir -p "$run_dir"
+
+nohup python -u -m scripts.preprocess \
+    hydra.run.dir="${run_dir}" \
+    data_task="${data_task_type}/dna" \
+    > "${run_dir}/run.log" 2>&1 &
+
+echo $! > "${run_dir}/pid.txt"
dpacman/scripts/run_split.sh
CHANGED

@@ -5,7 +5,7 @@ main_task="preprocess"
 data_task_type="split"
 timestamp=$(date "+%Y-%m-%d_%H-%M-%S")
 
-run_dir="/
+run_dir="$HOME/DPACMAN/logs/${main_task}/${data_task_type}/runs/${timestamp}"
 mkdir -p "$run_dir"
 
 nohup python -u -m scripts.preprocess \
environment.yaml
CHANGED

@@ -19,12 +19,12 @@ dependencies:
   - matplotlib=3.10.*
   - pip:
     # Pull GPU wheels (CUDA 12.8) from PyTorch's cu128 index; fall back to PyPI for others
-    - --index-url https://download.pytorch.org/whl/
+    - --index-url https://download.pytorch.org/whl/cu129
     - --extra-index-url https://pypi.org/simple
 
-    # PyTorch + CUDA 12.
-    - torch==2.
-    - torchvision==0.
+    # PyTorch + CUDA 12.9
+    - torch==2.8
+    - torchvision==0.23
     # - torchaudio==2.7.1 # optional, if you need it
 
     # Lightning (classic)
@@ -41,5 +41,6 @@ dependencies:
     - scikit-learn==1.7.1
     - biopython==1.85
     - ortools==9.14.6206
+    - esm==3.2.1.post1
     # Your package in editable mode
     - -e .
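A quick post-install sanity check for the cu129 wheels pinned above (a sketch; the versions printed should match the pins):

import torch, torchvision

print(torch.__version__, torchvision.__version__)  # expect 2.8.x / 0.23.x
print("CUDA available:", torch.cuda.is_available())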