svincoff committed
Commit 37761e5 · 2 Parent(s): 5dc1e475c34ec8

Merge remote-tracking branch 'origin/embeddings'

dpacman/data/compute_embeddings.py ADDED
@@ -0,0 +1,307 @@
+ """
+ Plug-and-play embedding extraction for:
+   • Chromosome sequences (from raw UCSC JSON)
+   • TF sequences (transcription_factors.fasta)
+
+ Usage example (DNA + protein in one go):
+     module load miniconda/24.7.1
+     conda activate dpacman
+     python dpacman/data/compute_embeddings.py \
+         --genome-json-dir ../data_files/raw/genomes/hg38 \
+         --tf-fasta ../data_files/processed/tfclust/hg38_tf/transcription_factors.fasta \
+         --chrom-model caduceus \
+         --tf-model esm-dbp \
+         --out-dir ../data_files/processed/tfclust/hg38_tf/embeddings \
+         --device cuda
+ """
+ import os
+ import re
+ import argparse
+ import json
+ import time
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+ from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, pipeline
+ import esm
+ from Bio import SeqIO
+
+ # ---- model wrappers ----
+
+ class CaduceusEmbedder:
+     def __init__(self, device, chunk_size=131_072, overlap=0):
+         """
+         device: 'cpu' or 'cuda'
+         chunk_size: max bases (and thus tokens) to send in one forward pass
+         overlap: how many bases each window overlaps the previous; 0 = no overlap
+         """
+         model_name = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             model_name, trust_remote_code=True
+         )
+         self.model = AutoModel.from_pretrained(
+             model_name, trust_remote_code=True
+         ).to(device).eval()
+         self.device = device
+         self.chunk_size = chunk_size
+         self.step = chunk_size - overlap
+
+     def embed(self, seqs):
+         """
+         seqs: List[str] of DNA sequences (each <= chunk_size for this test)
+         returns: np.ndarray of shape (N, L, D), raw per-token embeddings
+         """
+         outputs = []
+         for seq in seqs:
+             # --- old windowing + mean-pooling logic, kept for reference ---
+             # window_vecs = []
+             # for i in range(0, len(seq), self.step):
+             #     chunk = seq[i : i + self.chunk_size]
+             #     if not chunk:
+             #         break
+             #     toks = self.tokenizer(
+             #         chunk,
+             #         return_tensors="pt",
+             #         padding=False,
+             #         truncation=True,
+             #         max_length=self.chunk_size
+             #     ).to(self.device)
+             #     with torch.no_grad():
+             #         out = self.model(**toks).last_hidden_state
+             #     window_vecs.append(out.mean(dim=1).squeeze(0).cpu())
+             # seq_emb = torch.stack(window_vecs, dim=0).mean(dim=0).numpy()
+             # outputs.append(seq_emb)
+
+             # --- new: raw per-token embeddings in one shot ---
+             toks = self.tokenizer(
+                 seq,
+                 return_tensors="pt",
+                 padding=False,
+                 truncation=True,
+                 max_length=self.chunk_size
+             ).to(self.device)
+             with torch.no_grad():
+                 out = self.model(**toks).last_hidden_state  # (1, L, D)
+             outputs.append(out.cpu().numpy()[0])  # (L, D)
+
+         # np.stack requires every sequence to yield the same token length L
+         return np.stack(outputs, axis=0)  # (N, L, D)
+
+     def benchmark(self, lengths=None):
+         """
+         Time embedding of single sequences of various lengths.
+         By default tests [5K, 10K, 50K, 100K, chunk_size].
+         """
+         tests = lengths or [5_000, 10_000, 50_000, 100_000, self.chunk_size]
+         print(f"→ Benchmarking Caduceus on device={self.device}")
+         for sz in tests:
+             seq = "A" * sz
+             # Warm-up
+             _ = self.embed([seq])
+             if self.device != "cpu":
+                 torch.cuda.synchronize()
+             t0 = time.perf_counter()
+             _ = self.embed([seq])
+             if self.device != "cpu":
+                 torch.cuda.synchronize()
+             t1 = time.perf_counter()
+             print(f"  length={sz:6,d}  time={(t1-t0)*1000:7.1f} ms")
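+
+ # Usage sketch (illustrative, not executed here): pooling the raw per-token
+ # Caduceus output back down to one vector per sequence, e.g. for clustering:
+ #   emb = CaduceusEmbedder("cuda")
+ #   per_tok = emb.embed(["ACGT" * 1000])  # (1, L, 256), L ≈ sequence length
+ #   pooled = per_tok.mean(axis=1)         # (1, 256)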
+
+ class DNABertEmbedder:
+     def __init__(self, device):
+         self.tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
+         self.model = AutoModel.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True).to(device).eval()
+         self.device = device
+
+     def embed(self, seqs):
+         embs = []
+         for s in seqs:
+             tokens = self.tokenizer(s, return_tensors="pt", padding=True)["input_ids"].to(self.device)
+             with torch.no_grad():
+                 out = self.model(tokens).last_hidden_state.mean(1)
+             embs.append(out.cpu().numpy())
+         return np.vstack(embs)
+
+ class NucleotideTransformerEmbedder:
+     def __init__(self, device):
+         # HF "feature-extraction" returns a list of (L, D) arrays for each input
+         # device: "cpu" or "cuda"
+         self.pipe = pipeline(
+             "feature-extraction",
+             model="InstaDeepAI/nucleotide-transformer-500m-1000g",
+             device=-1 if device == "cpu" else 0  # HF uses -1 for CPU, 0 for GPU
+         )
+
+     def embed(self, seqs):
+         """
+         seqs: List[str] of raw DNA sequences
+         returns: (N, D) array, one D-dim vector per sequence
+         """
+         all_embeddings = self.pipe(seqs, truncation=True, padding=True)
+         # all_embeddings is a list of (L, D) arrays
+         pooled = [np.mean(x, axis=0) for x in all_embeddings]
+         return np.vstack(pooled)
+
+ class ESMEmbedder:
+     def __init__(self, device):
+         self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
+         self.batch_converter = self.alphabet.get_batch_converter()
+         self.model.to(device).eval()
+         self.device = device
+
+     def embed(self, seqs):
+         batch = [(str(i), seq) for i, seq in enumerate(seqs)]
+         _, _, toks = self.batch_converter(batch)
+         toks = toks.to(self.device)
+         with torch.no_grad():
+             results = self.model(toks, repr_layers=[33], return_contacts=False)
+         reps = results["representations"][33]
+         # mean-pool over residues, skipping the BOS/EOS tokens
+         return reps[:, 1:-1].mean(1).cpu().numpy()
+
+ class ESMDBPEmbedder:
+     def __init__(self, device):
+         base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
+         model_path = (
+             Path(__file__).resolve().parent.parent
+             / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
+         )
+         checkpoint = torch.load(model_path, map_location="cpu")
+         # strip DataParallel's "module." prefix before loading
+         clean_sd = {}
+         for k, v in checkpoint.items():
+             clean_sd[k.replace("module.", "")] = v
+         result = base_model.load_state_dict(clean_sd, strict=False)
+         if result.missing_keys:
+             print(f"[ESMDBP] missing keys: {result.missing_keys}")
+         if result.unexpected_keys:
+             print(f"[ESMDBP] unexpected keys: {result.unexpected_keys}")
+
+         self.model = base_model.to(device).eval()
+         self.alphabet = alphabet
+         self.batch_converter = alphabet.get_batch_converter()
+         self.device = device
+
+     def embed(self, seqs):
+         batch = [(str(i), seq) for i, seq in enumerate(seqs)]
+         _, _, toks = self.batch_converter(batch)
+         toks = toks.to(self.device)
+         with torch.no_grad():
+             out = self.model(toks, repr_layers=[33], return_contacts=False)
+         reps = out["representations"][33]
+         # skip start/end tokens
+         return reps[:, 1:-1].mean(1).cpu().numpy()
+
+ class GPNEmbedder:
+     def __init__(self, device):
+         model_name = "songlab/gpn-msa-sapiens"
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModelForMaskedLM.from_pretrained(model_name)
+         self.model.to(device)
+         self.model.eval()
+         self.device = device
+
+     def embed(self, seqs):
+         inputs = self.tokenizer(
+             seqs,
+             return_tensors="pt",
+             padding=True,
+             truncation=True
+         ).to(self.device)
+
+         with torch.no_grad():
+             # masked-LM outputs don't expose last_hidden_state directly;
+             # request hidden states and take the final layer instead
+             outputs = self.model(**inputs, output_hidden_states=True)
+             last_hidden = outputs.hidden_states[-1]
+         return last_hidden.mean(dim=1).cpu().numpy()
+
+ class ProGenEmbedder:
+     def __init__(self, device):
+         model_name = "jinyuan22/ProGen2-base"
+         # ProGen2 checkpoints ship custom modeling code on the Hub,
+         # hence trust_remote_code
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+         self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(device).eval()
+         self.device = device
+
+     def embed(self, seqs):
+         inputs = self.tokenizer(
+             seqs,
+             return_tensors="pt",
+             padding=True,
+             truncation=True
+         ).to(self.device)
+         with torch.no_grad():
+             last_hidden = self.model(**inputs).last_hidden_state
+         return last_hidden.mean(dim=1).cpu().numpy()
+
+ # ---- main pipeline ----
+
+ def get_embedder(name, device, for_dna=True):
+     name = name.lower()
+     if for_dna:
+         if name == "caduceus": return CaduceusEmbedder(device)
+         if name == "dnabert": return DNABertEmbedder(device)
+         if name == "nucleotide": return NucleotideTransformerEmbedder(device)
+         if name == "gpn": return GPNEmbedder(device)
+     else:
+         if name in ("esm",): return ESMEmbedder(device)
+         if name in ("esm-dbp", "esm_dbp"): return ESMDBPEmbedder(device)
+         if name == "progen": return ProGenEmbedder(device)
+     raise ValueError(f"Unknown model {name} (for_dna={for_dna})")
+
+
+ def embed_and_save(seqs, ids, embedder, out_path):
+     embs = embedder.embed(seqs)
+     np.save(out_path, embs)
+     # sidecar .ids file keeps row order aligned with the saved array
+     with open(out_path.with_suffix(".ids"), "w") as f:
+         f.write("\n".join(ids))
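+
+ # Reloading the paired outputs later (a minimal sketch; names follow the
+ # defaults used below, e.g. embeddings/tf_esm-dbp.npy):
+ #   embs = np.load("embeddings/tf_esm-dbp.npy")
+ #   ids = Path("embeddings/tf_esm-dbp.ids").read_text().splitlines()
+ #   assert len(ids) == embs.shape[0]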
+
+ if __name__ == "__main__":
+
+     p = argparse.ArgumentParser()
+     p.add_argument("--genome-json-dir", default="data_files/raw/genomes/hg38", help="dir of UCSC JSONs")
+     p.add_argument("--skip-dna", action="store_true",
+                    help="skip the chromosome embedding step (e.g. when the DNA embeddings already succeeded but the protein embeddings did not)")
+     p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
+     p.add_argument("--chrom-model", default="caduceus")
+     p.add_argument("--tf-model", default="esm-dbp")
+     p.add_argument("--out-dir", default="data_files/processed/tfclust/hg38_tf/embeddings")
+     p.add_argument("--device", default="cpu")
+     args = p.parse_args()
+
+     os.makedirs(args.out_dir, exist_ok=True)
+     device = args.device
+
+     if not args.skip_dna:
+         # Load only primary chromosome JSONs (chr1-22, X, Y, M)
+         genome_dir = Path(args.genome_json_dir)
+         chrom_seqs, chrom_ids = [], []
+         primary_pattern = re.compile(r"^hg38_chr(?:[1-9]|1[0-9]|2[0-2]|X|Y|M)\.json$")
+         for j in sorted(genome_dir.iterdir()):
+             if not primary_pattern.match(j.name):
+                 continue
+             data = json.loads(j.read_text())
+             seq = data.get("dna") or data.get("sequence")
+             chrom = data.get("chrom") or j.stem.split("_")[-1]
+             chrom_seqs.append(seq)
+             chrom_ids.append(chrom)
+
+         # Warn about chromosomes longer than the Caduceus context window.
+         # Use the class default rather than instantiating a second model.
+         cutoff = 131_072
+         long_chroms = [(chrom, len(seq)) for chrom, seq in zip(chrom_ids, chrom_seqs) if len(seq) > cutoff]
+         if long_chroms:
+             print(f"⚠️ Chromosomes exceeding Caduceus max tokens ({cutoff}):")
+             for chrom, L in long_chroms:
+                 print(f"  {chrom}: {L} bases")
+         else:
+             print(f"All chromosomes ≤ Caduceus limit ({cutoff}).")
+
+         chrom_embedder = get_embedder(args.chrom_model, device, for_dna=True)
+         out_chrom = Path(args.out_dir) / f"chrom_{args.chrom_model}.npy"
+         embed_and_save(chrom_seqs, chrom_ids, chrom_embedder, out_chrom)
+
+     # Load TF sequences
+     tf_seqs, tf_ids = [], []
+     for record in SeqIO.parse(args.tf_fasta, "fasta"):
+         tf_ids.append(record.id)
+         tf_seqs.append(str(record.seq))
+
+     # Embed and save
+     tf_embedder = get_embedder(args.tf_model, device, for_dna=False)
+     out_tf = Path(args.out_dir) / f"tf_{args.tf_model}.npy"
+     embed_and_save(tf_seqs, tf_ids, tf_embedder, out_tf)
+
+     print("Done.")
dpacman/data/remap/post_fimo.py ADDED
@@ -0,0 +1,104 @@
+ #!/usr/bin/env python3
+ import os
+ import json
+ import uuid
+ import pandas as pd
+ import numpy as np
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # PATHS — edit these if needed
+ INPUT_CSV = "/home/a03-akrishna/DPACMAN/data_files/processed/post_fimo.csv"
+ OUTPUT_CSV = "/home/a03-akrishna/DPACMAN/data_files/processed/final.csv"
+ JSON_DIR = "/home/a03-svincoff/DPACMAN/dpacman/data_files/raw/genomes/hg38"
+ # ─────────────────────────────────────────────────────────────────────────────
+
+ def load_chrom_dna(chrom, cache):
+     """Load & cache the full chromosome 'dna' string from hg38_chr{chrom}.json."""
+     if chrom in cache:
+         return cache[chrom]
+     path = os.path.join(JSON_DIR, f"hg38_chr{chrom}.json")
+     if not os.path.isfile(path):
+         raise FileNotFoundError(f"Missing JSON for chr{chrom}: {path}")
+     with open(path) as f:
+         data = json.load(f)
+     cache[chrom] = data["dna"]
+     return cache[chrom]
+
+ def sigmoid_array(arr: np.ndarray) -> np.ndarray:
+     """Elementwise logistic sigmoid → values in (0, 1)."""
+     return 1.0 / (1.0 + np.exp(-arr))
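+
+ # Worked example (illustrative): sigmoid(0) = 0.5 for unscored bases, while
+ # sigmoid(10) ≈ 0.99995, so any chipscore of ~10 or more saturates to ~1.0;
+ # the transformed track is effectively a soft binary mask.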
+
+ def main():
+     # 1) load post-FIMO results
+     df = pd.read_csv(INPUT_CSV)
+
+     dna_cache = {}
+     records = []
+
+     # 2) for each TF-peak row, extract sequence & build scores
+     for _, row in df.iterrows():
+         tfid = row["TF_id"]
+         chrom = str(row["#chrom"])
+         cstart = int(row["contextStart"])
+         cend = int(row["contextEnd"])
+         peak_s = int(row["ChIPStart"])
+         peak_e = int(row["ChIPEnd"])
+         chipscore = int(row["chipscore"])
+         jaspar = str(row["jaspar"])
+
+         # pull out the exact context sequence (including any Ns)
+         dna = load_chrom_dna(chrom, dna_cache)
+         seq = dna[cstart:cend]
+         L = len(seq)
+
+         # initialize base-resolution scores
+         scores = np.zeros(L, dtype=int)
+
+         # fill ChIP-seq peak region
+         ps = peak_s - cstart
+         pe = peak_e - cstart
+         scores[ps:pe] = chipscore
+
+         # overlay JASPAR hits (+100)
+         if jaspar.strip():
+             for hit in jaspar.split(","):
+                 hs, he = hit.split("-")
+                 hs_i = max(int(hs) - cstart, 0)
+                 he_i = min(int(he) - cstart, L)
+                 scores[hs_i:he_i] = chipscore + 100
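+
+         # Worked example (illustrative): a [0, 10) context with a ChIP peak
+         # at [3, 7) (chipscore 5) and one JASPAR hit "4-6" yields
+         # scores = [0, 0, 0, 5, 105, 105, 5, 0, 0, 0]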
+
+         # stringify the raw scores
+         score_str = ",".join(map(str, scores.tolist()))
+
+         # sigmoid-transform
+         sig_vals = sigmoid_array(scores.astype(float))
+         score_sig = ",".join(f"{v:.4f}" for v in sig_vals.tolist())
+
+         records.append({
+             "TF_id": tfid,
+             "dna_sequence": seq,
+             "score_str": score_str,
+             "score_sig_r2": score_sig
+         })
+
+     # 3) assemble into a DataFrame
+     final_df = pd.DataFrame.from_records(records)
+
+     # 4) drop any exact TF+DNA duplicates
+     final_df = final_df.drop_duplicates(subset=["TF_id", "dna_sequence"]).reset_index(drop=True)
+
+     # 5) assign random IDs
+     tf_map = {tf: uuid.uuid4().hex[:8] for tf in final_df["TF_id"].unique()}
+     dna_map = {sq: uuid.uuid4().hex[:8] for sq in final_df["dna_sequence"].unique()}
+
+     final_df["tf_seq_id"] = final_df["TF_id"].map(tf_map)
+     final_df["dna_seq_id"] = final_df["dna_sequence"].map(dna_map)
+     final_df["ID"] = final_df["tf_seq_id"] + "_" + final_df["dna_seq_id"]
+
+     # 6) reorder and write out
+     cols = ["TF_id", "tf_seq_id", "dna_sequence", "dna_seq_id", "score_str", "score_sig_r2", "ID"]
+     final_df[cols].to_csv(OUTPUT_CSV, index=False)
+     print(f"Wrote {len(final_df)} rows → {OUTPUT_CSV}")
+
+ if __name__ == "__main__":
+     main()
dpacman/data/remap/pre_fimo.py ADDED
@@ -0,0 +1,61 @@
+ #!/usr/bin/env python3
+ import pandas as pd
+ import numpy as np
+
+ # ------------------------------------------------------------------
+ # PARAMETERS
+ INPUT_CSV = "/home/a03-akrishna/DPACMAN/dpacman/data/remap/full_crm.csv"
+ OUTPUT_CSV = "/home/a03-akrishna/DPACMAN/data_files/processed/clean_pre_fimo.csv"
+ WINDOW_TOTAL = 500  # total extra context bp around each peak
+ # ------------------------------------------------------------------
+
+ def main():
+     # 1) load
+     df = pd.read_csv(INPUT_CSV)
+
+     # 2) normalize chromosome names and keep only primary chromosomes
+     df = df.rename(columns={"#chrom": "chrom"})
+     df["chrom"] = df["chrom"].str.replace(r"^chr", "", regex=True)
+
+     valid = [str(i) for i in range(1, 23)] + ["X", "Y"]
+     df = df[df["chrom"].isin(valid)].reset_index(drop=True)
+
+     # 3) explode comma-separated TF names into one row per TF
+     df["TF_list"] = df["name"].str.split(",")
+     df = df.explode("TF_list").rename(columns={"TF_list": "TF"})
+     df["TF"] = df["TF"].str.strip()
+
+     # 4) draw a random left flank between 0 and WINDOW_TOTAL;
+     #    the right flank is whatever remains so the two sum to WINDOW_TOTAL
+     n = len(df)
+     df["left_context"] = np.random.randint(0, WINDOW_TOTAL + 1, size=n)
+     df["right_context"] = WINDOW_TOTAL - df["left_context"]
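+     # NOTE: this draw is unseeded, so the context windows differ between runs;
+     # call np.random.seed(...) beforehand if reproducible windows are needed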
+
+     # 5) compute contextStart / contextEnd
+     df["contextStart"] = (df["chromStart"] - df["left_context"]).clip(lower=0).astype(int)
+     df["contextEnd"] = (df["chromEnd"] + df["right_context"]).astype(int)
+
+     # 6) assemble output
+     out = df[[
+         "chrom",
+         "contextStart",
+         "chromStart",   # original ChIPStart
+         "chromEnd",     # original ChIPEnd
+         "contextEnd",
+         "score",        # original score column
+         "TF"
+     ]].rename(columns={
+         "chrom": "#chrom",
+         "chromStart": "ChIPStart",
+         "chromEnd": "ChIPEnd",
+         "score": "chipscore"
+     })
+
+     # 7) write CSV
+     out.to_csv(OUTPUT_CSV, index=False)
+     print(f"Wrote {len(out)} rows to {OUTPUT_CSV}")
+
+ if __name__ == "__main__":
+     main()
dpacman/data/remap/run_fimo.py ADDED
@@ -0,0 +1,160 @@
+ #!/usr/bin/env python3
+ import os
+ import json
+ import subprocess
+ import pandas as pd
+ from multiprocessing import Pool, cpu_count
+ from tqdm import tqdm
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # CONFIG — edit these paths if needed
+ INPUT_CSV = "/home/a03-akrishna/DPACMAN/data_files/processed/clean_pre_fimo.csv"
+ OUTPUT_CSV = "/home/a03-akrishna/DPACMAN/data_files/processed/post_fimo.csv"
+ JSON_DIR = "/home/a03-svincoff/DPACMAN/dpacman/data_files/raw/genomes/hg38"
+
+ # Full paths to MEME-suite binaries
+ FIMO_BIN = "/home/a03-svincoff/meme/bin/fimo"
+ FASTA_GET_MARKOV = "/home/a03-svincoff/meme/libexec/meme-5.5.8/fasta-get-markov"
+
+ # JASPAR MEME file
+ MOTIF_FILE = "/home/a03-svincoff/DPACMAN/dpacman/softwares/meme-5.5.8/tests/common/JASPAR_CORE_2014_vertebrates.meme"
+
+ # Working filenames
+ SEQ_FASTA = "to_scan.fa"
+ BG_MODEL = "bg_model.txt"
+ FIMO_OUTDIR = "fimo_out"
+
+ # FIMO parameters
+ PVAL_THRESH = 1e-4
+ MAX_STORED = 1000000
+
+ # How many parallel FIMO jobs (defaults to all cores)
+ N_JOBS = cpu_count()
+ # ─────────────────────────────────────────────────────────────────────────────
+
+ def load_chrom_dna(chrom, cache):
+     if chrom in cache:
+         return cache[chrom]
+     fname = os.path.join(JSON_DIR, f"hg38_chr{chrom}.json")
+     if not os.path.isfile(fname):
+         raise FileNotFoundError(f"Chrom JSON not found: {fname}")
+     with open(fname) as f:
+         cache[chrom] = json.load(f)["dna"]
+     return cache[chrom]
+
+ def extract_sequences(df):
+     dna_cache = {}
+     with open(SEQ_FASTA, "w") as fa:
+         for idx, row in df.iterrows():
+             chrom = str(row["#chrom"])
+             dna = load_chrom_dna(chrom, dna_cache)
+             start = int(row["contextStart"])
+             end = int(row["contextEnd"])
+             seq = dna[start:end]
+             fa.write(f">{idx}\n{seq}\n")
+
+ def run_markov():
+     subprocess.check_call([FASTA_GET_MARKOV, SEQ_FASTA, BG_MODEL],
+                           stdout=subprocess.DEVNULL,
+                           stderr=subprocess.DEVNULL)
+
+ def split_fasta(n_chunks):
+     """Round-robin split SEQ_FASTA into chunked FASTA files."""
+     out_handles = [open(f"to_scan_{i}.fa", "w") for i in range(n_chunks)]
+     with open(SEQ_FASTA) as inf:
+         header = None
+         seq_lines = []
+         for line in inf:
+             if line.startswith(">"):
+                 if header is not None:
+                     idx = int(header[1:].split()[0]) % n_chunks
+                     out_handles[idx].write(header)
+                     out_handles[idx].write("".join(seq_lines))
+                 header = line
+                 seq_lines = []
+             else:
+                 seq_lines.append(line)
+         # last record
+         if header is not None:
+             idx = int(header[1:].split()[0]) % n_chunks
+             out_handles[idx].write(header)
+             out_handles[idx].write("".join(seq_lines))
+     for o in out_handles:
+         o.close()
+     return [f"to_scan_{i}.fa" for i in range(n_chunks)]
+
+ def run_fimo_chunk(args):
+     """Run FIMO on one FASTA chunk."""
+     chunk_id, fasta_path = args
+     outdir = f"{FIMO_OUTDIR}_{chunk_id}"
+     os.makedirs(outdir, exist_ok=True)
+     print(f"▶ Chunk {chunk_id} starting FIMO", flush=True)
+     subprocess.check_call([
+         FIMO_BIN,
+         "--oc", outdir,
+         "--bgfile", BG_MODEL,
+         "--max-stored-scores", str(MAX_STORED),
+         "--thresh", str(PVAL_THRESH),
+         MOTIF_FILE,
+         fasta_path
+     ])
+     print(f"▶ Chunk {chunk_id} finished", flush=True)
+     return os.path.join(outdir, "fimo.tsv")
+
+ def annotate_with_fimo(df, fimo_tsv):
+     fdf = pd.read_csv(fimo_tsv, sep="\t", comment="#")
+     fdf["idx"] = fdf["sequence_name"].astype(int)
+     fdf = fdf.merge(df[["idx", "contextStart"]], on="idx", how="left")
+     # FIMO coordinates are 1-based inclusive within the scanned window;
+     # convert them to 0-based half-open genomic coordinates
+     fdf["genomic_start"] = fdf["contextStart"] + fdf["start"] - 1
+     fdf["genomic_end"] = fdf["contextStart"] + fdf["stop"]
+     fdf["coord"] = (
+         fdf["genomic_start"].astype(str)
+         + "-" +
+         fdf["genomic_end"].astype(str)
+     )
+     agg = fdf.groupby("idx")["coord"].agg(lambda hits: ",".join(hits))
+     df["jaspar"] = df["idx"].map(agg).fillna("")
+     return df
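+
+ # Worked example (illustrative): a FIMO hit with start=1, stop=8 in a window
+ # whose contextStart is 1000 maps to genomic_start=1000, genomic_end=1008,
+ # i.e. the 8-bp interval [1000, 1008), recorded as "1000-1008".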
+
+ def main():
+     # 1) load & number each TF occurrence
+     df = pd.read_csv(INPUT_CSV, low_memory=False)
+     df = df.reset_index().rename(columns={"index": "idx"})
+     df["TF_occurrence"] = df.groupby("TF").cumcount() + 1
+     df["TF_id"] = df["TF"] + "_seq" + df["TF_occurrence"].astype(str)
+
+     # 2) extract sequences & build BG model
+     extract_sequences(df)
+     print("▶ Building background model…", flush=True)
+     run_markov()
+
+     # 3) chunk FASTA and run FIMO in parallel
+     chunks = split_fasta(N_JOBS)
+     print(f"▶ Running FIMO in parallel ({N_JOBS} jobs)…", flush=True)
+     with Pool(N_JOBS) as pool:
+         tsv_paths = list(tqdm(
+             pool.imap(run_fimo_chunk, enumerate(chunks)),
+             total=len(chunks),
+             desc="FIMO chunks",
+             leave=True
+         ))
+
+     # 4) merge chunked TSVs
+     combined = pd.concat([
+         pd.read_csv(tsv, sep="\t", comment="#")
+         for tsv in tsv_paths
+     ], ignore_index=True)
+     merged_tsv = "fimo_combined.tsv"
+     combined.to_csv(merged_tsv, sep="\t", index=False)
+
+     # 5) annotate & write final CSV
+     df = annotate_with_fimo(df, merged_tsv)
+     final = df[[
+         "#chrom", "contextStart", "ChIPStart", "ChIPEnd",
+         "contextEnd", "chipscore", "TF", "TF_id", "jaspar"
+     ]]
+     final.to_csv(OUTPUT_CSV, index=False)
+     print(f"▶ Wrote {len(final)} rows → {OUTPUT_CSV}")
+
+ if __name__ == "__main__":
+     main()
dpacman/data/visualizations.py ADDED
@@ -0,0 +1,100 @@
+ import pandas as pd
+ import random
+ import matplotlib.pyplot as plt
+ import glob
+ import re
+ from pathlib import Path
+
+ def trim_sequence(seq: str, seq_flanked: str, total_len: int):
+     """
+     Return a substring of seq_flanked of length total_len that contains seq
+     at a random valid position. Also returns (upstream, downstream).
+     """
+     i = seq_flanked.find(seq)
+     if i < 0:
+         raise ValueError(f"Motif '{seq}' not found in flanked sequence.")
+     motif_len = len(seq)
+     extra = total_len - motif_len
+     left_avail = i
+     right_avail = len(seq_flanked) - (i + motif_len)
+     if extra > left_avail + right_avail:
+         raise ValueError("Not enough flank to reach desired length.")
+     # decide upstream bases
+     min_left = max(0, extra - right_avail)
+     max_left = min(extra, left_avail)
+     upstream = random.randint(min_left, max_left)
+     downstream = extra - upstream
+     start = i - upstream
+     end = i + motif_len + downstream
+     return seq_flanked[start:end], upstream, downstream
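+
+ # Usage sketch (illustrative): with a 4-bp motif inside a 10-bp flanked
+ # string and total_len=8, the trimmed window always contains the motif and
+ # the two flanks sum to the 4 extra bases:
+ #   trimmed, up, down = trim_sequence("TATA", "GGGTATACCC", 8)
+ #   assert "TATA" in trimmed and len(trimmed) == 8 and up + down == 4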
+
+
+ def process_and_plot(input_csv: Path, total_len: int, output_csv: Path, fig_dir: Path):
+     df = pd.read_csv(input_csv)
+     ups, downs, abs_pos, rel_pos = [], [], [], []
+     trimmed_seqs = []
+     for _, row in df.iterrows():
+         trimmed, u, d = trim_sequence(row['seq'], row['seq_flanked'], total_len)
+         trimmed_seqs.append(trimmed)
+         ups.append(u)
+         downs.append(d)
+         abs_pos.append(u)
+         rel_pos.append(u / (total_len - len(row['seq'])))
+     df_out = df.copy()
+     df_out['seq_trimmed'] = trimmed_seqs
+     df_out['motif_abs_start'] = abs_pos
+     df_out['motif_rel_pos'] = rel_pos
+     df_out.to_csv(output_csv, index=False)
+
+     basename = input_csv.stem
+     # Absolute position histogram
+     plt.figure(figsize=(6, 4))
+     plt.hist(df_out['motif_abs_start'], bins=50, edgecolor='k')
+     plt.title(f'{basename}: Absolute Motif Start')
+     plt.xlabel('Start Index (nt)')
+     plt.ylabel('Count')
+     plt.tight_layout()
+     plt.savefig(fig_dir / f"{basename}_abs.png")
+     plt.close()
+     # Relative position histogram
+     plt.figure(figsize=(6, 4))
+     plt.hist(df_out['motif_rel_pos'], bins=50, edgecolor='k')
+     plt.title(f'{basename}: Relative Motif Position')
+     plt.xlabel('Relative Position')
+     plt.ylabel('Count')
+     plt.tight_layout()
+     plt.savefig(fig_dir / f"{basename}_rel.png")
+     plt.close()
+
+ if __name__ == '__main__':
+     # === USER SETTINGS ===
+     PATTERN = '/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/tfclust/hg38/encRegTfbsClustered_hg38_chr*.csv'
+     CHR_FILTER = re.compile(
+         r'encRegTfbsClustered_hg38_chr([1-9]|1[0-9]|2[0-2]|X|Y)\.csv$'
+     )
+     DESIRED_LEN = 1000
+     OUTPUT_DIR = Path('trimmed_csvs')
+     FIG_DIR = Path('figures')
+     # =====================
+
+     OUTPUT_DIR.mkdir(exist_ok=True)
+     FIG_DIR.mkdir(exist_ok=True)
+     # Clear old figures
+     for f in FIG_DIR.iterdir():
+         if f.is_file():
+             f.unlink()
+
+     # Gather files and filter to pure chr1-22, X, Y
+     all_files = glob.glob(PATTERN)
+     files = [Path(f) for f in all_files if CHR_FILTER.match(Path(f).name)]
+     if not files:
+         print(f"No matching chr1-22, X, Y files found (pattern={PATTERN}).")
+         raise SystemExit(1)
+
+     for infile in sorted(files):
+         out_csv = OUTPUT_DIR / f"{infile.stem}_trimmed.csv"
+         try:
+             process_and_plot(infile, DESIRED_LEN, out_csv, FIG_DIR)
+             print(f"Processed {infile.name} -> {out_csv.name}; figures in {FIG_DIR}/")
+         except Exception as e:
+             print(f"Error processing {infile.name}: {e}")