svincoff committed
Commit 37761e5 · 2 Parent(s): 5dc1e475c34ec8

Merge remote-tracking branch 'origin/embeddings'

dpacman/data/compute_embeddings.py ADDED
@@ -0,0 +1,307 @@
+ """
+ Plug-and-play embedding extraction for:
+   • Chromosome sequences (from raw UCSC JSON)
+   • TF sequences (transcription_factors.fasta)
+
+ Usage example (DNA + protein in one go):
+     module load miniconda/24.7.1
+     conda activate dpacman
+     python dpacman/data/compute_embeddings.py \
+         --genome-json-dir ../data_files/raw/genomes/hg38 \
+         --tf-fasta ../data_files/processed/tfclust/hg38_tf/transcription_factors.fasta \
+         --chrom-model caduceus \
+         --tf-model esm-dbp \
+         --out-dir ../data_files/processed/tfclust/hg38_tf/embeddings \
+         --device cuda
+ """
+ import os
+ import re
+ import argparse
+ import json
+ import time
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+ from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, pipeline
+ import esm
+ from Bio import SeqIO
+
+ # ---- model wrappers ----
+
+ class CaduceusEmbedder:
+     def __init__(self, device, chunk_size=131_072, overlap=0):
+         """
+         device: 'cpu' or 'cuda'
+         chunk_size: max bases (and thus tokens) to send in one forward pass
+         overlap: how many bases each window overlaps the previous; 0 = no overlap
+         """
+         model_name = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             model_name, trust_remote_code=True
+         )
+         self.model = AutoModel.from_pretrained(
+             model_name, trust_remote_code=True
+         ).to(device).eval()
+         self.device = device
+         self.chunk_size = chunk_size
+         self.step = chunk_size - overlap
+
+     def embed(self, seqs):
+         """
+         seqs: List[str] of DNA sequences (each <= chunk_size for this test)
+         returns: np.ndarray of shape (N, L, D), raw per-token embeddings
+         """
+         outputs = []
+         for seq in seqs:
+             # --- old windowing + mean-pooling logic, kept for reference ---
+             # window_vecs = []
+             # for i in range(0, len(seq), self.step):
+             #     chunk = seq[i : i + self.chunk_size]
+             #     if not chunk:
+             #         break
+             #     toks = self.tokenizer(
+             #         chunk,
+             #         return_tensors="pt",
+             #         padding=False,
+             #         truncation=True,
+             #         max_length=self.chunk_size
+             #     ).to(self.device)
+             #     with torch.no_grad():
+             #         out = self.model(**toks).last_hidden_state
+             #     window_vecs.append(out.mean(dim=1).squeeze(0).cpu())
+             # seq_emb = torch.stack(window_vecs, dim=0).mean(dim=0).numpy()
+             # outputs.append(seq_emb)
+
+             # --- new: raw per-token embeddings in one shot ---
+             toks = self.tokenizer(
+                 seq,
+                 return_tensors="pt",
+                 padding=False,
+                 truncation=True,
+                 max_length=self.chunk_size
+             ).to(self.device)
+             with torch.no_grad():
+                 out = self.model(**toks).last_hidden_state  # (1, L, D)
+             outputs.append(out.cpu().numpy()[0])  # (L, D)
+
+         # np.stack requires every sequence to yield the same token length L
+         return np.stack(outputs, axis=0)  # (N, L, D)
+
+     def benchmark(self, lengths=None):
+         """
+         Time embedding of single sequences of various lengths.
+         By default tests [5K, 10K, 50K, 100K, chunk_size].
+         """
+         tests = lengths or [5_000, 10_000, 50_000, 100_000, self.chunk_size]
+         print(f"→ Benchmarking Caduceus on device={self.device}")
+         for sz in tests:
+             seq = "A" * sz
+             # Warm-up
+             _ = self.embed([seq])
+             if self.device != "cpu":
+                 torch.cuda.synchronize()
+             t0 = time.perf_counter()
+             _ = self.embed([seq])
+             if self.device != "cpu":
+                 torch.cuda.synchronize()
+             t1 = time.perf_counter()
+             print(f"  length={sz:6,d}  time={(t1-t0)*1000:7.1f} ms")
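+
+ # Usage sketch (illustrative, not executed here): pooling the raw per-token
+ # Caduceus output back down to one vector per sequence, e.g. for clustering:
+ #   emb = CaduceusEmbedder("cuda")
+ #   per_tok = emb.embed(["ACGT" * 1000])  # (1, L, 256), L ≈ sequence length
+ #   pooled = per_tok.mean(axis=1)         # (1, 256)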
+
+ class DNABertEmbedder:
+     def __init__(self, device):
+         self.tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
+         self.model = AutoModel.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True).to(device).eval()
+         self.device = device
+
+     def embed(self, seqs):
+         embs = []
+         for s in seqs:
+             tokens = self.tokenizer(s, return_tensors="pt", padding=True)["input_ids"].to(self.device)
+             with torch.no_grad():
+                 out = self.model(tokens).last_hidden_state.mean(1)
+             embs.append(out.cpu().numpy())
+         return np.vstack(embs)
+
+ class NucleotideTransformerEmbedder:
+     def __init__(self, device):
+         # HF "feature-extraction" returns a list of (L, D) arrays for each input
+         # device: "cpu" or "cuda"
+         self.pipe = pipeline(
+             "feature-extraction",
+             model="InstaDeepAI/nucleotide-transformer-500m-1000g",
+             device=-1 if device == "cpu" else 0  # HF uses -1 for CPU, 0 for GPU
+         )
+
+     def embed(self, seqs):
+         """
+         seqs: List[str] of raw DNA sequences
+         returns: (N, D) array, one D-dim vector per sequence
+         """
+         all_embeddings = self.pipe(seqs, truncation=True, padding=True)
+         # all_embeddings is a list of (L, D) arrays
+         pooled = [np.mean(x, axis=0) for x in all_embeddings]
+         return np.vstack(pooled)
+
+ class ESMEmbedder:
+     def __init__(self, device):
+         self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
+         self.batch_converter = self.alphabet.get_batch_converter()
+         self.model.to(device).eval()
+         self.device = device
+
+     def embed(self, seqs):
+         batch = [(str(i), seq) for i, seq in enumerate(seqs)]
+         _, _, toks = self.batch_converter(batch)
+         toks = toks.to(self.device)
+         with torch.no_grad():
+             results = self.model(toks, repr_layers=[33], return_contacts=False)
+         reps = results["representations"][33]
+         # mean-pool over residues, skipping the BOS/EOS tokens
+         return reps[:, 1:-1].mean(1).cpu().numpy()
+
+ class ESMDBPEmbedder:
+     def __init__(self, device):
+         base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
+         model_path = (
+             Path(__file__).resolve().parent.parent
+             / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
+         )
+         checkpoint = torch.load(model_path, map_location="cpu")
+         # strip DataParallel's "module." prefix before loading
+         clean_sd = {}
+         for k, v in checkpoint.items():
+             clean_sd[k.replace("module.", "")] = v
+         result = base_model.load_state_dict(clean_sd, strict=False)
+         if result.missing_keys:
+             print(f"[ESMDBP] missing keys: {result.missing_keys}")
+         if result.unexpected_keys:
+             print(f"[ESMDBP] unexpected keys: {result.unexpected_keys}")
+
+         self.model = base_model.to(device).eval()
+         self.alphabet = alphabet
+         self.batch_converter = alphabet.get_batch_converter()
+         self.device = device
+
+     def embed(self, seqs):
+         batch = [(str(i), seq) for i, seq in enumerate(seqs)]
+         _, _, toks = self.batch_converter(batch)
+         toks = toks.to(self.device)
+         with torch.no_grad():
+             out = self.model(toks, repr_layers=[33], return_contacts=False)
+         reps = out["representations"][33]
+         # skip start/end tokens
+         return reps[:, 1:-1].mean(1).cpu().numpy()
+
+ class GPNEmbedder:
+     def __init__(self, device):
+         model_name = "songlab/gpn-msa-sapiens"
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModelForMaskedLM.from_pretrained(model_name)
+         self.model.to(device)
+         self.model.eval()
+         self.device = device
+
+     def embed(self, seqs):
+         inputs = self.tokenizer(
+             seqs,
+             return_tensors="pt",
+             padding=True,
+             truncation=True
+         ).to(self.device)
+
+         with torch.no_grad():
+             # masked-LM outputs don't expose last_hidden_state directly;
+             # request hidden states and take the final layer instead
+             outputs = self.model(**inputs, output_hidden_states=True)
+             last_hidden = outputs.hidden_states[-1]
+         return last_hidden.mean(dim=1).cpu().numpy()
+
+ class ProGenEmbedder:
+     def __init__(self, device):
+         model_name = "jinyuan22/ProGen2-base"
+         # ProGen2 checkpoints ship custom modeling code on the Hub,
+         # hence trust_remote_code
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+         self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(device).eval()
+         self.device = device
+
+     def embed(self, seqs):
+         inputs = self.tokenizer(
+             seqs,
+             return_tensors="pt",
+             padding=True,
+             truncation=True
+         ).to(self.device)
+         with torch.no_grad():
+             last_hidden = self.model(**inputs).last_hidden_state
+         return last_hidden.mean(dim=1).cpu().numpy()
+
+ # ---- main pipeline ----
+
+ def get_embedder(name, device, for_dna=True):
+     name = name.lower()
+     if for_dna:
+         if name == "caduceus": return CaduceusEmbedder(device)
+         if name == "dnabert": return DNABertEmbedder(device)
+         if name == "nucleotide": return NucleotideTransformerEmbedder(device)
+         if name == "gpn": return GPNEmbedder(device)
+     else:
+         if name in ("esm",): return ESMEmbedder(device)
+         if name in ("esm-dbp", "esm_dbp"): return ESMDBPEmbedder(device)
+         if name == "progen": return ProGenEmbedder(device)
+     raise ValueError(f"Unknown model {name} (for_dna={for_dna})")
+
+
+ def embed_and_save(seqs, ids, embedder, out_path):
+     embs = embedder.embed(seqs)
+     np.save(out_path, embs)
+     # sidecar .ids file keeps row order aligned with the saved array
+     with open(out_path.with_suffix(".ids"), "w") as f:
+         f.write("\n".join(ids))
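+
+ # Reloading the paired outputs later (a minimal sketch; names follow the
+ # defaults used below, e.g. embeddings/tf_esm-dbp.npy):
+ #   embs = np.load("embeddings/tf_esm-dbp.npy")
+ #   ids = Path("embeddings/tf_esm-dbp.ids").read_text().splitlines()
+ #   assert len(ids) == embs.shape[0]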
+
+ if __name__ == "__main__":
+
+     p = argparse.ArgumentParser()
+     p.add_argument("--genome-json-dir", default="data_files/raw/genomes/hg38", help="dir of UCSC JSONs")
+     p.add_argument("--skip-dna", action="store_true",
+                    help="skip the chromosome embedding step (e.g. when the DNA embeddings already succeeded but the protein embeddings did not)")
+     p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
+     p.add_argument("--chrom-model", default="caduceus")
+     p.add_argument("--tf-model", default="esm-dbp")
+     p.add_argument("--out-dir", default="data_files/processed/tfclust/hg38_tf/embeddings")
+     p.add_argument("--device", default="cpu")
+     args = p.parse_args()
+
+     os.makedirs(args.out_dir, exist_ok=True)
+     device = args.device
+
+     if not args.skip_dna:
+         # Load only primary chromosome JSONs (chr1-22, X, Y, M)
+         genome_dir = Path(args.genome_json_dir)
+         chrom_seqs, chrom_ids = [], []
+         primary_pattern = re.compile(r"^hg38_chr(?:[1-9]|1[0-9]|2[0-2]|X|Y|M)\.json$")
+         for j in sorted(genome_dir.iterdir()):
+             if not primary_pattern.match(j.name):
+                 continue
+             data = json.loads(j.read_text())
+             seq = data.get("dna") or data.get("sequence")
+             chrom = data.get("chrom") or j.stem.split("_")[-1]
+             chrom_seqs.append(seq)
+             chrom_ids.append(chrom)
+
+         # Warn about chromosomes longer than the Caduceus context window.
+         # Use the class default rather than instantiating a second model.
+         cutoff = 131_072
+         long_chroms = [(chrom, len(seq)) for chrom, seq in zip(chrom_ids, chrom_seqs) if len(seq) > cutoff]
+         if long_chroms:
+             print(f"⚠️ Chromosomes exceeding Caduceus max tokens ({cutoff}):")
+             for chrom, L in long_chroms:
+                 print(f"  {chrom}: {L} bases")
+         else:
+             print(f"All chromosomes ≤ Caduceus limit ({cutoff}).")
+
+         chrom_embedder = get_embedder(args.chrom_model, device, for_dna=True)
+         out_chrom = Path(args.out_dir) / f"chrom_{args.chrom_model}.npy"
+         embed_and_save(chrom_seqs, chrom_ids, chrom_embedder, out_chrom)
+
+     # Load TF sequences
+     tf_seqs, tf_ids = [], []
+     for record in SeqIO.parse(args.tf_fasta, "fasta"):
+         tf_ids.append(record.id)
+         tf_seqs.append(str(record.seq))
+
+     # Embed and save
+     tf_embedder = get_embedder(args.tf_model, device, for_dna=False)
+     out_tf = Path(args.out_dir) / f"tf_{args.tf_model}.npy"
+     embed_and_save(tf_seqs, tf_ids, tf_embedder, out_tf)
+
+     print("Done.")
dpacman/data/remap/post_fimo.py ADDED
@@ -0,0 +1,104 @@
+ #!/usr/bin/env python3
+ import os
+ import json
+ import uuid
+ import pandas as pd
+ import numpy as np
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # PATHS — edit these if needed
+ INPUT_CSV = "/home/a03-akrishna/DPACMAN/data_files/processed/post_fimo.csv"
+ OUTPUT_CSV = "/home/a03-akrishna/DPACMAN/data_files/processed/final.csv"
+ JSON_DIR = "/home/a03-svincoff/DPACMAN/dpacman/data_files/raw/genomes/hg38"
+ # ─────────────────────────────────────────────────────────────────────────────
+
+ def load_chrom_dna(chrom, cache):
+     """Load & cache the full chromosome 'dna' string from hg38_chr{chrom}.json."""
+     if chrom in cache:
+         return cache[chrom]
+     path = os.path.join(JSON_DIR, f"hg38_chr{chrom}.json")
+     if not os.path.isfile(path):
+         raise FileNotFoundError(f"Missing JSON for chr{chrom}: {path}")
+     with open(path) as f:
+         data = json.load(f)
+     cache[chrom] = data["dna"]
+     return cache[chrom]
+
+ def sigmoid_array(arr: np.ndarray) -> np.ndarray:
+     """Elementwise logistic sigmoid → values in (0, 1)."""
+     return 1.0 / (1.0 + np.exp(-arr))
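+
+ # Worked example (illustrative): sigmoid(0) = 0.5 for unscored bases, while
+ # sigmoid(10) ≈ 0.99995, so any chipscore of ~10 or more saturates to ~1.0;
+ # the transformed track is effectively a soft binary mask.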
+
+ def main():
+     # 1) load post-FIMO results
+     df = pd.read_csv(INPUT_CSV)
+
+     dna_cache = {}
+     records = []
+
+     # 2) for each TF-peak row, extract sequence & build scores
+     for _, row in df.iterrows():
+         tfid = row["TF_id"]
+         chrom = str(row["#chrom"])
+         cstart = int(row["contextStart"])
+         cend = int(row["contextEnd"])
+         peak_s = int(row["ChIPStart"])
+         peak_e = int(row["ChIPEnd"])
+         chipscore = int(row["chipscore"])
+         jaspar = str(row["jaspar"])
+
+         # pull out the exact context sequence (including any Ns)
+         dna = load_chrom_dna(chrom, dna_cache)
+         seq = dna[cstart:cend]
+         L = len(seq)
+
+         # initialize base-resolution scores
+         scores = np.zeros(L, dtype=int)
+
+         # fill ChIP-seq peak region
+         ps = peak_s - cstart
+         pe = peak_e - cstart
+         scores[ps:pe] = chipscore
+
+         # overlay JASPAR hits (+100)
+         if jaspar.strip():
+             for hit in jaspar.split(","):
+                 hs, he = hit.split("-")
+                 hs_i = max(int(hs) - cstart, 0)
+                 he_i = min(int(he) - cstart, L)
+                 scores[hs_i:he_i] = chipscore + 100
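+
+         # Worked example (illustrative): a [0, 10) context with a ChIP peak
+         # at [3, 7) (chipscore 5) and one JASPAR hit "4-6" yields
+         # scores = [0, 0, 0, 5, 105, 105, 5, 0, 0, 0]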
+
+         # stringify the raw scores
+         score_str = ",".join(map(str, scores.tolist()))
+
+         # sigmoid-transform
+         sig_vals = sigmoid_array(scores.astype(float))
+         score_sig = ",".join(f"{v:.4f}" for v in sig_vals.tolist())
+
+         records.append({
+             "TF_id": tfid,
+             "dna_sequence": seq,
+             "score_str": score_str,
+             "score_sig_r2": score_sig
+         })
+
+     # 3) assemble into a DataFrame
+     final_df = pd.DataFrame.from_records(records)
+
+     # 4) drop any exact TF+DNA duplicates
+     final_df = final_df.drop_duplicates(subset=["TF_id", "dna_sequence"]).reset_index(drop=True)
+
+     # 5) assign random IDs
+     tf_map = {tf: uuid.uuid4().hex[:8] for tf in final_df["TF_id"].unique()}
+     dna_map = {sq: uuid.uuid4().hex[:8] for sq in final_df["dna_sequence"].unique()}
+
+     final_df["tf_seq_id"] = final_df["TF_id"].map(tf_map)
+     final_df["dna_seq_id"] = final_df["dna_sequence"].map(dna_map)
+     final_df["ID"] = final_df["tf_seq_id"] + "_" + final_df["dna_seq_id"]
+
+     # 6) reorder and write out
+     cols = ["TF_id", "tf_seq_id", "dna_sequence", "dna_seq_id", "score_str", "score_sig_r2", "ID"]
+     final_df[cols].to_csv(OUTPUT_CSV, index=False)
+     print(f"Wrote {len(final_df)} rows → {OUTPUT_CSV}")
+
+ if __name__ == "__main__":
+     main()
dpacman/data/remap/pre_fimo.py ADDED
@@ -0,0 +1,61 @@
+ #!/usr/bin/env python3
+ import pandas as pd
+ import numpy as np
+
+ # ------------------------------------------------------------------
+ # PARAMETERS
+ INPUT_CSV = "/home/a03-akrishna/DPACMAN/dpacman/data/remap/full_crm.csv"
+ OUTPUT_CSV = "/home/a03-akrishna/DPACMAN/data_files/processed/clean_pre_fimo.csv"
+ WINDOW_TOTAL = 500  # total extra context bp around each peak
+ # ------------------------------------------------------------------
+
+ def main():
+     # 1) load
+     df = pd.read_csv(INPUT_CSV)
+
+     # 2) normalize chromosome names and keep only primary chromosomes
+     df = df.rename(columns={"#chrom": "chrom"})
+     df["chrom"] = df["chrom"].str.replace(r"^chr", "", regex=True)
+
+     valid = [str(i) for i in range(1, 23)] + ["X", "Y"]
+     df = df[df["chrom"].isin(valid)].reset_index(drop=True)
+
+     # 3) explode comma-separated TF names into one row per TF
+     df["TF_list"] = df["name"].str.split(",")
+     df = df.explode("TF_list").rename(columns={"TF_list": "TF"})
+     df["TF"] = df["TF"].str.strip()
+
+     # 4) draw a random left flank between 0 and WINDOW_TOTAL;
+     #    the right flank is whatever remains so the two sum to WINDOW_TOTAL
+     n = len(df)
+     df["left_context"] = np.random.randint(0, WINDOW_TOTAL + 1, size=n)
+     df["right_context"] = WINDOW_TOTAL - df["left_context"]
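+     # NOTE: this draw is unseeded, so the context windows differ between runs;
+     # call np.random.seed(...) beforehand if reproducible windows are needed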
+
+     # 5) compute contextStart / contextEnd
+     df["contextStart"] = (df["chromStart"] - df["left_context"]).clip(lower=0).astype(int)
+     df["contextEnd"] = (df["chromEnd"] + df["right_context"]).astype(int)
+
+     # 6) assemble output
+     out = df[[
+         "chrom",
+         "contextStart",
+         "chromStart",   # original ChIPStart
+         "chromEnd",     # original ChIPEnd
+         "contextEnd",
+         "score",        # original score column
+         "TF"
+     ]].rename(columns={
+         "chrom": "#chrom",
+         "chromStart": "ChIPStart",
+         "chromEnd": "ChIPEnd",
+         "score": "chipscore"
+     })
+
+     # 7) write CSV
+     out.to_csv(OUTPUT_CSV, index=False)
+     print(f"Wrote {len(out)} rows to {OUTPUT_CSV}")
+
+ if __name__ == "__main__":
+     main()
dpacman/data/remap/run_fimo.py ADDED
@@ -0,0 +1,160 @@
+ #!/usr/bin/env python3
+ import os
+ import json
+ import subprocess
+ import pandas as pd
+ from multiprocessing import Pool, cpu_count
+ from tqdm import tqdm
+
+ # ─────────────────────────────────────────────────────────────────────────────
+ # CONFIG — edit these paths if needed
+ INPUT_CSV = "/home/a03-akrishna/DPACMAN/data_files/processed/clean_pre_fimo.csv"
+ OUTPUT_CSV = "/home/a03-akrishna/DPACMAN/data_files/processed/post_fimo.csv"
+ JSON_DIR = "/home/a03-svincoff/DPACMAN/dpacman/data_files/raw/genomes/hg38"
+
+ # Full paths to MEME-suite binaries
+ FIMO_BIN = "/home/a03-svincoff/meme/bin/fimo"
+ FASTA_GET_MARKOV = "/home/a03-svincoff/meme/libexec/meme-5.5.8/fasta-get-markov"
+
+ # JASPAR MEME file
+ MOTIF_FILE = "/home/a03-svincoff/DPACMAN/dpacman/softwares/meme-5.5.8/tests/common/JASPAR_CORE_2014_vertebrates.meme"
+
+ # Working filenames
+ SEQ_FASTA = "to_scan.fa"
+ BG_MODEL = "bg_model.txt"
+ FIMO_OUTDIR = "fimo_out"
+
+ # FIMO parameters
+ PVAL_THRESH = 1e-4
+ MAX_STORED = 1000000
+
+ # How many parallel FIMO jobs (defaults to all cores)
+ N_JOBS = cpu_count()
+ # ─────────────────────────────────────────────────────────────────────────────
+
+ def load_chrom_dna(chrom, cache):
+     if chrom in cache:
+         return cache[chrom]
+     fname = os.path.join(JSON_DIR, f"hg38_chr{chrom}.json")
+     if not os.path.isfile(fname):
+         raise FileNotFoundError(f"Chrom JSON not found: {fname}")
+     with open(fname) as f:
+         cache[chrom] = json.load(f)["dna"]
+     return cache[chrom]
+
+ def extract_sequences(df):
+     dna_cache = {}
+     with open(SEQ_FASTA, "w") as fa:
+         for idx, row in df.iterrows():
+             chrom = str(row["#chrom"])
+             dna = load_chrom_dna(chrom, dna_cache)
+             start = int(row["contextStart"])
+             end = int(row["contextEnd"])
+             seq = dna[start:end]
+             fa.write(f">{idx}\n{seq}\n")
+
+ def run_markov():
+     subprocess.check_call([FASTA_GET_MARKOV, SEQ_FASTA, BG_MODEL],
+                           stdout=subprocess.DEVNULL,
+                           stderr=subprocess.DEVNULL)
+
+ def split_fasta(n_chunks):
+     """Round-robin split SEQ_FASTA into chunked FASTA files."""
+     out_handles = [open(f"to_scan_{i}.fa", "w") for i in range(n_chunks)]
+     with open(SEQ_FASTA) as inf:
+         header = None
+         seq_lines = []
+         for line in inf:
+             if line.startswith(">"):
+                 if header is not None:
+                     idx = int(header[1:].split()[0]) % n_chunks
+                     out_handles[idx].write(header)
+                     out_handles[idx].write("".join(seq_lines))
+                 header = line
+                 seq_lines = []
+             else:
+                 seq_lines.append(line)
+         # last record
+         if header is not None:
+             idx = int(header[1:].split()[0]) % n_chunks
+             out_handles[idx].write(header)
+             out_handles[idx].write("".join(seq_lines))
+     for o in out_handles:
+         o.close()
+     return [f"to_scan_{i}.fa" for i in range(n_chunks)]
+
+ def run_fimo_chunk(args):
+     """Run FIMO on one FASTA chunk."""
+     chunk_id, fasta_path = args
+     outdir = f"{FIMO_OUTDIR}_{chunk_id}"
+     os.makedirs(outdir, exist_ok=True)
+     print(f"▶ Chunk {chunk_id} starting FIMO", flush=True)
+     subprocess.check_call([
+         FIMO_BIN,
+         "--oc", outdir,
+         "--bgfile", BG_MODEL,
+         "--max-stored-scores", str(MAX_STORED),
+         "--thresh", str(PVAL_THRESH),
+         MOTIF_FILE,
+         fasta_path
+     ])
+     print(f"▶ Chunk {chunk_id} finished", flush=True)
+     return os.path.join(outdir, "fimo.tsv")
+
+ def annotate_with_fimo(df, fimo_tsv):
+     fdf = pd.read_csv(fimo_tsv, sep="\t", comment="#")
+     fdf["idx"] = fdf["sequence_name"].astype(int)
+     fdf = fdf.merge(df[["idx", "contextStart"]], on="idx", how="left")
+     # FIMO coordinates are 1-based inclusive within the scanned window;
+     # convert them to 0-based half-open genomic coordinates
+     fdf["genomic_start"] = fdf["contextStart"] + fdf["start"] - 1
+     fdf["genomic_end"] = fdf["contextStart"] + fdf["stop"]
+     fdf["coord"] = (
+         fdf["genomic_start"].astype(str)
+         + "-" +
+         fdf["genomic_end"].astype(str)
+     )
+     agg = fdf.groupby("idx")["coord"].agg(lambda hits: ",".join(hits))
+     df["jaspar"] = df["idx"].map(agg).fillna("")
+     return df
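+
+ # Worked example (illustrative): a FIMO hit with start=1, stop=8 in a window
+ # whose contextStart is 1000 maps to genomic_start=1000, genomic_end=1008,
+ # i.e. the 8-bp interval [1000, 1008), recorded as "1000-1008".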
+
+ def main():
+     # 1) load & number each TF occurrence
+     df = pd.read_csv(INPUT_CSV, low_memory=False)
+     df = df.reset_index().rename(columns={"index": "idx"})
+     df["TF_occurrence"] = df.groupby("TF").cumcount() + 1
+     df["TF_id"] = df["TF"] + "_seq" + df["TF_occurrence"].astype(str)
+
+     # 2) extract sequences & build BG model
+     extract_sequences(df)
+     print("▶ Building background model…", flush=True)
+     run_markov()
+
+     # 3) chunk FASTA and run FIMO in parallel
+     chunks = split_fasta(N_JOBS)
+     print(f"▶ Running FIMO in parallel ({N_JOBS} jobs)…", flush=True)
+     with Pool(N_JOBS) as pool:
+         tsv_paths = list(tqdm(
+             pool.imap(run_fimo_chunk, enumerate(chunks)),
+             total=len(chunks),
+             desc="FIMO chunks",
+             leave=True
+         ))
+
+     # 4) merge chunked TSVs
+     combined = pd.concat([
+         pd.read_csv(tsv, sep="\t", comment="#")
+         for tsv in tsv_paths
+     ], ignore_index=True)
+     merged_tsv = "fimo_combined.tsv"
+     combined.to_csv(merged_tsv, sep="\t", index=False)
+
+     # 5) annotate & write final CSV
+     df = annotate_with_fimo(df, merged_tsv)
+     final = df[[
+         "#chrom", "contextStart", "ChIPStart", "ChIPEnd",
+         "contextEnd", "chipscore", "TF", "TF_id", "jaspar"
+     ]]
+     final.to_csv(OUTPUT_CSV, index=False)
+     print(f"▶ Wrote {len(final)} rows → {OUTPUT_CSV}")
+
+ if __name__ == "__main__":
+     main()
dpacman/data/visualizations.py ADDED
@@ -0,0 +1,100 @@
+ import pandas as pd
+ import random
+ import matplotlib.pyplot as plt
+ import glob
+ import re
+ from pathlib import Path
+
+ def trim_sequence(seq: str, seq_flanked: str, total_len: int):
+     """
+     Return a substring of seq_flanked of length total_len that contains seq
+     at a random valid position. Also returns (upstream, downstream).
+     """
+     i = seq_flanked.find(seq)
+     if i < 0:
+         raise ValueError(f"Motif '{seq}' not found in flanked sequence.")
+     motif_len = len(seq)
+     extra = total_len - motif_len
+     left_avail = i
+     right_avail = len(seq_flanked) - (i + motif_len)
+     if extra > left_avail + right_avail:
+         raise ValueError("Not enough flank to reach desired length.")
+     # decide upstream bases
+     min_left = max(0, extra - right_avail)
+     max_left = min(extra, left_avail)
+     upstream = random.randint(min_left, max_left)
+     downstream = extra - upstream
+     start = i - upstream
+     end = i + motif_len + downstream
+     return seq_flanked[start:end], upstream, downstream
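+
+ # Usage sketch (illustrative): with a 4-bp motif inside a 10-bp flanked
+ # string and total_len=8, the trimmed window always contains the motif and
+ # the two flanks sum to the 4 extra bases:
+ #   trimmed, up, down = trim_sequence("TATA", "GGGTATACCC", 8)
+ #   assert "TATA" in trimmed and len(trimmed) == 8 and up + down == 4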
+
+
+ def process_and_plot(input_csv: Path, total_len: int, output_csv: Path, fig_dir: Path):
+     df = pd.read_csv(input_csv)
+     ups, downs, abs_pos, rel_pos = [], [], [], []
+     trimmed_seqs = []
+     for _, row in df.iterrows():
+         trimmed, u, d = trim_sequence(row['seq'], row['seq_flanked'], total_len)
+         trimmed_seqs.append(trimmed)
+         ups.append(u)
+         downs.append(d)
+         abs_pos.append(u)
+         rel_pos.append(u / (total_len - len(row['seq'])))
+     df_out = df.copy()
+     df_out['seq_trimmed'] = trimmed_seqs
+     df_out['motif_abs_start'] = abs_pos
+     df_out['motif_rel_pos'] = rel_pos
+     df_out.to_csv(output_csv, index=False)
+
+     basename = input_csv.stem
+     # Absolute position histogram
+     plt.figure(figsize=(6, 4))
+     plt.hist(df_out['motif_abs_start'], bins=50, edgecolor='k')
+     plt.title(f'{basename}: Absolute Motif Start')
+     plt.xlabel('Start Index (nt)')
+     plt.ylabel('Count')
+     plt.tight_layout()
+     plt.savefig(fig_dir / f"{basename}_abs.png")
+     plt.close()
+     # Relative position histogram
+     plt.figure(figsize=(6, 4))
+     plt.hist(df_out['motif_rel_pos'], bins=50, edgecolor='k')
+     plt.title(f'{basename}: Relative Motif Position')
+     plt.xlabel('Relative Position')
+     plt.ylabel('Count')
+     plt.tight_layout()
+     plt.savefig(fig_dir / f"{basename}_rel.png")
+     plt.close()
+
+ if __name__ == '__main__':
+     # === USER SETTINGS ===
+     PATTERN = '/home/a03-svincoff/DPACMAN/dpacman/data_files/processed/tfclust/hg38/encRegTfbsClustered_hg38_chr*.csv'
+     CHR_FILTER = re.compile(
+         r'encRegTfbsClustered_hg38_chr([1-9]|1[0-9]|2[0-2]|X|Y)\.csv$'
+     )
+     DESIRED_LEN = 1000
+     OUTPUT_DIR = Path('trimmed_csvs')
+     FIG_DIR = Path('figures')
+     # =====================
+
+     OUTPUT_DIR.mkdir(exist_ok=True)
+     FIG_DIR.mkdir(exist_ok=True)
+     # Clear old figures
+     for f in FIG_DIR.iterdir():
+         if f.is_file():
+             f.unlink()
+
+     # Gather files and filter to pure chr1-22, X, Y
+     all_files = glob.glob(PATTERN)
+     files = [Path(f) for f in all_files if CHR_FILTER.match(Path(f).name)]
+     if not files:
+         print(f"No matching chr1-22, X, Y files found (pattern={PATTERN}).")
+         raise SystemExit(1)
+
+     for infile in sorted(files):
+         out_csv = OUTPUT_DIR / f"{infile.stem}_trimmed.csv"
+         try:
+             process_and_plot(infile, DESIRED_LEN, out_csv, FIG_DIR)
+             print(f"Processed {infile.name} -> {out_csv.name}; figures in {FIG_DIR}/")
+         except Exception as e:
+             print(f"Error processing {infile.name}: {e}")