svincoff committed
Commit 1927f8f · 1 Parent(s): 3b9cde0

finished fimo pipeline

.gitignore CHANGED
@@ -14,5 +14,7 @@ dpacman/data_tasks/clean/__pycache__/
  dpacman/data_tasks/download/__pycache__/
  dpacman/data_tasks/fimo/__pycache__/
  dpacman/scripts/__pycache__/
+ dpacman/temp.py
+ dpacman/temp2.py
  logs/
  tree.txt
configs/data_task/download/genome.yaml CHANGED
@@ -1,5 +1,5 @@
  name: genome
  type: download
- output_dir: dpacman/classifier/data_files/raw/genomes
+ output_dir: dpacman/data_files/raw/genomes
  genomes:
    - hg38
configs/data_task/fimo/post_fimo.yaml CHANGED
@@ -1,6 +1,12 @@
  name: post_fimo
  type: fimo

- input_csv: dpacman/data_files/processed/fimo/remap2022_crm_fimo_output.csv
- output_csv: dpacman/data_files/processed/fimo/remap2022_crm_fimo_output_processed.csv
- json_dir: dpacman/data_files/raw/genomes/hg38
+ fimo_out_dir: dpacman/data_files/processed/fimo/fimo_out_q
+ unprocessed_output_csv: dpacman/data_files/processed/fimo/remap2022_crm_fimo_output_q_unprocessed.csv
+ processed_output_csv: dpacman/data_files/processed/fimo/remap2022_crm_fimo_output_q_processed.csv
+ json_dir: dpacman/data_files/raw/genomes/hg38
+ idmap_path: dpacman/data_files/raw/remap/idmapping_reviewed_true_2025_08_11.tsv
+
+ jaspar_boost: 100
+
+ debug: false
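Note: jaspar_boost is the additive bonus applied on top of the ChIP-seq score wherever a JASPAR motif hit overlaps the context window (see _process_one_row in post_fimo.py below). A minimal sketch of the scoring, with hypothetical coordinates:

    import numpy as np

    chipscore, jaspar_boost = 750, 100       # jaspar_boost comes from this config
    scores = np.zeros(20)                    # hypothetical 20-bp context window
    scores[5:15] = chipscore                 # positions covered by the ChIP peak
    scores[8:12] = chipscore + jaspar_boost  # positions also covered by a JASPAR hit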
configs/data_task/fimo/pre_fimo.yaml CHANGED
@@ -1,8 +1,12 @@
  name: pre_fimo
  type: fimo

- input_csv: dpacman/data_files/processed/remap/remap2022_crm_macs2_hg38_v1_0_clean.tsv
- output_csv: dpacman/data_files/processed/fimo/remap2022_crm_fimo_input.csv
+ paths:
+   input_csv: dpacman/data_files/processed/remap/remap2022_crm_macs2_hg38_v1_0_clean.tsv
+   output_csv: dpacman/data_files/processed/fimo/remap2022_crm_fimo_input.csv
+   chrom_output_path: dpacman/data_files/processed/fimo/chrom_inputs
+   json_dir: dpacman/data_files/raw/genomes/hg38
+
  window_total: 500

  save_example_files: true
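Note: window_total is the total flanking context added around each ChIP peak. pre_fimo.py (below) draws a random left flank and gives the remainder to the right flank; a sketch with hypothetical peak coordinates:

    import numpy as np

    window_total = 500                             # from this config
    chip_start, chip_end = 100500, 100800          # hypothetical peak
    left = np.random.randint(0, window_total + 1)  # 0..500 inclusive
    right = window_total - left                    # remainder goes right
    context_start = max(chip_start - left, 0)      # clipped at the chromosome start
    context_end = chip_end + right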
configs/data_task/fimo/run_fimo.yaml CHANGED
@@ -1,24 +1,26 @@
- name: post_fimo
+ name: run_fimo
  type: fimo

- debug: false
+ debug: true

  paths:
-   input_csv: dpacman/data_files/processed/fimo/remap2022_crm_fimo_input.tsv
-   output_csv: dpacman/data_files/processed/fimo/remap2022_crm_fimo_output.csv
+   input_csv: dpacman/data_files/processed/fimo/remap2022_crm_fimo_input.csv
    json_dir: dpacman/data_files/raw/genomes/hg38
+   input_fasta_outer_dir: dpacman/data_files/processed/fimo/chrom_inputs
+   fimo_outdir: dpacman/data_files/processed/fimo/fimo_out
+   seq_fasta: to_scan.fa
+   bg_model: bg_model.txt

- meme:
-   fimo_bin: dpacman/softwares/meme/bin/fimo
-   fasta_get_markov: dpacman/softwares/meme/libexec/meme-5.5.8/fasta-get-markov
-   jaspar_motif_file: dpacman/softwares/meme-5.5.8/tests/common/JASPAR_CORE_2014_vertebrates.meme
+ meme:
+   fimo_bin: /vast/projects/pranam/lab/shared/meme/bin/fimo
+   fasta_get_markov: /vast/projects/pranam/lab/shared/meme/libexec/meme-5.5.8/fasta-get-markov
+   jaspar_motif_file: /vast/projects/pranam/lab/shared/meme-5.5.8/tests/common/JASPAR_CORE_2014_vertebrates.meme

- fnames:
-   seq_fasta: to_scan.fa
-   bg_model: bg_model.txt
-   fimo_outdir: fimo_out
-
  fimo:
-   pval_thresh: 1e-4
+   thresh: 1e-2
+   thresh_mode: q
    max_stored: 1000000
-   njobs: max
+   njobs: 64
+
+ chroms: [1]
+ all_caps: true
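For orientation, run_fimo.py (below) assembles these settings into a FIMO command roughly like the following sketch (paths abbreviated; see run_fimo_chunk for the exact construction):

    # Sketch of the per-chunk FIMO call implied by this config.
    # With thresh_mode "q", --qv-thresh makes --thresh a q-value cutoff;
    # otherwise the flag is dropped and --thresh is a p-value cutoff.
    call_list = [
        "fimo",                          # meme.fimo_bin
        "--oc", "chrom1/chunk0",         # per-chunk output directory
        "--bfile", "bg_model.txt",       # background model from fasta-get-markov
        "--max-stored-scores", "1000000",
        "--thresh", "1e-2",
        "--qv-thresh",                   # present because thresh_mode: q
        "--no-pgc",
        "JASPAR_CORE_2014_vertebrates.meme",
        "to_scan_0.fa",
    ]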
dpacman/data_tasks/fimo/post_fimo.py CHANGED
@@ -1,120 +1,391 @@
  #!/usr/bin/env python3
  import os
- import json
  import uuid
- import pandas as pd
  import numpy as np

- # ─────────────────────────────────────────────────────────────────────────────
- # PATHS edit these if needed
- INPUT_CSV = "/home/a03-akrishna/DPACMAN/data_files/processed/post_fimo.csv"
- OUTPUT_CSV = "/home/a03-akrishna/DPACMAN/data_files/processed/final.csv"
- JSON_DIR = "/home/a03-svincoff/DPACMAN/dpacman/data_files/raw/genomes/hg38"
- # ─────────────────────────────────────────────────────────────────────────────


- def load_chrom_dna(chrom, cache):
-     """Load & cache the full chromosome 'dna' string from hg38_chr{chrom}.json."""
-     if chrom in cache:
-         return cache[chrom]
-     path = os.path.join(JSON_DIR, f"hg38_chr{chrom}.json")
-     if not os.path.isfile(path):
-         raise FileNotFoundError(f"Missing JSON for chr{chrom}: {path}")
-     with open(path) as f:
-         data = json.load(f)
-     cache[chrom] = data["dna"]
-     return cache[chrom]


- def sigmoid_array(arr: np.ndarray) -> np.ndarray:
-     """Elementwise logistic sigmoid → values in (0,1)."""
-     return 1.0 / (1.0 + np.exp(-arr))


- def main():
-     # 1) load post-FIMO results
-     df = pd.read_csv(INPUT_CSV)

      dna_cache = {}
      records = []

-     # 2) for each TF-peak row, extract sequence & build scores
-     for _, row in df.iterrows():
-         tfid = row["TF_id"]
-         chrom = str(row["#chrom"])
-         cstart = int(row["contextStart"])
-         cend = int(row["contextEnd"])
-         peak_s = int(row["ChIPStart"])
-         peak_e = int(row["ChIPEnd"])
-         chipscore = int(row["chipscore"])
-         jaspar = str(row["jaspar"])
-
-         # pull out the exact context sequence (including any Ns)
-         dna = load_chrom_dna(chrom, dna_cache)
-         seq = dna[cstart:cend]
-         L = len(seq)
-
-         # initialize base-resolution scores
-         scores = np.zeros(L, dtype=int)
-
-         # fill ChIP-seq peak region
-         ps = peak_s - cstart
-         pe = peak_e - cstart
-         scores[ps:pe] = chipscore
-
-         # overlay Jaspar hits (+100)
-         if jaspar.strip():
-             for hit in jaspar.split(","):
-                 hs, he = hit.split("-")
-                 hs_i = max(int(hs) - cstart, 0)
-                 he_i = min(int(he) - cstart, L)
-                 scores[hs_i:he_i] = chipscore + 100
-
-         # stringify the raw scores
-         score_str = ",".join(map(str, scores.tolist()))
-
-         # sigmoid-transform
-         sig_vals = sigmoid_array(scores.astype(float))
-         score_sig = ",".join(f"{v:.4f}" for v in sig_vals.tolist())

          records.append(
-             {
-                 "TF_id": tfid,
-                 "dna_sequence": seq,
-                 "score_str": score_str,
-                 "score_sig_r2": score_sig,
-             }
          )

-     # 3) assemble into a DataFrame
-     final_df = pd.DataFrame.from_records(records)

-     # 4) drop any exact TF+DNA duplicates
-     final_df = final_df.drop_duplicates(subset=["TF_id", "dna_sequence"]).reset_index(
-         drop=True
-     )

-     # 5) assign random IDs
-     tf_map = {tf: uuid.uuid4().hex[:8] for tf in final_df["TF_id"].unique()}
-     dna_map = {sq: uuid.uuid4().hex[:8] for sq in final_df["dna_sequence"].unique()}

-     final_df["tf_seq_id"] = final_df["TF_id"].map(tf_map)
-     final_df["dna_seq_id"] = final_df["dna_sequence"].map(dna_map)
-     final_df["ID"] = final_df["tf_seq_id"] + "_" + final_df["dna_seq_id"]

-     # 6) reorder and write out
      cols = [
-         "TF_id",
-         "tf_seq_id",
-         "dna_sequence",
-         "dna_seq_id",
-         "score_str",
-         "score_sig_r2",
-         "ID",
      ]
-     final_df[cols].to_csv(OUTPUT_CSV, index=False)
-     print(f"Wrote {len(final_df)} rows → {OUTPUT_CSV}")


  if __name__ == "__main__":
      main()

  #!/usr/bin/env python3
  import os
  import uuid
+ import logging
+ import traceback  # used by _safe_process to report worker failures
+ from pathlib import Path
+ import multiprocessing as mp
+
  import numpy as np
+ import pandas as pd
+ import math
+ import rootutils
+ import polars as pl
+ from omegaconf import DictConfig
+ from hydra.core.hydra_config import HydraConfig
+
+ from dpacman.data_tasks.fimo.pre_fimo import load_chrom_dna
+
+ root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
+ logger = logging.getLogger(__name__)
+
+ def normalize_array(arr: np.ndarray, max_chipseq_score: int = 1000, jaspar_boost: int = 100) -> np.ndarray:
+     normalization_factor = max_chipseq_score + jaspar_boost
+     return arr / normalization_factor

+ def format_sig(sig_vals, decimals=4, atol=0.0, rtol=1e-5):
+     a = np.asarray(sig_vals, dtype=float)
+     scale = 10.0 ** decimals
+     thresh = 0.5 / scale  # 0.00005 for 4 dp
+
+     # Would display as 0.0000 or 1.0000 at the given precision?
+     m0 = np.isclose(a, 0.0, atol=atol, rtol=rtol) | (np.abs(a) <= thresh)
+     m1 = np.isclose(a, 1.0, atol=atol, rtol=rtol) | (np.abs(a - 1.0) <= thresh)
+
+     out = np.char.mod(f"%.{decimals}f", a)
+     out = np.where(m0, "0", out)
+     out = np.where(m1 & ~m0, "1", out)  # don't overwrite any zeros
+     return ",".join(out.tolist())
+
+ def _safe_process(task):
+     try:
+         return ("ok", _process_one_chrom_folder(task))
+     except Exception as e:
+         return ("err", (task[0], repr(e), traceback.format_exc()))
+
+ def discover_chrom_folders(fimo_out_dir: Path) -> list[str]:
+     return sorted(
+         name for name in os.listdir(fimo_out_dir)
+         if name.startswith("chrom") and (fimo_out_dir / name / "final.csv").exists()
+     )

+ def _process_one_row(row, dna: str, jaspar_boost: int = 100) -> dict:
+     # row order: TR, chrom, cstart, cend, peak_s, peak_e, chipscore, jaspar
+     trname, chrom, cstart, cend, peak_s, peak_e, chipscore, jaspar = row

+     seq = dna[cstart:cend]
+     L = len(seq)
+     scores = np.zeros(L)

+     # ChIP peak
+     ps = peak_s - cstart
+     pe = peak_e - cstart
+     peak_seq = ""
+     if ps < L and pe > 0:
+         scores[max(ps, 0):min(pe, L)] = chipscore
+         peak_seq = seq[max(ps, 0):min(pe, L)]  # slice the context window, not the whole chromosome
+
+     # JASPAR hits (+jaspar_boost)
+     # only run if there are hits (jaspar is np.nan when there are none)
+     total_jaspar = 0
+     if isinstance(jaspar, str) and jaspar.strip():
+         for hit in jaspar.split(","):
+             total_jaspar += 1
+             hs, he = hit.split("-")
+             hs_i = max(int(hs) - cstart, 0)
+             he_i = min(int(he) - cstart, L)
+             if hs_i < he_i:
+                 scores[hs_i:he_i] = chipscore + jaspar_boost

+     score_str = ",".join(map(str, [int(x) for x in scores.tolist()]))
+     # sig_vals = normalize_array(scores.astype(np.float32))
+     # store out to 4 decimal places unless it's 0
+     # score_sig = format_sig(sig_vals)
+     return {
+         "chrom": chrom,
+         "tr_name": trname,
+         "dna_sequence": seq,
+         "peak_sequence": peak_seq,
+         "chipscore": chipscore,
+         "total_jaspar_hits": total_jaspar,
+         "scores": score_str,
+     }

+ def _process_one_chrom_folder(task) -> Path | None:
+     """Runs inside a worker process. Reads one chrom's final.csv, loads DNA once,
+     builds records, writes them to a per-chromosome CSV, and returns its path
+     (or None if there is nothing to process)."""
+     chrom_folder, fimo_out_dir_str, json_dir, jaspar_boost, output_parts_folder = task
+
+     # make unique logger for this process
+     log_dir = Path(HydraConfig.get().run.dir) / "logs"
+     log_dir.mkdir(parents=True, exist_ok=True)
+     output_parts_folder.mkdir(parents=True, exist_ok=True)

+     log_file = log_dir / f"fimo_{chrom_folder}.log"
+     wlogger = logging.getLogger(f"fimo_{chrom_folder}")
+     wlogger.setLevel(logging.DEBUG)
+     wlogger.propagate = False  # Don't double-log to root

+     if not any(isinstance(h, logging.FileHandler) for h in wlogger.handlers):
+         fh = logging.FileHandler(log_file, mode="w", encoding="utf-8")
+         fh.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
+         wlogger.addHandler(fh)
+
+     fimo_out_dir = Path(fimo_out_dir_str)
+     final_csv = fimo_out_dir / chrom_folder / "final.csv"
+     if not final_csv.exists():
+         return None
+
+     usecols = ["TR", "#chrom", "contextStart", "contextEnd",
+                "ChIPStart", "ChIPEnd", "chipscore", "jaspar"]
+     df = pd.read_csv(final_csv, usecols=usecols)
+
+     if df.empty:
+         return None
+
+     # Normalize dtypes up-front
+     df["#chrom"] = df["#chrom"].astype(str)
+     for col in ("contextStart", "contextEnd", "ChIPStart", "ChIPEnd", "chipscore"):
+         df[col] = pd.to_numeric(df[col], downcast="integer")
+
+     chrom = df["#chrom"].iloc[0]
      dna_cache = {}
+     dna = load_chrom_dna(str(chrom), dna_cache, json_dir).upper()  # just capitalize it for training
+     wlogger.info(f"Loaded DNA for {chrom}, length {len(dna)}")
+
      records = []
+
+     # rename to make processing easier
+     rename = {
+         "#chrom": "chrom",
+         "contextStart": "cstart",
+         "contextEnd": "cend",
+         "ChIPStart": "peak_s",
+         "ChIPEnd": "peak_e",
+         "TR": "tr_name",
+     }
+     df = df.rename(columns=rename)
+
+     # (Optional) ensure numeric dtypes; will raise if non-numeric
+     df_cols = ["cstart", "cend", "peak_s", "peak_e", "chipscore"]
+     for col in df_cols:
+         df[col] = pd.to_numeric(df[col], errors="raise")

+     total = len(df)
+     last_decile = 0

+     for i, row in enumerate(df.itertuples(index=False), start=1):
          records.append(
+             _process_one_row(
+                 (row.tr_name, row.chrom, int(row.cstart), int(row.cend),
+                  int(row.peak_s), int(row.peak_e), int(row.chipscore), row.jaspar),
+                 dna, jaspar_boost
+             )
          )

+         # progress every ~10%
+         decile = (i * 10) // max(total, 1)
+         if decile > last_decile:
+             last_decile = decile
+             wlogger.info("Progress: %d%% (%d/%d)", decile * 10, i, total)
+
+     wlogger.info(f"Completed processing {len(records)} rows for {chrom_folder}")
+
+     # make into a DataFrame and save
+     records_df = pd.DataFrame.from_records(records)
+     savepath = output_parts_folder / f"{chrom_folder}_processed.csv"
+     records_df.to_csv(savepath, index=False)
+     wlogger.info(f"Saved records to {savepath}")
+
+     return savepath

+ def build_dataset_fast_mp(fimo_out_dir: Path, json_dir: str, debug: bool, max_workers: int | None, jaspar_boost: int = 100, output_parts_folder: Path | None = None) -> list[Path]:
+     """
+     Multiprocessing to build the final dataset across chromosomes.
+     Returns the paths of the per-chromosome CSV parts written by the workers.
+     """
+     chrom_folders = discover_chrom_folders(fimo_out_dir)
+     if not chrom_folders:
+         logger.warning(f"No chrom* folders with final.csv under {fimo_out_dir}")
+         return []
+
+     if debug:
+         chrom_folders = ["chromY"] if "chromY" in chrom_folders else chrom_folders[:1]
+         logger.info(f"DEBUG MODE: considering {chrom_folders[0]} only")
+
+     tasks = [(cf, str(fimo_out_dir), json_dir, jaspar_boost, output_parts_folder) for cf in chrom_folders]
+
+     # serial path (debug/deterministic)
+     if (max_workers is not None and max_workers <= 1) or len(tasks) == 1:
+         paths = []
+         errs = []
+         for t in tasks:
+             status, payload = _safe_process(t)
+             if status == "ok" and payload is not None:  # workers return a savepath, or None if empty
+                 paths.append(payload)
+             elif status == "err":
+                 errs.append(payload)
+         if errs:
+             for chrom, msg, tb in errs:
+                 logger.error("Worker error for %s: %s\n%s", chrom, msg, tb)
+             # raise after serial run if you want hard failure
+         return [p for p in paths if os.path.exists(p)]
+
+     # parallel path
+     procs = min(max_workers or mp.cpu_count(), len(tasks))
+     logger.info(f"Using {procs} parallel workers for {len(tasks)} chrom folders")

+     paths, errs = [], []
+     with mp.Pool(processes=procs, maxtasksperchild=10) as pool:
+         for status, payload in pool.imap_unordered(_safe_process, tasks, chunksize=1):
+             if status == "ok" and payload is not None:  # same path-based check as the serial branch
+                 paths.append(payload)
+             elif status == "err":
+                 errs.append(payload)
+
+     if errs:
+         for chrom, msg, tb in errs:
+             logger.error("Worker error for %s: %s\n%s", chrom, msg, tb)
+         # optional: raise RuntimeError("One or more workers failed, see logs.")

+     return [p for p in paths if os.path.exists(p)]

+ def combine_processed_with_polars(
+     paths_to_processed_dfs: list[str],
+     idmap_path: str,  # TSV with columns: From, Entry, Sequence
+     out_path: str,  # e.g., "processed_out.parquet" or ".csv"
+ ):
+     if not paths_to_processed_dfs:
+         logger.info("No records produced; nothing to write.")
+         return
+
+     # 1) Scan all CSVs lazily (no full read)
+     lfs = [pl.scan_csv(p, infer_schema_length=0) for p in paths_to_processed_dfs]
+     lf = pl.concat(lfs, how="vertical")
+     logger.info("Scanned CSVs")
+
+     # 2) Drop duplicate occurrences of tr_name and peak_sequence, because these are the same peak
+     lf = lf.unique(subset=["tr_name", "peak_sequence"], keep="first", maintain_order=True)
+     logger.info("Dropped duplicate examples of tr_name + peak_sequence")
+
+     # 3) Join small idmap (read eagerly; it's tiny)
+     idmap = (
+         pl.read_csv(idmap_path, separator="\t", columns=["From", "Entry", "Sequence"])
+         .rename({"From": "tr_name", "Entry": "tr_uniprot", "Sequence": "tr_sequence"})
+     )
+     lf = lf.join(idmap.lazy(), on="tr_name", how="left")
+     logger.info("Merged in UniProt IDs and TR sequences from the UniProt ID mapping")
+
+     # 4) Per-chromosome unique peak index and peak_id
+     #    (dense rank over peak_sequence per chrom; if you require "first-appearance" order,
+     #    see the note below for an alternate approach.)
+     lf = lf.with_columns([
+         pl.col("peak_sequence").fill_null("").alias("peak_sequence"),
+         pl.col("chrom").cast(pl.Utf8),
+     ])
+     lf = lf.with_columns(
+         pl.col("peak_sequence")
+         .rank(method="dense")  # 1,2,3,... per group
+         .over("chrom")
+         .cast(pl.Int64)
+         .alias("chrom_peak_idx")
+     )
+     lf = lf.with_columns(
+         pl.format("chrom{}_peak{}", pl.col("chrom"), pl.col("chrom_peak_idx")).alias("peak_id")
+     )
+     logger.info("Assigned unique peak_ids per chromosome based on peak_sequence")
+
+     # 5) Build stable IDs for dna_sequence and tr_sequence based on first appearance
+     #    (do this by creating small maps with unique(..., maintain_order=True) and joining)
+     dna_map = (
+         lf.select("dna_sequence")
+         .unique(maintain_order=True)
+         .with_row_index("dna_idx", offset=1)
+         .with_columns(pl.format("dnaseq{}", pl.col("dna_idx")).alias("dna_seqid"))
+         .select("dna_sequence", "dna_seqid")
+     )
+     logger.info("Assigned dna_sequence IDs")
+     tr_map = (
+         lf.select("tr_sequence")
+         .unique(maintain_order=True)
+         .with_row_index("tr_idx", offset=1)
+         .with_columns(pl.format("trseq{}", pl.col("tr_idx")).alias("tr_seqid"))
+         .select("tr_sequence", "tr_seqid")
+     )
+     logger.info("Assigned tr_sequence IDs")
+     lf = lf.join(dna_map, on="dna_sequence", how="left").join(tr_map, on="tr_sequence", how="left")
+     logger.info("Applied dna_sequence and tr_sequence IDs to main table")
+
+     # 6) Final ID and column selection
+     lf = lf.with_columns(
+         (pl.col("tr_seqid") + pl.lit("_") + pl.col("dna_seqid")).alias("ID")
+     )
      cols = [
+         "ID", "tr_name", "peak_id", "chipscore", "total_jaspar_hits",
+         "dna_sequence", "tr_sequence", "scores"
      ]
+     lf_out = lf.select(cols)
+     # n_rows = lf_out.select(pl.len().alias("rows")).collect(streaming=True)["rows"][0]
+     logger.info("Selected final columns")
+
+     # 7) Write streaming to disk
+     out_path = str(out_path)
+     Path(out_path).parent.mkdir(parents=True, exist_ok=True)
+
+     if out_path.lower().endswith(".parquet"):
+         lf_out.sink_parquet(out_path, compression="zstd", statistics=True, row_group_size=128_000)
+         logger.info(f"Wrote parquet file to {out_path}")
+     elif out_path.lower().endswith(".csv"):
+         # NOTE: collect(streaming=True) still returns an in-memory DataFrame;
+         # prefer Parquet for very large outputs.
+         lf_out.collect(streaming=True).write_csv(out_path)
+         logger.info(f"Wrote csv file to {out_path}")
+     else:
+         # default to Parquet if no/unknown extension
+         lf_out.sink_parquet(out_path + ".parquet", compression="zstd", statistics=True)
+         logger.info(f"Wrote parquet file to {out_path}.parquet")

+     # 8) (Optional) small summary: unique peaks per chrom
+     peaks_per_chrom = (
+         lf.select("chrom", "peak_sequence")
+         .unique()
+         .group_by("chrom")
+         .len()
+         .collect(streaming=True)
+         .sort("chrom")
+     )
+     logger.info(f"Summary per chromosome:\n{peaks_per_chrom}")
+
+     logger.info("Schema:")
+     for name, dtype in lf_out.schema.items():
+         logger.info(f"  {name}: {dtype}")
+
+     # Quick preview of a few rows (safe)
+     logger.info("\nHead(5):")
+     logger.info(lf_out.head(5).collect())  # or: lf.limit(5).collect()
+
+     # Save the FIRST 1000 rows to CSV (streaming-friendly)
+     df_first = lf_out.limit(1000).collect(streaming=True)
+     example_out_path = Path(root) / "dpacman/data_files/processed/remap/examples" / "example1000_remap2022_crm_fimo_output_q_processed.csv"
+     example_out_path.parent.mkdir(parents=True, exist_ok=True)  # ensure the examples folder exists
+     df_first.write_csv(example_out_path)
+     logger.info(f"Wrote first 1000 rows to {example_out_path} as an example")
+
+ def main(cfg: DictConfig):
+     debug = bool(cfg.data_task.debug)
+     json_dir = cfg.data_task.json_dir
+     fimo_out_dir = Path(root) / cfg.data_task.fimo_out_dir
+     processed_output_csv = Path(root) / cfg.data_task.processed_output_csv
+     output_parts_folder = processed_output_csv.parent / "temp_parts"
+     max_workers = getattr(cfg.data_task, "max_workers", None)
+
+     logger.info(f"Debug: {debug}")
+     logger.info(f"Reading per-chrom final.csv under: {fimo_out_dir}")
+
+     # Build the per-chromosome parts if they don't exist yet; otherwise reuse them.
+     if not output_parts_folder.exists():
+         paths_to_processed_dfs = build_dataset_fast_mp(
+             fimo_out_dir=fimo_out_dir,
+             json_dir=json_dir,
+             debug=debug,
+             max_workers=max_workers,
+             jaspar_boost=cfg.data_task.jaspar_boost,
+             output_parts_folder=output_parts_folder,
+         )
+     else:
+         paths_to_processed_dfs = [output_parts_folder / x for x in os.listdir(output_parts_folder)]
+
+     logger.info(f"Combining {len(paths_to_processed_dfs)} processed parts with Polars")
+     combine_processed_with_polars(
+         paths_to_processed_dfs=paths_to_processed_dfs,
+         idmap_path=Path(root) / cfg.data_task.idmap_path,
+         out_path=str(processed_output_csv).replace(".csv", ".parquet"),
+     )
+
+     # Optionally delete the folder that held the temporary DFs; kept by default
+     # so reruns can skip the multiprocessing step.
+     cleanup_parts = False
+     if cleanup_parts and output_parts_folder.exists():
+         for f in output_parts_folder.glob("*.csv"):
+             f.unlink()
+         output_parts_folder.rmdir()
+         logger.info(f"Cleaned up temporary files in {output_parts_folder}")

  if __name__ == "__main__":
+     # On some clusters with older Python, 'fork' is default and fine.
+     # If you hit issues (e.g., with threads/IO), uncomment spawn:
+     # mp.set_start_method("spawn", force=True)
      main()
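Aside: step 4 of combine_processed_with_polars uses a dense rank for chrom_peak_idx and mentions a "first-appearance" alternative. That variant is not part of this commit, but mirroring the dna_map/tr_map pattern from step 5 it could look roughly like:

    import polars as pl

    # lf: the LazyFrame from combine_processed_with_polars, after step 3.
    # Number peaks 1..n per chromosome in order of first appearance.
    peak_map = (
        lf.select("chrom", "peak_sequence")
        .unique(maintain_order=True)
        .with_columns(pl.int_range(1, pl.len() + 1).over("chrom").alias("chrom_peak_idx"))
    )
    lf = lf.join(peak_map, on=["chrom", "peak_sequence"], how="left")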
dpacman/data_tasks/fimo/pre_fimo.py CHANGED
@@ -4,84 +4,229 @@ import numpy as np
  import rootutils
  import logging
  import os
  from omegaconf import DictConfig
  from pathlib import Path

  root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
  logger = logging.getLogger(__name__)

- def main(cfg: DictConfig):
-     # 1) load
-     input_path = Path(root) / cfg.data_task.input_csv
-     df = pd.read_csv(input_path, sep="\t")

-     # 2) normalize chromosomes and exclude non-whole chromosomes
-     df["chrom"] = df["chrom"].str.replace(r"^chr", "", regex=True)
-
-     valid = [str(i) for i in range(1, 23)] + ["X", "Y"]
-     df = df[df["chrom"].isin(valid)].reset_index(drop=True)
-
-     # 3) explode TF names
-     df["tr_list"] = df["tr"].str.split(",")
-     df = df.explode("tr_list").rename(columns={"tr_list": "TR"})
-     df["TR"] = df["TR"].str.strip()
-
-     # 4) draw a random left-flank between 0 and WINDOW_TOTAL,
-     #    then right-flank is whatever remains to sum to WINDOW_TOTAL
-     n = len(df)
-     df["left_context"] = np.random.randint(0, cfg.data_task.window_total + 1, size=n)
-     df["right_context"] = cfg.data_task.window_total - df["left_context"]
-
-     # 5) compute contextStart / contextEnd
-     df["contextStart"] = (
-         (df["chromStart"] - df["left_context"]).clip(lower=0).astype(int)
-     )
-     df["contextEnd"] = (df["chromEnd"] + df["right_context"]).astype(int)
-
-     # 6) assemble output
-     out = df[
-         [
-             "chrom",
-             "contextStart",
-             "chromStart",  # original ChIPStart
-             "chromEnd",  # original ChIPEnd
-             "contextEnd",
-             "score",  # original score column
-             "TR",
-         ]
-     ].rename(
-         columns={
-             "chrom": "#chrom",
-             "chromStart": "ChIPStart",
-             "chromEnd": "ChIPEnd",
-             "score": "chipscore",
-         }
-     )
-
-     # 7) make folder for tsv
-     output_path = Path(root) / cfg.data_task.output_csv
      os.makedirs(output_path.parent, exist_ok=True)

-     # 8) write csv
-     out.to_csv(output_path, index=False)
-     logger.info(f"Wrote {len(out)} rows to {output_path}")

      # 9) write example csv if necessary
-     if cfg.data_task.save_example_files:
          example_dir = output_path.parent / "examples"
          os.makedirs(example_dir, exist_ok=True)
-         output_csv_name = cfg.data_task.output_csv.split("/")[-1]
          example_savepath = os.path.join(
              example_dir, "example500_" + output_csv_name
          )

          if not (os.path.exists(example_savepath)):
              out.sample(n=500, random_state=42).reset_index(drop=True).to_csv(
-                 example_savepath, sep="\t", index=False
              )
              logger.info(
                  f"Saved example FIMO input file with 500 rows to: {example_savepath}"
              )


  if __name__ == "__main__":

  import rootutils
  import logging
  import os
+ import json
+ import multiprocessing as mp
+ from multiprocessing import Pool, cpu_count
  from omegaconf import DictConfig
  from pathlib import Path
+ from hydra.core.hydra_config import HydraConfig

  root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
  logger = logging.getLogger(__name__)

+ def init_worker(log_file, logger_name):
+     """Initialize a logger in each worker."""
+     wlogger = logging.getLogger(logger_name)
+     wlogger.setLevel(logging.INFO)
+     wlogger.propagate = False  # Don't double-log to root
+
+     # Avoid re-adding handlers if this logger is reused
+     if not wlogger.handlers:
+         handler = logging.FileHandler(log_file)
+         formatter = logging.Formatter("%(asctime)s - %(message)s")
+         handler.setFormatter(formatter)
+         wlogger.addHandler(handler)
+
+     return wlogger
+
+ def process_chromosome(chrom, df, json_dir, output_path, example_dir, save_example_files, log_dir):
+     log_file = Path(log_dir) / f"chrom_{chrom}.log"
+     logger_name = f"logger_chrom_{chrom}"
+     wlogger = init_worker(log_file, logger_name)
+
+     wlogger.info(f"Processing chromosome {chrom}")
+
+     sub_df = df[df["#chrom"] == chrom].reset_index(drop=True)
+     sub_out_dir = Path(output_path) / f"chr{chrom}"
+     os.makedirs(sub_out_dir, exist_ok=True)
+
+     seq_fasta_path = sub_out_dir / "to_scan.fa"
+     extract_sequences(sub_df, seq_fasta_path, json_dir, wlogger)
+
+     if save_example_files:
+         example_out = Path(example_dir) / f"example_chr{chrom}_to_scan.fa"
+         if not example_out.exists():
+             with open(seq_fasta_path, "r") as f:
+                 lines = f.readlines()
+             with open(example_out, "w") as f:
+                 f.write("".join(lines[:50]))
+             wlogger.info(f"Saved example: {example_out}")

+ def assemble_main_input(input_csv: str, window_total: int, output_csv: str, save_example_files: bool):
+     """
+     Assemble the main input DataFrame for FIMO.
+
+     Args:
+     - input_csv: path to the input CSV that is converted into the FIMO input file
+     - window_total: total number of non-ChIP-seq-peak nucleotides included in a datapoint
+     - output_csv: where the processed file will be saved
+     - save_example_files: whether to save small example files that can be easily viewed
+     """
+     # 1) make input and output paths
+     input_path = Path(root) / input_csv
+     df = pd.read_csv(input_path, sep="\t")
+     out = None  # initialize out
+
+     output_path = Path(root) / output_csv
      os.makedirs(output_path.parent, exist_ok=True)

+     if not os.path.exists(output_path):
+         # 2) normalize chromosomes and exclude non-whole chromosomes
+         df["chrom"] = df["chrom"].str.replace(r"^chr", "", regex=True)
+
+         valid = [str(i) for i in range(1, 23)] + ["X", "Y"]
+         df = df[df["chrom"].isin(valid)].reset_index(drop=True)
+
+         # 3) explode TF names
+         df["tr_list"] = df["tr"].str.split(",")
+         df = df.explode("tr_list").rename(columns={"tr_list": "TR"})
+         df["TR"] = df["TR"].str.strip()
+
+         # 4) draw a random left-flank between 0 and window_total,
+         #    then right-flank is whatever remains to sum to window_total
+         n = len(df)
+         df["left_context"] = np.random.randint(0, window_total + 1, size=n)
+         df["right_context"] = window_total - df["left_context"]
+
+         # 5) compute contextStart / contextEnd
+         df["contextStart"] = (
+             (df["chromStart"] - df["left_context"]).clip(lower=0).astype(int)
+         )
+         df["contextEnd"] = (df["chromEnd"] + df["right_context"]).astype(int)
+
+         # 6) assemble output
+         out = df[
+             [
+                 "chrom",
+                 "contextStart",
+                 "chromStart",  # original ChIPStart
+                 "chromEnd",  # original ChIPEnd
+                 "contextEnd",
+                 "score",  # original score column
+                 "TR",
+             ]
+         ].rename(
+             columns={
+                 "chrom": "#chrom",
+                 "chromStart": "ChIPStart",
+                 "chromEnd": "ChIPEnd",
+                 "score": "chipscore",
+             }
+         )
+
+         # 8) write csv
+         out.to_csv(output_path, index=False)
+         logger.info(f"Wrote {len(out)} rows to {output_path}")

+     # Load the DF if we need to
+     if out is None:
+         out = pd.read_csv(output_path)
+
      # 9) write example csv if necessary
+     if save_example_files:
          example_dir = output_path.parent / "examples"
          os.makedirs(example_dir, exist_ok=True)
+         output_csv_name = output_csv.split("/")[-1]
          example_savepath = os.path.join(
              example_dir, "example500_" + output_csv_name
          )

          if not (os.path.exists(example_savepath)):
              out.sample(n=500, random_state=42).reset_index(drop=True).to_csv(
+                 example_savepath, index=False
              )
              logger.info(
                  f"Saved example FIMO input file with 500 rows to: {example_savepath}"
              )
+
+     return out
+
+ def load_chrom_dna(chrom, cache, json_dir):
+     """
+     Load DNA for a chromosome from the pre-downloaded JSON.
+     """
+     json_dir = Path(root) / json_dir
+     if chrom in cache:
+         return cache[chrom]
+     fname = os.path.join(json_dir, f"hg38_chr{chrom}.json")
+     if not os.path.isfile(fname):
+         raise FileNotFoundError(f"Chrom JSON not found: {fname}")
+     with open(fname) as f:
+         cache[chrom] = json.load(f)["dna"]
+     return cache[chrom]
+
+ def parallel_make_all_fasta_inputs(df, json_dir, output_path, example_dir, save_example_files=True, max_workers=8):
+     df["#chrom"] = df["#chrom"].astype(str)
+     chromosomes = df["#chrom"].unique().tolist()
+
+     log_dir = Path(HydraConfig.get().run.dir) / "logs"
+
+     os.makedirs(log_dir, exist_ok=True)
+     logger.info(f"Created {log_dir} for storing logs for subprocesses.")
+
+     os.makedirs(example_dir, exist_ok=True)
+     logger.info(f"Created {example_dir} for storing example inputs")
+
+     args = [
+         (chrom, df, json_dir, output_path, example_dir, save_example_files, log_dir)
+         for chrom in chromosomes
+     ]
+
+     with mp.Pool(processes=max_workers) as pool:
+         pool.starmap(process_chromosome, args)
+
+ def extract_sequences(df, seq_fasta, json_dir, wlogger):
+     """
+     Make the main sequence FASTA for this chromosome. Used for building the background model.
+     """
+     dna_cache = {}
+     n_rows = len(df)
+     checkpoints = set(int(n_rows * i / 100) for i in range(1, 101))  # 1% to 100%
+
+     wlogger.info(f"Writing to {seq_fasta}")
+     if not os.path.exists(seq_fasta):
+         with open(seq_fasta, "w") as fa:
+             for idx, row in df.iterrows():
+                 chrom = str(row["#chrom"])
+                 tr = str(row["TR"])
+                 dna = load_chrom_dna(chrom, dna_cache, json_dir)
+                 start = int(row["contextStart"])
+                 end = int(row["contextEnd"])
+                 seq = dna[start:end]  # end index is not included in ChIP-seq peaks
+                 header = f"{idx}_chr{chrom}_{tr}_{start}_{end}"
+                 fa.write(f">{header}\n{seq}\n")
+
+                 # log every 1%
+                 if idx in checkpoints:
+                     wlogger.info(f"  Reached {idx / n_rows:.0%} of the DataFrame (index {idx})")

+ def main(cfg: DictConfig):
+     # 1) make the full input CSV
+     paths = cfg.data_task.paths
+     df = assemble_main_input(input_csv=paths.input_csv,
+                              window_total=cfg.data_task.window_total,
+                              output_csv=paths.output_csv,
+                              save_example_files=cfg.data_task.save_example_files)
+
+     # Make example dir to use in future methods
+     example_dir = Path(root) / paths.output_csv
+     example_dir = example_dir.parent / "examples"
+     os.makedirs(example_dir, exist_ok=True)
+
+     # 2) Make individual input files per chromosome
+     total_chroms = len(df["#chrom"].unique().tolist())
+     max_workers = cpu_count() - 1
+     logger.info(f"Max workers available (cpu_count - 1): {max_workers}")
+     max_workers = min(max_workers, total_chroms)
+     logger.info(f"min(max_workers, total_chroms) = {max_workers}")
+
+     parallel_make_all_fasta_inputs(df,
+                                    json_dir=paths.json_dir,
+                                    output_path=Path(root) / paths.chrom_output_path,
+                                    example_dir=example_dir,
+                                    save_example_files=cfg.data_task.save_example_files,
+                                    max_workers=max_workers)
+


  if __name__ == "__main__":
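Note: the FASTA headers written by extract_sequences start with the row index, and run_fimo.py's split_fasta (below) uses that index for round-robin chunk assignment. A worked example with a hypothetical record:

    header = ">1234_chr1_CTCF_100500_101100\n"      # line from to_scan.fa
    n_chunks = 64
    idx = int(header[1:].split("_")[0]) % n_chunks  # 1234 % 64 = 18 -> chunk 18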
dpacman/data_tasks/fimo/run_fimo.py CHANGED
@@ -9,34 +9,13 @@ import rootutils
  import logging
  from omegaconf import DictConfig
  from pathlib import Path

  root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
  logger = logging.getLogger(__name__)

-
- def load_chrom_dna(chrom, cache, json_dir):
-     if chrom in cache:
-         return cache[chrom]
-     fname = os.path.join(json_dir, f"hg38_chr{chrom}.json")
-     if not os.path.isfile(fname):
-         raise FileNotFoundError(f"Chrom JSON not found: {fname}")
-     with open(fname) as f:
-         cache[chrom] = json.load(f)["dna"]
-     return cache[chrom]
-
-
- def extract_sequences(df, seq_fasta, json_dir):
-     dna_cache = {}
-     with open(seq_fasta, "w") as fa:
-         for idx, row in df.iterrows():
-             chrom = str(row["#chrom"])
-             dna = load_chrom_dna(chrom, dna_cache, json_dir)
-             start = int(row["contextStart"])
-             end = int(row["contextEnd"])
-             seq = dna[start:end]
-             fa.write(f">{idx}\n{seq}\n")
-
-
  def run_markov(fasta_get_markov, seq_fasta, bg_model):
      subprocess.check_call(
          [fasta_get_markov, seq_fasta, bg_model],
@@ -44,31 +23,55 @@ def run_markov(fasta_get_markov, seq_fasta, bg_model):
          stderr=subprocess.DEVNULL,
      )


- def split_fasta(n_chunks, seq_fasta):
-     """Round-robin split SEQ_FASTA into chunked FASTA files."""
-     out_handles = [open(f"to_scan_{i}.fa", "w") for i in range(n_chunks)]
-     with open(seq_fasta) as inf:
          header = None
          seq_lines = []
          for line in inf:
              if line.startswith(">"):
                  if header is not None:
-                     idx = int(header[1:].split()[0]) % n_chunks
-                     out_handles[idx].write(header)
-                     out_handles[idx].write("".join(seq_lines))
                  header = line
                  seq_lines = []
              else:
                  seq_lines.append(line)
          # last record
          if header is not None:
-             idx = int(header[1:].split()[0]) % n_chunks
-             out_handles[idx].write(header)
-             out_handles[idx].write("".join(seq_lines))
      for o in out_handles:
          o.close()
-     return [f"to_scan_{i}.fa" for i in range(n_chunks)]


  def run_fimo_chunk(cfg):
@@ -83,41 +86,80 @@ def run_fimo_chunk(cfg):
      - bg_model
      - max_stored
      - motif_file
-     - pval_thresh
      """
-     outdir = f"{cfg['fimo_outdir']}_{cfg['chunk_id']}"
      os.makedirs(outdir, exist_ok=True)
-     logger.info(f"Chunk {cfg['chunk_id']} starting FIMO")
-     subprocess.check_call(
-         [
-             cfg["fimo_bin"],
-             "--oc",
-             outdir,
-             "--bgfile",
-             cfg["bg_model"],
-             "--max-stored-scores",
-             str(cfg["max_stored"]),
-             "--thresh",
-             str(cfg["pval_thresh"]),
-             cfg["motif_file"],
-             cfg["fasta_path"],
-         ]
-     )
-     logger.info(f"Chunk {cfg['chunk_id']} finished")
-     return os.path.join(outdir, "fimo.tsv")


- def annotate_with_fimo(df, fimo_tsv):
-     fdf = pd.read_csv(fimo_tsv, sep="\t", comment="#")
-     fdf["idx"] = fdf["sequence_name"].astype(int)
-     fdf = fdf.merge(df[["idx", "contextStart"]], on="idx", how="left")
      fdf["genomic_start"] = fdf["contextStart"] + fdf["start"] - 1
      fdf["genomic_end"] = fdf["contextStart"] + fdf["stop"]
      fdf["coord"] = (
          fdf["genomic_start"].astype(str) + "-" + fdf["genomic_end"].astype(str)
      )
-     agg = fdf.groupby("idx")["coord"].agg(lambda hits: ",".join(hits))
-     df["jaspar"] = df["idx"].map(agg).fillna("")
      return df


@@ -128,81 +170,116 @@ def main(cfg: DictConfig):
      # 0) configs
      paths = cfg.data_task.paths
      fimo = cfg.data_task.fimo
-     fnames = cfg.data_task.fnames
      meme = cfg.data_task.meme
-
      # set njobs to max or whatever # is specified by user
      njobs = fimo.njobs
      if njobs == "max":
-         njobs = cpu_count() - 1
      else:
-         njobs = min(cpu_count() - 1, int(njobs))
-
-     # 1) load & explode
-     input_csv_path = Path(root) / paths.input_csv
-     df = pd.read_csv(input_csv_path, low_memory=False)
-     df = df.reset_index().rename(columns={"index": "idx"})
-     df["TF_occurrence"] = df.groupby("TF").cumcount() + 1
-     df["TF_id"] = df["TF"] + "_seq" + df["TF_occurrence"].astype(str)

      # 2) extract sequences & build BG model
-     extract_sequences(df, fnames.seq_fasta, paths.json_dir)
-     logger.info("Building background model…")
-     run_markov(meme.fasta_get_markov, fnames.seq_fasta, fnames.bg_model)
-
-     # 3) chunk FASTA and run FIMO in parallel
-     chunks = split_fasta(njobs)
-     chunk_cfgs = [
-         dict(
-             chunk_id=i,
-             fasta_path=chunk,
-             fimo_outdir=fnames.fimo_outdir,
-             fimo_bin=paths.fimo_bin,
-             bg_model=fnames.bg_model,
-             max_stored=fimo.max_stored,
-             motif_file=meme.jaspar_motif_file,
-             pval_thresh=fimo.pval_thresh,
-         )
-         for i, chunk in enumerate(chunks)
-     ]
-     logger.info(f"Running FIMO in parallel ({njobs} jobs)…")
-     with Pool(njobs) as pool:
-         tsv_paths = list(
-             tqdm(
-                 pool.imap(run_fimo_chunk, chunk_cfgs),
-                 total=len(chunks),
-                 desc="FIMO chunks",
-                 leave=True,
-             )
-         )

-     # 4) merge chunked TSVs
-     combined = pd.concat(
-         [pd.read_csv(tsv, sep="\t", comment="#") for tsv in tsv_paths],
-         ignore_index=True,
-     )
-     merged_tsv = "fimo_combined.tsv"
-     combined.to_csv(merged_tsv, sep="\t", index=False)
-
-     # 5) annotate & write final CSV
-     df = annotate_with_fimo(df, merged_tsv)
-     final = df[
-         [
-             "#chrom",
-             "contextStart",
-             "ChIPStart",
-             "ChIPEnd",
-             "contextEnd",
-             "chipscore",
-             "TF",
-             "TF_id",
-             "jaspar",
          ]
-     ]
-     output_csv_path = Path(root) / paths.output_csv
-     final.to_csv(output_csv_path, index=False)
-     logger.info(f"Wrote {len(final)} rows → {output_csv_path}")


  if __name__ == "__main__":
      main()

  import logging
  from omegaconf import DictConfig
  from pathlib import Path
+ import time
+ import shutil
+ from hydra.core.hydra_config import HydraConfig

  root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
  logger = logging.getLogger(__name__)

  def run_markov(fasta_get_markov, seq_fasta, bg_model):
      subprocess.check_call(
          [fasta_get_markov, seq_fasta, bg_model],
          stderr=subprocess.DEVNULL,
      )

+ def split_fasta(n_chunks, input_file, output_dir, debug=False, debug_n=1000, all_caps=True):
+     """
+     Round-robin split the input FASTA into chunked FASTA files.
+     If in debug mode, only keep the first debug_n entries for each chunk.
+     """
+     output_dir = Path(root) / output_dir
+     out_names = [os.path.join(output_dir, f"to_scan_{i}.fa") for i in range(n_chunks)]
+     out_handles = [open(out_names[i], "w") for i in range(n_chunks)]
+     chunk_counts = [0] * n_chunks  # Count sequences per chunk

+     logger.info(f"ALL CAPS mode: {all_caps}")
+
+     with open(input_file) as inf:
          header = None
          seq_lines = []
+
          for line in inf:
              if line.startswith(">"):
                  if header is not None:
+                     idx = int(header[1:].split("_")[0]) % n_chunks
+                     if not debug or chunk_counts[idx] < debug_n:
+                         out_handles[idx].write(header)
+                         seqj = "".join(seq_lines)
+                         if all_caps: seqj = seqj.upper()
+                         out_handles[idx].write(seqj)
+                         chunk_counts[idx] += 1
                  header = line
                  seq_lines = []
              else:
                  seq_lines.append(line)
+
          # last record
          if header is not None:
+             idx = int(header[1:].split("_")[0]) % n_chunks
+             if not debug or chunk_counts[idx] < debug_n:
+                 out_handles[idx].write(header)
+                 seqj = "".join(seq_lines)
+                 if all_caps: seqj = seqj.upper()
+                 out_handles[idx].write(seqj)
+                 chunk_counts[idx] += 1
+
      for o in out_handles:
          o.close()
+
+     # Log chunk sizes
+     for i, count in enumerate(chunk_counts):
+         logger.info(f"Chunk {i}: {count} sequences")
+
+     return out_names


  def run_fimo_chunk(cfg):
      - bg_model
      - max_stored
      - motif_file
+     - thresh
+     - thresh_mode
+     - outdir
      """
+     chunk_id = cfg["chunk_id"]
+     log_dir = Path(HydraConfig.get().run.dir) / "logs"
+     log_dir.mkdir(parents=True, exist_ok=True)
+
+     log_file = log_dir / f"fimo_chunk_{chunk_id}.log"
+     wlogger = logging.getLogger(f"fimo_chunk_{chunk_id}")
+     wlogger.setLevel(logging.DEBUG)
+     wlogger.propagate = False  # Don't double-log to root
+
+     outdir = Path(cfg["outdir"])
      os.makedirs(outdir, exist_ok=True)

+     if not any(isinstance(h, logging.FileHandler) for h in wlogger.handlers):
+         fh = logging.FileHandler(log_file, mode="w", encoding="utf-8")
+         fh.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
+         wlogger.addHandler(fh)
+
+     # make an output directory for this chromosome
+     wlogger.info(f"Chunk {cfg['chunk_id']} starting FIMO")
+     wlogger.info(f"Threshold mode: {cfg['thresh_mode']}")
+
+     try:
+         call_list = [
+             cfg["fimo_bin"],
+             "--oc",
+             outdir,
+             "--bfile",
+             cfg["bg_model"],
+             "--max-stored-scores",
+             str(cfg["max_stored"]),
+             "--thresh",
+             str(cfg["thresh"]),
+             "--qv-thresh",  # threshold on q-value
+             "--no-pgc",  # suppress parsing of genomic coordinates in FASTA sequence header
+             cfg["motif_file"],
+             cfg["fasta_path"],
+         ]
+         if cfg["thresh_mode"] != "q":
+             call_list = [x for x in call_list if x != "--qv-thresh"]
+             assert "--qv-thresh" not in call_list
+         with open(log_file, "a") as log_fh:
+             subprocess.check_call(
+                 call_list,
+                 stdout=log_fh,
+                 stderr=log_fh,
+             )
+         wlogger.info(f"\tChunk {cfg['chunk_id']} finished")
+
+         # Delete the chunk FASTA - gotta save space!
+         file_path = Path(cfg["fasta_path"])
+         if file_path.exists() and file_path.is_file():
+             file_path.unlink()
+             wlogger.info(f"\tDeleted file: {file_path}")
+
+     except subprocess.CalledProcessError as e:
+         wlogger.error(f"\tChunk {chunk_id}: FIMO failed with error code {e.returncode}")
+         raise
+     return os.path.join(outdir, "fimo.tsv")

+ def annotate_with_fimo(df, fdf):
+     df = df.reset_index().rename(columns={"index": "idx"})
+     # construct it the same way as the FASTA headers
+     df["sequence_name"] = (
+         df["idx"].astype(str) + "_chr" + df["#chrom"] + "_" + df["TR"] + "_"
+         + df["contextStart"].astype(str) + "_" + df["contextEnd"].astype(str)
+     )
+     fdf = fdf.merge(df[["sequence_name", "contextStart"]], on="sequence_name", how="left")
      fdf["genomic_start"] = fdf["contextStart"] + fdf["start"] - 1
      fdf["genomic_end"] = fdf["contextStart"] + fdf["stop"]
      fdf["coord"] = (
          fdf["genomic_start"].astype(str) + "-" + fdf["genomic_end"].astype(str)
      )
+     agg = fdf.groupby("sequence_name")["coord"].agg(lambda hits: ",".join(hits))
+     df["jaspar"] = df["sequence_name"].map(agg).fillna("")
      return df


      # 0) configs
      paths = cfg.data_task.paths
      fimo = cfg.data_task.fimo
      meme = cfg.data_task.meme
+
      # set njobs to max or whatever # is specified by user
      njobs = fimo.njobs
      if njobs == "max":
+         njobs = cpu_count()
      else:
+         njobs = min(cpu_count(), int(njobs))

+     # 1) Optionally, activate test mode
+     # ── TEST MODE: extract just one chromosome to benchmark a smaller job ──
+     chroms = [str(x) for x in cfg.data_task.chroms]
+     logger.info(f"Debug setting: {cfg.data_task.debug}")
+     if cfg.data_task.debug:
+         chroms = chroms[0:1]
+         logger.info(f"DEBUG MODE: running on only one chromosome: {chroms}")
+
      # 2) extract sequences & build BG model
+     for chrom in chroms:
+         path_to_fasta = Path(root) / Path(paths.input_fasta_outer_dir) / f"chr{chrom}" / paths.seq_fasta
+         path_to_bg = Path(root) / Path(paths.input_fasta_outer_dir) / f"chr{chrom}" / paths.bg_model
+         logger.info(f"Path to fasta file: {path_to_fasta}")
+         logger.info(f"Building background model at {path_to_bg}…")
+         run_markov(Path(root) / meme.fasta_get_markov, path_to_fasta, Path(root) / path_to_bg)

+         # 3) chunk FASTA and run FIMO in parallel
+         # make a folder to store the split fastas
+         chunk_folder = Path(path_to_fasta.parent) / "chunks"
+         os.makedirs(chunk_folder, exist_ok=True)
+         logger.info(f"Made directory {chunk_folder} to store {njobs} chunked fastas")
+         chunks = split_fasta(njobs, input_file=path_to_fasta, output_dir=chunk_folder, debug=cfg.data_task.debug, all_caps=cfg.data_task.all_caps)
+
+         chrom_outdir = Path(root) / paths.fimo_outdir / f"chrom{chrom}"
+         os.makedirs(chrom_outdir, exist_ok=True)
+
+         chunk_cfgs = [
+             dict(
+                 chunk_id=i,
+                 fasta_path=chunk,
+                 fimo_outdir=Path(root) / paths.fimo_outdir,
+                 fimo_bin=Path(root) / meme.fimo_bin,
+                 bg_model=path_to_bg,
+                 max_stored=fimo.max_stored,
+                 motif_file=Path(root) / meme.jaspar_motif_file,
+                 thresh=fimo.thresh,
+                 thresh_mode=fimo.thresh_mode,
+                 outdir=Path(chrom_outdir) / f"chunk{i}",
+             )
+             for i, chunk in enumerate(chunks)
          ]
+         logger.info(f"Running FIMO in parallel ({njobs} jobs)…")
+         start_time = time.time()
+         # Call the parallel jobs and get back a list of tsv paths
+         with Pool(njobs) as pool:
+             tsv_paths = pool.map(run_fimo_chunk, chunk_cfgs)
+         end_time = time.time()
+         logger.info(f"COMPLETED FIMO ({njobs} parallel jobs) in {end_time - start_time:.2f}s")
+         # cleanup! delete the chunked input files
+         if not any(chunk_folder.iterdir()):  # Empty folder
+             chunk_folder.rmdir()
+             logger.info(f"Deleted empty folder: {chunk_folder}")
+
+         # 4) merge chunked TSVs. Some may be empty, so we can't do a simple loop;
+         #    delete intermediate folders as we go
+         dfs = []
+         for tsv in tsv_paths:
+             try:
+                 df = pd.read_csv(tsv, sep="\t", comment="#")
+                 if not df.empty:
+                     dfs.append(df)
+             except pd.errors.EmptyDataError:
+                 logger.warning(f"Skipped empty TSV (only comments or blank): {tsv}")
+             except Exception as e:
+                 logger.error(f"Error reading {tsv}: {e}")
+                 raise  # Or continue, depending on your needs
+
+             # delete this folder to save storage
+             chunk_dir = Path(tsv).parent
+             try:
+                 shutil.rmtree(chunk_dir)
+                 logger.info(f"Deleted chunk directory: {chunk_dir}")
+             except Exception as e:
+                 logger.warning(f"Could not delete chunk dir {chunk_dir}: {e}")

+         combined = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
+
+         # 5) annotate & write final CSV
+         df = pd.read_csv(Path(root) / paths.input_csv, low_memory=False)
+         df["#chrom"] = df["#chrom"].astype(str)
+         df = df.loc[df["#chrom"] == chrom].reset_index(drop=True)
+         output_full_csv_path = Path(root) / chrom_outdir / "fimo_annotations.csv"
+         combined.to_csv(output_full_csv_path, index=False)
+         logger.info(f"Merging FIMO results into input DataFrame, which has {len(df)} rows for chromosome {chrom}")
+         df = annotate_with_fimo(df, combined)
+
+         final = df[
+             [
+                 "#chrom",
+                 "contextStart",
+                 "ChIPStart",
+                 "ChIPEnd",
+                 "contextEnd",
+                 "chipscore",
+                 "TR",
+                 "jaspar",
+             ]
+         ]
+         output_csv_path = Path(root) / chrom_outdir / "final.csv"
+         final.to_csv(output_csv_path, index=False)
+         logger.info(f"Wrote {len(final)} rows to {output_csv_path}")

  if __name__ == "__main__":
      main()
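Note on the coordinate math in annotate_with_fimo: FIMO reports motif hits with 1-based start/stop positions inside the scanned sequence, so mapping a hit back onto the chromosome offsets it by contextStart. A worked example with hypothetical numbers:

    context_start = 100500           # 0-based start of the context window
    fimo_start, fimo_stop = 11, 25   # 1-based hit coordinates from fimo.tsv
    genomic_start = context_start + fimo_start - 1  # 100510
    genomic_end = context_start + fimo_stop         # 100525
    coord = f"{genomic_start}-{genomic_end}"        # stored in the "jaspar" column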
dpacman/scripts/run_download.sh CHANGED
@@ -2,7 +2,7 @@

  # Manually specify values used in the config
  main_task="preprocess"
- data_task_type="download"
+ data_task_type="clean"
  timestamp=$(date "+%Y-%m-%d_%H-%M-%S")

  run_dir="$HOME/DPACMAN/logs/${main_task}/${data_task_type}/runs/${timestamp}"
dpacman/scripts/run_fimo.sh CHANGED
@@ -8,9 +8,10 @@ timestamp=$(date "+%Y-%m-%d_%H-%M-%S")
  run_dir="$HOME/DPACMAN/logs/${main_task}/${data_task_type}/runs/${timestamp}"
  mkdir -p "$run_dir"

- CUDA_VISIBLE_DEVICES=0 nohup python -u -m scripts.preprocess \
+ nohup python -u -m scripts.preprocess \
    hydra.run.dir="${run_dir}" \
-   data_task=${data_task_type}/pre_fimo \
+   data_task=${data_task_type}/post_fimo \
+   data_task.debug="false" \
    > "${run_dir}/run.log" 2>&1 &

  echo $! > "${run_dir}/pid.txt"
dpacman/scripts/run_fimo_batch.sh ADDED
@@ -0,0 +1,59 @@
+ #!/bin/bash
+
+ # -------------------------
+ # Slurm + Hydra Configuration
+ # -------------------------
+
+ main_task="preprocess"
+ data_task_type="fimo"
+ fimo_thresh="1e-2"
+ fimo_thresh_mode="q"
+ max_stored="100000"
+ fimo_outdir="dpacman/data_files/processed/fimo/fimo_out_q"
+ debug="false"
+
+ # Chromosomes to run
+ #chromosomes=('1' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '2' '20' '21' '22' '3' '4' '5' '6' '7' '8' '9' 'X' 'Y')
+ chromosomes=('3')
+ # -------------------------
+ # Slurm Job Submission
+ # -------------------------
+
+ for chrom in "${chromosomes[@]}"; do
+   timestamp=$(date "+%Y-%m-%d_%H-%M-%S")
+   run_dir="$HOME/DPACMAN/logs/${main_task}/${data_task_type}/runs/${timestamp}_chr${chrom}"
+   mkdir -p "$run_dir"
+
+   sbatch <<EOF
+ #!/bin/bash
+ #SBATCH --job-name=fimo${chrom}
+ #SBATCH --partition=genoa-lrg-mem
+ #SBATCH -N 1                    ## Number of nodes
+ #SBATCH --mem=0
+ #SBATCH --ntasks-per-node=64
+ #SBATCH --output=${run_dir}/run.log
+ #SBATCH --error=${run_dir}/run.log
+
+ echo "Running FIMO for chromosome ${chrom} at \$(date)"
+ cd /vast/projects/pranam/lab/sophie/DPACMAN/dpacman
+
+ # Load environment
+ source /vast/projects/pranam/lab/shared/miniconda3/etc/profile.d/conda.sh
+ conda activate dpacman
+
+ # Run Hydra-based script
+ python -u -m scripts.preprocess \\
+   hydra.run.dir="${run_dir}" \\
+   data_task=${data_task_type}/run_fimo \\
+   data_task.chroms=["${chrom}"] \\
+   data_task.fimo.thresh=${fimo_thresh} \\
+   data_task.fimo.thresh_mode=${fimo_thresh_mode} \\
+   data_task.paths.fimo_outdir=${fimo_outdir} \\
+   data_task.fimo.max_stored=${max_stored} \\
+   data_task.debug=${debug}
+
+ # Save SLURM job ID
+ echo \$SLURM_JOB_ID > "${run_dir}/pid.txt"
+ EOF
+
+ done
environment.yaml CHANGED
@@ -9,6 +9,7 @@
  name: dpacman

  channels:
+   - bioconda
    - conda-forge
    - defaults

@@ -24,8 +25,10 @@ dependencies:
    - python=3.10
    - dask[complete]
    - pip>=23
+   - ghostscript=9.18
    - pip:
        - rootutils==1.0.7
+       - polars==1.32.2
        - hydra-core==1.3.2 # Hydra for config management
        - hydra-colorlog==1.2.0 # Allow colorful logging in Hydra
        - omegaconf==2.3.0 # Required by hydra-core