temp changes for gpu switch
- .gitignore +9 -1
- configs/data_modules/pair.yaml +9 -0
- configs/data_task/cluster/remap.yaml +44 -0
- configs/data_task/fimo/post_fimo.yaml +9 -4
- configs/data_task/split/remap.yaml +23 -0
- configs/models/classifier.yaml +11 -0
- configs/models/pooling/truncatedsvd.yaml +7 -0
- configs/train.yaml +8 -0
- dpacman/classifier/model/loss.py +34 -0
- dpacman/classifier/model/train.py +0 -79
- dpacman/classifier/model_tmp/__init__.py +0 -0
- dpacman/classifier/model_tmp/clustering_data.py +383 -0
- dpacman/classifier/model_tmp/compress_embeddings.py +54 -0
- dpacman/{data_tasks/embeddings → classifier/model_tmp}/compute_embeddings.py +342 -141
- dpacman/classifier/model_tmp/extract_tf_symbols.py +27 -0
- dpacman/classifier/model_tmp/make_pair_list.py +220 -0
- dpacman/classifier/model_tmp/make_peak_fasta.py +13 -0
- dpacman/classifier/model_tmp/model.py +103 -0
- dpacman/classifier/model_tmp/prep_splits.py +133 -0
- dpacman/classifier/model_tmp/train.py +180 -0
- dpacman/data_modules/__init__.py +0 -0
- dpacman/data_modules/pair.py +342 -0
- dpacman/data_tasks/cluster/__init__.py +0 -0
- dpacman/data_tasks/cluster/remap.py +144 -0
- dpacman/data_tasks/embeddings/embedders.py +560 -0
- dpacman/data_tasks/fimo/__init__.py +0 -0
- dpacman/data_tasks/fimo/post_fimo.py +526 -124
- dpacman/data_tasks/fimo/run_fimo.py +16 -7
- dpacman/data_tasks/split/__init__.py +0 -0
- dpacman/data_tasks/split/remap.py +512 -0
- dpacman/scripts/preprocess.py +20 -4
- dpacman/scripts/run_cluster.sh +19 -0
- dpacman/scripts/run_fimo.sh +3 -1
- dpacman/scripts/run_split.sh +21 -0
- dpacman/utils/README.md +1 -0
- dpacman/utils/__init__.py +0 -0
- dpacman/utils/clustering.py +144 -0
- dpacman/utils/models.py +19 -0
- dpacman/utils/splitting.py +0 -0
- environment.yaml +39 -36
.gitignore
CHANGED
@@ -17,4 +17,12 @@ dpacman/scripts/__pycache__/
 dpacman/temp.py
 dpacman/temp2.py
 logs/
-tree.txt
+tree.txt
+dpacman/utils/ubuntu_font/
+dpacman/tf_clusters.tsv
+dpacman/tf_family_tree.nwk
+dpacman/idmap_filt.csv
+dpacman/temp3.py
+dpacman/temp4.py
+dpacman/temp.ipynb
+dpacman/nohup.out
configs/data_modules/pair.yaml
ADDED
@@ -0,0 +1,9 @@

train_file: data_files/splits/train.csv
val_file: data_files/splits/val.csv
test_file: data_files/splits/test.csv

batch_size: 32
num_workers: 8

maximize_num_workers: False
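For reference, a minimal sketch of how these fields could be consumed when building PyTorch DataLoaders. The PairCSVDataset wrapper below is hypothetical (the project's real dataset lives in dpacman/data_modules/pair.py, whose API is not shown here); only the mapping from config keys to loader arguments is the point.

# Hypothetical illustration: map pair.yaml keys onto DataLoader arguments.
import pandas as pd
from torch.utils.data import DataLoader, Dataset

class PairCSVDataset(Dataset):  # placeholder, not the project's actual class
    def __init__(self, csv_path):
        self.df = pd.read_csv(csv_path)
    def __len__(self):
        return len(self.df)
    def __getitem__(self, i):
        return self.df.iloc[i].to_dict()

def build_loaders(cfg):
    train = PairCSVDataset(cfg["train_file"])
    val = PairCSVDataset(cfg["val_file"])
    kwargs = dict(batch_size=cfg["batch_size"], num_workers=cfg["num_workers"])
    return DataLoader(train, shuffle=True, **kwargs), DataLoader(val, shuffle=False, **kwargs)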
configs/data_task/cluster/remap.yaml
ADDED
@@ -0,0 +1,44 @@
name: remap
type: cluster

max_protein_length: 1998

cluster_dna_full: true
cluster_dna_peaks: true
cluster_protein: false

dna_full:
  input_map_path: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/maps/dna_seqid_to_dna_sequence.json
  fasta_path: dpacman/data_files/processed/mmseqs/inputs/fimo_hits_only/dna_full.fasta
  output_dir: dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/dna_full
  mmseqs:
    min_seq_id: 0.3
    c: 0.8
    cov_mode: 0
    cluster_mode: 0

dna_peaks:
  input_map_path: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/maps/peak_seqid_to_peak_sequence.json
  fasta_path: dpacman/data_files/processed/mmseqs/inputs/fimo_hits_only/dna_peaks.fasta
  output_dir: dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/dna_peaks
  mmseqs:
    min_seq_id: 0.3
    c: 0.8
    cov_mode: 0
    cluster_mode: 0

protein:
  input_map_path: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/maps/tr_seqid_to_tr_sequence.json
  fasta_path: dpacman/data_files/processed/mmseqs/inputs/fimo_hits_only/protein.fasta
  output_dir: dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/protein
  mmseqs:
    min_seq_id: 0.3
    c: 0.8
    cov_mode: 0
    cluster_mode: 0

input_data_path: /vast/projects/pranam/lab/sophie/DPACMAN/dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed_seed0.parquet
path_to_mmseqs: /vast/projects/pranam/lab/shared/mmseqs

out_dir: na
final_csv: na
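As a rough guide, each mmseqs block above corresponds to an `mmseqs easy-cluster` invocation. A hedged sketch of that mapping follows; the helper name and the tmp-directory layout are assumptions, not the project's actual runner in dpacman/data_tasks/cluster/remap.py.

import subprocess
from pathlib import Path

def run_block(mmseqs_bin, block):
    """Assumed mapping of one config block (fasta_path/output_dir/mmseqs) to easy-cluster flags."""
    out_dir = Path(block["output_dir"])
    out_dir.mkdir(parents=True, exist_ok=True)
    m = block["mmseqs"]
    subprocess.run([
        mmseqs_bin, "easy-cluster",
        block["fasta_path"], str(out_dir / "mmseqs"), str(out_dir / "tmp"),
        "--min-seq-id", str(m["min_seq_id"]),
        "-c", str(m["c"]),
        "--cov-mode", str(m["cov_mode"]),
        "--cluster-mode", str(m["cluster_mode"]),
    ], check=True)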
configs/data_task/fimo/post_fimo.yaml
CHANGED
@@ -2,11 +2,16 @@ name: post_fimo
 type: fimo
 
 fimo_out_dir: dpacman/data_files/processed/fimo/fimo_out_q
-
-processed_output_csv: dpacman/data_files/processed/fimo/remap2022_crm_fimo_output_q_processed.csv
+processed_output_csv: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed.csv
 json_dir: dpacman/data_files/raw/genomes/hg38
-idmap_path: dpacman/data_files/raw/remap/
+idmap_path: dpacman/data_files/raw/remap/idmapping_reviewed_true_2025_08_15.tsv
+remap_path: /vast/projects/pranam/lab/sophie/DPACMAN/dpacman/data_files/processed/fimo/remap2022_crm_fimo_input.csv
 
-jaspar_boost:
+jaspar_boost: 333
+
+keep_fimo_only: true
+
+seeds: [0]
+max_protein_len: 1998
 
 debug: false
configs/data_task/split/remap.yaml
ADDED
@@ -0,0 +1,23 @@
name: remap
type: split

max_protein_length: 1998

cluster_output_paths:
  dna: dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/dna_full/mmseqs_cluster.tsv
  protein: dpacman/data_files/processed/mmseqs/outputs/fimo_hits_only/protein/mmseqs_cluster.tsv

input_data_path: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed_seed0.parquet
split_out_dir: dpacman/data_files/processed/splits

split_by: both  # protein, dna, or both

test_ratio: 0.10
val_ratio: 0.10
train_ratio: 0.80

require_nonempty: true
ratio_tolerance: null
bigM: null

seed: 0
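For orientation, a minimal sketch of what a cluster-aware split with these ratios looks like: whole mmseqs clusters are assigned to one split so near-identical sequences never straddle train and test. The helper below is illustrative only, not the project's actual splitter in dpacman/data_tasks/split/remap.py.

import random
from collections import defaultdict

def cluster_split(cluster_tsv, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=0):
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9
    rep_to_members = defaultdict(list)
    with open(cluster_tsv) as f:                 # mmseqs_cluster.tsv rows: "<rep>\t<member>"
        for line in f:
            rep, member = line.rstrip("\n").split("\t")[:2]
            rep_to_members[rep].append(member)
    reps = sorted(rep_to_members)
    random.Random(seed).shuffle(reps)
    total = sum(len(v) for v in rep_to_members.values())
    splits, filled = {"train": [], "val": [], "test": []}, 0
    for rep in reps:                             # greedy fill: train, then val, then test
        frac = filled / total
        key = "train" if frac < train_ratio else ("val" if frac < train_ratio + val_ratio else "test")
        splits[key].extend(rep_to_members[rep])
        filled += len(rep_to_members[rep])
    return splits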
configs/models/classifier.yaml
ADDED
@@ -0,0 +1,11 @@
name: classifier
type: train

params:
  epochs: 10
  batch_size: 32
  lr: 1e-4
  seed: 42

out_dir: null
pair_list: null
configs/models/pooling/truncatedsvd.yaml
ADDED
@@ -0,0 +1,7 @@
n_components: 2
algorithm: randomized
n_iter: 5
n_oversamples: 10
power_iteration_normalizer: auto
random_state: 42
tol: 0
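These keys mirror the constructor arguments of sklearn.decomposition.TruncatedSVD, so the config can be splatted straight into it. A minimal sketch, assuming the YAML is loaded as a flat dict:

import yaml
from sklearn.decomposition import TruncatedSVD

with open("configs/models/pooling/truncatedsvd.yaml") as f:
    cfg = yaml.safe_load(f)

svd = TruncatedSVD(**cfg)                      # reduce pooled embeddings to cfg["n_components"] dims
# reduced = svd.fit_transform(embeddings)      # embeddings: (n_samples, n_features)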
configs/train.yaml
CHANGED
@@ -0,0 +1,8 @@
+defaults:
+  - _self_
+  - paths: default
+  - hydra: default  # ← tells Hydra to use the logging/output config
+  - trainer: gpu
+  - data_task: model/classifier
+
+task_name: train/${data_task.type}
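A minimal sketch of an entry point that would pick up this config via Hydra; the script location, config_path, and anything beyond the keys shown above are assumptions rather than the repo's actual training script.

import hydra
from omegaconf import DictConfig, OmegaConf

@hydra.main(version_base=None, config_path="configs", config_name="train")
def main(cfg: DictConfig) -> None:
    # Prints the composed config; task_name resolves from data_task.type at run time.
    print(OmegaConf.to_yaml(cfg))
    # training would be dispatched from here

if __name__ == "__main__":
    main()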
dpacman/classifier/model/loss.py
ADDED
@@ -0,0 +1,34 @@
"""
Define loss functions needed for training the model
"""
import torch
from torch.nn import functional as F

def combined_loss_components(logits, targets, peak_thresh=0.5, eps=1e-8):
    probs = torch.sigmoid(logits)
    labels = (targets >= peak_thresh).float()
    non_peak_mask = (labels == 0).float()
    peak_mask = (labels == 1).float()

    bce_all = F.binary_cross_entropy_with_logits(logits, labels, reduction='none')
    bce_non = (bce_all * non_peak_mask)
    bce_non = bce_non.sum() / (non_peak_mask.sum() + eps)

    mse_peaks = F.mse_loss(probs * peak_mask, targets * peak_mask, reduction='sum') \
                / (peak_mask.sum() + eps)

    t_dist = (targets + eps)
    p_dist = (probs + eps)
    t_dist = t_dist / t_dist.sum(dim=1, keepdim=True)
    p_dist = p_dist / p_dist.sum(dim=1, keepdim=True)
    kl = (t_dist * (t_dist.clamp(min=eps).log() - p_dist.clamp(min=eps).log())).sum(dim=1).mean()

    return bce_non, kl, mse_peaks, probs

def accuracy_percentage(logits, targets, peak_thresh=0.5):
    probs = torch.sigmoid(logits)
    preds_bin = (probs >= 0.5).float()
    labels = (targets >= peak_thresh).float()
    correct = (preds_bin == labels).float().sum()
    total = torch.numel(labels)
    return (correct / max(1, total)).item() * 100.0
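A hedged sketch of how the three components might be combined into one scalar loss, mirroring the alpha/beta/gamma weights that evaluate() in train.py accepts. The import path assumes the package resolves as dpacman.classifier.model.loss; the weighting itself is an assumption, not the project's confirmed formula.

import torch
from dpacman.classifier.model.loss import combined_loss_components

def total_loss(logits, targets, alpha=1.0, beta=1.0, gamma=1.0, peak_thresh=0.5):
    bce_non, kl, mse_peaks, _ = combined_loss_components(logits, targets, peak_thresh)
    return alpha * bce_non + beta * kl + gamma * mse_peaks

logits = torch.randn(4, 10)    # (batch, positions)
targets = torch.rand(4, 10)    # continuous peak scores in [0, 1]
print(total_loss(logits, targets))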
dpacman/classifier/model/train.py
CHANGED
@@ -77,85 +77,6 @@ def build_tf_cache(tf_paths, target_dim=256):
     print("[i] TF cache ready (raw); compression will be learned.", flush=True)
     return cache
 
-# ─────────────── Dataset & Collation ─────────────────────────────────────
-class PairDataset(Dataset):
-    def __init__(self, tf_paths, dna_paths, final_df, tf_cache):
-        self.tf_paths, self.dna_paths = tf_paths, dna_paths
-        self.tf_cache = tf_cache
-        self.targets = {}
-        for _, row in final_df.iterrows():
-            dna_id = row["dna_id"]
-            vec = np.array(list(map(float, row["score_sig_r2"].split(","))), dtype=np.float32)
-            self.targets[dna_id] = vec
-
-    def __len__(self): return len(self.tf_paths)
-
-    def __getitem__(self, i):
-        b = self.tf_cache[self.tf_paths[i]]          # (L_b, D_b) or (D_b,)
-        if b.ndim==1: b = b[None,:]
-        g = np.load(self.dna_paths[i])               # (L_g, 256) or (256,)
-        if g.ndim==1: g = g[None,:]
-
-        stem = Path(self.dna_paths[i]).stem
-        dna_id = stem.replace("dna_","")
-        t = self.targets.get(dna_id, np.zeros(g.shape[0],dtype=np.float32))
-
-        return torch.from_numpy(b).float(), \
-               torch.from_numpy(g).float(), \
-               torch.from_numpy(t).float()
-
-def collate_fn(batch):
-    Bs = [b.shape[0] for b,_,_ in batch]
-    Gs = [g.shape[0] for _,g,_ in batch]
-    maxB, maxG = max(Bs), max(Gs)
-
-    def pad_seq(x, L):
-        if x.shape[0] < L:
-            pad = torch.zeros((L-x.shape[0], x.shape[1]), dtype=x.dtype, device=x.device)
-            return torch.cat([x, pad], dim=0)
-        return x
-
-    def pad_t(y, L):
-        if y.shape[0] < L:
-            pad = torch.zeros((L-y.shape[0],), dtype=y.dtype, device=y.device)
-            return torch.cat([y, pad], dim=0)
-        return y
-
-    b_stack = torch.stack([pad_seq(b, maxB) for b,_,_ in batch])
-    g_stack = torch.stack([pad_seq(g, maxG) for _,g,_ in batch])
-    t_stack = torch.stack([pad_t(t, maxG) for *_,t in batch])
-    return b_stack, g_stack, t_stack
-
-# ─────────────── losses, metrics ─────────────────────────────────────────
-def combined_loss_components(logits, targets, peak_thresh=0.5, eps=1e-8):
-    probs = torch.sigmoid(logits)
-    labels = (targets >= peak_thresh).float()
-    non_peak_mask = (labels == 0).float()
-    peak_mask = (labels == 1).float()
-
-    bce_all = F.binary_cross_entropy_with_logits(logits, labels, reduction='none')
-    bce_non = (bce_all * non_peak_mask)
-    bce_non = bce_non.sum() / (non_peak_mask.sum() + eps)
-
-    mse_peaks = F.mse_loss(probs * peak_mask, targets * peak_mask, reduction='sum') \
-                / (peak_mask.sum() + eps)
-
-    t_dist = (targets + eps)
-    p_dist = (probs + eps)
-    t_dist = t_dist / t_dist.sum(dim=1, keepdim=True)
-    p_dist = p_dist / p_dist.sum(dim=1, keepdim=True)
-    kl = (t_dist * (t_dist.clamp(min=eps).log() - p_dist.clamp(min=eps).log())).sum(dim=1).mean()
-
-    return bce_non, kl, mse_peaks, probs
-
-def accuracy_percentage(logits, targets, peak_thresh=0.5):
-    probs = torch.sigmoid(logits)
-    preds_bin = (probs >= 0.5).float()
-    labels = (targets >= peak_thresh).float()
-    correct = (preds_bin == labels).float().sum()
-    total = torch.numel(labels)
-    return (correct / max(1, total)).item() * 100.0
-
 def evaluate(model, dl, device, alpha, beta, gamma, peak_thresh, eps=1e-8):
     model.eval()
     tot_loss, tot_acc = 0.0, 0.0
dpacman/classifier/model_tmp/__init__.py
ADDED
File without changes
dpacman/classifier/model_tmp/clustering_data.py
ADDED
@@ -0,0 +1,383 @@
#!/usr/bin/env python3
import argparse
import numpy as np
import pandas as pd
from pathlib import Path
import random
import sys
import subprocess
from collections import defaultdict

# ─────────────────────────────────────────────────────────────────────────
# Original helpers (kept; some lightly edited/commented where needed)
# ─────────────────────────────────────────────────────────────────────────

def read_ids_file(p):
    p = Path(p)
    if not p.exists():
        raise FileNotFoundError(f"IDs file not found: {p}")
    return [line.strip() for line in p.open() if line.strip()]

def split_embeddings(emb_path, ids_path, out_dir, prefix):
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    if not Path(emb_path).exists():
        raise FileNotFoundError(f"Embedding file not found: {emb_path}")
    if not Path(ids_path).exists():
        raise FileNotFoundError(f"IDs file not found: {ids_path}")

    if emb_path.endswith(".npz"):
        data = np.load(emb_path, allow_pickle=True)
        if "embeddings" in data:
            emb = data["embeddings"]
        else:
            raise ValueError(f"{emb_path} missing 'embeddings' key")
    else:
        emb = np.load(emb_path)

    ids = read_ids_file(ids_path)
    if len(ids) != emb.shape[0]:
        print(f"[WARN] length mismatch: {len(ids)} ids vs {emb.shape[0]} embeddings in {emb_path}", file=sys.stderr)

    mapping = {}
    for i, ident in enumerate(ids):
        if i >= emb.shape[0]:
            print(f"[WARN] skipping {ident}: no embedding at index {i}", file=sys.stderr)
            continue
        arr = emb[i]
        out_file = out_dir / f"{prefix}_{ident}.npy"
        np.save(out_file, arr)
        mapping[ident] = str(out_file)
    return mapping

def extract_symbol_from_tf_id(full_id: str) -> str:
    """
    Given a TF embedding ID like 'sp|O15062|ZBTB5_HUMAN' or 'ZBTB5_HUMAN',
    return the gene symbol uppercase (e.g., 'ZBTB5').
    """
    if "|" in full_id:
        try:
            # format sp|Accession|SYMBOL_HUMAN
            genepart = full_id.split("|")[2]
        except IndexError:
            genepart = full_id
    else:
        genepart = full_id
    symbol = genepart.split("_")[0]
    return symbol.upper()

def build_tf_symbol_map(tf_map):
    """
    Build mapping gene_symbol -> list of embedding paths.
    """
    symbol_map = {}
    for full_id, path in tf_map.items():
        symbol = extract_symbol_from_tf_id(full_id)
        symbol_map.setdefault(symbol, []).append(path)
    return symbol_map

def tf_key_from_path(path: str) -> str:
    """
    Given a path like .../tf_sp|O15062|ZBTB5_HUMAN.npy, extract normalized symbol 'ZBTB5'.
    """
    stem = Path(path).stem  # e.g., tf_sp|O15062|ZBTB5_HUMAN
    # remove leading prefix if present (tf_)
    if "_" in stem:
        _, rest = stem.split("_", 1)
    else:
        rest = stem
    return extract_symbol_from_tf_id(rest)

def dna_key_from_path(path: str) -> str:
    """
    Given .../dna_peak42.npy -> 'peak42'
    """
    stem = Path(path).stem
    if "_" in stem:
        _, rest = stem.split("_", 1)
    else:
        rest = stem
    return rest

# ─────────────────────────────────────────────────────────────────────────
# New helpers for MMseqs clustering & cluster-level splitting
# ─────────────────────────────────────────────────────────────────────────

def write_dna_fasta(df: pd.DataFrame, out_fasta: Path) -> None:
    """
    Write unique DNA sequences to FASTA using dna_id as header.
    Requires df with columns: dna_id, dna_sequence
    """
    uniq = df[["dna_id", "dna_sequence"]].drop_duplicates()
    with open(out_fasta, "w") as f:
        for _, row in uniq.iterrows():
            did = row["dna_id"]
            seq = str(row["dna_sequence"]).upper().replace(" ", "").replace("\n", "")
            f.write(f">{did}\n{seq}\n")

def run_mmseqs_easy_cluster(
    mmseqs_bin: str,
    fasta: Path,
    out_prefix: Path,
    tmp_dir: Path,
    min_seq_id: float,
    coverage: float,
    cov_mode: int,
) -> Path:
    """
    Runs mmseqs easy-cluster on nucleotide sequences.
    Returns the path to a clusters TSV file (creating it if the default one isn't present).
    """
    tmp_dir.mkdir(parents=True, exist_ok=True)
    out_prefix.parent.mkdir(parents=True, exist_ok=True)

    cmd = [
        mmseqs_bin, "easy-cluster",
        str(fasta), str(out_prefix), str(tmp_dir),
        "--min-seq-id", str(min_seq_id),
        "-c", str(coverage),
        "--cov-mode", str(cov_mode),
        # You can add performance flags here if needed, e.g.:
        # "--threads", "8"
    ]
    print("[i] Running:", " ".join(cmd), flush=True)
    subprocess.run(cmd, check=True)

    # MMseqs easy-cluster typically writes <out_prefix>_cluster.tsv
    default_tsv = Path(str(out_prefix) + "_cluster.tsv")
    if default_tsv.exists():
        print(f"[i] Found cluster TSV: {default_tsv}")
        return default_tsv

    # Fallback: try createtsv if default is missing
    # This requires the internal DBs. easy-cluster creates DBs alongside out_prefix.
    # We'll try to locate them and emit a TSV.
    in_db = Path(str(out_prefix) + "_query")
    cl_db = Path(str(out_prefix) + "_cluster")
    out_tsv = Path(str(out_prefix) + "_fallback_cluster.tsv")
    if in_db.exists() and cl_db.exists():
        cmd2 = [mmseqs_bin, "createtsv", str(in_db), str(in_db), str(cl_db), str(out_tsv)]
        print("[i] Creating TSV via createtsv:", " ".join(cmd2), flush=True)
        subprocess.run(cmd2, check=True)
        if out_tsv.exists():
            return out_tsv

    raise FileNotFoundError("Could not locate clusters TSV from mmseqs. "
                            f"Expected {default_tsv} or createtsv fallback.")

def parse_mmseqs_clusters(tsv_path: Path) -> dict:
    """
    Parse MMseqs cluster TSV (rep \t member). Returns dna_id -> cluster_rep_id
    """
    mapping = {}
    with open(tsv_path) as f:
        for line in f:
            parts = line.rstrip("\n").split("\t")
            if len(parts) < 2:
                continue
            rep, member = parts[0], parts[1]
            mapping[member] = rep
            # Some TSVs include rep->rep; if not, ensure rep is mapped to itself:
            if rep not in mapping:
                mapping[rep] = rep
    return mapping

def assign_clusters_to_splits(cluster_rep_to_members: dict,
                              val_frac: float,
                              test_frac: float,
                              seed: int = 42):
    """
    cluster_rep_to_members: dict[rep] = [members...]
    Returns: dict with keys 'train','val','test' mapping to sets of dna_id.
    Ensures all members of a cluster go to the same split.
    """
    rng = random.Random(seed)
    reps = list(cluster_rep_to_members.keys())
    rng.shuffle(reps)

    # Greedy-ish fill by total member counts to match desired fractions.
    total = sum(len(cluster_rep_to_members[r]) for r in reps)
    target_val = int(round(total * val_frac))
    target_test = int(round(total * test_frac))
    cur_val = cur_test = 0

    val_ids, test_ids, train_ids = set(), set(), set()
    for rep in reps:
        members = cluster_rep_to_members[rep]
        c = len(members)
        # Fill val first, then test, then train
        if cur_val + c <= target_val:
            val_ids.update(members); cur_val += c
        elif cur_test + c <= target_test:
            test_ids.update(members); cur_test += c
        else:
            train_ids.update(members)

    return {"train": train_ids, "val": val_ids, "test": test_ids}

# ─────────────────────────────────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(
        description="Build TF-DNA pair lists with MMseqs clustering on DNA to prevent split leakage."
    )
    parser.add_argument("--final_csv", required=True, help="final.csv with TF_id and dna_sequence")
    parser.add_argument("--dna_embed_npz", required=True, help="DNA embedding file (.npy or .npz)")
    parser.add_argument("--dna_ids", required=True, help="IDs file for DNA embeddings (peak*.ids)")
    parser.add_argument("--tf_embed_npy", required=True, help="TF embedding file (.npy or .npz)")
    parser.add_argument("--tf_ids", required=True, help="IDs file for TF embeddings (sp|... ids)")
    parser.add_argument("--out_dir", required=True, help="Output directory")
    parser.add_argument("--seed", type=int, default=42)

    # NEW: MMseqs options & split fractions
    parser.add_argument("--mmseqs_bin", default="mmseqs", help="Path to mmseqs binary")
    parser.add_argument("--min_seq_id", type=float, default=0.9, help="MMseqs --min-seq-id")
    parser.add_argument("--cov", type=float, default=0.8, help="MMseqs -c coverage fraction")
    parser.add_argument("--cov_mode", type=int, default=1, help="MMseqs --cov-mode (1 = coverage of target)")
    parser.add_argument("--val_frac", type=float, default=0.10)
    parser.add_argument("--test_frac", type=float, default=0.10)
    parser.add_argument("--tmp_dir", default=None, help="MMseqs tmp dir (defaults to out_dir/tmp)")
    args = parser.parse_args()

    random.seed(args.seed)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load final.csv
    df = pd.read_csv(args.final_csv, dtype=str)
    if "TF_id" not in df.columns or "dna_sequence" not in df.columns:
        raise RuntimeError("final.csv must have columns TF_id and dna_sequence")

    # Assign dna_id (unique per dna_sequence)
    unique_seqs = df["dna_sequence"].drop_duplicates().tolist()
    seq_to_id = {seq: f"peak{i}" for i, seq in enumerate(unique_seqs)}
    df["dna_id"] = df["dna_sequence"].map(seq_to_id)
    enriched_csv = out_dir / "final_with_dna_id.csv"
    df.to_csv(enriched_csv, index=False)
    print(f"[i] Wrote augmented final.csv with dna_id to {enriched_csv}")

    # Split embeddings into per-item files (unchanged)
    print(f"[i] Splitting DNA embeddings from {args.dna_embed_npz} with ids {args.dna_ids}")
    dna_map = split_embeddings(args.dna_embed_npz, args.dna_ids, out_dir / "dna_single", "dna")
    print(f"[i] DNA embeddings available: {len(dna_map)} (sample: {list(dna_map.keys())[:10]})")
    print(f"[i] Splitting TF embeddings from {args.tf_embed_npy} with ids {args.tf_ids}")
    tf_map = split_embeddings(args.tf_embed_npy, args.tf_ids, out_dir / "tf_single", "tf")
    print(f"[i] TF embeddings available: {len(tf_map)} (sample: {list(tf_map.keys())[:10]})")

    # Build gene-symbol normalized map
    tf_symbol_map = build_tf_symbol_map(tf_map)
    print(f"[i] TF symbol map keys (sample): {list(tf_symbol_map.keys())[:30]}")

    # Diagnostic overlaps
    norm_tf_in_final = set(t.split("_seq")[0].upper() for t in df["TF_id"].unique())
    available_tf_symbols = set(tf_symbol_map.keys())
    intersect_tf = norm_tf_in_final & available_tf_symbols
    print(f"[i] Unique normalized TF symbols in final.csv: {len(norm_tf_in_final)}")
    print(f"[i] Available TF embedding symbols: {len(available_tf_symbols)}")
    print(f"[i] Intersection count: {len(intersect_tf)}")
    if len(intersect_tf) == 0:
        print("[ERROR] No overlap between normalized TF_id and TF embedding symbols.", file=sys.stderr)
        print("Sample normalized TFs from final.csv:", sorted(list(norm_tf_in_final))[:30], file=sys.stderr)
        print("Sample available TF symbols:", sorted(list(available_tf_symbols))[:30], file=sys.stderr)
        sys.exit(1)

    dna_ids_final = set(df["dna_id"].unique())
    available_dna_ids = set(dna_map.keys())
    intersect_dna = dna_ids_final & available_dna_ids
    print(f"[i] Unique dna_id in final.csv: {len(dna_ids_final)}. Available DNA ids: {len(available_dna_ids)}. Intersection: {len(intersect_dna)}")
    if len(intersect_dna) == 0:
        print("[ERROR] No overlap on DNA ids.", file=sys.stderr)
        sys.exit(1)

    # ── NEW: MMseqs clustering on DNA sequences ───────────────────────────
    fasta_path = out_dir / "dna_unique.fasta"
    write_dna_fasta(df, fasta_path)
    print(f"[i] Wrote FASTA with {df['dna_id'].nunique()} unique sequences → {fasta_path}")

    tmp_dir = Path(args.tmp_dir) if args.tmp_dir else (out_dir / "mmseqs_tmp")
    cluster_prefix = out_dir / "mmseqs_dna_clusters"
    clusters_tsv = run_mmseqs_easy_cluster(
        mmseqs_bin=args.mmseqs_bin,
        fasta=fasta_path,
        out_prefix=cluster_prefix,
        tmp_dir=tmp_dir,
        min_seq_id=args.min_seq_id,
        coverage=args.cov,
        cov_mode=args.cov_mode,
    )

    # Parse clusters
    member_to_rep = parse_mmseqs_clusters(clusters_tsv)  # dna_id -> rep_id
    # Build rep -> members list
    rep_to_members = defaultdict(list)
    for member, rep in member_to_rep.items():
        rep_to_members[rep].append(member)

    print(f"[i] Parsed {len(rep_to_members)} clusters from {clusters_tsv}")
    clusters_table = []
    for rep, members in rep_to_members.items():
        for m in members:
            clusters_table.append((m, rep))
    clusters_df = pd.DataFrame(clusters_table, columns=["dna_id", "cluster_id"])
    clusters_df.to_csv(out_dir / "clusters.tsv", sep="\t", index=False)
    print(f"[i] Wrote clusters mapping → {out_dir / 'clusters.tsv'}")

    # Attach cluster_id back to final df
    df = df.merge(clusters_df, on="dna_id", how="left")
    df.to_csv(out_dir / "final_with_dna_id_and_cluster.csv", index=False)
    print(f"[i] Wrote {out_dir / 'final_with_dna_id_and_cluster.csv'}")

    # Assign entire clusters to splits
    splits = assign_clusters_to_splits(rep_to_members,
                                       val_frac=args.val_frac,
                                       test_frac=args.test_frac,
                                       seed=args.seed)
    for k in ["train", "val", "test"]:
        print(f"[i] {k}: {len(splits[k])} dna_ids")

    # ── Build positive pairs only, per split (NO negatives) ───────────────
    positives_by_split = {"train": [], "val": [], "test": []}
    # Build a quick dna_id -> embedding path map
    dnaid_to_path = {did: path for did, path in dna_map.items()}

    pos_count = 0
    for _, row in df.iterrows():
        tf_raw = row["TF_id"]
        tf_symbol = tf_raw.split("_seq")[0].upper()
        dnaid = row["dna_id"]
        if (tf_symbol not in tf_symbol_map) or (dnaid not in dnaid_to_path):
            continue
        tf_embedding_path = tf_symbol_map[tf_symbol][0]  # first embedding per symbol

        # decide split by dna_id cluster assignment
        if dnaid in splits["train"]:
            positives_by_split["train"].append((tf_embedding_path, dnaid_to_path[dnaid], 1))
        elif dnaid in splits["val"]:
            positives_by_split["val"].append((tf_embedding_path, dnaid_to_path[dnaid], 1))
        elif dnaid in splits["test"]:
            positives_by_split["test"].append((tf_embedding_path, dnaid_to_path[dnaid], 1))
        pos_count += 1

    print(f"[i] Constructed positives across splits (rows in final.csv iterated: {len(df)})")
    for k in ["train", "val", "test"]:
        print(f"[i] positives[{k}] = {len(positives_by_split[k])}")

    # # OLD: negatives (kept commented)
    # negatives = []
    # print(f"[i] Sampled {len(negatives)} negatives (neg_per_positive not used)")

    # Emit split-specific pair lists
    for split in ["train", "val", "test"]:
        out_tsv = out_dir / f"pair_list_{split}.tsv"
        with open(out_tsv, "w") as f:
            for binder_path, glm_path, label in positives_by_split[split]:  # + negatives if you add later
                f.write(f"{binder_path}\t{glm_path}\t{label}\n")
        print(f"[i] Wrote {len(positives_by_split[split])} examples to {out_tsv}")

    print("✅ Done. Cluster-aware splits ready.")

if __name__ == "__main__":
    main()
dpacman/classifier/model_tmp/compress_embeddings.py
ADDED
@@ -0,0 +1,54 @@
# compress_embeddings.py
# USAGE: python compress_embeddings.py --input_glob "/path/to/esm_embeddings/*.npy" --output_dir "/path/to/compressed_embeddings" --esm_dim 1280 --out_dim 256
# --------------
import os
import glob
import numpy as np
import torch
from torch import nn

class EmbeddingCompressor(nn.Module):
    def __init__(self, input_dim: int = 1280, output_dim: int = 256):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (batch, L, input_dim) or (L, input_dim)
        returns: (batch, output_dim) or (output_dim,)
        """
        if x.dim() == 2:
            # single example: mean over tokens
            x = x.mean(dim=0, keepdim=True)  # → (1, input_dim)
        else:
            # batch: mean over tokens
            x = x.mean(dim=1)  # → (batch, input_dim)
        return self.fc(x)  # → (batch, output_dim)

def compress_file(in_path: str, out_path: str, model: EmbeddingCompressor):
    arr = np.load(in_path)  # shape (L, D) or (batch, L, D)
    tensor = torch.from_numpy(arr).float()
    with torch.no_grad():
        compressed = model(tensor)  # → (batch, 256)
    out = compressed.cpu().numpy()
    np.save(out_path, out)
    print(f"Saved {out_path}")

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Compress ESM embeddings to 256d")
    parser.add_argument("--input_glob", type=str, required=True,
                        help="Glob for your .npy ESM embeddings (e.g. data/esm_*.npy)")
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--esm_dim", type=int, default=1280)
    parser.add_argument("--out_dim", type=int, default=256)
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)
    compressor = EmbeddingCompressor(args.esm_dim, args.out_dim)
    compressor.eval()

    for fn in glob.glob(args.input_glob):
        base = os.path.basename(fn).replace(".npy", "_256.npy")
        out_path = os.path.join(args.output_dir, base)
        compress_file(fn, out_path, compressor)
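Note that, as written, the compressor's linear projection is randomly initialized on each run, so different runs project embeddings differently. A hedged sketch of persisting one set of weights so every file is compressed with the same projection; the checkpoint file name is illustrative, not part of the repo.

import torch
# Save the projection once so later runs reuse identical weights (path is illustrative).
torch.save(compressor.state_dict(), "embedding_compressor_256.pt")
# Later / elsewhere:
compressor.load_state_dict(torch.load("embedding_compressor_256.pt", map_location="cpu"))
compressor.eval()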
dpacman/{data_tasks/embeddings → classifier/model_tmp}/compute_embeddings.py
RENAMED
@@ -14,7 +14,6 @@ Usage example (DNA + protein in one go):
     --out-dir ../data_files/processed/tfclust/hg38_tf/embeddings \
     --device cuda
 """
-
 import os
 import re
 import argparse
@@ -29,7 +28,6 @@ import time
 
 # ---- model wrappers ----
 
-
 class CaduceusEmbedder:
     def __init__(self, device, chunk_size=131_072, overlap=0):
         """
@@ -41,54 +39,47 @@ class CaduceusEmbedder:
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_name, trust_remote_code=True
         )
-        self.model = (
-        )
-        self.device = device
+        self.model = AutoModel.from_pretrained(
+            model_name, trust_remote_code=True
+        ).to(device).eval()
+        self.device = device
         self.chunk_size = chunk_size
-        self.step
+        self.step = chunk_size - overlap
 
     def embed(self, seqs):
         """
         seqs: List[str] of DNA sequences (each <= chunk_size for this test)
         returns: np.ndarray of shape (N, L, D), raw per‐token embeddings
         """
+        # outputs = []
+        # for seq in seqs:
+        #     # --- new: raw per‐token embeddings in one shot ---
+        #     toks = self.tokenizer(
+        #         seq,
+        #         return_tensors="pt",
+        #         padding=False,
+        #         truncation=True,
+        #         max_length=self.chunk_size
+        #     ).to(self.device)
+        #     with torch.no_grad():
+        #         out = self.model(**toks).last_hidden_state  # (1, L, D)
+        #     outputs.append(out.cpu().numpy()[0])  # (L, D)
+        #
+        # return np.stack(outputs, axis=0)  # (N, L, D)
         outputs = []
         for seq in seqs:
-            # --- old windowing + mean-pooling logic, now commented out ---
-            # window_vecs = []
-            # for i in range(0, len(seq), self.step):
-            #     chunk = seq[i : i + self.chunk_size]
-            #     if not chunk:
-            #         break
-            #     toks = self.tokenizer(
-            #         chunk,
-            #         return_tensors="pt",
-            #         padding=False,
-            #         truncation=True,
-            #         max_length=self.chunk_size
-            #     ).to(self.device)
-            #     with torch.no_grad():
-            #         out = self.model(**toks).last_hidden_state
-            #     window_vecs.append(out.mean(dim=1).squeeze(0).cpu())
-            # seq_emb = torch.stack(window_vecs, dim=0).mean(dim=0).numpy()
-            # outputs.append(seq_emb)
-
-            # --- new: raw per‐token embeddings in one shot ---
             toks = self.tokenizer(
                 seq,
                 return_tensors="pt",
                 padding=False,
                 truncation=True,
                 max_length=self.chunk_size
             ).to(self.device)
             with torch.no_grad():
                 out = self.model(**toks).last_hidden_state  # (1, L, D)
-            outputs.append(out.cpu().numpy()[0])
+            outputs.append(out.cpu().numpy()[0])  # (L, D)
+        return outputs  # list of variable-length (L_i, D) arrays
 
-        return np.stack(outputs, axis=0)  # (N, L, D)
 
     def benchmark(self, lengths=None):
         """
@@ -110,29 +101,79 @@ class CaduceusEmbedder:
             t1 = time.perf_counter()
             print(f"  length={sz:6,d}   time={(t1-t0)*1000:7.1f} ms")
 
+class SegmentNTEmbedder:
+    def __init__(self, device):
+        self.tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/segment_nt", trust_remote_code=True)
+        self.model = AutoModel.from_pretrained("InstaDeepAI/segment_nt", trust_remote_code=True).to(device).eval()
+        self.device = device
+
+    def _adjust_length(self, input_ids):
+        bs, L = input_ids.shape
+        excl = L - 1
+        remainder = (excl) % 4
+        if remainder != 0:
+            pad_needed = 4 - remainder
+            pad_tensor = torch.full((bs, pad_needed), self.tokenizer.pad_token_id, dtype=input_ids.dtype, device=input_ids.device)
+            input_ids = torch.cat([input_ids, pad_tensor], dim=1)
+        return input_ids
+
+    def embed(self, seqs, batch_size=16):
+        """
+        seqs: List[str]
+        Returns: np.ndarray of shape (N, D)
+        """
+        all_embeddings = []
+        for i in range(0, len(seqs), batch_size):
+            batch_seqs = seqs[i : i + batch_size]
+            encoded = self.tokenizer.batch_encode_plus(
+                batch_seqs,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+            )
+            input_ids = encoded["input_ids"].to(self.device)  # (B, L)
+            attention_mask = input_ids != self.tokenizer.pad_token_id
+
+            input_ids = self._adjust_length(input_ids)
+            attention_mask = (input_ids != self.tokenizer.pad_token_id)
+
+            with torch.no_grad():
+                outs = self.model(
+                    input_ids,
+                    attention_mask=attention_mask,
+                    output_hidden_states=True,
+                    return_dict=True,
+                )
+            if hasattr(outs, "hidden_states") and outs.hidden_states is not None:
+                last_hidden = outs.hidden_states[-1]  # (B, L, D)
+            else:
+                last_hidden = outs.last_hidden_state  # fallback
+
+            # Exclude CLS token if present (assume first token) and pool
+            pooled = last_hidden[:, 1:, :].mean(dim=1)  # (B, D)
+            all_embeddings.append(pooled.cpu().numpy())
+
+            # release fragmentation
+            torch.cuda.empty_cache()
+
+        return np.vstack(all_embeddings)  # (N, D)
+
 
 class DNABertEmbedder:
     def __init__(self, device):
-        self.tokenizer = AutoTokenizer.from_pretrained(
-        self.model = AutoModel.from_pretrained(
-            "zhihan1996/DNA_bert_6", trust_remote_code=True
-        ).to(device)
-        self.device = device
+        self.tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
+        self.model = AutoModel.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True).to(device)
+        self.device = device
 
     def embed(self, seqs):
         embs = []
         for s in seqs:
-            tokens = self.tokenizer(s, return_tensors="pt", padding=True)[
-                "input_ids"
-            ].to(self.device)
+            tokens = self.tokenizer(s, return_tensors="pt", padding=True)["input_ids"].to(self.device)
             with torch.no_grad():
                 out = self.model(tokens).last_hidden_state.mean(1)
             embs.append(out.cpu().numpy())
         return np.vstack(embs)
 
-
 class NucleotideTransformerEmbedder:
     def __init__(self, device):
         # HF “feature-extraction” returns a list of (L, D) arrays for each input
@@ -140,9 +181,7 @@ class NucleotideTransformerEmbedder:
         self.pipe = pipeline(
             "feature-extraction",
             model="InstaDeepAI/nucleotide-transformer-500m-1000g",
-            device=
-                -1 if device == "cpu" else 0
-            ),  # HF uses -1 for CPU, 0 for GPU
+            device=-1 if device == "cpu" else 0,  # HF uses -1 for CPU, 0 for GPU
         )
 
     def embed(self, seqs):
@@ -152,35 +191,131 @@ class NucleotideTransformerEmbedder:
         """
         all_embeddings = self.pipe(seqs, truncation=True, padding=True)
         # all_embeddings is a List of shape (L, D) arrays
-        pooled = [np.mean(x, axis=0) for x in all_embeddings]
-        return np.vstack(pooled)
+        pooled = [ np.mean(x, axis=0) for x in all_embeddings ]
+        return np.vstack(pooled)
+
+# class ESMEmbedder:
+#     def __init__(self, device):
+#         self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
+#         self.batch_converter = self.alphabet.get_batch_converter()
+#         self.model.to(device).eval()
+#         self.device = device
+
+#     def embed(self, seqs):
+#         batch = [(str(i), seq) for i, seq in enumerate(seqs)]
+#         _, _, toks = self.batch_converter(batch)
+#         toks = toks.to(self.device)
+#         with torch.no_grad():
+#             results = self.model(toks, repr_layers=[33], return_contacts=False)
+#         reps = results["representations"][33]
+#         return reps[:, 1:-1].mean(1).cpu().numpy()
 
 
 class ESMEmbedder:
-    def __init__(self, device):
+    def __init__(self, device, model_name="esm2_t33_650M_UR50D"):
+        # Try to load the specified ESM-2 model; fallback to esm1b if missing
+        self.device = device
+        try:
+            self.model, self.alphabet = getattr(esm.pretrained, model_name)()
+            self.is_esm2 = model_name.lower().startswith("esm2")
+        except AttributeError:
+            # fallback to ESM-1b
+            self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
+            self.is_esm2 = False
         self.batch_converter = self.alphabet.get_batch_converter()
         self.model.to(device).eval()
+        # determine max length: esm2 models vary; use default 1024 for esm1b
+        self.max_len = 4096 if self.is_esm2 else 1024  # adjust if your esm2 variant has explicit limit
+        # for chunking: reserve 2 tokens if model uses BOS/EOS
+        self.chunk_size = self.max_len - 2
+        self.overlap = self.chunk_size // 4  # 25% overlap to smooth boundaries
 
-    def
+    def _chunk_sequence(self, seq):
+        """
+        Return list of possibly overlapping chunks of seq, each <= chunk_size.
+        """
+        if len(seq) <= self.chunk_size:
+            return [seq]
+        step = self.chunk_size - self.overlap
+        chunks = []
+        for i in range(0, len(seq), step):
+            chunk = seq[i : i + self.chunk_size]
+            if not chunk:
+                break
+            chunks.append(chunk)
+        return chunks
 
+    def embed(self, seqs):
+        """
+        seqs: List[str] of protein sequences.
+        Returns: np.ndarray of shape (N, D) pooled per-sequence embeddings.
+        """
+        all_embeddings = []
+        for i, seq in enumerate(seqs):
+            chunks = self._chunk_sequence(seq)
+            chunk_vecs = []
+            # process chunks in batch if small number, else sequentially
+            for chunk in chunks:
+                batch = [(str(i), chunk)]
+                _, _, toks = self.batch_converter(batch)
+                toks = toks.to(self.device)
+                with torch.no_grad():
+                    results = self.model(toks, repr_layers=[33], return_contacts=False)
+                reps = results["representations"][33]  # (1, L, D)
+                # remove BOS/EOS if present: take 1:-1 if length permits
+                if reps.size(1) > 2:
+                    rep = reps[:, 1:-1].mean(1)  # (1, D)
+                else:
+                    rep = reps.mean(1)  # fallback
+                chunk_vecs.append(rep.squeeze(0))  # (D,)
+            if len(chunk_vecs) == 1:
+                seq_vec = chunk_vecs[0]
+            else:
+                # average chunk vectors
+                stacked = torch.stack(chunk_vecs, dim=0)  # (num_chunks, D)
+                seq_vec = stacked.mean(0)
+            all_embeddings.append(seq_vec.cpu().numpy())
+        return np.vstack(all_embeddings)  # (N, D)
+
+
+# class ESMDBPEmbedder:
+#     def __init__(self, device):
+#         base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
+#         model_path = (
+#             Path(__file__).resolve().parent.parent
+#             / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
+#         )
+#         checkpoint = torch.load(model_path, map_location="cpu")
+#         clean_sd = {}
+#         for k, v in checkpoint.items():
+#             clean_sd[k.replace("module.", "")] = v
+#         result = base_model.load_state_dict(clean_sd, strict=False)
+#         if result.missing_keys:
+#             print(f"[ESMDBP] missing keys: {result.missing_keys}")
+#         if result.unexpected_keys:
+#             print(f"[ESMDBP] unexpected keys: {result.unexpected_keys}")

+#         self.model = base_model.to(device).eval()
+#         self.alphabet = alphabet
+#         self.batch_converter = alphabet.get_batch_converter()
+#         self.device = device

+#     def embed(self, seqs):
+#         batch = [(str(i), seq) for i, seq in enumerate(seqs)]
+#         _, _, toks = self.batch_converter(batch)
+#         toks = toks.to(self.device)
+#         with torch.no_grad():
+#             out = self.model(toks, repr_layers=[33], return_contacts=False)
+#         reps = out["representations"][33]
+#         # skip start/end tokens
+#         return reps[:, 1:-1].mean(1).cpu().numpy()
 
 class ESMDBPEmbedder:
     def __init__(self, device):
         base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
         model_path = (
             Path(__file__).resolve().parent.parent
-            / "pretrained"
-            / "ESM-DBP"
-            / "ESM-DBP.model"
+            / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
         )
         checkpoint = torch.load(model_path, map_location="cpu")
         clean_sd = {}
@@ -196,17 +331,46 @@ class ESMDBPEmbedder:
         self.alphabet = alphabet
         self.batch_converter = alphabet.get_batch_converter()
         self.device = device
 
     def embed(self, seqs):
 
 class GPNEmbedder:
     def __init__(self, device):
@@ -219,14 +383,16 @@ class GPNEmbedder:
 
     def embed(self, seqs):
         inputs = self.tokenizer(
-            seqs,
         ).to(self.device)
 
         with torch.no_grad():
            last_hidden = self.model(**inputs).last_hidden_state
         return last_hidden.mean(dim=1).cpu().numpy()
 
-
 class ProGenEmbedder:
     def __init__(self, device):
         model_name = "jinyuan22/ProGen2-base"
@@ -236,102 +402,137 @@ class ProGenEmbedder:
 
     def embed(self, seqs):
         inputs = self.tokenizer(
-            seqs,
         ).to(self.device)
         with torch.no_grad():
             last_hidden = self.model(**inputs).last_hidden_state
         return last_hidden.mean(dim=1).cpu().numpy()
 
-
 # ---- main pipeline ----
 
-
 def get_embedder(name, device, for_dna=True):
     name = name.lower()
     if for_dna:
-        if name
-        if name
-        if name
-            return NucleotideTransformerEmbedder(device)
-        if name == "gpn":
-            return GPNEmbedder(device)
     else:
-        if name in ("esm",):
-        if name
-            return ESMDBPEmbedder(device)
-        if name == "progen":
-            return ProGenEmbedder(device)
     raise ValueError(f"Unknown model {name} (for_dna={for_dna})")
 
 
 def embed_and_save(seqs, ids, embedder, out_path):
     embs = embedder.embed(seqs)
-
 
 
-if __name__
     p = argparse.ArgumentParser()
-    p.add_argument(
-    )
-    p.add_argument(
-        help="if set, skip the chromosome embedding step",
-    )  # if glm embeddings successful but not plm embeddings
-    p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
-    p.add_argument("--chrom-model", default="caduceus")
-    p.add_argument("--tf-model", default="esm-dbp")
-    p.add_argument(
-        "--out-dir", default="data_files/processed/tfclust/hg38_tf/embeddings"
-    )
-    p.add_argument("--device", default="cpu")
     args = p.parse_args()
 
     os.makedirs(args.out_dir, exist_ok=True)
     device = args.device
 
     if not args.skip_dna:
-        (
     else:
 
-        ####################
-        chrom_embedder = get_embedder(args.chrom_model, device, for_dna=True)
-        out_chrom = Path(args.out_dir) / f"chrom_{args.chrom_model}.npy"
-        embed_and_save(chrom_seqs, chrom_ids, chrom_embedder, out_chrom)
 
-        #
     tf_seqs, tf_ids = [], []
     for record in SeqIO.parse(args.tf_fasta, "fasta"):
         tf_ids.append(record.id)
@@ -342,4 +543,4 @@ if __name__ == "__main__":
     out_tf = Path(args.out_dir) / f"tf_{args.tf_model}.npy"
     embed_and_save(tf_seqs, tf_ids, tf_embedder, out_tf)
 
-    print("Done.")
| 320 |
checkpoint = torch.load(model_path, map_location="cpu")
|
| 321 |
clean_sd = {}
|
|
|
|
| 331 |
self.alphabet = alphabet
|
| 332 |
self.batch_converter = alphabet.get_batch_converter()
|
| 333 |
self.device = device
|
| 334 |
+
self.max_len = 1024 # same limit as esm1b
|
| 335 |
+
self.chunk_size = self.max_len - 2
|
| 336 |
+
self.overlap = self.chunk_size // 4
|
| 337 |
+
|
| 338 |
+
def _chunk_sequence(self, seq):
|
| 339 |
+
if len(seq) <= self.chunk_size:
|
| 340 |
+
return [seq]
|
| 341 |
+
step = self.chunk_size - self.overlap
|
| 342 |
+
chunks = []
|
| 343 |
+
for i in range(0, len(seq), step):
|
| 344 |
+
chunk = seq[i : i + self.chunk_size]
|
| 345 |
+
if not chunk:
|
| 346 |
+
break
|
| 347 |
+
chunks.append(chunk)
|
| 348 |
+
return chunks
|
| 349 |
|
| 350 |
def embed(self, seqs):
|
| 351 |
+
all_embeddings = []
|
| 352 |
+
for i, seq in enumerate(seqs):
|
| 353 |
+
chunks = self._chunk_sequence(seq)
|
| 354 |
+
chunk_vecs = []
|
| 355 |
+
for chunk in chunks:
|
| 356 |
+
batch = [(str(i), chunk)]
|
| 357 |
+
_, _, toks = self.batch_converter(batch)
|
| 358 |
+
toks = toks.to(self.device)
|
| 359 |
+
with torch.no_grad():
|
| 360 |
+
out = self.model(toks, repr_layers=[33], return_contacts=False)
|
| 361 |
+
reps = out["representations"][33]
|
| 362 |
+
if reps.size(1) > 2:
|
| 363 |
+
rep = reps[:, 1:-1].mean(1)
|
| 364 |
+
else:
|
| 365 |
+
rep = reps.mean(1)
|
| 366 |
+
chunk_vecs.append(rep.squeeze(0))
|
| 367 |
+
if len(chunk_vecs) == 1:
|
| 368 |
+
seq_vec = chunk_vecs[0]
|
| 369 |
+
else:
|
| 370 |
+
stacked = torch.stack(chunk_vecs, dim=0)
|
| 371 |
+
seq_vec = stacked.mean(0)
|
| 372 |
+
all_embeddings.append(seq_vec.cpu().numpy())
|
| 373 |
+
return np.vstack(all_embeddings)
|
| 374 |
|
| 375 |
class GPNEmbedder:
|
| 376 |
def __init__(self, device):
|
|
|
|
| 383 |
|
| 384 |
def embed(self, seqs):
|
| 385 |
inputs = self.tokenizer(
|
| 386 |
+
seqs,
|
| 387 |
+
return_tensors="pt",
|
| 388 |
+
padding=True,
|
| 389 |
+
truncation=True
|
| 390 |
).to(self.device)
|
| 391 |
|
| 392 |
with torch.no_grad():
|
| 393 |
last_hidden = self.model(**inputs).last_hidden_state
|
| 394 |
return last_hidden.mean(dim=1).cpu().numpy()
|
| 395 |
|
|
|
|
| 396 |
class ProGenEmbedder:
|
| 397 |
def __init__(self, device):
|
| 398 |
model_name = "jinyuan22/ProGen2-base"
|
|
|
|
| 402 |
|
| 403 |
def embed(self, seqs):
|
| 404 |
inputs = self.tokenizer(
|
| 405 |
+
seqs,
|
| 406 |
+
return_tensors="pt",
|
| 407 |
+
padding=True,
|
| 408 |
+
truncation=True
|
| 409 |
).to(self.device)
|
| 410 |
with torch.no_grad():
|
| 411 |
last_hidden = self.model(**inputs).last_hidden_state
|
| 412 |
return last_hidden.mean(dim=1).cpu().numpy()
|
| 413 |
|
|
|
|
| 414 |
# ---- main pipeline ----
|
| 415 |
|
|
|
|
| 416 |
def get_embedder(name, device, for_dna=True):
|
| 417 |
name = name.lower()
|
| 418 |
if for_dna:
|
| 419 |
+
if name=="caduceus": return CaduceusEmbedder(device)
|
| 420 |
+
if name=="dnabert": return DNABertEmbedder(device)
|
| 421 |
+
if name=="nucleotide": return NucleotideTransformerEmbedder(device)
|
| 422 |
+
if name=="gpn": return GPNEmbedder(device)
|
| 423 |
+
if name=="segmentnt": return SegmentNTEmbedder(device)
|
|
|
|
|
|
|
|
|
|
| 424 |
else:
|
| 425 |
+
if name in ("esm",): return ESMEmbedder(device)
|
| 426 |
+
if name in ("esm-dbp","esm_dbp"): return ESMDBPEmbedder(device)
|
| 427 |
+
if name=="progen": return ProGenEmbedder(device)
|
|
|
|
|
|
|
|
|
|
| 428 |
raise ValueError(f"Unknown model {name} (for_dna={for_dna})")


def pad_token_embeddings(list_of_arrays, pad_value=0.0):
    """
    list_of_arrays: list of (L_i, D) numpy arrays
    Returns:
        padded: (N, L_max, D) array
        mask: (N, L_max) boolean array where True = real token, False = padding
    """
    N = len(list_of_arrays)
    D = list_of_arrays[0].shape[1]
    L_max = max(arr.shape[0] for arr in list_of_arrays)
    padded = np.full((N, L_max, D), pad_value, dtype=list_of_arrays[0].dtype)
    mask = np.zeros((N, L_max), dtype=bool)
    for i, arr in enumerate(list_of_arrays):
        L = arr.shape[0]
        padded[i, :L] = arr
        mask[i, :L] = True
    return padded, mask
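# Padding contract sketch (illustrative only): two ragged token matrices of width D=4
# become one (N, L_max, D) array plus a boolean mask that marks the real tokens.
#
# a = np.ones((3, 4), dtype=np.float32)
# b = np.ones((5, 4), dtype=np.float32)
# padded, mask = pad_token_embeddings([a, b])
# padded.shape == (2, 5, 4); mask[0].sum() == 3; mask[1].sum() == 5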

def embed_and_save(seqs, ids, embedder, out_path):
    embs = embedder.embed(seqs)

    # Decide whether we got variable-length per-token outputs (list of (L, D))
    is_variable_token = isinstance(embs, (list, tuple)) and len(embs) > 0 and hasattr(embs[0], "shape") and embs[0].ndim == 2

    if is_variable_token:
        # pad to (N, L_max, D) + mask
        padded, mask = pad_token_embeddings(embs)
        # Save both embeddings and mask together in an .npz for convenience
        np.savez_compressed(out_path.with_suffix(".caduceus.npz"),
                            embeddings=padded,
                            mask=mask,
                            ids=np.array(ids, dtype=object))
    else:
        # fixed-shape output, e.g. pooled (N, D)
        array = np.vstack(embs) if isinstance(embs, list) else embs
        np.save(out_path, array)
        with open(out_path.with_suffix(".ids"), "w") as f:
            f.write("\n".join(ids))


if __name__ == "__main__":

    p = argparse.ArgumentParser()
    p.add_argument("--peak-fasta", default="binding_peaks_unique.fa", help="FASTA of deduplicated binding peak sequences; if present this is used for DNA embedding instead of genome JSONs")
    p.add_argument("--genome-json-dir", default=None, help="(fallback) directory of UCSC JSONs for full chromosome embedding if peak FASTA is missing or you explicitly want chromosomes")
    p.add_argument("--skip-dna", action="store_true", help="if set, skip the chromosome embedding step")  # if glm embeddings successful but not plm embeddings
    p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
    p.add_argument("--chrom-model", default="caduceus")
    p.add_argument("--tf-model", default="esm-dbp")
    p.add_argument("--out-dir", default="data_files/processed/tfclust/hg38_tf/embeddings")
    p.add_argument("--device", default="cpu")
    args = p.parse_args()

    os.makedirs(args.out_dir, exist_ok=True)
    device = args.device

    if not args.skip_dna:
        peak_fasta = Path(args.peak_fasta)
        if peak_fasta.exists():
            # Load peak sequences from FASTA
            from Bio import SeqIO

            peak_seqs = []
            peak_ids = []
            for rec in SeqIO.parse(peak_fasta, "fasta"):
                peak_ids.append(rec.id)
                peak_seqs.append(str(rec.seq))
            print(f"Embedding {len(peak_seqs)} binding peak sequences from {peak_fasta}", flush=True)
            dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
            out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
            embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
        elif args.genome_json_dir:
            # Legacy: load full chromosomes from JSONs (chr1–22, X, Y, M)
            genome_dir = Path(args.genome_json_dir)
            chrom_seqs, chrom_ids = [], []
            primary_pattern = re.compile(r"^hg38_chr(?:[1-9]|1[0-9]|2[0-2]|X|Y|M)\.json$")
            for j in sorted(genome_dir.iterdir()):
                if not primary_pattern.match(j.name):
                    continue
                data = json.loads(j.read_text())
                seq = data.get("dna") or data.get("sequence")
                chrom = data.get("chrom") or j.stem.split("_")[-1]
                chrom_seqs.append(seq)
                chrom_ids.append(chrom)
            cutoff = CaduceusEmbedder(device).chunk_size
            long_chroms = [
                (chrom, len(seq))
                for chrom, seq in zip(chrom_ids, chrom_seqs)
                if len(seq) > cutoff
            ]
            if long_chroms:
                print("⚠️ Chromosomes exceeding Caduceus max tokens ({}):".format(cutoff))
                for chrom, L in long_chroms:
                    print(f"  {chrom}: {L} bases")
            else:
                print("All chromosomes ≤ Caduceus limit ({}).".format(cutoff))

            chrom_embedder = get_embedder(args.chrom_model, device, for_dna=True)
            out_chrom = Path(args.out_dir) / f"chrom_{args.chrom_model}.npy"
            embed_and_save(chrom_seqs, chrom_ids, chrom_embedder, out_chrom)
        else:
            raise ValueError("No input for DNA embedding: provide a peak FASTA (default binding_peaks_unique.fa) or set --genome-json-dir for chromosome JSONs.")

    # Load TF sequences
    tf_seqs, tf_ids = [], []
    for record in SeqIO.parse(args.tf_fasta, "fasta"):
        tf_ids.append(record.id)
        # … (lines 539-542 are unchanged context not shown in this diff view)

    out_tf = Path(args.out_dir) / f"tf_{args.tf_model}.npy"
    embed_and_save(tf_seqs, tf_ids, tf_embedder, out_tf)

    print("Done.")
|
dpacman/classifier/model_tmp/extract_tf_symbols.py
ADDED
|
@@ -0,0 +1,27 @@
|
#!/usr/bin/env python3
import pandas as pd
from pathlib import Path

FINAL_CSV = Path("/home/a03-akrishna/DPACMAN/data_files/processed/final.csv")
OUT_SYMBOLS = Path("tf_symbols.txt")

def normalize_tf(tf_id: str) -> str:
    return tf_id.split("_seq")[0].upper()

def main():
    df = pd.read_csv(FINAL_CSV, dtype=str)
    if "TF_id" not in df.columns:
        raise RuntimeError("final.csv missing TF_id column")
    tf_raw = df["TF_id"].dropna().unique().tolist()
    normalized = sorted({normalize_tf(t) for t in tf_raw})
    print(f"Unique raw TF_id count: {len(tf_raw)}")
    print(f"Unique normalized TF symbols: {len(normalized)}")
    with open(OUT_SYMBOLS, "w") as f:
        for s in normalized:
            f.write(s + "\n")
    print(f"Wrote normalized TF symbols to {OUT_SYMBOLS}")
    # Optional: show sample
    print("Sample symbols:", normalized[:50])

if __name__ == "__main__":
    main()
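A quick check of the normalization rule above (hypothetical inputs, assuming the module is importable as shown): the helper strips any `_seq…` suffix and upper-cases the rest.

from extract_tf_symbols import normalize_tf

assert normalize_tf("Zbtb5_seq1") == "ZBTB5"
assert normalize_tf("CTCF") == "CTCF"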
|
dpacman/classifier/model_tmp/make_pair_list.py
ADDED
|
@@ -0,0 +1,220 @@
|
#!/usr/bin/env python3
import argparse
import numpy as np
import pandas as pd
from pathlib import Path
import random
import sys

def read_ids_file(p):
    p = Path(p)
    if not p.exists():
        raise FileNotFoundError(f"IDs file not found: {p}")
    return [line.strip() for line in p.open() if line.strip()]

def split_embeddings(emb_path, ids_path, out_dir, prefix):
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    if not Path(emb_path).exists():
        raise FileNotFoundError(f"Embedding file not found: {emb_path}")
    if not Path(ids_path).exists():
        raise FileNotFoundError(f"IDs file not found: {ids_path}")

    if emb_path.endswith(".npz"):
        data = np.load(emb_path, allow_pickle=True)
        if "embeddings" in data:
            emb = data["embeddings"]
        else:
            raise ValueError(f"{emb_path} missing 'embeddings' key")
    else:
        emb = np.load(emb_path)

    ids = read_ids_file(ids_path)
    if len(ids) != emb.shape[0]:
        print(f"[WARN] length mismatch: {len(ids)} ids vs {emb.shape[0]} embeddings in {emb_path}", file=sys.stderr)

    mapping = {}
    for i, ident in enumerate(ids):
        if i >= emb.shape[0]:
            print(f"[WARN] skipping {ident}: no embedding at index {i}", file=sys.stderr)
            continue
        arr = emb[i]
        out_file = out_dir / f"{prefix}_{ident}.npy"
        np.save(out_file, arr)
        mapping[ident] = str(out_file)
    return mapping

def extract_symbol_from_tf_id(full_id: str) -> str:
    """
    Given a TF embedding ID like 'sp|O15062|ZBTB5_HUMAN' or 'ZBTB5_HUMAN',
    return the gene symbol uppercase (e.g., 'ZBTB5').
    """
    if "|" in full_id:
        try:
            # format sp|Accession|SYMBOL_HUMAN
            genepart = full_id.split("|")[2]
        except IndexError:
            genepart = full_id
    else:
        genepart = full_id
    symbol = genepart.split("_")[0]
    return symbol.upper()

def build_tf_symbol_map(tf_map):
    """
    Build mapping gene_symbol -> list of embedding paths.
    """
    symbol_map = {}
    for full_id, path in tf_map.items():
        symbol = extract_symbol_from_tf_id(full_id)
        symbol_map.setdefault(symbol, []).append(path)
    return symbol_map

def tf_key_from_path(path: str) -> str:
    """
    Given a path like .../tf_sp|O15062|ZBTB5_HUMAN.npy, extract the normalized symbol 'ZBTB5'.
    """
    stem = Path(path).stem  # e.g., tf_sp|O15062|ZBTB5_HUMAN
    # remove the leading prefix if present (tf_)
    if "_" in stem:
        _, rest = stem.split("_", 1)
    else:
        rest = stem
    return extract_symbol_from_tf_id(rest)

def dna_key_from_path(path: str) -> str:
    """
    Given .../dna_peak42.npy -> 'peak42'
    """
    stem = Path(path).stem
    if "_" in stem:
        _, rest = stem.split("_", 1)
    else:
        rest = stem
    return rest

def main():
    parser = argparse.ArgumentParser(
        description="Build TF-DNA pair list from final.csv with gene-symbol normalization for TFs."
    )
    parser.add_argument("--final_csv", required=True, help="final.csv with TF_id and dna_sequence")
    parser.add_argument("--dna_embed_npz", required=True, help="DNA embedding file (.npy or .npz)")
    parser.add_argument("--dna_ids", required=True, help="IDs file for DNA embeddings (e.g., peak*.ids)")
    parser.add_argument("--tf_embed_npy", required=True, help="TF embedding file (.npy or .npz)")
    parser.add_argument("--tf_ids", required=True, help="IDs file for TF embeddings (e.g., sp|...|... ids)")
    parser.add_argument("--out_dir", required=True, help="Output directory")
    parser.add_argument("--neg_per_positive", type=int, default=2, help="Negatives per positive (half same-TF, half same-DNA)")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    random.seed(args.seed)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load final.csv
    df = pd.read_csv(args.final_csv, dtype=str)
    if "TF_id" not in df.columns or "dna_sequence" not in df.columns:
        raise RuntimeError("final.csv must have columns TF_id and dna_sequence")

    # Assign dna_id (unique per dna_sequence)
    unique_seqs = df["dna_sequence"].drop_duplicates().tolist()
    seq_to_id = {seq: f"peak{i}" for i, seq in enumerate(unique_seqs)}
    df["dna_id"] = df["dna_sequence"].map(seq_to_id)
    enriched_csv = out_dir / "final_with_dna_id.csv"
    df.to_csv(enriched_csv, index=False)
    print(f"[i] Wrote augmented final.csv with dna_id to {enriched_csv}")

    # Split embeddings into per-item files
    print(f"[i] Splitting DNA embeddings from {args.dna_embed_npz} with ids {args.dna_ids}")
    dna_map = split_embeddings(args.dna_embed_npz, args.dna_ids, out_dir / "dna_single", "dna")
    print(f"[i] DNA embeddings available: {len(dna_map)} (sample: {list(dna_map.keys())[:10]})")
    print(f"[i] Splitting TF embeddings from {args.tf_embed_npy} with ids {args.tf_ids}")
    tf_map = split_embeddings(args.tf_embed_npy, args.tf_ids, out_dir / "tf_single", "tf")
    print(f"[i] TF embeddings available: {len(tf_map)} (sample: {list(tf_map.keys())[:10]})")

    # Build gene-symbol normalized map
    tf_symbol_map = build_tf_symbol_map(tf_map)
    print(f"[i] TF symbol map keys (sample): {list(tf_symbol_map.keys())[:30]}")

    # Diagnostic overlaps
    norm_tf_in_final = set(t.split("_seq")[0].upper() for t in df["TF_id"].unique())
    available_tf_symbols = set(tf_symbol_map.keys())
    intersect_tf = norm_tf_in_final & available_tf_symbols
    print(f"[i] Unique normalized TF symbols in final.csv: {len(norm_tf_in_final)}")
    print(f"[i] Available TF embedding symbols: {len(available_tf_symbols)}")
    print(f"[i] Intersection count: {len(intersect_tf)}")
    if len(intersect_tf) == 0:
        print("[ERROR] No overlap between normalized TF_id and TF embedding symbols.", file=sys.stderr)
        print("Sample normalized TFs from final.csv:", sorted(list(norm_tf_in_final))[:30], file=sys.stderr)
        print("Sample available TF symbols:", sorted(list(available_tf_symbols))[:30], file=sys.stderr)
        sys.exit(1)

    dna_ids_final = set(df["dna_id"].unique())
    available_dna_ids = set(dna_map.keys())
    intersect_dna = dna_ids_final & available_dna_ids
    print(f"[i] Unique dna_id in final.csv: {len(dna_ids_final)}. Available DNA ids: {len(available_dna_ids)}. Intersection: {len(intersect_dna)}")
    if len(intersect_dna) == 0:
        print("[ERROR] No overlap on DNA ids.", file=sys.stderr)
        sys.exit(1)

    # Build positive pairs
    positives = []
    for _, row in df.iterrows():
        tf_raw = row["TF_id"]
        tf_symbol = tf_raw.split("_seq")[0].upper()
        dnaid = row["dna_id"]
        if tf_symbol not in tf_symbol_map:
            continue
        if dnaid not in dna_map:
            continue
        # pick the first embedding for that symbol
        tf_embedding_path = tf_symbol_map[tf_symbol][0]
        positives.append((tf_embedding_path, dna_map[dnaid], 1))
    print(f"[i] Constructed {len(positives)} positive pairs after TF symbol resolution")

    if len(positives) == 0:
        print("[ERROR] No positive pairs could be constructed; aborting.", file=sys.stderr)
        sys.exit(1)

    # Build negative samples
    all_tf_symbols = sorted(tf_symbol_map.keys())
    all_dnaids = sorted(dna_map.keys())
    positive_set = set()
    for tf_path, dna_path, _ in positives:
        tf_key = tf_key_from_path(tf_path)
        dna_key = dna_key_from_path(dna_path)
        positive_set.add((tf_key, dna_key))

    negatives = []
    half = args.neg_per_positive // 2
    for tf_path, dna_path, _ in positives:
        tf_key = tf_key_from_path(tf_path)
        dna_key = dna_key_from_path(dna_path)
        # same TF, different DNA
        for _ in range(half):
            candidate_dna = random.choice(all_dnaids)
            if candidate_dna == dna_key or (tf_key, candidate_dna) in positive_set:
                continue
            negatives.append((tf_path, dna_map[candidate_dna], 0))
        # same DNA, different TF
        for _ in range(half):
            candidate_tf_symbol = random.choice(all_tf_symbols)
            if candidate_tf_symbol == tf_key or (candidate_tf_symbol, dna_key) in positive_set:
                continue
            # pick its first embedding and pair it with the current DNA path
            # (previously this used dna_map[dnaid], a stale variable from the positives loop)
            candidate_tf_path = tf_symbol_map[candidate_tf_symbol][0]
            negatives.append((candidate_tf_path, dna_path, 0))

    print(f"[i] Sampled {len(negatives)} negatives (neg_per_positive={args.neg_per_positive})")

    # Write pair list
    pair_list_path = out_dir / "pair_list.tsv"
    with open(pair_list_path, "w") as f:
        for binder_path, glm_path, label in positives + negatives:
            # binder=TF, glm=DNA
            f.write(f"{binder_path}\t{glm_path}\t{label}\n")
    print(f"[i] Wrote {len(positives)+len(negatives)} examples to {pair_list_path}")

if __name__ == "__main__":
    main()
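For reference, how the ID-normalization helpers above behave on typical inputs (hypothetical examples, assuming the module is importable):

from make_pair_list import extract_symbol_from_tf_id, tf_key_from_path, dna_key_from_path

assert extract_symbol_from_tf_id("sp|O15062|ZBTB5_HUMAN") == "ZBTB5"
assert extract_symbol_from_tf_id("ZBTB5_HUMAN") == "ZBTB5"
assert tf_key_from_path("out/tf_single/tf_sp|O15062|ZBTB5_HUMAN.npy") == "ZBTB5"
assert dna_key_from_path("out/dna_single/dna_peak42.npy") == "peak42"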
|
dpacman/classifier/model_tmp/make_peak_fasta.py
ADDED
|
@@ -0,0 +1,13 @@
|
import pandas as pd
from pathlib import Path

df = pd.read_csv("/home/a03-akrishna/DPACMAN/data_files/processed/final.csv", dtype=str)  # adjust path if needed
# get unique sequences
uniq = df[["dna_sequence"]].drop_duplicates().reset_index(drop=True)
# make headers: e.g., peak0, peak1, ...
out_fa = Path("binding_peaks_unique.fa")
with open(out_fa, "w") as f:
    for i, seq in enumerate(uniq["dna_sequence"]):
        header = f">peak{i}"
        f.write(f"{header}\n{seq}\n")
print(f"Wrote {len(uniq)} unique binding sequences to {out_fa}")
|
dpacman/classifier/model_tmp/model.py
ADDED
|
@@ -0,0 +1,103 @@
|
import torch
from torch import nn

class LocalCNN(nn.Module):
    def __init__(self, dim: int = 256, kernel_size: int = 3):
        super().__init__()
        padding = kernel_size // 2
        self.conv = nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=padding)
        self.act = nn.GELU()
        self.ln = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor):
        # x: (batch, L, dim)
        out = self.conv(x.transpose(1, 2))  # → (batch, dim, L)
        out = self.act(out)
        out = out.transpose(1, 2)           # → (batch, L, dim)
        return self.ln(out + x)             # residual

class CrossModalBlock(nn.Module):
    def __init__(self, dim: int = 256, heads: int = 8):
        super().__init__()
        # self-attention for both sides
        self.sa_binder = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.sa_glm = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.ln_b1 = nn.LayerNorm(dim)
        self.ln_g1 = nn.LayerNorm(dim)

        self.ffn_b = nn.Sequential(nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim))
        self.ffn_g = nn.Sequential(nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim))
        self.ln_b2 = nn.LayerNorm(dim)
        self.ln_g2 = nn.LayerNorm(dim)

        # cross attention (binder queries, glm keys/values)
        self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.ln_c1 = nn.LayerNorm(dim)
        self.ffn_c = nn.Sequential(nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim))
        self.ln_c2 = nn.LayerNorm(dim)

    def forward(self, binder: torch.Tensor, glm: torch.Tensor):
        """
        binder: (batch, Lb, dim)
        glm: (batch, Lg, dim) -- has passed through its local CNN beforehand
        returns: updated binder representation (batch, Lb, dim)
        """
        # binder self-attn + ffn
        b = binder
        b_sa, _ = self.sa_binder(b, b, b)
        b = self.ln_b1(b + b_sa)
        b_ff = self.ffn_b(b)
        b = self.ln_b2(b + b_ff)

        # glm self-attn + ffn
        g = glm
        g_sa, _ = self.sa_glm(g, g, g)
        g = self.ln_g1(g + g_sa)
        g_ff = self.ffn_g(g)
        g = self.ln_g2(g + g_ff)

        # cross-attention: binder queries glm
        c_sa, _ = self.cross_attn(b, g, g)
        c = self.ln_c1(b + c_sa)
        c_ff = self.ffn_c(c)
        c = self.ln_c2(c + c_ff)
        return c  # (batch, Lb, dim)

class BindPredictor(nn.Module):
    def __init__(self,
                 input_dim: int = 256,
                 hidden_dim: int = 256,
                 heads: int = 8,
                 num_layers: int = 4,
                 use_local_cnn_on_glm: bool = True):
        super().__init__()
        self.proj_binder = nn.Linear(input_dim, hidden_dim)
        self.proj_glm = nn.Linear(input_dim, hidden_dim)
        self.use_local_cnn = use_local_cnn_on_glm
        self.local_cnn = LocalCNN(hidden_dim) if use_local_cnn_on_glm else nn.Identity()

        self.layers = nn.ModuleList([
            CrossModalBlock(hidden_dim, heads) for _ in range(num_layers)
        ])

        self.ln_out = nn.LayerNorm(hidden_dim)
        self.head = nn.Sequential(
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, binder_emb, glm_emb):
        """
        binder_emb, glm_emb: (batch, L, input_dim)
        """
        b = self.proj_binder(binder_emb)   # (B, Lb, hidden_dim)
        g = self.proj_glm(glm_emb)         # (B, Lg, hidden_dim)
        if self.use_local_cnn:
            g = self.local_cnn(g)          # local context injected

        for layer in self.layers:
            b = layer(b, g)                # update binder with cross-modal info

        pooled = b.mean(dim=1)             # (B, hidden_dim)
        out = self.ln_out(pooled)
        return self.head(out).squeeze(-1)  # (B,)
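A minimal shape check for BindPredictor (random tensors, illustrative only; the batch size and sequence lengths are arbitrary):

import torch
from model import BindPredictor

model = BindPredictor(input_dim=256, hidden_dim=256, heads=8, num_layers=2)
binder = torch.randn(4, 12, 256)   # (B, Lb, input_dim) TF token embeddings
glm = torch.randn(4, 20, 256)      # (B, Lg, input_dim) DNA token embeddings
scores = model(binder, glm)
assert scores.shape == (4,)        # one sigmoid score per TF-DNA pair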
|
dpacman/classifier/model_tmp/prep_splits.py
ADDED
|
@@ -0,0 +1,133 @@
|
import numpy as np
import pandas as pd
import sys
import json
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from collections import Counter

def parse_pair_list(pair_list_path):
    binder_paths, glm_paths, labels = [], [], []
    with open(pair_list_path) as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            parts = line.strip().split()
            if len(parts) != 3:
                print(f"[WARN] skipping malformed line {lineno}: {line.strip()}", file=sys.stderr)
                continue
            b, g, l = parts
            try:
                lab = int(l)
            except ValueError:
                print(f"[WARN] invalid label on line {lineno}: {l}", file=sys.stderr)
                continue
            binder_paths.append(b)
            glm_paths.append(g)
            labels.append(lab)
    return binder_paths, glm_paths, labels

def build_tf_compressed_cache(binder_paths, target_dim=256):
    """
    Load all unique TF (binder) embeddings, fit a reduction if needed, and return a dict
    mapping path -> (L, target_dim) array.
    """
    unique_paths = sorted(set(binder_paths))
    print(f"[i] Found {len(unique_paths)} unique TF embedding files to compress.", flush=True)
    # Load all embeddings to determine dimensionality
    samples = []
    for p in unique_paths:
        arr = np.load(p)
        samples.append(arr)
    # Determine if reduction is needed: assume all have the same embedding width
    first = samples[0]
    orig_dim = first.shape[1] if first.ndim == 2 else 1
    reduction_needed = (orig_dim != target_dim)
    tf_cache = {}

    if reduction_needed:
        # Variable-length token matrices cannot be stacked directly, so fit a single
        # TruncatedSVD on the mean-pooled vectors (a PCA-like linear projection) and
        # then apply the same projection to every token vector.
        pooled = []
        for arr in samples:
            if arr.ndim == 2:
                pooled.append(arr.mean(axis=0))  # (orig_dim,)
            else:
                pooled.append(arr)  # degenerate
        pooled_mat = np.stack(pooled, axis=0)  # (N, orig_dim)
        print(f"[i] Fitting TruncatedSVD on TF pooled embeddings: {pooled_mat.shape} -> {target_dim}", flush=True)
        svd = TruncatedSVD(n_components=target_dim, random_state=42)
        reduced_pooled = svd.fit_transform(pooled_mat)  # (N, target_dim); used only to fit the SVD

        # Project token-level vectors with svd.components_.T
        # svd.components_: (target_dim, orig_dim), so the projection matrix is (orig_dim, target_dim)
        proj_mat = svd.components_.T  # (orig_dim, target_dim)
        for i, p in enumerate(unique_paths):
            arr = samples[i]  # shape (L, orig_dim)
            if arr.ndim == 1:
                arr2 = arr @ proj_mat  # (target_dim,)
            else:
                # project each token: (L, orig_dim) @ (orig_dim, target_dim) -> (L, target_dim)
                arr2 = arr @ proj_mat
            tf_cache[p] = arr2  # reduced per-token representation
        print("[i] Completed compression of TF embeddings.", flush=True)
    else:
        # already the correct dim: just cache the originals
        print(f"[i] TF embeddings already {target_dim}-dimensional; skipping reduction.", flush=True)
        for i, p in enumerate(unique_paths):
            arr = samples[i]
            tf_cache[p] = arr
    return tf_cache

def main():
    # df = pd.read_csv("../data_files/processed/fimo/ananya_aug4_2025_final.csv")

    binder_paths, glm_paths, labels = parse_pair_list("../data_files/processed/fimo/ananya_aug4_2025_pair_list.tsv")

    if len(labels) == 0:
        print("[ERROR] No valid pairs parsed. Exiting.", file=sys.stderr)
        sys.exit(1)

    label_counts = Counter(labels)
    print(f"[i] Total examples parsed: {len(labels)}. Label distribution: {label_counts}", flush=True)

    # build compressed TF cache (reduces to 256 if needed)
    # tf_compressed_cache = build_tf_compressed_cache(binder_paths, target_dim=256)

    # Combine all data into one structure for easy splitting
    data = list(zip(binder_paths, glm_paths, labels))

    # First split: train vs temp (val+test)
    train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42)

    # Second split: val vs test (50% of 20% → 10% each)
    val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

    # Unpack for dataset construction
    def unpack(data):
        binders, glms, labels = zip(*data)
        return list(binders), list(glms), list(labels)

    def save_split(binder_paths, glm_paths, labels, out_path):
        df = pd.DataFrame({
            "binder_path": binder_paths,
            "glm_path": glm_paths,
            "label": labels,
        })
        df.to_csv(out_path, index=False)

    # Unpack data for saving
    train_binders, train_glms, train_labels = unpack(train_data)
    val_binders, val_glms, val_labels = unpack(val_data)
    test_binders, test_glms, test_labels = unpack(test_data)

    # Save each split
    save_split(train_binders, train_glms, train_labels, "../data_files/splits/train.csv")
    save_split(val_binders, val_glms, val_labels, "../data_files/splits/val.csv")
    save_split(test_binders, test_glms, test_labels, "../data_files/splits/test.csv")

if __name__ == "__main__":
    main()
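The token-level projection in build_tf_compressed_cache is equivalent to TruncatedSVD.transform, since sklearn's transform is a plain matrix product with components_.T; a small check with synthetic shapes (illustrative only):

import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.default_rng(0)
pooled_mat = rng.normal(size=(300, 512))   # stand-in for mean-pooled TF vectors
tokens = rng.normal(size=(17, 512))        # one variable-length token embedding

svd = TruncatedSVD(n_components=256, random_state=42).fit(pooled_mat)
proj = tokens @ svd.components_.T          # what the script does per token matrix
assert np.allclose(proj, svd.transform(tokens))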
|
dpacman/classifier/model_tmp/train.py
ADDED
|
@@ -0,0 +1,180 @@
|
#!/usr/bin/env python3
import argparse
import numpy as np
import torch
from torch import nn
from model import BindPredictor
from pathlib import Path
from collections import Counter
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.decomposition import TruncatedSVD
import sys

from dpacman.utils.models import set_seed

def build_tf_compressed_cache(binder_paths, target_dim=256):
    """
    Load all unique TF (binder) embeddings, fit a reduction if needed, and return a dict
    mapping path -> (L, target_dim) array.
    """
    unique_paths = sorted(set(binder_paths))
    print(f"[i] Found {len(unique_paths)} unique TF embedding files to compress.", flush=True)
    # Load all embeddings to determine dimensionality
    samples = []
    for p in unique_paths:
        arr = np.load(p)
        samples.append(arr)
    # Determine if reduction is needed: assume all have the same embedding width
    first = samples[0]
    orig_dim = first.shape[1] if first.ndim == 2 else 1
    reduction_needed = (orig_dim != target_dim)
    tf_cache = {}

    if reduction_needed:
        # Fit TruncatedSVD on mean-pooled vectors (a PCA-like linear projection), then
        # apply the same projection to every token vector; this avoids stacking
        # variable-length token matrices directly.
        pooled = []
        for arr in samples:
            if arr.ndim == 2:
                pooled.append(arr.mean(axis=0))  # (orig_dim,)
            else:
                pooled.append(arr)  # degenerate
        pooled_mat = np.stack(pooled, axis=0)  # (N, orig_dim)
        print(f"[i] Fitting TruncatedSVD on TF pooled embeddings: {pooled_mat.shape} -> {target_dim}", flush=True)
        svd = TruncatedSVD(n_components=target_dim, random_state=42)
        reduced_pooled = svd.fit_transform(pooled_mat)  # (N, target_dim); used only to fit the SVD

        # svd.components_: (target_dim, orig_dim), so the projection matrix is (orig_dim, target_dim)
        proj_mat = svd.components_.T  # (orig_dim, target_dim)
        for i, p in enumerate(unique_paths):
            arr = samples[i]  # shape (L, orig_dim)
            if arr.ndim == 1:
                arr2 = arr @ proj_mat  # (target_dim,)
            else:
                # project each token: (L, orig_dim) @ (orig_dim, target_dim) -> (L, target_dim)
                arr2 = arr @ proj_mat
            tf_cache[p] = arr2  # reduced per-token representation
        print("[i] Completed compression of TF embeddings.", flush=True)
    else:
        # already the correct dim: just cache the originals
        print(f"[i] TF embeddings already {target_dim}-dimensional; skipping reduction.", flush=True)
        for i, p in enumerate(unique_paths):
            arr = samples[i]
            tf_cache[p] = arr
    return tf_cache

def evaluate(model, dl, device):
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for b, g, y in dl:
            b = b.to(device)
            g = g.to(device)
            y = y.to(device)
            pred = model(b, g)
            all_labels.append(y.cpu())
            all_preds.append(pred.cpu())
    if not all_labels:
        return 0.0, 0.0
    y_true = torch.cat(all_labels).numpy()
    y_score = torch.cat(all_preds).numpy()
    try:
        auc = roc_auc_score(y_true, y_score)
    except Exception:
        auc = 0.0
    try:
        ap = average_precision_score(y_true, y_score)
    except Exception:
        ap = 0.0
    return auc, ap

def unpack(data):
    binders, glms, labels = zip(*data)
    return list(binders), list(glms), list(labels)

# ---- main ------------------------------------------------------------
def main(cfg):
    # Set seed for reproducibility
    set_seed(cfg.seed)

    # NOTE: work in progress — `parser` is never constructed here and the script mixes a
    # config object (`cfg`) with argparse `args`; parse_pair_list, PairDataset, subset,
    # collate_fn and DataLoader are expected to come from prep_splits.py / pair.py.
    parser.add_argument("--out_dir", type=str, required=True)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    print("DEBUG: starting training script with in-line TF compression", flush=True)
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    binder_paths, glm_paths, labels = parse_pair_list(cfg.pair_list)

    if len(labels) == 0:
        print("[ERROR] No valid pairs parsed. Exiting.", file=sys.stderr)
        sys.exit(1)

    label_counts = Counter(labels)
    print(f"[i] Total examples parsed: {len(labels)}. Label distribution: {label_counts}", flush=True)

    # build compressed TF cache (reduces to 256 if needed)
    tf_compressed_cache = build_tf_compressed_cache(binder_paths, target_dim=256)

    # load training data (split handling still to be wired up)
    train_ds = PairDataset(None, tf_compressed_cache=tf_compressed_cache)
    val_ds = PairDataset(*subset(val_i), tf_compressed_cache=tf_compressed_cache)
    test_ds = PairDataset(*subset(test_i), tf_compressed_cache=tf_compressed_cache)

    print(f"[i] Train/Val/Test sizes: {len(train_ds)}/{len(val_ds)}/{len(test_ds)}", flush=True)
    if len(train_ds) == 0 or len(val_ds) == 0:
        print("[ERROR] Train or validation split is empty; cannot proceed.", file=sys.stderr)
        sys.exit(1)

    train_dl = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)
    val_dl = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)
    test_dl = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)

    model = BindPredictor(input_dim=256, hidden_dim=256, heads=8, num_layers=3, use_local_cnn_on_glm=True)
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=1e-3)
    loss_fn = nn.BCELoss()

    best_val = -float("inf")
    os_out = Path(args.out_dir)
    os_out.mkdir(exist_ok=True, parents=True)

    for epoch in range(1, args.epochs + 1):
        print(f"[Epoch {epoch}] starting...", flush=True)
        model.train()
        running_loss = 0.0
        for b, g, y in train_dl:
            b = b.to(device)
            g = g.to(device)
            y = y.to(device)
            pred = model(b, g)
            loss = loss_fn(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * b.size(0)
        train_loss = running_loss / len(train_ds)
        val_auc, val_ap = evaluate(model, val_dl, device)
        print(f"[Epoch {epoch}] train_loss={train_loss:.4f} val_auc={val_auc:.4f} val_ap={val_ap:.4f}", flush=True)

        if val_auc > best_val:
            best_val = val_auc
            torch.save(model.state_dict(), os_out / "best_model.pt")
            print(f"[Epoch {epoch}] Saved new best model with val_auc={val_auc:.4f}", flush=True)

    torch.save(model.state_dict(), os_out / "last_model.pt")
    test_auc, test_ap = evaluate(model, test_dl, device)
    print(f"FINAL TEST: AUC={test_auc:.4f} AP={test_ap:.4f}", flush=True)
    print(f"[i] Models written to {os_out}/best_model.pt and last_model.pt", flush=True)

if __name__ == "__main__":
    main()
|
dpacman/data_modules/__init__.py
ADDED
|
File without changes
|
dpacman/data_modules/pair.py
ADDED
|
@@ -0,0 +1,342 @@
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import argparse
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
from torch import nn
|
| 6 |
+
from torch.utils.data import Dataset, DataLoader
|
| 7 |
+
from lightning import LightningDataModule
|
| 8 |
+
from model import BindPredictor
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from collections import Counter
|
| 11 |
+
from sklearn.metrics import roc_auc_score, average_precision_score
|
| 12 |
+
from sklearn.decomposition import TruncatedSVD
|
| 13 |
+
from multiprocessing import cpu_count
|
| 14 |
+
from functools import partial
|
| 15 |
+
import random
|
| 16 |
+
import sys
|
| 17 |
+
import pandas as pd
|
| 18 |
+
|
| 19 |
+
from dpacman.utils import RankedLogger
|
| 20 |
+
|
| 21 |
+
logger = RankedLogger(__name__, rank_zero_only=True)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ---- dataset ---------------------------------------------------------
|
| 25 |
+
class PairDataset(Dataset):
|
| 26 |
+
def __init__(self, tf_paths, dna_paths, final_df, tf_cache):
|
| 27 |
+
"""
|
| 28 |
+
tf_cache: dict mapping binder_path -> compressed (256-d) tensor/array
|
| 29 |
+
"""
|
| 30 |
+
self.tf_paths, self.dna_paths = tf_paths, dna_paths
|
| 31 |
+
self.tf_cache = tf_cache
|
| 32 |
+
self.targets = {}
|
| 33 |
+
for _, row in final_df.iterrows():
|
| 34 |
+
dna_id = row["dna_id"]
|
| 35 |
+
vec = np.array(list(map(float, row["score_sig_r2"].split(","))), dtype=np.float32)
|
| 36 |
+
self.targets[dna_id] = vec
|
| 37 |
+
|
| 38 |
+
def __len__(self):
|
| 39 |
+
return len(self.tf_paths)
|
| 40 |
+
|
| 41 |
+
def __getitem__(self, i):
|
| 42 |
+
b = self.tf_cache[self.tf_paths[i]] # (L_b, D_b) or (D_b,)
|
| 43 |
+
if b.ndim==1: b = b[None,:]
|
| 44 |
+
g = np.load(self.dna_paths[i]) # (L_g, 256) or (256,)
|
| 45 |
+
if g.ndim==1: g = g[None,:]
|
| 46 |
+
|
| 47 |
+
stem = Path(self.dna_paths[i]).stem
|
| 48 |
+
dna_id = stem.replace("dna_","")
|
| 49 |
+
t = self.targets.get(dna_id, np.zeros(g.shape[0],dtype=np.float32))
|
| 50 |
+
|
| 51 |
+
return torch.from_numpy(b).float(), \
|
| 52 |
+
torch.from_numpy(g).float(), \
|
| 53 |
+
torch.from_numpy(t).float()
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class PairDataModule(LightningDataModule):
|
| 57 |
+
def __init__(
|
| 58 |
+
self,
|
| 59 |
+
train_file: str = "data_files/splits/train.csv",
|
| 60 |
+
val_file: str = "data_files/splits/val.csv",
|
| 61 |
+
test_file: str = "data_files/splits/test.csv",
|
| 62 |
+
tokenizer_path="facebook/esm2_t33_650M_UR50D",
|
| 63 |
+
batch_size: int = 1,
|
| 64 |
+
num_workers=8,
|
| 65 |
+
maximize_num_workers=False,
|
| 66 |
+
debug_run: bool = False,
|
| 67 |
+
pin_memory: bool = False,
|
| 68 |
+
):
|
| 69 |
+
super().__init__()
|
| 70 |
+
self.save_hyperparameters()
|
| 71 |
+
self.debug_run = debug_run
|
| 72 |
+
|
| 73 |
+
# Initialize the data files
|
| 74 |
+
self.train_data_file = train_file
|
| 75 |
+
self.val_data_file = val_file
|
| 76 |
+
self.test_data_file = test_file
|
| 77 |
+
|
| 78 |
+
# Initialize hyperparameters like batch size
|
| 79 |
+
self.batch_size = batch_size
|
| 80 |
+
self.num_wokers = cpu_count() if maximize_num_workers else min(num_workers, cpu_count())
|
| 81 |
+
|
| 82 |
+
logger.info(f"num_workers={self.num_wokers}")
|
| 83 |
+
logger.info("Initialized BinderDecoyDataModule constants")
|
| 84 |
+
|
| 85 |
+
def load_and_unpack(self, file_path, lim=None):
|
| 86 |
+
"""
|
| 87 |
+
Load and unpack an input csv whose columns are binder_path,glm_path,label
|
| 88 |
+
"""
|
| 89 |
+
df = pd.read_csv(file_path)
|
| 90 |
+
if lim is not None:
|
| 91 |
+
df = df[:lim].reset_index(drop=True)
|
| 92 |
+
|
| 93 |
+
binder_paths = df["binder_path"].tolist()
|
| 94 |
+
glm_paths = df["glm_path"].tolist()
|
| 95 |
+
labels = df["label"].tolist()
|
| 96 |
+
|
| 97 |
+
return binder_paths, glm_paths, labels
|
| 98 |
+
|
| 99 |
+
def setup(self, stage):
|
| 100 |
+
lim = 5 if self.debug_run else None
|
| 101 |
+
if stage=="train":
|
| 102 |
+
binder_paths, glm_paths, labels = self.load_file(self.train_data_file, lim=lim)
|
| 103 |
+
self.train_dataset = PairDataset(binder_paths, glm_paths, labels)
|
| 104 |
+
elif stage=="val":
|
| 105 |
+
binder_paths, glm_paths, labels = self.load_file(self.val_data_file, lim=lim)
|
| 106 |
+
self.val_dataset = PairDataset(binder_paths, glm_paths, labels)
|
| 107 |
+
elif stage=="test":
|
| 108 |
+
binder_paths, glm_paths, labels = self.load_file(self.test_data_file, lim=lim)
|
| 109 |
+
self.test_dataset = PairDataset(binder_paths, glm_paths, labels)
|
| 110 |
+
else:
|
| 111 |
+
raise RuntimeError(f"Stage {stage} is not defined. Must be train, val, or test.")
|
| 112 |
+
|
| 113 |
+
def train_dataloader(self):
|
| 114 |
+
return DataLoader(
|
| 115 |
+
self.train_dataset,
|
| 116 |
+
batch_size=self.batch_size,
|
| 117 |
+
collate_fn=collate_fn,
|
| 118 |
+
num_workers=self.num_wokers,
|
| 119 |
+
pin_memory=self.hparams.pin_memory,
|
| 120 |
+
shuffle=True,
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
def val_dataloader(self):
|
| 124 |
+
return DataLoader(
|
| 125 |
+
self.val_dataset,
|
| 126 |
+
batch_size=self.batch_size,
|
| 127 |
+
collate_fn=collate_fn,
|
| 128 |
+
num_workers=self.num_wokers,
|
| 129 |
+
pin_memory=self.hparams.pin_memory,
|
| 130 |
+
shuffle=False,
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
def test_dataloader(self):
|
| 134 |
+
return DataLoader(
|
| 135 |
+
self.test_dataset,
|
| 136 |
+
batch_size=self.batch_size,
|
| 137 |
+
collate_fn=collate_fn,
|
| 138 |
+
num_workers=self.num_wokers,
|
| 139 |
+
pin_memory=self.hparams.pin_memory,
|
| 140 |
+
shuffle=False,
|
| 141 |
+
)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def collate_fn(batch):
|
| 145 |
+
Bs = [b.shape[0] for b,_,_ in batch]
|
| 146 |
+
Gs = [g.shape[0] for _,g,_ in batch]
|
| 147 |
+
maxB, maxG = max(Bs), max(Gs)
|
| 148 |
+
|
| 149 |
+
def pad_seq(x, L):
|
| 150 |
+
if x.shape[0] < L:
|
| 151 |
+
pad = torch.zeros((L-x.shape[0], x.shape[1]), dtype=x.dtype, device=x.device)
|
| 152 |
+
return torch.cat([x, pad], dim=0)
|
| 153 |
+
return x
|
| 154 |
+
|
| 155 |
+
def pad_t(y, L):
|
| 156 |
+
if y.shape[0] < L:
|
| 157 |
+
pad = torch.zeros((L-y.shape[0],), dtype=y.dtype, device=y.device)
|
| 158 |
+
return torch.cat([y, pad], dim=0)
|
| 159 |
+
return y
|
| 160 |
+
|
| 161 |
+
b_stack = torch.stack([pad_seq(b, maxB) for b,_,_ in batch])
|
| 162 |
+
g_stack = torch.stack([pad_seq(g, maxG) for _,g,_ in batch])
|
| 163 |
+
t_stack = torch.stack([pad_t(t, maxG) for *_,t in batch])
|
| 164 |
+
return b_stack, g_stack, t_stack
|
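# --- Illustrative sketch (not part of the commit): what collate_fn above produces
# for a batch of variable-length (binder, glm, target) triples. Shapes are toy values.
import torch

demo_batch = [
    (torch.randn(5, 8), torch.randn(12, 8), torch.zeros(12)),
    (torch.randn(3, 8), torch.randn(7, 8), torch.zeros(7)),
]
b_stack, g_stack, t_stack = collate_fn(demo_batch)
assert b_stack.shape == (2, 5, 8)   # binders zero-padded to the longest binder
assert g_stack.shape == (2, 12, 8)  # glm embeddings zero-padded to the longest glm
assert t_stack.shape == (2, 12)     # per-token targets padded alongside the glm axis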
| 165 |
+
|
| 166 |
+
def build_tf_compressed_cache(binder_paths, target_dim=256):
|
| 167 |
+
"""
|
| 168 |
+
Load all unique TF (binder) embeddings, fit reduction if needed, and return dict mapping path->(L, target_dim) array.
|
| 169 |
+
"""
|
| 170 |
+
unique_paths = sorted(set(binder_paths))
|
| 171 |
+
logger.info(f"[i] Found {len(unique_paths)} unique TF embedding files to compress.", flush=True)
|
| 172 |
+
# Load all embeddings to determine dimensionality
|
| 173 |
+
samples = []
|
| 174 |
+
for p in unique_paths:
|
| 175 |
+
arr = np.load(p)
|
| 176 |
+
samples.append(arr)
|
| 177 |
+
# Determine if reduction needed: assume all have same embedding width
|
| 178 |
+
first = samples[0]
|
| 179 |
+
orig_dim = first.shape[1] if first.ndim == 2 else 1
|
| 180 |
+
reduction_needed = (orig_dim != target_dim)
|
| 181 |
+
tf_cache = {}
|
| 182 |
+
|
| 183 |
+
if reduction_needed:
|
| 184 |
+
# Sequences have variable lengths, so the per-token embedding matrices cannot be stacked directly.
|
| 185 |
+
# Instead, fit a single TruncatedSVD on mean-pooled (one vector per sequence) embeddings,
|
| 186 |
+
# then reuse its components as a shared linear projection applied to every token vector.
|
| 187 |
+
# This preserves per-token resolution while mapping orig_dim -> target_dim,
|
| 188 |
+
# i.e. a PCA-like projection learned once from pooled vectors and shared across sequences.
|
| 189 |
+
pooled = []
|
| 190 |
+
for arr in samples:
|
| 191 |
+
if arr.ndim == 2:
|
| 192 |
+
pooled.append(arr.mean(axis=0)) # (orig_dim,)
|
| 193 |
+
else:
|
| 194 |
+
pooled.append(arr) # degenerate
|
| 195 |
+
pooled_mat = np.stack(pooled, axis=0) # (N, orig_dim)
|
| 196 |
+
logger.info(f"[i] Fitting TruncatedSVD on TF pooled embeddings: {pooled_mat.shape} -> {target_dim}", flush=True)
|
| 197 |
+
svd = TruncatedSVD(n_components=target_dim, random_state=42)
|
| 198 |
+
reduced_pooled = svd.fit_transform(pooled_mat) # (N, target_dim)
|
| 199 |
+
|
| 200 |
+
# For each original embedding, project token-level vectors by multiplying token vector with svd.components_.T
|
| 201 |
+
# svd.components_: (target_dim, orig_dim) so projection matrix is (orig_dim, target_dim)
|
| 202 |
+
proj_mat = svd.components_.T # (orig_dim, target_dim)
|
| 203 |
+
for i, p in enumerate(unique_paths):
|
| 204 |
+
arr = samples[i] # shape (L, orig_dim)
|
| 205 |
+
if arr.ndim == 1:
|
| 206 |
+
arr2 = arr @ proj_mat # (target_dim,)
|
| 207 |
+
else:
|
| 208 |
+
# project each token: (L, orig_dim) @ (orig_dim, target_dim) -> (L, target_dim)
|
| 209 |
+
arr2 = arr @ proj_mat
|
| 210 |
+
tf_cache[p] = arr2 # reduced per-token representation
|
| 211 |
+
logger.info("[i] Completed compression of TF embeddings.", flush=True)
|
| 212 |
+
else:
|
| 213 |
+
# already correct dim: just cache originals
|
| 214 |
+
logger.info(f"[i] TF embeddings already {target_dim}-dimensional; skipping reduction.", flush=True)
|
| 215 |
+
for i, p in enumerate(unique_paths):
|
| 216 |
+
arr = samples[i]
|
| 217 |
+
tf_cache[p] = arr
|
| 218 |
+
return tf_cache
|
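# --- Illustrative sketch (not part of the commit): the projection trick used above.
# TruncatedSVD is fit once on mean-pooled vectors; its components are then reused
# as a shared linear map for the variable-length token matrices. Toy dimensions.
import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.default_rng(0)
token_mats = [rng.normal(size=(L, 32)) for L in (6, 9, 4)]      # (L_i, 32) per sequence
pooled_mat = np.stack([m.mean(axis=0) for m in token_mats])     # (3, 32)

svd = TruncatedSVD(n_components=2, random_state=42).fit(pooled_mat)
proj_mat = svd.components_.T                                    # (32, 2)
reduced = [m @ proj_mat for m in token_mats]                    # each (L_i, 2)
assert reduced[0].shape == (6, 2)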
| 219 |
+
|
| 220 |
+
def evaluate(model, dl, device):
|
| 221 |
+
model.eval()
|
| 222 |
+
all_labels = []
|
| 223 |
+
all_preds = []
|
| 224 |
+
with torch.no_grad():
|
| 225 |
+
for b, g, y in dl:
|
| 226 |
+
b = b.to(device)
|
| 227 |
+
g = g.to(device)
|
| 228 |
+
y = y.to(device)
|
| 229 |
+
pred = model(b, g)
|
| 230 |
+
all_labels.append(y.cpu())
|
| 231 |
+
all_preds.append(pred.cpu())
|
| 232 |
+
if not all_labels:
|
| 233 |
+
return 0.0, 0.0
|
| 234 |
+
y_true = torch.cat(all_labels).numpy()
|
| 235 |
+
y_score = torch.cat(all_preds).numpy()
|
| 236 |
+
try:
|
| 237 |
+
auc = roc_auc_score(y_true, y_score)
|
| 238 |
+
except Exception:
|
| 239 |
+
auc = 0.0
|
| 240 |
+
try:
|
| 241 |
+
ap = average_precision_score(y_true, y_score)
|
| 242 |
+
except Exception:
|
| 243 |
+
ap = 0.0
|
| 244 |
+
return auc, ap
|
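# --- Illustrative sketch (not part of the commit): why evaluate() wraps the metrics
# in try/except -- sklearn raises when y_true contains a single class, e.g. a tiny
# or degenerate validation split.
import numpy as np
from sklearn.metrics import roc_auc_score

y_score = np.array([0.9, 0.2, 0.7, 0.4])
print(roc_auc_score(np.array([1, 0, 1, 1]), y_score))  # fine: both classes present
try:
    roc_auc_score(np.ones(4), y_score)                 # single-class labels
except ValueError as err:
    print(f"single-class batch -> {err}")              # evaluate() falls back to 0.0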
| 245 |
+
|
| 246 |
+
# ---- main ------------------------------------------------------------
|
| 247 |
+
def main():
|
| 248 |
+
parser = argparse.ArgumentParser()
|
| 249 |
+
parser.add_argument("--pair_list", type=str, required=True,
|
| 250 |
+
help="TSV: binder_path glm_path label")
|
| 251 |
+
parser.add_argument("--out_dir", type=str, required=True)
|
| 252 |
+
parser.add_argument("--epochs", type=int, default=10)
|
| 253 |
+
parser.add_argument("--batch_size", type=int, default=32)
|
| 254 |
+
parser.add_argument("--lr", type=float, default=1e-4)
|
| 255 |
+
parser.add_argument("--device", type=str, default="cuda")
|
| 256 |
+
parser.add_argument("--seed", type=int, default=42)
|
| 257 |
+
args = parser.parse_args()
|
| 258 |
+
|
| 259 |
+
# reproducibility
|
| 260 |
+
random.seed(args.seed)
|
| 261 |
+
np.random.seed(args.seed)
|
| 262 |
+
torch.manual_seed(args.seed)
|
| 263 |
+
|
| 264 |
+
logger.info("DEBUG: starting training script with in-line TF compression", flush=True)
|
| 265 |
+
logger.info(f"[i] pair_list: {args.pair_list}", flush=True)
|
| 266 |
+
logger.info(f"[i] output dir: {args.out_dir}", flush=True)
|
| 267 |
+
device = torch.device(args.device if torch.cuda.is_available() else "cpu")
|
| 268 |
+
binder_paths, glm_paths, labels = parse_pair_list(args.pair_list)
|
| 269 |
+
|
| 270 |
+
if len(labels) == 0:
|
| 271 |
+
logger.info("[ERROR] No valid pairs parsed. Exiting.", file=sys.stderr)
|
| 272 |
+
sys.exit(1)
|
| 273 |
+
|
| 274 |
+
label_counts = Counter(labels)
|
| 275 |
+
logger.info(f"[i] Total examples parsed: {len(labels)}. Label distribution: {label_counts}", flush=True)
|
| 276 |
+
|
| 277 |
+
# build compressed TF cache (reduces to 256 if needed)
|
| 278 |
+
tf_compressed_cache = build_tf_compressed_cache(binder_paths, target_dim=256)
|
| 279 |
+
|
| 280 |
+
# simple split: 80/10/10
|
| 281 |
+
n = len(labels)
|
| 282 |
+
idxs = np.arange(n)
|
| 283 |
+
np.random.shuffle(idxs)
|
| 284 |
+
train_i = idxs[: int(0.8 * n)]
|
| 285 |
+
val_i = idxs[int(0.8 * n): int(0.9 * n)]
|
| 286 |
+
test_i = idxs[int(0.9 * n):]
|
| 287 |
+
|
| 288 |
+
def subset(idxs):
|
| 289 |
+
return [binder_paths[i] for i in idxs], [glm_paths[i] for i in idxs], [labels[i] for i in idxs]
|
| 290 |
+
|
| 291 |
+
train_ds = PairDataset(*subset(train_i), tf_compressed_cache=tf_compressed_cache)
|
| 292 |
+
val_ds = PairDataset(*subset(val_i), tf_compressed_cache=tf_compressed_cache)
|
| 293 |
+
test_ds = PairDataset(*subset(test_i), tf_compressed_cache=tf_compressed_cache)
|
| 294 |
+
|
| 295 |
+
logger.info(f"[i] Train/Val/Test sizes: {len(train_ds)}/{len(val_ds)}/{len(test_ds)}", flush=True)
|
| 296 |
+
if len(train_ds) == 0 or len(val_ds) == 0:
|
| 297 |
+
logger.info("[ERROR] Train or validation split is empty; cannot proceed.", file=sys.stderr)
|
| 298 |
+
sys.exit(1)
|
| 299 |
+
|
| 300 |
+
train_dl = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)
|
| 301 |
+
val_dl = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)
|
| 302 |
+
test_dl = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)
|
| 303 |
+
|
| 304 |
+
model = BindPredictor(input_dim=256, hidden_dim=256, heads=8, num_layers=3, use_local_cnn_on_glm=True)
|
| 305 |
+
model = model.to(device)
|
| 306 |
+
optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=1e-3)
|
| 307 |
+
loss_fn = nn.BCELoss()
|
| 308 |
+
|
| 309 |
+
best_val = -float("inf")
|
| 310 |
+
os_out = Path(args.out_dir)
|
| 311 |
+
os_out.mkdir(exist_ok=True, parents=True)
|
| 312 |
+
|
| 313 |
+
for epoch in range(1, args.epochs + 1):
|
| 314 |
+
logger.info(f"[Epoch {epoch}] starting...", flush=True)
|
| 315 |
+
model.train()
|
| 316 |
+
running_loss = 0.0
|
| 317 |
+
for b, g, y in train_dl:
|
| 318 |
+
b = b.to(device)
|
| 319 |
+
g = g.to(device)
|
| 320 |
+
y = y.to(device)
|
| 321 |
+
pred = model(b, g)
|
| 322 |
+
loss = loss_fn(pred, y)
|
| 323 |
+
optimizer.zero_grad()
|
| 324 |
+
loss.backward()
|
| 325 |
+
optimizer.step()
|
| 326 |
+
running_loss += loss.item() * b.size(0)
|
| 327 |
+
train_loss = running_loss / len(train_ds)
|
| 328 |
+
val_auc, val_ap = evaluate(model, val_dl, device)
|
| 329 |
+
logger.info(f"[Epoch {epoch}] train_loss={train_loss:.4f} val_auc={val_auc:.4f} val_ap={val_ap:.4f}", flush=True)
|
| 330 |
+
|
| 331 |
+
if val_auc > best_val:
|
| 332 |
+
best_val = val_auc
|
| 333 |
+
torch.save(model.state_dict(), os_out / "best_model.pt")
|
| 334 |
+
logger.info(f"[Epoch {epoch}] Saved new best model with val_auc={val_auc:.4f}", flush=True)
|
| 335 |
+
|
| 336 |
+
torch.save(model.state_dict(), os_out / "last_model.pt")
|
| 337 |
+
test_auc, test_ap = evaluate(model, test_dl, device)
|
| 338 |
+
logger.info(f"FINAL TEST: AUC={test_auc:.4f} AP={test_ap:.4f}", flush=True)
|
| 339 |
+
logger.info(f"[i] Models written to {os_out}/best_model.pt and last_model.pt", flush=True)
|
| 340 |
+
|
| 341 |
+
if __name__ == "__main__":
|
| 342 |
+
main()
|
dpacman/data_tasks/cluster/__init__.py
ADDED
|
File without changes
|
dpacman/data_tasks/cluster/remap.py
ADDED
|
@@ -0,0 +1,144 @@
|
| 1 |
+
"""
|
| 2 |
+
Holds Python methods for clustering Remap DNA sequences.
|
| 3 |
+
"""
|
| 4 |
+
import argparse
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import random
|
| 9 |
+
import sys
|
| 10 |
+
import subprocess
|
| 11 |
+
from collections import defaultdict
|
| 12 |
+
import rootutils
|
| 13 |
+
import logging
|
| 14 |
+
import os
|
| 15 |
+
import json
|
| 16 |
+
from omegaconf import DictConfig
|
| 17 |
+
from hydra.core.hydra_config import HydraConfig
|
| 18 |
+
|
| 19 |
+
from dpacman.utils.clustering import make_fasta, process_fasta, analyze_clustering_result, run_mmseqs_clustering, cluster_summary
|
| 20 |
+
|
| 21 |
+
root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
def cluster_molecules(fasta_dict, fasta_path, mmseqs_params: DictConfig, output_dir="", path_to_mmseqs="../softwares/mmseqs", moltype="dna", use_gpu=True):
|
| 25 |
+
"""
|
| 26 |
+
Args:
|
| 27 |
+
- fasta_dict: dictionary object where the keys are sequence IDs, and the values are sequences
|
| 28 |
+
- fasta_path: str or Path to where the output fasta should be saved
|
| 29 |
+
- mmseqs_params: DictConfig of mmseqs hparams
|
| 30 |
+
- moltype: molecule type, "dna" or "protein"
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
# make the fasta
|
| 34 |
+
logger.info(f"Making fasta at: {fasta_path}")
|
| 35 |
+
fasta_path = str(make_fasta(fasta_dict, fasta_path))
|
| 36 |
+
|
| 37 |
+
# prepare directories
|
| 38 |
+
output_dir = str(Path(root) / output_dir)
|
| 39 |
+
path_to_mmseqs = str(Path(root) / path_to_mmseqs)
|
| 40 |
+
|
| 41 |
+
# run mmseqs
|
| 42 |
+
dbtype=1
|
| 43 |
+
if moltype=="dna": dbtype=2
|
| 44 |
+
run_mmseqs_clustering(fasta_path,
|
| 45 |
+
output_dir,
|
| 46 |
+
min_seq_id=mmseqs_params.min_seq_id,
|
| 47 |
+
c=mmseqs_params.c,
|
| 48 |
+
cov_mode=mmseqs_params.cov_mode,
|
| 49 |
+
cluster_mode=mmseqs_params.cluster_mode,
|
| 50 |
+
dbtype=dbtype,
|
| 51 |
+
path_to_mmseqs=path_to_mmseqs)
|
| 52 |
+
|
| 53 |
+
tsv_path = [x for x in os.listdir(output_dir) if x.endswith(".tsv")][0]
|
| 54 |
+
clusters = analyze_clustering_result(
|
| 55 |
+
fasta_path, Path(output_dir) / tsv_path
|
| 56 |
+
)
|
| 57 |
+
logger.info(f"Made clusters DataFrame:\n{clusters.head()}")
|
| 58 |
+
cluster_summary(clusters)
|
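# --- Illustrative sketch (not part of the commit): calling cluster_molecules directly.
# The mmseqs hparam names mirror the keyword arguments forwarded to run_mmseqs_clustering
# above; all paths and threshold values here are placeholders, not project defaults.
from omegaconf import OmegaConf

demo_mmseqs_params = OmegaConf.create(
    {"min_seq_id": 0.8, "c": 0.8, "cov_mode": 0, "cluster_mode": 0}
)
demo_fasta_dict = {"dnaseq1": "ACGTACGTACGT", "dnaseq2": "ACGTACGTACGA"}
# Requires the mmseqs binary on disk, so the call itself is left commented out:
# cluster_molecules(demo_fasta_dict, "tmp/dna_demo.fasta",
#                   mmseqs_params=demo_mmseqs_params,
#                   output_dir="tmp/mmseqs_out",
#                   path_to_mmseqs="../softwares/mmseqs",
#                   moltype="dna")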
| 59 |
+
|
| 60 |
+
def read_input_data(input_path):
|
| 61 |
+
"""
|
| 62 |
+
Read the data from the input path.
|
| 63 |
+
It may be a csv or parquet
|
| 64 |
+
"""
|
| 65 |
+
input_path = Path(root) / input_path
|
| 66 |
+
df = None
|
| 67 |
+
if str(input_path).endswith(".parquet"):
|
| 68 |
+
df = pd.read_parquet(input_path, engine='pyarrow')
|
| 69 |
+
elif str(input_path).endswith(".csv"):
|
| 70 |
+
df = pd.read_csv(input_path)
|
| 71 |
+
elif str(input_path).endswith(".tsv") or str(input_path).endswith(".txt"):
|
| 72 |
+
df = pd.read_csv(input_path, sep="\t")
|
| 73 |
+
else:
|
| 74 |
+
raise Exception(f"Cannot read input data from {input_path}: invalid file type")
|
| 75 |
+
return df
|
| 76 |
+
|
| 77 |
+
def main(cfg: DictConfig):
|
| 78 |
+
"""
|
| 79 |
+
Run clustering on Remap protein AND DNA sequences.
|
| 80 |
+
Get clusters for each.
|
| 81 |
+
"""
|
| 82 |
+
# Load input CSV
|
| 83 |
+
# columns: Index(['ID', 'tr_seqid', 'dna_seqid', 'tr_name', 'peak_id', 'chipscore', 'total_jaspar_hits', 'dna_sequence', 'tr_sequence', 'scores']
|
| 84 |
+
df = read_input_data(cfg.data_task.input_data_path)
|
| 85 |
+
|
| 86 |
+
# Separate configs
|
| 87 |
+
dna_full_cfg = cfg.data_task.dna_full
|
| 88 |
+
dna_peaks_cfg = cfg.data_task.dna_peaks
|
| 89 |
+
protein_cfg = cfg.data_task.protein
|
| 90 |
+
logger.info(f"Clustering DNA full: {cfg.data_task.cluster_dna_full}. Clustering DNA peaks: {cfg.data_task.cluster_dna_peaks}. Clustering protein: {cfg.data_task.cluster_protein}.")
|
| 91 |
+
|
| 92 |
+
# Make fastas
|
| 93 |
+
dna_full_fasta_path = Path(root) / dna_full_cfg.fasta_path
|
| 94 |
+
dna_peaks_fasta_path = Path(root) / dna_peaks_cfg.fasta_path
|
| 95 |
+
protein_fasta_path = Path(root) / protein_cfg.fasta_path
|
| 96 |
+
os.makedirs(dna_full_fasta_path.parent, exist_ok=True)
|
| 97 |
+
os.makedirs(dna_peaks_fasta_path.parent, exist_ok=True)
|
| 98 |
+
os.makedirs(protein_fasta_path.parent, exist_ok=True)
|
| 99 |
+
|
| 100 |
+
# Make the dictionaries needed as input to the fasta methods
|
| 101 |
+
with open(Path(root) / dna_full_cfg.input_map_path, "r") as f:
|
| 102 |
+
dna_full_fasta_dict = json.load(f)
|
| 103 |
+
|
| 104 |
+
with open(Path(root) / dna_peaks_cfg.input_map_path, "r") as f:
|
| 105 |
+
dna_peaks_fasta_dict = json.load(f)
|
| 106 |
+
|
| 107 |
+
with open(Path(root) / protein_cfg.input_map_path, "r") as f:
|
| 108 |
+
protein_fasta_dict = json.load(f)
|
| 109 |
+
|
| 110 |
+
logger.info(f"Loaded DNA seq dict from: {dna_full_cfg.input_map_path}. Size: {len(dna_full_fasta_dict)}")
|
| 111 |
+
logger.info(f"Loaded DNA peaks dict from: {dna_peaks_cfg.input_map_path}. Size: {len(dna_peaks_fasta_dict)}")
|
| 112 |
+
logger.info(f"Loaded TR (protein) seq dict from: {protein_cfg.input_map_path}. Size: {len(protein_fasta_dict)}")
|
| 113 |
+
|
| 114 |
+
# Build hash-sets once (drop NaNs to avoid weird matches)
|
| 115 |
+
dna_ids = set(df["dna_seqid"].dropna())
|
| 116 |
+
peak_ids = set(df["peak_seqid"].dropna())
|
| 117 |
+
tr_ids = set(df["tr_seqid"].dropna())
|
| 118 |
+
|
| 119 |
+
# Iterate only the intersection (fast when allowed << dict size)
|
| 120 |
+
dna_full_fasta_dict = {k: dna_full_fasta_dict[k] for k in (dna_full_fasta_dict.keys() & dna_ids)}
|
| 121 |
+
dna_peaks_fasta_dict = {k: dna_peaks_fasta_dict[k] for k in (dna_peaks_fasta_dict.keys() & peak_ids)}
|
| 122 |
+
protein_fasta_dict = {k: protein_fasta_dict[k] for k in (protein_fasta_dict.keys() & tr_ids)}
|
| 123 |
+
|
| 124 |
+
logger.info(f"Filtered dictionaries to only sequences in the filtered training data.")
|
| 125 |
+
logger.info(f"Total DNA sequences: {len(dna_full_fasta_dict)}. Total peak sequences: {len(dna_peaks_fasta_dict)}. Total protein sequences: {len(protein_fasta_dict)}")
|
| 126 |
+
|
| 127 |
+
if cfg.data_task.cluster_dna_full:
|
| 128 |
+
logger.info(f"Clustering DNA full sequences, with context")
|
| 129 |
+
cluster_molecules(dna_full_fasta_dict, dna_full_fasta_path,
|
| 130 |
+
mmseqs_params=dna_full_cfg.mmseqs, output_dir=dna_full_cfg.output_dir, path_to_mmseqs=cfg.data_task.path_to_mmseqs, moltype="dna")
|
| 131 |
+
|
| 132 |
+
if cfg.data_task.cluster_dna_peaks:
|
| 133 |
+
logger.info(f"Clustering DNA peak sequences")
|
| 134 |
+
cluster_molecules(dna_peaks_fasta_dict, dna_peaks_fasta_path,
|
| 135 |
+
mmseqs_params=dna_peaks_cfg.mmseqs, output_dir=dna_peaks_cfg.output_dir, path_to_mmseqs=cfg.data_task.path_to_mmseqs, moltype="dna")
|
| 136 |
+
|
| 137 |
+
if cfg.data_task.cluster_protein:
|
| 138 |
+
logger.info("Clustering protein sequences.")
|
| 139 |
+
cluster_molecules(protein_fasta_dict, protein_fasta_path, mmseqs_params=protein_cfg.mmseqs, output_dir=protein_cfg.output_dir, path_to_mmseqs=cfg.data_task.path_to_mmseqs, moltype="protein")
|
| 140 |
+
|
| 141 |
+
logger.info("Clustering pipeline complete")
|
| 142 |
+
|
| 143 |
+
if __name__ == "__main__":
|
| 144 |
+
main()
|
dpacman/data_tasks/embeddings/embedders.py
ADDED
|
@@ -0,0 +1,560 @@
|
| 1 |
+
"""
|
| 2 |
+
Plug-and-play embedding extraction for:
|
| 3 |
+
• Chromosome sequences (from raw UCSC JSON)
|
| 4 |
+
• TF sequences (transcription_factors.fasta)
|
| 5 |
+
|
| 6 |
+
Usage example (DNA + protein in one go):
|
| 7 |
+
module load miniconda/24.7.1
|
| 8 |
+
conda activate dpacman
|
| 9 |
+
python dpacman/data/compute_embeddings.py \
|
| 10 |
+
--genome-json-dir ../data_files/raw/genomes/hg38 \
|
| 11 |
+
--tf-fasta ../data_files/processed/tfclust/hg38_tf/transcription_factors.fasta \
|
| 12 |
+
--chrom-model caduceus \
|
| 13 |
+
--tf-model esm-dbp \
|
| 14 |
+
--out-dir ../data_files/processed/tfclust/hg38_tf/embeddings \
|
| 15 |
+
--device cuda
|
| 16 |
+
"""
|
| 17 |
+
import os
|
| 18 |
+
import re
|
| 19 |
+
import argparse
|
| 20 |
+
import json
|
| 21 |
+
import numpy as np
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
import torch
|
| 24 |
+
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, pipeline
|
| 25 |
+
import esm
|
| 26 |
+
from Bio import SeqIO
|
| 27 |
+
import time
|
| 28 |
+
import pandas as pd
|
| 29 |
+
from tqdm.auto import tqdm
|
| 30 |
+
import logging, math
|
| 31 |
+
|
| 32 |
+
# ---- model wrappers ----
|
| 33 |
+
|
| 34 |
+
class CaduceusEmbedder:
|
| 35 |
+
def __init__(self, device, chunk_size=131_072, overlap=0):
|
| 36 |
+
"""
|
| 37 |
+
device: 'cpu' or 'cuda'
|
| 38 |
+
chunk_size: max bases (and thus tokens) to send in one forward pass
|
| 39 |
+
overlap: how many bases each window overlaps the previous; 0 = no overlap
|
| 40 |
+
"""
|
| 41 |
+
model_name = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"
|
| 42 |
+
self.tokenizer = AutoTokenizer.from_pretrained(
|
| 43 |
+
model_name, trust_remote_code=True
|
| 44 |
+
)
|
| 45 |
+
self.model = AutoModel.from_pretrained(
|
| 46 |
+
model_name, trust_remote_code=True
|
| 47 |
+
).to(device).eval()
|
| 48 |
+
self.device = device
|
| 49 |
+
self.chunk_size = chunk_size
|
| 50 |
+
self.step = chunk_size - overlap
|
| 51 |
+
|
| 52 |
+
def embed(self, seqs):
|
| 53 |
+
"""
|
| 54 |
+
seqs: List[str] of DNA sequences (each <= chunk_size for this test)
|
| 55 |
+
returns: list of N arrays, each (L_i, D), of raw per-token embeddings
|
| 56 |
+
"""
|
| 57 |
+
# outputs = []
|
| 58 |
+
# for seq in seqs:
|
| 59 |
+
# # --- new: raw per‐token embeddings in one shot ---
|
| 60 |
+
# toks = self.tokenizer(
|
| 61 |
+
# seq,
|
| 62 |
+
# return_tensors="pt",
|
| 63 |
+
# padding=False,
|
| 64 |
+
# truncation=True,
|
| 65 |
+
# max_length=self.chunk_size
|
| 66 |
+
# ).to(self.device)
|
| 67 |
+
# with torch.no_grad():
|
| 68 |
+
# out = self.model(**toks).last_hidden_state # (1, L, D)
|
| 69 |
+
# outputs.append(out.cpu().numpy()[0]) # (L, D)
|
| 70 |
+
|
| 71 |
+
# return np.stack(outputs, axis=0) # (N, L, D)
|
| 72 |
+
outputs = []
|
| 73 |
+
for seq in tqdm(seqs, total=len(seqs), desc="DNA: Caduceus", dynamic_ncols=True):
|
| 74 |
+
toks = self.tokenizer(
|
| 75 |
+
seq,
|
| 76 |
+
return_tensors="pt",
|
| 77 |
+
padding=False,
|
| 78 |
+
truncation=True,
|
| 79 |
+
max_length=self.chunk_size
|
| 80 |
+
).to(self.device)
|
| 81 |
+
with torch.no_grad():
|
| 82 |
+
out = self.model(**toks).last_hidden_state # (1, L, D)
|
| 83 |
+
outputs.append(out.cpu().numpy()[0]) # (L, D)
|
| 84 |
+
return outputs # list of variable-length (L_i, D) arrays
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def benchmark(self, lengths=None):
|
| 88 |
+
"""
|
| 89 |
+
Time embedding on single-sequence of various lengths.
|
| 90 |
+
By default tests [5K,10K,50K,100K,chunk_size].
|
| 91 |
+
"""
|
| 92 |
+
tests = lengths or [5_000, 10_000, 50_000, 100_000, self.chunk_size]
|
| 93 |
+
print(f"→ Benchmarking Caduceus on device={self.device}")
|
| 94 |
+
for sz in tests:
|
| 95 |
+
seq = "A" * sz
|
| 96 |
+
# Warm-up
|
| 97 |
+
_ = self.embed([seq])
|
| 98 |
+
if self.device != "cpu":
|
| 99 |
+
torch.cuda.synchronize()
|
| 100 |
+
t0 = time.perf_counter()
|
| 101 |
+
_ = self.embed([seq])
|
| 102 |
+
if self.device != "cpu":
|
| 103 |
+
torch.cuda.synchronize()
|
| 104 |
+
t1 = time.perf_counter()
|
| 105 |
+
print(f" length={sz:6,d} time={(t1-t0)*1000:7.1f} ms")
|
| 106 |
+
|
| 107 |
+
class SegmentNTEmbedder:
|
| 108 |
+
def __init__(self, device):
|
| 109 |
+
self.tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/segment_nt", trust_remote_code=True)
|
| 110 |
+
self.model = AutoModel.from_pretrained("InstaDeepAI/segment_nt", trust_remote_code=True).to(device).eval()
|
| 111 |
+
self.device = device
|
| 112 |
+
|
| 113 |
+
def _adjust_length(self, input_ids):
|
| 114 |
+
bs, L = input_ids.shape
|
| 115 |
+
excl = L - 1
|
| 116 |
+
remainder = (excl) % 4
|
| 117 |
+
if remainder != 0:
|
| 118 |
+
pad_needed = 4 - remainder
|
| 119 |
+
pad_tensor = torch.full((bs, pad_needed), self.tokenizer.pad_token_id, dtype=input_ids.dtype, device=input_ids.device)
|
| 120 |
+
input_ids = torch.cat([input_ids, pad_tensor], dim=1)
|
| 121 |
+
return input_ids
|
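# --- Illustrative sketch (not part of the commit): the arithmetic behind _adjust_length,
# which pads token ids so the count excluding the leading special token is a multiple of 4.
L = 11                       # tokens including the leading special token
excl = L - 1                 # 10 tokens to segment
remainder = excl % 4         # 2
pad_needed = 4 - remainder   # 2 -> padded length 13, i.e. 12 segmentable tokens
print(pad_needed)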
| 122 |
+
|
| 123 |
+
def embed(self, seqs, batch_size=16):
|
| 124 |
+
"""
|
| 125 |
+
seqs: List[str]
|
| 126 |
+
Returns: np.ndarray of shape (N, D)
|
| 127 |
+
"""
|
| 128 |
+
all_embeddings = []
|
| 129 |
+
for i in range(0, len(seqs), batch_size):
|
| 130 |
+
batch_seqs = seqs[i : i + batch_size]
|
| 131 |
+
encoded = self.tokenizer.batch_encode_plus(
|
| 132 |
+
batch_seqs,
|
| 133 |
+
return_tensors="pt",
|
| 134 |
+
padding=True,
|
| 135 |
+
truncation=True,
|
| 136 |
+
)
|
| 137 |
+
input_ids = encoded["input_ids"].to(self.device) # (B, L)
|
| 138 |
+
attention_mask = input_ids != self.tokenizer.pad_token_id
|
| 139 |
+
|
| 140 |
+
input_ids = self._adjust_length(input_ids)
|
| 141 |
+
attention_mask = (input_ids != self.tokenizer.pad_token_id)
|
| 142 |
+
|
| 143 |
+
with torch.no_grad():
|
| 144 |
+
outs = self.model(
|
| 145 |
+
input_ids,
|
| 146 |
+
attention_mask=attention_mask,
|
| 147 |
+
output_hidden_states=True,
|
| 148 |
+
return_dict=True,
|
| 149 |
+
)
|
| 150 |
+
if hasattr(outs, "hidden_states") and outs.hidden_states is not None:
|
| 151 |
+
last_hidden = outs.hidden_states[-1] # (B, L, D)
|
| 152 |
+
else:
|
| 153 |
+
last_hidden = outs.last_hidden_state # fallback
|
| 154 |
+
|
| 155 |
+
# Exclude CLS token if present (assume first token) and pool
|
| 156 |
+
pooled = last_hidden[:, 1:, :].mean(dim=1) # (B, D)
|
| 157 |
+
all_embeddings.append(pooled.cpu().numpy())
|
| 158 |
+
|
| 159 |
+
# release fragmentation
|
| 160 |
+
torch.cuda.empty_cache()
|
| 161 |
+
|
| 162 |
+
return np.vstack(all_embeddings) # (N, D)
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
class DNABertEmbedder:
|
| 166 |
+
def __init__(self, device):
|
| 167 |
+
self.tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
|
| 168 |
+
self.model = AutoModel.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True).to(device)
|
| 169 |
+
self.device = device
|
| 170 |
+
|
| 171 |
+
def embed(self, seqs):
|
| 172 |
+
embs = []
|
| 173 |
+
for s in seqs:
|
| 174 |
+
tokens = self.tokenizer(s, return_tensors="pt", padding=True)["input_ids"].to(self.device)
|
| 175 |
+
with torch.no_grad():
|
| 176 |
+
out = self.model(tokens).last_hidden_state.mean(1)
|
| 177 |
+
embs.append(out.cpu().numpy())
|
| 178 |
+
return np.vstack(embs)
|
| 179 |
+
|
| 180 |
+
class NucleotideTransformerEmbedder:
|
| 181 |
+
def __init__(self, device):
|
| 182 |
+
# HF “feature-extraction” returns a list of (L, D) arrays for each input
|
| 183 |
+
# device: “cpu” or “cuda”
|
| 184 |
+
self.pipe = pipeline(
|
| 185 |
+
"feature-extraction",
|
| 186 |
+
model="InstaDeepAI/nucleotide-transformer-500m-1000g",
|
| 187 |
+
device= -1 if device=="cpu" else 0  # HF uses -1 for CPU, 0 for GPU
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
def embed(self, seqs):
|
| 191 |
+
"""
|
| 192 |
+
seqs: List[str] of raw DNA sequences
|
| 193 |
+
returns: (N, D) array, one D-dim vector per sequence
|
| 194 |
+
"""
|
| 195 |
+
all_embeddings = self.pipe(seqs, truncation=True, padding=True)
|
| 196 |
+
# all_embeddings is a List of shape (L, D) arrays
|
| 197 |
+
pooled = [ np.mean(x, axis=0) for x in all_embeddings ]
|
| 198 |
+
return np.vstack(pooled)
|
| 199 |
+
|
| 200 |
+
# class ESMEmbedder:
|
| 201 |
+
# def __init__(self, device):
|
| 202 |
+
# self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
|
| 203 |
+
# self.batch_converter = self.alphabet.get_batch_converter()
|
| 204 |
+
# self.model.to(device).eval()
|
| 205 |
+
# self.device = device
|
| 206 |
+
|
| 207 |
+
# def embed(self, seqs):
|
| 208 |
+
# batch = [(str(i), seq) for i, seq in enumerate(seqs)]
|
| 209 |
+
# _, _, toks = self.batch_converter(batch)
|
| 210 |
+
# toks = toks.to(self.device)
|
| 211 |
+
# with torch.no_grad():
|
| 212 |
+
# results = self.model(toks, repr_layers=[33], return_contacts=False)
|
| 213 |
+
# reps = results["representations"][33]
|
| 214 |
+
# return reps[:, 1:-1].mean(1).cpu().numpy()
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
class ESMEmbedder:
|
| 218 |
+
def __init__(self, device, model_name="esm2_t33_650M_UR50D"):
|
| 219 |
+
# Try to load the specified ESM-2 model; fallback to esm1b if missing
|
| 220 |
+
self.device = device
|
| 221 |
+
try:
|
| 222 |
+
self.model, self.alphabet = getattr(esm.pretrained, model_name)()
|
| 223 |
+
self.is_esm2 = model_name.lower().startswith("esm2")
|
| 224 |
+
except AttributeError:
|
| 225 |
+
# fallback to ESM-1b
|
| 226 |
+
self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
|
| 227 |
+
self.is_esm2 = False
|
| 228 |
+
self.batch_converter = self.alphabet.get_batch_converter()
|
| 229 |
+
self.model.to(device).eval()
|
| 230 |
+
# determine max length: esm2 models vary; use default 1024 for esm1b
|
| 231 |
+
self.max_len = 4096 if self.is_esm2 else 1024 # adjust if your esm2 variant has explicit limit
|
| 232 |
+
# for chunking: reserve 2 tokens if model uses BOS/EOS
|
| 233 |
+
self.chunk_size = self.max_len - 2
|
| 234 |
+
self.overlap = self.chunk_size // 4 # 25% overlap to smooth boundaries
|
| 235 |
+
|
| 236 |
+
def _chunk_sequence(self, seq):
|
| 237 |
+
"""
|
| 238 |
+
Return list of possibly overlapping chunks of seq, each <= chunk_size.
|
| 239 |
+
"""
|
| 240 |
+
if len(seq) <= self.chunk_size:
|
| 241 |
+
return [seq]
|
| 242 |
+
step = self.chunk_size - self.overlap
|
| 243 |
+
chunks = []
|
| 244 |
+
for i in range(0, len(seq), step):
|
| 245 |
+
chunk = seq[i : i + self.chunk_size]
|
| 246 |
+
if not chunk:
|
| 247 |
+
break
|
| 248 |
+
chunks.append(chunk)
|
| 249 |
+
return chunks
|
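# --- Illustrative sketch (not part of the commit): the sliding-window arithmetic used
# by _chunk_sequence, shown with toy numbers (chunk_size=10, overlap=2, so step=8).
chunk_size, overlap = 10, 2
step = chunk_size - overlap
seq = "M" * 25
chunks = [seq[i:i + chunk_size] for i in range(0, len(seq), step)]
print([len(c) for c in chunks])   # [10, 10, 9, 1] -> trailing windows are shorter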
| 250 |
+
|
| 251 |
+
def embed(self, seqs):
|
| 252 |
+
"""
|
| 253 |
+
seqs: List[str] of protein sequences.
|
| 254 |
+
Returns: np.ndarray of shape (N, D) pooled per-sequence embeddings.
|
| 255 |
+
"""
|
| 256 |
+
all_embeddings = []
|
| 257 |
+
for i, seq in enumerate(seqs):
|
| 258 |
+
chunks = self._chunk_sequence(seq)
|
| 259 |
+
chunk_vecs = []
|
| 260 |
+
# process chunks in batch if small number, else sequentially
|
| 261 |
+
for chunk in chunks:
|
| 262 |
+
batch = [(str(i), chunk)]
|
| 263 |
+
_, _, toks = self.batch_converter(batch)
|
| 264 |
+
toks = toks.to(self.device)
|
| 265 |
+
with torch.no_grad():
|
| 266 |
+
results = self.model(toks, repr_layers=[33], return_contacts=False)
|
| 267 |
+
reps = results["representations"][33] # (1, L, D)
|
| 268 |
+
# remove BOS/EOS if present: take 1:-1 if length permits
|
| 269 |
+
if reps.size(1) > 2:
|
| 270 |
+
rep = reps[:, 1:-1].mean(1) # (1, D)
|
| 271 |
+
else:
|
| 272 |
+
rep = reps.mean(1) # fallback
|
| 273 |
+
chunk_vecs.append(rep.squeeze(0)) # (D,)
|
| 274 |
+
if len(chunk_vecs) == 1:
|
| 275 |
+
seq_vec = chunk_vecs[0]
|
| 276 |
+
else:
|
| 277 |
+
# average chunk vectors
|
| 278 |
+
stacked = torch.stack(chunk_vecs, dim=0) # (num_chunks, D)
|
| 279 |
+
seq_vec = stacked.mean(0)
|
| 280 |
+
all_embeddings.append(seq_vec.cpu().numpy())
|
| 281 |
+
return np.vstack(all_embeddings) # (N, D)
|
| 282 |
+
|
| 283 |
+
|
| 284 |
+
# class ESMDBPEmbedder:
|
| 285 |
+
# def __init__(self, device):
|
| 286 |
+
# base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
|
| 287 |
+
# model_path = (
|
| 288 |
+
# Path(__file__).resolve().parent.parent
|
| 289 |
+
# / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
|
| 290 |
+
# )
|
| 291 |
+
# checkpoint = torch.load(model_path, map_location="cpu")
|
| 292 |
+
# clean_sd = {}
|
| 293 |
+
# for k, v in checkpoint.items():
|
| 294 |
+
# clean_sd[k.replace("module.", "")] = v
|
| 295 |
+
# result = base_model.load_state_dict(clean_sd, strict=False)
|
| 296 |
+
# if result.missing_keys:
|
| 297 |
+
# print(f"[ESMDBP] missing keys: {result.missing_keys}")
|
| 298 |
+
# if result.unexpected_keys:
|
| 299 |
+
# print(f"[ESMDBP] unexpected keys: {result.unexpected_keys}")
|
| 300 |
+
|
| 301 |
+
# self.model = base_model.to(device).eval()
|
| 302 |
+
# self.alphabet = alphabet
|
| 303 |
+
# self.batch_converter = alphabet.get_batch_converter()
|
| 304 |
+
# self.device = device
|
| 305 |
+
|
| 306 |
+
# def embed(self, seqs):
|
| 307 |
+
# batch = [(str(i), seq) for i, seq in enumerate(seqs)]
|
| 308 |
+
# _, _, toks = self.batch_converter(batch)
|
| 309 |
+
# toks = toks.to(self.device)
|
| 310 |
+
# with torch.no_grad():
|
| 311 |
+
# out = self.model(toks, repr_layers=[33], return_contacts=False)
|
| 312 |
+
# reps = out["representations"][33]
|
| 313 |
+
# # skip start/end tokens
|
| 314 |
+
# return reps[:, 1:-1].mean(1).cpu().numpy()
|
| 315 |
+
|
| 316 |
+
class ESMDBPEmbedder:
|
| 317 |
+
def __init__(self, device):
|
| 318 |
+
base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
|
| 319 |
+
model_path = (
|
| 320 |
+
Path(__file__).resolve().parent.parent
|
| 321 |
+
/ "pretrained" / "ESM-DBP" / "ESM-DBP.model"
|
| 322 |
+
)
|
| 323 |
+
checkpoint = torch.load(model_path, map_location="cpu")
|
| 324 |
+
clean_sd = {}
|
| 325 |
+
for k, v in checkpoint.items():
|
| 326 |
+
clean_sd[k.replace("module.", "")] = v
|
| 327 |
+
result = base_model.load_state_dict(clean_sd, strict=False)
|
| 328 |
+
if result.missing_keys:
|
| 329 |
+
print(f"[ESMDBP] missing keys: {result.missing_keys}")
|
| 330 |
+
if result.unexpected_keys:
|
| 331 |
+
print(f"[ESMDBP] unexpected keys: {result.unexpected_keys}")
|
| 332 |
+
|
| 333 |
+
self.model = base_model.to(device).eval()
|
| 334 |
+
self.alphabet = alphabet
|
| 335 |
+
self.batch_converter = alphabet.get_batch_converter()
|
| 336 |
+
self.device = device
|
| 337 |
+
self.max_len = 1024 # same limit as esm1b
|
| 338 |
+
self.chunk_size = self.max_len - 2
|
| 339 |
+
self.overlap = self.chunk_size // 4
|
| 340 |
+
|
| 341 |
+
def _chunk_sequence(self, seq):
|
| 342 |
+
if len(seq) <= self.chunk_size:
|
| 343 |
+
return [seq]
|
| 344 |
+
step = self.chunk_size - self.overlap
|
| 345 |
+
chunks = []
|
| 346 |
+
for i in range(0, len(seq), step):
|
| 347 |
+
chunk = seq[i : i + self.chunk_size]
|
| 348 |
+
if not chunk:
|
| 349 |
+
break
|
| 350 |
+
chunks.append(chunk)
|
| 351 |
+
return chunks
|
| 352 |
+
|
| 353 |
+
def embed(self, seqs):
|
| 354 |
+
all_embeddings = []
|
| 355 |
+
for i, seq in enumerate(seqs):
|
| 356 |
+
chunks = self._chunk_sequence(seq)
|
| 357 |
+
chunk_vecs = []
|
| 358 |
+
for chunk in chunks:
|
| 359 |
+
batch = [(str(i), chunk)]
|
| 360 |
+
_, _, toks = self.batch_converter(batch)
|
| 361 |
+
toks = toks.to(self.device)
|
| 362 |
+
with torch.no_grad():
|
| 363 |
+
out = self.model(toks, repr_layers=[33], return_contacts=False)
|
| 364 |
+
reps = out["representations"][33]
|
| 365 |
+
if reps.size(1) > 2:
|
| 366 |
+
rep = reps[:, 1:-1].mean(1)
|
| 367 |
+
else:
|
| 368 |
+
rep = reps.mean(1)
|
| 369 |
+
chunk_vecs.append(rep.squeeze(0))
|
| 370 |
+
if len(chunk_vecs) == 1:
|
| 371 |
+
seq_vec = chunk_vecs[0]
|
| 372 |
+
else:
|
| 373 |
+
stacked = torch.stack(chunk_vecs, dim=0)
|
| 374 |
+
seq_vec = stacked.mean(0)
|
| 375 |
+
all_embeddings.append(seq_vec.cpu().numpy())
|
| 376 |
+
return np.vstack(all_embeddings)
|
| 377 |
+
|
| 378 |
+
class GPNEmbedder:
|
| 379 |
+
def __init__(self, device):
|
| 380 |
+
model_name = "songlab/gpn-msa-sapiens"
|
| 381 |
+
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 382 |
+
self.model = AutoModelForMaskedLM.from_pretrained(model_name)
|
| 383 |
+
self.model.to(device)
|
| 384 |
+
self.model.eval()
|
| 385 |
+
self.device = device
|
| 386 |
+
|
| 387 |
+
def embed(self, seqs):
|
| 388 |
+
inputs = self.tokenizer(
|
| 389 |
+
seqs,
|
| 390 |
+
return_tensors="pt",
|
| 391 |
+
padding=True,
|
| 392 |
+
truncation=True
|
| 393 |
+
).to(self.device)
|
| 394 |
+
|
| 395 |
+
with torch.no_grad():
|
| 396 |
+
last_hidden = self.model(**inputs).last_hidden_state
|
| 397 |
+
return last_hidden.mean(dim=1).cpu().numpy()
|
| 398 |
+
|
| 399 |
+
class ProGenEmbedder:
|
| 400 |
+
def __init__(self, device):
|
| 401 |
+
model_name = "jinyuan22/ProGen2-base"
|
| 402 |
+
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 403 |
+
self.model = AutoModel.from_pretrained(model_name).to(device).eval()
|
| 404 |
+
self.device = device
|
| 405 |
+
|
| 406 |
+
def embed(self, seqs):
|
| 407 |
+
inputs = self.tokenizer(
|
| 408 |
+
seqs,
|
| 409 |
+
return_tensors="pt",
|
| 410 |
+
padding=True,
|
| 411 |
+
truncation=True
|
| 412 |
+
).to(self.device)
|
| 413 |
+
with torch.no_grad():
|
| 414 |
+
last_hidden = self.model(**inputs).last_hidden_state
|
| 415 |
+
return last_hidden.mean(dim=1).cpu().numpy()
|
| 416 |
+
|
| 417 |
+
# ---- main pipeline ----
|
| 418 |
+
|
| 419 |
+
def get_embedder(name, device, for_dna=True):
|
| 420 |
+
name = name.lower()
|
| 421 |
+
if for_dna:
|
| 422 |
+
if name=="caduceus": return CaduceusEmbedder(device)
|
| 423 |
+
if name=="dnabert": return DNABertEmbedder(device)
|
| 424 |
+
if name=="nucleotide": return NucleotideTransformerEmbedder(device)
|
| 425 |
+
if name=="gpn": return GPNEmbedder(device)
|
| 426 |
+
if name=="segmentnt": return SegmentNTEmbedder(device)
|
| 427 |
+
else:
|
| 428 |
+
if name in ("esm",): return ESMEmbedder(device)
|
| 429 |
+
if name in ("esm-dbp","esm_dbp"): return ESMDBPEmbedder(device)
|
| 430 |
+
if name=="progen": return ProGenEmbedder(device)
|
| 431 |
+
raise ValueError(f"Unknown model {name} (for_dna={for_dna})")
|
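# --- Illustrative sketch (not part of the commit): dispatching embedders by name,
# mirroring the __main__ block below. Instantiation downloads model weights, so the
# calls are left commented out; the device string is a placeholder.
# dna_embedder = get_embedder("caduceus", "cuda", for_dna=True)
# tf_embedder = get_embedder("esm-dbp", "cuda", for_dna=False)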
| 432 |
+
|
| 433 |
+
|
| 434 |
+
def pad_token_embeddings(list_of_arrays, pad_value=0.0):
|
| 435 |
+
"""
|
| 436 |
+
list_of_arrays: list of (L_i, D) numpy arrays
|
| 437 |
+
Returns:
|
| 438 |
+
padded: (N, L_max, D) array
|
| 439 |
+
mask: (N, L_max) boolean array where True = real token, False = padding
|
| 440 |
+
"""
|
| 441 |
+
N = len(list_of_arrays)
|
| 442 |
+
D = list_of_arrays[0].shape[1]
|
| 443 |
+
L_max = max(arr.shape[0] for arr in list_of_arrays)
|
| 444 |
+
padded = np.full((N, L_max, D), pad_value, dtype=list_of_arrays[0].dtype)
|
| 445 |
+
mask = np.zeros((N, L_max), dtype=bool)
|
| 446 |
+
for i, arr in enumerate(list_of_arrays):
|
| 447 |
+
L = arr.shape[0]
|
| 448 |
+
padded[i, :L] = arr
|
| 449 |
+
mask[i, :L] = True
|
| 450 |
+
return padded, mask
|
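# --- Illustrative sketch (not part of the commit): pad_token_embeddings on two toy
# per-token arrays of different lengths.
import numpy as np

a = np.ones((3, 4))   # 3 tokens, 4-dim embeddings
b = np.ones((5, 4))   # 5 tokens, 4-dim embeddings
padded, mask = pad_token_embeddings([a, b])
assert padded.shape == (2, 5, 4)              # padded to the longest sequence
assert mask.sum(axis=1).tolist() == [3, 5]    # True marks real (non-padding) tokens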
| 451 |
+
|
| 452 |
+
def embed_and_save(seqs, ids, embedder, out_path):
|
| 453 |
+
embs = embedder.embed(seqs)
|
| 454 |
+
|
| 455 |
+
# Decide whether we got variable-length per-token outputs (list of (L, D))
|
| 456 |
+
is_variable_token = isinstance(embs, (list, tuple)) and len(embs) > 0 and hasattr(embs[0], "shape") and embs[0].ndim == 2
|
| 457 |
+
|
| 458 |
+
if is_variable_token:
|
| 459 |
+
# pad to (N, L_max, D) + mask
|
| 460 |
+
padded, mask = pad_token_embeddings(embs)
|
| 461 |
+
# Save both embeddings and mask together in an .npz for convenience
|
| 462 |
+
np.savez_compressed(out_path.with_suffix(".caduceus.npz"),
|
| 463 |
+
embeddings=padded,
|
| 464 |
+
mask=mask,
|
| 465 |
+
ids=np.array(ids, dtype=object))
|
| 466 |
+
else:
|
| 467 |
+
# fixed shape output, e.g., pooled (N, D)
|
| 468 |
+
array = np.vstack(embs) if isinstance(embs, list) else embs
|
| 469 |
+
np.save(out_path, array)
|
| 470 |
+
with open(out_path.with_suffix(".ids"), "w") as f:
|
| 471 |
+
f.write("\n".join(ids))
|
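# --- Illustrative sketch (not part of the commit): reading back what embed_and_save
# writes. Variable-length outputs land in an .npz whose ids array has object dtype,
# so allow_pickle=True is required; pooled outputs are a plain .npy plus a .ids text
# file. The path below is a placeholder.
import numpy as np
from pathlib import Path

demo_out = Path("embeddings/peaks_caduceus.caduceus.npz")
if demo_out.exists():
    data = np.load(demo_out, allow_pickle=True)
    print(data["embeddings"].shape, data["mask"].shape, len(data["ids"]))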
| 472 |
+
|
| 473 |
+
|
| 474 |
+
if __name__=="__main__":
|
| 475 |
+
|
| 476 |
+
p = argparse.ArgumentParser()
|
| 477 |
+
#p.add_argument("--peak_fasta", default="binding_peaks_unique.fa", help="FASTA of deduplicated binding peak sequences; if present this is used for DNA embedding instead of genome JSONs")
|
| 478 |
+
p.add_argument("--genome-json-dir", default=None, help="(fallback) directory of UCSC JSONs for full chromosome embedding if peak FASTA is missing or you explicitly want chromosomes")
|
| 479 |
+
p.add_argument("--skip-dna", action="store_true", help="if set, skip the chromosome embedding step") #if glm embeddings successful but not plm embeddings
|
| 480 |
+
p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
|
| 481 |
+
p.add_argument("--chrom-model", default="caduceus")
|
| 482 |
+
p.add_argument("--tf-model", default="esm-dbp")
|
| 483 |
+
p.add_argument("--out-dir", default="dpacman/model/embeddings")
|
| 484 |
+
p.add_argument("--device", default="cpu")
|
| 485 |
+
args = p.parse_args()
|
| 486 |
+
|
| 487 |
+
os.makedirs(args.out_dir, exist_ok=True)
|
| 488 |
+
device = args.device
|
| 489 |
+
print(device)
|
| 490 |
+
|
| 491 |
+
if not args.skip_dna:
|
| 492 |
+
if args.genome_json_dir is None:
|
| 493 |
+
dna_df = pd.read_parquet('/home/a03-akrishna/DPACMAN/dpacman/model/remap2022_crm_fimo_output_q_processed.parquet', engine='pyarrow')
|
| 494 |
+
#df.to_csv('/home/a03-akrishna/DPACMAN/dpacman/model/remap2022_crm_fimo_output_q_processed.csv', index=False)
|
| 495 |
+
peak_seqs = dna_df["dna_sequence"]
|
| 496 |
+
peak_ids = dna_df["ID"]
|
| 497 |
+
print(f"Embedding {len(peak_seqs)} binding peak sequences from processed remap data", flush=True)
|
| 498 |
+
dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
|
| 499 |
+
out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
|
| 500 |
+
embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
|
| 501 |
+
|
| 502 |
+
# peak_fasta = Path(args.peak_fasta)
|
| 503 |
+
# if peak_fasta.exists():
|
| 504 |
+
# # Load peak sequences from FASTA
|
| 505 |
+
# from Bio import SeqIO
|
| 506 |
+
|
| 507 |
+
# peak_seqs = []
|
| 508 |
+
# peak_ids = []
|
| 509 |
+
# for rec in SeqIO.parse(peak_fasta, "fasta"):
|
| 510 |
+
# peak_ids.append(rec.id)
|
| 511 |
+
# peak_seqs.append(str(rec.seq))
|
| 512 |
+
# print(f"Embedding {len(peak_seqs)} binding peak sequences from {peak_fasta}", flush=True)
|
| 513 |
+
# dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
|
| 514 |
+
# out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
|
| 515 |
+
# embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
|
| 516 |
+
elif args.genome_json_dir:
|
| 517 |
+
# Legacy: load full chromosomes from JSONs (chr1–22, X, Y, M)
|
| 518 |
+
genome_dir = Path(args.genome_json_dir)
|
| 519 |
+
chrom_seqs, chrom_ids = [], []
|
| 520 |
+
primary_pattern = re.compile(r"^hg38_chr(?:[1-9]|1[0-9]|2[0-2]|X|Y|M)\.json$")
|
| 521 |
+
for j in sorted(genome_dir.iterdir()):
|
| 522 |
+
if not primary_pattern.match(j.name):
|
| 523 |
+
continue
|
| 524 |
+
data = json.loads(j.read_text())
|
| 525 |
+
seq = data.get("dna") or data.get("sequence")
|
| 526 |
+
chrom = data.get("chrom") or j.stem.split("_")[-1]
|
| 527 |
+
chrom_seqs.append(seq)
|
| 528 |
+
chrom_ids.append(chrom)
|
| 529 |
+
cutoff = CaduceusEmbedder(device).chunk_size
|
| 530 |
+
long_chroms = [
|
| 531 |
+
(chrom, len(seq))
|
| 532 |
+
for chrom, seq in zip(chrom_ids, chrom_seqs)
|
| 533 |
+
if len(seq) > cutoff
|
| 534 |
+
]
|
| 535 |
+
if long_chroms:
|
| 536 |
+
print("⚠️ Chromosomes exceeding Caduceus max tokens ({}):".format(cutoff))
|
| 537 |
+
for chrom, L in long_chroms:
|
| 538 |
+
print(f" {chrom}: {L} bases")
|
| 539 |
+
else:
|
| 540 |
+
print("All chromosomes ≤ Caduceus limit ({}).".format(cutoff))
|
| 541 |
+
|
| 542 |
+
chrom_embedder = get_embedder(args.chrom_model, device, for_dna=True)
|
| 543 |
+
out_chrom = Path(args.out_dir) / f"chrom_{args.chrom_model}.npy"
|
| 544 |
+
embed_and_save(chrom_seqs, chrom_ids, chrom_embedder, out_chrom)
|
| 545 |
+
else:
|
| 546 |
+
raise ValueError("No input for DNA embedding: provide a peak FASTA (default binding_peaks_unique.fa) or set --genome-json-dir for chromosome JSONs.")
|
| 547 |
+
|
| 548 |
+
|
| 549 |
+
#Load TF sequences
|
| 550 |
+
tf_seqs, tf_ids = [], []
|
| 551 |
+
for record in SeqIO.parse(args.tf_fasta, "fasta"):
|
| 552 |
+
tf_ids.append(record.id)
|
| 553 |
+
tf_seqs.append(str(record.seq))
|
| 554 |
+
|
| 555 |
+
# embed and save
|
| 556 |
+
tf_embedder = get_embedder(args.tf_model, device, for_dna=False)
|
| 557 |
+
out_tf = Path(args.out_dir) / f"tf_{args.tf_model}.npy"
|
| 558 |
+
embed_and_save(tf_seqs, tf_ids, tf_embedder, out_tf)
|
| 559 |
+
|
| 560 |
+
print("Done.")
|
dpacman/data_tasks/fimo/__init__.py
ADDED
|
File without changes
|
dpacman/data_tasks/fimo/post_fimo.py
CHANGED
|
@@ -8,6 +8,7 @@ import multiprocessing as mp
|
|
| 8 |
import numpy as np
|
| 9 |
import pandas as pd
|
| 10 |
import math
|
|
|
|
| 11 |
import rootutils
|
| 12 |
import polars as pl
|
| 13 |
from omegaconf import DictConfig
|
|
@@ -37,10 +38,18 @@ def format_sig(sig_vals, decimals=4, atol=0.0, rtol=1e-5):
|
|
| 37 |
return ",".join(out.tolist())
|
| 38 |
|
| 39 |
def _safe_process(task):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
try:
|
| 41 |
-
|
|
|
|
| 42 |
except Exception as e:
|
| 43 |
-
return ("err", (
|
| 44 |
|
| 45 |
def discover_chrom_folders(fimo_out_dir: Path) -> list[str]:
|
| 46 |
return sorted(
|
|
@@ -51,6 +60,10 @@ def discover_chrom_folders(fimo_out_dir: Path) -> list[str]:
|
|
| 51 |
def _process_one_row(row, dna: str, jaspar_boost: int = 100) -> dict:
|
| 52 |
# row order: TR, chrom, cstart, cend, peak_s, peak_e, chipscore, jaspar
|
| 53 |
trname, chrom, cstart, cend, peak_s, peak_e, chipscore, jaspar = row
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
seq = dna[cstart:cend]
|
| 56 |
L = len(seq)
|
|
@@ -62,7 +75,7 @@ def _process_one_row(row, dna: str, jaspar_boost: int = 100) -> dict:
|
|
| 62 |
peak_seq = ""
|
| 63 |
if ps < L and pe > 0:
|
| 64 |
scores[max(ps, 0):min(pe, L)] = chipscore
|
| 65 |
-
peak_seq =
|
| 66 |
|
| 67 |
# JASPAR hits (+jaspar_boost)
|
| 68 |
# only run if the peak is not np.nan
|
|
@@ -92,7 +105,7 @@ def _process_one_row(row, dna: str, jaspar_boost: int = 100) -> dict:
|
|
| 92 |
|
| 93 |
def _process_one_chrom_folder(task) -> pd.DataFrame:
|
| 94 |
"""Runs inside a worker process. Reads one chrom’s final.csv, loads DNA once, builds records."""
|
| 95 |
-
chrom_folder, fimo_out_dir_str, json_dir, jaspar_boost, output_parts_folder = task
|
| 96 |
|
| 97 |
# make unique logger for this process
|
| 98 |
log_dir = Path(HydraConfig.get().run.dir) / "logs"
|
|
@@ -120,6 +133,13 @@ def _process_one_chrom_folder(task) -> pd.DataFrame:
|
|
| 120 |
|
| 121 |
if df.empty:
|
| 122 |
return pd.DataFrame()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
# Normalize dtypes up-front
|
| 125 |
df["#chrom"] = df["#chrom"].astype(str)
|
|
@@ -176,170 +196,342 @@ def _process_one_chrom_folder(task) -> pd.DataFrame:
|
|
| 176 |
|
| 177 |
return savepath
|
| 178 |
|
| 179 |
-
def build_dataset_fast_mp(fimo_out_dir: Path, json_dir: str, debug: bool, max_workers: int | None, jaspar_boost: int = 100, output_parts_folder: str = None) -> pd.DataFrame:
|
| 180 |
"""
|
| 181 |
Multiprocessing to build final dataset across chromosomes
|
| 182 |
"""
|
| 183 |
chrom_folders = discover_chrom_folders(fimo_out_dir)
|
| 184 |
if not chrom_folders:
|
| 185 |
logger.warning(f"No chrom* folders with final.csv under {fimo_out_dir}")
|
| 186 |
-
return
|
| 187 |
|
| 188 |
if debug:
|
| 189 |
-
|
|
|
|
| 190 |
logger.info(f"DEBUG MODE: considering {chrom_folders[0]} only")
|
| 191 |
|
| 192 |
-
tasks = [(cf, str(fimo_out_dir), json_dir, jaspar_boost, output_parts_folder
|
|
|
|
| 193 |
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
for t in tasks:
|
| 199 |
status, payload = _safe_process(t)
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
logger.info(f"Using {procs} parallel workers for {len(tasks)} chrom folders")
|
| 213 |
-
|
| 214 |
-
paths, errs = [], []
|
| 215 |
-
with mp.Pool(processes=procs, maxtasksperchild=10) as pool:
|
| 216 |
-
for status, payload in pool.imap_unordered(_safe_process, tasks, chunksize=1):
|
| 217 |
-
if status == "ok" and isinstance(payload, pd.DataFrame) and not payload.empty:
|
| 218 |
-
paths.append(payload)
|
| 219 |
-
else:
|
| 220 |
-
errs.append(payload)
|
| 221 |
-
|
| 222 |
if errs:
|
| 223 |
for chrom, msg, tb in errs:
|
| 224 |
logger.error("Worker error for %s: %s\n%s", chrom, msg, tb)
|
| 225 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
-
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
def combine_processed_with_polars(
|
| 231 |
paths_to_processed_dfs: list[str],
|
| 232 |
idmap_path: str, # TSV with columns: From, Entry, Sequence
|
| 233 |
out_path: str, # e.g., "processed_out.parquet" or ".csv"
|
|
|
|
|
|
|
|
|
|
| 234 |
):
|
| 235 |
if not paths_to_processed_dfs:
|
| 236 |
logger.info("No records produced; nothing to write.")
|
| 237 |
return
|
| 238 |
|
| 239 |
-
# 1) Scan
|
| 240 |
-
lfs = [
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
|
| 248 |
-
#
|
| 249 |
idmap = (
|
| 250 |
pl.read_csv(idmap_path, separator="\t", columns=["From", "Entry", "Sequence"])
|
| 251 |
-
|
| 252 |
)
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
# 4) Per-chromosome unique peak index and peak_id
|
| 257 |
# (dense rank over peak_sequence per chrom; if you require "first-appearance" order,
|
| 258 |
# see the note below for an alternate approach.)
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
pl.col("peak_sequence").fill_null("").alias("peak_sequence"),
|
| 261 |
pl.col("chrom").cast(pl.Utf8),
|
| 262 |
])
|
| 263 |
-
|
| 264 |
pl.col("peak_sequence")
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
)
|
| 270 |
-
|
| 271 |
-
pl.format("
|
| 272 |
)
|
| 273 |
-
logger.info(f"Assigned unique
|
| 274 |
|
| 275 |
# 5) Build stable IDs for dna_sequence and tr_sequence based on first appearance
|
| 276 |
# (do this by creating small maps with unique(..., maintain_order=True) and joining)
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
)
|
| 284 |
-
logger.info(f"Assigned
|
| 285 |
-
tr_map = (
|
| 286 |
-
lf.select("tr_sequence")
|
| 287 |
-
.unique(maintain_order=True)
|
| 288 |
-
.with_row_index("tr_idx", offset=1)
|
| 289 |
-
.with_columns(pl.format("trseq{}", pl.col("tr_idx")).alias("tr_seqid"))
|
| 290 |
-
.select("tr_sequence", "tr_seqid")
|
| 291 |
-
)
|
| 292 |
-
logger.info(f"Assigned tr_sequence IDs")
|
| 293 |
-
lf = lf.join(dna_map, on="dna_sequence", how="left").join(tr_map, on="tr_sequence", how="left")
|
| 294 |
-
logger.info(f"Applied dna_sequence and tr_sequence IDs to main table")
|
| 295 |
-
|
| 296 |
-
# 6) Final ID and column selection
|
| 297 |
-
lf = lf.with_columns(
|
| 298 |
-
(pl.col("tr_seqid") + pl.lit("_") + pl.col("dna_seqid")).alias("ID")
|
| 299 |
-
)
|
| 300 |
-
cols = [
|
| 301 |
-
"ID", "tr_name", "peak_id", "chipscore", "total_jaspar_hits",
|
| 302 |
-
"dna_sequence", "tr_sequence", "scores"
|
| 303 |
-
]
|
| 304 |
-
lf_out = lf.select(cols)
|
| 305 |
-
#n_rows = lf_out.select(pl.len().alias("rows")).collect(streaming=True)["rows"][0]
|
| 306 |
-
logger.info(f"Selected final columns")
|
| 307 |
-
|
| 308 |
-
# 7) Write streaming to disk
|
| 309 |
-
out_path = str(out_path)
|
| 310 |
-
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
|
| 311 |
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
# NOTE: collect(streaming=True) still returns an in-memory DataFrame;
|
| 317 |
-
# prefer Parquet for very large outputs.
|
| 318 |
-
lf_out.collect(streaming=True).write_csv(out_path)
|
| 319 |
-
logger.info(f"Wrote csv file to {out_path}")
|
| 320 |
-
else:
|
| 321 |
-
# default to Parquet if no/unknown extension
|
| 322 |
-
lf_out.sink_parquet(out_path + ".parquet", compression="zstd", statistics=True)
|
| 323 |
-
logger.info(f"Wrote parquet file to {out_path}")
|
| 324 |
-
|
| 325 |
-
# 8) (Optional) small summary: unique peaks per chrom
|
| 326 |
-
peaks_per_chrom = (
|
| 327 |
-
lf.select("chrom", "peak_sequence")
|
| 328 |
-
.unique()
|
| 329 |
-
.group_by("chrom")
|
| 330 |
-
.len()
|
| 331 |
-
.collect(streaming=True)
|
| 332 |
-
.sort("chrom")
|
| 333 |
)
|
| 334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
|
| 336 |
-
logger.info("Schema:")
|
| 337 |
-
for name, dtype in lf_out.schema.items():
|
| 338 |
-
logger.info(f" {name}: {dtype}")
|
| 339 |
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
|
| 344 |
# Save the FIRST 1000 rows to CSV (streaming-friendly)
|
| 345 |
df_first = lf_out.limit(1000).collect(streaming=True)
|
|
@@ -347,6 +539,197 @@ def combine_processed_with_polars(
|
|
| 347 |
df_first.write_csv(example_out_path)
|
| 348 |
logger.info(f"Wrote first 1000 rows to {example_out_path} as an example")
|
| 349 |
|
| 350 |
def main(cfg: DictConfig):
|
| 351 |
debug = bool(cfg.data_task.debug)
|
| 352 |
json_dir = cfg.data_task.json_dir
|
|
@@ -357,24 +740,43 @@ def main(cfg: DictConfig):
|
|
| 357 |
|
| 358 |
logger.info(f"Debug: {debug}")
|
| 359 |
logger.info(f"Reading per-chrom final.csv under: {fimo_out_dir}")
|
|
|
|
|
| 360 |
|
| 361 |
-
|
|
|
|
| 362 |
paths_to_processed_dfs = build_dataset_fast_mp(
|
| 363 |
fimo_out_dir=fimo_out_dir,
|
| 364 |
json_dir=json_dir,
|
| 365 |
debug=debug,
|
| 366 |
max_workers=max_workers,
|
| 367 |
jaspar_boost=cfg.data_task.jaspar_boost,
|
| 368 |
-
output_parts_folder=output_parts_folder
|
|
|
|
| 369 |
)
|
| 370 |
else:
|
| 371 |
paths_to_processed_dfs = [output_parts_folder/x for x in os.listdir(output_parts_folder)] if output_parts_folder.exists() else []
|
| 372 |
-
|
|
|
|
| 373 |
logger.info(f"Combining {len(paths_to_processed_dfs)} processed parts with Polars")
|
| 374 |
combine_processed_with_polars(
|
| 375 |
paths_to_processed_dfs=paths_to_processed_dfs,
|
| 376 |
-
idmap_path=
|
| 377 |
-
out_path=
|
|
|
|
|
|
|
| 378 |
)
|
| 379 |
|
| 380 |
# Delete the folder that had the temporary DFs, don't need these
|
|
|
|
| 8 |
import numpy as np
|
| 9 |
import pandas as pd
|
| 10 |
import math
|
| 11 |
+
import json
|
| 12 |
import rootutils
|
| 13 |
import polars as pl
|
| 14 |
from omegaconf import DictConfig
|
|
|
|
| 38 |
return ",".join(out.tolist())
|
| 39 |
|
| 40 |
def _safe_process(task):
|
| 41 |
+
"""
|
| 42 |
+
Returns:
|
| 43 |
+
("ok", <path-to-output>) on success
|
| 44 |
+
("err", (chrom, msg, traceback)) on failure
|
| 45 |
+
"""
|
| 46 |
+
import traceback as tb
|
| 47 |
+
chrom = task[0]
|
| 48 |
try:
|
| 49 |
+
out_path = _process_one_chrom_folder(task) # MUST return a path (str/Path)
|
| 50 |
+
return ("ok", str(out_path))
|
| 51 |
except Exception as e:
|
| 52 |
+
return ("err", (chrom, repr(e), tb.format_exc()))
|
| 53 |
|
| 54 |
def discover_chrom_folders(fimo_out_dir: Path) -> list[str]:
|
| 55 |
return sorted(
|
|
|
|
| 60 |
def _process_one_row(row, dna: str, jaspar_boost: int = 100) -> dict:
|
| 61 |
# row order: TR, chrom, cstart, cend, peak_s, peak_e, chipscore, jaspar
|
| 62 |
trname, chrom, cstart, cend, peak_s, peak_e, chipscore, jaspar = row
|
| 63 |
+
|
| 64 |
+
# Very few chipscores are > 1000; standardize by capping them at 1000
|
| 65 |
+
if chipscore>=1000:
|
| 66 |
+
chipscore=1000
|
| 67 |
|
| 68 |
seq = dna[cstart:cend]
|
| 69 |
L = len(seq)
|
|
|
|
| 75 |
peak_seq = ""
|
| 76 |
if ps < L and pe > 0:
|
| 77 |
scores[max(ps, 0):min(pe, L)] = chipscore
|
| 78 |
+
peak_seq = seq[max(ps, 0):min(pe, L)]
|
| 79 |
|
| 80 |
# JASPAR hits (+jaspar_boost)
|
| 81 |
# only run if the peak is not np.nan
|
|
|
|
| 105 |
|
| 106 |
def _process_one_chrom_folder(task) -> pd.DataFrame:
|
| 107 |
"""Runs inside a worker process. Reads one chrom’s final.csv, loads DNA once, builds records."""
|
| 108 |
+
chrom_folder, fimo_out_dir_str, json_dir, jaspar_boost, output_parts_folder, keep_fimo_only = task
|
| 109 |
|
| 110 |
# make unique logger for this process
|
| 111 |
log_dir = Path(HydraConfig.get().run.dir) / "logs"
|
|
|
|
| 133 |
|
| 134 |
if df.empty:
|
| 135 |
return pd.DataFrame()
|
| 136 |
+
|
| 137 |
+
if keep_fimo_only:
|
| 138 |
+
logger.info(f"keep_fimo_only=True. Starting with {len(df)} rows.")
|
| 139 |
+
df = df.loc[
|
| 140 |
+
~df["jaspar"].isna()
|
| 141 |
+
].reset_index(drop=True)
|
| 142 |
+
logger.info(f"After keeping fimo hits only: {len(df)} rows remain.")
|
| 143 |
|
| 144 |
# Normalize dtypes up-front
|
| 145 |
df["#chrom"] = df["#chrom"].astype(str)
|
|
|
|
| 196 |
|
| 197 |
return savepath
|
| 198 |
|
| 199 |
+
def build_dataset_fast_mp(fimo_out_dir: Path, json_dir: str, debug: bool, max_workers: int | None, jaspar_boost: int = 100, output_parts_folder: str = None, keep_fimo_only: bool=False) -> pd.DataFrame:
|
| 200 |
"""
|
| 201 |
Multiprocessing to build final dataset across chromosomes
|
| 202 |
"""
|
| 203 |
chrom_folders = discover_chrom_folders(fimo_out_dir)
|
| 204 |
if not chrom_folders:
|
| 205 |
logger.warning(f"No chrom* folders with final.csv under {fimo_out_dir}")
|
| 206 |
+
return []
|
| 207 |
|
| 208 |
if debug:
|
| 209 |
+
# keep chromY if present, otherwise just the first
|
| 210 |
+
chrom_folders = [c for c in chrom_folders if c == "chromY"] or chrom_folders[:1]
|
| 211 |
logger.info(f"DEBUG MODE: considering {chrom_folders[0]} only")
|
| 212 |
|
| 213 |
+
tasks = [(cf, str(fimo_out_dir), json_dir, jaspar_boost, output_parts_folder, keep_fimo_only)
|
| 214 |
+
for cf in chrom_folders]
|
| 215 |
|
| 216 |
+
def _collect(status, payload, good_paths, errs):
|
| 217 |
+
if status == "ok":
|
| 218 |
+
p = Path(payload)
|
| 219 |
+
if p.exists():
|
| 220 |
+
good_paths.append(p)
|
| 221 |
+
else:
|
| 222 |
+
errs.append(("?", f"output missing: {p}", ""))
|
| 223 |
+
else:
|
| 224 |
+
chrom, msg, tb = payload
|
| 225 |
+
errs.append((chrom, msg, tb))
|
| 226 |
+
|
| 227 |
+
# Serial path (debug/deterministic or single task)
|
| 228 |
+
if (max_workers is not None and max_workers <= 1) or len(tasks) == 1:
|
| 229 |
+
good_paths: list[Path] = []
|
| 230 |
+
errs: list[tuple[str, str, str]] = []
|
| 231 |
for t in tasks:
|
| 232 |
status, payload = _safe_process(t)
|
| 233 |
+
_collect(status, payload, good_paths, errs)
|
| 234 |
+
else:
|
| 235 |
+
# Parallel path
|
| 236 |
+
procs = min(max_workers or mp.cpu_count(), len(tasks))
|
| 237 |
+
logger.info(f"Using {procs} parallel workers for {len(tasks)} chrom folders")
|
| 238 |
+
|
| 239 |
+
good_paths: list[Path] = []
|
| 240 |
+
errs: list[tuple[str, str, str]] = []
|
| 241 |
+
with mp.Pool(processes=procs, maxtasksperchild=10) as pool:
|
| 242 |
+
for status, payload in pool.imap_unordered(_safe_process, tasks, chunksize=1):
|
| 243 |
+
_collect(status, payload, good_paths, errs)
|
| 244 |
+
|
|
|
|
|
| 245 |
if errs:
|
| 246 |
for chrom, msg, tb in errs:
|
| 247 |
logger.error("Worker error for %s: %s\n%s", chrom, msg, tb)
|
| 248 |
+
# Optionally: raise RuntimeError("One or more workers failed; see logs.")
|
| 249 |
+
|
| 250 |
+
return [str(p) for p in good_paths]
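The serial and parallel paths above both funnel worker results through the same ("ok" | "err", payload) convention before `_collect` sorts them into good paths and errors. A minimal, self-contained sketch of that multiprocessing pattern (the `work` function and task list below are made up for illustration, not project code):

```python
# Minimal sketch of the imap_unordered + status-tuple pattern used above.
# `work` and `tasks` are hypothetical stand-ins.
import multiprocessing as mp
import traceback

def work(task):
    try:
        return ("ok", task * 2)  # pretend this writes a part file and returns its path
    except Exception as e:
        return ("err", (task, repr(e), traceback.format_exc()))

if __name__ == "__main__":
    tasks = list(range(8))
    good, errs = [], []
    with mp.Pool(processes=4, maxtasksperchild=10) as pool:
        for status, payload in pool.imap_unordered(work, tasks, chunksize=1):
            (good if status == "ok" else errs).append(payload)
    print(sorted(good), errs)
```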
|
| 251 |
+
|
| 252 |
+
def dedup_trname_peakseq_weighted(lf: pl.LazyFrame, seed: int = 42, outdir: str | None = None) -> pl.LazyFrame:
|
| 253 |
+
"""
|
| 254 |
+
Remove duplicate pairings of TR + peak sequence while keeping the chromosome distribution as close to the original as possible.
|
| 255 |
+
Use a seed so the results are reproducible
|
| 256 |
+
"""
|
| 257 |
+
# Normalize key dtypes
|
| 258 |
+
lf = lf.with_columns([
|
| 259 |
+
pl.col("chrom").cast(pl.Utf8),
|
| 260 |
+
pl.col("tr_name").cast(pl.Utf8),
|
| 261 |
+
pl.col("peak_sequence").fill_null("<NULL>").cast(pl.Utf8),
|
| 262 |
+
])
|
| 263 |
+
|
| 264 |
+
# --- BEFORE: counts/ratios (materialize tiny table)
|
| 265 |
+
pre_df = (
|
| 266 |
+
lf.group_by("chrom").len()
|
| 267 |
+
.with_columns((pl.col("len") / pl.col("len").sum()).alias("pre_ratio"))
|
| 268 |
+
.sort("chrom")
|
| 269 |
+
.collect()
|
| 270 |
+
)
|
| 271 |
+
|
| 272 |
+
# Expected #groups (must equal result rows)
|
| 273 |
+
exp_groups = lf.select(["tr_name","peak_sequence"]).unique().collect().height
|
| 274 |
+
logger.info(f"Expected groups: {exp_groups}")
|
| 275 |
+
|
| 276 |
+
# Tiny weights table back to lazy
|
| 277 |
+
pre_lf = pre_df.lazy().select(["chrom", "pre_ratio"]).rename({"pre_ratio": "w"})
|
| 278 |
+
|
| 279 |
+
# Weighted random score = log(w) + Gumbel(0,1); tie-break by a stable hash
|
| 280 |
+
TWO64 = 18446744073709551616.0
|
| 281 |
+
eps = 1e-12
|
| 282 |
+
|
| 283 |
+
h_expr = pl.concat_str(
|
| 284 |
+
[pl.lit(f"seed:{seed}"), pl.col("tr_name"), pl.col("peak_sequence"), pl.col("chrom")],
|
| 285 |
+
separator="|",
|
| 286 |
+
).hash().cast(pl.UInt64)
|
| 287 |
+
|
| 288 |
+
u_expr = (h_expr.cast(pl.Float64) + 1.0) / pl.lit(TWO64)
|
| 289 |
+
u_expr = pl.when(u_expr < eps).then(eps).when(u_expr > 1 - eps).then(1 - eps).otherwise(u_expr)
|
| 290 |
|
| 291 |
+
logw_expr = pl.when(pl.col("w").is_null() | (pl.col("w") <= 0)).then(eps).otherwise(pl.col("w")).log()
|
| 292 |
+
gumbel_expr = -(-u_expr.log()).log()
|
| 293 |
+
score_expr = (logw_expr + gumbel_expr).alias("_score")
|
| 294 |
+
hash_expr = h_expr.alias("_h")
|
| 295 |
+
|
| 296 |
+
# Attach weights & scores, globally sort, then unique on the keys (keep first)
|
| 297 |
+
lf_sorted = (
|
| 298 |
+
lf.join(pre_lf, on="chrom", how="left")
|
| 299 |
+
.with_columns([score_expr, hash_expr])
|
| 300 |
+
.sort(["_score","_h"], descending=[True, False])
|
| 301 |
+
)
|
| 302 |
+
|
| 303 |
+
lf_sel = lf_sorted.unique(subset=["tr_name","peak_sequence"], keep="first") \
|
| 304 |
+
.drop(["w","_score","_h"])
|
| 305 |
+
|
| 306 |
+
# --- AFTER: counts/ratios + save
|
| 307 |
+
post_df = (
|
| 308 |
+
lf_sel.group_by("chrom").len()
|
| 309 |
+
.with_columns((pl.col("len") / pl.col("len").sum()).alias("post_ratio"))
|
| 310 |
+
.sort("chrom")
|
| 311 |
+
.collect(streaming=True)
|
| 312 |
+
)
|
| 313 |
+
compare_df = (pre_df.select(["chrom","len","pre_ratio"]).rename({"len":"pre_n"})
|
| 314 |
+
.join(post_df.select(["chrom","len","post_ratio"]).rename({"len":"post_n"}),
|
| 315 |
+
on="chrom", how="full")
|
| 316 |
+
.fill_null(0)
|
| 317 |
+
.with_columns((pl.col("post_ratio") - pl.col("pre_ratio")).abs().alias("abs_delta"),
|
| 318 |
+
(100*(pl.col("post_ratio") - pl.col("pre_ratio"))/pl.col("pre_ratio")).abs().alias("pcnt_delta"))
|
| 319 |
+
.sort("chrom")
|
| 320 |
+
).to_pandas().drop(columns=["chrom_right"])
|
| 321 |
+
# --- Sanity: must keep exactly one per group
|
| 322 |
+
got_rows = lf_sel.select(pl.len()).collect()["len"][0]
|
| 323 |
+
if got_rows != exp_groups:
|
| 324 |
+
# optional: raise or just log
|
| 325 |
+
logger.warning(f"Dedup cardinality mismatch: expected {exp_groups}, got {got_rows}")
|
| 326 |
+
|
| 327 |
+
return lf_sel, compare_df
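The weighted dedup above relies on the Gumbel-max trick: each candidate row gets a score of log(weight) plus Gumbel noise, and keeping the per-group argmax samples one row with probability proportional to its chromosome weight, deterministically because the noise is derived from a hash. A rough standalone sketch of the idea, using a seeded RNG in place of the hash-derived uniform (made-up names, not project code):

```python
# Gumbel-max weighted sampling: argmax of log(w) - log(-log(u)) picks an item
# with probability proportional to w.
import math
import random
from collections import Counter

def gumbel_pick(candidates, weights, seed):
    rng = random.Random(seed)
    best, best_score = None, float("-inf")
    for cand, w in zip(candidates, weights):
        u = min(max(rng.random(), 1e-12), 1 - 1e-12)
        score = math.log(max(w, 1e-12)) - math.log(-math.log(u))
        if score > best_score:
            best, best_score = cand, score
    return best

# Repeating the draw with different seeds approximates the 3:1 weight ratio.
picks = Counter(gumbel_pick(["chr1", "chr2"], [0.75, 0.25], seed=s) for s in range(10000))
print(picks)  # roughly {'chr1': ~7500, 'chr2': ~2500}
```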
|
| 328 |
+
|
| 329 |
+
def write_map(lf: pl.LazyFrame, out_path: str, key: str, val: str, outname: str):
|
| 330 |
+
"""
|
| 331 |
+
Write the ID maps we created, spanning all the data. Will be called for:
|
| 332 |
+
- tr_seqid to tr_sequence
|
| 333 |
+
- peak_seqid to peak_sequence
|
| 334 |
+
- dna_seqid to dna_sequence
|
| 335 |
+
"""
|
| 336 |
+
maps_dir = Path(out_path).parent / "maps"
|
| 337 |
+
maps_dir.mkdir(parents=True, exist_ok=True)
|
| 338 |
+
|
| 339 |
+
df = (
|
| 340 |
+
lf.select([pl.col(key), pl.col(val)])
|
| 341 |
+
.unique()
|
| 342 |
+
.collect(streaming=True)
|
| 343 |
+
)
|
| 344 |
+
mapping = dict(zip(df[key].to_list(), df[val].to_list()))
|
| 345 |
+
with open(maps_dir / outname, "w") as f:
|
| 346 |
+
json.dump(mapping, f, indent=2)
|
| 347 |
+
|
| 348 |
|
| 349 |
def combine_processed_with_polars(
|
| 350 |
paths_to_processed_dfs: list[str],
|
| 351 |
idmap_path: str, # TSV with columns: From, Entry, Sequence
|
| 352 |
out_path: str, # e.g., "processed_out.parquet" or ".csv"
|
| 353 |
+
max_protein_len: int = None,
|
| 354 |
+
check_violations: bool = False,
|
| 355 |
+
seeds: list = [0]
|
| 356 |
):
|
| 357 |
if not paths_to_processed_dfs:
|
| 358 |
logger.info("No records produced; nothing to write.")
|
| 359 |
return
|
| 360 |
|
| 361 |
+
# 1) Scan each CSV and normalize dtypes BEFORE concat
|
| 362 |
+
lfs = []
|
| 363 |
+
for p in paths_to_processed_dfs:
|
| 364 |
+
lf_i = (
|
| 365 |
+
pl.scan_csv(p) # don't use infer_schema_length=0 here
|
| 366 |
+
.with_columns([
|
| 367 |
+
pl.col("chrom").cast(pl.Utf8),
|
| 368 |
+
pl.col("tr_name").cast(pl.Utf8),
|
| 369 |
+
pl.col("dna_sequence").cast(pl.Utf8),
|
| 370 |
+
pl.col("peak_sequence").cast(pl.Utf8),
|
| 371 |
+
pl.col("scores").cast(pl.Utf8),
|
| 372 |
+
pl.col("chipscore").cast(pl.Float64), # robust (int -> float OK)
|
| 373 |
+
pl.col("total_jaspar_hits").cast(pl.Int64),
|
| 374 |
+
])
|
| 375 |
+
)
|
| 376 |
+
lfs.append(lf_i)
|
| 377 |
+
|
| 378 |
+
# 2) Now concat; schemas match
|
| 379 |
+
lf_og = pl.concat(lfs, how="vertical")
|
| 380 |
|
| 381 |
+
# Read idmap to get list of unmapped TRs, those with no sequence
|
| 382 |
idmap = (
|
| 383 |
pl.read_csv(idmap_path, separator="\t", columns=["From", "Entry", "Sequence"])
|
| 384 |
+
.rename({"From": "tr_name", "Entry": "tr_uniprot", "Sequence": "tr_sequence"})
|
| 385 |
)
|
| 386 |
+
idmap = idmap.with_columns(
|
| 387 |
+
pl.col("tr_sequence").map_elements(lambda x: len(x), return_dtype=pl.Int64).alias("tr_len")
|
| 388 |
+
)
|
| 389 |
+
if max_protein_len is not None:
|
| 390 |
+
idmap = idmap.filter(
|
| 391 |
+
pl.col("tr_len")<=max_protein_len
|
| 392 |
+
)
|
| 393 |
+
logger.info(f"Filtered valid TRs to only those with len <= {max_protein_len}")
|
| 394 |
|
| 395 |
+
success_trs = list(idmap["tr_name"].unique())
|
| 396 |
+
logger.info(f"Total valid TRs: {len(success_trs)}")
|
| 397 |
+
|
| 398 |
+
# Filter lf to only have "success-TRs"
|
| 399 |
+
lf_og = lf_og.filter(pl.col("tr_name").is_in(success_trs))
|
| 400 |
+
|
| 401 |
+
# 2) We COULD drop duplicate occurrences of tr_name and peak_sequence, because these are the same peak. But here we're showing them in different contexts
|
| 402 |
+
# Instead, let's drop duplicate occurrences of the same tr_name and dna_sequence, because these are duplicate datapoints.
|
| 403 |
+
lf_out = None # the last one will be used to save the example file
|
| 404 |
+
out_path = str(out_path)
|
| 405 |
+
Path(out_path).parent.mkdir(parents=True, exist_ok=True)
|
| 406 |
+
|
| 407 |
+
lf_og = lf_og.join(idmap.lazy(), on="tr_name", how="left")
|
| 408 |
+
logger.info(f"Merged in UniProt IDs and TR sequences from UniProt ID mappping")
|
| 409 |
+
|
| 410 |
# 4) Per-chromosome unique peak index and peak_id
|
| 411 |
# (dense rank over peak_sequence per chrom; if you require "first-appearance" order,
|
| 412 |
# see the note below for an alternate approach.)
|
| 413 |
+
# Ensure types
|
| 414 |
+
lf_og = lf_og.with_columns([
|
| 415 |
+
pl.col("dna_sequence").cast(pl.Utf8),
|
| 416 |
+
pl.col("tr_sequence").cast(pl.Utf8),
|
| 417 |
+
pl.col("peak_sequence").cast(pl.Utf8)
|
| 418 |
+
])
|
| 419 |
+
|
| 420 |
+
# set chrom+ peak sequence IDs
|
| 421 |
+
lf_og = lf_og.with_columns([
|
| 422 |
pl.col("peak_sequence").fill_null("").alias("peak_sequence"),
|
| 423 |
pl.col("chrom").cast(pl.Utf8),
|
| 424 |
])
|
| 425 |
+
lf_og = lf_og.with_columns(
|
| 426 |
pl.col("peak_sequence")
|
| 427 |
+
.rank(method="dense") # 1,2,3,... per group
|
| 428 |
+
.over("chrom")
|
| 429 |
+
.cast(pl.Int64)
|
| 430 |
+
.alias("chrom_peak_idx")
|
| 431 |
)
|
| 432 |
+
lf_og = lf_og.with_columns(
|
| 433 |
+
pl.format("chr{}_peak{}", pl.col("chrom"), pl.col("chrom_peak_idx")).alias("chrpeak_id")
|
| 434 |
)
|
| 435 |
+
logger.info(f"Assigned unique chrpeak_ids per chromosome based on peak_sequence")
|
| 436 |
|
| 437 |
# 5) Build stable IDs for dna_sequence and tr_sequence based on first appearance
|
| 438 |
# (do this by creating small maps with unique(..., maintain_order=True) and joining)
|
| 439 |
+
# Sequence-based IDs without any joins
|
| 440 |
+
lf_og = (
|
| 441 |
+
lf_og.with_columns([
|
| 442 |
+
pl.col("dna_sequence").rank(method="dense").cast(pl.Int64).alias("dna_idx"),
|
| 443 |
+
pl.col("tr_sequence").rank(method="dense").cast(pl.Int64).alias("tr_idx"),
|
| 444 |
+
pl.col("peak_sequence").rank(method="dense").cast(pl.Int64).alias("peak_idx"),
|
| 445 |
+
])
|
| 446 |
+
.with_columns([
|
| 447 |
+
pl.format("dnaseq{}", pl.col("dna_idx")).alias("dna_seqid"),
|
| 448 |
+
pl.format("trseq{}", pl.col("tr_idx")).alias("tr_seqid"),
|
| 449 |
+
pl.format("peakseq{}", pl.col("peak_idx")).alias("peak_seqid"),
|
| 450 |
+
])
|
| 451 |
+
.drop(["dna_idx", "tr_idx", "peak_idx"])
|
| 452 |
)
|
| 453 |
+
logger.info(f"Assigned unique dna IDs, transcriptional regulator IDs, and peak IDs")
|
|
|
|
| 454 |
|
| 455 |
+
# Final ID (will never be None now)
|
| 456 |
+
lf_og = lf_og.with_columns(
|
| 457 |
+
pl.concat_str([pl.col("tr_seqid"), pl.lit("_"), pl.col("dna_seqid")], ignore_nulls=False)
|
| 458 |
+
.alias("ID")
|
|
|
|
| 459 |
)
|
| 460 |
+
|
| 461 |
+
# Write the maps
|
| 462 |
+
# call it for each mapping
|
| 463 |
+
write_map(lf_og, out_path=out_path, val="tr_sequence", key="tr_seqid", outname="tr_seqid_to_tr_sequence.json")
|
| 464 |
+
write_map(lf_og, out_path=out_path, val="peak_sequence", key="peak_seqid", outname="peak_seqid_to_peak_sequence.json")
|
| 465 |
+
write_map(lf_og, out_path=out_path, val="dna_sequence", key="dna_seqid", outname="dna_seqid_to_dna_sequence.json")
|
| 466 |
+
|
| 467 |
+
for seed in seeds:
|
| 468 |
+
# edit out path to include seed
|
| 469 |
+
if "." in out_path:
|
| 470 |
+
out_path_full = out_path[0:out_path.rindex(".")] + f"_seed{seed}" + out_path[out_path.rindex("."):]
|
| 471 |
+
else:
|
| 472 |
+
out_path_full = out_path + f"_seed{seed}.parquet"
|
| 473 |
+
|
| 474 |
+
lf, compare_df = dedup_trname_peakseq_weighted(lf_og, seed=seed)
|
| 475 |
+
logger.info(f"Dropped duplicate examples of tr_name + peak_sequence. Maintained chrom distribution with weighted random sampling (seed={seed}).")
|
| 476 |
+
|
| 477 |
+
# Save comparison df. Annotate with debug if it's a debug run
|
| 478 |
+
compare_df_path = str(Path(out_path).parent/f"chrom_ratio_compare_seed{seed}.csv")
|
| 479 |
+
if "debug" in out_path: compare_df_path = compare_df_path.replace(".csv", "_debug.csv")
|
| 480 |
+
compare_df.to_csv(compare_df_path, index=False)
|
| 481 |
+
|
| 482 |
+
# 3) Join small idmap (read eagerly; it’s tiny)
|
| 483 |
+
lf = lf.join(idmap.lazy(), on="tr_name", how="left")
|
| 484 |
+
logger.info(f"Merged in UniProt IDs and TR sequences from UniProt ID mappping")
|
| 485 |
|
|
|
|
|
|
|
|
|
|
| 486 |
|
| 487 |
+
logger.info(f"Applied dna_sequence and tr_sequence IDs to main table")
|
| 488 |
+
|
| 489 |
+
# Each sequence maps to exactly one id
|
| 490 |
+
viol1 = (
|
| 491 |
+
lf.select("dna_sequence", "dna_seqid").unique()
|
| 492 |
+
.group_by("dna_sequence").agg(pl.n_unique("dna_seqid").alias("n_ids"))
|
| 493 |
+
.filter(pl.col("n_ids") > 1)
|
| 494 |
+
.collect()
|
| 495 |
+
)
|
| 496 |
+
# Each id maps to exactly one sequence
|
| 497 |
+
viol2 = (
|
| 498 |
+
lf.select("dna_sequence", "dna_seqid").unique()
|
| 499 |
+
.group_by("dna_seqid").agg(pl.n_unique("dna_sequence").alias("n_seqs"))
|
| 500 |
+
.filter(pl.col("n_seqs") > 1)
|
| 501 |
+
.collect()
|
| 502 |
+
)
|
| 503 |
+
logger.info("viol1 rows (seq→>1 id): %d; viol2 rows (id→>1 seq): %d", viol1.height, viol2.height)
|
| 504 |
+
|
| 505 |
+
# No NULLs
|
| 506 |
+
nulls = lf.select([
|
| 507 |
+
pl.col("dna_seqid").is_null().sum().alias("null_dna_seqid"),
|
| 508 |
+
pl.col("tr_seqid").is_null().sum().alias("null_tr_seqid"),
|
| 509 |
+
pl.col("ID").is_null().sum().alias("null_ID"),
|
| 510 |
+
]).collect()
|
| 511 |
+
logger.info("NULL counts:\n%s", nulls)
|
| 512 |
+
|
| 513 |
+
# 6) Final column selection
|
| 514 |
+
cols = [
|
| 515 |
+
"ID", "tr_seqid", "dna_seqid", "peak_seqid", "chrpeak_id", "tr_name", "chipscore", "total_jaspar_hits",
|
| 516 |
+
"dna_sequence", "tr_sequence", "scores"
|
| 517 |
+
]
|
| 518 |
+
lf_out = lf.select(cols)
|
| 519 |
+
#n_rows = lf_out.select(pl.len().alias("rows")).collect(streaming=True)["rows"][0]
|
| 520 |
+
logger.info(f"Selected final columns")
|
| 521 |
+
|
| 522 |
+
# 7) Write streaming to disk
|
| 523 |
+
if out_path_full.lower().endswith(".parquet"):
|
| 524 |
+
lf_out.sink_parquet(out_path_full, compression="zstd", statistics=True, row_group_size=128_000)
|
| 525 |
+
logger.info(f"Wrote parquet file to {out_path_full}")
|
| 526 |
+
elif out_path_full.lower().endswith(".csv"):
|
| 527 |
+
# NOTE: collect(streaming=True) still returns an in-memory DataFrame;
|
| 528 |
+
# prefer Parquet for very large outputs.
|
| 529 |
+
lf_out.collect(streaming=True).write_csv(out_path_full)
|
| 530 |
+
logger.info(f"Wrote csv file to {out_path_full}")
|
| 531 |
+
else:
|
| 532 |
+
# default to Parquet if no/unknown extension
|
| 533 |
+
lf_out.sink_parquet(out_path_full + ".parquet", compression="zstd", statistics=True)
|
| 534 |
+
logger.info(f"Wrote parquet file to {out_path_full}")
|
| 535 |
|
| 536 |
# Save the FIRST 1000 rows to CSV (streaming-friendly)
|
| 537 |
df_first = lf_out.limit(1000).collect(streaming=True)
|
|
|
|
| 539 |
df_first.write_csv(example_out_path)
|
| 540 |
logger.info(f"Wrote first 1000 rows to {example_out_path} as an example")
|
| 541 |
|
| 542 |
+
# FIMO check
|
| 543 |
+
def get_reverse_complement(s):
|
| 544 |
+
"""
|
| 545 |
+
Returns 5' to 3' sequence of the reverse complement
|
| 546 |
+
"""
|
| 547 |
+
chars = list(s)
|
| 548 |
+
recon = []
|
| 549 |
+
rev_map = {
|
| 550 |
+
"a": "t",
|
| 551 |
+
"c": "g",
|
| 552 |
+
"t": "a",
|
| 553 |
+
"g": "c",
|
| 554 |
+
"A":"T",
|
| 555 |
+
"C": "G",
|
| 556 |
+
"T": "A",
|
| 557 |
+
"G": "C",
|
| 558 |
+
"n": "n",
|
| 559 |
+
"N": "N"
|
| 560 |
+
}
|
| 561 |
+
for c in chars:
|
| 562 |
+
recon += [rev_map[c]]
|
| 563 |
+
|
| 564 |
+
recon = "".join(recon)
|
| 565 |
+
return recon[::-1]
|
| 566 |
+
|
| 567 |
+
def extract_jaspar_motifs(row, reverse_complement=False):
|
| 568 |
+
s = row["scores"]
|
| 569 |
+
s = [int(x) for x in s.split(",")]
|
| 570 |
+
n_motifs = row["total_jaspar_hits"]
|
| 571 |
+
if n_motifs==0:
|
| 572 |
+
return ""
|
| 573 |
+
chipscore = row["chipscore"]
|
| 574 |
+
dna_seq = row["dna_sequence"]
|
| 575 |
+
if reverse_complement:
|
| 576 |
+
dna_seq = row["dna_sequence_rc"]
|
| 577 |
+
jaspar_indices = [i for i in list(range(len(s))) if s[i]>chipscore]
|
| 578 |
+
|
| 579 |
+
pred_motif = ""
|
| 580 |
+
for i in list(range(jaspar_indices[0], jaspar_indices[-1]+1)):
|
| 581 |
+
if not(i in jaspar_indices):
|
| 582 |
+
pred_motif += "-"
|
| 583 |
+
else:
|
| 584 |
+
pred_motif += dna_seq[i]
|
| 585 |
+
|
| 586 |
+
return pred_motif
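Assuming `extract_jaspar_motifs` as defined above is in scope, a worked example with a made-up row shows how positions whose score exceeds the chipscore (the jaspar-boosted positions) are stitched into the predicted motif, with "-" filling interior gaps:

```python
# Made-up row: chipscore 500, boosted scores at positions 2, 3 and 5.
row = {
    "scores": "500,500,600,600,500,600,500",
    "total_jaspar_hits": 1,
    "chipscore": 500,
    "dna_sequence": "ACGTACG",
}
print(extract_jaspar_motifs(row))  # -> "GT-C": bases at 2 and 3, a gap at 4, base at 5
```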
|
| 587 |
+
|
| 588 |
+
def clean_idmap(idmap_path):
|
| 589 |
+
"""
|
| 590 |
+
The raw ID Map from UniProt returned multiple results.
|
| 591 |
+
We went to ReMap and wrote down what the right mappings are in these cases.
|
| 592 |
+
"""
|
| 593 |
+
|
| 594 |
+
manual_map = {
|
| 595 |
+
"BACH1": "O14867",
|
| 596 |
+
"BAP1": "Q92560",
|
| 597 |
+
"BDP1": "A6H8Y1",
|
| 598 |
+
"BRF1": "Q92994",
|
| 599 |
+
"CUX1": "Q13948",
|
| 600 |
+
"DDX21": "Q9NR30",
|
| 601 |
+
"ERG": "P11308",
|
| 602 |
+
"HBP1": "O60381",
|
| 603 |
+
"KLF14": "Q8TD94",
|
| 604 |
+
"MED1": "Q15648",
|
| 605 |
+
"MED25": "Q71SY5",
|
| 606 |
+
"MGA": "Q8IWI9",
|
| 607 |
+
"NRF1": "Q16656",
|
| 608 |
+
"PAF1": "Q8N7H5",
|
| 609 |
+
"PDX1": "P52945",
|
| 610 |
+
"RBP2": "P50120",
|
| 611 |
+
"RLF": "Q13129",
|
| 612 |
+
"SP1": "P08047",
|
| 613 |
+
"SPIN1": "Q9Y657",
|
| 614 |
+
"STAG1": "Q8WVM7",
|
| 615 |
+
"TAF15": "Q92804",
|
| 616 |
+
"TCF3": "P15923",
|
| 617 |
+
"ZFP36": "P26651",
|
| 618 |
+
"EVI1": "Q03112",
|
| 619 |
+
"MCM2": "P49736"
|
| 620 |
+
}
|
| 621 |
+
idmap = pd.read_csv(idmap_path, sep="\t")
|
| 622 |
+
idmap["Remap_Entry"] = idmap.apply(lambda row: row["Entry"] if not(row["From"] in manual_map) else manual_map[row["From"]], axis=1)
|
| 623 |
+
idmap_remapped = idmap.loc[
|
| 624 |
+
idmap["Entry"]==idmap["Remap_Entry"]
|
| 625 |
+
].reset_index(drop=True).drop(columns=["Remap_Entry"])
|
| 626 |
+
|
| 627 |
+
assert len(idmap_remapped)==len(idmap_remapped["From"].unique())
|
| 628 |
+
logger.info(f"Total transcriptional regulators successfully mapped in UniProt: {len(idmap_remapped)}")
|
| 629 |
+
|
| 630 |
+
clean_idmap_path = Path(root)/"dpacman/data_files/processed/remap/idmapping_reviewed_true_processed_2025_08_11.tsv"
|
| 631 |
+
idmap_remapped.to_csv(clean_idmap_path, sep="\t")
|
| 632 |
+
return clean_idmap_path
|
| 633 |
+
|
| 634 |
+
def debug_fimo_check(path_to_chrom_fimo, path_to_processed_chrom, chrom="Y", json_dir=""):
|
| 635 |
+
"""
|
| 636 |
+
Make sure we are properly extracting fimo sequences.
|
| 637 |
+
"""
|
| 638 |
+
processed = pd.read_csv(path_to_processed_chrom)
|
| 639 |
+
processed["pred_motif_string"] = processed.apply(lambda row: extract_jaspar_motifs(row), axis=1)
|
| 640 |
+
processed["dna_sequence_rc"] = processed["dna_sequence"].apply(lambda x: get_reverse_complement(x))
|
| 641 |
+
processed["pred_motif_string_rc"] = processed.apply(lambda row: extract_jaspar_motifs(row, reverse_complement=True), axis=1)
|
| 642 |
+
processed_trs = processed["tr_name"].unique().tolist()
|
| 643 |
+
|
| 644 |
+
fimo = pd.read_csv(path_to_chrom_fimo)
|
| 645 |
+
fimo["input_tr"] = fimo["sequence_name"].str.split("_",expand=True)[2]
|
| 646 |
+
fimo_valid = fimo.loc[
|
| 647 |
+
(fimo["motif_alt_id"]==fimo["input_tr"]) &
|
| 648 |
+
(fimo["motif_alt_id"].isin(processed_trs))
|
| 649 |
+
].reset_index(drop=True)
|
| 650 |
+
logger.info(f"Total valid FIMO matches: {len(fimo_valid)}")
|
| 651 |
+
logger.info(f"Total transcriptional regulators being considered: {len(processed_trs)}")
|
| 652 |
+
|
| 653 |
+
# Load DNA
|
| 654 |
+
cache_debug = load_chrom_dna(chrom, {}, json_dir=json_dir)
|
| 655 |
+
|
| 656 |
+
# Randomly select a positive and negative row to test
|
| 657 |
+
pos_row = fimo_valid.loc[fimo_valid["strand"]=="+"].sample(n=1, random_state=44)
|
| 658 |
+
neg_row = fimo_valid.loc[fimo_valid["strand"]=="-"].sample(n=1, random_state=44)
|
| 659 |
+
|
| 660 |
+
# Iterate through the rows
|
| 661 |
+
for row in [pos_row, neg_row]:
|
| 662 |
+
indices = [int(x) for x in row["sequence_name"].item().split("_")[-2::]]
|
| 663 |
+
chipseq_start, chipseq_end = indices
|
| 664 |
+
strand = row['strand'].item()
|
| 665 |
+
logger.info(f"ChIPseq start: {chipseq_start}, ChIPseq end: {chipseq_end}")
|
| 666 |
+
logger.info(f"Strand: {strand}")
|
| 667 |
+
|
| 668 |
+
motif_start = chipseq_start + int(row["start"].item()) -1
|
| 669 |
+
motif_end = chipseq_start + int(row["stop"].item())
|
| 670 |
+
motif = row['matched_sequence'].item()
|
| 671 |
+
full_seq = cache_debug[chipseq_start:chipseq_end].upper()
|
| 672 |
+
logger.info(f"Full sequence: {full_seq}")
|
| 673 |
+
logger.info(f"Full sequence reverse complement: {get_reverse_complement(full_seq)}")
|
| 674 |
+
our_motif = cache_debug[motif_start:motif_end].upper()
|
| 675 |
+
|
| 676 |
+
if strand=="+":
|
| 677 |
+
logger.info(f"True motif found by FIMO: {motif}")
|
| 678 |
+
logger.info(f"Extracted motif on our end: {our_motif}")
|
| 679 |
+
logger.info(f"Correct extraction: {motif==our_motif}")
|
| 680 |
+
|
| 681 |
+
matching_rows = processed.loc[
|
| 682 |
+
(processed["dna_sequence"].str.contains(full_seq)) &
|
| 683 |
+
(processed["pred_motif_string"].str.contains(our_motif))
|
| 684 |
+
]
|
| 685 |
+
if strand=="-":
|
| 686 |
+
our_motif_rc = get_reverse_complement(our_motif)
|
| 687 |
+
|
| 688 |
+
logger.info(f"True motif found by FIMO: {motif}")
|
| 689 |
+
logger.info(f"Extracted motif on our end: {our_motif_rc}")
|
| 690 |
+
logger.info(f"Correct extraction: {motif==our_motif_rc}")
|
| 691 |
+
logger.info(f"Motif that will appear in the forward sequence: {our_motif}")
|
| 692 |
+
matching_rows = processed.loc[
|
| 693 |
+
(processed["dna_sequence"].str.contains(full_seq)) &
|
| 694 |
+
(processed["pred_motif_string"].str.contains(our_motif))
|
| 695 |
+
]
|
| 696 |
+
|
| 697 |
+
# Now find if there are rows with the same TR, and the same DNA sequence and motif
|
| 698 |
+
matching_row_trs = sorted(matching_rows["tr_name"].unique().tolist())
|
| 699 |
+
expected_tr = row['motif_alt_id'].item()
|
| 700 |
+
logger.info(f"TR from selected row: {expected_tr}")
|
| 701 |
+
logger.info(f"TRs with same motif: {','.join(matching_row_trs)}")
|
| 702 |
+
logger.info(f"Expected TR in list: {expected_tr in matching_row_trs}")
|
| 703 |
+
|
| 704 |
+
def debug_remap_check(remap_path, path_to_processed_chrom, chrom="Y", json_dir=""):
|
| 705 |
+
"""
|
| 706 |
+
For debugging mode: pick a random row from the processed ReMap file and make sure the sequence matches the one we extract here.
|
| 707 |
+
"""
|
| 708 |
+
remap = pd.read_csv(remap_path)
|
| 709 |
+
remap["ChIPStart"] = remap["ChIPStart"].astype(int)
|
| 710 |
+
remap["ChIPEnd"] = remap["ChIPEnd"].astype(int)
|
| 711 |
+
|
| 712 |
+
row = remap.loc[remap["#chrom"]=="Y"].sample(n=1, random_state=42)
|
| 713 |
+
|
| 714 |
+
start, end = row["ChIPStart"].item(), row["ChIPEnd"].item()
|
| 715 |
+
|
| 716 |
+
cache_debug = load_chrom_dna(chrom, {}, json_dir=json_dir)
|
| 717 |
+
test_seq = cache_debug[start:end].upper()
|
| 718 |
+
logger.info(f"Randomly sampled sequence ({len(test_seq)} nucleotides), chrY {start}:{end}\n\tsequence: {test_seq}")
|
| 719 |
+
should_find = remap.loc[
|
| 720 |
+
(remap["ChIPStart"]==start) &
|
| 721 |
+
(remap["ChIPEnd"]==end)
|
| 722 |
+
]["TR"].unique().tolist()
|
| 723 |
+
logger.info(f"Expect to find {len(should_find)} TRs: {', '.join(sorted(should_find))}")
|
| 724 |
+
|
| 725 |
+
processed = pd.read_csv(path_to_processed_chrom)
|
| 726 |
+
did_find = processed.loc[
|
| 727 |
+
processed["peak_sequence"]==test_seq
|
| 728 |
+
]["tr_name"].unique().tolist()
|
| 729 |
+
logger.info(f"Looked up same sequence in processed chrY file.\nFound TRs: {', '.join(sorted(did_find))}")
|
| 730 |
+
logger.info(f"found==expected: {did_find==should_find}")
|
| 731 |
+
|
| 732 |
+
|
| 733 |
def main(cfg: DictConfig):
|
| 734 |
debug = bool(cfg.data_task.debug)
|
| 735 |
json_dir = cfg.data_task.json_dir
|
|
|
|
| 740 |
|
| 741 |
logger.info(f"Debug: {debug}")
|
| 742 |
logger.info(f"Reading per-chrom final.csv under: {fimo_out_dir}")
|
| 743 |
+
|
| 744 |
+
# process the idmap
|
| 745 |
+
idmap_path=Path(root) / cfg.data_task.idmap_path
|
| 746 |
+
clean_idmap_path = clean_idmap(idmap_path)
|
| 747 |
|
| 748 |
+
# If we don't have temp files to process
|
| 749 |
+
if not(os.path.exists(output_parts_folder)) or (os.path.exists(output_parts_folder) and len(os.listdir(output_parts_folder))<24):
|
| 750 |
paths_to_processed_dfs = build_dataset_fast_mp(
|
| 751 |
fimo_out_dir=fimo_out_dir,
|
| 752 |
json_dir=json_dir,
|
| 753 |
debug=debug,
|
| 754 |
max_workers=max_workers,
|
| 755 |
jaspar_boost=cfg.data_task.jaspar_boost,
|
| 756 |
+
output_parts_folder=output_parts_folder,
|
| 757 |
+
keep_fimo_only=cfg.data_task.keep_fimo_only
|
| 758 |
)
|
| 759 |
else:
|
| 760 |
paths_to_processed_dfs = [output_parts_folder/x for x in os.listdir(output_parts_folder)] if output_parts_folder.exists() else []
|
| 761 |
+
|
| 762 |
+
# Debug methods: (1) make sure our peak sequences correspond to remap, (2) make sure our FIMO sequences correspond to FIMO results
|
| 763 |
+
out_path = str(processed_output_csv).replace(".csv", ".parquet")
|
| 764 |
+
if debug:
|
| 765 |
+
debug_remap_check(remap_path=Path(root) / cfg.data_task.remap_path,
|
| 766 |
+
path_to_processed_chrom=Path(output_parts_folder)/"chromY_processed.csv",
|
| 767 |
+
chrom="Y", json_dir=json_dir)
|
| 768 |
+
debug_fimo_check(path_to_chrom_fimo=Path(root) / cfg.data_task.fimo_out_dir / "chromY" / "fimo_annotations.csv",
|
| 769 |
+
path_to_processed_chrom=Path(output_parts_folder)/"chromY_processed.csv",
|
| 770 |
+
chrom="Y", json_dir=json_dir)
|
| 771 |
+
out_path = out_path.replace(".parquet", "_debug.parquet")
|
| 772 |
+
|
| 773 |
logger.info(f"Combining {len(paths_to_processed_dfs)} processed parts with Polars")
|
| 774 |
combine_processed_with_polars(
|
| 775 |
paths_to_processed_dfs=paths_to_processed_dfs,
|
| 776 |
+
idmap_path=clean_idmap_path,
|
| 777 |
+
out_path=out_path,
|
| 778 |
+
max_protein_len=cfg.data_task.max_protein_len,
|
| 779 |
+
seeds=cfg.data_task.seeds
|
| 780 |
)
|
| 781 |
|
| 782 |
# Delete the folder that had the temporary DFs, don't need these
|
dpacman/data_tasks/fimo/run_fimo.py
CHANGED
|
@@ -152,17 +152,26 @@ def run_fimo_chunk(cfg):
|
|
| 152 |
def annotate_with_fimo(df, fdf):
|
| 153 |
df = df.reset_index().rename(columns={"index":"idx"})
|
| 154 |
df["sequence_name"] = df["idx"].astype(str) + "_chr" + df["#chrom"] + "_" + df["TR"] + "_" + df["contextStart"].astype(str) + "_" + df["contextEnd"].astype(str) #construt it the same way as headers
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
fdf["
|
| 158 |
-
|
| 159 |
-
fdf["
|
|
|
|
|
| 160 |
)
|
| 161 |
-
|
|
|
|
| 162 |
df["jaspar"] = df["sequence_name"].map(agg).fillna("")
|
| 163 |
return df
|
| 164 |
|
| 165 |
-
|
| 166 |
def main(cfg: DictConfig):
|
| 167 |
"""
|
| 168 |
Main method for running FIMO analysis, searching JASPAR motifs against ChIP-seq peaks
|
|
|
|
| 152 |
def annotate_with_fimo(df, fdf):
|
| 153 |
df = df.reset_index().rename(columns={"index":"idx"})
|
| 154 |
df["sequence_name"] = df["idx"].astype(str) + "_chr" + df["#chrom"] + "_" + df["TR"] + "_" + df["contextStart"].astype(str) + "_" + df["contextEnd"].astype(str) #construt it the same way as headers
|
| 155 |
+
|
| 156 |
+
# Crucial: filter FDF results to only rows where the TF whose motif was found actually matches the TF that was detected there.
|
| 157 |
+
fdf["input_tr"] = fdf["sequence_name"].str.split("_",expand=True)[2]
|
| 158 |
+
true_matches = fdf.loc[
|
| 159 |
+
fdf["motif_alt_id"]==fdf["input_tr"]
|
| 160 |
+
].reset_index(drop=True)
|
| 161 |
+
logger.info(f"Length of full returned FIMO results: {len(fdf)}")
|
| 162 |
+
logger.info(f"Length of true matches, where the FIMO tr and the input tr match: {len(true_matches)}")
|
| 163 |
+
|
| 164 |
+
true_matches = true_matches.merge(df[["sequence_name", "contextStart"]], on="sequence_name", how="left")
|
| 165 |
+
true_matches["genomic_start"] = true_matches["contextStart"] + true_matches["start"] - 1
|
| 166 |
+
true_matches["genomic_end"] = true_matches["contextStart"] + true_matches["stop"]
|
| 167 |
+
true_matches["coord"] = (
|
| 168 |
+
true_matches["genomic_start"].astype(str) + "-" + true_matches["genomic_end"].astype(str)
|
| 169 |
)
|
| 170 |
+
|
| 171 |
+
agg = true_matches.groupby("sequence_name")["coord"].agg(lambda hits: ",".join(hits))
|
| 172 |
df["jaspar"] = df["sequence_name"].map(agg).fillna("")
|
| 173 |
return df
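The coordinate arithmetic above assumes FIMO reports start/stop as 1-based, inclusive positions within the scanned context sequence, so converting to 0-based, half-open genomic coordinates uses contextStart + start - 1 and contextStart + stop. A small worked check with hypothetical numbers:

```python
# Hypothetical numbers: a context window starting at genomic position 1000 (0-based)
# and a FIMO hit at start=5, stop=12 (1-based, inclusive within the window).
context_start, fimo_start, fimo_stop = 1000, 5, 12
genomic_start = context_start + fimo_start - 1   # 1004 (0-based)
genomic_end = context_start + fimo_stop          # 1012 (half-open)
context = "N" * 4 + "ACGTTGCA" + "N" * 10        # the hit occupies window offsets 4..11
assert context[fimo_start - 1:fimo_stop] == "ACGTTGCA"
assert genomic_end - genomic_start == len("ACGTTGCA")
print(genomic_start, genomic_end)
```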
|
| 174 |
|
|
|
|
| 175 |
def main(cfg: DictConfig):
|
| 176 |
"""
|
| 177 |
Main method for running FIMO analysis, searching JASPAR motifs against ChIP-seq peaks
|
dpacman/data_tasks/split/__init__.py
ADDED
|
File without changes
|
dpacman/data_tasks/split/remap.py
ADDED
|
@@ -0,0 +1,512 @@
|
|
|
|
|
| 1 |
+
from collections import Counter, defaultdict
|
| 2 |
+
from ortools.linear_solver import pywraplp
|
| 3 |
+
import random
|
| 4 |
+
import logging
|
| 5 |
+
from omegaconf import DictConfig
|
| 6 |
+
import rootutils
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import os
|
| 10 |
+
import numpy as np
|
| 11 |
+
from sklearn.model_selection import train_test_split
|
| 12 |
+
|
| 13 |
+
root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
def split_bipartite_fast(
|
| 17 |
+
dna_clusters,
|
| 18 |
+
split_names=("train","val","test"),
|
| 19 |
+
ratios=(0.8,0.1,0.1),
|
| 20 |
+
):
|
| 21 |
+
# use sklearn
|
| 22 |
+
test_size_1 = 0.2
|
| 23 |
+
test_size_2 = 0.5
|
| 24 |
+
logger.info(f"\tPerforming first split: all clusters -> train clusters ({round(1-test_size_1,3)}) and other ({test_size_1})")
|
| 25 |
+
X = dna_clusters
|
| 26 |
+
y = [0]*len(dna_clusters)
|
| 27 |
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size_1, random_state=0)
|
| 28 |
+
logger.info(f"\tPerforming second split: other -> val clusters ({round(1-test_size_2,3)}) and test clusters ({test_size_2})")
|
| 29 |
+
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_size_2, random_state=0)
|
| 30 |
+
|
| 31 |
+
dna_assign = {}
|
| 32 |
+
for x in X_train:
|
| 33 |
+
dna_assign[x] = "train"
|
| 34 |
+
for x in X_val:
|
| 35 |
+
dna_assign[x] = "val"
|
| 36 |
+
for x in X_test:
|
| 37 |
+
dna_assign[x] = "test"
|
| 38 |
+
|
| 39 |
+
kept_by_split = {'train': len(X_train), 'val': len(X_val), 'test': len(X_test)}
|
| 40 |
+
return dna_assign, kept_by_split
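The two-stage sklearn split above reaches roughly 80/10/10 by holding out 20% first and then cutting that holdout in half. A quick check with made-up cluster IDs:

```python
# 1000 clusters -> 800 train, then the 200 held-out clusters split 50/50 into val/test.
from sklearn.model_selection import train_test_split

clusters = [f"dnaclust{i}" for i in range(1000)]
train, rest = train_test_split(clusters, test_size=0.2, random_state=0)
val, test = train_test_split(rest, test_size=0.5, random_state=0)
print(len(train), len(val), len(test))  # 800 100 100
```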
|
| 41 |
+
|
| 42 |
+
def split_bipartite_with_ratios_and_leaky(
|
| 43 |
+
edges,
|
| 44 |
+
split_names=("train","val","test"),
|
| 45 |
+
ratios=(0.8, 0.1, 0.1),
|
| 46 |
+
require_nonempty=False,
|
| 47 |
+
ratio_tolerance=None, # None = soft ratios only; 0.0 = exact band (use with care)
|
| 48 |
+
bigM=None,
|
| 49 |
+
shuffle_within_pair=False,
|
| 50 |
+
seed=0,
|
| 51 |
+
test_edges_must=None, # NEW: list of (tf,dna) with duplicates OR dict {(tf,dna): count}
|
| 52 |
+
):
|
| 53 |
+
"""
|
| 54 |
+
edges: list of (tf_cluster_id, dna_cluster_id). Duplicates allowed (-> weights).
|
| 55 |
+
test_edges_must: None, list of pairs, or dict {(tf,dna): required_count}.
|
| 56 |
+
- If a pair appears with required_count > 0, at least that many examples MUST be kept in TEST.
|
| 57 |
+
- This implicitly pins both clusters of that pair to TEST (cluster exclusivity).
|
| 58 |
+
|
| 59 |
+
Returns:
|
| 60 |
+
tf_assign: {tf_cluster -> split}
|
| 61 |
+
dna_assign: {dna_cluster -> split}
|
| 62 |
+
kept_by_split: {split -> kept_count} (train/val/test only)
|
| 63 |
+
total_kept: int
|
| 64 |
+
split_to_indices: {split -> [input indices]} including 'leaky_test'
|
| 65 |
+
split_to_edges: {split -> [(tf,dna), ...]} including 'leaky_test'
|
| 66 |
+
"""
|
| 67 |
+
# Aggregate counts per pair
|
| 68 |
+
w = Counter(edges)
|
| 69 |
+
tfs = {t for (t, _) in w}
|
| 70 |
+
dnas = {d for (_, d) in w}
|
| 71 |
+
S = list(split_names)
|
| 72 |
+
rs = dict(zip(S, ratios))
|
| 73 |
+
N = sum(w.values())
|
| 74 |
+
if bigM is None:
|
| 75 |
+
bigM = 1000 * max(1, N)
|
| 76 |
+
|
| 77 |
+
# Index original edges so we can return a per-example split
|
| 78 |
+
pair_to_indices = defaultdict(list)
|
| 79 |
+
for idx, (c, d) in enumerate(edges):
|
| 80 |
+
pair_to_indices[(c, d)].append(idx)
|
| 81 |
+
|
| 82 |
+
if shuffle_within_pair:
|
| 83 |
+
rng = random.Random(seed)
|
| 84 |
+
for key in pair_to_indices:
|
| 85 |
+
rng.shuffle(pair_to_indices[key])
|
| 86 |
+
|
| 87 |
+
# Parse required test edges
|
| 88 |
+
req_test = Counter()
|
| 89 |
+
if test_edges_must:
|
| 90 |
+
if isinstance(test_edges_must, dict):
|
| 91 |
+
for k, v in test_edges_must.items():
|
| 92 |
+
if not isinstance(k, tuple) or len(k) != 2:
|
| 93 |
+
raise ValueError("test_edges_must dict keys must be (tf_cluster, dna_cluster)")
|
| 94 |
+
if v < 0:
|
| 95 |
+
raise ValueError("required_count must be non-negative")
|
| 96 |
+
if v:
|
| 97 |
+
req_test[k] += int(v)
|
| 98 |
+
else:
|
| 99 |
+
# assume iterable of pairs
|
| 100 |
+
req_test = Counter(test_edges_must)
|
| 101 |
+
# Validate against available counts
|
| 102 |
+
for pair, req in req_test.items():
|
| 103 |
+
if pair not in w:
|
| 104 |
+
raise ValueError(f"Required test pair {pair} not present in edges.")
|
| 105 |
+
if req > w[pair]:
|
| 106 |
+
raise ValueError(
|
| 107 |
+
f"Required count {req} for {pair} exceeds available {w[pair]}."
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
# Build solver
|
| 111 |
+
solver = pywraplp.Solver.CreateSolver("CBC")
|
| 112 |
+
if solver is None:
|
| 113 |
+
raise RuntimeError("Could not create CBC solver.")
|
| 114 |
+
|
| 115 |
+
# Binary cluster assignments
|
| 116 |
+
x = {(c,s): solver.BoolVar(f"x[{c},{s}]") for c in tfs for s in S}
|
| 117 |
+
y = {(d,s): solver.BoolVar(f"y[{d},{s}]") for d in dnas for s in S}
|
| 118 |
+
|
| 119 |
+
# Each cluster in exactly one split
|
| 120 |
+
for c in tfs:
|
| 121 |
+
solver.Add(sum(x[c,s] for s in S) == 1)
|
| 122 |
+
for d in dnas:
|
| 123 |
+
solver.Add(sum(y[d,s] for s in S) == 1)
|
| 124 |
+
|
| 125 |
+
# Integer kept counts per pair and split (allow partial within-pair)
|
| 126 |
+
k = {((c,d),s): solver.IntVar(0, w[(c,d)], f"k[{c},{d},{s}]") for (c,d) in w for s in S}
|
| 127 |
+
|
| 128 |
+
# Only keep in split s if both endpoint clusters are assigned to s
|
| 129 |
+
for (c,d), wt in w.items():
|
| 130 |
+
for s in S:
|
| 131 |
+
solver.Add(k[((c,d),s)] <= wt * x[c,s])
|
| 132 |
+
solver.Add(k[((c,d),s)] <= wt * y[d,s])
|
| 133 |
+
|
| 134 |
+
# Enforce minimum kept counts in TEST for required pairs
|
| 135 |
+
for (c,d), req in req_test.items():
|
| 136 |
+
solver.Add(k[((c,d), "test")] >= req)
|
| 137 |
+
|
| 138 |
+
# Optional: ensure each split has at least one cluster (feasibility depends on counts)
|
| 139 |
+
if require_nonempty:
|
| 140 |
+
for s in S:
|
| 141 |
+
solver.Add(
|
| 142 |
+
sum(x[c,s] for c in tfs) + sum(y[d,s] for d in dnas) >= 1
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
# Kept counts per split and total
|
| 146 |
+
K = {s: solver.IntVar(0, N, f"K[{s}]") for s in S}
|
| 147 |
+
for s in S:
|
| 148 |
+
solver.Add(K[s] == sum(k[((c,d),s)] for (c,d) in w))
|
| 149 |
+
T = solver.IntVar(0, N, "T")
|
| 150 |
+
solver.Add(T == sum(K[s] for s in S))
|
| 151 |
+
|
| 152 |
+
# Ratio deviation: K_s - r_s * T = d+ - d-
|
| 153 |
+
dpos = {s: solver.NumVar(0, solver.infinity(), f"dpos[{s}]") for s in S}
|
| 154 |
+
dneg = {s: solver.NumVar(0, solver.infinity(), f"dneg[{s}]") for s in S}
|
| 155 |
+
for s in S:
|
| 156 |
+
solver.Add(K[s] - rs[s]*T == dpos[s] - dneg[s])
|
| 157 |
+
|
| 158 |
+
# Optional hard band around target ratios
|
| 159 |
+
if ratio_tolerance is not None:
|
| 160 |
+
eps = float(ratio_tolerance)
|
| 161 |
+
for s in S:
|
| 162 |
+
solver.Add(K[s] >= (rs[s] - eps) * T)
|
| 163 |
+
solver.Add(K[s] <= (rs[s] + eps) * T)
|
| 164 |
+
|
| 165 |
+
# Objective: maximize T then minimize total deviation
|
| 166 |
+
obj = solver.Objective()
|
| 167 |
+
obj.SetMaximization()
|
| 168 |
+
obj.SetCoefficient(T, float(bigM))
|
| 169 |
+
for s in S:
|
| 170 |
+
obj.SetCoefficient(dpos[s], -1.0)
|
| 171 |
+
obj.SetCoefficient(dneg[s], -1.0)
|
| 172 |
+
|
| 173 |
+
status = solver.Solve()
|
| 174 |
+
if status not in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE):
|
| 175 |
+
raise RuntimeError("No feasible solution (check ratio_tolerance vs. required test edges).")
|
| 176 |
+
|
| 177 |
+
# Read cluster assignments
|
| 178 |
+
tf_assign = {c: next(s for s in S if x[c,s].solution_value() > 0.5) for c in tfs}
|
| 179 |
+
dna_assign = {d: next(s for s in S if y[d,s].solution_value() > 0.5) for d in dnas}
|
| 180 |
+
|
| 181 |
+
# Kept counts per split
|
| 182 |
+
kept_by_split = {s: int(round(K[s].solution_value())) for s in S}
|
| 183 |
+
total_kept = int(round(T.solution_value()))
|
| 184 |
+
|
| 185 |
+
# ---- Build per-example split assignment (including 'leaky_test') ----
|
| 186 |
+
split_to_indices = {s: [] for s in S}
|
| 187 |
+
remaining_indices = {pair: list(pair_to_indices[pair]) for pair in pair_to_indices}
|
| 188 |
+
|
| 189 |
+
# Allocate the kept examples per split (train/val/test)
|
| 190 |
+
for (c,d), wt in w.items():
|
| 191 |
+
for s in S:
|
| 192 |
+
cnt = int(round(k[((c,d),s)].solution_value()))
|
| 193 |
+
if cnt > 0:
|
| 194 |
+
take = remaining_indices[(c,d)][:cnt]
|
| 195 |
+
split_to_indices[s].extend(take)
|
| 196 |
+
remaining_indices[(c,d)] = remaining_indices[(c,d)][cnt:]
|
| 197 |
+
|
| 198 |
+
# Everything left becomes leaky_test
|
| 199 |
+
leaky_indices = []
|
| 200 |
+
for pair, idxs in remaining_indices.items():
|
| 201 |
+
if idxs:
|
| 202 |
+
leaky_indices.extend(idxs)
|
| 203 |
+
|
| 204 |
+
split_to_indices["leaky_test"] = leaky_indices
|
| 205 |
+
split_to_edges = {s: [edges[i] for i in split_to_indices[s]] for s in split_to_indices}
|
| 206 |
+
|
| 207 |
+
return tf_assign, dna_assign, kept_by_split, total_kept, split_to_indices, split_to_edges
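A hypothetical toy call to `split_bipartite_with_ratios_and_leaky` (requires ortools; the cluster names are invented): duplicates in the edge list act as example counts, and `test_edges_must` pins at least that many examples of a pair, and therefore both of its clusters, into the test split.

```python
# Toy bipartite graph of (tf_cluster, dna_cluster) edges; duplicates are example counts.
edges = [
    ("tfA", "dna1"), ("tfA", "dna1"), ("tfA", "dna2"),
    ("tfB", "dna3"), ("tfB", "dna4"),
    ("tfC", "dna5"), ("tfC", "dna5"), ("tfC", "dna6"),
    ("tfD", "dna7"),
]
tf_assign, dna_assign, kept, total, idx, split_edges = split_bipartite_with_ratios_and_leaky(
    edges,
    ratios=(0.8, 0.1, 0.1),
    test_edges_must={("tfD", "dna7"): 1},  # force at least one tfD/dna7 example into test
)
print(kept, total)
print(idx["leaky_test"])  # examples dropped from train/val/test to preserve exclusivity
```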
|
| 208 |
+
|
| 209 |
+
from collections import Counter, defaultdict
|
| 210 |
+
import random
|
| 211 |
+
|
| 212 |
+
class DSU:
|
| 213 |
+
def __init__(self): self.p = {}
|
| 214 |
+
def find(self, x):
|
| 215 |
+
if x not in self.p: self.p[x] = x
|
| 216 |
+
while self.p[x] != x:
|
| 217 |
+
self.p[x] = self.p[self.p[x]]
|
| 218 |
+
x = self.p[x]
|
| 219 |
+
return x
|
| 220 |
+
def union(self, a,b):
|
| 221 |
+
ra, rb = self.find(a), self.find(b)
|
| 222 |
+
if ra != rb: self.p[rb] = ra
|
| 223 |
+
|
| 224 |
+
def split_bipartite_by_components(
|
| 225 |
+
edges,
|
| 226 |
+
split_names=("train","val","test"),
|
| 227 |
+
ratios=(0.8,0.1,0.1),
|
| 228 |
+
seed=0,
|
| 229 |
+
require_nonempty=False,
|
| 230 |
+
test_edges_must=None, # None, list[(tf,dna)], or dict{(tf,dna): count}
|
| 231 |
+
):
|
| 232 |
+
"""
|
| 233 |
+
Guarantees exclusivity: each TF cluster and DNA cluster appears in at most one split.
|
| 234 |
+
Strategy: find connected components in the TF–DNA bipartite graph and assign components wholesale.
|
| 235 |
+
"""
|
| 236 |
+
rng = random.Random(seed)
|
| 237 |
+
w = Counter(edges) # multiplicities per pair
|
| 238 |
+
if not w: raise ValueError("No edges.")
|
| 239 |
+
|
| 240 |
+
# 1) Build components with Union-Find (prefix to keep TF/DNA namespaces disjoint)
|
| 241 |
+
dsu = DSU()
|
| 242 |
+
for (tf, dna) in w:
|
| 243 |
+
dsu.union(("T", tf), ("D", dna))
|
| 244 |
+
comp_pairs = defaultdict(list)
|
| 245 |
+
comp_weight = defaultdict(int)
|
| 246 |
+
for (tf, dna), cnt in w.items():
|
| 247 |
+
root = dsu.find(("T", tf)) # component id = root of TF endpoint
|
| 248 |
+
comp_pairs[root].append((tf, dna))
|
| 249 |
+
comp_weight[root] += cnt
|
| 250 |
+
|
| 251 |
+
comps = list(comp_pairs.keys())
|
| 252 |
+
C = len(comps)
|
| 253 |
+
S = list(split_names)
|
| 254 |
+
rs = dict(zip(S, ratios))
|
| 255 |
+
N = sum(comp_weight[c] for c in comps)
|
| 256 |
+
target = {s: int(round(rs[s] * N)) for s in S}
|
| 257 |
+
|
| 258 |
+
# 2) Pin components that contain required TEST pairs
|
| 259 |
+
pinned = {} # comp_root -> pinned_split ("test")
|
| 260 |
+
if test_edges_must:
|
| 261 |
+
req = Counter(test_edges_must) if not isinstance(test_edges_must, dict) else Counter(test_edges_must)
|
| 262 |
+
# Map each required pair to its component, ensure feasibility
|
| 263 |
+
for (tf, dna), r in req.items():
|
| 264 |
+
if (tf, dna) not in w:
|
| 265 |
+
raise ValueError(f"Required pair {(tf,dna)} not present.")
|
| 266 |
+
if r > w[(tf, dna)]:
|
| 267 |
+
raise ValueError(f"Required count {r} for {(tf,dna)} exceeds available {w[(tf,dna)]}.")
|
| 268 |
+
comp = dsu.find(("T", tf))
|
| 269 |
+
if comp in pinned and pinned[comp] != "test":
|
| 270 |
+
raise ValueError(f"Component conflict: already pinned to {pinned[comp]}, but {(tf,dna)} demands test.")
|
| 271 |
+
pinned[comp] = "test"
|
| 272 |
+
# NOTE: pinning a pair pins the WHOLE component to test (to keep exclusivity).
|
| 273 |
+
# If you only want some edges kept in test and discard the rest, handle below when materializing.
|
| 274 |
+
|
| 275 |
+
# 3) Assign components greedily by deficit
|
| 276 |
+
kept_by_split = {s: 0 for s in S}
|
| 277 |
+
comp_assign = {} # comp_root -> split
|
| 278 |
+
|
| 279 |
+
# First assign pinned comps
|
| 280 |
+
for comp, split in pinned.items():
|
| 281 |
+
comp_assign[comp] = split
|
| 282 |
+
kept_by_split[split] += comp_weight[comp]
|
| 283 |
+
|
| 284 |
+
# Sort remaining components by descending weight
|
| 285 |
+
remaining = [c for c in comps if c not in comp_assign]
|
| 286 |
+
remaining.sort(key=lambda c: comp_weight[c], reverse=True)
|
| 287 |
+
|
| 288 |
+
# Ensure nonempty splits if requested (seed with largest remaining comps)
|
| 289 |
+
if require_nonempty:
|
| 290 |
+
seeds = remaining[:min(len(S), len(remaining))]
|
| 291 |
+
for comp, s in zip(seeds, S):
|
| 292 |
+
comp_assign[comp] = s
|
| 293 |
+
kept_by_split[s] += comp_weight[comp]
|
| 294 |
+
remaining = [c for c in remaining if c not in comp_assign]
|
| 295 |
+
|
| 296 |
+
for comp in remaining:
|
| 297 |
+
# choose split with largest deficit (target - current)
|
| 298 |
+
deficits = {s: target[s] - kept_by_split[s] for s in S}
|
| 299 |
+
best = max(deficits, key=lambda s: deficits[s])
|
| 300 |
+
comp_assign[comp] = best
|
| 301 |
+
kept_by_split[best] += comp_weight[comp]
|
| 302 |
+
|
| 303 |
+
total_kept = sum(kept_by_split.values())
|
| 304 |
+
|
| 305 |
+
# 4) Materialize per-example indices (and verify exclusivity)
|
| 306 |
+
pair_to_indices = defaultdict(list)
|
| 307 |
+
for idx, pair in enumerate(edges):
|
| 308 |
+
pair_to_indices[pair].append(idx)
|
| 309 |
+
|
| 310 |
+
split_to_indices = {s: [] for s in S}
|
| 311 |
+
for comp, s in comp_assign.items():
|
| 312 |
+
for pair in comp_pairs[comp]:
|
| 313 |
+
split_to_indices[s].extend(pair_to_indices[pair])
|
| 314 |
+
|
| 315 |
+
# Optional: if you pinned a comp due to a small 'must-test' count but
|
| 316 |
+
# want to *discard* the rest instead of keeping them in test, uncomment:
|
| 317 |
+
# for comp, s in comp_assign.items():
|
| 318 |
+
# if s == "test" and test_edges_must:
|
| 319 |
+
# # Keep only the required counts; dump extras to 'leaky_test'
|
| 320 |
+
# ...
|
| 321 |
+
# (Left out for clarity; default is: keep the whole component in its split.)
|
| 322 |
+
|
| 323 |
+
# 5) Build edge lists and simple cluster assignments
|
| 324 |
+
split_to_edges = {s: [edges[i] for i in split_to_indices[s]] for s in split_to_indices}
|
| 325 |
+
tf_assign, dna_assign = {}, {}
|
| 326 |
+
for comp, s in comp_assign.items():
|
| 327 |
+
for (tf, dna) in comp_pairs[comp]:
|
| 328 |
+
tf_assign[tf] = s
|
| 329 |
+
dna_assign[dna] = s
|
| 330 |
+
|
| 331 |
+
# 6) Safety check: no DNA/TF appears in multiple splits
|
| 332 |
+
tf_in_split = defaultdict(set)
|
| 333 |
+
dna_in_split = defaultdict(set)
|
| 334 |
+
for s, elist in split_to_edges.items():
|
| 335 |
+
for tf, dna in elist:
|
| 336 |
+
tf_in_split[tf].add(s)
|
| 337 |
+
dna_in_split[dna].add(s)
|
| 338 |
+
dup_tf = {tf: ss for tf, ss in tf_in_split.items() if len(ss) > 1}
|
| 339 |
+
dup_dna = {dn: ss for dn, ss in dna_in_split.items() if len(ss) > 1}
|
| 340 |
+
assert not dup_tf and not dup_dna, f"Exclusivity violated: {dup_tf} {dup_dna}"
|
| 341 |
+
|
| 342 |
+
return tf_assign, dna_assign, kept_by_split, total_kept, split_to_indices, split_to_edges
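A hypothetical example for `split_bipartite_by_components` (invented cluster names): because tfA and tfB share dna2, the union-find step places them in one connected component, so the whole component, and therefore both TF clusters, lands in a single split.

```python
# Three components: {tfA, tfB} (3 examples), {tfC} (7 examples), {tfD} (10 examples).
edges = (
    [("tfA", "dna1"), ("tfA", "dna2"), ("tfB", "dna2")]
    + [("tfC", f"dna{i}") for i in range(3, 10)]
    + [("tfD", f"dna{i}") for i in range(10, 20)]
)
tf_assign, dna_assign, kept, total, idx, split_edges = split_bipartite_by_components(
    edges, ratios=(0.8, 0.1, 0.1), seed=0, require_nonempty=True
)
assert tf_assign["tfA"] == tf_assign["tfB"]  # same component, same split
print(kept, total)
```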
|
| 343 |
+
|
| 344 |
+
def print_split_ratios(kept_by_split):
|
| 345 |
+
total = sum(kept_by_split.values())
|
| 346 |
+
train_pcnt = 100 * kept_by_split["train"] / total
|
| 347 |
+
val_pcnt = 100 * kept_by_split["val"] / total
|
| 348 |
+
test_pcnt = 100 * kept_by_split["test"] / total
|
| 349 |
+
logger.info(f"Cluster distribution - Train: {train_pcnt:.2f}%, Val: {val_pcnt:.2f}%, Test: {test_pcnt:.2f}%")
|
| 350 |
+
|
| 351 |
+
def make_edges(processed_fimo_path: str, protein_cluster_path: str, dna_cluster_path: str):
|
| 352 |
+
"""
|
| 353 |
+
Make edges for input to the splitting algorithm. Each edge is a (tr_cluster_rep, dna_cluster_rep) tuple, where the cluster rep is the representative sequence ID.
|
| 354 |
+
"""
|
| 355 |
+
# Read cluster data
|
| 356 |
+
protein_clusters = pd.read_csv(protein_cluster_path, header=None,sep="\t")
|
| 357 |
+
protein_clusters.columns=["tr_cluster_rep","tr_seqid"]
|
| 358 |
+
|
| 359 |
+
dna_clusters = pd.read_csv(dna_cluster_path, header=None,sep="\t")
|
| 360 |
+
dna_clusters.columns=["dna_cluster_rep","dna_seqid"]
|
| 361 |
+
|
| 362 |
+
# Read datapoints
|
| 363 |
+
edges = pd.read_parquet(processed_fimo_path)
|
| 364 |
+
edges = pd.merge(edges, dna_clusters, on="dna_seqid",how="left")
|
| 365 |
+
edges = pd.merge(edges, protein_clusters, on="tr_seqid",how="left")
|
| 366 |
+
edges["edge"] = edges.apply(lambda row: (row["tr_cluster_rep"], row["dna_cluster_rep"]), axis=1)
|
| 367 |
+
|
| 368 |
+
logger.info(f"Total unique edges: {len(edges['edge'].unique().tolist())}")
|
| 369 |
+
dup_edges = edges.loc[edges.duplicated("edge")]["edge"].unique().tolist()
|
| 370 |
+
logger.info(f"Total edges with >1 datapoint: {len(dup_edges)}")
|
| 371 |
+
logger.info(f"Total datapoints belonging to a duplicate edge: {len(edges.loc[edges['edge'].isin(dup_edges)])}")
|
| 372 |
+
return edges
|
| 373 |
+
|
| 374 |
+
def check_validity(train, val, test, split_by="both"):
|
| 375 |
+
"""
|
| 376 |
+
Rigorous check for no overlap
|
| 377 |
+
Columns = ["ID","dna_sequence","tr_sequence","tr_cluster_rep","dna_cluster_rep", "scores","split"]
|
| 378 |
+
"""
|
| 379 |
+
train_ids = set(train["ID"].unique().tolist())
|
| 380 |
+
val_ids = set(val["ID"].unique().tolist())
|
| 381 |
+
test_ids = set(test["ID"].unique().tolist())
|
| 382 |
+
|
| 383 |
+
assert len(train_ids.intersection(val_ids))==0
|
| 384 |
+
assert len(train_ids.intersection(test_ids))==0
|
| 385 |
+
assert len(val_ids.intersection(test_ids))==0
|
| 386 |
+
logger.info(f"Pass! No overlap in IDs")
|
| 387 |
+
|
| 388 |
+
if split_by!="dna":
|
| 389 |
+
train_tr_seqs = set(train["tr_sequence"].unique().tolist())
|
| 390 |
+
val_tr_seqs = set(val["tr_sequence"].unique().tolist())
|
| 391 |
+
test_tr_seqs = set(test["tr_sequence"].unique().tolist())
|
| 392 |
+
|
| 393 |
+
assert len(train_tr_seqs.intersection(val_tr_seqs))==0
|
| 394 |
+
assert len(train_tr_seqs.intersection(test_tr_seqs))==0
|
| 395 |
+
assert len(val_tr_seqs.intersection(test_tr_seqs))==0
|
| 396 |
+
logger.info(f"Pass! No overlap in TR sequences")
|
| 397 |
+
|
| 398 |
+
train_tr_reps = set(train["tr_cluster_rep"].unique().tolist())
|
| 399 |
+
val_tr_reps = set(val["tr_cluster_rep"].unique().tolist())
|
| 400 |
+
test_tr_reps = set(test["tr_cluster_rep"].unique().tolist())
|
| 401 |
+
|
| 402 |
+
assert len(train_tr_reps.intersection(val_tr_reps))==0
|
| 403 |
+
assert len(train_tr_reps.intersection(test_tr_reps))==0
|
| 404 |
+
assert len(val_tr_reps.intersection(test_tr_reps))==0
|
| 405 |
+
logger.info(f"Pass! No overlap in TR cluster reps")
|
| 406 |
+
|
| 407 |
+
if split_by!="protein":
|
| 408 |
+
train_dna_seqs = set(train["dna_sequence"].unique().tolist())
|
| 409 |
+
val_dna_seqs = set(val["dna_sequence"].unique().tolist())
|
| 410 |
+
test_dna_seqs = set(test["dna_sequence"].unique().tolist())
|
| 411 |
+
|
| 412 |
+
assert len(train_dna_seqs.intersection(val_dna_seqs))==0
|
| 413 |
+
assert len(train_dna_seqs.intersection(test_dna_seqs))==0
|
| 414 |
+
assert len(val_dna_seqs.intersection(test_dna_seqs))==0
|
| 415 |
+
logger.info(f"Pass! No overlap in DNA sequences")
|
| 416 |
+
|
| 417 |
+
train_dna_reps = set(train["dna_cluster_rep"].unique().tolist())
|
| 418 |
+
val_dna_reps = set(val["dna_cluster_rep"].unique().tolist())
|
| 419 |
+
test_dna_reps = set(test["dna_cluster_rep"].unique().tolist())
|
| 420 |
+
|
| 421 |
+
assert len(train_dna_reps.intersection(val_dna_reps))==0
|
| 422 |
+
assert len(train_dna_reps.intersection(test_dna_reps))==0
|
| 423 |
+
assert len(val_dna_reps.intersection(test_dna_reps))==0
|
| 424 |
+
logger.info(f"Pass! No overlap in DNA cluster reps")
|
| 425 |
+
|
| 426 |
+
def main(cfg: DictConfig):
|
| 427 |
+
"""
|
| 428 |
+
Take a set of DNA clusters + protein clusters, and create the best possible splits into train/val/test.
|
| 429 |
+
"""
|
| 430 |
+
# construct edges from training data
|
| 431 |
+
edge_df = make_edges(processed_fimo_path=Path(root) / cfg.data_task.input_data_path,
|
| 432 |
+
protein_cluster_path=Path(root) / cfg.data_task.cluster_output_paths.protein,
|
| 433 |
+
dna_cluster_path=Path(root) / cfg.data_task.cluster_output_paths.dna)
|
| 434 |
+
edges = edge_df["edge"].unique().tolist()
|
| 435 |
+
|
| 436 |
+
# figure out if we actually even have a conflict
|
| 437 |
+
total_proteins = len(edge_df["tr_seqid"].unique().tolist())
|
| 438 |
+
total_protein_clusters = len(edge_df["tr_cluster_rep"].unique().tolist())
|
| 439 |
+
|
| 440 |
+
no_protein_overlap = (total_proteins)==(total_protein_clusters)
|
| 441 |
+
logger.info(f"All proteins are in their own clusters: {no_protein_overlap}")
|
| 442 |
+
|
| 443 |
+
if cfg.data_task.split_by=="dna":
|
| 444 |
+
logger.info(f"Easy split: all proteins are in their own clusters.")
|
| 445 |
+
dna_clusters = edge_df["dna_cluster_rep"].unique().tolist()
|
| 446 |
+
results = split_bipartite_fast(
|
| 447 |
+
dna_clusters,
|
| 448 |
+
split_names=("train","val","test"),
|
| 449 |
+
ratios=(cfg.data_task.train_ratio, cfg.data_task.val_ratio, cfg.data_task.test_ratio),
|
| 450 |
+
)
|
| 451 |
+
dna_assign, kept_by_split = results
|
| 452 |
+
|
| 453 |
+
edge_df["split"] = edge_df["dna_seqid"].map(dna_assign)
|
| 454 |
+
else:
|
| 455 |
+
results = split_bipartite_by_components(
|
| 456 |
+
edges,
|
| 457 |
+
split_names=("train","val","test"),
|
| 458 |
+
ratios=(cfg.data_task.train_ratio, cfg.data_task.val_ratio, cfg.data_task.test_ratio),
|
| 459 |
+
require_nonempty=cfg.data_task.require_nonempty,
|
| 460 |
+
seed=cfg.data_task.seed,
|
| 461 |
+
test_edges_must=None,
|
| 462 |
+
)
|
| 463 |
+
|
| 464 |
+
tf_assign, dna_assign, kept_by_split, total_kept, split_to_indices, split_to_edges = results
|
| 465 |
+
|
| 466 |
+
# Map each sample to its split
|
| 467 |
+
print(tf_assign)
|
| 468 |
+
print(dna_assign)
|
| 469 |
+
edge_df["tr_split"] = edge_df["tr_cluster_rep"].map(tf_assign)
|
| 470 |
+
edge_df["dna_split"] = edge_df["dna_cluster_rep"].map(dna_assign)
|
| 471 |
+
edge_df["same_split"] = edge_df["tr_split"]==edge_df["dna_split"] # should always be true if easy cluster
|
| 472 |
+
edge_df["split"] = edge_df["tr_split"]
|
| 473 |
+
print(edge_df)
|
| 474 |
+
edge_df["split"] = np.where(
|
| 475 |
+
edge_df["same_split"],
|
| 476 |
+
edge_df["split"], # keep existing split if same_split == True
|
| 477 |
+
"leak" # otherwise leak
|
| 478 |
+
)
|
| 479 |
+
print(edge_df)
|
| 480 |
+
|
| 481 |
+
# Print ratios: hopefully close to desired (e.g. 80/10/10)
|
| 482 |
+
print_split_ratios(kept_by_split)
|
| 483 |
+
|
| 484 |
+
# Make train, val, test sets
|
| 485 |
+
# make sure no ID is duplicate
|
| 486 |
+
assert len(edge_df["ID"].unique())==len(edge_df)
|
| 487 |
+
split_cols = ["ID","dna_sequence","tr_sequence","tr_cluster_rep","dna_cluster_rep", "scores","split"]
|
| 488 |
+
train = edge_df.loc[
|
| 489 |
+
edge_df["split"]=="train"
|
| 490 |
+
].reset_index(drop=True)[split_cols]
|
| 491 |
+
val = edge_df.loc[
|
| 492 |
+
edge_df["split"]=="val"
|
| 493 |
+
].reset_index(drop=True)[split_cols]
|
| 494 |
+
test = edge_df.loc[
|
| 495 |
+
edge_df["split"]=="test"
|
| 496 |
+
].reset_index(drop=True)[split_cols]
|
| 497 |
+
|
| 498 |
+
# ensure there is no overlap
|
| 499 |
+
check_validity(train, val, test, split_by=cfg.data_task.split_by)
|
| 500 |
+
|
| 501 |
+
logger.info(f"Length of train dataset: {len(train)} ({100*len(train)/sum([len(train),len(val),len(test)]):.2f}%)")
|
| 502 |
+
logger.info(f"Length of val dataset: {len(val)} ({100*len(val)/sum([len(train),len(val),len(test)]):.2f}%)")
|
| 503 |
+
logger.info(f"Length of test dataset: {len(test)} ({100*len(test)/sum([len(train),len(val),len(test)]):.2f}%)")
|
| 504 |
+
|
| 505 |
+
# create the output dir
|
| 506 |
+
split_out_dir = Path(root)/cfg.data_task.split_out_dir
|
| 507 |
+
os.makedirs(split_out_dir, exist_ok=True)
|
| 508 |
+
split_final_cols = ["ID","dna_sequence","tr_sequence","scores","split"]
|
| 509 |
+
train[split_final_cols].to_csv(split_out_dir/"train.csv", index=False)
|
| 510 |
+
val[split_final_cols].to_csv(split_out_dir/"val.csv", index=False)
|
| 511 |
+
test[split_final_cols].to_csv(split_out_dir/"test.csv", index=False)
|
| 512 |
+
logger.info(f"Saved all splits to {split_out_dir}")
|
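The split logic above keeps a datapoint only when its protein-side and DNA-side cluster representatives land in the same split; anything else is tagged "leak" and is dropped when the train/val/test frames are selected, which is what allows the check_validity assertions to pass. Below is a minimal, self-contained sketch of that rule on toy data; the hard-coded cluster-to-split assignments are stand-ins for the outputs of split_bipartite_by_components and are not taken from the repository.

# Toy illustration of the leak-marking rule used in main() above.
import numpy as np
import pandas as pd

edge_df = pd.DataFrame({
    "ID": [0, 1, 2],
    "tr_cluster_rep": ["tfA", "tfA", "tfB"],
    "dna_cluster_rep": ["dna1", "dna2", "dna2"],
})
tf_assign = {"tfA": "train", "tfB": "test"}     # hypothetical protein-cluster assignments
dna_assign = {"dna1": "train", "dna2": "train"} # hypothetical DNA-cluster assignments

edge_df["tr_split"] = edge_df["tr_cluster_rep"].map(tf_assign)
edge_df["dna_split"] = edge_df["dna_cluster_rep"].map(dna_assign)
edge_df["split"] = np.where(edge_df["tr_split"] == edge_df["dna_split"],
                            edge_df["tr_split"], "leak")
print(edge_df[["ID", "split"]])  # IDs 0 and 1 -> train, ID 2 -> leak
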
dpacman/scripts/preprocess.py
CHANGED
@@ -1,11 +1,9 @@
 import rootutils
-
-root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
-
 import hydra
 from omegaconf import DictConfig
 import logging
 
+root = rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
 logger = logging.getLogger(__name__)
 
 # import your processing entry points here
@@ -15,7 +13,8 @@ from dpacman.data_tasks.clean.remap import main as clean_remap_main
 from dpacman.data_tasks.fimo.pre_fimo import main as pre_fimo_main
 from dpacman.data_tasks.fimo.run_fimo import main as run_fimo_main
 from dpacman.data_tasks.fimo.post_fimo import main as post_fimo_main
-
+from dpacman.data_tasks.cluster.remap import main as cluster_remap_main
+from dpacman.data_tasks.split.remap import main as split_remap_main
 
 @hydra.main(
     config_path=str(root / "configs"), config_name="preprocess", version_base="1.3"
@@ -26,6 +25,7 @@ def main(cfg: DictConfig):
 
     logger.info(f"Running {task_type} task: {task_name}")
 
+    # Download
     if task_type == "download":
         if task_name == "genome":
             download_genome_main(cfg)
@@ -34,12 +34,14 @@ def main(cfg: DictConfig):
         else:
             raise ValueError(f"No download pipeline defined for: {task_name}")
 
+    # Clean
     elif task_type == "clean":
         if task_name == "remap":
             clean_remap_main(cfg)
         else:
             raise ValueError(f"No clean pipeline defined for: {task_name}")
 
+    # Fimo
     elif task_type == "fimo":
         if task_name == "pre_fimo":
             pre_fimo_main(cfg)
@@ -50,6 +52,20 @@ def main(cfg: DictConfig):
         else:
             raise ValueError(f"No clean pipeline defined for: {task_name}")
 
+    # Cluster
+    elif task_type == "cluster":
+        if task_name == "remap":
+            cluster_remap_main(cfg)
+        else:
+            raise ValueError(f"No cluster pipeline defined for: {task_name}")
+
+    elif task_type == "split":
+        if task_name == "remap":
+            split_remap_main(cfg)
+        else:
+            raise ValueError(f"No split pipeline defined for: {task_name}")
+
+    # Unknown - error
     else:
         raise ValueError(f"Unknown task type: {task_type}")
 
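For a quick interactive check of the new cluster/split task types, the same entry points can also be driven programmatically through Hydra's compose API rather than the nohup wrappers below. This is only an illustrative sketch: the relative config_path and the override values are assumptions based on the scripts in this commit, not code from the repository.

# Hypothetical interactive driver; mirrors what scripts/preprocess.py does via @hydra.main.
from hydra import initialize, compose
from dpacman.data_tasks.split.remap import main as split_remap_main

with initialize(config_path="../configs", version_base="1.3"):
    cfg = compose(
        config_name="preprocess",
        overrides=["data_task=split/remap", "data_task.split_by=dna"],
    )
    split_remap_main(cfg)
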
dpacman/scripts/run_cluster.sh
ADDED
@@ -0,0 +1,19 @@
#!/bin/bash

# Manually specify values used in the config
main_task="preprocess"
data_task_type="cluster"
timestamp=$(date "+%Y-%m-%d_%H-%M-%S")

run_dir="/vast/projects/pranam/lab/sophie/DPACMAN/logs/${main_task}/${data_task_type}/runs/${timestamp}"
mkdir -p "$run_dir"

nohup python -u -m scripts.preprocess \
    hydra.run.dir="${run_dir}" \
    data_task=${data_task_type}/remap \
    data_task.cluster_dna_full="false" \
    data_task.cluster_dna_peaks="false" \
    data_task.cluster_protein="true" \
    > "${run_dir}/run.log" 2>&1 &

echo $! > "${run_dir}/pid.txt"
dpacman/scripts/run_fimo.sh
CHANGED
@@ -5,13 +5,15 @@ main_task="preprocess"
 data_task_type="fimo"
 timestamp=$(date "+%Y-%m-%d_%H-%M-%S")
 
-run_dir="
+run_dir="/vast/projects/pranam/lab/sophie/DPACMAN/logs/${main_task}/${data_task_type}/runs/${timestamp}"
 mkdir -p "$run_dir"
 
 nohup python -u -m scripts.preprocess \
     hydra.run.dir="${run_dir}" \
     data_task=${data_task_type}/post_fimo \
     data_task.debug="false" \
+    data_task.keep_fimo_only="false" \
+    data_task.processed_output_csv="dpacman/data_files/processed/fimo/post_fimo/all_peaks/remap2022_crm_fimo_output_q_processed.csv" \
     > "${run_dir}/run.log" 2>&1 &
 
 echo $! > "${run_dir}/pid.txt"
dpacman/scripts/run_split.sh
ADDED
@@ -0,0 +1,21 @@
#!/bin/bash

# Manually specify values used in the config
main_task="preprocess"
data_task_type="split"
timestamp=$(date "+%Y-%m-%d_%H-%M-%S")

run_dir="/vast/projects/pranam/lab/sophie/DPACMAN/logs/${main_task}/${data_task_type}/runs/${timestamp}"
mkdir -p "$run_dir"

nohup python -u -m scripts.preprocess \
    hydra.run.dir="${run_dir}" \
    data_task="${data_task_type}/remap" \
    data_task.split_by=dna \
    data_task.train_ratio=0.8 \
    data_task.val_ratio=0.1 \
    data_task.test_ratio=0.1 \
    data_task.split_out_dir=dpacman/data_files/processed/splits/by_dna \
    > "${run_dir}/run.log" 2>&1 &

echo $! > "${run_dir}/pid.txt"
dpacman/utils/README.md
ADDED
@@ -0,0 +1 @@
Code in this folder was taken from the olsdavis/gdd repository, which was based on DPLM (Diffusion Protein Language Model).

dpacman/utils/__init__.py
ADDED
File without changes
dpacman/utils/clustering.py
ADDED
@@ -0,0 +1,144 @@
import pandas as pd
import os
import subprocess
import sys
from Bio import SeqIO
import shutil

import rootutils
import logging

logger = logging.getLogger(__name__)

def ensure_mmseqs_in_path(mmseqs_dir):
    """
    Checks if MMseqs2 is in the PATH. If it's not, add it. MMseqs2 will not run if this is not done correctly.

    Args:
        mmseqs_dir (str): Directory containing MMseqs2 binaries
    """
    mmseqs_bin = os.path.join(mmseqs_dir, 'mmseqs')

    # Check if mmseqs is already in PATH
    if shutil.which('mmseqs') is None:
        # Export the MMseqs2 directory to PATH
        os.environ['PATH'] = f"{mmseqs_dir}:{os.environ['PATH']}"
        logger.info(f"\tAdded {mmseqs_dir} to PATH")

def process_fasta(fasta_path):
    fasta_sequences = SeqIO.parse(open(fasta_path), 'fasta')
    d = {}
    for fasta in fasta_sequences:
        id, sequence = fasta.id, str(fasta.seq)
        d[id] = sequence

    return d

def analyze_clustering_result(input_fasta: str, tsv_path: str):
    """
    Args:
        input_fasta (str): path to input fasta file
        tsv_path (str): path to the cluster TSV produced by MMseqs2
    """
    # Process input fasta
    input_d = process_fasta(input_fasta)

    # Process clusters.tsv
    clusters = pd.read_csv(f'{tsv_path}', sep='\t', header=None)
    clusters = clusters.rename(columns={
        0: 'representative seq_id',
        1: 'member seq_id'
    })

    clusters['representative seq'] = clusters['representative seq_id'].apply(lambda seq_id: input_d[seq_id])
    clusters['member seq'] = clusters['member seq_id'].apply(lambda seq_id: input_d[seq_id])

    # Sort them so that splitting results are reproducible
    clusters = clusters.sort_values(by=['representative seq_id', 'member seq_id'], ascending=True).reset_index(drop=True)

    return clusters

def make_fasta(sequences: dict, fasta_path: str):
    """
    Makes a fasta file from sequences, where the key is the header and the value is the sequence.

    Args:
        sequences (dict): A dictionary where the key is the header and the value is the sequence.

    Returns:
        str: The path to the fasta file.
    """
    with open(fasta_path, 'w') as f:
        for header, sequence in sequences.items():
            f.write(f'>{header}\n{sequence}\n')

    return fasta_path

def run_mmseqs_clustering(input_fasta, output_dir, min_seq_id=0.3, c=0.8, cov_mode=0, cluster_mode=0, path_to_mmseqs='fuson_plm/mmseqs', dbtype=1):
    """
    Runs MMseqs2 clustering using the easy-cluster module.

    Args:
        input_fasta (str): path to input fasta file, formatted >header\nsequence\n>header\nsequence...
        output_dir (str): path to output dir for clustering results
        min_seq_id (float): number in [0,1] representing --min-seq-id in the cluster command
        c (float): number in [0,1] representing -c in the cluster command
        cov_mode (int): 0, 1, 2, or 3, representing --cov-mode in the cluster command
        cluster_mode (int): 0, 1, or 2, representing --cluster-mode in the cluster command
    """
    # Get mmseqs dir
    logger.info("\nRunning MMseqs clustering...")
    mmseqs_dir = os.path.join(path_to_mmseqs[0:path_to_mmseqs.index('/mmseqs')], 'mmseqs/bin')
    logger.info(f"Running mmseqs clustering from {mmseqs_dir}")

    # Ensure MMseqs2 is in the PATH
    ensure_mmseqs_in_path(mmseqs_dir)

    # Define paths for MMseqs2
    mmseqs_bin = "mmseqs"  # Ensure this is in your PATH or provide the full path to the mmseqs binary

    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)

    # Run MMseqs2 easy-cluster
    cmd_easy_cluster = [
        mmseqs_bin, "easy-cluster", input_fasta, os.path.join(output_dir, "mmseqs"), output_dir,
        "--min-seq-id", str(min_seq_id),
        "-c", str(c),
        "--cov-mode", str(cov_mode),
        "--cluster-mode", str(cluster_mode),
        "--dbtype", str(dbtype)
    ]

    # Write the command to a log file
    logger.info("\n\tCommand entered to MMseqs2:")
    logger.info("\t" + " ".join(cmd_easy_cluster) + "\n")

    subprocess.run(cmd_easy_cluster, check=True)

    logger.info(f"Clustering completed. Results are in {output_dir}")

def cluster_summary(clusters: pd.DataFrame):
    """
    Summarizes how many clusters were formed, how big they are, etc.
    """
    grouped_clusters = clusters.groupby('representative seq_id')['member seq_id'].count().reset_index().rename(columns={'member seq_id': 'member count'})
    assert len(grouped_clusters) == len(clusters['representative seq_id'].unique())  # make sure number of cluster reps = # grouped clusters

    total_seqs = sum(grouped_clusters['member count'])
    logger.info(f"Created {len(grouped_clusters)} clusters of {total_seqs} sequences")
    logger.info(f"\t{len(grouped_clusters.loc[grouped_clusters['member count']==1])} clusters of size 1")
    csize1_seqs = sum(grouped_clusters[grouped_clusters['member count']==1]['member count'])
    logger.info(f"\t\tsequences: {csize1_seqs} ({round(100*csize1_seqs/total_seqs, 2)}%)")

    logger.info(f"\t{len(grouped_clusters.loc[grouped_clusters['member count']>1])} clusters of size > 1")
    csizeg1_seqs = sum(grouped_clusters[grouped_clusters['member count']>1]['member count'])
    logger.info(f"\t\tsequences: {csizeg1_seqs} ({round(100*csizeg1_seqs/total_seqs, 2)}%)")
    logger.info(f"\tlargest cluster: {max(grouped_clusters['member count'])}")

    logger.info("\nCluster size breakdown below...")

    value_counts = grouped_clusters['member count'].value_counts().reset_index().rename(columns={'member count': 'cluster size (n_members)', 'count': 'n_clusters'})
    logger.info(value_counts.sort_values(by='cluster size (n_members)', ascending=True).to_string(index=False))
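As a rough usage sketch, the helpers above are meant to be chained: write a FASTA, cluster it with MMseqs2, then inspect the resulting clusters. The paths, sequences, and parameter values below are made up for illustration; only the function names and signatures come from dpacman/utils/clustering.py above.

# Hypothetical example; assumes an MMseqs2 install whose binaries live under <path>/mmseqs/bin.
from dpacman.utils.clustering import (
    make_fasta, run_mmseqs_clustering, analyze_clustering_result, cluster_summary,
)

seqs = {"tf_1": "MKTAYIAKQR", "tf_2": "MKTAYIAKQQ"}           # toy sequences
fasta = make_fasta(seqs, "proteins.fasta")                     # write >header\nsequence records
run_mmseqs_clustering(fasta, "mmseqs_out",                     # runs `mmseqs easy-cluster`
                      min_seq_id=0.3, c=0.8,
                      path_to_mmseqs="path/to/mmseqs")
clusters = analyze_clustering_result(fasta, "mmseqs_out/mmseqs_cluster.tsv")  # assumed output TSV name
cluster_summary(clusters)                                      # logs the cluster-size breakdown
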
dpacman/utils/models.py
ADDED
@@ -0,0 +1,19 @@
"""
Model-related utilities, such as setting seed
"""
import torch
import numpy as np
import random
import os

def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")
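A typical call pattern (illustrative only, not from the repository) is to seed everything once at the top of a training script, before any model or dataloader is constructed:

# Minimal usage sketch; the import path follows the file location above.
from dpacman.utils.models import set_seed

set_seed(42)  # numpy, random, torch CPU/CUDA, cuDNN flags, PYTHONHASHSEED
# ... build datasets, model, and trainer afterwards so they all see the same seed
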
dpacman/utils/splitting.py
ADDED
File without changes
environment.yaml
CHANGED
@@ -1,42 +1,45 @@
-
-# - pip installs packages in a loop, without ensuring dependencies across all packages
-# are fulfilled simultaneously, but conda achieves proper dependency control across
-# all packages
-# - conda allows for installing packages without requiring certain compilers or
-# libraries to be available in the system, since it installs precompiled binaries
-# in case of errors look here: https://pytorch.org/get-started/previous-versions/
-
-name: dpacman
-
+name: dnabind
 channels:
-  - bioconda
   - conda-forge
+  - bioconda
-
-
-# it is strongly recommended to specify versions of packages installed through conda
-# to avoid situation when version-unspecified packages install their latest major
-# versions which can sometimes break things
-
-# current approach below keeps the dependencies in the same major versions across all
-# users, but allows for different minor and patch versions of packages where backwards
-# compatibility is usually guaranteed
-
 dependencies:
   - python=3.10
-  - dask[complete]
   - pip>=23
-
+  # dask "complete" equivalent via conda packages
+  - dask
+  - distributed
+  - rich=13.*
+  # ghostscript 9.18 is very old; keep only if you truly need that exact version.
+  # If not, drop the pin or use the conda-forge version (>=10).
+  - ghostscript
+  - lxml=5.3.0
+  - pandas=2.2.3
+  - gitpython=3.1.*
+  - tqdm=4.67.*
+  - matplotlib=3.10.*
   - pip:
-      [removed pip entries not shown in this view]
+      # Pull GPU wheels (CUDA 12.8) from PyTorch's cu128 index; fall back to PyPI for others
+      - --index-url https://download.pytorch.org/whl/cu128
+      - --extra-index-url https://pypi.org/simple
+
+      # PyTorch + CUDA 12.8
+      - torch==2.7.1
+      - torchvision==0.22.1
+      # - torchaudio==2.7.1  # optional, if you need it
+
+      # Lightning (classic)
+      - pytorch-lightning
+
+      # Your pinned Python libs
+      - rootutils==1.0.7
+      - polars==1.32.2
+      - hydra-core==1.3.2
+      - hydra-colorlog==1.2.0
+      - omegaconf==2.3.0
+      - pymex==0.9.31
+      - transformers==4.51.2
+      - scikit-learn==1.7.1
+      - biopython==1.85
+      - ortools==9.14.6206
+      # Your package in editable mode
+      - -e .
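Assuming a standard conda or mamba installation, the updated environment would typically be created and activated with the two commands below; the environment name comes from the name: dnabind field above, and a driver capable of the pinned CUDA 12.8 torch wheels is assumed.

conda env create -f environment.yaml
conda activate dnabind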