svincoff committed on
Commit 3b9cde0 · 1 Parent(s): f936712

added model files

dpacman/classifier/model/__init__.py ADDED
File without changes
dpacman/classifier/model/compress_embeddings.py ADDED
@@ -0,0 +1,54 @@
# compress_embeddings.py
# USAGE: python compress_embeddings.py --input_glob "/path/to/esm_embeddings/*.npy" --output_dir "/path/to/compressed_embeddings" --esm_dim 1280 --out_dim 256
# --------------
import os
import glob
import numpy as np
import torch
from torch import nn

class EmbeddingCompressor(nn.Module):
    def __init__(self, input_dim: int = 1280, output_dim: int = 256):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (batch, L, input_dim) or (L, input_dim)
        returns: (batch, output_dim) or (1, output_dim)
        """
        if x.dim() == 2:
            # single example: mean over tokens
            x = x.mean(dim=0, keepdim=True)  # → (1, input_dim)
        else:
            # batch: mean over tokens
            x = x.mean(dim=1)  # → (batch, input_dim)
        return self.fc(x)  # → (batch, output_dim)

def compress_file(in_path: str, out_path: str, model: EmbeddingCompressor):
    arr = np.load(in_path)  # shape (L, D) or (batch, L, D)
    tensor = torch.from_numpy(arr).float()
    with torch.no_grad():
        compressed = model(tensor)  # → (batch, out_dim)
    out = compressed.cpu().numpy()
    np.save(out_path, out)
    print(f"Saved {out_path}")

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Compress ESM embeddings to 256-d")
    parser.add_argument("--input_glob", type=str, required=True,
                        help="Glob for your .npy ESM embeddings (e.g. data/esm_*.npy)")
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--esm_dim", type=int, default=1280)
    parser.add_argument("--out_dim", type=int, default=256)
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)
    compressor = EmbeddingCompressor(args.esm_dim, args.out_dim)
    compressor.eval()

    for fn in glob.glob(args.input_glob):
        base = os.path.basename(fn).replace(".npy", "_256.npy")
        out_path = os.path.join(args.output_dir, base)
        compress_file(fn, out_path, compressor)
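For a sense of the shapes involved, here is a minimal in-memory sketch of the pooling-plus-projection that EmbeddingCompressor.forward applies. The random input below is illustrative only, and the linear layer is randomly initialized, exactly as in the script (no training step is defined):

import numpy as np
import torch
from torch import nn

# Stand-in layer mirroring EmbeddingCompressor above: 1280-d ESM tokens -> 256-d vector
fc = nn.Linear(1280, 256)

tokens = torch.from_numpy(np.random.rand(350, 1280)).float()  # one protein, 350 tokens (made-up data)
with torch.no_grad():
    pooled = tokens.mean(dim=0, keepdim=True)  # (1, 1280): mean over tokens
    compressed = fc(pooled)                    # (1, 256)
print(compressed.shape)  # torch.Size([1, 256])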
dpacman/classifier/model/compute_embeddings.py ADDED
@@ -0,0 +1,546 @@
"""
Plug-and-play embedding extraction for:
  • Chromosome sequences (from raw UCSC JSON)
  • TF sequences (transcription_factors.fasta)

Usage example (DNA + protein in one go):
    module load miniconda/24.7.1
    conda activate dpacman
    python dpacman/data/compute_embeddings.py \
        --genome-json-dir ../data_files/raw/genomes/hg38 \
        --tf-fasta ../data_files/processed/tfclust/hg38_tf/transcription_factors.fasta \
        --chrom-model caduceus \
        --tf-model esm-dbp \
        --out-dir ../data_files/processed/tfclust/hg38_tf/embeddings \
        --device cuda
"""
import os
import re
import argparse
import json
import numpy as np
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, pipeline
import esm
from Bio import SeqIO
import time

# ---- model wrappers ----
class CaduceusEmbedder:
    def __init__(self, device, chunk_size=131_072, overlap=0):
        """
        device: 'cpu' or 'cuda'
        chunk_size: max bases (and thus tokens) to send in one forward pass
        overlap: how many bases each window overlaps the previous; 0 = no overlap
        """
        model_name = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True
        )
        self.model = AutoModel.from_pretrained(
            model_name, trust_remote_code=True
        ).to(device).eval()
        self.device = device
        self.chunk_size = chunk_size
        self.step = chunk_size - overlap

    def embed(self, seqs):
        """
        seqs: List[str] of DNA sequences (each <= chunk_size for this test)
        returns: list of per-sequence (L_i, D) arrays of raw per-token embeddings
        """
        # outputs = []
        # for seq in seqs:
        #     # --- new: raw per-token embeddings in one shot ---
        #     toks = self.tokenizer(
        #         seq,
        #         return_tensors="pt",
        #         padding=False,
        #         truncation=True,
        #         max_length=self.chunk_size
        #     ).to(self.device)
        #     with torch.no_grad():
        #         out = self.model(**toks).last_hidden_state  # (1, L, D)
        #     outputs.append(out.cpu().numpy()[0])  # (L, D)

        # return np.stack(outputs, axis=0)  # (N, L, D)
        outputs = []
        for seq in seqs:
            toks = self.tokenizer(
                seq,
                return_tensors="pt",
                padding=False,
                truncation=True,
                max_length=self.chunk_size
            ).to(self.device)
            with torch.no_grad():
                out = self.model(**toks).last_hidden_state  # (1, L, D)
            outputs.append(out.cpu().numpy()[0])  # (L, D)
        return outputs  # list of variable-length (L_i, D) arrays

    def benchmark(self, lengths=None):
        """
        Time embedding on single sequences of various lengths.
        By default tests [5K, 10K, 50K, 100K, chunk_size].
        """
        tests = lengths or [5_000, 10_000, 50_000, 100_000, self.chunk_size]
        print(f"→ Benchmarking Caduceus on device={self.device}")
        for sz in tests:
            seq = "A" * sz
            # Warm-up
            _ = self.embed([seq])
            if self.device != "cpu":
                torch.cuda.synchronize()
            t0 = time.perf_counter()
            _ = self.embed([seq])
            if self.device != "cpu":
                torch.cuda.synchronize()
            t1 = time.perf_counter()
            print(f"  length={sz:6,d}  time={(t1-t0)*1000:7.1f} ms")
class SegmentNTEmbedder:
    def __init__(self, device):
        self.tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/segment_nt", trust_remote_code=True)
        self.model = AutoModel.from_pretrained("InstaDeepAI/segment_nt", trust_remote_code=True).to(device).eval()
        self.device = device

    def _adjust_length(self, input_ids):
        bs, L = input_ids.shape
        excl = L - 1
        remainder = excl % 4
        if remainder != 0:
            pad_needed = 4 - remainder
            pad_tensor = torch.full((bs, pad_needed), self.tokenizer.pad_token_id, dtype=input_ids.dtype, device=input_ids.device)
            input_ids = torch.cat([input_ids, pad_tensor], dim=1)
        return input_ids

    def embed(self, seqs, batch_size=16):
        """
        seqs: List[str]
        Returns: np.ndarray of shape (N, D)
        """
        all_embeddings = []
        for i in range(0, len(seqs), batch_size):
            batch_seqs = seqs[i : i + batch_size]
            encoded = self.tokenizer.batch_encode_plus(
                batch_seqs,
                return_tensors="pt",
                padding=True,
                truncation=True,
            )
            input_ids = encoded["input_ids"].to(self.device)  # (B, L)
            attention_mask = input_ids != self.tokenizer.pad_token_id

            input_ids = self._adjust_length(input_ids)
            attention_mask = (input_ids != self.tokenizer.pad_token_id)

            with torch.no_grad():
                outs = self.model(
                    input_ids,
                    attention_mask=attention_mask,
                    output_hidden_states=True,
                    return_dict=True,
                )
            if hasattr(outs, "hidden_states") and outs.hidden_states is not None:
                last_hidden = outs.hidden_states[-1]  # (B, L, D)
            else:
                last_hidden = outs.last_hidden_state  # fallback

            # Exclude CLS token if present (assume first token) and pool
            pooled = last_hidden[:, 1:, :].mean(dim=1)  # (B, D)
            all_embeddings.append(pooled.cpu().numpy())

            # release fragmentation
            torch.cuda.empty_cache()

        return np.vstack(all_embeddings)  # (N, D)
class DNABertEmbedder:
    def __init__(self, device):
        self.tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
        self.model = AutoModel.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True).to(device).eval()
        self.device = device

    def embed(self, seqs):
        # NOTE: DNA_bert_6 was trained on space-separated 6-mer tokens; raw sequences are
        # passed through unchanged here, and inputs are truncated to the 512-token BERT limit.
        embs = []
        for s in seqs:
            tokens = self.tokenizer(s, return_tensors="pt", padding=True, truncation=True, max_length=512)["input_ids"].to(self.device)
            with torch.no_grad():
                out = self.model(tokens).last_hidden_state.mean(1)
            embs.append(out.cpu().numpy())
        return np.vstack(embs)
class NucleotideTransformerEmbedder:
    def __init__(self, device):
        # HF “feature-extraction” returns a list of (L, D) arrays for each input
        # device: “cpu” or “cuda”
        self.pipe = pipeline(
            "feature-extraction",
            model="InstaDeepAI/nucleotide-transformer-500m-1000g",
            device=-1 if device == "cpu" else 0  # HF uses -1 for CPU, 0 for GPU
        )

    def embed(self, seqs):
        """
        seqs: List[str] of raw DNA sequences
        returns: (N, D) array, one D-dim vector per sequence
        """
        all_embeddings = self.pipe(seqs, truncation=True, padding=True)
        # all_embeddings is a List of shape (L, D) arrays
        pooled = [np.mean(x, axis=0) for x in all_embeddings]
        return np.vstack(pooled)
# class ESMEmbedder:
#     def __init__(self, device):
#         self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
#         self.batch_converter = self.alphabet.get_batch_converter()
#         self.model.to(device).eval()
#         self.device = device

#     def embed(self, seqs):
#         batch = [(str(i), seq) for i, seq in enumerate(seqs)]
#         _, _, toks = self.batch_converter(batch)
#         toks = toks.to(self.device)
#         with torch.no_grad():
#             results = self.model(toks, repr_layers=[33], return_contacts=False)
#         reps = results["representations"][33]
#         return reps[:, 1:-1].mean(1).cpu().numpy()


class ESMEmbedder:
    def __init__(self, device, model_name="esm2_t33_650M_UR50D"):
        # Try to load the specified ESM-2 model; fallback to esm1b if missing
        self.device = device
        try:
            self.model, self.alphabet = getattr(esm.pretrained, model_name)()
            self.is_esm2 = model_name.lower().startswith("esm2")
        except AttributeError:
            # fallback to ESM-1b
            self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
            self.is_esm2 = False
        self.batch_converter = self.alphabet.get_batch_converter()
        self.model.to(device).eval()
        # determine max length: esm2 models vary; use default 1024 for esm1b
        self.max_len = 4096 if self.is_esm2 else 1024  # adjust if your esm2 variant has an explicit limit
        # for chunking: reserve 2 tokens if model uses BOS/EOS
        self.chunk_size = self.max_len - 2
        self.overlap = self.chunk_size // 4  # 25% overlap to smooth boundaries

    def _chunk_sequence(self, seq):
        """
        Return list of possibly overlapping chunks of seq, each <= chunk_size.
        """
        if len(seq) <= self.chunk_size:
            return [seq]
        step = self.chunk_size - self.overlap
        chunks = []
        for i in range(0, len(seq), step):
            chunk = seq[i : i + self.chunk_size]
            if not chunk:
                break
            chunks.append(chunk)
        return chunks

    def embed(self, seqs):
        """
        seqs: List[str] of protein sequences.
        Returns: np.ndarray of shape (N, D) pooled per-sequence embeddings.
        """
        all_embeddings = []
        for i, seq in enumerate(seqs):
            chunks = self._chunk_sequence(seq)
            chunk_vecs = []
            # process chunks in batch if small number, else sequentially
            for chunk in chunks:
                batch = [(str(i), chunk)]
                _, _, toks = self.batch_converter(batch)
                toks = toks.to(self.device)
                with torch.no_grad():
                    results = self.model(toks, repr_layers=[33], return_contacts=False)
                reps = results["representations"][33]  # (1, L, D)
                # remove BOS/EOS if present: take 1:-1 if length permits
                if reps.size(1) > 2:
                    rep = reps[:, 1:-1].mean(1)  # (1, D)
                else:
                    rep = reps.mean(1)  # fallback
                chunk_vecs.append(rep.squeeze(0))  # (D,)
            if len(chunk_vecs) == 1:
                seq_vec = chunk_vecs[0]
            else:
                # average chunk vectors
                stacked = torch.stack(chunk_vecs, dim=0)  # (num_chunks, D)
                seq_vec = stacked.mean(0)
            all_embeddings.append(seq_vec.cpu().numpy())
        return np.vstack(all_embeddings)  # (N, D)
# class ESMDBPEmbedder:
#     def __init__(self, device):
#         base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
#         model_path = (
#             Path(__file__).resolve().parent.parent
#             / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
#         )
#         checkpoint = torch.load(model_path, map_location="cpu")
#         clean_sd = {}
#         for k, v in checkpoint.items():
#             clean_sd[k.replace("module.", "")] = v
#         result = base_model.load_state_dict(clean_sd, strict=False)
#         if result.missing_keys:
#             print(f"[ESMDBP] missing keys: {result.missing_keys}")
#         if result.unexpected_keys:
#             print(f"[ESMDBP] unexpected keys: {result.unexpected_keys}")

#         self.model = base_model.to(device).eval()
#         self.alphabet = alphabet
#         self.batch_converter = alphabet.get_batch_converter()
#         self.device = device

#     def embed(self, seqs):
#         batch = [(str(i), seq) for i, seq in enumerate(seqs)]
#         _, _, toks = self.batch_converter(batch)
#         toks = toks.to(self.device)
#         with torch.no_grad():
#             out = self.model(toks, repr_layers=[33], return_contacts=False)
#         reps = out["representations"][33]
#         # skip start/end tokens
#         return reps[:, 1:-1].mean(1).cpu().numpy()

class ESMDBPEmbedder:
    def __init__(self, device):
        base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
        model_path = (
            Path(__file__).resolve().parent.parent
            / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
        )
        checkpoint = torch.load(model_path, map_location="cpu")
        clean_sd = {}
        for k, v in checkpoint.items():
            clean_sd[k.replace("module.", "")] = v
        result = base_model.load_state_dict(clean_sd, strict=False)
        if result.missing_keys:
            print(f"[ESMDBP] missing keys: {result.missing_keys}")
        if result.unexpected_keys:
            print(f"[ESMDBP] unexpected keys: {result.unexpected_keys}")

        self.model = base_model.to(device).eval()
        self.alphabet = alphabet
        self.batch_converter = alphabet.get_batch_converter()
        self.device = device
        self.max_len = 1024  # same limit as esm1b
        self.chunk_size = self.max_len - 2
        self.overlap = self.chunk_size // 4

    def _chunk_sequence(self, seq):
        if len(seq) <= self.chunk_size:
            return [seq]
        step = self.chunk_size - self.overlap
        chunks = []
        for i in range(0, len(seq), step):
            chunk = seq[i : i + self.chunk_size]
            if not chunk:
                break
            chunks.append(chunk)
        return chunks

    def embed(self, seqs):
        all_embeddings = []
        for i, seq in enumerate(seqs):
            chunks = self._chunk_sequence(seq)
            chunk_vecs = []
            for chunk in chunks:
                batch = [(str(i), chunk)]
                _, _, toks = self.batch_converter(batch)
                toks = toks.to(self.device)
                with torch.no_grad():
                    out = self.model(toks, repr_layers=[33], return_contacts=False)
                reps = out["representations"][33]
                if reps.size(1) > 2:
                    rep = reps[:, 1:-1].mean(1)
                else:
                    rep = reps.mean(1)
                chunk_vecs.append(rep.squeeze(0))
            if len(chunk_vecs) == 1:
                seq_vec = chunk_vecs[0]
            else:
                stacked = torch.stack(chunk_vecs, dim=0)
                seq_vec = stacked.mean(0)
            all_embeddings.append(seq_vec.cpu().numpy())
        return np.vstack(all_embeddings)
class GPNEmbedder:
    def __init__(self, device):
        model_name = "songlab/gpn-msa-sapiens"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        self.model.to(device)
        self.model.eval()
        self.device = device

    def embed(self, seqs):
        inputs = self.tokenizer(
            seqs,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(self.device)

        with torch.no_grad():
            # masked-LM heads do not expose last_hidden_state directly;
            # request hidden states and take the final layer instead
            outputs = self.model(**inputs, output_hidden_states=True)
            last_hidden = outputs.hidden_states[-1]
        return last_hidden.mean(dim=1).cpu().numpy()
class ProGenEmbedder:
    def __init__(self, device):
        model_name = "jinyuan22/ProGen2-base"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(device).eval()
        self.device = device

    def embed(self, seqs):
        inputs = self.tokenizer(
            seqs,
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(self.device)
        with torch.no_grad():
            last_hidden = self.model(**inputs).last_hidden_state
        return last_hidden.mean(dim=1).cpu().numpy()

# ---- main pipeline ----
def get_embedder(name, device, for_dna=True):
    name = name.lower()
    if for_dna:
        if name == "caduceus": return CaduceusEmbedder(device)
        if name == "dnabert": return DNABertEmbedder(device)
        if name == "nucleotide": return NucleotideTransformerEmbedder(device)
        if name == "gpn": return GPNEmbedder(device)
        if name == "segmentnt": return SegmentNTEmbedder(device)
    else:
        if name in ("esm",): return ESMEmbedder(device)
        if name in ("esm-dbp", "esm_dbp"): return ESMDBPEmbedder(device)
        if name == "progen": return ProGenEmbedder(device)
    raise ValueError(f"Unknown model {name} (for_dna={for_dna})")


def pad_token_embeddings(list_of_arrays, pad_value=0.0):
    """
    list_of_arrays: list of (L_i, D) numpy arrays
    Returns:
        padded: (N, L_max, D) array
        mask: (N, L_max) boolean array where True = real token, False = padding
    """
    N = len(list_of_arrays)
    D = list_of_arrays[0].shape[1]
    L_max = max(arr.shape[0] for arr in list_of_arrays)
    padded = np.full((N, L_max, D), pad_value, dtype=list_of_arrays[0].dtype)
    mask = np.zeros((N, L_max), dtype=bool)
    for i, arr in enumerate(list_of_arrays):
        L = arr.shape[0]
        padded[i, :L] = arr
        mask[i, :L] = True
    return padded, mask

def embed_and_save(seqs, ids, embedder, out_path):
    embs = embedder.embed(seqs)

    # Decide whether we got variable-length per-token outputs (list of (L, D))
    is_variable_token = isinstance(embs, (list, tuple)) and len(embs) > 0 and hasattr(embs[0], "shape") and embs[0].ndim == 2

    if is_variable_token:
        # pad to (N, L_max, D) + mask
        padded, mask = pad_token_embeddings(embs)
        # Save both embeddings and mask together in an .npz for convenience
        np.savez_compressed(out_path.with_suffix(".caduceus.npz"),
                            embeddings=padded,
                            mask=mask,
                            ids=np.array(ids, dtype=object))
    else:
        # fixed shape output, e.g., pooled (N, D)
        array = np.vstack(embs) if isinstance(embs, list) else embs
        np.save(out_path, array)
    # always write the ids alongside, in the same order as the embeddings
    with open(out_path.with_suffix(".ids"), "w") as f:
        f.write("\n".join(ids))
if __name__ == "__main__":

    p = argparse.ArgumentParser()
    p.add_argument("--peak-fasta", default="binding_peaks_unique.fa", help="FASTA of deduplicated binding peak sequences; if present this is used for DNA embedding instead of genome JSONs")
    p.add_argument("--genome-json-dir", default=None, help="(fallback) directory of UCSC JSONs for full chromosome embedding if peak FASTA is missing or you explicitly want chromosomes")
    p.add_argument("--skip-dna", action="store_true", help="if set, skip the chromosome embedding step")  # if glm embeddings successful but not plm embeddings
    p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
    p.add_argument("--chrom-model", default="caduceus")
    p.add_argument("--tf-model", default="esm-dbp")
    p.add_argument("--out-dir", default="data_files/processed/tfclust/hg38_tf/embeddings")
    p.add_argument("--device", default="cpu")
    args = p.parse_args()

    os.makedirs(args.out_dir, exist_ok=True)
    device = args.device

    if not args.skip_dna:
        peak_fasta = Path(args.peak_fasta)
        if peak_fasta.exists():
            # Load peak sequences from FASTA
            from Bio import SeqIO

            peak_seqs = []
            peak_ids = []
            for rec in SeqIO.parse(peak_fasta, "fasta"):
                peak_ids.append(rec.id)
                peak_seqs.append(str(rec.seq))
            print(f"Embedding {len(peak_seqs)} binding peak sequences from {peak_fasta}", flush=True)
            dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
            out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
            embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
        elif args.genome_json_dir:
            # Legacy: load full chromosomes from JSONs (chr1–22, X, Y, M)
            genome_dir = Path(args.genome_json_dir)
            chrom_seqs, chrom_ids = [], []
            primary_pattern = re.compile(r"^hg38_chr(?:[1-9]|1[0-9]|2[0-2]|X|Y|M)\.json$")
            for j in sorted(genome_dir.iterdir()):
                if not primary_pattern.match(j.name):
                    continue
                data = json.loads(j.read_text())
                seq = data.get("dna") or data.get("sequence")
                chrom = data.get("chrom") or j.stem.split("_")[-1]
                chrom_seqs.append(seq)
                chrom_ids.append(chrom)
            cutoff = CaduceusEmbedder(device).chunk_size
            long_chroms = [
                (chrom, len(seq))
                for chrom, seq in zip(chrom_ids, chrom_seqs)
                if len(seq) > cutoff
            ]
            if long_chroms:
                print("⚠️ Chromosomes exceeding Caduceus max tokens ({}):".format(cutoff))
                for chrom, L in long_chroms:
                    print(f"  {chrom}: {L} bases")
            else:
                print("All chromosomes ≤ Caduceus limit ({}).".format(cutoff))

            chrom_embedder = get_embedder(args.chrom_model, device, for_dna=True)
            out_chrom = Path(args.out_dir) / f"chrom_{args.chrom_model}.npy"
            embed_and_save(chrom_seqs, chrom_ids, chrom_embedder, out_chrom)
        else:
            raise ValueError("No input for DNA embedding: provide a peak FASTA (default binding_peaks_unique.fa) or set --genome-json-dir for chromosome JSONs.")

    # Load TF sequences
    tf_seqs, tf_ids = [], []
    for record in SeqIO.parse(args.tf_fasta, "fasta"):
        tf_ids.append(record.id)
        tf_seqs.append(str(record.seq))

    # embed and save
    tf_embedder = get_embedder(args.tf_model, device, for_dna=False)
    out_tf = Path(args.out_dir) / f"tf_{args.tf_model}.npy"
    embed_and_save(tf_seqs, tf_ids, tf_embedder, out_tf)

    print("Done.")
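Because every wrapper above exposes the same embed(seqs) interface, embed_and_save only has to distinguish two output shapes. Below is a model-free sketch of the two file formats it writes, using dummy arrays in place of real embedder output; the file names, sizes, and example IDs are illustrative only:

import numpy as np
from pathlib import Path

def pad_token_embeddings(list_of_arrays, pad_value=0.0):
    # same padding logic as in the script, repeated here so the sketch runs standalone
    N, D = len(list_of_arrays), list_of_arrays[0].shape[1]
    L_max = max(arr.shape[0] for arr in list_of_arrays)
    padded = np.full((N, L_max, D), pad_value, dtype=list_of_arrays[0].dtype)
    mask = np.zeros((N, L_max), dtype=bool)
    for i, arr in enumerate(list_of_arrays):
        padded[i, :arr.shape[0]] = arr
        mask[i, :arr.shape[0]] = True
    return padded, mask

# Variable-length per-token output (Caduceus-style) -> .npz holding embeddings + mask + ids
token_embs = [np.random.rand(120, 256), np.random.rand(80, 256)]
padded, mask = pad_token_embeddings(token_embs)
np.savez_compressed("peaks_demo.caduceus.npz", embeddings=padded, mask=mask,
                    ids=np.array(["peak0", "peak1"], dtype=object))

# Pooled fixed-size output (ESM-style) -> .npy plus a sidecar .ids file, one id per row
pooled = np.random.rand(2, 1280)
np.save("tf_demo.npy", pooled)
Path("tf_demo.ids").write_text("sp|O15062|ZBTB5_HUMAN\nsp|P01106|MYC_HUMAN\n")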
dpacman/classifier/model/extract_tf_symbols.py ADDED
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
import pandas as pd
from pathlib import Path

FINAL_CSV = Path("/home/a03-akrishna/DPACMAN/data_files/processed/final.csv")
OUT_SYMBOLS = Path("tf_symbols.txt")

def normalize_tf(tf_id: str) -> str:
    return tf_id.split("_seq")[0].upper()

def main():
    df = pd.read_csv(FINAL_CSV, dtype=str)
    if "TF_id" not in df.columns:
        raise RuntimeError("final.csv missing TF_id column")
    tf_raw = df["TF_id"].dropna().unique().tolist()
    normalized = sorted({normalize_tf(t) for t in tf_raw})
    print(f"Unique raw TF_id count: {len(tf_raw)}")
    print(f"Unique normalized TF symbols: {len(normalized)}")
    with open(OUT_SYMBOLS, "w") as f:
        for s in normalized:
            f.write(s + "\n")
    print(f"Wrote normalized TF symbols to {OUT_SYMBOLS}")
    # Optional: show sample
    print("Sample symbols:", normalized[:50])

if __name__ == "__main__":
    main()
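The normalization rule is simply "drop any _seq suffix and uppercase"; a short sketch with made-up TF_id values:

def normalize_tf(tf_id: str) -> str:
    # same rule as in the script above
    return tf_id.split("_seq")[0].upper()

for raw in ["ZBTB5_seq1", "ctcf_seq12", "GATA1"]:  # hypothetical TF_id values
    print(raw, "->", normalize_tf(raw))
# ZBTB5_seq1 -> ZBTB5
# ctcf_seq12 -> CTCF
# GATA1 -> GATA1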
dpacman/classifier/model/make_pair_list.py ADDED
@@ -0,0 +1,220 @@
#!/usr/bin/env python3
import argparse
import numpy as np
import pandas as pd
from pathlib import Path
import random
import sys

def read_ids_file(p):
    p = Path(p)
    if not p.exists():
        raise FileNotFoundError(f"IDs file not found: {p}")
    return [line.strip() for line in p.open() if line.strip()]

def split_embeddings(emb_path, ids_path, out_dir, prefix):
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    if not Path(emb_path).exists():
        raise FileNotFoundError(f"Embedding file not found: {emb_path}")
    if not Path(ids_path).exists():
        raise FileNotFoundError(f"IDs file not found: {ids_path}")

    if emb_path.endswith(".npz"):
        data = np.load(emb_path, allow_pickle=True)
        if "embeddings" in data:
            emb = data["embeddings"]
        else:
            raise ValueError(f"{emb_path} missing 'embeddings' key")
    else:
        emb = np.load(emb_path)

    ids = read_ids_file(ids_path)
    if len(ids) != emb.shape[0]:
        print(f"[WARN] length mismatch: {len(ids)} ids vs {emb.shape[0]} embeddings in {emb_path}", file=sys.stderr)

    mapping = {}
    for i, ident in enumerate(ids):
        if i >= emb.shape[0]:
            print(f"[WARN] skipping {ident}: no embedding at index {i}", file=sys.stderr)
            continue
        arr = emb[i]
        out_file = out_dir / f"{prefix}_{ident}.npy"
        np.save(out_file, arr)
        mapping[ident] = str(out_file)
    return mapping

def extract_symbol_from_tf_id(full_id: str) -> str:
    """
    Given a TF embedding ID like 'sp|O15062|ZBTB5_HUMAN' or 'ZBTB5_HUMAN',
    return the gene symbol uppercase (e.g., 'ZBTB5').
    """
    if "|" in full_id:
        try:
            # format sp|Accession|SYMBOL_HUMAN
            genepart = full_id.split("|")[2]
        except IndexError:
            genepart = full_id
    else:
        genepart = full_id
    symbol = genepart.split("_")[0]
    return symbol.upper()

def build_tf_symbol_map(tf_map):
    """
    Build mapping gene_symbol -> list of embedding paths.
    """
    symbol_map = {}
    for full_id, path in tf_map.items():
        symbol = extract_symbol_from_tf_id(full_id)
        symbol_map.setdefault(symbol, []).append(path)
    return symbol_map

def tf_key_from_path(path: str) -> str:
    """
    Given a path like .../tf_sp|O15062|ZBTB5_HUMAN.npy, extract normalized symbol 'ZBTB5'.
    """
    stem = Path(path).stem  # e.g., tf_sp|O15062|ZBTB5_HUMAN
    # remove leading prefix if present (tf_)
    if "_" in stem:
        _, rest = stem.split("_", 1)
    else:
        rest = stem
    return extract_symbol_from_tf_id(rest)

def dna_key_from_path(path: str) -> str:
    """
    Given .../dna_peak42.npy -> 'peak42'
    """
    stem = Path(path).stem
    if "_" in stem:
        _, rest = stem.split("_", 1)
    else:
        rest = stem
    return rest
def main():
    parser = argparse.ArgumentParser(
        description="Build TF-DNA pair list from final.csv with gene-symbol normalization for TFs."
    )
    parser.add_argument("--final_csv", required=True, help="final.csv with TF_id and dna_sequence")
    parser.add_argument("--dna_embed_npz", required=True, help="DNA embedding file (.npy or .npz)")
    parser.add_argument("--dna_ids", required=True, help="IDs file for DNA embeddings (e.g., peak*.ids)")
    parser.add_argument("--tf_embed_npy", required=True, help="TF embedding file (.npy or .npz)")
    parser.add_argument("--tf_ids", required=True, help="IDs file for TF embeddings (e.g., sp|...|... ids)")
    parser.add_argument("--out_dir", required=True, help="Output directory")
    parser.add_argument("--neg_per_positive", type=int, default=2, help="Negatives per positive (half same-TF, half same-DNA)")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    random.seed(args.seed)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load final.csv
    df = pd.read_csv(args.final_csv, dtype=str)
    if "TF_id" not in df.columns or "dna_sequence" not in df.columns:
        raise RuntimeError("final.csv must have columns TF_id and dna_sequence")

    # Assign dna_id (unique per dna_sequence)
    unique_seqs = df["dna_sequence"].drop_duplicates().tolist()
    seq_to_id = {seq: f"peak{i}" for i, seq in enumerate(unique_seqs)}
    df["dna_id"] = df["dna_sequence"].map(seq_to_id)
    enriched_csv = out_dir / "final_with_dna_id.csv"
    df.to_csv(enriched_csv, index=False)
    print(f"[i] Wrote augmented final.csv with dna_id to {enriched_csv}")

    # Split embeddings into per-item files
    print(f"[i] Splitting DNA embeddings from {args.dna_embed_npz} with ids {args.dna_ids}")
    dna_map = split_embeddings(args.dna_embed_npz, args.dna_ids, out_dir / "dna_single", "dna")
    print(f"[i] DNA embeddings available: {len(dna_map)} (sample: {list(dna_map.keys())[:10]})")
    print(f"[i] Splitting TF embeddings from {args.tf_embed_npy} with ids {args.tf_ids}")
    tf_map = split_embeddings(args.tf_embed_npy, args.tf_ids, out_dir / "tf_single", "tf")
    print(f"[i] TF embeddings available: {len(tf_map)} (sample: {list(tf_map.keys())[:10]})")

    # Build gene-symbol normalized map
    tf_symbol_map = build_tf_symbol_map(tf_map)
    print(f"[i] TF symbol map keys (sample): {list(tf_symbol_map.keys())[:30]}")

    # Diagnostic overlaps
    norm_tf_in_final = set(t.split("_seq")[0].upper() for t in df["TF_id"].unique())
    available_tf_symbols = set(tf_symbol_map.keys())
    intersect_tf = norm_tf_in_final & available_tf_symbols
    print(f"[i] Unique normalized TF symbols in final.csv: {len(norm_tf_in_final)}")
    print(f"[i] Available TF embedding symbols: {len(available_tf_symbols)}")
    print(f"[i] Intersection count: {len(intersect_tf)}")
    if len(intersect_tf) == 0:
        print("[ERROR] No overlap between normalized TF_id and TF embedding symbols.", file=sys.stderr)
        print("Sample normalized TFs from final.csv:", sorted(list(norm_tf_in_final))[:30], file=sys.stderr)
        print("Sample available TF symbols:", sorted(list(available_tf_symbols))[:30], file=sys.stderr)
        sys.exit(1)

    dna_ids_final = set(df["dna_id"].unique())
    available_dna_ids = set(dna_map.keys())
    intersect_dna = dna_ids_final & available_dna_ids
    print(f"[i] Unique dna_id in final.csv: {len(dna_ids_final)}. Available DNA ids: {len(available_dna_ids)}. Intersection: {len(intersect_dna)}")
    if len(intersect_dna) == 0:
        print("[ERROR] No overlap on DNA ids.", file=sys.stderr)
        sys.exit(1)

    # Build positive pairs
    positives = []
    for _, row in df.iterrows():
        tf_raw = row["TF_id"]
        tf_symbol = tf_raw.split("_seq")[0].upper()
        dnaid = row["dna_id"]
        if tf_symbol not in tf_symbol_map:
            continue
        if dnaid not in dna_map:
            continue
        # pick the first embedding for that symbol
        tf_embedding_path = tf_symbol_map[tf_symbol][0]
        positives.append((tf_embedding_path, dna_map[dnaid], 1))
    print(f"[i] Constructed {len(positives)} positive pairs after TF symbol resolution")

    if len(positives) == 0:
        print("[ERROR] No positive pairs could be constructed; aborting.", file=sys.stderr)
        sys.exit(1)

    # Build negative samples
    all_tf_symbols = sorted(tf_symbol_map.keys())
    all_dnaids = sorted(dna_map.keys())
    positive_set = set()
    for tf_path, dna_path, _ in positives:
        tf_key = tf_key_from_path(tf_path)
        dna_key = dna_key_from_path(dna_path)
        positive_set.add((tf_key, dna_key))

    negatives = []
    half = args.neg_per_positive // 2
    for tf_path, dna_path, _ in positives:
        tf_key = tf_key_from_path(tf_path)
        dna_key = dna_key_from_path(dna_path)
        # same TF, different DNA
        for _ in range(half):
            candidate_dna = random.choice(all_dnaids)
            if candidate_dna == dna_key or (tf_key, candidate_dna) in positive_set:
                continue
            negatives.append((tf_path, dna_map[candidate_dna], 0))
        # same DNA, different TF
        for _ in range(half):
            candidate_tf_symbol = random.choice(all_tf_symbols)
            if candidate_tf_symbol == tf_key or (candidate_tf_symbol, dna_key) in positive_set:
                continue
            # pick its first embedding and pair it with the current positive's DNA
            candidate_tf_path = tf_symbol_map[candidate_tf_symbol][0]
            negatives.append((candidate_tf_path, dna_path, 0))

    print(f"[i] Sampled {len(negatives)} negatives (neg_per_positive={args.neg_per_positive})")

    # Write pair list
    pair_list_path = out_dir / "pair_list.tsv"
    with open(pair_list_path, "w") as f:
        for binder_path, glm_path, label in positives + negatives:
            # binder=TF, glm=DNA
            f.write(f"{binder_path}\t{glm_path}\t{label}\n")
    print(f"[i] Wrote {len(positives)+len(negatives)} examples to {pair_list_path}")

if __name__ == "__main__":
    main()
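The matching between final.csv and the embedding files hinges on two normalizations: TF paths are reduced to gene symbols and DNA paths to peak ids, and each output line of pair_list.tsv is a tab-separated (tf_path, dna_path, label) triple. A small sketch of that normalization follows; the paths below are hypothetical:

from pathlib import Path

def extract_symbol_from_tf_id(full_id: str) -> str:
    # same parsing rule as in the script: sp|Accession|SYMBOL_HUMAN -> SYMBOL
    genepart = full_id.split("|")[2] if full_id.count("|") >= 2 else full_id
    return genepart.split("_")[0].upper()

print(extract_symbol_from_tf_id("sp|O15062|ZBTB5_HUMAN"))  # ZBTB5
print(extract_symbol_from_tf_id("ZBTB5_HUMAN"))            # ZBTB5

# tf_key_from_path / dna_key_from_path strip the leading "tf_" / "dna_" prefix from the file stem
print(Path("out/tf_single/tf_sp|O15062|ZBTB5_HUMAN.npy").stem.split("_", 1)[1])  # sp|O15062|ZBTB5_HUMAN
print(Path("out/dna_single/dna_peak42.npy").stem.split("_", 1)[1])               # peak42

# One line of pair_list.tsv: <tf_embedding_path> <tab> <dna_embedding_path> <tab> <label>
print("\t".join(["out/tf_single/tf_sp|O15062|ZBTB5_HUMAN.npy", "out/dna_single/dna_peak42.npy", "1"]))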
dpacman/classifier/model/make_peak_fasta.py ADDED
@@ -0,0 +1,13 @@
import pandas as pd
from pathlib import Path

df = pd.read_csv("/home/a03-akrishna/DPACMAN/data_files/processed/final.csv", dtype=str)  # adjust path if needed
# get unique sequences
uniq = df[["dna_sequence"]].drop_duplicates().reset_index(drop=True)
# make headers: e.g., peak0, peak1, ...
out_fa = Path("binding_peaks_unique.fa")
with open(out_fa, "w") as f:
    for i, seq in enumerate(uniq["dna_sequence"]):
        header = f">peak{i}"
        f.write(f"{header}\n{seq}\n")
print(f"Wrote {len(uniq)} unique binding sequences to {out_fa}")
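A toy run of the same logic on an in-memory frame, to show the FASTA layout it produces (the sequences are made up):

import pandas as pd

df = pd.DataFrame({"dna_sequence": ["ACGTACGT", "TTGACCAA", "ACGTACGT"]})  # stand-in for final.csv
uniq = df[["dna_sequence"]].drop_duplicates().reset_index(drop=True)
with open("binding_peaks_demo.fa", "w") as f:
    for i, seq in enumerate(uniq["dna_sequence"]):
        f.write(f">peak{i}\n{seq}\n")
# binding_peaks_demo.fa now contains:
# >peak0
# ACGTACGT
# >peak1
# TTGACCAA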
dpacman/classifier/model/model.py ADDED
@@ -0,0 +1,103 @@
import torch
from torch import nn

class LocalCNN(nn.Module):
    def __init__(self, dim: int = 256, kernel_size: int = 3):
        super().__init__()
        padding = kernel_size // 2
        self.conv = nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=padding)
        self.act = nn.GELU()
        self.ln = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor):
        # x: (batch, L, dim)
        out = self.conv(x.transpose(1, 2))  # → (batch, dim, L)
        out = self.act(out)
        out = out.transpose(1, 2)  # → (batch, L, dim)
        return self.ln(out + x)  # residual

class CrossModalBlock(nn.Module):
    def __init__(self, dim: int = 256, heads: int = 8):
        super().__init__()
        # self-attention for both sides
        self.sa_binder = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.sa_glm = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.ln_b1 = nn.LayerNorm(dim)
        self.ln_g1 = nn.LayerNorm(dim)

        self.ffn_b = nn.Sequential(nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim))
        self.ffn_g = nn.Sequential(nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim))
        self.ln_b2 = nn.LayerNorm(dim)
        self.ln_g2 = nn.LayerNorm(dim)

        # cross attention (binder queries, glm keys/values)
        self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.ln_c1 = nn.LayerNorm(dim)
        self.ffn_c = nn.Sequential(nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim))
        self.ln_c2 = nn.LayerNorm(dim)

    def forward(self, binder: torch.Tensor, glm: torch.Tensor):
        """
        binder: (batch, Lb, dim)
        glm: (batch, Lg, dim) -- has passed through its local CNN beforehand
        returns: updated binder representation (batch, Lb, dim)
        """
        # binder self-attn + ffn
        b = binder
        b_sa, _ = self.sa_binder(b, b, b)
        b = self.ln_b1(b + b_sa)
        b_ff = self.ffn_b(b)
        b = self.ln_b2(b + b_ff)

        # glm self-attn + ffn
        g = glm
        g_sa, _ = self.sa_glm(g, g, g)
        g = self.ln_g1(g + g_sa)
        g_ff = self.ffn_g(g)
        g = self.ln_g2(g + g_ff)

        # cross-attention: binder queries glm
        c_sa, _ = self.cross_attn(b, g, g)
        c = self.ln_c1(b + c_sa)
        c_ff = self.ffn_c(c)
        c = self.ln_c2(c + c_ff)
        return c  # (batch, Lb, dim)

class BindPredictor(nn.Module):
    def __init__(self,
                 input_dim: int = 256,
                 hidden_dim: int = 256,
                 heads: int = 8,
                 num_layers: int = 4,
                 use_local_cnn_on_glm: bool = True):
        super().__init__()
        self.proj_binder = nn.Linear(input_dim, hidden_dim)
        self.proj_glm = nn.Linear(input_dim, hidden_dim)
        self.use_local_cnn = use_local_cnn_on_glm
        self.local_cnn = LocalCNN(hidden_dim) if use_local_cnn_on_glm else nn.Identity()

        self.layers = nn.ModuleList([
            CrossModalBlock(hidden_dim, heads) for _ in range(num_layers)
        ])

        self.ln_out = nn.LayerNorm(hidden_dim)
        self.head = nn.Sequential(
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, binder_emb, glm_emb):
        """
        binder_emb, glm_emb: (batch, L, input_dim)
        """
        b = self.proj_binder(binder_emb)  # (B, Lb, hidden_dim)
        g = self.proj_glm(glm_emb)  # (B, Lg, hidden_dim)
        if self.use_local_cnn:
            g = self.local_cnn(g)  # local context injected

        for layer in self.layers:
            b = layer(b, g)  # update binder with cross-modal info

        pooled = b.mean(dim=1)  # (B, hidden_dim)
        out = self.ln_out(pooled)
        return self.head(out).squeeze(-1)  # (B,)
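A quick shape check of BindPredictor on random tensors (no trained weights, purely illustrative; assumes this file is importable as model.py):

import torch
from model import BindPredictor

net = BindPredictor(input_dim=256, hidden_dim=256, heads=8, num_layers=2)
binder = torch.randn(4, 300, 256)  # e.g. 4 TF embeddings, 300 tokens each (made-up data)
glm = torch.randn(4, 500, 256)     # 4 DNA peak embeddings, 500 tokens each
with torch.no_grad():
    scores = net(binder, glm)
print(scores.shape)  # torch.Size([4]); each entry is a sigmoid output in [0, 1]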
dpacman/classifier/model/train.py ADDED
@@ -0,0 +1,262 @@
#!/usr/bin/env python3
import argparse
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from model import BindPredictor
from pathlib import Path
from collections import Counter
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.decomposition import TruncatedSVD
import random
import sys

# ---- dataset ---------------------------------------------------------
class PairDataset(Dataset):
    def __init__(self, binder_paths, glm_paths, labels, tf_compressed_cache):
        """
        tf_compressed_cache: dict mapping binder_path -> compressed (256-d) tensor/array
        """
        assert len(binder_paths) == len(glm_paths) == len(labels)
        self.binder_paths = binder_paths
        self.glm_paths = glm_paths
        self.labels = labels
        self.tf_cache = tf_compressed_cache  # already reduced to 256

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # binder = TF embedding (possibly reduced)
        b = self.tf_cache[self.binder_paths[idx]]  # numpy array shape (L, 256) or (256,)
        g = np.load(self.glm_paths[idx])  # glm (DNA) embedding

        if b.ndim == 1:
            b = b[None, :]
        if g.ndim == 1:
            g = g[None, :]

        b_tensor = torch.from_numpy(b).float()
        g_tensor = torch.from_numpy(g).float()
        y = torch.tensor(self.labels[idx]).float()
        return b_tensor, g_tensor, y

def collate_fn(batch):
    binders, glms, labels = zip(*batch)
    binder_lens = [b.shape[0] for b in binders]
    glm_lens = [g.shape[0] for g in glms]
    max_b = max(binder_lens)
    max_g = max(glm_lens)

    def pad_seq(seq, target_len):
        L, D = seq.shape
        if L < target_len:
            pad = torch.zeros((target_len - L, D), dtype=seq.dtype, device=seq.device)
            return torch.cat([seq, pad], dim=0)
        return seq

    b_padded = torch.stack([pad_seq(b, max_b) for b in binders])  # (B, Lb, D)
    g_padded = torch.stack([pad_seq(g, max_g) for g in glms])  # (B, Lg, D)
    y = torch.stack(labels)
    return b_padded, g_padded, y

# ---- utilities -------------------------------------------------------
def parse_pair_list(pair_list_path):
    binder_paths, glm_paths, labels = [], [], []
    with open(pair_list_path) as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            parts = line.strip().split()
            if len(parts) != 3:
                print(f"[WARN] skipping malformed line {lineno}: {line.strip()}", file=sys.stderr)
                continue
            b, g, l = parts
            try:
                lab = int(l)
            except ValueError:
                print(f"[WARN] invalid label on line {lineno}: {l}", file=sys.stderr)
                continue
            binder_paths.append(b)
            glm_paths.append(g)
            labels.append(lab)
    return binder_paths, glm_paths, labels

def build_tf_compressed_cache(binder_paths, target_dim=256):
    """
    Load all unique TF (binder) embeddings, fit reduction if needed, and return dict mapping path->(L, target_dim) array.
    """
    unique_paths = sorted(set(binder_paths))
    print(f"[i] Found {len(unique_paths)} unique TF embedding files to compress.", flush=True)
    # Load all embeddings to determine dimensionality
    samples = []
    for p in unique_paths:
        arr = np.load(p)
        samples.append(arr)
    # Determine if reduction needed: assume all have same embedding width
    first = samples[0]
    orig_dim = first.shape[1] if first.ndim == 2 else 1
    reduction_needed = (orig_dim != target_dim)
    tf_cache = {}

    if reduction_needed:
        # Build matrix to fit SVD: we need a 2D matrix per embedding; if lengths vary we can't directly stack.
        # We'll do reduction per sequence individually using TruncatedSVD on concatenated flattened features:
        # Simplest: for variable lengths, reduce each embedding separately with a learned linear projection.
        # Here we fit a single TruncatedSVD on the concatenation of all sequence tokens (flattened) by padding/truncating to a fixed length.
        # To avoid complexity, use PCA-like linear projection learned via SVD on mean-pooled vectors:
        pooled = []
        for arr in samples:
            if arr.ndim == 2:
                pooled.append(arr.mean(axis=0))  # (orig_dim,)
            else:
                pooled.append(arr)  # degenerate
        pooled_mat = np.stack(pooled, axis=0)  # (N, orig_dim)
        print(f"[i] Fitting TruncatedSVD on TF pooled embeddings: {pooled_mat.shape} -> {target_dim}", flush=True)
        svd = TruncatedSVD(n_components=target_dim, random_state=42)
        reduced_pooled = svd.fit_transform(pooled_mat)  # (N, target_dim)

        # For each original embedding, project token-level vectors by multiplying token vector with svd.components_.T
        # svd.components_: (target_dim, orig_dim) so projection matrix is (orig_dim, target_dim)
        proj_mat = svd.components_.T  # (orig_dim, target_dim)
        for i, p in enumerate(unique_paths):
            arr = samples[i]  # shape (L, orig_dim)
            if arr.ndim == 1:
                arr2 = arr @ proj_mat  # (target_dim,)
            else:
                # project each token: (L, orig_dim) @ (orig_dim, target_dim) -> (L, target_dim)
                arr2 = arr @ proj_mat
            tf_cache[p] = arr2  # reduced per-token representation
        print("[i] Completed compression of TF embeddings.", flush=True)
    else:
        # already correct dim: just cache originals
        print(f"[i] TF embeddings already {target_dim}-dimensional; skipping reduction.", flush=True)
        for i, p in enumerate(unique_paths):
            arr = samples[i]
            tf_cache[p] = arr
    return tf_cache

def evaluate(model, dl, device):
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for b, g, y in dl:
            b = b.to(device)
            g = g.to(device)
            y = y.to(device)
            pred = model(b, g)
            all_labels.append(y.cpu())
            all_preds.append(pred.cpu())
    if not all_labels:
        return 0.0, 0.0
    y_true = torch.cat(all_labels).numpy()
    y_score = torch.cat(all_preds).numpy()
    try:
        auc = roc_auc_score(y_true, y_score)
    except Exception:
        auc = 0.0
    try:
        ap = average_precision_score(y_true, y_score)
    except Exception:
        ap = 0.0
    return auc, ap

# ---- main ------------------------------------------------------------
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--pair_list", type=str, required=True,
                        help="TSV: binder_path glm_path label")
    parser.add_argument("--out_dir", type=str, required=True)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    # reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    print("DEBUG: starting training script with in-line TF compression", flush=True)
    print(f"[i] pair_list: {args.pair_list}", flush=True)
    print(f"[i] output dir: {args.out_dir}", flush=True)
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")
    binder_paths, glm_paths, labels = parse_pair_list(args.pair_list)

    if len(labels) == 0:
        print("[ERROR] No valid pairs parsed. Exiting.", file=sys.stderr)
        sys.exit(1)

    label_counts = Counter(labels)
    print(f"[i] Total examples parsed: {len(labels)}. Label distribution: {label_counts}", flush=True)

    # build compressed TF cache (reduces to 256 if needed)
    tf_compressed_cache = build_tf_compressed_cache(binder_paths, target_dim=256)

    # simple split: 80/10/10
    n = len(labels)
    idxs = np.arange(n)
    np.random.shuffle(idxs)
    train_i = idxs[: int(0.8 * n)]
    val_i = idxs[int(0.8 * n): int(0.9 * n)]
    test_i = idxs[int(0.9 * n):]

    def subset(idxs):
        return [binder_paths[i] for i in idxs], [glm_paths[i] for i in idxs], [labels[i] for i in idxs]

    train_ds = PairDataset(*subset(train_i), tf_compressed_cache=tf_compressed_cache)
    val_ds = PairDataset(*subset(val_i), tf_compressed_cache=tf_compressed_cache)
    test_ds = PairDataset(*subset(test_i), tf_compressed_cache=tf_compressed_cache)

    print(f"[i] Train/Val/Test sizes: {len(train_ds)}/{len(val_ds)}/{len(test_ds)}", flush=True)
    if len(train_ds) == 0 or len(val_ds) == 0:
        print("[ERROR] Train or validation split is empty; cannot proceed.", file=sys.stderr)
        sys.exit(1)

    train_dl = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, collate_fn=collate_fn)
    val_dl = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)
    test_dl = DataLoader(test_ds, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)

    model = BindPredictor(input_dim=256, hidden_dim=256, heads=8, num_layers=3, use_local_cnn_on_glm=True)
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=1e-3)
    loss_fn = nn.BCELoss()

    best_val = -float("inf")
    os_out = Path(args.out_dir)
    os_out.mkdir(exist_ok=True, parents=True)

    for epoch in range(1, args.epochs + 1):
        print(f"[Epoch {epoch}] starting...", flush=True)
        model.train()
        running_loss = 0.0
        for b, g, y in train_dl:
            b = b.to(device)
            g = g.to(device)
            y = y.to(device)
            pred = model(b, g)
            loss = loss_fn(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * b.size(0)
        train_loss = running_loss / len(train_ds)
        val_auc, val_ap = evaluate(model, val_dl, device)
        print(f"[Epoch {epoch}] train_loss={train_loss:.4f} val_auc={val_auc:.4f} val_ap={val_ap:.4f}", flush=True)

        if val_auc > best_val:
            best_val = val_auc
            torch.save(model.state_dict(), os_out / "best_model.pt")
            print(f"[Epoch {epoch}] Saved new best model with val_auc={val_auc:.4f}", flush=True)

    torch.save(model.state_dict(), os_out / "last_model.pt")
    test_auc, test_ap = evaluate(model, test_dl, device)
    print(f"FINAL TEST: AUC={test_auc:.4f} AP={test_ap:.4f}", flush=True)
    print(f"[i] Models written to {os_out}/best_model.pt and last_model.pt", flush=True)

if __name__ == "__main__":
    main()
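The only non-obvious step in build_tf_compressed_cache is that the TruncatedSVD is fitted on mean-pooled vectors but its components are then applied to every token. A tiny sketch with dummy data (2 components are used here only because the toy set has 3 samples, whereas the script uses 256):

import numpy as np
from sklearn.decomposition import TruncatedSVD

# made-up per-token TF embeddings of varying lengths, width 1280
embs = [np.random.rand(50, 1280), np.random.rand(75, 1280), np.random.rand(60, 1280)]

pooled = np.stack([e.mean(axis=0) for e in embs])  # (3, 1280): one pooled vector per TF
svd = TruncatedSVD(n_components=2, random_state=42).fit(pooled)
proj = svd.components_.T                           # (1280, 2): shared projection matrix

reduced = [e @ proj for e in embs]                 # projection applied token-by-token
print([r.shape for r in reduced])  # [(50, 2), (75, 2), (60, 2)]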