svincoff committed on
Commit
29899b4
·
1 Parent(s): bc0d37c

training works

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. .gitignore +6 -1
  2. configs/callbacks/default.yaml +21 -0
  3. configs/callbacks/early_stopping.yaml +15 -0
  4. configs/callbacks/model_checkpoint.yaml +17 -0
  5. configs/callbacks/model_summary.yaml +5 -0
  6. dpacman/classifier/model/__init__.py → configs/callbacks/none.yaml +0 -0
  7. configs/callbacks/rich_progress_bar.yaml +4 -0
  8. configs/data_module/pair.yaml +13 -0
  9. configs/data_modules/pair.yaml +0 -9
  10. configs/data_task/cluster/remap.yaml +1 -1
  11. configs/data_task/download/genome.yaml +1 -1
  12. configs/data_task/download/remap.yaml +1 -1
  13. configs/data_task/embeddings/dna.yaml +8 -4
  14. configs/data_task/embeddings/protein.yaml +14 -0
  15. configs/data_task/fimo/post_fimo.yaml +1 -1
  16. configs/data_task/fimo/pre_fimo.yaml +1 -1
  17. configs/data_task/fimo/run_fimo.yaml +1 -1
  18. configs/data_task/split/remap.yaml +3 -0
  19. configs/extras/default.yaml +8 -0
  20. configs/logger/aim.yaml +28 -0
  21. configs/logger/comet.yaml +12 -0
  22. configs/logger/csv.yaml +7 -0
  23. configs/logger/many_loggers.yaml +9 -0
  24. configs/logger/mlflow.yaml +12 -0
  25. configs/logger/neptune.yaml +9 -0
  26. configs/logger/tensorboard.yaml +10 -0
  27. configs/logger/wandb.yaml +16 -0
  28. configs/model/classifier.yaml +9 -0
  29. configs/{models → model}/pooling/truncatedsvd.yaml +0 -0
  30. configs/models/classifier.yaml +0 -11
  31. configs/preprocess.yaml +1 -1
  32. configs/train.yaml +37 -2
  33. configs/trainer/cpu.yaml +5 -0
  34. configs/trainer/ddp.yaml +9 -0
  35. configs/trainer/ddp_sim.yaml +7 -0
  36. configs/trainer/default.yaml +19 -0
  37. configs/trainer/gpu.yaml +5 -0
  38. configs/trainer/mps.yaml +5 -0
  39. dpacman/classifier/loss.py +58 -0
  40. dpacman/classifier/model.py +258 -0
  41. dpacman/classifier/model/clustering_data.py +0 -383
  42. dpacman/classifier/model/compress_embeddings.py +0 -54
  43. dpacman/classifier/model/compute_embeddings.py +0 -560
  44. dpacman/classifier/model/extract_tf_symbols.py +0 -27
  45. dpacman/classifier/model/loss.py +0 -34
  46. dpacman/classifier/model/make_pair_list.py +0 -220
  47. dpacman/classifier/model/make_peak_fasta.py +0 -13
  48. dpacman/classifier/model_tmp/clustering_data.py +139 -47
  49. dpacman/classifier/model_tmp/compress_embeddings.py +15 -7
  50. dpacman/classifier/model_tmp/compute_embeddings.py +125 -59
.gitignore CHANGED
@@ -29,4 +29,9 @@ dpacman/nohup.out
  dpacman/*/__pycache__/
  dpacman/data_tasks/split/__pycache__/
  dpacman/data_tasks/cluster/__pycache__/
- dpacman/data_tasks/embeddings/__pycache__/
+ dpacman/data_tasks/embeddings/__pycache__/
+ dpacman/combine_shards.py
+ dpacman/combine.log
+ dpacman/loss_sim.py
+ dpacman/loss_temp.py
+ dpacman/peak_examples/
configs/callbacks/default.yaml ADDED
@@ -0,0 +1,21 @@
+ defaults:
+   - model_checkpoint
+   - early_stopping
+   - model_summary
+   - _self_
+
+ model_checkpoint:
+   dirpath: ${paths.output_dir}/checkpoints
+   filename: "epoch_{epoch:03d}"
+   monitor: "val/loss"
+   mode: "min"
+   save_last: True
+   auto_insert_metric_name: False
+
+ early_stopping:
+   monitor: "val/loss"
+   patience: 100
+   mode: "min"
+
+ model_summary:
+   max_depth: -1
configs/callbacks/early_stopping.yaml ADDED
@@ -0,0 +1,15 @@
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.EarlyStopping.html
+
+ early_stopping:
+   _target_: lightning.pytorch.callbacks.EarlyStopping
+   monitor: ??? # quantity to be monitored, must be specified !!!
+   min_delta: 0. # minimum change in the monitored quantity to qualify as an improvement
+   patience: 3 # number of checks with no improvement after which training will be stopped
+   verbose: False # verbosity mode
+   mode: "min" # "max" means higher metric value is better, can be also "min"
+   strict: True # whether to crash the training if monitor is not found in the validation metrics
+   check_finite: True # when set True, stops training when the monitor becomes NaN or infinite
+   stopping_threshold: null # stop training immediately once the monitored quantity reaches this threshold
+   divergence_threshold: null # stop training as soon as the monitored quantity becomes worse than this threshold
+   check_on_train_epoch_end: null # whether to run early stopping at the end of the training epoch
+   # log_rank_zero_only: False # this keyword argument isn't available in stable version
configs/callbacks/model_checkpoint.yaml ADDED
@@ -0,0 +1,17 @@
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html
+
+ model_checkpoint:
+   _target_: lightning.pytorch.callbacks.ModelCheckpoint
+   dirpath: null # directory to save the model file
+   filename: null # checkpoint filename
+   monitor: null # name of the logged metric which determines when model is improving
+   verbose: False # verbosity mode
+   save_last: null # additionally always save an exact copy of the last checkpoint to a file last.ckpt
+   save_top_k: 1 # save k best models (determined by above metric)
+   mode: "min" # "max" means higher metric value is better, can be also "min"
+   auto_insert_metric_name: True # when True, the checkpoints filenames will contain the metric name
+   save_weights_only: False # if True, then only the model's weights will be saved
+   every_n_train_steps: null # number of training steps between checkpoints
+   train_time_interval: null # checkpoints are monitored at the specified time interval
+   every_n_epochs: null # number of epochs between checkpoints
+   save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation
configs/callbacks/model_summary.yaml ADDED
@@ -0,0 +1,5 @@
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html
+
+ model_summary:
+   _target_: lightning.pytorch.callbacks.RichModelSummary
+   max_depth: 1 # the maximum depth of layer nesting that the summary will include
dpacman/classifier/model/__init__.py → configs/callbacks/none.yaml RENAMED
File without changes
configs/callbacks/rich_progress_bar.yaml ADDED
@@ -0,0 +1,4 @@
+ # https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html
+
+ rich_progress_bar:
+   _target_: lightning.pytorch.callbacks.RichProgressBar
configs/data_module/pair.yaml ADDED
@@ -0,0 +1,13 @@
+ _target_: dpacman.data_modules.pair.PairDataModule
+
+ train_file: data_files/processed/splits/by_dna/babytrain.csv
+ val_file: data_files/processed/splits/by_dna/babyval.csv
+ test_file: data_files/processed/splits/by_dna/babytest.csv
+
+ tr_shelf_path: data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf
+ dna_shelf_path: data_files/processed/embeddings/fimo_hits_only/baby_peaks_segmentnt_pernuc_with_onehot.shelf
+
+ batch_size: 32
+ num_workers: 8
+
+ maximize_num_workers: False
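Note: data_module/pair.yaml is meant to be resolved by Hydra into the PairDataModule named in _target_. A minimal smoke-test sketch follows; the constructor is assumed to mirror the YAML keys, which this diff does not confirm.

# Hypothetical smoke test for the new pair data module (constructor signature
# assumed to mirror configs/data_module/pair.yaml; not confirmed by this commit).
from dpacman.data_modules.pair import PairDataModule

dm = PairDataModule(
    train_file="data_files/processed/splits/by_dna/babytrain.csv",
    val_file="data_files/processed/splits/by_dna/babyval.csv",
    test_file="data_files/processed/splits/by_dna/babytest.csv",
    tr_shelf_path="data_files/processed/embeddings/fimo_hits_only/trs_esm.shelf",
    dna_shelf_path="data_files/processed/embeddings/fimo_hits_only/baby_peaks_segmentnt_pernuc_with_onehot.shelf",
    batch_size=32,
    num_workers=8,
    maximize_num_workers=False,
)
dm.setup(stage="fit")
batch = next(iter(dm.train_dataloader()))  # expected keys: binder_emb, glm_emb, labels, ...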
configs/data_modules/pair.yaml DELETED
@@ -1,9 +0,0 @@
-
- train_file: data_files/splits/train.csv
- val_file: data_files/splits/val.csv
- test_file: data_files/splits/test.csv
-
- batch_size: 32
- num_workers: 8
-
- maximize_num_workers: False
configs/data_task/cluster/remap.yaml CHANGED
@@ -1,5 +1,5 @@
  name: remap
- type: cluster
+ task_type: cluster

  max_protein_length: 1998
configs/data_task/download/genome.yaml CHANGED
@@ -1,5 +1,5 @@
  name: genome
- type: download
+ task_type: download
  output_dir: dpacman/data_files/raw/genomes
  genomes:
    - hg38
configs/data_task/download/remap.yaml CHANGED
@@ -1,5 +1,5 @@
  name: remap
- type: download
+ task_type: download

  nr_url: https://remap.univ-amu.fr/storage/remap2022/hg38/MACS2/remap2022_nr_macs2_hg38_v1_0.bed.gz
  nr_output_dir: dpacman/data_files/raw/remap
configs/data_task/embeddings/dna.yaml CHANGED
@@ -1,9 +1,13 @@
  name: dna
- type: embeddings
+ task_type: embeddings

  genome_json_dir: null
- chrom_model: caduceus
- input_file: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/maps/dna_seqid_to_dna_sequence.json
+ chrom_model: segmentnt
+ input_file: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/maps/dna_seqid_to_dna_sequence_with_rc.json
  out_dir: dpacman/data_files/processed/embeddings/fimo_hits_only

- device: gpu
+ device: gpu
+
+ batch_size: 1
+
+ debug: false
configs/data_task/embeddings/protein.yaml CHANGED
@@ -0,0 +1,14 @@
+ name: protein
+ task_type: embeddings
+
+ prot_model: esm
+ input_file: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/maps/tr_seqid_to_tr_sequence.json
+ out_dir: dpacman/data_files/processed/embeddings/fimo_hits_only
+
+ device: gpu
+
+ save_as_shelf: true
+
+ batch_size: 1
+
+ debug: false
configs/data_task/fimo/post_fimo.yaml CHANGED
@@ -1,5 +1,5 @@
  name: post_fimo
- type: fimo
+ task_type: fimo

  fimo_out_dir: dpacman/data_files/processed/fimo/fimo_out_q
  processed_output_csv: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed.csv
configs/data_task/fimo/pre_fimo.yaml CHANGED
@@ -1,5 +1,5 @@
  name: pre_fimo
- type: fimo
+ task_type: fimo

  paths:
    input_csv: dpacman/data_files/processed/remap/remap2022_crm_macs2_hg38_v1_0_clean.tsv
configs/data_task/fimo/run_fimo.yaml CHANGED
@@ -1,5 +1,5 @@
  name: run_fimo
- type: fimo
+ task_type: fimo

  debug: true
configs/data_task/split/remap.yaml CHANGED
@@ -10,7 +10,10 @@ cluster_output_paths:
  input_data_path: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/remap2022_crm_fimo_output_q_processed_seed0.parquet
  split_out_dir: dpacman/data_files/processed/splits

+ dna_map_path: dpacman/data_files/processed/fimo/post_fimo/fimo_hits_only/maps/dna_seqid_to_dna_sequence.json
+
  split_by: both # protein, dna, or both
+ augment_rc: true

  test_ratio: 0.10
  val_ratio: 0.10
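Note: the new dna_map_path plus augment_rc: true point to reverse-complement augmentation of the peak sequences at split time. An illustrative, standalone sketch of that transformation follows; the split task's actual implementation is not shown in this view.

# Illustrative only: reverse-complement augmentation over a dna_seqid -> sequence map.
_COMPLEMENT = str.maketrans("ACGTNacgtn", "TGCANtgcan")

def reverse_complement(seq: str) -> str:
    # Reverse the sequence and swap A<->T, C<->G; N stays N.
    return seq.translate(_COMPLEMENT)[::-1]

def augment_with_rc(dna_map: dict) -> dict:
    # Add an "<id>_rc" entry for every peak so both strands appear in training.
    out = dict(dna_map)
    for seq_id, seq in dna_map.items():
        out[f"{seq_id}_rc"] = reverse_complement(seq)
    return out

assert reverse_complement("ACCGT") == "ACGGT"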
configs/extras/default.yaml ADDED
@@ -0,0 +1,8 @@
+ # disable python warnings if they annoy you
+ ignore_warnings: False
+
+ # ask user for tags if none are provided in the config
+ enforce_tags: True
+
+ # pretty print config tree at the start of the run using Rich library
+ print_config: True
configs/logger/aim.yaml ADDED
@@ -0,0 +1,28 @@
+ # https://aimstack.io/
+
+ # example usage in lightning module:
+ # https://github.com/aimhubio/aim/blob/main/examples/pytorch_lightning_track.py
+
+ # open the Aim UI with the following command (run in the folder containing the `.aim` folder):
+ # `aim up`
+
+ aim:
+   _target_: aim.pytorch_lightning.AimLogger
+   repo: ${paths.root_dir} # .aim folder will be created here
+   # repo: "aim://ip_address:port" # can instead provide IP address pointing to Aim remote tracking server which manages the repo, see https://aimstack.readthedocs.io/en/latest/using/remote_tracking.html#
+
+   # aim allows to group runs under experiment name
+   experiment: null # any string, set to "default" if not specified
+
+   train_metric_prefix: "train/"
+   val_metric_prefix: "val/"
+   test_metric_prefix: "test/"
+
+   # sets the tracking interval in seconds for system usage metrics (CPU, GPU, memory, etc.)
+   system_tracking_interval: 10 # set to null to disable system metrics tracking
+
+   # enable/disable logging of system params such as installed packages, git info, env vars, etc.
+   log_system_params: true
+
+   # enable/disable tracking console logs (default value is true)
+   capture_terminal_logs: false # set to false to avoid infinite console log loop issue https://github.com/aimhubio/aim/issues/2550
configs/logger/comet.yaml ADDED
@@ -0,0 +1,12 @@
+ # https://www.comet.ml
+
+ comet:
+   _target_: lightning.pytorch.loggers.comet.CometLogger
+   api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable
+   save_dir: "${paths.output_dir}"
+   project_name: "lightning-hydra-template"
+   rest_api_key: null
+   # experiment_name: ""
+   experiment_key: null # set to resume experiment
+   offline: False
+   prefix: ""
configs/logger/csv.yaml ADDED
@@ -0,0 +1,7 @@
+ # csv logger built in lightning
+
+ csv:
+   _target_: lightning.pytorch.loggers.csv_logs.CSVLogger
+   save_dir: "${paths.output_dir}"
+   name: "csv/"
+   prefix: ""
configs/logger/many_loggers.yaml ADDED
@@ -0,0 +1,9 @@
+ # train with many loggers at once
+
+ defaults:
+   # - comet
+   - csv
+   # - mlflow
+   # - neptune
+   - tensorboard
+   - wandb
configs/logger/mlflow.yaml ADDED
@@ -0,0 +1,12 @@
+ # https://mlflow.org
+
+ mlflow:
+   _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger
+   # experiment_name: ""
+   # run_name: ""
+   tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI
+   tags: null
+   # save_dir: "./mlruns"
+   prefix: ""
+   artifact_location: null
+   # run_id: ""
configs/logger/neptune.yaml ADDED
@@ -0,0 +1,9 @@
+ # https://neptune.ai
+
+ neptune:
+   _target_: lightning.pytorch.loggers.neptune.NeptuneLogger
+   api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable
+   project: username/lightning-hydra-template
+   # name: ""
+   log_model_checkpoints: True
+   prefix: ""
configs/logger/tensorboard.yaml ADDED
@@ -0,0 +1,10 @@
+ # https://www.tensorflow.org/tensorboard/
+
+ tensorboard:
+   _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
+   save_dir: "${paths.output_dir}/tensorboard/"
+   name: null
+   log_graph: False
+   default_hp_metric: True
+   prefix: ""
+   # version: ""
configs/logger/wandb.yaml ADDED
@@ -0,0 +1,16 @@
+ # https://wandb.ai
+
+ wandb:
+   _target_: lightning.pytorch.loggers.wandb.WandbLogger
+   # name: "" # name of the run (normally generated by wandb)
+   save_dir: "${paths.output_dir}"
+   offline: False
+   id: null # pass correct id to resume experiment!
+   anonymous: null # enable anonymous logging
+   project: "dnabind"
+   log_model: False # upload lightning ckpts
+   prefix: "" # a string to put at the beginning of metric keys
+   # entity: "" # set to name of your wandb team
+   group: ""
+   tags: []
+   job_type: ""
configs/model/classifier.yaml ADDED
@@ -0,0 +1,9 @@
+ _target_: dpacman.classifier.model.BindPredictor
+
+ lr: 1e-4
+ alpha: 20
+ gamma: 20
+ weight_decay: 0.01
+
+ glm_input_dim: 1029
+ compressed_dim: 1029
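Note: Hydra resolves _target_ to the new BindPredictor and passes the remaining keys as constructor arguments. A short sketch of that resolution using the standard instantiate API, with values copied from the YAML above (any BindPredictor argument not listed keeps its Python default):

from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "_target_": "dpacman.classifier.model.BindPredictor",
    "lr": 1e-4,
    "alpha": 20,
    "gamma": 20,
    "weight_decay": 0.01,
    "glm_input_dim": 1029,
    "compressed_dim": 1029,
})
model = instantiate(cfg)  # builds BindPredictor(lr=1e-4, alpha=20, gamma=20, ...)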
configs/{models → model}/pooling/truncatedsvd.yaml RENAMED
File without changes
configs/models/classifier.yaml DELETED
@@ -1,11 +0,0 @@
- name: classifier
- type: train
-
- params:
-   epochs: 10
-   batch_size: 32
-   lr: 1e-4
-   seed: 42
-
- out_dir: null
- pair_list: null
configs/preprocess.yaml CHANGED
@@ -6,4 +6,4 @@ defaults:
    - hydra: default # ← tells Hydra to use the logging/output config
    - data_task: download/genome

- task_name: preprocess/${data_task.type}
+ task_name: preprocess/${data_task.task_type}
configs/train.yaml CHANGED
@@ -2,7 +2,42 @@ defaults:
    - _self_
    - paths: default
    - hydra: default # ← tells Hydra to use the logging/output config
+   - data_module: pair
+   - model: classifier
    - trainer: gpu
-   - data_task: model/classifier
+   - extras: default
+   - logger: wandb
+   - callbacks: default

+   # experiment configs allow for version control of specific hyperparameters
+   # e.g. best hyperparameters for given model and datamodule
+   - experiment: null
+
+   # config for hyperparameter optimization
+   - hparams_search: null
+
+   # debugging config (enable through command line, e.g. `python train.py debug=default`)
+   - debug: null
+
- task_name: train/${data_task.type}
+ task_name: train/${model}
+
+ # tags to help you identify your experiments
+ # you can overwrite this in experiment configs
+ # overwrite from command line with `python train.py tags="[first_tag, second_tag]"`
+ tags: ["dev"]
+
+ # set False to skip model training
+ train: True
+
+ # evaluate on test set, using best model weights achieved during training
+ # lightning chooses best weights based on the metric specified in checkpoint callback
+ test: True
+
+ # simply provide checkpoint path to resume training
+ ckpt_path: null
+
+ # seed for random number generators in pytorch, numpy and python.random
+ seed: 42
+
+ trainer:
+   max_epochs: 20
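Note: train.yaml now composes data_module, model, trainer, logger, callbacks and extras in one defaults list. A hypothetical train.py entry point matching that layout is sketched below; the repository's actual training script is not part of this 50-file view, so the wiring here is an assumption.

import hydra
import lightning as L
from omegaconf import DictConfig

@hydra.main(version_base=None, config_path="configs", config_name="train")
def main(cfg: DictConfig) -> None:
    # Assumed wiring: instantiate each composed config group and hand it to the Trainer.
    L.seed_everything(cfg.seed)
    datamodule = hydra.utils.instantiate(cfg.data_module)
    model = hydra.utils.instantiate(cfg.model)
    callbacks = [hydra.utils.instantiate(c) for c in cfg.callbacks.values()]
    loggers = [hydra.utils.instantiate(lg) for lg in cfg.logger.values()]
    trainer = hydra.utils.instantiate(cfg.trainer, callbacks=callbacks, logger=loggers)
    if cfg.train:
        trainer.fit(model, datamodule=datamodule, ckpt_path=cfg.ckpt_path)
    if cfg.test:
        trainer.test(model, datamodule=datamodule, ckpt_path="best")

if __name__ == "__main__":
    main()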
configs/trainer/cpu.yaml ADDED
@@ -0,0 +1,5 @@
+ defaults:
+   - default
+
+ accelerator: cpu
+ devices: 1
configs/trainer/ddp.yaml ADDED
@@ -0,0 +1,9 @@
+ defaults:
+   - default
+
+ strategy: ddp
+
+ accelerator: gpu
+ devices: 4
+ num_nodes: 1
+ sync_batchnorm: True
configs/trainer/ddp_sim.yaml ADDED
@@ -0,0 +1,7 @@
+ defaults:
+   - default
+
+ # simulate DDP on CPU, useful for debugging
+ accelerator: cpu
+ devices: 2
+ strategy: ddp_spawn
configs/trainer/default.yaml ADDED
@@ -0,0 +1,19 @@
+ _target_: lightning.pytorch.trainer.Trainer
+
+ default_root_dir: ${paths.output_dir}
+
+ min_epochs: 1 # prevents early stopping
+ max_epochs: 10
+
+ accelerator: cpu
+ devices: 1
+
+ # mixed precision for extra speed-up
+ # precision: 16
+
+ # perform a validation loop every N training epochs
+ check_val_every_n_epoch: 1
+
+ # set True to ensure deterministic results
+ # makes training slower but gives more reproducibility than just setting seeds
+ deterministic: False
configs/trainer/gpu.yaml ADDED
@@ -0,0 +1,5 @@
+ defaults:
+   - default
+
+ accelerator: gpu
+ devices: 1
configs/trainer/mps.yaml ADDED
@@ -0,0 +1,5 @@
+ defaults:
+   - default
+
+ accelerator: mps
+ devices: 1
dpacman/classifier/loss.py ADDED
@@ -0,0 +1,58 @@
+ """
+ Define loss functions needed for training the model
+ """
+
+ import torch
+ from torch.nn import functional as F
+
+
+ def bce_loss_masked(logits, targets, nonpeak_mask, pos_weight=None):
+     """
+     Compute the masked Binary Cross Entropy, only on certain positions.
+     We will only compute BCE on positions where nonpeak_mask == 1.0; the mask represents non-peak positions
+     """
+     loss = F.binary_cross_entropy_with_logits(
+         logits, targets, reduction="none", pos_weight=pos_weight
+     )
+     denom = nonpeak_mask.sum().clamp_min(1.0)
+     return (loss * nonpeak_mask).sum() / denom
+
+
+ def mse_peaks_only(logits, targets, peak_mask, eps=1e-8):
+     """
+     Calculate MSE on peaks only.
+     """
+     probs = torch.sigmoid(logits)
+     mse_peaks = F.mse_loss(probs * peak_mask, targets * peak_mask, reduction="sum") / (
+         peak_mask.sum() + eps
+     )
+     return mse_peaks
+
+
+ def calculate_loss(logits, targets, eps=1e-8, alpha=1.0, gamma=1.0):
+     """
+     Combine masked-BCE + global-MSE to get a loss value
+     """
+     # Calculate peak and non-peak masks.
+     # Anything outside a peak will have a label equal to 0.
+     nonpeak_mask = (targets == 0).float()
+     peak_mask = (targets > 0).float()
+
+     bce_nonpeak = bce_loss_masked(logits, targets, nonpeak_mask)
+     mse_peak = mse_peaks_only(logits, targets, peak_mask, eps=eps)
+
+     loss = alpha * bce_nonpeak + gamma * mse_peak
+
+     return loss
+
+
+ def accuracy_percentage(logits, targets, peak_thresh=0.5):
+     """
+     Compute accuracy in predicting high-confidence peaks (probability > 0.5)
+     """
+     probs = torch.sigmoid(logits)
+     preds_bin = (probs >= 0.5).float()
+     labels = (targets >= peak_thresh).float()
+     correct = (preds_bin == labels).float().sum()
+     total = torch.numel(labels)
+     return (correct / max(1, total)).item() * 100.0
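Note: the combined loss applies BCE only where targets == 0 (non-peak positions) and MSE only where targets > 0 (peak positions), then mixes the two terms with alpha and gamma. A quick usage sketch on dummy per-nucleotide tensors (shapes follow the (B, Lg) convention used by the model; the toy labels are illustrative only):

import torch
from dpacman.classifier.loss import calculate_loss, accuracy_percentage

B, Lg = 4, 128
logits = torch.randn(B, Lg)          # raw per-nucleotide model outputs (no sigmoid)
targets = torch.rand(B, Lg)          # peak labels in [0, 1]
targets[targets < 0.7] = 0.0         # zero out "non-peak" positions for this toy example

loss = calculate_loss(logits, targets, alpha=20, gamma=20)  # alpha/gamma as in classifier.yaml
acc = accuracy_percentage(logits, targets)
print(f"loss={loss.item():.4f}  peak accuracy={acc:.1f}%")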
dpacman/classifier/model.py ADDED
@@ -0,0 +1,258 @@
+ """
+ Lightning Module for the binding model.
+ """
+
+ import torch
+ from torch import nn
+ from lightning import LightningModule
+ from dpacman.utils.models import set_seed
+ from .loss import calculate_loss
+
+ set_seed()
+
+
+ class LocalCNN(nn.Module):
+     def __init__(self, dim: int = 256, kernel_size: int = 3):
+         super().__init__()
+         padding = kernel_size // 2
+         self.conv = nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=padding)
+         self.act = nn.GELU()
+         self.ln = nn.LayerNorm(dim)
+
+     def forward(self, x: torch.Tensor):
+         # x: (batch, L, dim)
+         out = self.conv(x.transpose(1, 2))  # → (batch, dim, L)
+         out = self.act(out)
+         out = out.transpose(1, 2)  # → (batch, L, dim)
+         return self.ln(out + x)  # residual
+
+
+ class CrossModalBlock(nn.Module):
+     def __init__(self, dim: int = 256, heads: int = 8):
+         super().__init__()
+         # self-attention for both sides
+         self.sa_binder = nn.MultiheadAttention(dim, heads, batch_first=True)
+         self.sa_glm = nn.MultiheadAttention(dim, heads, batch_first=True)
+         self.ln_b1 = nn.LayerNorm(dim)
+         self.ln_g1 = nn.LayerNorm(dim)
+
+         self.ffn_b = nn.Sequential(
+             nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
+         )
+         self.ffn_g = nn.Sequential(
+             nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
+         )
+         self.ln_b2 = nn.LayerNorm(dim)
+         self.ln_g2 = nn.LayerNorm(dim)
+
+         # cross attention (glm queries, binder keys/values)
+         # so the DNA path is updated by the transcription factors
+         self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+         self.ln_c1 = nn.LayerNorm(dim)
+         self.ffn_c = nn.Sequential(
+             nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim)
+         )
+         self.ln_c2 = nn.LayerNorm(dim)
+
+     def forward(self, binder: torch.Tensor, glm: torch.Tensor):
+         """
+         binder: (batch, Lb, dim)
+         glm: (batch, Lg, dim) -- has passed through its local CNN beforehand
+         returns: updated glm representation (batch, Lg, dim)
+         """
+         # binder: self-attn + ffn
+         b = binder
+         b_sa, _ = self.sa_binder(b, b, b)
+         b = self.ln_b1(b + b_sa)
+         b_ff = self.ffn_b(b)
+         b = self.ln_b2(b + b_ff)
+
+         # glm: self-attn + ffn
+         g = glm
+         g_sa, _ = self.sa_glm(g, g, g)
+         g = self.ln_g1(g + g_sa)
+         g_ff = self.ffn_g(g)
+         g = self.ln_g2(g + g_ff)
+
+         # cross-attention: glm queries binder and glm embeddings are updated
+         g_to_b_ca, _ = self.cross_attn(g, b, b)
+         g = self.ln_c1(g + g_to_b_ca)
+         g_ff = self.ffn_c(g)
+         g = self.ln_c2(g + g_ff)
+         return g  # (batch, Lg, dim)
+
+
+ class DimCompressor(nn.Module):
+     """
+     Learnable per-token compressor: maps any in_dim >= out_dim to out_dim (default 256).
+     If in_dim == out_dim, behaves as identity.
+     """
+
+     def __init__(self, in_dim: int, out_dim: int = 256):
+         super().__init__()
+         if in_dim == out_dim:
+             self.net = nn.Identity()
+         else:
+             hidden = max(out_dim * 2, (in_dim + out_dim) // 2)
+             self.net = nn.Sequential(
+                 nn.LayerNorm(in_dim),
+                 nn.Linear(in_dim, hidden),
+                 nn.GELU(),
+                 nn.Linear(hidden, out_dim),
+             )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # x: (B, L, in_dim)
+         return self.net(x)
+
+
+ class BindPredictor(LightningModule):
+     def __init__(
+         self,
+         # input_dim: int = 256,  # OLD: single input dim
+         binder_input_dim: int = 1280,  # NEW: TF (binder) original dim (e.g., 1280)
+         glm_input_dim: int = 256,  # NEW: DNA/GLM original dim (e.g., 256)
+         compressed_dim: int = 256,  # NEW: learnable compressed dim
+         hidden_dim: int = 256,
+         heads: int = 8,
+         num_layers: int = 4,
+         lr: float = 1e-4,
+         alpha: float = 20,
+         gamma: float = 20,
+         use_local_cnn_on_glm: bool = True,
+         weight_decay: float = 0.01,
+     ):
+         # Init
+         super(BindPredictor, self).__init__()
+         self.save_hyperparameters()
+
+         # Learnable compressor for binder -> 256, then project to hidden
+         self.binder_compress = DimCompressor(binder_input_dim, out_dim=compressed_dim)
+         self.proj_binder = nn.Linear(compressed_dim, hidden_dim)
+
+         # GLM side stays 256 -> hidden
+         self.proj_glm = nn.Linear(glm_input_dim, hidden_dim)
+
+         self.use_local_cnn = use_local_cnn_on_glm
+         self.local_cnn = LocalCNN(hidden_dim) if use_local_cnn_on_glm else nn.Identity()
+
+         self.layers = nn.ModuleList(
+             [CrossModalBlock(hidden_dim, heads) for _ in range(num_layers)]
+         )
+
+         self.ln_out = nn.LayerNorm(hidden_dim)
+         # self.head = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid())  # OLD: returned probabilities
+         self.head = nn.Linear(hidden_dim, 1)  # NEW: return logits (safe for AMP)
+
+     def forward(self, binder_emb, glm_emb):
+         """
+         binder_emb: (B, Lb, binder_input_dim)
+         glm_emb: (B, Lg, glm_input_dim)
+         Returns per-nucleotide logits for the GLM sequence: (B, Lg)
+         """
+         # Binder: learnable compression → 256 → hidden
+         b = self.binder_compress(binder_emb)  # (B, Lb, 256)
+         b = self.proj_binder(b)  # (B, Lb, hidden_dim)
+
+         # GLM: project → hidden, add local CNN context
+         g = self.proj_glm(glm_emb)  # (B, Lg, hidden_dim)
+         if self.use_local_cnn:
+             g = self.local_cnn(g)
+
+         # Cross-modal blocks: update the GLM states using the binder
+         for layer in self.layers:
+             g = layer(b, g)  # (B, Lg, hidden_dim)
+
+         # Predict per-nucleotide logits on the GLM tokens:
+         # return self.head(g).squeeze(-1)  # OLD: probabilities (with Sigmoid in head)
+         return self.head(g).squeeze(
+             -1
+         )  # NEW: logits (apply sigmoid only in loss/metrics)
+
+     # ----- Lightning hooks -----
+     def training_step(self, batch, batch_idx):
+         """
+         Training step taken by PyTorch-Lightning trainer. Uses batch returned by data collator.
+         Collator returns a dictionary with:
+             "binder_emb"   # [B, Lb_max, Db]
+             "binder_mask"  # [B, Lb_max]
+             "glm_emb"      # [B, Lg_max, Dg]
+             "glm_mask"     # [B, Lg_max]
+             "labels"       # [B, Lg_max]
+             "ID"
+             "tr_sequence"
+             "dna_sequence"
+         """
+         logits = self.forward(batch["binder_emb"], batch["glm_emb"])
+         loss = calculate_loss(
+             logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
+         )
+         self.log(
+             "train/loss",
+             loss,
+             on_step=True,
+             on_epoch=True,
+             prog_bar=True,
+             batch_size=logits.size(0),
+         )
+         return loss
+
+     def validation_step(self, batch, batch_idx):
+         logits = self.forward(batch["binder_emb"], batch["glm_emb"])
+         loss = calculate_loss(
+             logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
+         )
+         self.log(
+             "val/loss",
+             loss,
+             on_step=False,
+             on_epoch=True,
+             prog_bar=True,
+             batch_size=logits.size(0),
+         )
+         return loss
+
+     def test_step(self, batch, batch_idx):
+         logits = self.forward(batch["binder_emb"], batch["glm_emb"])
+         loss = calculate_loss(
+             logits, batch["labels"], alpha=self.hparams.alpha, gamma=self.hparams.gamma
+         )
+         self.log(
+             "test/loss", loss, on_step=False, on_epoch=True, batch_size=logits.size(0)
+         )
+         return loss
+
+     def on_train_epoch_end(self):
+         if False:
+             if self.train_auc.compute() is not None:
+                 self.log("train/auroc", self.train_auc.compute(), prog_bar=True)
+             self.train_auc.reset()
+
+     def on_validation_epoch_end(self):
+         if False:
+             if self.val_auc.compute() is not None:
+                 self.log("val/auroc", self.val_auc.compute(), prog_bar=True)
+             self.val_auc.reset()
+
+     def on_test_epoch_end(self):
+         if False:
+             if self.test_auc.compute() is not None:
+                 self.log("test/auroc", self.test_auc.compute(), prog_bar=True)
+             self.test_auc.reset()
+
+     def configure_optimizers(self):
+         # AdamW + cosine as a sensible default
+         opt = torch.optim.AdamW(
+             self.parameters(),
+             lr=self.hparams.lr,
+             weight_decay=self.hparams.weight_decay,
+         )
+         # Scheduler optional—comment out if you prefer fixed LR
+         sch = torch.optim.lr_scheduler.CosineAnnealingLR(
+             opt, T_max=max(self.trainer.max_epochs, 1)
+         )
+         return {
+             "optimizer": opt,
+             "lr_scheduler": {"scheduler": sch, "interval": "epoch"},
+         }
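Note: a shape-level smoke test for the BindPredictor forward pass, with dimensions matching the defaults above (illustrative only, not part of this commit):

import torch
from dpacman.classifier.model import BindPredictor

model = BindPredictor(binder_input_dim=1280, glm_input_dim=256,
                      compressed_dim=256, hidden_dim=256, heads=8, num_layers=2)
binder_emb = torch.randn(2, 300, 1280)   # (B, Lb, binder_input_dim), e.g. ESM protein tokens
glm_emb = torch.randn(2, 1000, 256)      # (B, Lg, glm_input_dim), per-nucleotide DNA embeddings
with torch.no_grad():
    logits = model(binder_emb, glm_emb)
assert logits.shape == (2, 1000)         # one logit per nucleotide of the DNA peak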
dpacman/classifier/model/clustering_data.py DELETED
@@ -1,383 +0,0 @@
1
- #!/usr/bin/env python3
2
- import argparse
3
- import numpy as np
4
- import pandas as pd
5
- from pathlib import Path
6
- import random
7
- import sys
8
- import subprocess
9
- from collections import defaultdict
10
-
11
- # ─────────────────────────────────────────────────────────────────────────
12
- # Original helpers (kept; some lightly edited/commented where needed)
13
- # ─────────────────────────────────────────────────────────────────────────
14
-
15
- def read_ids_file(p):
16
- p = Path(p)
17
- if not p.exists():
18
- raise FileNotFoundError(f"IDs file not found: {p}")
19
- return [line.strip() for line in p.open() if line.strip()]
20
-
21
- def split_embeddings(emb_path, ids_path, out_dir, prefix):
22
- out_dir = Path(out_dir)
23
- out_dir.mkdir(parents=True, exist_ok=True)
24
-
25
- if not Path(emb_path).exists():
26
- raise FileNotFoundError(f"Embedding file not found: {emb_path}")
27
- if not Path(ids_path).exists():
28
- raise FileNotFoundError(f"IDs file not found: {ids_path}")
29
-
30
- if emb_path.endswith(".npz"):
31
- data = np.load(emb_path, allow_pickle=True)
32
- if "embeddings" in data:
33
- emb = data["embeddings"]
34
- else:
35
- raise ValueError(f"{emb_path} missing 'embeddings' key")
36
- else:
37
- emb = np.load(emb_path)
38
-
39
- ids = read_ids_file(ids_path)
40
- if len(ids) != emb.shape[0]:
41
- print(f"[WARN] length mismatch: {len(ids)} ids vs {emb.shape[0]} embeddings in {emb_path}", file=sys.stderr)
42
-
43
- mapping = {}
44
- for i, ident in enumerate(ids):
45
- if i >= emb.shape[0]:
46
- print(f"[WARN] skipping {ident}: no embedding at index {i}", file=sys.stderr)
47
- continue
48
- arr = emb[i]
49
- out_file = out_dir / f"{prefix}_{ident}.npy"
50
- np.save(out_file, arr)
51
- mapping[ident] = str(out_file)
52
- return mapping
53
-
54
- def extract_symbol_from_tf_id(full_id: str) -> str:
55
- """
56
- Given a TF embedding ID like 'sp|O15062|ZBTB5_HUMAN' or 'ZBTB5_HUMAN',
57
- return the gene symbol uppercase (e.g., 'ZBTB5').
58
- """
59
- if "|" in full_id:
60
- try:
61
- # format sp|Accession|SYMBOL_HUMAN
62
- genepart = full_id.split("|")[2]
63
- except IndexError:
64
- genepart = full_id
65
- else:
66
- genepart = full_id
67
- symbol = genepart.split("_")[0]
68
- return symbol.upper()
69
-
70
- def build_tf_symbol_map(tf_map):
71
- """
72
- Build mapping gene_symbol -> list of embedding paths.
73
- """
74
- symbol_map = {}
75
- for full_id, path in tf_map.items():
76
- symbol = extract_symbol_from_tf_id(full_id)
77
- symbol_map.setdefault(symbol, []).append(path)
78
- return symbol_map
79
-
80
- def tf_key_from_path(path: str) -> str:
81
- """
82
- Given a path like .../tf_sp|O15062|ZBTB5_HUMAN.npy, extract normalized symbol 'ZBTB5'.
83
- """
84
- stem = Path(path).stem # e.g., tf_sp|O15062|ZBTB5_HUMAN
85
- # remove leading prefix if present (tf_)
86
- if "_" in stem:
87
- _, rest = stem.split("_", 1)
88
- else:
89
- rest = stem
90
- return extract_symbol_from_tf_id(rest)
91
-
92
- def dna_key_from_path(path: str) -> str:
93
- """
94
- Given .../dna_peak42.npy -> 'peak42'
95
- """
96
- stem = Path(path).stem
97
- if "_" in stem:
98
- _, rest = stem.split("_", 1)
99
- else:
100
- rest = stem
101
- return rest
102
-
103
- # ─────────────────────────────────────────────────────────────────────────
104
- # New helpers for MMseqs clustering & cluster-level splitting
105
- # ─────────────────────────────────────────────────────────────────────────
106
-
107
- def write_dna_fasta(df: pd.DataFrame, out_fasta: Path) -> None:
108
- """
109
- Write unique DNA sequences to FASTA using dna_id as header.
110
- Requires df with columns: dna_id, dna_sequence
111
- """
112
- uniq = df[["dna_id", "dna_sequence"]].drop_duplicates()
113
- with open(out_fasta, "w") as f:
114
- for _, row in uniq.iterrows():
115
- did = row["dna_id"]
116
- seq = str(row["dna_sequence"]).upper().replace(" ", "").replace("\n", "")
117
- f.write(f">{did}\n{seq}\n")
118
-
119
- def run_mmseqs_easy_cluster(
120
- mmseqs_bin: str,
121
- fasta: Path,
122
- out_prefix: Path,
123
- tmp_dir: Path,
124
- min_seq_id: float,
125
- coverage: float,
126
- cov_mode: int,
127
- ) -> Path:
128
- """
129
- Runs mmseqs easy-cluster on nucleotide sequences.
130
- Returns the path to a clusters TSV file (creating it if the default one isn't present).
131
- """
132
- tmp_dir.mkdir(parents=True, exist_ok=True)
133
- out_prefix.parent.mkdir(parents=True, exist_ok=True)
134
-
135
- cmd = [
136
- mmseqs_bin, "easy-cluster",
137
- str(fasta), str(out_prefix), str(tmp_dir),
138
- "--min-seq-id", str(min_seq_id),
139
- "-c", str(coverage),
140
- "--cov-mode", str(cov_mode),
141
- # You can add performance flags here if needed, e.g.:
142
- # "--threads", "8"
143
- ]
144
- print("[i] Running:", " ".join(cmd), flush=True)
145
- subprocess.run(cmd, check=True)
146
-
147
- # MMseqs easy-cluster typically writes <out_prefix>_cluster.tsv
148
- default_tsv = Path(str(out_prefix) + "_cluster.tsv")
149
- if default_tsv.exists():
150
- print(f"[i] Found cluster TSV: {default_tsv}")
151
- return default_tsv
152
-
153
- # Fallback: try createtsv if default is missing
154
- # This requires the internal DBs. easy-cluster creates DBs alongside out_prefix.
155
- # We'll try to locate them and emit a TSV.
156
- in_db = Path(str(out_prefix) + "_query")
157
- cl_db = Path(str(out_prefix) + "_cluster")
158
- out_tsv = Path(str(out_prefix) + "_fallback_cluster.tsv")
159
- if in_db.exists() and cl_db.exists():
160
- cmd2 = [mmseqs_bin, "createtsv", str(in_db), str(in_db), str(cl_db), str(out_tsv)]
161
- print("[i] Creating TSV via createtsv:", " ".join(cmd2), flush=True)
162
- subprocess.run(cmd2, check=True)
163
- if out_tsv.exists():
164
- return out_tsv
165
-
166
- raise FileNotFoundError("Could not locate clusters TSV from mmseqs. "
167
- "Expected {default_tsv} or createtsv fallback.")
168
-
169
- def parse_mmseqs_clusters(tsv_path: Path) -> dict:
170
- """
171
- Parse MMseqs cluster TSV (rep \t member). Returns dna_id -> cluster_rep_id
172
- """
173
- mapping = {}
174
- with open(tsv_path) as f:
175
- for line in f:
176
- parts = line.rstrip("\n").split("\t")
177
- if len(parts) < 2:
178
- continue
179
- rep, member = parts[0], parts[1]
180
- mapping[member] = rep
181
- # Some TSVs include rep->rep; if not, ensure rep is mapped to itself:
182
- if rep not in mapping:
183
- mapping[rep] = rep
184
- return mapping
185
-
186
- def assign_clusters_to_splits(cluster_rep_to_members: dict,
187
- val_frac: float,
188
- test_frac: float,
189
- seed: int = 42):
190
- """
191
- cluster_rep_to_members: dict[rep] = [members...]
192
- Returns: dict with keys 'train','val','test' mapping to sets of dna_id.
193
- Ensures all members of a cluster go to the same split.
194
- """
195
- rng = random.Random(seed)
196
- reps = list(cluster_rep_to_members.keys())
197
- rng.shuffle(reps)
198
-
199
- # Greedy-ish fill by total member counts to match desired fractions.
200
- total = sum(len(cluster_rep_to_members[r]) for r in reps)
201
- target_val = int(round(total * val_frac))
202
- target_test = int(round(total * test_frac))
203
- cur_val = cur_test = 0
204
-
205
- val_ids, test_ids, train_ids = set(), set(), set()
206
- for rep in reps:
207
- members = cluster_rep_to_members[rep]
208
- c = len(members)
209
- # Fill val first, then test, then train
210
- if cur_val + c <= target_val:
211
- val_ids.update(members); cur_val += c
212
- elif cur_test + c <= target_test:
213
- test_ids.update(members); cur_test += c
214
- else:
215
- train_ids.update(members)
216
-
217
- return {"train": train_ids, "val": val_ids, "test": test_ids}
218
-
219
- # ─────────────────────────────────────────────────────────────────────────
220
- # Main
221
- # ─────────────────────────────────────────────────────────────────────────
222
-
223
- def main():
224
- parser = argparse.ArgumentParser(
225
- description="Build TF-DNA pair lists with MMseqs clustering on DNA to prevent split leakage."
226
- )
227
- parser.add_argument("--final_csv", required=True, help="final.csv with TF_id and dna_sequence")
228
- parser.add_argument("--dna_embed_npz", required=True, help="DNA embedding file (.npy or .npz)")
229
- parser.add_argument("--dna_ids", required=True, help="IDs file for DNA embeddings (peak*.ids)")
230
- parser.add_argument("--tf_embed_npy", required=True, help="TF embedding file (.npy or .npz)")
231
- parser.add_argument("--tf_ids", required=True, help="IDs file for TF embeddings (sp|... ids)")
232
- parser.add_argument("--out_dir", required=True, help="Output directory")
233
- parser.add_argument("--seed", type=int, default=42)
234
-
235
- # NEW: MMseqs options & split fractions
236
- parser.add_argument("--mmseqs_bin", default="mmseqs", help="Path to mmseqs binary")
237
- parser.add_argument("--min_seq_id", type=float, default=0.9, help="MMseqs --min-seq-id")
238
- parser.add_argument("--cov", type=float, default=0.8, help="MMseqs -c coverage fraction")
239
- parser.add_argument("--cov_mode", type=int, default=1, help="MMseqs --cov-mode (1 = coverage of target)")
240
- parser.add_argument("--val_frac", type=float, default=0.10)
241
- parser.add_argument("--test_frac", type=float, default=0.10)
242
- parser.add_argument("--tmp_dir", default=None, help="MMseqs tmp dir (defaults to out_dir/tmp)")
243
- args = parser.parse_args()
244
-
245
- random.seed(args.seed)
246
- out_dir = Path(args.out_dir)
247
- out_dir.mkdir(parents=True, exist_ok=True)
248
-
249
- # Load final.csv
250
- df = pd.read_csv(args.final_csv, dtype=str)
251
- if "TF_id" not in df.columns or "dna_sequence" not in df.columns:
252
- raise RuntimeError("final.csv must have columns TF_id and dna_sequence")
253
-
254
- # Assign dna_id (unique per dna_sequence)
255
- unique_seqs = df["dna_sequence"].drop_duplicates().tolist()
256
- seq_to_id = {seq: f"peak{i}" for i, seq in enumerate(unique_seqs)}
257
- df["dna_id"] = df["dna_sequence"].map(seq_to_id)
258
- enriched_csv = out_dir / "final_with_dna_id.csv"
259
- df.to_csv(enriched_csv, index=False)
260
- print(f"[i] Wrote augmented final.csv with dna_id to {enriched_csv}")
261
-
262
- # Split embeddings into per-item files (unchanged)
263
- print(f"[i] Splitting DNA embeddings from {args.dna_embed_npz} with ids {args.dna_ids}")
264
- dna_map = split_embeddings(args.dna_embed_npz, args.dna_ids, out_dir / "dna_single", "dna")
265
- print(f"[i] DNA embeddings available: {len(dna_map)} (sample: {list(dna_map.keys())[:10]})")
266
- print(f"[i] Splitting TF embeddings from {args.tf_embed_npy} with ids {args.tf_ids}")
267
- tf_map = split_embeddings(args.tf_embed_npy, args.tf_ids, out_dir / "tf_single", "tf")
268
- print(f"[i] TF embeddings available: {len(tf_map)} (sample: {list(tf_map.keys())[:10]})")
269
-
270
- # Build gene-symbol normalized map
271
- tf_symbol_map = build_tf_symbol_map(tf_map)
272
- print(f"[i] TF symbol map keys (sample): {list(tf_symbol_map.keys())[:30]}")
273
-
274
- # Diagnostic overlaps
275
- norm_tf_in_final = set(t.split("_seq")[0].upper() for t in df["TF_id"].unique())
276
- available_tf_symbols = set(tf_symbol_map.keys())
277
- intersect_tf = norm_tf_in_final & available_tf_symbols
278
- print(f"[i] Unique normalized TF symbols in final.csv: {len(norm_tf_in_final)}")
279
- print(f"[i] Available TF embedding symbols: {len(available_tf_symbols)}")
280
- print(f"[i] Intersection count: {len(intersect_tf)}")
281
- if len(intersect_tf) == 0:
282
- print("[ERROR] No overlap between normalized TF_id and TF embedding symbols.", file=sys.stderr)
283
- print("Sample normalized TFs from final.csv:", sorted(list(norm_tf_in_final))[:30], file=sys.stderr)
284
- print("Sample available TF symbols:", sorted(list(available_tf_symbols))[:30], file=sys.stderr)
285
- sys.exit(1)
286
-
287
- dna_ids_final = set(df["dna_id"].unique())
288
- available_dna_ids = set(dna_map.keys())
289
- intersect_dna = dna_ids_final & available_dna_ids
290
- print(f"[i] Unique dna_id in final.csv: {len(dna_ids_final)}. Available DNA ids: {len(available_dna_ids)}. Intersection: {len(intersect_dna)}")
291
- if len(intersect_dna) == 0:
292
- print("[ERROR] No overlap on DNA ids.", file=sys.stderr)
293
- sys.exit(1)
294
-
295
- # ── NEW: MMseqs clustering on DNA sequences ───────────────────────────
296
- fasta_path = out_dir / "dna_unique.fasta"
297
- write_dna_fasta(df, fasta_path)
298
- print(f"[i] Wrote FASTA with {df['dna_id'].nunique()} unique sequences → {fasta_path}")
299
-
300
- tmp_dir = Path(args.tmp_dir) if args.tmp_dir else (out_dir / "mmseqs_tmp")
301
- cluster_prefix = out_dir / "mmseqs_dna_clusters"
302
- clusters_tsv = run_mmseqs_easy_cluster(
303
- mmseqs_bin=args.mmseqs_bin,
304
- fasta=fasta_path,
305
- out_prefix=cluster_prefix,
306
- tmp_dir=tmp_dir,
307
- min_seq_id=args.min_seq_id,
308
- coverage=args.cov,
309
- cov_mode=args.cov_mode,
310
- )
311
-
312
- # Parse clusters
313
- member_to_rep = parse_mmseqs_clusters(clusters_tsv) # dna_id -> rep_id
314
- # Build rep -> members list
315
- rep_to_members = defaultdict(list)
316
- for member, rep in member_to_rep.items():
317
- rep_to_members[rep].append(member)
318
-
319
- print(f"[i] Parsed {len(rep_to_members)} clusters from {clusters_tsv}")
320
- clusters_table = []
321
- for rep, members in rep_to_members.items():
322
- for m in members:
323
- clusters_table.append((m, rep))
324
- clusters_df = pd.DataFrame(clusters_table, columns=["dna_id", "cluster_id"])
325
- clusters_df.to_csv(out_dir / "clusters.tsv", sep="\t", index=False)
326
- print(f"[i] Wrote clusters mapping → {out_dir / 'clusters.tsv'}")
327
-
328
- # Attach cluster_id back to final df
329
- df = df.merge(clusters_df, on="dna_id", how="left")
330
- df.to_csv(out_dir / "final_with_dna_id_and_cluster.csv", index=False)
331
- print(f"[i] Wrote {out_dir / 'final_with_dna_id_and_cluster.csv'}")
332
-
333
- # Assign entire clusters to splits
334
- splits = assign_clusters_to_splits(rep_to_members,
335
- val_frac=args.val_frac,
336
- test_frac=args.test_frac,
337
- seed=args.seed)
338
- for k in ["train", "val", "test"]:
339
- print(f"[i] {k}: {len(splits[k])} dna_ids")
340
-
341
- # ── Build positive pairs only, per split (NO negatives) ───────────────
342
- positives_by_split = {"train": [], "val": [], "test": []}
343
- # Build a quick dna_id -> embedding path map
344
- dnaid_to_path = {did: path for did, path in dna_map.items()}
345
-
346
- pos_count = 0
347
- for _, row in df.iterrows():
348
- tf_raw = row["TF_id"]
349
- tf_symbol = tf_raw.split("_seq")[0].upper()
350
- dnaid = row["dna_id"]
351
- if (tf_symbol not in tf_symbol_map) or (dnaid not in dnaid_to_path):
352
- continue
353
- tf_embedding_path = tf_symbol_map[tf_symbol][0] # first embedding per symbol
354
-
355
- # decide split by dna_id cluster assignment
356
- if dnaid in splits["train"]:
357
- positives_by_split["train"].append((tf_embedding_path, dnaid_to_path[dnaid], 1))
358
- elif dnaid in splits["val"]:
359
- positives_by_split["val"].append((tf_embedding_path, dnaid_to_path[dnaid], 1))
360
- elif dnaid in splits["test"]:
361
- positives_by_split["test"].append((tf_embedding_path, dnaid_to_path[dnaid], 1))
362
- pos_count += 1
363
-
364
- print(f"[i] Constructed positives across splits (rows in final.csv iterated: {len(df)})")
365
- for k in ["train", "val", "test"]:
366
- print(f"[i] positives[{k}] = {len(positives_by_split[k])}")
367
-
368
- # # OLD: negatives (kept commented)
369
- # negatives = []
370
- # print(f"[i] Sampled {len(negatives)} negatives (neg_per_positive not used)")
371
-
372
- # Emit split-specific pair lists
373
- for split in ["train", "val", "test"]:
374
- out_tsv = out_dir / f"pair_list_{split}.tsv"
375
- with open(out_tsv, "w") as f:
376
- for binder_path, glm_path, label in positives_by_split[split]: # + negatives if you add later
377
- f.write(f"{binder_path}\t{glm_path}\t{label}\n")
378
- print(f"[i] Wrote {len(positives_by_split[split])} examples to {out_tsv}")
379
-
380
- print("✅ Done. Cluster-aware splits ready.")
381
-
382
- if __name__ == "__main__":
383
- main()
dpacman/classifier/model/compress_embeddings.py DELETED
@@ -1,54 +0,0 @@
- # compress_embeddings.py
- # USAGE: python compress_embeddings.py --input_glob "/path/to/esm_embeddings/*.npy" --output_dir "/path/to/compressed_embeddings" --esm_dim 1280 --out_dim 256
- # --------------
- import os
- import glob
- import numpy as np
- import torch
- from torch import nn
-
- class EmbeddingCompressor(nn.Module):
-     def __init__(self, input_dim: int = 1280, output_dim: int = 256):
-         super().__init__()
-         self.fc = nn.Linear(input_dim, output_dim)
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         """
-         x: (batch, L, input_dim) or (L, input_dim)
-         returns: (batch, output_dim) or (output_dim,)
-         """
-         if x.dim() == 2:
-             # single example: mean over tokens
-             x = x.mean(dim=0, keepdim=True)  # → (1, input_dim)
-         else:
-             # batch: mean over tokens
-             x = x.mean(dim=1)  # → (batch, input_dim)
-         return self.fc(x)  # → (batch, output_dim)
-
- def compress_file(in_path: str, out_path: str, model: EmbeddingCompressor):
-     arr = np.load(in_path)  # shape (L, D) or (batch, L, D)
-     tensor = torch.from_numpy(arr).float()
-     with torch.no_grad():
-         compressed = model(tensor)  # → (batch, 256)
-     out = compressed.cpu().numpy()
-     np.save(out_path, out)
-     print(f"Saved {out_path}")
-
- if __name__ == "__main__":
-     import argparse
-     parser = argparse.ArgumentParser(description="Compress ESM embeddings to 256-d")
-     parser.add_argument("--input_glob", type=str, required=True,
-                         help="Glob for your .npy ESM embeddings (e.g. data/esm_*.npy)")
-     parser.add_argument("--output_dir", type=str, required=True)
-     parser.add_argument("--esm_dim", type=int, default=1280)
-     parser.add_argument("--out_dim", type=int, default=256)
-     args = parser.parse_args()
-
-     os.makedirs(args.output_dir, exist_ok=True)
-     compressor = EmbeddingCompressor(args.esm_dim, args.out_dim)
-     compressor.eval()
-
-     for fn in glob.glob(args.input_glob):
-         base = os.path.basename(fn).replace(".npy", "_256.npy")
-         out_path = os.path.join(args.output_dir, base)
-         compress_file(fn, out_path, compressor)
dpacman/classifier/model/compute_embeddings.py DELETED
@@ -1,560 +0,0 @@
1
- """
2
- Plug-and-play embedding extraction for:
3
- • Chromosome sequences (from raw UCSC JSON)
4
- • TF sequences (transcription_factors.fasta)
5
-
6
- Usage example (DNA + protein in one go):
7
- module load miniconda/24.7.1
8
- conda activate dpacman
9
- python dpacman/data/compute_embeddings.py \
10
- --genome-json-dir ../data_files/raw/genomes/hg38 \
11
- --tf-fasta ../data_files/processed/tfclust/hg38_tf/transcription_factors.fasta \
12
- --chrom-model caduceus \
13
- --tf-model esm-dbp \
14
- --out-dir ../data_files/processed/tfclust/hg38_tf/embeddings \
15
- --device cuda
16
- """
17
- import os
18
- import re
19
- import argparse
20
- import json
21
- import numpy as np
22
- from pathlib import Path
23
- import torch
24
- from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, pipeline
25
- import esm
26
- from Bio import SeqIO
27
- import time
28
- import pandas as pd
29
- from tqdm.auto import tqdm
30
- import logging, math
31
-
32
- # ---- model wrappers ----
33
-
34
- class CaduceusEmbedder:
35
- def __init__(self, device, chunk_size=131_072, overlap=0):
36
- """
37
- device: 'cpu' or 'cuda'
38
- chunk_size: max bases (and thus tokens) to send in one forward pass
39
- overlap: how many bases each window overlaps the previous; 0 = no overlap
40
- """
41
- model_name = "kuleshov-group/caduceus-ph_seqlen-131k_d_model-256_n_layer-16"
42
- self.tokenizer = AutoTokenizer.from_pretrained(
43
- model_name, trust_remote_code=True
44
- )
45
- self.model = AutoModel.from_pretrained(
46
- model_name, trust_remote_code=True
47
- ).to(device).eval()
48
- self.device = device
49
- self.chunk_size = chunk_size
50
- self.step = chunk_size - overlap
51
-
52
- def embed(self, seqs):
53
- """
54
- seqs: List[str] of DNA sequences (each <= chunk_size for this test)
55
- returns: np.ndarray of shape (N, L, D), raw per‐token embeddings
56
- """
57
- # outputs = []
58
- # for seq in seqs:
59
- # # --- new: raw per‐token embeddings in one shot ---
60
- # toks = self.tokenizer(
61
- # seq,
62
- # return_tensors="pt",
63
- # padding=False,
64
- # truncation=True,
65
- # max_length=self.chunk_size
66
- # ).to(self.device)
67
- # with torch.no_grad():
68
- # out = self.model(**toks).last_hidden_state # (1, L, D)
69
- # outputs.append(out.cpu().numpy()[0]) # (L, D)
70
-
71
- # return np.stack(outputs, axis=0) # (N, L, D)
72
- outputs = []
73
- for seq in tqdm(seqs, total=len(seqs), desc="DNA: Caduceus", dynamic_ncols=True):
74
- toks = self.tokenizer(
75
- seq,
76
- return_tensors="pt",
77
- padding=False,
78
- truncation=True,
79
- max_length=self.chunk_size
80
- ).to(self.device)
81
- with torch.no_grad():
82
- out = self.model(**toks).last_hidden_state # (1, L, D)
83
- outputs.append(out.cpu().numpy()[0]) # (L, D)
84
- return outputs # list of variable-length (L_i, D) arrays
85
-
86
-
87
- def benchmark(self, lengths=None):
88
- """
89
- Time embedding on single-sequence of various lengths.
90
- By default tests [5K,10K,50K,100K,chunk_size].
91
- """
92
- tests = lengths or [5_000, 10_000, 50_000, 100_000, self.chunk_size]
93
- print(f"→ Benchmarking Caduceus on device={self.device}")
94
- for sz in tests:
95
- seq = "A" * sz
96
- # Warm-up
97
- _ = self.embed([seq])
98
- if self.device != "cpu":
99
- torch.cuda.synchronize()
100
- t0 = time.perf_counter()
101
- _ = self.embed([seq])
102
- if self.device != "cpu":
103
- torch.cuda.synchronize()
104
- t1 = time.perf_counter()
105
- print(f" length={sz:6,d} time={(t1-t0)*1000:7.1f} ms")
106
-
107
- class SegmentNTEmbedder:
108
- def __init__(self, device):
109
- self.tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/segment_nt", trust_remote_code=True)
110
- self.model = AutoModel.from_pretrained("InstaDeepAI/segment_nt", trust_remote_code=True).to(device).eval()
111
- self.device = device
112
-
113
- def _adjust_length(self, input_ids):
114
- bs, L = input_ids.shape
115
- excl = L - 1
116
- remainder = (excl) % 4
117
- if remainder != 0:
118
- pad_needed = 4 - remainder
119
- pad_tensor = torch.full((bs, pad_needed), self.tokenizer.pad_token_id, dtype=input_ids.dtype, device=input_ids.device)
120
- input_ids = torch.cat([input_ids, pad_tensor], dim=1)
121
- return input_ids
122
-
123
- def embed(self, seqs, batch_size=16):
124
- """
125
- seqs: List[str]
126
- Returns: np.ndarray of shape (N, D)
127
- """
128
- all_embeddings = []
129
- for i in range(0, len(seqs), batch_size):
130
- batch_seqs = seqs[i : i + batch_size]
131
- encoded = self.tokenizer.batch_encode_plus(
132
- batch_seqs,
133
- return_tensors="pt",
134
- padding=True,
135
- truncation=True,
136
- )
137
- input_ids = encoded["input_ids"].to(self.device) # (B, L)
138
- attention_mask = input_ids != self.tokenizer.pad_token_id
139
-
140
- input_ids = self._adjust_length(input_ids)
141
- attention_mask = (input_ids != self.tokenizer.pad_token_id)
142
-
143
- with torch.no_grad():
144
- outs = self.model(
145
- input_ids,
146
- attention_mask=attention_mask,
147
- output_hidden_states=True,
148
- return_dict=True,
149
- )
150
- if hasattr(outs, "hidden_states") and outs.hidden_states is not None:
151
- last_hidden = outs.hidden_states[-1] # (B, L, D)
152
- else:
153
- last_hidden = outs.last_hidden_state # fallback
154
-
155
- # Exclude CLS token if present (assume first token) and pool
156
- pooled = last_hidden[:, 1:, :].mean(dim=1) # (B, D)
157
- all_embeddings.append(pooled.cpu().numpy())
158
-
159
- # release fragmentation
160
- torch.cuda.empty_cache()
161
-
162
- return np.vstack(all_embeddings) # (N, D)
163
-
164
-
165
- class DNABertEmbedder:
166
- def __init__(self, device):
167
- self.tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
168
- self.model = AutoModel.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True).to(device)
169
- self.device = device
170
-
171
- def embed(self, seqs):
172
- embs = []
173
- for s in seqs:
174
- tokens = self.tokenizer(s, return_tensors="pt", padding=True)["input_ids"].to(self.device)
175
- with torch.no_grad():
176
- out = self.model(tokens).last_hidden_state.mean(1)
177
- embs.append(out.cpu().numpy())
178
- return np.vstack(embs)
179
-
180
- class NucleotideTransformerEmbedder:
181
- def __init__(self, device):
182
- # HF “feature-extraction” returns a list of (L, D) arrays for each input
183
- # device: “cpu” or “cuda”
184
- self.pipe = pipeline(
185
- "feature-extraction",
186
- model="InstaDeepAI/nucleotide-transformer-500m-1000g",
187
- device= -1 if device=="cpu" else 0 # HF uses -1 for CPU, 0 for GPU #:contentReference[oaicite:0]{index=0}
188
- )
189
-
190
- def embed(self, seqs):
191
- """
192
- seqs: List[str] of raw DNA sequences
193
- returns: (N, D) array, one D-dim vector per sequence
194
- """
195
- all_embeddings = self.pipe(seqs, truncation=True, padding=True)
196
- # all_embeddings is a List of shape (L, D) arrays
197
- pooled = [ np.mean(x, axis=0) for x in all_embeddings ]
198
- return np.vstack(pooled)
199
-
200
- # class ESMEmbedder:
201
- # def __init__(self, device):
202
- # self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
203
- # self.batch_converter = self.alphabet.get_batch_converter()
204
- # self.model.to(device).eval()
205
- # self.device = device
206
-
207
- # def embed(self, seqs):
208
- # batch = [(str(i), seq) for i, seq in enumerate(seqs)]
209
- # _, _, toks = self.batch_converter(batch)
210
- # toks = toks.to(self.device)
211
- # with torch.no_grad():
212
- # results = self.model(toks, repr_layers=[33], return_contacts=False)
213
- # reps = results["representations"][33]
214
- # return reps[:, 1:-1].mean(1).cpu().numpy()
215
-
216
-
217
- class ESMEmbedder:
218
- def __init__(self, device, model_name="esm2_t33_650M_UR50D"):
219
- # Try to load the specified ESM-2 model; fallback to esm1b if missing
220
- self.device = device
221
- try:
222
- self.model, self.alphabet = getattr(esm.pretrained, model_name)()
223
- self.is_esm2 = model_name.lower().startswith("esm2")
224
- except AttributeError:
225
- # fallback to ESM-1b
226
- self.model, self.alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
227
- self.is_esm2 = False
228
- self.batch_converter = self.alphabet.get_batch_converter()
229
- self.model.to(device).eval()
230
- # determine max length: esm2 models vary; use default 1024 for esm1b
231
- self.max_len = 4096 if self.is_esm2 else 1024 # adjust if your esm2 variant has explicit limit
232
- # for chunking: reserve 2 tokens if model uses BOS/EOS
233
- self.chunk_size = self.max_len - 2
234
- self.overlap = self.chunk_size // 4 # 25% overlap to smooth boundaries
235
-
236
- def _chunk_sequence(self, seq):
237
- """
238
- Return list of possibly overlapping chunks of seq, each <= chunk_size.
239
- """
240
- if len(seq) <= self.chunk_size:
241
- return [seq]
242
- step = self.chunk_size - self.overlap
243
- chunks = []
244
- for i in range(0, len(seq), step):
245
- chunk = seq[i : i + self.chunk_size]
246
- if not chunk:
247
- break
248
- chunks.append(chunk)
249
- return chunks
250
-
251
- def embed(self, seqs):
252
- """
253
- seqs: List[str] of protein sequences.
254
- Returns: np.ndarray of shape (N, D) pooled per-sequence embeddings.
255
- """
256
- all_embeddings = []
257
- for i, seq in enumerate(seqs):
258
- chunks = self._chunk_sequence(seq)
259
- chunk_vecs = []
260
- # process chunks in batch if small number, else sequentially
261
- for chunk in chunks:
262
- batch = [(str(i), chunk)]
263
- _, _, toks = self.batch_converter(batch)
264
- toks = toks.to(self.device)
265
- with torch.no_grad():
266
- results = self.model(toks, repr_layers=[33], return_contacts=False)
267
- reps = results["representations"][33] # (1, L, D)
268
- # remove BOS/EOS if present: take 1:-1 if length permits
269
- if reps.size(1) > 2:
270
- rep = reps[:, 1:-1].mean(1) # (1, D)
271
- else:
272
- rep = reps.mean(1) # fallback
273
- chunk_vecs.append(rep.squeeze(0)) # (D,)
274
- if len(chunk_vecs) == 1:
275
- seq_vec = chunk_vecs[0]
276
- else:
277
- # average chunk vectors
278
- stacked = torch.stack(chunk_vecs, dim=0) # (num_chunks, D)
279
- seq_vec = stacked.mean(0)
280
- all_embeddings.append(seq_vec.cpu().numpy())
281
- return np.vstack(all_embeddings) # (N, D)
282
-
283
-
284
- # class ESMDBPEmbedder:
285
- # def __init__(self, device):
286
- # base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
287
- # model_path = (
288
- # Path(__file__).resolve().parent.parent
289
- # / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
290
- # )
291
- # checkpoint = torch.load(model_path, map_location="cpu")
292
- # clean_sd = {}
293
- # for k, v in checkpoint.items():
294
- # clean_sd[k.replace("module.", "")] = v
295
- # result = base_model.load_state_dict(clean_sd, strict=False)
296
- # if result.missing_keys:
297
- # print(f"[ESMDBP] missing keys: {result.missing_keys}")
298
- # if result.unexpected_keys:
299
- # print(f"[ESMDBP] unexpected keys: {result.unexpected_keys}")
300
-
301
- # self.model = base_model.to(device).eval()
302
- # self.alphabet = alphabet
303
- # self.batch_converter = alphabet.get_batch_converter()
304
- # self.device = device
305
-
306
- # def embed(self, seqs):
307
- # batch = [(str(i), seq) for i, seq in enumerate(seqs)]
308
- # _, _, toks = self.batch_converter(batch)
309
- # toks = toks.to(self.device)
310
- # with torch.no_grad():
311
- # out = self.model(toks, repr_layers=[33], return_contacts=False)
312
- # reps = out["representations"][33]
313
- # # skip start/end tokens
314
- # return reps[:, 1:-1].mean(1).cpu().numpy()
315
-
316
- class ESMDBPEmbedder:
317
- def __init__(self, device):
318
- base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
319
- model_path = (
320
- Path(__file__).resolve().parent.parent
321
- / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
322
- )
323
- checkpoint = torch.load(model_path, map_location="cpu")
324
- clean_sd = {}
325
- for k, v in checkpoint.items():
326
- clean_sd[k.replace("module.", "")] = v
327
- result = base_model.load_state_dict(clean_sd, strict=False)
328
- if result.missing_keys:
329
- print(f"[ESMDBP] missing keys: {result.missing_keys}")
330
- if result.unexpected_keys:
331
- print(f"[ESMDBP] unexpected keys: {result.unexpected_keys}")
332
-
333
- self.model = base_model.to(device).eval()
334
- self.alphabet = alphabet
335
- self.batch_converter = alphabet.get_batch_converter()
336
- self.device = device
337
- self.max_len = 1024 # same limit as esm1b
338
- self.chunk_size = self.max_len - 2
339
- self.overlap = self.chunk_size // 4
340
-
341
- def _chunk_sequence(self, seq):
342
- if len(seq) <= self.chunk_size:
343
- return [seq]
344
- step = self.chunk_size - self.overlap
345
- chunks = []
346
- for i in range(0, len(seq), step):
347
- chunk = seq[i : i + self.chunk_size]
348
- if not chunk:
349
- break
350
- chunks.append(chunk)
351
- return chunks
352
-
353
- def embed(self, seqs):
354
- all_embeddings = []
355
- for i, seq in enumerate(seqs):
356
- chunks = self._chunk_sequence(seq)
357
- chunk_vecs = []
358
- for chunk in chunks:
359
- batch = [(str(i), chunk)]
360
- _, _, toks = self.batch_converter(batch)
361
- toks = toks.to(self.device)
362
- with torch.no_grad():
363
- out = self.model(toks, repr_layers=[33], return_contacts=False)
364
- reps = out["representations"][33]
365
- if reps.size(1) > 2:
366
- rep = reps[:, 1:-1].mean(1)
367
- else:
368
- rep = reps.mean(1)
369
- chunk_vecs.append(rep.squeeze(0))
370
- if len(chunk_vecs) == 1:
371
- seq_vec = chunk_vecs[0]
372
- else:
373
- stacked = torch.stack(chunk_vecs, dim=0)
374
- seq_vec = stacked.mean(0)
375
- all_embeddings.append(seq_vec.cpu().numpy())
376
- return np.vstack(all_embeddings)
377
-
378
- class GPNEmbedder:
379
- def __init__(self, device):
380
- model_name = "songlab/gpn-msa-sapiens"
381
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
382
- self.model = AutoModelForMaskedLM.from_pretrained(model_name)
383
- self.model.to(device)
384
- self.model.eval()
385
- self.device = device
386
-
387
- def embed(self, seqs):
388
- inputs = self.tokenizer(
389
- seqs,
390
- return_tensors="pt",
391
- padding=True,
392
- truncation=True
393
- ).to(self.device)
394
-
395
- with torch.no_grad():
396
- last_hidden = self.model(**inputs).last_hidden_state
397
- return last_hidden.mean(dim=1).cpu().numpy()
398
-
399
- class ProGenEmbedder:
400
- def __init__(self, device):
401
- model_name = "jinyuan22/ProGen2-base"
402
- self.tokenizer = AutoTokenizer.from_pretrained(model_name)
403
- self.model = AutoModel.from_pretrained(model_name).to(device).eval()
404
- self.device = device
405
-
406
- def embed(self, seqs):
407
- inputs = self.tokenizer(
408
- seqs,
409
- return_tensors="pt",
410
- padding=True,
411
- truncation=True
412
- ).to(self.device)
413
- with torch.no_grad():
414
- last_hidden = self.model(**inputs).last_hidden_state
415
- return last_hidden.mean(dim=1).cpu().numpy()
416
-
417
- # ---- main pipeline ----
418
-
419
- def get_embedder(name, device, for_dna=True):
420
- name = name.lower()
421
- if for_dna:
422
- if name=="caduceus": return CaduceusEmbedder(device)
423
- if name=="dnabert": return DNABertEmbedder(device)
424
- if name=="nucleotide": return NucleotideTransformerEmbedder(device)
425
- if name=="gpn": return GPNEmbedder(device)
426
- if name=="segmentnt": return SegmentNTEmbedder(device)
427
- else:
428
- if name in ("esm",): return ESMEmbedder(device)
429
- if name in ("esm-dbp","esm_dbp"): return ESMDBPEmbedder(device)
430
- if name=="progen": return ProGenEmbedder(device)
431
- raise ValueError(f"Unknown model {name} (for_dna={for_dna})")
432
-
433
-
434
- def pad_token_embeddings(list_of_arrays, pad_value=0.0):
435
- """
436
- list_of_arrays: list of (L_i, D) numpy arrays
437
- Returns:
438
- padded: (N, L_max, D) array
439
- mask: (N, L_max) boolean array where True = real token, False = padding
440
- """
441
- N = len(list_of_arrays)
442
- D = list_of_arrays[0].shape[1]
443
- L_max = max(arr.shape[0] for arr in list_of_arrays)
444
- padded = np.full((N, L_max, D), pad_value, dtype=list_of_arrays[0].dtype)
445
- mask = np.zeros((N, L_max), dtype=bool)
446
- for i, arr in enumerate(list_of_arrays):
447
- L = arr.shape[0]
448
- padded[i, :L] = arr
449
- mask[i, :L] = True
450
- return padded, mask
451
-
452
- def embed_and_save(seqs, ids, embedder, out_path):
453
- embs = embedder.embed(seqs)
454
-
455
- # Decide whether we got variable-length per-token outputs (list of (L, D))
456
- is_variable_token = isinstance(embs, (list, tuple)) and len(embs) > 0 and hasattr(embs[0], "shape") and embs[0].ndim == 2
457
-
458
- if is_variable_token:
459
- # pad to (N, L_max, D) + mask
460
- padded, mask = pad_token_embeddings(embs)
461
- # Save both embeddings and mask together in an .npz for convenience
462
- np.savez_compressed(out_path.with_suffix(".caduceus.npz"),
463
- embeddings=padded,
464
- mask=mask,
465
- ids=np.array(ids, dtype=object))
466
- else:
467
- # fixed shape output, e.g., pooled (N, D)
468
- array = np.vstack(embs) if isinstance(embs, list) else embs
469
- np.save(out_path, array)
470
- with open(out_path.with_suffix(".ids"), "w") as f:
471
- f.write("\n".join(ids))
472
-
473
-
474
- if __name__=="__main__":
475
-
476
- p = argparse.ArgumentParser()
477
- #p.add_argument("--peak_fasta", default="binding_peaks_unique.fa", help="FASTA of deduplicated binding peak sequences; if present this is used for DNA embedding instead of genome JSONs")
478
- p.add_argument("--genome-json-dir", default=None, help="(fallback) directory of UCSC JSONs for full chromosome embedding if peak FASTA is missing or you explicitly want chromosomes")
479
- p.add_argument("--skip-dna", action="store_true", help="if set, skip the chromosome embedding step") #if glm embeddings successful but not plm embeddings
480
- p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
481
- p.add_argument("--chrom-model", default="caduceus")
482
- p.add_argument("--tf-model", default="esm-dbp")
483
- p.add_argument("--out-dir", default="dpacman/model/embeddings")
484
- p.add_argument("--device", default="cpu")
485
- args = p.parse_args()
486
-
487
- os.makedirs(args.out_dir, exist_ok=True)
488
- device = args.device
489
- print(device)
490
-
491
- if not args.skip_dna:
492
- if args.genome_json_dir == None:
493
- dna_df = pd.read_parquet('/home/a03-akrishna/DPACMAN/dpacman/model/remap2022_crm_fimo_output_q_processed.parquet', engine='pyarrow')
494
- #df.to_csv('/home/a03-akrishna/DPACMAN/dpacman/model/remap2022_crm_fimo_output_q_processed.csv', index=False)
495
- peak_seqs = dna_df["dna_sequence"]
496
- peak_ids = dna_df["ID"]
497
- print(f"Embedding {len(peak_seqs)} binding peak sequences from processed remap data", flush=True)
498
- dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
499
- out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
500
- embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
501
-
502
- # peak_fasta = Path(args.peak_fasta)
503
- # if peak_fasta.exists():
504
- # # Load peak sequences from FASTA
505
- # from Bio import SeqIO
506
-
507
- # peak_seqs = []
508
- # peak_ids = []
509
- # for rec in SeqIO.parse(peak_fasta, "fasta"):
510
- # peak_ids.append(rec.id)
511
- # peak_seqs.append(str(rec.seq))
512
- # print(f"Embedding {len(peak_seqs)} binding peak sequences from {peak_fasta}", flush=True)
513
- # dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
514
- # out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
515
- # embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
516
- elif args.genome_json_dir:
517
- # Legacy: load full chromosomes from JSONs (chr1–22, X, Y, M)
518
- genome_dir = Path(args.genome_json_dir)
519
- chrom_seqs, chrom_ids = [], []
520
- primary_pattern = re.compile(r"^hg38_chr(?:[1-9]|1[0-9]|2[0-2]|X|Y|M)\.json$")
521
- for j in sorted(genome_dir.iterdir()):
522
- if not primary_pattern.match(j.name):
523
- continue
524
- data = json.loads(j.read_text())
525
- seq = data.get("dna") or data.get("sequence")
526
- chrom = data.get("chrom") or j.stem.split("_")[-1]
527
- chrom_seqs.append(seq)
528
- chrom_ids.append(chrom)
529
- cutoff = CaduceusEmbedder(device).chunk_size
530
- long_chroms = [
531
- (chrom, len(seq))
532
- for chrom, seq in zip(chrom_ids, chrom_seqs)
533
- if len(seq) > cutoff
534
- ]
535
- if long_chroms:
536
- print("⚠️ Chromosomes exceeding Caduceus max tokens ({}):".format(cutoff))
537
- for chrom, L in long_chroms:
538
- print(f" {chrom}: {L} bases")
539
- else:
540
- print("All chromosomes ≤ Caduceus limit ({}).".format(cutoff))
541
-
542
- chrom_embedder = get_embedder(args.chrom_model, device, for_dna=True)
543
- out_chrom = Path(args.out_dir) / f"chrom_{args.chrom_model}.npy"
544
- embed_and_save(chrom_seqs, chrom_ids, chrom_embedder, out_chrom)
545
- else:
546
- raise ValueError("No input for DNA embedding: provide a peak FASTA (default binding_peaks_unique.fa) or set --genome-json-dir for chromosome JSONs.")
547
-
548
-
549
- #Load TF sequences
550
- tf_seqs, tf_ids = [], []
551
- for record in SeqIO.parse(args.tf_fasta, "fasta"):
552
- tf_ids.append(record.id)
553
- tf_seqs.append(str(record.seq))
554
-
555
- # embed and save
556
- tf_embedder = get_embedder(args.tf_model, device, for_dna=False)
557
- out_tf = Path(args.out_dir) / f"tf_{args.tf_model}.npy"
558
- embed_and_save(tf_seqs, tf_ids, tf_embedder, out_tf)
559
-
560
- print("Done.")
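Note on the output convention of embed_and_save() above: token-level (variable-length) results are written to a compressed .npz holding "embeddings", "mask", and "ids", while pooled results go to a plain .npy with a sidecar .ids text file. A minimal loading sketch under that convention, using only NumPy; the path in the usage lines is a placeholder, not a file shipped with this commit:

# Minimal sketch of reading back what embed_and_save() writes (paths are placeholders).
import numpy as np
from pathlib import Path

def load_embeddings(out_path: Path):
    npz_path = out_path.with_suffix(".caduceus.npz")
    if npz_path.exists():
        # token-level case: padded (N, L_max, D) embeddings plus a boolean mask
        data = np.load(npz_path, allow_pickle=True)
        return data["embeddings"], data["mask"], list(data["ids"])
    # pooled case: (N, D) matrix with one id per line in the sidecar .ids file
    emb = np.load(out_path)
    ids = out_path.with_suffix(".ids").read_text().splitlines()
    return emb, None, ids

# Hypothetical usage (file names illustrative only):
# emb, mask, ids = load_embeddings(Path("embeddings/peaks_caduceus.npy"))
# print(emb.shape, len(ids))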
dpacman/classifier/model/extract_tf_symbols.py DELETED
@@ -1,27 +0,0 @@
1
- #!/usr/bin/env python3
2
- import pandas as pd
3
- from pathlib import Path
4
-
5
- FINAL_CSV = Path("/home/a03-akrishna/DPACMAN/data_files/processed/final.csv")
6
- OUT_SYMBOLS = Path("tf_symbols.txt")
7
-
8
- def normalize_tf(tf_id: str) -> str:
9
- return tf_id.split("_seq")[0].upper()
10
-
11
- def main():
12
- df = pd.read_csv(FINAL_CSV, dtype=str)
13
- if "TF_id" not in df.columns:
14
- raise RuntimeError("final.csv missing TF_id column")
15
- tf_raw = df["TF_id"].dropna().unique().tolist()
16
- normalized = sorted({normalize_tf(t) for t in tf_raw})
17
- print(f"Unique raw TF_id count: {len(tf_raw)}")
18
- print(f"Unique normalized TF symbols: {len(normalized)}")
19
- with open(OUT_SYMBOLS, "w") as f:
20
- for s in normalized:
21
- f.write(s + "\n")
22
- print(f"Wrote normalized TF symbols to {OUT_SYMBOLS}")
23
- # Optional: show sample
24
- print("Sample symbols:", normalized[:50])
25
-
26
- if __name__ == "__main__":
27
- main()
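The normalization above just strips any "_seqN" suffix and upper-cases the symbol, so isoform-suffixed TF_id values collapse to one gene symbol. A quick sanity check with invented inputs:

# Toy check of the TF_id normalization used above (inputs are made up).
def normalize_tf(tf_id: str) -> str:
    return tf_id.split("_seq")[0].upper()

assert normalize_tf("FOXA1_seq2") == "FOXA1"
assert normalize_tf("ctcf") == "CTCF"
assert normalize_tf("ZBTB5_seq10") == "ZBTB5"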
dpacman/classifier/model/loss.py DELETED
@@ -1,34 +0,0 @@
1
- """
2
- Define loss functions needed for training the model
3
- """
4
- import torch
5
- from torch.nn import functional as F
6
-
7
- def combined_loss_components(logits, targets, peak_thresh=0.5, eps=1e-8):
8
- probs = torch.sigmoid(logits)
9
- labels = (targets >= peak_thresh).float()
10
- non_peak_mask = (labels == 0).float()
11
- peak_mask = (labels == 1).float()
12
-
13
- bce_all = F.binary_cross_entropy_with_logits(logits, labels, reduction='none')
14
- bce_non = (bce_all * non_peak_mask)
15
- bce_non = bce_non.sum() / (non_peak_mask.sum() + eps)
16
-
17
- mse_peaks = F.mse_loss(probs * peak_mask, targets * peak_mask, reduction='sum') \
18
- / (peak_mask.sum() + eps)
19
-
20
- t_dist = (targets + eps)
21
- p_dist = (probs + eps)
22
- t_dist = t_dist / t_dist.sum(dim=1, keepdim=True)
23
- p_dist = p_dist / p_dist.sum(dim=1, keepdim=True)
24
- kl = (t_dist * (t_dist.clamp(min=eps).log() - p_dist.clamp(min=eps).log())).sum(dim=1).mean()
25
-
26
- return bce_non, kl, mse_peaks, probs
27
-
28
- def accuracy_percentage(logits, targets, peak_thresh=0.5):
29
- probs = torch.sigmoid(logits)
30
- preds_bin = (probs >= 0.5).float()
31
- labels = (targets >= peak_thresh).float()
32
- correct = (preds_bin == labels).float().sum()
33
- total = torch.numel(labels)
34
- return (correct / max(1, total)).item() * 100.0
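combined_loss_components() returns its three terms separately (BCE over non-peak positions, a profile KL term, and an MSE restricted to peaks) and leaves the weighting to the caller. One plausible way to fold them into a single training loss, shown with placeholder weights and random tensors; this assumes the two functions above are in scope and is not the repo's actual training step:

# Illustrative only: w_bce / w_kl / w_mse are placeholders, not values from this repo.
import torch

logits = torch.randn(4, 100, requires_grad=True)   # (batch, positions)
targets = torch.rand(4, 100)                       # continuous targets in [0, 1]

bce_non, kl, mse_peaks, probs = combined_loss_components(logits, targets)
w_bce, w_kl, w_mse = 1.0, 1.0, 1.0                 # placeholder weights
loss = w_bce * bce_non + w_kl * kl + w_mse * mse_peaks
loss.backward()
print(float(loss), accuracy_percentage(logits, targets))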
dpacman/classifier/model/make_pair_list.py DELETED
@@ -1,220 +0,0 @@
1
- #!/usr/bin/env python3
2
- import argparse
3
- import numpy as np
4
- import pandas as pd
5
- from pathlib import Path
6
- import random
7
- import sys
8
-
9
- def read_ids_file(p):
10
- p = Path(p)
11
- if not p.exists():
12
- raise FileNotFoundError(f"IDs file not found: {p}")
13
- return [line.strip() for line in p.open() if line.strip()]
14
-
15
- def split_embeddings(emb_path, ids_path, out_dir, prefix):
16
- out_dir = Path(out_dir)
17
- out_dir.mkdir(parents=True, exist_ok=True)
18
-
19
- if not Path(emb_path).exists():
20
- raise FileNotFoundError(f"Embedding file not found: {emb_path}")
21
- if not Path(ids_path).exists():
22
- raise FileNotFoundError(f"IDs file not found: {ids_path}")
23
-
24
- if emb_path.endswith(".npz"):
25
- data = np.load(emb_path, allow_pickle=True)
26
- if "embeddings" in data:
27
- emb = data["embeddings"]
28
- else:
29
- raise ValueError(f"{emb_path} missing 'embeddings' key")
30
- else:
31
- emb = np.load(emb_path)
32
-
33
- ids = read_ids_file(ids_path)
34
- if len(ids) != emb.shape[0]:
35
- print(f"[WARN] length mismatch: {len(ids)} ids vs {emb.shape[0]} embeddings in {emb_path}", file=sys.stderr)
36
-
37
- mapping = {}
38
- for i, ident in enumerate(ids):
39
- if i >= emb.shape[0]:
40
- print(f"[WARN] skipping {ident}: no embedding at index {i}", file=sys.stderr)
41
- continue
42
- arr = emb[i]
43
- out_file = out_dir / f"{prefix}_{ident}.npy"
44
- np.save(out_file, arr)
45
- mapping[ident] = str(out_file)
46
- return mapping
47
-
48
- def extract_symbol_from_tf_id(full_id: str) -> str:
49
- """
50
- Given a TF embedding ID like 'sp|O15062|ZBTB5_HUMAN' or 'ZBTB5_HUMAN',
51
- return the gene symbol uppercase (e.g., 'ZBTB5').
52
- """
53
- if "|" in full_id:
54
- try:
55
- # format sp|Accession|SYMBOL_HUMAN
56
- genepart = full_id.split("|")[2]
57
- except IndexError:
58
- genepart = full_id
59
- else:
60
- genepart = full_id
61
- symbol = genepart.split("_")[0]
62
- return symbol.upper()
63
-
64
- def build_tf_symbol_map(tf_map):
65
- """
66
- Build mapping gene_symbol -> list of embedding paths.
67
- """
68
- symbol_map = {}
69
- for full_id, path in tf_map.items():
70
- symbol = extract_symbol_from_tf_id(full_id)
71
- symbol_map.setdefault(symbol, []).append(path)
72
- return symbol_map
73
-
74
- def tf_key_from_path(path: str) -> str:
75
- """
76
- Given a path like .../tf_sp|O15062|ZBTB5_HUMAN.npy, extract normalized symbol 'ZBTB5'.
77
- """
78
- stem = Path(path).stem # e.g., tf_sp|O15062|ZBTB5_HUMAN
79
- # remove leading prefix if present (tf_)
80
- if "_" in stem:
81
- _, rest = stem.split("_", 1)
82
- else:
83
- rest = stem
84
- return extract_symbol_from_tf_id(rest)
85
-
86
- def dna_key_from_path(path: str) -> str:
87
- """
88
- Given .../dna_peak42.npy -> 'peak42'
89
- """
90
- stem = Path(path).stem
91
- if "_" in stem:
92
- _, rest = stem.split("_", 1)
93
- else:
94
- rest = stem
95
- return rest
96
-
97
- def main():
98
- parser = argparse.ArgumentParser(
99
- description="Build TF-DNA pair list from final.csv with gene-symbol normalization for TFs."
100
- )
101
- parser.add_argument("--final_csv", required=True, help="final.csv with TF_id and dna_sequence")
102
- parser.add_argument("--dna_embed_npz", required=True, help="DNA embedding file (.npy or .npz)")
103
- parser.add_argument("--dna_ids", required=True, help="IDs file for DNA embeddings (e.g., peak*.ids)")
104
- parser.add_argument("--tf_embed_npy", required=True, help="TF embedding file (.npy or .npz)")
105
- parser.add_argument("--tf_ids", required=True, help="IDs file for TF embeddings (e.g., sp|...|... ids)")
106
- parser.add_argument("--out_dir", required=True, help="Output directory")
107
- parser.add_argument("--neg_per_positive", type=int, default=2, help="Negatives per positive (half same-TF, half same-DNA)")
108
- parser.add_argument("--seed", type=int, default=42)
109
- args = parser.parse_args()
110
-
111
- random.seed(args.seed)
112
- out_dir = Path(args.out_dir)
113
- out_dir.mkdir(parents=True, exist_ok=True)
114
-
115
- # Load final.csv
116
- df = pd.read_csv(args.final_csv, dtype=str)
117
- if "TF_id" not in df.columns or "dna_sequence" not in df.columns:
118
- raise RuntimeError("final.csv must have columns TF_id and dna_sequence")
119
-
120
- # Assign dna_id (unique per dna_sequence)
121
- unique_seqs = df["dna_sequence"].drop_duplicates().tolist()
122
- seq_to_id = {seq: f"peak{i}" for i, seq in enumerate(unique_seqs)}
123
- df["dna_id"] = df["dna_sequence"].map(seq_to_id)
124
- enriched_csv = out_dir / "final_with_dna_id.csv"
125
- df.to_csv(enriched_csv, index=False)
126
- print(f"[i] Wrote augmented final.csv with dna_id to {enriched_csv}")
127
-
128
- # Split embeddings into per-item files
129
- print(f"[i] Splitting DNA embeddings from {args.dna_embed_npz} with ids {args.dna_ids}")
130
- dna_map = split_embeddings(args.dna_embed_npz, args.dna_ids, out_dir / "dna_single", "dna")
131
- print(f"[i] DNA embeddings available: {len(dna_map)} (sample: {list(dna_map.keys())[:10]})")
132
- print(f"[i] Splitting TF embeddings from {args.tf_embed_npy} with ids {args.tf_ids}")
133
- tf_map = split_embeddings(args.tf_embed_npy, args.tf_ids, out_dir / "tf_single", "tf")
134
- print(f"[i] TF embeddings available: {len(tf_map)} (sample: {list(tf_map.keys())[:10]})")
135
-
136
- # Build gene-symbol normalized map
137
- tf_symbol_map = build_tf_symbol_map(tf_map)
138
- print(f"[i] TF symbol map keys (sample): {list(tf_symbol_map.keys())[:30]}")
139
-
140
- # Diagnostic overlaps
141
- norm_tf_in_final = set(t.split("_seq")[0].upper() for t in df["TF_id"].unique())
142
- available_tf_symbols = set(tf_symbol_map.keys())
143
- intersect_tf = norm_tf_in_final & available_tf_symbols
144
- print(f"[i] Unique normalized TF symbols in final.csv: {len(norm_tf_in_final)}")
145
- print(f"[i] Available TF embedding symbols: {len(available_tf_symbols)}")
146
- print(f"[i] Intersection count: {len(intersect_tf)}")
147
- if len(intersect_tf) == 0:
148
- print("[ERROR] No overlap between normalized TF_id and TF embedding symbols.", file=sys.stderr)
149
- print("Sample normalized TFs from final.csv:", sorted(list(norm_tf_in_final))[:30], file=sys.stderr)
150
- print("Sample available TF symbols:", sorted(list(available_tf_symbols))[:30], file=sys.stderr)
151
- sys.exit(1)
152
-
153
- dna_ids_final = set(df["dna_id"].unique())
154
- available_dna_ids = set(dna_map.keys())
155
- intersect_dna = dna_ids_final & available_dna_ids
156
- print(f"[i] Unique dna_id in final.csv: {len(dna_ids_final)}. Available DNA ids: {len(available_dna_ids)}. Intersection: {len(intersect_dna)}")
157
- if len(intersect_dna) == 0:
158
- print("[ERROR] No overlap on DNA ids.", file=sys.stderr)
159
- sys.exit(1)
160
-
161
- # Build positive pairs
162
- positives = []
163
- for _, row in df.iterrows():
164
- tf_raw = row["TF_id"]
165
- tf_symbol = tf_raw.split("_seq")[0].upper()
166
- dnaid = row["dna_id"]
167
- if tf_symbol not in tf_symbol_map:
168
- continue
169
- if dnaid not in dna_map:
170
- continue
171
- # pick the first embedding for that symbol
172
- tf_embedding_path = tf_symbol_map[tf_symbol][0]
173
- positives.append((tf_embedding_path, dna_map[dnaid], 1))
174
- print(f"[i] Constructed {len(positives)} positive pairs after TF symbol resolution")
175
-
176
- if len(positives) == 0:
177
- print("[ERROR] No positive pairs could be constructed; aborting.", file=sys.stderr)
178
- sys.exit(1)
179
-
180
- # Build negative samples
181
- all_tf_symbols = sorted(tf_symbol_map.keys())
182
- all_dnaids = sorted(dna_map.keys())
183
- positive_set = set()
184
- for tf_path, dna_path, _ in positives:
185
- tf_key = tf_key_from_path(tf_path)
186
- dna_key = dna_key_from_path(dna_path)
187
- positive_set.add((tf_key, dna_key))
188
-
189
- negatives = []
190
- half = args.neg_per_positive // 2
191
- for tf_path, dna_path, _ in positives:
192
- tf_key = tf_key_from_path(tf_path)
193
- dna_key = dna_key_from_path(dna_path)
194
- # same TF, different DNA
195
- for _ in range(half):
196
- candidate_dna = random.choice(all_dnaids)
197
- if candidate_dna == dna_key or (tf_key, candidate_dna) in positive_set:
198
- continue
199
- negatives.append((tf_path, dna_map[candidate_dna], 0))
200
- # same DNA, different TF
201
- for _ in range(half):
202
- candidate_tf_symbol = random.choice(all_tf_symbols)
203
- if candidate_tf_symbol == tf_key or (candidate_tf_symbol, dna_key) in positive_set:
204
- continue
205
- # pick its first embedding
206
- candidate_tf_path = tf_symbol_map[candidate_tf_symbol][0]
207
- negatives.append((candidate_tf_path, dna_map[dnaid], 0))
208
-
209
- print(f"[i] Sampled {len(negatives)} negatives (neg_per_positive={args.neg_per_positive})")
210
-
211
- # Write pair list
212
- pair_list_path = out_dir / "pair_list.tsv"
213
- with open(pair_list_path, "w") as f:
214
- for binder_path, glm_path, label in positives + negatives:
215
- # binder=TF, glm=DNA
216
- f.write(f"{binder_path}\t{glm_path}\t{label}\n")
217
- print(f"[i] Wrote {len(positives)+len(negatives)} examples to {pair_list_path}")
218
-
219
- if __name__ == "__main__":
220
- main()
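Each line of the pair_list.tsv written above is tab-separated: TF-embedding path, DNA-embedding path, 0/1 label. A minimal sketch of a loader for that format; the class name and any batching details are illustrative, not this repo's actual data module:

# Minimal sketch of consuming pair_list.tsv; class and field names are illustrative.
import numpy as np
import torch
from torch.utils.data import Dataset

class PairListDataset(Dataset):
    def __init__(self, tsv_path):
        self.rows = []
        with open(tsv_path) as f:
            for line in f:
                tf_path, dna_path, label = line.rstrip("\n").split("\t")
                self.rows.append((tf_path, dna_path, int(label)))

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, idx):
        tf_path, dna_path, label = self.rows[idx]
        tf_emb = torch.from_numpy(np.load(tf_path)).float()
        dna_emb = torch.from_numpy(np.load(dna_path)).float()
        return tf_emb, dna_emb, torch.tensor(label, dtype=torch.float32)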
dpacman/classifier/model/make_peak_fasta.py DELETED
@@ -1,13 +0,0 @@
1
- import pandas as pd
2
- from pathlib import Path
3
-
4
- df = pd.read_csv("/home/a03-akrishna/DPACMAN/data_files/processed/final.csv", dtype=str) # adjust path if needed
5
- # get unique sequences
6
- uniq = df[["dna_sequence"]].drop_duplicates().reset_index(drop=True)
7
- # make headers: e.g., peak0, peak1, ...
8
- out_fa = Path("binding_peaks_unique.fa")
9
- with open(out_fa, "w") as f:
10
- for i, seq in enumerate(uniq["dna_sequence"]):
11
- header = f">peak{i}"
12
- f.write(f"{header}\n{seq}\n")
13
- print(f"Wrote {len(uniq)} unique binding sequences to {out_fa}")
dpacman/classifier/model_tmp/clustering_data.py CHANGED
@@ -12,12 +12,14 @@ from collections import defaultdict
12
  # Original helpers (kept; some lightly edited/commented where needed)
13
  # ─────────────────────────────────────────────────────────────────────────
14
 
 
15
  def read_ids_file(p):
16
  p = Path(p)
17
  if not p.exists():
18
  raise FileNotFoundError(f"IDs file not found: {p}")
19
  return [line.strip() for line in p.open() if line.strip()]
20
 
 
21
  def split_embeddings(emb_path, ids_path, out_dir, prefix):
22
  out_dir = Path(out_dir)
23
  out_dir.mkdir(parents=True, exist_ok=True)
@@ -38,12 +40,17 @@ def split_embeddings(emb_path, ids_path, out_dir, prefix):
38
 
39
  ids = read_ids_file(ids_path)
40
  if len(ids) != emb.shape[0]:
41
- print(f"[WARN] length mismatch: {len(ids)} ids vs {emb.shape[0]} embeddings in {emb_path}", file=sys.stderr)
 
 
 
42
 
43
  mapping = {}
44
  for i, ident in enumerate(ids):
45
  if i >= emb.shape[0]:
46
- print(f"[WARN] skipping {ident}: no embedding at index {i}", file=sys.stderr)
 
 
47
  continue
48
  arr = emb[i]
49
  out_file = out_dir / f"{prefix}_{ident}.npy"
@@ -51,6 +58,7 @@ def split_embeddings(emb_path, ids_path, out_dir, prefix):
51
  mapping[ident] = str(out_file)
52
  return mapping
53
 
 
54
  def extract_symbol_from_tf_id(full_id: str) -> str:
55
  """
56
  Given a TF embedding ID like 'sp|O15062|ZBTB5_HUMAN' or 'ZBTB5_HUMAN',
@@ -67,6 +75,7 @@ def extract_symbol_from_tf_id(full_id: str) -> str:
67
  symbol = genepart.split("_")[0]
68
  return symbol.upper()
69
 
 
70
  def build_tf_symbol_map(tf_map):
71
  """
72
  Build mapping gene_symbol -> list of embedding paths.
@@ -77,6 +86,7 @@ def build_tf_symbol_map(tf_map):
77
  symbol_map.setdefault(symbol, []).append(path)
78
  return symbol_map
79
 
 
80
  def tf_key_from_path(path: str) -> str:
81
  """
82
  Given a path like .../tf_sp|O15062|ZBTB5_HUMAN.npy, extract normalized symbol 'ZBTB5'.
@@ -89,6 +99,7 @@ def tf_key_from_path(path: str) -> str:
89
  rest = stem
90
  return extract_symbol_from_tf_id(rest)
91
 
 
92
  def dna_key_from_path(path: str) -> str:
93
  """
94
  Given .../dna_peak42.npy -> 'peak42'
@@ -100,10 +111,12 @@ def dna_key_from_path(path: str) -> str:
100
  rest = stem
101
  return rest
102
 
 
103
  # ─────────────────────────────────────────────────────────────────────────
104
  # New helpers for MMseqs clustering & cluster-level splitting
105
  # ─────────────────────────────────────────────────────────────────────────
106
 
 
107
  def write_dna_fasta(df: pd.DataFrame, out_fasta: Path) -> None:
108
  """
109
  Write unique DNA sequences to FASTA using dna_id as header.
@@ -116,6 +129,7 @@ def write_dna_fasta(df: pd.DataFrame, out_fasta: Path) -> None:
116
  seq = str(row["dna_sequence"]).upper().replace(" ", "").replace("\n", "")
117
  f.write(f">{did}\n{seq}\n")
118
 
 
119
  def run_mmseqs_easy_cluster(
120
  mmseqs_bin: str,
121
  fasta: Path,
@@ -133,11 +147,17 @@ def run_mmseqs_easy_cluster(
133
  out_prefix.parent.mkdir(parents=True, exist_ok=True)
134
 
135
  cmd = [
136
- mmseqs_bin, "easy-cluster",
137
- str(fasta), str(out_prefix), str(tmp_dir),
138
- "--min-seq-id", str(min_seq_id),
139
- "-c", str(coverage),
140
- "--cov-mode", str(cov_mode),
 
 
 
 
 
 
141
  # You can add performance flags here if needed, e.g.:
142
  # "--threads", "8"
143
  ]
@@ -157,14 +177,24 @@ def run_mmseqs_easy_cluster(
157
  cl_db = Path(str(out_prefix) + "_cluster")
158
  out_tsv = Path(str(out_prefix) + "_fallback_cluster.tsv")
159
  if in_db.exists() and cl_db.exists():
160
- cmd2 = [mmseqs_bin, "createtsv", str(in_db), str(in_db), str(cl_db), str(out_tsv)]
 
 
 
 
 
 
 
161
  print("[i] Creating TSV via createtsv:", " ".join(cmd2), flush=True)
162
  subprocess.run(cmd2, check=True)
163
  if out_tsv.exists():
164
  return out_tsv
165
 
166
- raise FileNotFoundError("Could not locate clusters TSV from mmseqs. "
167
- "Expected {default_tsv} or createtsv fallback.")
 
 
 
168
 
169
  def parse_mmseqs_clusters(tsv_path: Path) -> dict:
170
  """
@@ -174,7 +204,7 @@ def parse_mmseqs_clusters(tsv_path: Path) -> dict:
174
  with open(tsv_path) as f:
175
  for line in f:
176
  parts = line.rstrip("\n").split("\t")
177
- if len(parts) < 2:
178
  continue
179
  rep, member = parts[0], parts[1]
180
  mapping[member] = rep
@@ -183,10 +213,10 @@ def parse_mmseqs_clusters(tsv_path: Path) -> dict:
183
  mapping[rep] = rep
184
  return mapping
185
 
186
- def assign_clusters_to_splits(cluster_rep_to_members: dict,
187
- val_frac: float,
188
- test_frac: float,
189
- seed: int = 42):
190
  """
191
  cluster_rep_to_members: dict[rep] = [members...]
192
  Returns: dict with keys 'train','val','test' mapping to sets of dna_id.
@@ -208,38 +238,63 @@ def assign_clusters_to_splits(cluster_rep_to_members: dict,
208
  c = len(members)
209
  # Fill val first, then test, then train
210
  if cur_val + c <= target_val:
211
- val_ids.update(members); cur_val += c
 
212
  elif cur_test + c <= target_test:
213
- test_ids.update(members); cur_test += c
 
214
  else:
215
  train_ids.update(members)
216
 
217
  return {"train": train_ids, "val": val_ids, "test": test_ids}
218
 
 
219
  # ─────────────────────────────────────────────────────────────────────────
220
  # Main
221
  # ─────────────────────────────────────────────────────────────────────────
222
 
 
223
  def main():
224
  parser = argparse.ArgumentParser(
225
  description="Build TF-DNA pair lists with MMseqs clustering on DNA to prevent split leakage."
226
  )
227
- parser.add_argument("--final_csv", required=True, help="final.csv with TF_id and dna_sequence")
228
- parser.add_argument("--dna_embed_npz", required=True, help="DNA embedding file (.npy or .npz)")
229
- parser.add_argument("--dna_ids", required=True, help="IDs file for DNA embeddings (peak*.ids)")
230
- parser.add_argument("--tf_embed_npy", required=True, help="TF embedding file (.npy or .npz)")
231
- parser.add_argument("--tf_ids", required=True, help="IDs file for TF embeddings (sp|... ids)")
 
 
 
 
 
 
 
 
 
 
232
  parser.add_argument("--out_dir", required=True, help="Output directory")
233
  parser.add_argument("--seed", type=int, default=42)
234
 
235
  # NEW: MMseqs options & split fractions
236
  parser.add_argument("--mmseqs_bin", default="mmseqs", help="Path to mmseqs binary")
237
- parser.add_argument("--min_seq_id", type=float, default=0.9, help="MMseqs --min-seq-id")
238
- parser.add_argument("--cov", type=float, default=0.8, help="MMseqs -c coverage fraction")
239
- parser.add_argument("--cov_mode", type=int, default=1, help="MMseqs --cov-mode (1 = coverage of target)")
 
 
 
 
 
 
 
 
 
240
  parser.add_argument("--val_frac", type=float, default=0.10)
241
  parser.add_argument("--test_frac", type=float, default=0.10)
242
- parser.add_argument("--tmp_dir", default=None, help="MMseqs tmp dir (defaults to out_dir/tmp)")
 
 
243
  args = parser.parse_args()
244
 
245
  random.seed(args.seed)
@@ -260,12 +315,24 @@ def main():
260
  print(f"[i] Wrote augmented final.csv with dna_id to {enriched_csv}")
261
 
262
  # Split embeddings into per-item files (unchanged)
263
- print(f"[i] Splitting DNA embeddings from {args.dna_embed_npz} with ids {args.dna_ids}")
264
- dna_map = split_embeddings(args.dna_embed_npz, args.dna_ids, out_dir / "dna_single", "dna")
265
- print(f"[i] DNA embeddings available: {len(dna_map)} (sample: {list(dna_map.keys())[:10]})")
266
- print(f"[i] Splitting TF embeddings from {args.tf_embed_npy} with ids {args.tf_ids}")
267
- tf_map = split_embeddings(args.tf_embed_npy, args.tf_ids, out_dir / "tf_single", "tf")
268
- print(f"[i] TF embeddings available: {len(tf_map)} (sample: {list(tf_map.keys())[:10]})")
 
 
 
 
 
 
 
 
 
 
 
 
269
 
270
  # Build gene-symbol normalized map
271
  tf_symbol_map = build_tf_symbol_map(tf_map)
@@ -279,15 +346,28 @@ def main():
279
  print(f"[i] Available TF embedding symbols: {len(available_tf_symbols)}")
280
  print(f"[i] Intersection count: {len(intersect_tf)}")
281
  if len(intersect_tf) == 0:
282
- print("[ERROR] No overlap between normalized TF_id and TF embedding symbols.", file=sys.stderr)
283
- print("Sample normalized TFs from final.csv:", sorted(list(norm_tf_in_final))[:30], file=sys.stderr)
284
- print("Sample available TF symbols:", sorted(list(available_tf_symbols))[:30], file=sys.stderr)
 
 
 
 
 
 
 
 
 
 
 
285
  sys.exit(1)
286
 
287
  dna_ids_final = set(df["dna_id"].unique())
288
  available_dna_ids = set(dna_map.keys())
289
  intersect_dna = dna_ids_final & available_dna_ids
290
- print(f"[i] Unique dna_id in final.csv: {len(dna_ids_final)}. Available DNA ids: {len(available_dna_ids)}. Intersection: {len(intersect_dna)}")
 
 
291
  if len(intersect_dna) == 0:
292
  print("[ERROR] No overlap on DNA ids.", file=sys.stderr)
293
  sys.exit(1)
@@ -295,7 +375,9 @@ def main():
295
  # ── NEW: MMseqs clustering on DNA sequences ───────────────────────────
296
  fasta_path = out_dir / "dna_unique.fasta"
297
  write_dna_fasta(df, fasta_path)
298
- print(f"[i] Wrote FASTA with {df['dna_id'].nunique()} unique sequences → {fasta_path}")
 
 
299
 
300
  tmp_dir = Path(args.tmp_dir) if args.tmp_dir else (out_dir / "mmseqs_tmp")
301
  cluster_prefix = out_dir / "mmseqs_dna_clusters"
@@ -310,7 +392,7 @@ def main():
310
  )
311
 
312
  # Parse clusters
313
- member_to_rep = parse_mmseqs_clusters(clusters_tsv) # dna_id -> rep_id
314
  # Build rep -> members list
315
  rep_to_members = defaultdict(list)
316
  for member, rep in member_to_rep.items():
@@ -331,10 +413,9 @@ def main():
331
  print(f"[i] Wrote {out_dir / 'final_with_dna_id_and_cluster.csv'}")
332
 
333
  # Assign entire clusters to splits
334
- splits = assign_clusters_to_splits(rep_to_members,
335
- val_frac=args.val_frac,
336
- test_frac=args.test_frac,
337
- seed=args.seed)
338
  for k in ["train", "val", "test"]:
339
  print(f"[i] {k}: {len(splits[k])} dna_ids")
340
 
@@ -354,14 +435,22 @@ def main():
354
 
355
  # decide split by dna_id cluster assignment
356
  if dnaid in splits["train"]:
357
- positives_by_split["train"].append((tf_embedding_path, dnaid_to_path[dnaid], 1))
 
 
358
  elif dnaid in splits["val"]:
359
- positives_by_split["val"].append((tf_embedding_path, dnaid_to_path[dnaid], 1))
 
 
360
  elif dnaid in splits["test"]:
361
- positives_by_split["test"].append((tf_embedding_path, dnaid_to_path[dnaid], 1))
 
 
362
  pos_count += 1
363
 
364
- print(f"[i] Constructed positives across splits (rows in final.csv iterated: {len(df)})")
 
 
365
  for k in ["train", "val", "test"]:
366
  print(f"[i] positives[{k}] = {len(positives_by_split[k])}")
367
 
@@ -373,11 +462,14 @@ def main():
373
  for split in ["train", "val", "test"]:
374
  out_tsv = out_dir / f"pair_list_{split}.tsv"
375
  with open(out_tsv, "w") as f:
376
- for binder_path, glm_path, label in positives_by_split[split]: # + negatives if you add later
 
 
377
  f.write(f"{binder_path}\t{glm_path}\t{label}\n")
378
  print(f"[i] Wrote {len(positives_by_split[split])} examples to {out_tsv}")
379
 
380
  print("✅ Done. Cluster-aware splits ready.")
381
 
 
382
  if __name__ == "__main__":
383
  main()
 
12
  # Original helpers (kept; some lightly edited/commented where needed)
13
  # ─────────────────────────────────────────────────────────────────────────
14
 
15
+
16
  def read_ids_file(p):
17
  p = Path(p)
18
  if not p.exists():
19
  raise FileNotFoundError(f"IDs file not found: {p}")
20
  return [line.strip() for line in p.open() if line.strip()]
21
 
22
+
23
  def split_embeddings(emb_path, ids_path, out_dir, prefix):
24
  out_dir = Path(out_dir)
25
  out_dir.mkdir(parents=True, exist_ok=True)
 
40
 
41
  ids = read_ids_file(ids_path)
42
  if len(ids) != emb.shape[0]:
43
+ print(
44
+ f"[WARN] length mismatch: {len(ids)} ids vs {emb.shape[0]} embeddings in {emb_path}",
45
+ file=sys.stderr,
46
+ )
47
 
48
  mapping = {}
49
  for i, ident in enumerate(ids):
50
  if i >= emb.shape[0]:
51
+ print(
52
+ f"[WARN] skipping {ident}: no embedding at index {i}", file=sys.stderr
53
+ )
54
  continue
55
  arr = emb[i]
56
  out_file = out_dir / f"{prefix}_{ident}.npy"
 
58
  mapping[ident] = str(out_file)
59
  return mapping
60
 
61
+
62
  def extract_symbol_from_tf_id(full_id: str) -> str:
63
  """
64
  Given a TF embedding ID like 'sp|O15062|ZBTB5_HUMAN' or 'ZBTB5_HUMAN',
 
75
  symbol = genepart.split("_")[0]
76
  return symbol.upper()
77
 
78
+
79
  def build_tf_symbol_map(tf_map):
80
  """
81
  Build mapping gene_symbol -> list of embedding paths.
 
86
  symbol_map.setdefault(symbol, []).append(path)
87
  return symbol_map
88
 
89
+
90
  def tf_key_from_path(path: str) -> str:
91
  """
92
  Given a path like .../tf_sp|O15062|ZBTB5_HUMAN.npy, extract normalized symbol 'ZBTB5'.
 
99
  rest = stem
100
  return extract_symbol_from_tf_id(rest)
101
 
102
+
103
  def dna_key_from_path(path: str) -> str:
104
  """
105
  Given .../dna_peak42.npy -> 'peak42'
 
111
  rest = stem
112
  return rest
113
 
114
+
115
  # ─────────────────────────────────────────────────────────────────────────
116
  # New helpers for MMseqs clustering & cluster-level splitting
117
  # ─────────────────────────────────────────────────────────────────────────
118
 
119
+
120
  def write_dna_fasta(df: pd.DataFrame, out_fasta: Path) -> None:
121
  """
122
  Write unique DNA sequences to FASTA using dna_id as header.
 
129
  seq = str(row["dna_sequence"]).upper().replace(" ", "").replace("\n", "")
130
  f.write(f">{did}\n{seq}\n")
131
 
132
+
133
  def run_mmseqs_easy_cluster(
134
  mmseqs_bin: str,
135
  fasta: Path,
 
147
  out_prefix.parent.mkdir(parents=True, exist_ok=True)
148
 
149
  cmd = [
150
+ mmseqs_bin,
151
+ "easy-cluster",
152
+ str(fasta),
153
+ str(out_prefix),
154
+ str(tmp_dir),
155
+ "--min-seq-id",
156
+ str(min_seq_id),
157
+ "-c",
158
+ str(coverage),
159
+ "--cov-mode",
160
+ str(cov_mode),
161
  # You can add performance flags here if needed, e.g.:
162
  # "--threads", "8"
163
  ]
 
177
  cl_db = Path(str(out_prefix) + "_cluster")
178
  out_tsv = Path(str(out_prefix) + "_fallback_cluster.tsv")
179
  if in_db.exists() and cl_db.exists():
180
+ cmd2 = [
181
+ mmseqs_bin,
182
+ "createtsv",
183
+ str(in_db),
184
+ str(in_db),
185
+ str(cl_db),
186
+ str(out_tsv),
187
+ ]
188
  print("[i] Creating TSV via createtsv:", " ".join(cmd2), flush=True)
189
  subprocess.run(cmd2, check=True)
190
  if out_tsv.exists():
191
  return out_tsv
192
 
193
+ raise FileNotFoundError(
194
+ "Could not locate clusters TSV from mmseqs. "
195
+ "Expected {default_tsv} or createtsv fallback."
196
+ )
197
+
198
 
199
  def parse_mmseqs_clusters(tsv_path: Path) -> dict:
200
  """
 
204
  with open(tsv_path) as f:
205
  for line in f:
206
  parts = line.rstrip("\n").split("\t")
207
+ if len(parts) < 2:
208
  continue
209
  rep, member = parts[0], parts[1]
210
  mapping[member] = rep
 
213
  mapping[rep] = rep
214
  return mapping
215
 
216
+
217
+ def assign_clusters_to_splits(
218
+ cluster_rep_to_members: dict, val_frac: float, test_frac: float, seed: int = 42
219
+ ):
220
  """
221
  cluster_rep_to_members: dict[rep] = [members...]
222
  Returns: dict with keys 'train','val','test' mapping to sets of dna_id.
 
238
  c = len(members)
239
  # Fill val first, then test, then train
240
  if cur_val + c <= target_val:
241
+ val_ids.update(members)
242
+ cur_val += c
243
  elif cur_test + c <= target_test:
244
+ test_ids.update(members)
245
+ cur_test += c
246
  else:
247
  train_ids.update(members)
248
 
249
  return {"train": train_ids, "val": val_ids, "test": test_ids}
250
 
251
+
252
  # ─────────────────────────────────────────────────────────────────────────
253
  # Main
254
  # ─────────────────────────────────────────────────────────────────────────
255
 
256
+
257
  def main():
258
  parser = argparse.ArgumentParser(
259
  description="Build TF-DNA pair lists with MMseqs clustering on DNA to prevent split leakage."
260
  )
261
+ parser.add_argument(
262
+ "--final_csv", required=True, help="final.csv with TF_id and dna_sequence"
263
+ )
264
+ parser.add_argument(
265
+ "--dna_embed_npz", required=True, help="DNA embedding file (.npy or .npz)"
266
+ )
267
+ parser.add_argument(
268
+ "--dna_ids", required=True, help="IDs file for DNA embeddings (peak*.ids)"
269
+ )
270
+ parser.add_argument(
271
+ "--tf_embed_npy", required=True, help="TF embedding file (.npy or .npz)"
272
+ )
273
+ parser.add_argument(
274
+ "--tf_ids", required=True, help="IDs file for TF embeddings (sp|... ids)"
275
+ )
276
  parser.add_argument("--out_dir", required=True, help="Output directory")
277
  parser.add_argument("--seed", type=int, default=42)
278
 
279
  # NEW: MMseqs options & split fractions
280
  parser.add_argument("--mmseqs_bin", default="mmseqs", help="Path to mmseqs binary")
281
+ parser.add_argument(
282
+ "--min_seq_id", type=float, default=0.9, help="MMseqs --min-seq-id"
283
+ )
284
+ parser.add_argument(
285
+ "--cov", type=float, default=0.8, help="MMseqs -c coverage fraction"
286
+ )
287
+ parser.add_argument(
288
+ "--cov_mode",
289
+ type=int,
290
+ default=1,
291
+ help="MMseqs --cov-mode (1 = coverage of target)",
292
+ )
293
  parser.add_argument("--val_frac", type=float, default=0.10)
294
  parser.add_argument("--test_frac", type=float, default=0.10)
295
+ parser.add_argument(
296
+ "--tmp_dir", default=None, help="MMseqs tmp dir (defaults to out_dir/tmp)"
297
+ )
298
  args = parser.parse_args()
299
 
300
  random.seed(args.seed)
 
315
  print(f"[i] Wrote augmented final.csv with dna_id to {enriched_csv}")
316
 
317
  # Split embeddings into per-item files (unchanged)
318
+ print(
319
+ f"[i] Splitting DNA embeddings from {args.dna_embed_npz} with ids {args.dna_ids}"
320
+ )
321
+ dna_map = split_embeddings(
322
+ args.dna_embed_npz, args.dna_ids, out_dir / "dna_single", "dna"
323
+ )
324
+ print(
325
+ f"[i] DNA embeddings available: {len(dna_map)} (sample: {list(dna_map.keys())[:10]})"
326
+ )
327
+ print(
328
+ f"[i] Splitting TF embeddings from {args.tf_embed_npy} with ids {args.tf_ids}"
329
+ )
330
+ tf_map = split_embeddings(
331
+ args.tf_embed_npy, args.tf_ids, out_dir / "tf_single", "tf"
332
+ )
333
+ print(
334
+ f"[i] TF embeddings available: {len(tf_map)} (sample: {list(tf_map.keys())[:10]})"
335
+ )
336
 
337
  # Build gene-symbol normalized map
338
  tf_symbol_map = build_tf_symbol_map(tf_map)
 
346
  print(f"[i] Available TF embedding symbols: {len(available_tf_symbols)}")
347
  print(f"[i] Intersection count: {len(intersect_tf)}")
348
  if len(intersect_tf) == 0:
349
+ print(
350
+ "[ERROR] No overlap between normalized TF_id and TF embedding symbols.",
351
+ file=sys.stderr,
352
+ )
353
+ print(
354
+ "Sample normalized TFs from final.csv:",
355
+ sorted(list(norm_tf_in_final))[:30],
356
+ file=sys.stderr,
357
+ )
358
+ print(
359
+ "Sample available TF symbols:",
360
+ sorted(list(available_tf_symbols))[:30],
361
+ file=sys.stderr,
362
+ )
363
  sys.exit(1)
364
 
365
  dna_ids_final = set(df["dna_id"].unique())
366
  available_dna_ids = set(dna_map.keys())
367
  intersect_dna = dna_ids_final & available_dna_ids
368
+ print(
369
+ f"[i] Unique dna_id in final.csv: {len(dna_ids_final)}. Available DNA ids: {len(available_dna_ids)}. Intersection: {len(intersect_dna)}"
370
+ )
371
  if len(intersect_dna) == 0:
372
  print("[ERROR] No overlap on DNA ids.", file=sys.stderr)
373
  sys.exit(1)
 
375
  # ── NEW: MMseqs clustering on DNA sequences ───────────────────────────
376
  fasta_path = out_dir / "dna_unique.fasta"
377
  write_dna_fasta(df, fasta_path)
378
+ print(
379
+ f"[i] Wrote FASTA with {df['dna_id'].nunique()} unique sequences → {fasta_path}"
380
+ )
381
 
382
  tmp_dir = Path(args.tmp_dir) if args.tmp_dir else (out_dir / "mmseqs_tmp")
383
  cluster_prefix = out_dir / "mmseqs_dna_clusters"
 
392
  )
393
 
394
  # Parse clusters
395
+ member_to_rep = parse_mmseqs_clusters(clusters_tsv) # dna_id -> rep_id
396
  # Build rep -> members list
397
  rep_to_members = defaultdict(list)
398
  for member, rep in member_to_rep.items():
 
413
  print(f"[i] Wrote {out_dir / 'final_with_dna_id_and_cluster.csv'}")
414
 
415
  # Assign entire clusters to splits
416
+ splits = assign_clusters_to_splits(
417
+ rep_to_members, val_frac=args.val_frac, test_frac=args.test_frac, seed=args.seed
418
+ )
 
419
  for k in ["train", "val", "test"]:
420
  print(f"[i] {k}: {len(splits[k])} dna_ids")
421
 
 
435
 
436
  # decide split by dna_id cluster assignment
437
  if dnaid in splits["train"]:
438
+ positives_by_split["train"].append(
439
+ (tf_embedding_path, dnaid_to_path[dnaid], 1)
440
+ )
441
  elif dnaid in splits["val"]:
442
+ positives_by_split["val"].append(
443
+ (tf_embedding_path, dnaid_to_path[dnaid], 1)
444
+ )
445
  elif dnaid in splits["test"]:
446
+ positives_by_split["test"].append(
447
+ (tf_embedding_path, dnaid_to_path[dnaid], 1)
448
+ )
449
  pos_count += 1
450
 
451
+ print(
452
+ f"[i] Constructed positives across splits (rows in final.csv iterated: {len(df)})"
453
+ )
454
  for k in ["train", "val", "test"]:
455
  print(f"[i] positives[{k}] = {len(positives_by_split[k])}")
456
 
 
462
  for split in ["train", "val", "test"]:
463
  out_tsv = out_dir / f"pair_list_{split}.tsv"
464
  with open(out_tsv, "w") as f:
465
+ for binder_path, glm_path, label in positives_by_split[
466
+ split
467
+ ]: # + negatives if you add later
468
  f.write(f"{binder_path}\t{glm_path}\t{label}\n")
469
  print(f"[i] Wrote {len(positives_by_split[split])} examples to {out_tsv}")
470
 
471
  print("✅ Done. Cluster-aware splits ready.")
472
 
473
+
474
  if __name__ == "__main__":
475
  main()
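mmseqs easy-cluster reports clusters as a two-column representative / member TSV (tab-separated), which parse_mmseqs_clusters() inverts into a member → representative map; whole clusters are then assigned to a single split so near-identical peaks cannot leak across train/val/test. A toy illustration of that flow with invented peak ids:

# Toy illustration; the cluster TSV content here is invented.
from collections import defaultdict

toy_tsv = "peak0\tpeak0\npeak0\tpeak7\npeak3\tpeak3\n"   # rep<tab>member lines
member_to_rep = {}
for line in toy_tsv.splitlines():
    rep, member = line.split("\t")
    member_to_rep[member] = rep

rep_to_members = defaultdict(list)
for member, rep in member_to_rep.items():
    rep_to_members[rep].append(member)

# peak0 and peak7 share a cluster, so they always land in the same split
print(dict(rep_to_members))   # {'peak0': ['peak0', 'peak7'], 'peak3': ['peak3']}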
dpacman/classifier/model_tmp/compress_embeddings.py CHANGED
@@ -7,6 +7,7 @@ import numpy as np
7
  import torch
8
  from torch import nn
9
 
 
10
  class EmbeddingCompressor(nn.Module):
11
  def __init__(self, input_dim: int = 1280, output_dim: int = 256):
12
  super().__init__()
@@ -19,26 +20,33 @@ class EmbeddingCompressor(nn.Module):
19
  """
20
  if x.dim() == 2:
21
  # single example: mean over tokens
22
- x = x.mean(dim=0, keepdim=True) # → (1, input_dim)
23
  else:
24
  # batch: mean over tokens
25
- x = x.mean(dim=1) # → (batch, input_dim)
26
- return self.fc(x) # → (batch, output_dim)
 
27
 
28
  def compress_file(in_path: str, out_path: str, model: EmbeddingCompressor):
29
- arr = np.load(in_path) # shape (L, D) or (batch, L, D)
30
  tensor = torch.from_numpy(arr).float()
31
  with torch.no_grad():
32
- compressed = model(tensor) # → (batch, 256)
33
  out = compressed.cpu().numpy()
34
  np.save(out_path, out)
35
  print(f"Saved {out_path}")
36
 
 
37
  if __name__ == "__main__":
38
  import argparse
 
39
  parser = argparse.ArgumentParser(description="Compress ESM embeddings to 256­d")
40
- parser.add_argument("--input_glob", type=str, required=True,
41
- help="Glob for your .npy ESM embeddings (e.g. data/esm_*.npy)")
 
 
 
 
42
  parser.add_argument("--output_dir", type=str, required=True)
43
  parser.add_argument("--esm_dim", type=int, default=1280)
44
  parser.add_argument("--out_dim", type=int, default=256)
 
7
  import torch
8
  from torch import nn
9
 
10
+
11
  class EmbeddingCompressor(nn.Module):
12
  def __init__(self, input_dim: int = 1280, output_dim: int = 256):
13
  super().__init__()
 
20
  """
21
  if x.dim() == 2:
22
  # single example: mean over tokens
23
+ x = x.mean(dim=0, keepdim=True) # → (1, input_dim)
24
  else:
25
  # batch: mean over tokens
26
+ x = x.mean(dim=1) # → (batch, input_dim)
27
+ return self.fc(x) # → (batch, output_dim)
28
+
29
 
30
  def compress_file(in_path: str, out_path: str, model: EmbeddingCompressor):
31
+ arr = np.load(in_path) # shape (L, D) or (batch, L, D)
32
  tensor = torch.from_numpy(arr).float()
33
  with torch.no_grad():
34
+ compressed = model(tensor) # → (batch, 256)
35
  out = compressed.cpu().numpy()
36
  np.save(out_path, out)
37
  print(f"Saved {out_path}")
38
 
39
+
40
  if __name__ == "__main__":
41
  import argparse
42
+
43
  parser = argparse.ArgumentParser(description="Compress ESM embeddings to 256­d")
44
+ parser.add_argument(
45
+ "--input_glob",
46
+ type=str,
47
+ required=True,
48
+ help="Glob for your .npy ESM embeddings (e.g. data/esm_*.npy)",
49
+ )
50
  parser.add_argument("--output_dir", type=str, required=True)
51
  parser.add_argument("--esm_dim", type=int, default=1280)
52
  parser.add_argument("--out_dim", type=int, default=256)
dpacman/classifier/model_tmp/compute_embeddings.py CHANGED
@@ -14,6 +14,7 @@ Usage example (DNA + protein in one go):
   --out-dir ../data_files/processed/tfclust/hg38_tf/embeddings \
   --device cuda
 """
+
 import os
 import re
 import argparse
@@ -28,6 +29,7 @@ import time
 
 # ---- model wrappers ----
 
+
 class CaduceusEmbedder:
     def __init__(self, device, chunk_size=131_072, overlap=0):
         """
@@ -39,12 +41,14 @@ class CaduceusEmbedder:
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_name, trust_remote_code=True
         )
-        self.model = AutoModel.from_pretrained(
-            model_name, trust_remote_code=True
-        ).to(device).eval()
-        self.device = device
+        self.model = (
+            AutoModel.from_pretrained(model_name, trust_remote_code=True)
+            .to(device)
+            .eval()
+        )
+        self.device = device
         self.chunk_size = chunk_size
-        self.step = chunk_size - overlap
+        self.step = chunk_size - overlap
 
     def embed(self, seqs):
         """
@@ -73,14 +77,13 @@ class CaduceusEmbedder:
                 return_tensors="pt",
                 padding=False,
                 truncation=True,
-                max_length=self.chunk_size
+                max_length=self.chunk_size,
             ).to(self.device)
             with torch.no_grad():
                 out = self.model(**toks).last_hidden_state  # (1, L, D)
-                outputs.append(out.cpu().numpy()[0])  # (L, D)
+                outputs.append(out.cpu().numpy()[0])  # (L, D)
         return outputs  # list of variable-length (L_i, D) arrays
 
-
     def benchmark(self, lengths=None):
         """
         Time embedding on single-sequence of various lengths.
@@ -101,10 +104,17 @@ class CaduceusEmbedder:
             t1 = time.perf_counter()
             print(f" length={sz:6,d} time={(t1-t0)*1000:7.1f} ms")
 
+
 class SegmentNTEmbedder:
     def __init__(self, device):
-        self.tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/segment_nt", trust_remote_code=True)
-        self.model = AutoModel.from_pretrained("InstaDeepAI/segment_nt", trust_remote_code=True).to(device).eval()
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            "InstaDeepAI/segment_nt", trust_remote_code=True
+        )
+        self.model = (
+            AutoModel.from_pretrained("InstaDeepAI/segment_nt", trust_remote_code=True)
+            .to(device)
+            .eval()
+        )
         self.device = device
 
     def _adjust_length(self, input_ids):
@@ -113,7 +123,12 @@ class SegmentNTEmbedder:
         remainder = (excl) % 4
         if remainder != 0:
             pad_needed = 4 - remainder
-            pad_tensor = torch.full((bs, pad_needed), self.tokenizer.pad_token_id, dtype=input_ids.dtype, device=input_ids.device)
+            pad_tensor = torch.full(
+                (bs, pad_needed),
+                self.tokenizer.pad_token_id,
+                dtype=input_ids.dtype,
+                device=input_ids.device,
+            )
             input_ids = torch.cat([input_ids, pad_tensor], dim=1)
         return input_ids
 
@@ -135,7 +150,7 @@ class SegmentNTEmbedder:
         attention_mask = input_ids != self.tokenizer.pad_token_id
 
         input_ids = self._adjust_length(input_ids)
-        attention_mask = (input_ids != self.tokenizer.pad_token_id)
+        attention_mask = input_ids != self.tokenizer.pad_token_id
 
         with torch.no_grad():
             outs = self.model(
@@ -161,19 +176,26 @@ class SegmentNTEmbedder:
 
 class DNABertEmbedder:
     def __init__(self, device):
-        self.tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
-        self.model = AutoModel.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True).to(device)
-        self.device = device
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            "zhihan1996/DNA_bert_6", trust_remote_code=True
+        )
+        self.model = AutoModel.from_pretrained(
+            "zhihan1996/DNA_bert_6", trust_remote_code=True
+        ).to(device)
+        self.device = device
 
     def embed(self, seqs):
         embs = []
         for s in seqs:
-            tokens = self.tokenizer(s, return_tensors="pt", padding=True)["input_ids"].to(self.device)
+            tokens = self.tokenizer(s, return_tensors="pt", padding=True)[
+                "input_ids"
+            ].to(self.device)
             with torch.no_grad():
                 out = self.model(tokens).last_hidden_state.mean(1)
             embs.append(out.cpu().numpy())
         return np.vstack(embs)
 
+
 class NucleotideTransformerEmbedder:
     def __init__(self, device):
         # HF “feature-extraction” returns a list of (L, D) arrays for each input
@@ -181,7 +203,9 @@ class NucleotideTransformerEmbedder:
         self.pipe = pipeline(
             "feature-extraction",
             model="InstaDeepAI/nucleotide-transformer-500m-1000g",
-            device= -1 if device=="cpu" else 0  # HF uses -1 for CPU, 0 for GPU #:contentReference[oaicite:0]{index=0}
+            device=(
+                -1 if device == "cpu" else 0
+            ),  # HF uses -1 for CPU, 0 for GPU #:contentReference[oaicite:0]{index=0}
         )
 
     def embed(self, seqs):
@@ -191,8 +215,9 @@ class NucleotideTransformerEmbedder:
         """
         all_embeddings = self.pipe(seqs, truncation=True, padding=True)
         # all_embeddings is a List of shape (L, D) arrays
-        pooled = [ np.mean(x, axis=0) for x in all_embeddings ]
-        return np.vstack(pooled)
+        pooled = [np.mean(x, axis=0) for x in all_embeddings]
+        return np.vstack(pooled)
+
 
 # class ESMEmbedder:
 #     def __init__(self, device):
@@ -225,7 +250,9 @@ class ESMEmbedder:
         self.batch_converter = self.alphabet.get_batch_converter()
         self.model.to(device).eval()
         # determine max length: esm2 models vary; use default 1024 for esm1b
-        self.max_len = 4096 if self.is_esm2 else 1024  # adjust if your esm2 variant has explicit limit
+        self.max_len = (
+            4096 if self.is_esm2 else 1024
+        )  # adjust if your esm2 variant has explicit limit
         # for chunking: reserve 2 tokens if model uses BOS/EOS
         self.chunk_size = self.max_len - 2
         self.overlap = self.chunk_size // 4  # 25% overlap to smooth boundaries
@@ -280,7 +307,7 @@ class ESMEmbedder:
 
 # class ESMDBPEmbedder:
 #     def __init__(self, device):
-#         base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
+#         base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
 #         model_path = (
 #             Path(__file__).resolve().parent.parent
 #             / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
@@ -310,12 +337,15 @@ class ESMEmbedder:
 #         # skip start/end tokens
 #         return reps[:, 1:-1].mean(1).cpu().numpy()
 
+
 class ESMDBPEmbedder:
     def __init__(self, device):
         base_model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
         model_path = (
             Path(__file__).resolve().parent.parent
-            / "pretrained" / "ESM-DBP" / "ESM-DBP.model"
+            / "pretrained"
+            / "ESM-DBP"
+            / "ESM-DBP.model"
         )
         checkpoint = torch.load(model_path, map_location="cpu")
         clean_sd = {}
@@ -372,6 +402,7 @@ class ESMDBPEmbedder:
             all_embeddings.append(seq_vec.cpu().numpy())
         return np.vstack(all_embeddings)
 
+
 class GPNEmbedder:
     def __init__(self, device):
         model_name = "songlab/gpn-msa-sapiens"
@@ -383,16 +414,14 @@ class GPNEmbedder:
 
     def embed(self, seqs):
         inputs = self.tokenizer(
-            seqs,
-            return_tensors="pt",
-            padding=True,
-            truncation=True
+            seqs, return_tensors="pt", padding=True, truncation=True
         ).to(self.device)
 
         with torch.no_grad():
             last_hidden = self.model(**inputs).last_hidden_state
         return last_hidden.mean(dim=1).cpu().numpy()
 
+
 class ProGenEmbedder:
     def __init__(self, device):
         model_name = "jinyuan22/ProGen2-base"
@@ -402,29 +431,36 @@ class ProGenEmbedder:
 
     def embed(self, seqs):
         inputs = self.tokenizer(
-            seqs,
-            return_tensors="pt",
-            padding=True,
-            truncation=True
+            seqs, return_tensors="pt", padding=True, truncation=True
         ).to(self.device)
         with torch.no_grad():
            last_hidden = self.model(**inputs).last_hidden_state
         return last_hidden.mean(dim=1).cpu().numpy()
 
+
 # ---- main pipeline ----
 
+
 def get_embedder(name, device, for_dna=True):
     name = name.lower()
     if for_dna:
-        if name=="caduceus": return CaduceusEmbedder(device)
-        if name=="dnabert": return DNABertEmbedder(device)
-        if name=="nucleotide": return NucleotideTransformerEmbedder(device)
-        if name=="gpn": return GPNEmbedder(device)
-        if name=="segmentnt": return SegmentNTEmbedder(device)
+        if name == "caduceus":
+            return CaduceusEmbedder(device)
+        if name == "dnabert":
+            return DNABertEmbedder(device)
+        if name == "nucleotide":
+            return NucleotideTransformerEmbedder(device)
+        if name == "gpn":
+            return GPNEmbedder(device)
+        if name == "segmentnt":
+            return SegmentNTEmbedder(device)
     else:
-        if name in ("esm",): return ESMEmbedder(device)
-        if name in ("esm-dbp","esm_dbp"): return ESMDBPEmbedder(device)
-        if name=="progen": return ProGenEmbedder(device)
+        if name in ("esm",):
+            return ESMEmbedder(device)
+        if name in ("esm-dbp", "esm_dbp"):
+            return ESMDBPEmbedder(device)
+        if name == "progen":
+            return ProGenEmbedder(device)
     raise ValueError(f"Unknown model {name} (for_dna={for_dna})")
 
 
@@ -446,20 +482,28 @@ def pad_token_embeddings(list_of_arrays, pad_value=0.0):
         mask[i, :L] = True
     return padded, mask
 
+
 def embed_and_save(seqs, ids, embedder, out_path):
     embs = embedder.embed(seqs)
 
     # Decide whether we got variable-length per-token outputs (list of (L, D))
-    is_variable_token = isinstance(embs, (list, tuple)) and len(embs) > 0 and hasattr(embs[0], "shape") and embs[0].ndim == 2
+    is_variable_token = (
+        isinstance(embs, (list, tuple))
+        and len(embs) > 0
+        and hasattr(embs[0], "shape")
+        and embs[0].ndim == 2
+    )
 
     if is_variable_token:
         # pad to (N, L_max, D) + mask
         padded, mask = pad_token_embeddings(embs)
         # Save both embeddings and mask together in an .npz for convenience
-        np.savez_compressed(out_path.with_suffix(".caduceus.npz"),
-                            embeddings=padded,
-                            mask=mask,
-                            ids=np.array(ids, dtype=object))
+        np.savez_compressed(
+            out_path.with_suffix(".caduceus.npz"),
+            embeddings=padded,
+            mask=mask,
+            ids=np.array(ids, dtype=object),
+        )
     else:
         # fixed shape output, e.g., pooled (N, D)
         array = np.vstack(embs) if isinstance(embs, list) else embs
@@ -468,17 +512,31 @@ def embed_and_save(seqs, ids, embedder, out_path):
         f.write("\n".join(ids))
 
 
-if __name__=="__main__":
+if __name__ == "__main__":
 
     p = argparse.ArgumentParser()
-    p.add_argument("--peak-fasta", default="binding_peaks_unique.fa", help="FASTA of deduplicated binding peak sequences; if present this is used for DNA embedding instead of genome JSONs")
-    p.add_argument("--genome-json-dir", default=None, help="(fallback) directory of UCSC JSONs for full chromosome embedding if peak FASTA is missing or you explicitly want chromosomes")
-    p.add_argument("--skip-dna", action="store_true", help="if set, skip the chromosome embedding step")  #if glm embeddings successful but not plm embeddings
-    p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
-    p.add_argument("--chrom-model", default="caduceus")
-    p.add_argument("--tf-model", default="esm-dbp")
-    p.add_argument("--out-dir", default="data_files/processed/tfclust/hg38_tf/embeddings")
-    p.add_argument("--device", default="cpu")
+    p.add_argument(
+        "--peak-fasta",
+        default="binding_peaks_unique.fa",
+        help="FASTA of deduplicated binding peak sequences; if present this is used for DNA embedding instead of genome JSONs",
+    )
+    p.add_argument(
+        "--genome-json-dir",
+        default=None,
+        help="(fallback) directory of UCSC JSONs for full chromosome embedding if peak FASTA is missing or you explicitly want chromosomes",
+    )
+    p.add_argument(
+        "--skip-dna",
+        action="store_true",
+        help="if set, skip the chromosome embedding step",
+    )  # if glm embeddings successful but not plm embeddings
+    p.add_argument("--tf-fasta", required=True, help="input TF FASTA file")
+    p.add_argument("--chrom-model", default="caduceus")
+    p.add_argument("--tf-model", default="esm-dbp")
+    p.add_argument(
+        "--out-dir", default="data_files/processed/tfclust/hg38_tf/embeddings"
+    )
+    p.add_argument("--device", default="cpu")
     args = p.parse_args()
 
     os.makedirs(args.out_dir, exist_ok=True)
@@ -495,7 +553,10 @@ if __name__=="__main__":
         for rec in SeqIO.parse(peak_fasta, "fasta"):
            peak_ids.append(rec.id)
            peak_seqs.append(str(rec.seq))
-        print(f"Embedding {len(peak_seqs)} binding peak sequences from {peak_fasta}", flush=True)
+        print(
+            f"Embedding {len(peak_seqs)} binding peak sequences from {peak_fasta}",
+            flush=True,
+        )
         dna_embedder = get_embedder(args.chrom_model, device, for_dna=True)
         out_peaks = Path(args.out_dir) / f"peaks_{args.chrom_model}.npy"
         embed_and_save(peak_seqs, peak_ids, dna_embedder, out_peaks)
@@ -503,7 +564,9 @@ if __name__=="__main__":
         # Legacy: load full chromosomes from JSONs (chr1–22, X, Y, M)
         genome_dir = Path(args.genome_json_dir)
         chrom_seqs, chrom_ids = [], []
-        primary_pattern = re.compile(r"^hg38_chr(?:[1-9]|1[0-9]|2[0-2]|X|Y|M)\.json$")
+        primary_pattern = re.compile(
+            r"^hg38_chr(?:[1-9]|1[0-9]|2[0-2]|X|Y|M)\.json$"
+        )
         for j in sorted(genome_dir.iterdir()):
             if not primary_pattern.match(j.name):
                 continue
@@ -519,7 +582,9 @@ if __name__=="__main__":
             if len(seq) > cutoff
         ]
         if long_chroms:
-            print("⚠️ Chromosomes exceeding Caduceus max tokens ({}):".format(cutoff))
+            print(
+                "⚠️ Chromosomes exceeding Caduceus max tokens ({}):".format(cutoff)
+            )
             for chrom, L in long_chroms:
                 print(f" {chrom}: {L} bases")
         else:
@@ -529,10 +594,11 @@ if __name__=="__main__":
         out_chrom = Path(args.out_dir) / f"chrom_{args.chrom_model}.npy"
         embed_and_save(chrom_seqs, chrom_ids, chrom_embedder, out_chrom)
     else:
-        raise ValueError("No input for DNA embedding: provide a peak FASTA (default binding_peaks_unique.fa) or set --genome-json-dir for chromosome JSONs.")
-
+        raise ValueError(
+            "No input for DNA embedding: provide a peak FASTA (default binding_peaks_unique.fa) or set --genome-json-dir for chromosome JSONs."
+        )
 
-    #Load TF sequences
+    # Load TF sequences
     tf_seqs, tf_ids = [], []
    for record in SeqIO.parse(args.tf_fasta, "fasta"):
         tf_ids.append(record.id)
@@ -543,4 +609,4 @@ if __name__=="__main__":
     out_tf = Path(args.out_dir) / f"tf_{args.tf_model}.npy"
     embed_and_save(tf_seqs, tf_ids, tf_embedder, out_tf)
 
-    print("Done.")
+    print("Done.")