Sophia committed 2 days ago

Commit

8019be0

0 Parent(s):

initial commit

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -0
.gitignore +16 -0
LICENSE +21 -0
README.md +62 -0
a2d2_mol/README.md +132 -0
a2d2_mol/config_mol.yaml +54 -0
a2d2_mol/evaluate_mol_table.py +308 -0
a2d2_mol/finetune_mol.py +747 -0
a2d2_mol/inference_quality_mol.py +554 -0
a2d2_mol/mol_dataset.py +379 -0
a2d2_mol/mol_scoring/oracle/fpscores.pkl +3 -0
a2d2_mol/mol_scoring/scoring_functions.py +68 -0
a2d2_mol/mol_utils/bracket_safe_converter.py +159 -0
a2d2_mol/mol_utils/utils.py +135 -0
a2d2_mol/mol_utils/utils_chem.py +187 -0
a2d2_mol/oracle/fpscores.pkl +3 -0
a2d2_mol/remasking_scheduleaware.py +177 -0
a2d2_mol/sampling.py +1401 -0
a2d2_mol/scripts/run_mol_finetune.slurm +200 -0
a2d2_mol/scripts/train_mol.sh +93 -0
a2d2_mol/train.py +216 -0
a2d2_pep/README.md +145 -0
a2d2_pep/config_pep.yaml +50 -0
a2d2_pep/data/dataloading_for_dynamic_batching.py +189 -0
a2d2_pep/data/dataset.py +207 -0
a2d2_pep/evaluate_peptide_table.py +326 -0
a2d2_pep/finetune_quality.py +892 -0
a2d2_pep/inference_quality.py +605 -0
a2d2_pep/pep_scoring/functions/binding.py +178 -0
a2d2_pep/pep_scoring/functions/binding_utils.py +290 -0
a2d2_pep/pep_scoring/functions/hemolysis.py +63 -0
a2d2_pep/pep_scoring/functions/nonfouling.py +66 -0
a2d2_pep/pep_scoring/functions/permeability.py +170 -0
a2d2_pep/pep_scoring/functions/scoring_utils.py +94 -0
a2d2_pep/pep_scoring/functions/solubility.py +63 -0
a2d2_pep/pep_scoring/scoring_functions.py +79 -0
a2d2_pep/pep_scoring/tokenizer/my_tokenizers.py +424 -0
a2d2_pep/pep_utils/analyzer.py +1274 -0
a2d2_pep/pep_utils/utils.py +135 -0
a2d2_pep/remasking_scheduleaware.py +181 -0
a2d2_pep/sampling.py +1401 -0
a2d2_pep/scripts/run_peptide_finetune.slurm +210 -0
a2d2_pep/scripts/train_pep.sh +93 -0
a2d2_pep/train.py +216 -0
assets/a2d2.gif +3 -0
demo/quality_inference_demo.ipynb +0 -0
environment.yml +57 -0
lightning_modules/__init__.py +16 -0
lightning_modules/any_length_remask.py +801 -0
lightning_modules/any_order.py +417 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ *.gif filter=lfs diff=lfs merge=lfs -text
2	+ *.pkl filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,16 @@

+checkpoints/
+pretrained/
+__pycache__/
+results/
+a2d2_language/
+a2d2_language/wandb/
+a2d2_pep/wandb/
+a2d2_mol/wandb/
+logs/
+*.pt
+*.pyc
+*.out
+*.json
+*.log
+*.txt
+*.wandb

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2026 Sophia Tang
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,62 @@

+# [A2D2: Fine-Tuning Any-Length Discrete Diffusion for Adaptive Decoding](https://arxiv.org/abs/2606.13565) 🃏🔮
+[**Sophia Tang**](https://sophtang.github.io/), [**Yuchen Zhu**](https://yuchen-zhu-zyc.github.io/), [**Molei Tao**](https://mtao8.math.gatech.edu/), and [**Pranam Chatterjee**](https://www.chatterjeelab.com/)
+<p>
+  <a href="https://arxiv.org/abs/2606.13565"><img src="https://img.shields.io/badge/arXiv-6B67EE?style=for-the-badge&logo=arxiv&logoColor=white" alt="arXiv"></a>
+  <a href="https://sophtang.github.io/a2d2/"><img src="https://img.shields.io/badge/Project_Page-6B67EE?style=for-the-badge&logo=data:image/svg+xml;base64,PHN2ZyByb2xlPSJpbWciIHZpZXdCb3g9IjAgMCAyNCAyNCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIiBmaWxsPSJ3aGl0ZSI+PHBhdGggZD0iTTEwLjUgMS41QzExIDcuMyAxMy4yIDkuNSAxOSAxMEMxMy4yIDEwLjUgMTEgMTIuNyAxMC41IDE4LjVDMTAgMTIuNyA3LjggMTAuNSAyIDEwQzcuOCA5LjUgMTAgNy4zIDEwLjUgMS41WiIvPjxwYXRoIGQ9Ik0xOC41IDEzLjVDMTguNyAxNS44IDE5LjcgMTYuOCAyMiAxN0MxOS43IDE3LjIgMTguNyAxOC4yIDE4LjUgMjAuNUMxOC4zIDE4LjIgMTcuMyAxNy4yIDE1IDE3QzE3LjMgMTYuOCAxOC4zIDE1LjggMTguNSAxMy41WiIvPjxwYXRoIGQ9Ik01IDE1LjVDNS4xMiAxNyA1LjUgMTcuMzggNyAxNy41QzUuNSAxNy42MiA1LjEyIDE4IDUgMTkuNUM0Ljg4IDE4IDQuNSAxNy42MiAzIDE3LjVDNC41IDE3LjM4IDQuODggMTcgNSAxNS41WiIvPjwvc3ZnPg==" alt="Project Page"></a>
+</p>
+![A2D2](assets/a2d2.gif)
+This is the repository for the paper [**A2D2: Fine-Tuning Any-Length Discrete Diffusion for Adaptive Decoding**](https://arxiv.org/abs/2606.13565).
+Masked discrete diffusion models (MDMs) offer a simple, stable likelihood-based framework for sequence generation, recently extended to **any-length** settings via token insertion. **A2D2** is a unified framework for reward-guided fine-tuning of any-length MDMs that **jointly optimizes the insertion and unmasking policies together with a quality-based inference schedule**, converging to the intractable reward-tilted distribution without requiring target samples.
+🃏 We derive the **Radon–Nikodym derivative** for the joint insertion–unmasking path measures, enabling theoretically guaranteed convergence to the reward-tilted sequence distribution.
+🃏 We establish **unmasking and insertion quality** as tractable approaches for minimizing decoding error (compounding parallelization error), and train lightweight quality predictors alongside the policy.
+🃏 We introduce the **Adaptive Joint Decoding (AJD)** loss, which provably yields the optimal path measure that generates the reward-tilted distribution while remasking low-quality tokens and dropping low-quality insertions at inference.
+🃏 Empirically, A2D2 improves reward optimization while enhancing generation **flexibility** and **accuracy** over prior fixed-length fine-tuning and inference-time guidance methods.
+## Drug-Like Small Molecule Design 🧪
+We pre-train an any-length MDM on the **SAFE** dataset ([Noutahi et al. 2024](https://arxiv.org/abs/2310.10773), ~950M molecules from ZINC and Unichem in SAFE notation) and fine-tune it with **A2D2** to optimize **QED** (drug-likeness) and **synthetic accessibility (SA)**. A2D2 jointly raises QED and lowers SA over the pre-trained baseline while increasing the fraction of valid, unique, drug-like, and synthesizable molecules. Code and instructions are in [`/a2d2_mol`](a2d2_mol).
+## Multi-Objective Therapeutic Peptide Generation 💉
+We pre-train an any-length **peptide SMILES** MDM on ~11M peptides (CycPeptMPDB, SmProt, CycloPs) and fine-tune with **A2D2** on five therapeutic properties simultaneously: **target-protein binding affinity, solubility, non-hemolysis, non-fouling, and permeability**. A2D2 outperforms inference-time multi-objective guidance and fixed-length off-policy RL fine-tuning on almost all objectives, while improving the fraction of valid peptides. Code and instructions are in [`/a2d2_pep`](a2d2_pep).
+## Language Model Reasoning 🧠
+We additionally apply **A2D2** to reward fine-tuning of any-length language MDMs (LLaDA / FlexMDM), optimizing math-reasoning correctness and format rewards (GSM8K / MATH), including infilling variants. Code will be released in [`/a2d2_language`](a2d2_language).
+## Repository Structure
+| Directory | Experiment |
+|-----------|------------|
+| [`a2d2_mol`](a2d2_mol) | Drug-like small molecule design (QED, SA) |
+| [`a2d2_pep`](a2d2_pep) | Multi-objective therapeutic peptide generation |
+| [`a2d2_language`](a2d2_language) | Language model reasoning reward fine-tuning (code soon) |
+| [`lightning_modules`](lightning_modules) | Any-length insertion MDM Lightning modules (policy + quality predictors) |
+| [`model`](model) | Shared model architecture |
+| [`demo`](demo) | Quality-guided inference demo notebook |
+Each experiment directory contains its own `README.md` with environment setup, pretrained weight placement, fine-tuning commands, and evaluation instructions.
+## Citation
+If you find this repository helpful for your publications, please consider citing our paper:
+```bibtex
+@article{tang2026a2d2,
+  title={A2D2: Fine-Tuning Any-Length Discrete Diffusion for Adaptive Decoding},
+  author={Sophia Tang and Yuchen Zhu and Molei Tao and Pranam Chatterjee},
+  journal={arXiv preprint arXiv:2606.13565},
+  year={2026}
+}
+```
+To use this repository, you agree to abide by the MIT License.

a2d2_mol/README.md ADDED Viewed

	@@ -0,0 +1,132 @@

+# A2D2 for Molecule Generation 🧪
+This part of the code fine-tunes an **any-length masked diffusion model (MDM)** over molecules with **A2D2** (Fine-Tuning Any-Length Discrete Diffusion for Adaptive Decoding) to optimize drug-likeness rewards (QED, and optionally synthetic accessibility / SA).
+A2D2 jointly fine-tunes the insertion and unmasking policies together with **insertion and unmasking quality predictors**, generating molecules via **Adaptive Joint Decoding (AJD)** that remasks low-quality tokens and drops low-quality insertions to sample from the reward-tilted distribution while preserving generation quality.
+Molecules are represented as [SAFE](https://github.com/datamol-io/safe) strings and tokenized with the `datamol-io/safe-gpt` tokenizer.
+The codebase is partially built upon [FlexMDM (Kim et al., 2025)](https://github.com/brianlck/FlexMDM/tree/main) and [TR2-D2 (Tang et al., 2025)](https://github.com/sophtang/TR2-D2/tree/main).
+## Environment Installation
+```
+# from the repository root
+conda env create -f environment.yml
+conda activate a2d2
+```
+The molecule scripts share the `a2d2` environment with the peptide and language experiments. See the root [`environment.yml`](../environment.yml) for the `flash-attn` install step.
+## Model Pretrained Weights
+A2D2 fine-tunes a pretrained any-length insertion MDM trained on drug-like SAFE molecules. Download the base checkpoint and place it at:
+```
+A2D2/pretrained/anylength_mol.ckpt
+```
+```bash
+# from the repository root
+pip install gdown
+mkdir -p pretrained
+gdown 1I5EGiV1I5XZZpB9JAKABFLKVqfCyenxq -O pretrained/anylength_mol.ckpt
+```
+(Or download manually from https://drive.google.com/file/d/1I5EGiV1I5XZZpB9JAKABFLKVqfCyenxq/view?usp=drive_link — a plain `wget`/`curl` of the link saves Google's HTML warning page, not the checkpoint.)
+This is the default `--checkpoint_path` (for fine-tuning) and `--pretrained_ckpt` (for evaluation) used throughout.
+## Pretraining the Any-Length Model
+If you only want to fine-tune with A2D2, download the released `anylength_mol.ckpt` above and skip this section. Follow these steps to reproduce the base checkpoint by pretraining the any-length insertion MDM from scratch.
+### 1. The pretraining dataset
+The model is pretrained on drug-like [SAFE](https://github.com/datamol-io/safe) molecules from the [`datamol-io/safe-gpt`](https://huggingface.co/datasets/datamol-io/safe-gpt) dataset (~1.1B molecules) on the Hugging Face Hub. **No manual download is required** — the dataset is loaded in streaming mode (`load_dataset(..., streaming=True)`) and tokenized on the fly with the `datamol-io/safe-gpt` tokenizer, both fetched automatically on first run.
+The dataset is configured in [`config_mol.yaml`](config_mol.yaml):
+```yaml
+hf_dataset:
+  name: "datamol-io/safe-gpt"
+  smiles_column: "smiles"
+```
+To pretrain on a different Hugging Face SMILES/SAFE dataset, change `hf_dataset.name` (and `smiles_column` to match its column).
+### 2. Configure
+Pretraining is driven by [`config_mol.yaml`](config_mol.yaml). Key fields:
+| Field | Default | Notes |
+|-------|---------|-------|
+| `hf_dataset.name` | `datamol-io/safe-gpt` | Streaming HF dataset (auto-downloaded). |
+| `training.devices` | `2` | GPUs per node (DDP). |
+| `training.batch_size` | `2048` | Global batch; gradient accumulation is derived automatically from `per_gpu_batch_size`. |
+| `training.max_steps` | `500000` | Total optimizer steps. |
+| `training.learning_rate` | `3e-4` | AdamW LR with `warmup_steps: 2000`. |
+| `training.save_every_n_steps` | `1000` | Step-based checkpointing (used for streaming datasets). |
+| `training.checkpoint_dir` | `checkpoints/pretrain_mol` | A timestamped subdirectory is created per run. |
+| `interpolant.max_length` | `256` | Max token length. |
+### 3. Pre-training Any-Length Molecule Model
+Log in to Weights & Biases once (`wandb login`), or set `export WANDB_MODE=disabled` to skip logging. Then submit the SLURM job:
+```bash
+# from a2d2_mol/
+sbatch scripts/train_mol.sh
+```
+`train_mol.sh` is a SLURM batch script that requests one `dgx-b200` node with 2 full B200 GPUs and launches DDP via `srun` (one task per GPU), running the equivalent of:
+```bash
+python train.py --task mol
+```
+It activates the conda env (`CONDA_ENV`, defaults to the `peptune` env) from `CONDA_ROOT` (defaults to the shared miniconda install) — the batch shell does not source `~/.bashrc`, so override these env vars if your install or env path differs. The GPU count is auto-detected from the SLURM allocation and passed to hydra as `training.devices`/`training.nodes`, so to scale just change `--gpus-per-node` and `--ntasks-per-node` together at the top of the script (they must match). `--task mol` makes `train.py` load `config_mol.yaml`.
+Checkpoints are written to `checkpoints/pretrain_mol/<timestamp>/` (use `last.ckpt` / the best `train_loss` checkpoint as the `--checkpoint_path` / `--pretrained_ckpt` for fine-tuning and evaluation); the run log goes to `logs/<date>_a2d2-mol_<jobid>.log` and SLURM's catch-file to `logs/slurm/`. To resume, add a `training.resume_path: /path/to/last.ckpt` entry to the config.
+## Fine-Tune with A2D2
+The canonical run directory is the parent `a2d2/` package (`finetune_mol.py`, `inference_quality_mol.py`, `sampling.py`, and `mol_scoring/` here are the molecule-specific modules used from there). Before running:
+1. Set `--base_path` to the location of `a2d2`. Results plots are written to `<base_path>/flexible/results/<run_name>/`.
+2. Create the output directories: `a2d2/checkpoints/finetune_mol`, `a2d2/results`, and `a2d2/logs`.
+### Single run
+[`scripts/run_mol_finetune.slurm`](scripts/run_mol_finetune.slurm) runs a single `finetune_mol.py` experiment on one MIG GPU, then evaluates the resulting checkpoint. It bundles the full hyperparameter set used in the paper (replicates `R = 16`, pool size `1000`, buffer size `100`, sampling steps `N_steps = 90`, warmup `N_warmup = 20`, alternation frequency `N_alt = 5`, reward scaling `α = 0.01`, quality threshold `μ_min = 0.3`, `--qed_only`), so you don't have to pass them by hand.
+The script resolves the repo root automatically — `$A2D2_ROOT` if set, else the `sbatch` submit directory, else the script's own location — so either submit from the repo root or export your clone path. Set `CONDA_ROOT` (your miniconda install) and, if needed, `CONDA_ENV` (defaults to `peptune`):
+```bash
+export A2D2_ROOT=/path/to/your/A2D2     # absolute path to your clone
+export CONDA_ROOT=/path/to/miniconda3   # or just have `conda` on PATH
+sbatch scripts/run_mol_finetune.slurm
+```
+Select which variant to run with `MODE_ID` (default `0`): `0` = A2D2 (full planner), `1` = `--disable_planner`, `2` = `--disable_insertion_planner`, `3` = `--disable_unmasking_planner`. Override at submit time:
+```bash
+sbatch --export=ALL,MODE_ID=2 scripts/run_mol_finetune.slurm
+```
+The pretrained base checkpoint is read from `$A2D2_ROOT/pretrained/anylength_mol.ckpt`. Outputs land in `checkpoints/finetune_mol/<job>_mol_<mode>/` and `results/mol_ablation/<mode>/`.
+### Ablation flags
+| Flag | Variant |
+|------|---------|
+| *(none)* | A2D2 w/ insertion + unmasking quality (alternation) |
+| `--disable_planner` | A2D2 w/o quality (policy only, no remasking) |
+| `--disable_insertion_planner` | A2D2 w/o insertion quality |
+| `--disable_unmasking_planner` | A2D2 w/o unmasking/remasking quality |
+| `--joint_training` | train policy + quality heads jointly (no alternation) |
+## Evaluation
+Evaluation runs automatically at the end of the SLURM job. To evaluate a checkpoint manually:
+```
+python evaluate_mol_table.py \
+    --checkpoint_path /path/to/a2d2/checkpoints/finetune_mol/my_run/last.ckpt \
+    --pretrained_ckpt /path/to/A2D2/pretrained/anylength_mol.ckpt \
+    --output_dir /path/to/results \
+    --num_samples 1000 --batch_size 50 \
+    --max_length 256 --total_num_steps 256 \
+    --num_remasking 2 --quality_threshold 0.3 --seed 42 --device cuda:0
+```
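+This reports validity, uniqueness, mean QED, mean SA, quality (the percentage of generated molecules that are unique with QED ≥ 0.6 and SA ≤ 4), diversity, and total sampling time over the generated molecules. It writes `eval_metrics_<tag>.csv` and, when at least one valid molecule was produced, `eval_smiles_<tag>.csv` to `--output_dir` (the checkpoint directory by default), where `<tag>` is `with_planner`, `no_planner`, `no_insertion_planner`, or `no_unmasking_planner`.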

a2d2_mol/config_mol.yaml ADDED Viewed

	@@ -0,0 +1,54 @@

+trainer: "any-order-flow"
+dataset: "safe-drugs"
+# HuggingFace dataset configuration
+hf_dataset:
+  name: "datamol-io/safe-gpt"
+  smiles_column: "smiles"  # Adjust based on actual column name in the dataset
+model:
+  hidden_size: 768
+  n_heads: 12
+  cond_dim: 128
+  dropout: 0.05
+  n_blocks: 12
+  torch_dtype: 'float32'  # Options: 'float32', 'float16', 'bfloat16'
+interpolant:
+  type: "any-order"
+  tokens: null # filled in automatically
+  pad_token: null # filled in automatically
+  mask_token: null # filled in automatically
+  max_length: 256
+  insert_schedule:
+    type: "linear"
+  unmask_schedule:
+    type: "linear"
+training:
+  only_embed_insert: true
+  batch_size: 2048
+  per_gpu_batch_size: 64 # Gradient accumulation happens automatically
+  cpus: 4
+  learning_rate: 3e-4
+  nodes: 1
+  devices: 2
+  max_steps: 500000
+  weight_decay: 0.03
+  checkpoint_dir: "checkpoints/pretrain_mol"
+  save_top_k: 3
+  save_every_n_steps: 1000  # Save checkpoint every 1k steps (for streaming datasets)
+  # save_every_n_epochs: 1  # Not used with streaming datasets
+  loss_fn:
+    unmask: "elbo"
+    insert: "expectation"
+  reset_lr: false
+  warmup_steps: 2000
+  ema_decay: 0.9999
+  filter_max_length: false
+wandb:
+  entity: null  # set to your W&B entity, or leave null to use the default
+  project: "a2d2-mol"
+  name: "a2d2-mol"
+  path: "./wandb"

a2d2_mol/evaluate_mol_table.py ADDED Viewed

	@@ -0,0 +1,308 @@

+"""
+Evaluate a finetuned molecule model checkpoint by sampling sequences
+and computing metrics for the De Novo Small Molecule Generation table:
+  Validity (%), Uniqueness (%), QED (↑), SA (↓), Quality (%), Diversity (↑), Sampling Time (↓)
+"""
+import os
+import sys
+import argparse
+import time
+import torch
+import numpy as np
+import pandas as pd
+from tdc import Oracle, Evaluator
+# add repo root (A2D2/) to sys.path so top-level packages like lightning_modules resolve
+REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, REPO_ROOT)
+from lightning_modules.any_length_remask import AnyOrderInsertionFlowModuleFT
+from lightning_modules import AnyOrderInsertionFlowModule
+from inference_quality_mol import sample_mol_eval
+from mol_scoring.scoring_functions import MolScoringFunctions
+from finetune_mol import MolFinetuner, get_tokenizer
+from mol_utils.utils import str2bool, set_seed
+def load_finetuned_model(checkpoint_path, pretrained_ckpt_path, device='cuda'):
+    """Load a finetuned MolFinetuner from a Lightning checkpoint."""
+    # We need to reconstruct the model the same way main() does, then load state
+    # Load from Lightning checkpoint directly
+    ckpt = torch.load(checkpoint_path, map_location='cpu', weights_only=False)
+    hparams = ckpt.get('hyper_parameters', {})
+    args = hparams.get('args', None)
+    # Load pretrained base checkpoint to get config
+    base_ckpt = torch.load(pretrained_ckpt_path, map_location='cpu', weights_only=False)
+    if 'hyper_parameters' in base_ckpt:
+        config = base_ckpt['hyper_parameters']['config']
+    elif 'config' in base_ckpt:
+        config = base_ckpt['config']
+    else:
+        raise ValueError("Cannot find config in base checkpoint")
+    from omegaconf import OmegaConf, DictConfig
+    if not OmegaConf.is_config(config):
+        config = DictConfig(config)
+    OmegaConf.set_struct(config, False)
+    # Set adaptive schedule config from args or defaults
+    config.training.use_adaptive_schedule = getattr(args, 'use_adaptive_schedule', True)
+    config.training.schedule_hidden_dim = getattr(args, 'schedule_hidden_dim', 256)
+    config.training.schedule_num_layers = getattr(args, 'schedule_num_layers', 2)
+    config.training.schedule_loss_weight = getattr(args, 'schedule_loss_weight', 0.1)
+    config.training.freeze_base_model = getattr(args, 'freeze_base_model', False)
+    config.training.schedule_warmup_epochs = getattr(args, 'schedule_warmup_epochs', 0)
+    config.training.use_bracket_safe = True
+    OmegaConf.set_struct(config, True)
+    # Determine if planner should be loaded based on disable_planner flag
+    disable_planner = getattr(args, 'disable_planner', False)
+    # Initialize policy model
+    policy_model = AnyOrderInsertionFlowModuleFT(
+        config=config,
+        args=args,
+        pretrained_checkpoint=pretrained_ckpt_path,
+        insertion_planner=not disable_planner,
+    )
+    # Load policy model weights from the finetuned checkpoint
+    state_dict = ckpt['state_dict']
+    # Lightning wraps the model: 'policy_model.xxx' -> remove prefix for the sub-module
+    policy_state = {}
+    for k, v in state_dict.items():
+        if k.startswith('policy_model.'):
+            policy_state[k[len('policy_model.'):]] = v
+    policy_model.load_state_dict(policy_state, strict=False)
+    policy_model = policy_model.to(device)
+    policy_model.eval()
+    return policy_model, args, config
+@torch.no_grad()
+def evaluate_checkpoint(policy_model, tokenizer, reward_model, evaluator,
+                        num_samples=1000, batch_size=50, max_length=256,
+                        total_num_steps=256, quality_mode="both", num_remasking=2,
+                        quality_threshold=0.5, unmask_quality_threshold=None, device='cuda'):
+    """
+    Sample `num_samples` molecules and compute all table metrics.
+    Returns a dict with: validity, uniqueness, qed, sa, quality, diversity, sampling_time
+    """
+    all_valid_seqs = []
+    all_smiles_generated = 0
+    total_time = 0.0
+    num_batches = (num_samples + batch_size - 1) // batch_size
+    remaining = num_samples
+    for b in range(num_batches):
+        bs = min(batch_size, remaining)
+        remaining -= bs
+        t_start = time.time()
+        result = sample_mol_eval(
+            model=policy_model,
+            reward_model=reward_model,
+            tokenizer=tokenizer,
+            steps=total_num_steps,
+            mask=policy_model.interpolant.mask_token,
+            pad=policy_model.interpolant.pad_token,
+            batch_size=bs,
+            max_length=max_length,
+            quality_mode=quality_mode,
+            num_remasking=num_remasking,
+            quality_threshold=quality_threshold,
+            unmask_quality_threshold=unmask_quality_threshold,
+            evaluator=evaluator,
+            dataframe=True,
+        )
+        t_end = time.time()
+        # Unpack: uniqueSequences, qed, sa, valid_fraction, uniqueness, diversity, quality, df
+        unique_seqs, qed_scores, sa_scores, valid_frac, uniq, div, qual, df = result
+        all_valid_seqs.extend(list(unique_seqs) if not isinstance(unique_seqs, list) else unique_seqs)
+        all_smiles_generated += bs
+        total_time += (t_end - t_start)
+        print(f"  Batch {b+1}/{num_batches}: {len(unique_seqs)} valid unique, "
+              f"time={t_end - t_start:.1f}s")
+    # --- Aggregate metrics over all samples ---
+    total_generated = num_samples
+    # Valid sequences (keeping duplicates for validity count)
+    # Re-evaluate from scratch on all collected valid sequences
+    all_unique = list(set(all_valid_seqs))
+    num_valid = len(all_valid_seqs)  # total valid across batches (before dedup)
+    num_unique = len(all_unique)
+    validity = num_valid / total_generated * 100.0
+    uniqueness = num_unique / num_valid * 100.0 if num_valid > 0 else 0.0
+    # Diversity on unique SMILES
+    diversity = evaluator(all_unique) if num_unique > 1 else 0.0
+    # QED and SA on unique sequences
+    if num_unique > 0:
+        oracle_qed = Oracle('qed')
+        oracle_sa = Oracle('sa')
+        qed_vals = oracle_qed(all_unique)
+        sa_vals = oracle_sa(all_unique)
+        mean_qed = np.mean(qed_vals)
+        mean_sa = np.mean(sa_vals)
+        # Quality: unique sequences with QED >= 0.6 AND SA <= 4
+        quality_mask = [(q >= 0.6 and s <= 4) for q, s in zip(qed_vals, sa_vals)]
+        quality = sum(quality_mask) / total_generated * 100.0
+    else:
+        mean_qed = 0.0
+        mean_sa = 0.0
+        quality = 0.0
+    sampling_time = total_time
+    metrics = {
+        'Validity (%)': validity,
+        'Uniqueness (%)': uniqueness,
+        'QED': mean_qed,
+        'Synthetic Accessibility': mean_sa,
+        'Quality (%)': quality,
+        'Diversity': diversity,
+        'Sampling Time (s)': sampling_time,
+        'Num Generated': total_generated,
+        'Num Valid': num_valid,
+        'Num Unique': num_unique,
+    }
+    return metrics, all_unique, qed_vals if num_unique > 0 else [], sa_vals if num_unique > 0 else []
+def main():
+    parser = argparse.ArgumentParser(description="Evaluate a finetuned mol checkpoint")
+    parser.add_argument('--checkpoint_path', type=str, required=True,
+                        help='Path to the finetuned Lightning checkpoint (e.g., last.ckpt)')
+    parser.add_argument('--pretrained_ckpt', type=str,
+                        default=os.path.join(REPO_ROOT, 'pretrained', 'anylength_mol.ckpt'),
+                        help='Path to the pretrained base model checkpoint '
+                             '(defaults to <repo>/pretrained/anylength_mol.ckpt)')
+    parser.add_argument('--num_samples', type=int, default=1000,
+                        help='Number of molecules to sample')
+    parser.add_argument('--batch_size', type=int, default=50,
+                        help='Batch size for sampling')
+    parser.add_argument('--max_length', type=int, default=256)
+    parser.add_argument('--total_num_steps', type=int, default=256)
+    parser.add_argument('--num_remasking', type=int, default=2)
+    parser.add_argument('--disable_planner', action='store_true',
+                        help='If set, disable remasking during evaluation (matches training mode)')
+    parser.add_argument('--disable_insertion_planner', action='store_true',
+                        help='If set, disable insertion quality filtering during evaluation')
+    parser.add_argument('--disable_unmasking_planner', action='store_true',
+                        help='If set, disable unmasking confidence planner during evaluation')
+    parser.add_argument('--quality_threshold', type=float, default=0.5,
+                        help='Threshold for insertion quality filtering during sampling')
+    parser.add_argument('--unmask_quality_threshold', type=float, default=None,
+                        help='If set, gate unmasking remasking on confidence: remask clean '
+                             'tokens whose remasking_conf < threshold (overrides the '
+                             'schedule-driven count). Default None = schedule-driven behavior.')
+    parser.add_argument('--output_dir', type=str, default=None,
+                        help='Directory to save results CSV. Defaults to checkpoint directory.')
+    parser.add_argument('--device', type=str, default='cuda:0')
+    parser.add_argument('--seed', type=int, default=42)
+    args = parser.parse_args()
+    set_seed(args.seed, use_cuda=True)
+    device = torch.device(args.device if torch.cuda.is_available() else 'cpu')
+    print(f"Loading checkpoint: {args.checkpoint_path}")
+    print(f"Pretrained base: {args.pretrained_ckpt}")
+    print(f"Disable planner (no remasking): {args.disable_planner}")
+    print(f"Disable insertion planner: {args.disable_insertion_planner}")
+    print(f"Disable unmasking planner: {args.disable_unmasking_planner}")
+    policy_model, train_args, config = load_finetuned_model(
+        args.checkpoint_path, args.pretrained_ckpt, device=device
+    )
+    tokenizer = get_tokenizer()
+    score_func_names = ['qed', 'sa']
+    reward_model = MolScoringFunctions(score_func_names, device=device)
+    evaluator = Evaluator('diversity')
+    use_remasking = not args.disable_planner
+    disable_insertion_planner = args.disable_insertion_planner
+    disable_unmasking_planner = args.disable_unmasking_planner
+    # Map flags to quality_mode
+    if args.disable_planner:
+        quality_mode = "none"
+    elif args.disable_insertion_planner and args.disable_unmasking_planner:
+        quality_mode = "none"
+    elif args.disable_insertion_planner:
+        quality_mode = "unmasking_only"
+    elif args.disable_unmasking_planner:
+        quality_mode = "insertion_only"
+    else:
+        quality_mode = "both"
+    print(f"\nSampling {args.num_samples} molecules (quality_mode={quality_mode})...")
+    metrics, unique_smiles, qed_vals, sa_vals = evaluate_checkpoint(
+        policy_model=policy_model,
+        tokenizer=tokenizer,
+        reward_model=reward_model,
+        evaluator=evaluator,
+        num_samples=args.num_samples,
+        batch_size=args.batch_size,
+        max_length=args.max_length,
+        total_num_steps=args.total_num_steps,
+        quality_mode=quality_mode,
+        num_remasking=args.num_remasking,
+        quality_threshold=getattr(args, 'quality_threshold', 0.5),
+        unmask_quality_threshold=args.unmask_quality_threshold,
+        device=device,
+    )
+    # Print summary table
+    print("\n" + "=" * 60)
+    print("  De Novo Small Molecule Generation Results")
+    print("=" * 60)
+    for k, v in metrics.items():
+        if isinstance(v, float):
+            print(f"  {k:<30s}: {v:.4f}")
+        else:
+            print(f"  {k:<30s}: {v}")
+    print("=" * 60)
+    # Save results
+    output_dir = args.output_dir or os.path.dirname(args.checkpoint_path)
+    os.makedirs(output_dir, exist_ok=True)
+    if args.disable_planner:
+        tag = "no_planner"
+    elif args.disable_insertion_planner:
+        tag = "no_insertion_planner"
+    elif args.disable_unmasking_planner:
+        tag = "no_unmasking_planner"
+    else:
+        tag = "with_planner"
+    metrics_path = os.path.join(output_dir, f'eval_metrics_{tag}.csv')
+    pd.DataFrame([metrics]).to_csv(metrics_path, index=False)
+    print(f"Metrics saved to: {metrics_path}")
+    if unique_smiles:
+        smiles_path = os.path.join(output_dir, f'eval_smiles_{tag}.csv')
+        df = pd.DataFrame({
+            'SMILES': unique_smiles,
+            'QED': qed_vals,
+            'SA': sa_vals,
+        })
+        df.to_csv(smiles_path, index=False)
+        print(f"SMILES saved to: {smiles_path}")
+if __name__ == '__main__':
+    main()

a2d2_mol/finetune_mol.py ADDED Viewed

	@@ -0,0 +1,747 @@

+import argparse
+from datetime import datetime
+import numpy as np
+import torch
+import pytorch_lightning as pl
+from pytorch_lightning.strategies import DDPStrategy
+from pytorch_lightning.callbacks import ModelCheckpoint
+from pytorch_lightning.loggers import WandbLogger
+import wandb
+import os
+import sys
+from tqdm import tqdm
+import pandas as pd
+# add repo root (A2D2/) to sys.path so top-level packages like lightning_modules resolve
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+# imports
+from inference_quality_mol import sample_mol_buffer, sample_mol_eval
+from mol_utils.utils import str2bool, set_seed
+from mol_scoring.scoring_functions import MolScoringFunctions
+from lightning_modules.any_length_remask import AnyOrderInsertionFlowModuleFT
+from lightning_modules import AnyOrderInsertionFlowModule
+from safe.tokenizer import SAFETokenizer
+from tdc import Evaluator
+# Repository root (two levels up from this file: A2D2/a2d2_mol/finetune_mol.py)
+REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+def get_tokenizer():
+    """Get SAFE tokenizer with added special tokens."""
+    tk = SAFETokenizer.from_pretrained('datamol-io/safe-gpt').get_pretrained()
+    tk.add_tokens(['<', '>'])   # for bracket_safe
+    return tk
+class MolFinetuner(pl.LightningModule):
+    """Lightning module for distributed molecule finetuning."""
+    # Constructor: stores the collaborating models and run configuration, and
+    # sets up the empty sample buffer, the evaluation logs and the
+    # policy/planner alternation state.
+    #
+    # args:          parsed command-line namespace; read throughout the module
+    # policy_model:  model being fine-tuned; its parameters named 'planner.*'
+    #                are treated as the planner, everything else as the policy
+    # reward_model:  scoring object handed to the sampling helpers
+    # tokenizer:     tokenizer handed to the sampling helpers
+    # pretrained:    reference model handed to sample_mol_buffer and used for
+    #                the ELBO-based log_rnd estimate
+    # mcts:          stored only; not read anywhere in the visible code
+    # filename:      suffix of the CSV log written in on_fit_end
+    # eps:           stored only; not read anywhere in the visible code
+    def __init__(
+        self,
+        args,
+        policy_model,
+        reward_model,
+        tokenizer,
+        pretrained=None,
+        mcts=None,
+        filename=None,
+        eps=1e-5
+    ):
+        super().__init__()
+        self.args = args
+        self.policy_model = policy_model
+        self.reward_model = reward_model
+        self.tokenizer = tokenizer
+        self.pretrained = pretrained
+        self.mcts = mcts
+        self.filename = filename
+        self.eps = eps
+        # Evaluator passed to sample_mol_eval as its `evaluator` argument
+        self.evaluator = Evaluator("diversity")
+        # Save hyperparameters, leaving out the heavyweight objects listed in `ignore`
+        self.save_hyperparameters(ignore=['policy_model', 'reward_model', 'tokenizer', 'pretrained', 'mcts'])
+        # Buffer of sampled sequences with their log_rnd values and rewards;
+        # all three stay None until _generate_buffer runs for the first time
+        self.x_saved = None
+        self.log_rnd_saved = None
+        self.final_rewards_saved = None
+        # Per-evaluation metric histories, appended to in on_train_epoch_end
+        # and written to CSV in on_fit_end
+        self.valid_fraction_log = []
+        self.diversity_log = []
+        self.qed_log = []
+        self.sa_log = []
+        self.quality_log = []
+        self.uniqueness_log = []
+        # Alternating training between policy and planner
+        self.train_policy = True  # Start by training policy
+        # Number of consecutive epochs spent on one sub-model before switching;
+        # used as a divisor in on_train_epoch_start, so it must be non-zero
+        self.alternation_frequency = getattr(args, 'alternation_frequency', 1)  # Alternate every N epochs
+    def freeze_policy_model(self):
+        """Freeze policy model parameters (but not planner)."""
+        for name, param in self.policy_model.named_parameters():
+            if not name.startswith('planner.'):
+                param.requires_grad = False
+    def unfreeze_policy_model(self):
+        """Unfreeze policy model parameters (but not planner)."""
+        for name, param in self.policy_model.named_parameters():
+            if not name.startswith('planner.'):
+                param.requires_grad = True
+    def freeze_planner_model(self):
+        """Freeze planner parameters."""
+        if hasattr(self.policy_model, 'planner'):
+            for param in self.policy_model.planner.parameters():
+                param.requires_grad = False
+    def unfreeze_planner_model(self):
+        """Unfreeze planner parameters."""
+        if hasattr(self.policy_model, 'planner'):
+            for param in self.policy_model.planner.parameters():
+                param.requires_grad = True
+    def configure_optimizers(self):
+        # Separate parameter groups for policy backbone vs planner heads
+        planner_lr = getattr(self.args, 'planner_learning_rate', self.args.learning_rate)
+        planner_params = []
+        policy_params = []
+        for name, param in self.policy_model.named_parameters():
+            if name.startswith('planner.'):
+                planner_params.append(param)
+            else:
+                policy_params.append(param)
+        param_groups = [
+            {'params': policy_params, 'lr': self.args.learning_rate},
+            {'params': planner_params, 'lr': planner_lr},
+        ]
+        optimizer = torch.optim.AdamW(param_groups)
+        return optimizer
+    def _get_quality_mode(self):
+        """Map ablation flags + warmup state to quality_mode string."""
+        if self.args.disable_planner:
+            return "none"
+        if self.current_epoch < self.args.schedule_warmup_epochs:
+            return "none"
+        di = getattr(self.args, 'disable_insertion_planner', False)
+        du = getattr(self.args, 'disable_unmasking_planner', False)
+        if di and du:
+            return "none"
+        if di:
+            return "unmasking_only"
+        if du:
+            return "insertion_only"
+        return "both"
+    def on_train_epoch_start(self):
+        """Called at the start of each training epoch."""
+        # If disable_planner mode, only train policy (no alternation)
+        if self.args.disable_planner:
+            self.train_policy = True
+            self.unfreeze_policy_model()
+            self.freeze_planner_model()
+            if self.global_rank == 0 and self.current_epoch == 0:
+                print(f"[FINETUNE_QUALITY] Training ONLY policy model (planner frozen, no remasking)")
+        elif getattr(self.args, 'joint_training', False):
+            # Joint mode: train policy + planner together every step (no alternation)
+            self.train_policy = True  # marker; training_step adds planner loss when joint_training is set
+            self.unfreeze_policy_model()
+            self.unfreeze_planner_model()
+            if self.global_rank == 0 and self.current_epoch == 0:
+                print(f"[FINETUNE_QUALITY] JOINT TRAINING: policy + planner trained together (no alternation)")
+        else:
+            # Alternate between training policy and planner from epoch 0
+            # Determine which model to train this epoch
+            cycle_position = (self.current_epoch // self.alternation_frequency) % 2
+            self.train_policy = (cycle_position == 0)
+            if self.train_policy:
+                # Train policy, freeze planner
+                self.unfreeze_policy_model()
+                self.freeze_planner_model()
+                if self.global_rank == 0:
+                    print(f"[ALTERNATION] Epoch {self.current_epoch}: Training POLICY model (planner frozen)")
+            else:
+                # Train planner, freeze policy
+                self.freeze_policy_model()
+                self.unfreeze_planner_model()
+                if self.global_rank == 0:
+                    print(f"[ALTERNATION] Epoch {self.current_epoch}: Training PLANNER model (policy frozen)")
+        # Resample buffer if needed
+        if self.x_saved is None or self.current_epoch % self.args.resample_every_n_step == 0:
+            if self.global_rank == 0:
+                print(f"[BUFFER] Starting buffer generation for epoch {self.current_epoch}")
+            self._generate_buffer()
+            # Synchronize all ranks after buffer generation
+            if self.trainer and self.trainer.world_size > 1:
+                if self.global_rank == 0:
+                    print(f"[BUFFER] All ranks completed buffer generation, synchronizing...")
+                torch.distributed.barrier()
+                if self.global_rank == 0:
+                    print(f"[BUFFER] Synchronization complete!")
+    def _generate_buffer(self):
+        """Generate buffer of sequences for training.
+        When pool_size > 0, maintains a persistent pool and refreshes a fraction
+        each time instead of regenerating the entire buffer from scratch.
+        """
+        rank = self.global_rank if self.trainer else 0
+        world_size = self.trainer.world_size if self.trainer else 1
+        pool_size = getattr(self.args, 'pool_size', 0)
+        is_pool = pool_size > 0
+        is_init = self.x_saved is None
+        # Determine how many molecules to sample this call
+        if is_pool:
+            refresh_frac = getattr(self.args, 'pool_refresh_fraction', 0.2)
+            if is_init:
+                samples_per_gpu = pool_size
+            else:
+                samples_per_gpu = max(1, int(pool_size * refresh_frac))
+            if rank == 0:
+                if is_init:
+                    print(f"\n[POOL] Initializing pool with {pool_size} molecules at epoch {self.current_epoch}")
+                else:
+                    print(f"\n[POOL] Refreshing {samples_per_gpu}/{pool_size} molecules ({refresh_frac*100:.0f}%) at epoch {self.current_epoch}")
+        else:
+            samples_per_gpu = self.args.buffer_size // world_size
+            if rank == 0:
+                samples_per_gpu += self.args.buffer_size % world_size
+        if rank == 0:
+            print(f"\n[BUFFER] Starting buffer generation at epoch {self.current_epoch}")
+        accumulated_x = []
+        accumulated_log_rnd = []
+        accumulated_rewards = []
+        total_accumulated = 0
+        max_attempts = 100  # Prevent infinite loop
+        attempts = 0
+        import time
+        while total_accumulated < samples_per_gpu and attempts < max_attempts:
+            attempts += 1
+            if rank == 0:
+                print(f"[BUFFER] rank={rank} starting sampling attempt {attempts} at {time.strftime('%H:%M:%S')}")
+            start_time = time.time()
+            x_final, log_rnd, final_rewards, trace = \
+                sample_mol_buffer(
+                    self.policy_model,
+                    self.pretrained,
+                    self.reward_model,
+                    self.tokenizer,
+                    steps=self.args.total_num_steps,
+                    mask=self.policy_model.interpolant.mask_token,
+                    pad=self.policy_model.interpolant.pad_token,
+                    batch_size=self.args.batch_size,
+                    max_length=self.args.max_length,
+                    quality_mode=self._get_quality_mode(),
+                    alpha=self.args.alpha,
+                    num_remasking=self.args.num_remasking,
+                    quality_threshold=self.args.quality_threshold,
+                    use_quality_filter=self.args.use_quality_filter,
+                )
+            if self.args.elbo_rnd:
+                # Override trajectory log_rnd with forward ELBO estimate
+                if x_final.shape[0] > 0:
+                    with torch.no_grad():
+                        noised = self.policy_model.prepare_noised_sample(
+                            x_final, num_samples=self.args.elbo_rnd_num_samples)
+                        policy_loss = self.policy_model.compute_loss_from_noised(noised)
+                        pretrained_loss = self.pretrained.compute_loss_from_noised(noised)
+                        log_rnd = (pretrained_loss - policy_loss) + (final_rewards / self.args.alpha)
+            elapsed = time.time() - start_time
+            if rank == 0:
+                print(f"[BUFFER] rank={rank} sampling took {elapsed:.1f}s")
+            n_valid = x_final.shape[0]
+            if n_valid > 0:
+                accumulated_x.append(x_final)
+                accumulated_log_rnd.append(log_rnd)
+                accumulated_rewards.append(final_rewards)
+                total_accumulated += n_valid
+            if rank == 0:
+                qm = self._get_quality_mode()
+                print(f"[BUFFER] rank={rank} epoch={self.current_epoch} quality_mode={qm} accumulated={total_accumulated} / {samples_per_gpu} (batch yielded {n_valid} valid) attempt={attempts}")
+        if total_accumulated == 0:
+            raise RuntimeError(f"[BUFFER ERROR] Rank {rank}: No valid sequences generated after {attempts} attempts. Check sampling function and reward model.")
+        if total_accumulated < samples_per_gpu:
+            print(f"[BUFFER WARNING] Rank {rank}: Only generated {total_accumulated}/{samples_per_gpu} sequences after {attempts} attempts")
+        new_x = torch.cat(accumulated_x, dim=0)[:samples_per_gpu]
+        new_log_rnd = torch.cat(accumulated_log_rnd, dim=0)[:samples_per_gpu]
+        new_rewards = torch.cat(accumulated_rewards, dim=0)[:samples_per_gpu]
+        del accumulated_x, accumulated_log_rnd, accumulated_rewards
+        torch.cuda.empty_cache()
+        # add to buffer: pool mode replaces a random subset, classic mode overwrites
+        if is_pool and not is_init:
+            actual_new = min(new_x.shape[0], self.x_saved.shape[0])
+            indices = torch.randperm(self.x_saved.shape[0], device=self.x_saved.device)[:actual_new]
+            self.x_saved[indices] = new_x[:actual_new]
+            self.log_rnd_saved[indices] = new_log_rnd[:actual_new]
+            self.final_rewards_saved[indices] = new_rewards[:actual_new]
+            if rank == 0:
+                print(f"[POOL] Replaced {actual_new}/{self.x_saved.shape[0]} molecules, reward mean={self.final_rewards_saved.mean():.4f}")
+        else:
+            self.x_saved = new_x
+            self.log_rnd_saved = new_log_rnd
+            self.final_rewards_saved = new_rewards
+        if rank == 0:
+            print(f"[BUFFER] After cleanup - GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
+    def training_step(self, batch, batch_idx):
+        """Training step - batch is ignored, we use saved buffer."""
+        # Process buffer in mini-batches to avoid OOM
+        mini_batch_size = getattr(self.args, 'training_mini_batch_size', 8)
+        buffer_size = self.x_saved.shape[0]
+        # Randomly sample a mini-batch from buffer
+        indices = torch.randperm(buffer_size, device=self.x_saved.device)[:mini_batch_size]
+        x_final = self.x_saved[indices]
+        # get log_rnd values
+        log_rnd = self.log_rnd_saved[indices]
+        sm_temp = getattr(self.args, 'softmax_temperature', 1.0)
+        joint = getattr(self.args, 'joint_training', False)
+        policy_loss = None
+        planner_loss = None
+        if self.train_policy:
+            # Train policy with WDCE loss
+            policy_loss = self.policy_model.loss_wdce_flexible(
+                log_rnd,
+                x_final,
+                num_replicates=self.args.wdce_num_replicates,
+                centering=self.args.centering,
+                centering_strength=self.args.centering_strength,
+                softmax_temperature=sm_temp,
+            )
+            self.log('train/policy_loss', policy_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+        if (not self.train_policy) or joint:
+            # Train planner with appropriate loss based on ablation flags
+            if self.args.disable_insertion_planner:
+                # Ablation: only train unmasking planner (no insertion head)
+                planner_loss = self.policy_model.loss_planner_flexible(
+                    log_rnd,
+                    x_final,
+                    num_replicates=self.args.wdce_num_replicates,
+                    centering=self.args.centering,
+                    centering_strength=self.args.centering_strength,
+                    softmax_temperature=sm_temp,
+                )
+                self.log('train/planner_unmask_loss', planner_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+                self.log('train/planner_insert_loss', 0.0, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+                self.log('train/planner_loss', planner_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+            elif self.args.disable_unmasking_planner:
+                # only train insertion planner (no remasking head)
+                unmask_loss, insert_loss, _ = self.policy_model.loss_insert_planner_flexible(
+                    log_rnd,
+                    x_final,
+                    num_replicates=self.args.wdce_num_replicates,
+                    centering=self.args.centering,
+                    centering_strength=self.args.centering_strength,
+                    softmax_temperature=sm_temp,
+                )
+                # Zero out the unmasking component - only backprop insertion loss
+                planner_loss = insert_loss
+                self.log('train/planner_unmask_loss', 0.0, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+                self.log('train/planner_insert_loss', insert_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+                self.log('train/planner_loss', planner_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+            else:
+                # Full planner: train both remasking + insertion
+                unmask_loss, insert_loss, planner_loss = self.policy_model.loss_insert_planner_flexible(
+                    log_rnd,
+                    x_final,
+                    num_replicates=self.args.wdce_num_replicates,
+                    centering=self.args.centering,
+                    centering_strength=self.args.centering_strength,
+                    softmax_temperature=sm_temp,
+                )
+                self.log('train/planner_unmask_loss', unmask_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+                self.log('train/planner_insert_loss', insert_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+                self.log('train/planner_loss', planner_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+        # Combine losses depending on mode
+        if joint:
+            loss = policy_loss + planner_loss
+            mode_value = 0.5
+        elif self.train_policy:
+            loss = policy_loss
+            mode_value = 0.0
+        else:
+            loss = planner_loss
+            mode_value = 1.0
+        # Log overall loss and mode
+        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+        self.log('train/mode', mode_value, prog_bar=True, sync_dist=True)
+        return loss
+    def on_train_epoch_end(self):
+        """Called at the end of each training epoch - only rank 0 evaluates."""
+        # Only evaluate every N epochs to save time
+        eval_frequency = getattr(self.args, 'eval_every_n_epochs', 5)
+        is_last_epoch = (self.trainer and self.current_epoch == self.trainer.max_epochs - 1)
+        if self.global_rank == 0 and (self.current_epoch % eval_frequency == 0 or is_last_epoch):
+            # Sample eval batch with updated policy
+            x_eval, qed, sa, uniqueness, diversity, quality, valid_fraction = \
+                sample_mol_eval(
+                    self.policy_model, self.reward_model,
+                    self.tokenizer,
+                    steps=self.args.total_num_steps,
+                    mask=self.policy_model.interpolant.mask_token,
+                    pad=self.policy_model.interpolant.pad_token,
+                    batch_size=50,
+                    max_length=self.args.max_length,
+                    quality_mode=self._get_quality_mode(),
+                    num_remasking=self.args.num_remasking,
+                    evaluator=self.evaluator,
+                )
+            # Append to logs
+            self.valid_fraction_log.append(valid_fraction)
+            self.uniqueness_log.append(uniqueness)
+            self.diversity_log.append(diversity)
+            self.qed_log.append(qed)
+            self.sa_log.append(sa)
+            self.quality_log.append(quality)
+            # Compute reward stats
+            mean_reward = self.final_rewards_saved.mean().item()
+            min_reward = self.final_rewards_saved.min().item()
+            max_reward = self.final_rewards_saved.max().item()
+            median_reward = self.final_rewards_saved.median().item()
+            # Log metrics
+            self.log_dict({
+                "eval/valid_fraction": valid_fraction,
+                "eval/uniqueness": np.mean(uniqueness),
+                "eval/diversity": np.mean(diversity),
+                "eval/qed": np.mean(qed),
+                "eval/sa": np.mean(sa),
+                "eval/quality": np.mean(quality),
+                "eval/mean_reward_search": mean_reward,
+                "eval/min_reward_search": min_reward,
+                "eval/max_reward_search": max_reward,
+                "eval/median_reward_search": median_reward
+            })
+            print(f"epoch {self.current_epoch} | validity {valid_fraction:.4f} | uniqueness {np.mean(uniqueness):.4f} | diversity {np.mean(diversity):.4f} | "
+                  f"QED {np.mean(qed):.4f} | SA {np.mean(sa):.4f} | quality {np.mean(quality):.4f} | ")
+    def on_fit_end(self):
+        """Called at the end of training - save results."""
+        if self.global_rank == 0:
+            # Save logs and plot
+            base_path = self.args.base_path
+            plot_path = f'{base_path}/results/{self.args.run_name}'
+            os.makedirs(plot_path, exist_ok=True)
+            output_log_path = f'{plot_path}/log_{self.filename}.csv'
+            save_logs_to_file(self.valid_fraction_log, self.uniqueness_log,
+                              self.diversity_log, self.qed_log, self.sa_log,
+                              self.quality_log, output_log_path)
+            # Final generation
+            x_eval, qed, sa, valid_fraction, uniqueness, diversity, quality, df = \
+                sample_mol_eval(
+                    self.policy_model, self.reward_model,
+                    self.tokenizer,
+                    steps=self.args.total_num_steps,
+                    mask=self.policy_model.interpolant.mask_token,
+                    pad=self.policy_model.interpolant.pad_token,
+                    batch_size=50,
+                    max_length=self.args.max_length,
+                    quality_mode=self._get_quality_mode(),
+                    num_remasking=self.args.num_remasking,
+                    evaluator=self.evaluator,
+                    dataframe=True,
+                )
+            df.to_csv(f'{plot_path}/mol_generation_results.csv', index=False)
+def save_logs_to_file(valid_fraction_log, uniqueness_log,
+                      diversity_log, qed_log, sa_log,
+                      quality_log, output_path):
+    """
+    Saves the logs to a CSV file.
+    """
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    log_data = {
+        "Iteration": list(range(1, len(valid_fraction_log) + 1)),
+        "Valid Fraction": valid_fraction_log,
+        "Uniqueness": uniqueness_log,
+        "Diversity": diversity_log,
+        "QED": qed_log,
+        "Synthetic Accessibility": sa_log,
+        "Quality": quality_log
+    }
+    df = pd.DataFrame(log_data)
+    df.to_csv(output_path, index=False)
+class DummyDataset(torch.utils.data.Dataset):
+    """Dummy dataset for Lightning trainer (we use buffer instead)."""
+    def __init__(self, size=100):
+        self.size = size
+    def __len__(self):
+        return self.size
+    def __getitem__(self, idx):
+        return torch.zeros(1)  # Dummy data
+def main():
+    """Main entry point for distributed training."""
+    # Disable DDP optimizer for higher-order ops like flex_attention
+    import torch._dynamo
+    torch._dynamo.config.optimize_ddp = False
+    argparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    argparser.add_argument('--base_path', type=str, default=REPO_ROOT)
+    argparser.add_argument('--learning_rate', type=float, default=1e-4)
+    argparser.add_argument('--num_epochs', type=int, default=100)
+    argparser.add_argument('--num_accum_steps', type=int, default=4)
+    argparser.add_argument('--truncate_steps', type=int, default=50)
+    argparser.add_argument("--truncate_kl", type=str2bool, default=False)
+    argparser.add_argument('--gumbel_temp', type=float, default=1.0)
+    argparser.add_argument('--gradnorm_clip', type=float, default=1.0)
+    argparser.add_argument('--batch_size', type=int, default=50)
+    argparser.add_argument('--name', type=str, default='debug')
+    argparser.add_argument('--total_num_steps', type=int, default=128)
+    argparser.add_argument('--copy_flag_temp', type=float, default=None)
+    argparser.add_argument('--save_every_n_epochs', type=int, default=10)
+    argparser.add_argument('--eval_every_n_epochs', type=int, default=5, help='Evaluate only every N epochs to save time')
+    argparser.add_argument('--alpha_schedule_warmup', type=int, default=0)
+    argparser.add_argument("--seed", type=int, default=0)
+    # new
+    argparser.add_argument('--run_name', type=str, default='mol')
+    argparser.add_argument("--save_path_dir", default="", type=str)
+    # mcts
+    argparser.add_argument('--num_sequences', type=int, default=10)
+    argparser.add_argument('--max_length', type=int, default=1024)
+    argparser.add_argument('--num_children', type=int, default=50)
+    argparser.add_argument('--num_iter', type=int, default=30) # iterations of mcts
+    argparser.add_argument('--seq_length', type=int, default=1024)
+    argparser.add_argument('--time_conditioning', action='store_true', default=False)
+    argparser.add_argument('--mcts_sampling', type=int, default=0) # for batched categorical sampling: '0' means gumbel noise
+    argparser.add_argument('--buffer_size', type=int, default=100)
+    argparser.add_argument('--wdce_num_replicates', type=int, default=16)
+    argparser.add_argument('--noise_removal', action='store_true', default=False)
+    argparser.add_argument('--grad_clip', action='store_true', default=False)
+    argparser.add_argument('--resample_every_n_step', type=int, default=3)
+    argparser.add_argument('--exploration', type=float, default=0.1)
+    argparser.add_argument('--reset_every_n_step', type=int, default=100)
+    argparser.add_argument('--alpha', type=float, default=0.01)
+    argparser.add_argument('--scalarization', type=str, default='sum')
+    argparser.add_argument('--no_mcts', action='store_true', default=False)
+    argparser.add_argument("--centering", action='store_true', default=False)
+    argparser.add_argument("--centering_strength", type=float, default=1.0)
+    # adaptive schedule parameters
+    argparser.add_argument('--use_adaptive_schedule', action='store_true', default=True)
+    argparser.add_argument('--schedule_hidden_dim', type=int, default=256)
+    argparser.add_argument('--schedule_num_layers', type=int, default=2)
+    argparser.add_argument('--schedule_loss_weight', type=float, default=0.1)
+    argparser.add_argument('--adaptive_threshold', type=float, default=0.5)
+    argparser.add_argument('--freeze_base_model', action='store_true', default=False)
+    argparser.add_argument('--schedule_warmup_epochs', type=int, default=20, help='Number of initial epochs to train WITHOUT remasking in buffer generation')
+    argparser.add_argument('--alternation_frequency', type=int, default=5, help='Number of epochs to train each model before alternating (1=alternate every epoch)')
+    argparser.add_argument('--planner_learning_rate', type=float, default=None, help='Separate learning rate for planner heads (defaults to --learning_rate if not set)')
+    # objectives
+    argparser.add_argument('--num_obj', type=int, default=2)
+    argparser.add_argument('--devices', type=int, default=-1)
+    argparser.add_argument('--checkpoint_path', type=str, default=None)
+    # ELBO-based log_rnd estimation
+    argparser.add_argument('--elbo_rnd', action='store_true', default=False,
+        help='If set, compute log_rnd via forward ELBO instead of trajectory rollout')
+    argparser.add_argument('--elbo_rnd_num_samples', type=int, default=4,
+        help='Number of noisy time samples per sequence for ELBO-based log_rnd estimation')
+    # remasking
+    argparser.add_argument('--num_remasking', type=int, default=5)
+    argparser.add_argument('--quality_threshold', type=float, default=1)
+    argparser.add_argument('--use_quality_filter', action='store_true', help='If set, filter buffer to only include molecules with QED>=0.6 and SA<=4')
+    argparser.add_argument('--training_mini_batch_size', type=int, default=8, help='Mini-batch size for training step to avoid OOM')
+    argparser.add_argument('--disable_planner', action='store_true', help='If set, disable remasking completely and only train policy (not planner) for quality optimization')
+    argparser.add_argument('--disable_insertion_planner', action='store_true', help='Ablation: disable insertion quality filtering but keep unmasking/remasking planner')
+    argparser.add_argument('--disable_unmasking_planner', action='store_true', help='Ablation: disable unmasking/remasking planner but keep insertion quality filtering')
+    argparser.add_argument('--joint_training', action='store_true', help='Ablation: train policy and planner jointly each step (no alternation, both unfrozen, summed loss). Incompatible with --disable_planner.')
+    argparser.add_argument('--qed_only', action='store_true', help='If set, optimize only for QED score (no SA)')
+    argparser.add_argument('--softmax_temperature', type=float, default=1.0,
+        help='Temperature for softmax on importance weights (>1 smooths, prevents concentration)')
+    argparser.add_argument('--pool_size', type=int, default=0,
+        help='If >0, maintain a persistent pool of this size and refresh a fraction each resample step (0=disabled, classic buffer)')
+    argparser.add_argument('--pool_refresh_fraction', type=float, default=0.2,
+        help='Fraction of pool to replace each resample step (only used when pool_size>0)')
+    argparser.add_argument('--num_training_steps_per_epoch', type=int, default=10,
+        help='Number of gradient updates per epoch (1=original, 10=recommended)')
+    args = argparser.parse_args()
+    # Default planner LR to policy LR if not specified
+    if args.planner_learning_rate is None:
+        args.planner_learning_rate = args.learning_rate
+    # Set seed
+    pl.seed_everything(args.seed)
+    # Load models
+    checkpoint_path = args.checkpoint_path if args.checkpoint_path else \
+        os.path.join(REPO_ROOT, 'pretrained', 'anylength_mol.ckpt')
+    curr_time = datetime.now().strftime("%Y%m%d_%H%M%S")
+    if args.no_mcts:
+        args.run_name = f'mol_al_resample{args.resample_every_n_step}_no-mcts_{curr_time}'
+    else:
+        args.run_name = f'mol_al_resample{args.resample_every_n_step}_buffer{args.buffer_size}_numiter{args.num_iter}_children{args.num_children}_{curr_time}'
+    # append ablation tags to run name for easy identification
+    if args.disable_planner:
+        args.run_name += '_no_planner'
+    if args.disable_insertion_planner:
+        args.run_name += '_no_insertion_planner'
+    if args.disable_unmasking_planner:
+        args.run_name += '_no_unmasking_planner'
+    if args.joint_training:
+        if args.disable_planner:
+            raise ValueError("--joint_training is incompatible with --disable_planner (no planner to train)")
+        args.run_name += '_joint_training'
+    args.save_path = os.path.join(args.save_path_dir, args.run_name)
+    os.makedirs(args.save_path, exist_ok=True)
+    set_seed(args.seed, use_cuda=False)  # Don't init CUDA before Lightning spawns DDP workers
+    # Initialize the model
+    print("Loading models..")
+    # Load pretrained model for reference (frozen)
+    pretrained = AnyOrderInsertionFlowModule.load_from_checkpoint(checkpoint_path,
+                                                map_location='cpu',
+                                                weights_only=False)
+    pretrained.eval()
+    for param in pretrained.parameters():
+        param.requires_grad = False
+    # Load checkpoint to extract config
+    checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)
+    if 'hyper_parameters' in checkpoint:
+        config = checkpoint['hyper_parameters']['config']
+    elif 'config' in checkpoint:
+        config = checkpoint['config']
+    else:
+        raise ValueError("Cannot find config in checkpoint")
+    # Update config for adaptive schedule
+    from omegaconf import OmegaConf
+    if not OmegaConf.is_config(config):
+        from omegaconf import DictConfig
+        config = DictConfig(config)
+    OmegaConf.set_struct(config, False)
+    config.training.use_adaptive_schedule = args.use_adaptive_schedule
+    config.training.schedule_hidden_dim = args.schedule_hidden_dim
+    config.training.schedule_num_layers = args.schedule_num_layers
+    config.training.schedule_loss_weight = args.schedule_loss_weight
+    config.training.freeze_base_model = args.freeze_base_model
+    config.training.schedule_warmup_epochs = args.schedule_warmup_epochs
+    config.training.use_bracket_safe = True
+    OmegaConf.set_struct(config, True)
+    # initialize policy model with adaptive schedule
+    policy_model = AnyOrderInsertionFlowModuleFT(
+        config=config,
+        args=args,
+        pretrained_checkpoint=checkpoint_path,
+        insertion_planner=True,
+    )
+    # define mcts
+    if args.qed_only:
+        score_func_names = ['qed']
+    else:
+        score_func_names = ['qed', 'sa']
+    tokenizer = get_tokenizer()
+    filename = args.run_name
+    # Device will be set by Lightning automatically in DDP
+    reward_model = MolScoringFunctions(score_func_names, device='cpu')
+    model = MolFinetuner(
+        args=args,
+        policy_model=policy_model,
+        reward_model=reward_model,
+        tokenizer=tokenizer,
+        pretrained=pretrained,
+        mcts=None,
+        filename=filename,
+    )
+    checkpoint_callback = ModelCheckpoint(
+        dirpath=args.save_path,
+        filename='model-{epoch:02d}-{train_loss:.4f}',
+        every_n_epochs=args.save_every_n_epochs,
+        save_top_k=-1,  # Save all checkpoints
+        save_last=True,  # Also save last.ckpt
+        auto_insert_metric_name=False
+    )
+    # Defaults to your default wandb entity; override with the WANDB_ENTITY env var.
+    wandb_logger = WandbLogger(entity=os.environ.get('WANDB_ENTITY'), project='a2d2-mol', name=args.run_name)
+    # create dummy dataloader
+    dataset = DummyDataset(size=args.num_training_steps_per_epoch)
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1)
+    # setup trainer with DDP
+    trainer = pl.Trainer(
+        max_epochs=args.num_epochs,
+        accelerator='gpu',
+        devices=args.devices,
+        strategy=DDPStrategy(find_unused_parameters=True) if args.devices != 1 else 'auto',
+        gradient_clip_val=args.gradnorm_clip if args.grad_clip else None,
+        logger=wandb_logger,
+        callbacks=[checkpoint_callback],
+        enable_progress_bar=True,
+        log_every_n_steps=1
+    )
+    # Train
+    trainer.fit(model, dataloader)
+if __name__ == '__main__':
+    main()

a2d2_mol/inference_quality_mol.py ADDED Viewed

	@@ -0,0 +1,554 @@

+"""Unified molecule sampling with quality-guided planning.
+Supports 4 quality modes and optional RND (importance weight) computation.
+Quality modes:
+    "none"            - No planner, no remasking (policy-only)
+    "both"            - Both unmasking + insertion planners active
+    "unmasking_only"  - Only unmasking/remasking planner (insertion planner disabled)
+    "insertion_only"  - Only insertion planner (unmasking planner disabled)
+RND toggle:
+    compute_rnd=True  - Run pretrained model in parallel, compute step-wise log importance weights
+    compute_rnd=False - Run policy model only (use with ELBO-based RND or eval)
+"""
+import torch
+import numpy as np
+import pandas as pd
+import torch.nn.functional as F
+from sampling import SamplingResult, SamplingTraceDatapoint, _sample_tokens
+from remasking_scheduleaware import apply_schedule_aware_remasking, apply_schedule_aware_insertion
+from mol_utils.utils_chem import batch_safe_to_smiles, batch_validate_and_extract
+from tdc import Evaluator, Oracle
+QUALITY_MODES = {"none", "both", "unmasking_only", "insertion_only"}
+@torch.no_grad()
+def _diffusion_loop(
+    model, steps, mask, pad, batch_size, max_length,
+    quality_mode="both",
+    compute_rnd=False,
+    pretrained=None,
+    remasking_mode="schedule_aware",
+    num_remasking=1,
+    quality_threshold=1,
+    temperature=1.0,
+    return_trace=False,
+    unmask_quality_threshold=None,
+):
+    """Core discrete diffusion sampling loop for molecule generation.
+    Args:
+        model: Finetuned policy model.
+        steps: Number of diffusion steps.
+        mask: Mask token ID.
+        pad: Pad token ID.
+        batch_size: Number of sequences to generate.
+        max_length: Maximum sequence length.
+        quality_mode: One of "none", "both", "unmasking_only", "insertion_only".
+        compute_rnd: Whether to compute step-wise log importance weights.
+        pretrained: Frozen pretrained model (required if compute_rnd=True).
+        remasking_mode: Remasking strategy ("schedule_aware", "remdm", "remdm_conf").
+        num_remasking: Number of tokens to remask per step.
+        quality_threshold: Threshold for insertion quality filtering. None if schedule-driven.
+        temperature: Sampling temperature (1.0 = no scaling).
+        return_trace: Whether to record sampling trace.
+    Returns:
+        (xt, log_rnd, sampling_trace)
+        log_rnd is None when compute_rnd=False.
+    """
+    assert quality_mode in QUALITY_MODES, f"quality_mode must be one of {QUALITY_MODES}"
+    if compute_rnd:
+        assert pretrained is not None, "pretrained model required when compute_rnd=True"
+    # Derive flags from quality_mode
+    use_remasking = quality_mode != "none"
+    disable_unmasking_planner = quality_mode in ("none", "insertion_only")
+    disable_insertion_planner = quality_mode in ("none", "unmasking_only")
+    device = next(model.parameters()).device
+    # Initialize all-pad sequence
+    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)
+    dt = 1.0 / steps
+    t = torch.zeros(batch_size, device=device)
+    # Precompute index tensors
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    sampling_trace = [[] for _ in range(batch_size)] if return_trace else None
+    neg_inf = torch.tensor(-np.inf, device=device)
+    if use_remasking and remasking_mode == "remdm_conf":
+        remasking_score = torch.zeros((batch_size, max_length), device=device)
+    log_rnd = None
+    for i in range(steps):
+        # --- Policy model forward ---
+        pred_rate = model(xt, t)
+        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+        len_rate = pred_rate.length_rate  # (B, L+1)
+        # --- Pretrained model forward (for RND) ---
+        if compute_rnd:
+            pretrained_pred = pretrained(xt, t)
+            pretrained_rate = pretrained.interpolant.to_actual_rate(xt, pretrained_pred, t)
+            pretrained_unmask_rate = pretrained_rate.unmask_rate.clone()  # (B, L, V)
+            pretrained_len_rate = pretrained_rate.length_rate  # (B, L+1)
+        # --- Unmask step (Euler) ---
+        mask_pos = (xt == mask).nonzero(as_tuple=True)
+        unmask_rate[xt != mask] = 0
+        unmask_rate[mask_pos + (mask,)] = 0
+        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+        if compute_rnd:
+            pretrained_unmask_rate[xt != mask] = 0
+            pretrained_unmask_rate[mask_pos + (mask,)] = 0
+            pretrained_unmask_rate[mask_pos + (mask,)] = -pretrained_unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+            pretrained_trans_prob = (pretrained_unmask_rate * dt).clamp(0.0, 1.0)
+        # Add "stay" probability
+        _xt = xt.clone()
+        _xt[xt == pad] = mask
+        trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+        )
+        if compute_rnd:
+            pretrained_trans_prob.scatter_add_(
+                2,
+                _xt.unsqueeze(-1),
+                torch.ones_like(_xt.unsqueeze(-1), dtype=pretrained_trans_prob.dtype),
+            )
+        # Temperature scaling
+        if temperature != 1.0:
+            logits = torch.log(trans_prob + 1e-10) / temperature
+            trans_prob = torch.softmax(logits, dim=-1)
+        # Final step: remove mask token from sampling
+        if i == steps - 1:
+            print("Final step, removing mask token from sampling")
+            trans_prob[mask_pos + (mask,)] = 0.0
+            prob_sum = trans_prob[mask_pos].sum(dim=-1, keepdim=True)
+            mask_has_zero_prob = (prob_sum.squeeze(-1) == 0.0)
+            if mask_has_zero_prob.any():
+                num_zero_prob = mask_has_zero_prob.sum().item()
+                uniform_prob = torch.zeros((num_zero_prob, trans_prob.shape[-1]), device=device, dtype=trans_prob.dtype)
+                uniform_prob[:, :mask] = 1.0 / mask
+                trans_prob[mask_pos[0][mask_has_zero_prob], mask_pos[1][mask_has_zero_prob]] = uniform_prob
+            else:
+                trans_prob[mask_pos] = trans_prob[mask_pos] / prob_sum
+        new_xt = _sample_tokens(trans_prob)
+        new_xt[xt == pad] = pad
+        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+        # Update remasking_score buffer for remdm_conf mode
+        if use_remasking and remasking_mode == "remdm_conf" and i < steps - 1:
+            token_probs = F.softmax(unmask_rate, dim=-1)  # (B, L, V)
+            chosen_probs = torch.gather(token_probs, dim=-1, index=new_xt.unsqueeze(-1)).squeeze(-1)  # (B, L)
+            changed_mask_to_token = (xt == mask) & (new_xt != mask) & (new_xt != pad)
+            remasking_score = torch.where(changed_mask_to_token, chosen_probs, remasking_score)
+        # --- Remasking step ---
+        if use_remasking and i < steps - 1:
+            if disable_unmasking_planner or not (hasattr(model, 'planner') and model.planner is not None):
+                remasking_conf = torch.zeros((batch_size, max_length), device=device)
+            else:
+                planner_out = model.planner(new_xt, t)
+                remasking_conf = planner_out["remasking_conf"].squeeze(-1)  # (B, L)
+            clean_index = (new_xt != mask) & (new_xt != pad)  # (B, L)
+            if remasking_mode == "schedule_aware":
+                new_xt = apply_schedule_aware_remasking(
+                    model, new_xt, t, dt, remasking_conf, clean_index,
+                    mask, neg_inf, batch_size,
+                    unmask_quality_threshold=unmask_quality_threshold,
+                )
+                remasking_score_temp = None
+            else:
+                raise ValueError(f"Unknown remasking_mode: {remasking_mode}")
+            if remasking_score_temp is not None:
+                remasking_score_temp = torch.where(clean_index, remasking_score_temp, neg_inf)
+                for j in range(batch_size):
+                    k = min(num_remasking, int(clean_index[j].sum().item()))
+                    if k > 0:
+                        _, select_indices = torch.topk(remasking_score_temp[j], k=k)
+                        new_xt[j, select_indices] = mask
+            if return_trace:
+                for batch_idx in range(batch_size):
+                    for pos in range(max_length):
+                        if clean_index[batch_idx, pos] and new_xt[batch_idx, pos] == mask:
+                            sampling_trace[batch_idx].append(
+                                SamplingTraceDatapoint(
+                                    t=t[batch_idx].item(),
+                                    event_type="change",
+                                    position=pos,
+                                    token=mask,
+                                )
+                            )
+        # --- Compute log probabilities for RND ---
+        if compute_rnd:
+            lp = torch.gather(torch.log(trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+            lp_pre = torch.gather(torch.log(pretrained_trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+            changed_mask = (xt == mask) & (new_xt != mask) & (new_xt != pad)
+            log_policy_step = (lp * changed_mask).sum(dim=1)
+            log_pretrained_step = (lp_pre * changed_mask).sum(dim=1)
+            log_rnd = log_pretrained_step - log_policy_step  # (B,)
+        # --- Insertion step ---
+        if i != steps - 1:
+            ext = torch.poisson(len_rate * dt).long()  # (B, L+1)
+            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
+            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+            total_ext = ext.sum(dim=1)
+            valid = xt_len + total_ext <= max_length
+            ext = ext * valid.view(batch_size, 1).long()
+            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+            new_len = xt_len + total_ext  # (B,)
+            xt_tmp = torch.full_like(xt, pad)
+            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
+            xt_tmp[mask_fill] = mask
+            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+            flat_b = batch_idx_L[orig_mask]
+            flat_p = new_pos_orig[orig_mask]
+            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+            # Schedule-aware insertion quality filtering
+            if use_remasking and not disable_insertion_planner:
+                if compute_rnd:
+                    xt_tmp_before = xt_tmp.clone()
+                xt_tmp = apply_schedule_aware_insertion(
+                    model, xt_tmp, new_xt, t, dt, ext, mask, pad, max_length,
+                    orig_mask, new_pos_orig, quality_threshold
+                )
+                if compute_rnd:
+                    # Compute corrected ext based on what actually stayed
+                    ext_corrected = torch.zeros_like(ext)
+                    for b in range(batch_size):
+                        after_len = xt_tmp[b].ne(pad).sum().item()
+                        orig_len = xt_len[b].item()
+                        surviving_insertions = after_len - orig_len
+                        if total_ext[b] > 0:
+                            ratio = surviving_insertions / total_ext[b].item()
+                            ext_corrected[b] = (ext[b].float() * ratio).long()
+                else:
+                    ext_corrected = ext
+            else:
+                ext_corrected = ext
+            # Compute insertion log_rnd
+            if compute_rnd:
+                insertion_rate = (len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+                pretrained_insertion_rate = (pretrained_len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+                log_policy_insert = (ext_corrected * torch.log(insertion_rate) - insertion_rate).sum(dim=1)
+                log_pretrained_insert = (ext_corrected * torch.log(pretrained_insertion_rate) - pretrained_insertion_rate).sum(dim=1)
+                log_insert_diff = log_pretrained_insert - log_policy_insert
+                log_rnd += log_insert_diff
+        else:
+            xt_tmp = new_xt
+        if return_trace:
+            for batch_idx in range(batch_size):
+                for j in range(max_length):
+                    if xt[batch_idx, j] != pad and xt[batch_idx, j] != new_xt[batch_idx, j]:
+                        sampling_trace[batch_idx].append(
+                            SamplingTraceDatapoint(
+                                t=t[batch_idx].item(),
+                                event_type="change",
+                                position=j,
+                                token=new_xt[batch_idx, j].item(),
+                            )
+                        )
+                if i != steps - 1:
+                    for j in range(max_length):
+                        id = max_length - j - 1
+                        if ext[batch_idx, id]:
+                            sampling_trace[batch_idx].append(
+                                SamplingTraceDatapoint(
+                                    t=t[batch_idx].item(),
+                                    event_type="insertion",
+                                    position=id,
+                                    token=mask,
+                                )
+                            )
+        xt = xt_tmp
+        t = t + dt
+    return xt, log_rnd, sampling_trace
+def _decode_and_validate(model, tokenizer, samples):
+    """Decode token IDs to SMILES and validate.
+    Returns:
+        (validSequences, valid_indices): list of valid SMILES, list of batch indices.
+    """
+    decoded_samples = tokenizer.batch_decode(samples, skip_special_tokens=True)
+    use_bracket_safe = model.config.training.get('use_bracket_safe', False)
+    smiles_samples = batch_safe_to_smiles(decoded_samples, use_bracket_safe=use_bracket_safe, fix=True)
+    # Extract valid sequences (take largest fragment)
+    validSequences = []
+    valid_indices = []
+    for idx, s in enumerate(smiles_samples):
+        if s:
+            largest_frag = sorted(s.split('.'), key=len)[-1]
+            validSequences.append(largest_frag)
+            valid_indices.append(idx)
+    return validSequences, valid_indices
+@torch.no_grad()
+def sample_mol_buffer(
+    model, pretrained, reward_model, tokenizer,
+    steps, mask, pad, batch_size, max_length,
+    quality_mode="both",
+    alpha=0.1,
+    remasking_mode="schedule_aware",
+    num_remasking=1,
+    quality_threshold=1,
+    temperature=1.0,
+    use_quality_filter=True,
+):
+    """Generate molecules for training buffer. Always computes step-wise RND.
+    Args:
+        model: Finetuned policy model.
+        pretrained: Frozen pretrained model.
+        reward_model: Molecule scoring function.
+        tokenizer: SAFE tokenizer for decoding.
+        steps: Number of diffusion steps.
+        mask: Mask token ID.
+        pad: Pad token ID.
+        batch_size: Number of sequences to generate.
+        max_length: Maximum sequence length.
+        quality_mode: "none", "both", "unmasking_only", or "insertion_only".
+        alpha: RND scaling factor.
+        remasking_mode: Remasking strategy.
+        num_remasking: Number of tokens to remask per step.
+        quality_threshold: Threshold for insertion quality filtering. None if schedule-driven.
+        temperature: Sampling temperature.
+        use_quality_filter: If True, filter to QED>=0.6 and SA<=4.
+    Returns:
+        (valid_x, log_rnd, scalar_rewards, sampling_trace)
+    """
+    xt, log_rnd, trace = _diffusion_loop(
+        model, steps, mask, pad, batch_size, max_length,
+        quality_mode=quality_mode,
+        compute_rnd=True,
+        pretrained=pretrained,
+        remasking_mode=remasking_mode,
+        num_remasking=num_remasking,
+        quality_threshold=quality_threshold,
+        temperature=temperature,
+    )
+    device = xt.device
+    samples = xt.to(device)
+    validSequences, valid_indices = _decode_and_validate(model, tokenizer, samples)
+    valid_x_final = [samples[idx] for idx in valid_indices]
+    valid_log_rnd = [log_rnd[idx] for idx in valid_indices]
+    print("len valid sequences:", len(validSequences))
+    if len(validSequences) == 0:
+        print("[WARNING] No valid molecules generated in this batch")
+        empty_x = torch.empty((0, max_length), dtype=torch.long, device=device)
+        empty_log_rnd = torch.empty((0,), dtype=torch.float32, device=device)
+        empty_rewards = torch.empty((0,), dtype=torch.float32, device=device)
+        return empty_x, empty_log_rnd, empty_rewards, trace
+    # Compute multi-objective rewards
+    score_vectors = reward_model(input_seqs=validSequences)
+    scalar_rewards = np.sum(score_vectors, axis=-1)
+    scalar_rewards = torch.as_tensor(scalar_rewards, dtype=torch.float32, device=device)
+    print(f"scalar reward dim{len(scalar_rewards)}")
+    valid_log_rnd = torch.stack(valid_log_rnd, dim=0)
+    log_rnd = valid_log_rnd + (scalar_rewards / alpha)
+    valid_x_final = torch.stack(valid_x_final, dim=0)
+    # Optionally filter to only keep quality sequences (QED >= 0.6 and SA <= 4)
+    if use_quality_filter:
+        qed_scores = score_vectors[:, 0]
+        if score_vectors.shape[1] > 1:
+            sa_scores = score_vectors[:, 1]
+        else:
+            _oracle_sa = Oracle('sa')
+            raw_sa = np.array(_oracle_sa(validSequences))
+            sa_scores = raw_sa
+        quality_mask = (qed_scores >= 0.6) & (sa_scores <= 4)
+        n_quality = quality_mask.sum()
+        print(f"Quality filtering: {n_quality}/{len(validSequences)} sequences pass (QED>=0.6, SA<=4)")
+        if n_quality == 0:
+            print("[WARNING] No quality molecules in this batch")
+            empty_x = torch.empty((0, max_length), dtype=torch.long, device=device)
+            empty_log_rnd = torch.empty((0,), dtype=torch.float32, device=device)
+            empty_rewards = torch.empty((0,), dtype=torch.float32, device=device)
+            return empty_x, empty_log_rnd, empty_rewards, trace
+        quality_mask_torch = torch.as_tensor(quality_mask, dtype=torch.bool, device=device)
+        quality_x_final = valid_x_final[quality_mask_torch]
+        quality_log_rnd = log_rnd[quality_mask_torch]
+        quality_rewards = scalar_rewards[quality_mask_torch]
+    else:
+        print(f"No quality filtering applied - using all {len(validSequences)} valid molecules")
+        quality_x_final = valid_x_final
+        quality_log_rnd = log_rnd
+        quality_rewards = scalar_rewards
+    return quality_x_final, quality_log_rnd, quality_rewards, trace
+@torch.no_grad()
+def sample_mol_eval(
+    model, reward_model, tokenizer,
+    steps, mask, pad, batch_size, max_length,
+    quality_mode="both",
+    remasking_mode="schedule_aware",
+    num_remasking=1,
+    quality_threshold=1,
+    temperature=1.0,
+    evaluator=None,
+    dataframe=False,
+    unmask_quality_threshold=None,
+):
+    """Generate molecules for evaluation.
+    Args:
+        model: Finetuned policy model.
+        reward_model: Molecule scoring function.
+        tokenizer: SAFE tokenizer for decoding.
+        steps: Number of diffusion steps.
+        mask: Mask token ID.
+        pad: Pad token ID.
+        batch_size: Number of sequences to generate.
+        max_length: Maximum sequence length.
+        quality_mode: "none", "both", "unmasking_only", or "insertion_only".
+        remasking_mode: Remasking strategy.
+        num_remasking: Number of tokens to remask per step.
+        quality_threshold: Threshold for insertion quality filtering. Pass None
+            to use schedule-driven deletion with no threshold gate
+        temperature: Sampling temperature.
+        evaluator: TDC Evaluator for diversity (created if None).
+        dataframe: If True, include a pandas DataFrame in the return.
+    Returns:
+        Without dataframe:
+            (validSequences, qed, sa, uniqueness, diversity, quality, valid_fraction)
+        With dataframe:
+            (validSequences, qed, sa, valid_fraction, uniqueness, diversity, quality, df)
+        validSequences is the raw list including duplicates; qed/sa are scored
+        on the unique set. Caller can dedup with set(validSequences). The
+        dataframe (when requested) has one row per unique molecule.
+    """
+    if evaluator is None:
+        evaluator = Evaluator('diversity')
+    xt, _, trace = _diffusion_loop(
+        model, steps, mask, pad, batch_size, max_length,
+        quality_mode=quality_mode,
+        compute_rnd=False,
+        remasking_mode=remasking_mode,
+        num_remasking=num_remasking,
+        quality_threshold=quality_threshold,
+        temperature=temperature,
+        unmask_quality_threshold=unmask_quality_threshold,
+    )
+    device = xt.device
+    samples = xt.to(device)
+    decoded_samples = tokenizer.batch_decode(samples, skip_special_tokens=True)
+    use_bracket_safe = model.config.training.get('use_bracket_safe', False)
+    smiles_samples = batch_safe_to_smiles(decoded_samples, use_bracket_safe=use_bracket_safe, fix=True)
+    # Extract valid sequences (take largest fragment)
+    validSequences = [sorted(s.split('.'), key=len)[-1] for s in smiles_samples if s]
+    print("len valid sequences:", len(validSequences))
+    valid_fraction = len(validSequences) / batch_size
+    uniqueSequences = list(set(validSequences))
+    uniqueness = len(uniqueSequences) / len(validSequences) if len(validSequences) > 0 else 0
+    diversity = evaluator(uniqueSequences) if len(uniqueSequences) > 0 else 0
+    # Calculate quality (unique sequences with QED >= 0.6 and SA <= 4)
+    if len(uniqueSequences) > 0:
+        score_vectors_temp = reward_model(input_seqs=list(uniqueSequences))
+        qed_scores = score_vectors_temp[:, 0]  # Raw QED (0-1)
+        # Always use raw SA (1-10 scale) for quality filtering
+        _oracle_sa = Oracle('sa')
+        raw_sa_scores = np.array(_oracle_sa(list(uniqueSequences)))
+        quality_count = sum((qed_scores >= 0.6) & (raw_sa_scores <= 4))
+        quality = quality_count / batch_size
+        print(f'Quality:\t{quality}')
+        qed = qed_scores
+        sa = raw_sa_scores
+    else:
+        zeros = [0.0]
+        qed = zeros
+        sa = zeros
+        quality = 0.0
+    if dataframe:
+        df = pd.DataFrame({
+            "Mol Sequence": uniqueSequences,
+            "QED": qed if len(uniqueSequences) else [0.0],
+            "SA": sa if len(uniqueSequences) else [0.0],
+        })
+        return validSequences, qed, sa, valid_fraction, uniqueness, diversity, quality, df
+    return validSequences, qed, sa, uniqueness, diversity, quality, valid_fraction

a2d2_mol/mol_dataset.py ADDED Viewed

	@@ -0,0 +1,379 @@

+#!/usr/bin/env python
+"""
+Adapter to use HuggingFace datasets with the any-length discrete diffusion model.
+This module converts HuggingFace datasets (like datamol-io/safe-drugs) into the format
+expected by the training pipeline.
+"""
+import torch
+from torch.utils.data import Dataset, DataLoader
+from datasets import load_dataset
+import pytorch_lightning as pl
+from safe.tokenizer import SAFETokenizer
+from mol_utils.bracket_safe_converter import safe2bracketsafe
+from typing import Optional, List
+import re
+def get_tokenizer():
+    """Get SAFE tokenizer with added special tokens."""
+    tk = SAFETokenizer.from_pretrained('datamol-io/safe-gpt').get_pretrained()
+    tk.add_tokens(['<', '>'])   # for bracket_safe
+    return tk
+class Collator:
+    """Data collator for SAFE/bracket-SAFE format."""
+    def __init__(self, config, tokenizer=None):
+        self.tokenizer = tokenizer if tokenizer is not None else get_tokenizer()
+        self.max_length = config.interpolant.max_length
+        self.use_bracket_safe = config.training.get('use_bracket_safe', False)
+    def __call__(self, examples):
+        # Handle both dict with 'labels' and direct string format
+        inputs = []
+        for example in examples:
+            if isinstance(example, dict):
+                # Try different key names: 'input', 'labels', 'smiles'
+                input_text = example.get('input', example.get('labels', example.get('smiles', '')))
+            else:
+                input_text = example
+            if self.use_bracket_safe:
+                input_text = safe2bracketsafe(input_text)
+            inputs.append(input_text)
+        batch = self.tokenizer(
+            inputs,
+            return_tensors='pt',
+            padding=True,
+            truncation=True,
+            max_length=self.max_length
+        )
+        # Convert BatchEncoding to plain dict with tensors
+        # Remove token_type_ids if present (not needed for diffusion models)
+        result = {
+            'input_ids': batch['input_ids'],
+            'attention_mask': batch['attention_mask']
+        }
+        return result
+class HFDatasetAdapter(Dataset):
+    """Adapts HuggingFace datasets to the format expected by the diffusion model."""
+    def __init__(self, hf_dataset, tokenizer, smiles_column='smiles', max_length=1024, convert_to_safe=False, is_streaming=False):
+        """
+        Args:
+            hf_dataset: HuggingFace dataset object (streaming or regular)
+            tokenizer: SMILES tokenizer instance
+            smiles_column: Name of the column containing SMILES strings
+            max_length: Maximum sequence length
+            convert_to_safe: Whether to convert SMILES to SAFE format
+            is_streaming: Whether dataset is in streaming mode
+        """
+        self.tokenizer = tokenizer
+        self.smiles_column = smiles_column
+        self.max_length = max_length
+        self.convert_to_safe = convert_to_safe
+        self.is_streaming = is_streaming
+        if is_streaming:
+            # For streaming datasets, we don't pre-load the data
+            self.data = hf_dataset
+            self._length = None  # Unknown length for streaming
+            print(f'Initialized streaming dataset adapter')
+        else:
+            # Store raw data without pre-tokenization (tokenization will happen in collator)
+            print(f'Initializing HF dataset adapter with {len(hf_dataset)} samples...')
+            self.data = []
+            for item in hf_dataset:
+                smiles = item[smiles_column]
+                if smiles:  # Skip empty SMILES
+                    self.data.append({'input': smiles, 'labels': smiles})
+            print(f'Processed {len(self.data)} valid samples')
+    def __len__(self):
+        if self.is_streaming:
+            # Streaming datasets don't have a length
+            # Return a large number to prevent issues with samplers
+            return 10_000_000 if self._length is None else self._length
+        return len(self.data)
+    def __getitem__(self, idx):
+        if self.is_streaming:
+            # For streaming, iteration happens differently
+            raise NotImplementedError("Streaming datasets should be iterated, not indexed")
+        return self.data[idx]
+    def __iter__(self):
+        """Support iteration for streaming datasets."""
+        if self.is_streaming:
+            for item in self.data:
+                smiles = item[self.smiles_column]
+                if smiles:  # Skip empty SMILES
+                    yield {'input': smiles, 'labels': smiles}
+        else:
+            for item in self.data:
+                yield item
+class HFDataModule(pl.LightningDataModule):
+    """PyTorch Lightning DataModule for HuggingFace datasets."""
+    def __init__(
+        self,
+        config,
+        dataset_name: str,
+        tokenizer: SAFETokenizer,
+        smiles_column: str = 'smiles',
+        val_split: float = 0.1,
+        test_split: Optional[float] = None,
+        streaming: bool = True,
+        max_train_samples: Optional[int] = None,
+        max_val_samples: Optional[int] = None,
+    ):
+        """
+        Args:
+            config: Configuration object containing training parameters
+            dataset_name: HuggingFace dataset identifier (e.g., "datamol-io/safe-gpt")
+            tokenizer: SMILES tokenizer instance
+            smiles_column: Name of column containing SMILES strings
+            val_split: Fraction of data to use for validation
+            test_split: Optional fraction of data to use for testing
+            streaming: Whether to use streaming mode (recommended for large datasets)
+            max_train_samples: Maximum number of training samples to use (for non-streaming)
+            max_val_samples: Maximum number of validation samples to use (for non-streaming)
+        """
+        super().__init__()
+        self.config = config
+        self.dataset_name = dataset_name
+        self.tokenizer = tokenizer
+        self.smiles_column = smiles_column
+        self.max_length = config.interpolant.max_length
+        self.batch_size = config.training.per_gpu_batch_size
+        self.num_workers = config.training.get('cpus', 4)
+        self.val_split = val_split
+        self.test_split = test_split
+        self.streaming = streaming
+        self.max_train_samples = max_train_samples
+        self.max_val_samples = max_val_samples
+        self.train_dataset = None
+        self.val_dataset = None
+        self.test_dataset = None
+        # Initialize collator
+        self.collator = Collator(config, tokenizer)
+    def setup(self, stage: Optional[str] = None):
+        """Load and split the dataset."""
+        print(f'Loading dataset: {self.dataset_name} (streaming={self.streaming})')
+        if self.streaming:
+            # Load dataset in streaming mode
+            raw_dataset = load_dataset(self.dataset_name, streaming=True)
+            # Handle different dataset structures
+            if 'train' in raw_dataset:
+                train_stream = raw_dataset['train']
+            else:
+                # If no splits exist, use the entire dataset
+                train_stream = raw_dataset[list(raw_dataset.keys())[0]]
+            # For streaming, we need to manually split train/val
+            # Skip validation samples, then take training samples
+            val_size = int(100000 * self.val_split)  # Assume ~100k samples for val split calculation
+            train_size = 100000 - val_size
+            # Create validation stream (take first val_size samples)
+            val_stream = train_stream.take(val_size)
+            # Create training stream (skip val_size samples, then iterate)
+            train_stream_shifted = train_stream.skip(val_size)
+            # Create adapted datasets
+            self.train_dataset = HFDatasetAdapter(
+                train_stream_shifted,
+                self.tokenizer,
+                self.smiles_column,
+                self.max_length,
+                is_streaming=True
+            )
+            self.val_dataset = HFDatasetAdapter(
+                val_stream,
+                self.tokenizer,
+                self.smiles_column,
+                self.max_length,
+                is_streaming=True
+            )
+            print(f'Streaming dataset initialized - samples will be loaded on-the-fly')
+        else:
+            # Traditional non-streaming mode with full dataset loading
+            raw_dataset = load_dataset(self.dataset_name)
+            # Handle different dataset structures
+            if 'train' in raw_dataset:
+                train_data = raw_dataset['train']
+            else:
+                # If no splits exist, use the entire dataset and split it
+                train_data = raw_dataset[list(raw_dataset.keys())[0]]
+            # Limit samples if specified
+            if self.max_train_samples:
+                train_data = train_data.select(range(min(self.max_train_samples, len(train_data))))
+            # Check if dataset already has validation split
+            if 'validation' in raw_dataset or 'val' in raw_dataset:
+                val_key = 'validation' if 'validation' in raw_dataset else 'val'
+                val_data = raw_dataset[val_key]
+            else:
+                # Create train/val split
+                split_dataset = train_data.train_test_split(test_size=self.val_split, seed=42)
+                train_data = split_dataset['train']
+                val_data = split_dataset['test']
+            # Limit validation samples if specified
+            if self.max_val_samples:
+                val_data = val_data.select(range(min(self.max_val_samples, len(val_data))))
+            # Create test split if requested
+            if self.test_split and 'test' not in raw_dataset:
+                split_dataset = train_data.train_test_split(test_size=self.test_split, seed=42)
+                train_data = split_dataset['train']
+                self.test_dataset = HFDatasetAdapter(
+                    split_dataset['test'],
+                    self.tokenizer,
+                    self.smiles_column,
+                    self.max_length,
+                    is_streaming=False
+                )
+            elif 'test' in raw_dataset:
+                self.test_dataset = HFDatasetAdapter(
+                    raw_dataset['test'],
+                    self.tokenizer,
+                    self.smiles_column,
+                    self.max_length,
+                    is_streaming=False
+                )
+            # Create adapted datasets
+            self.train_dataset = HFDatasetAdapter(
+                train_data,
+                self.tokenizer,
+                self.smiles_column,
+                self.max_length,
+                is_streaming=False
+            )
+            self.val_dataset = HFDatasetAdapter(
+                val_data,
+                self.tokenizer,
+                self.smiles_column,
+                self.max_length,
+                is_streaming=False
+            )
+            print(f'Dataset splits - Train: {len(self.train_dataset)}, Val: {len(self.val_dataset)}')
+            if self.test_dataset:
+                print(f'Test: {len(self.test_dataset)}')
+    def train_dataloader(self):
+        if self.streaming:
+            # Pass streaming dataset directly to DataLoader (HF IterableDataset)
+            # Must use num_workers=0 when using .skip() or .take() operations
+            return DataLoader(
+                self.train_dataset.data,  # Use the raw HF streaming dataset
+                batch_size=self.batch_size,
+                collate_fn=self.collator,
+                num_workers=0,  # Required for streaming with skip/take operations
+                pin_memory=True,
+                shuffle=False,  # Cannot shuffle streaming datasets
+            )
+        else:
+            return DataLoader(
+                self.train_dataset,
+                batch_size=self.batch_size,
+                collate_fn=self.collator,
+                shuffle=True,
+                num_workers=self.num_workers,
+                pin_memory=True,
+                persistent_workers=True if self.num_workers > 0 else False
+            )
+    def val_dataloader(self):
+        if self.streaming:
+            # Pass streaming dataset directly to DataLoader (HF IterableDataset)
+            # Must use num_workers=0 when using .skip() or .take() operations
+            return DataLoader(
+                self.val_dataset.data,  # Use the raw HF streaming dataset
+                batch_size=self.batch_size,
+                collate_fn=self.collator,
+                num_workers=0,  # Required for streaming with skip/take operations
+                pin_memory=True,
+                shuffle=False,  # Cannot shuffle streaming datasets
+            )
+        else:
+            return DataLoader(
+                self.val_dataset,
+                batch_size=self.batch_size,
+                collate_fn=self.collator,
+                shuffle=False,
+                num_workers=self.num_workers,
+                pin_memory=True,
+                persistent_workers=True if self.num_workers > 0 else False
+            )
+    def test_dataloader(self):
+        if self.test_dataset:
+            return DataLoader(
+                self.test_dataset,
+                batch_size=self.batch_size,
+                collate_fn=self.collator,
+                shuffle=False,
+                num_workers=self.num_workers,
+                pin_memory=True,
+                persistent_workers=True if self.num_workers > 0 else False
+            )
+        return None
+def setup_hf_data_and_update_config(config, dataset_name="datamol-io/safe-gpt", smiles_column="smiles", streaming=True):
+    """
+    Setup HuggingFace dataset and update config with token information.
+    Args:
+        config: Hydra config object
+        dataset_name: HuggingFace dataset identifier
+        smiles_column: Name of column containing SMILES strings
+        streaming: Whether to use streaming mode (recommended for large datasets like safe-gpt)
+    Returns:
+        HFDataModule instance
+    """
+    # Initialize tokenizer
+    tokenizer = get_tokenizer()
+    # Update config with tokenizer info
+    config.interpolant.tokens = len(tokenizer)
+    config.interpolant.pad_token = tokenizer.pad_token_id
+    config.interpolant.mask_token = tokenizer.mask_token_id
+    # Create data module
+    data_module = HFDataModule(
+        config=config,
+        dataset_name=dataset_name,
+        tokenizer=tokenizer,
+        smiles_column=smiles_column,
+        val_split=0.1,
+        streaming=streaming,
+    )
+    return data_module

a2d2_mol/mol_scoring/oracle/fpscores.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:24a4392f5c673e79c0446af3c4d8e458293b5fecaa244328e76741ead9d21dbf
+size 9048931

a2d2_mol/mol_scoring/scoring_functions.py ADDED Viewed

	@@ -0,0 +1,68 @@

+from transformers import AutoModelForMaskedLM
+import numpy as np
+from tdc import Oracle, Evaluator
+class MolScoringFunctions:
+    def __init__(self, score_func_names=None, device=None, sa_transform='inverse'):
+        """
+        Class for generating score vectors given generated sequence
+        Args:
+            score_func_names: list of scoring function names to be evaluated
+            score_weights: weights to scale scores (default: 1)
+            sa_transform: how to transform SA scores to higher-is-better ~[0,1]:
+                'inverse' (default): 1/(1+SA)  — range ~0.09-0.5, weak gradient
+                'linear':  (10-SA)/9            — range ~0-1, stronger gradient
+        """
+        if score_func_names is None:
+            # just do unmasking based on validity of peptide bonds
+            self.score_func_names = []
+        else:
+            self.score_func_names = score_func_names
+        self.sa_transform = sa_transform
+        oracle_qed = Oracle('qed')
+        oracle_sa = Oracle('sa')
+        self.all_funcs = {'qed': oracle_qed,
+                          'sa': oracle_sa,
+                          }
+    def forward(self, input_seqs):
+        scores = []
+        for i, score_func in enumerate(self.score_func_names):
+            score = self.all_funcs[score_func](input_seqs)
+            # Transform SA to be maximized and normalized (original SA: 1-10, lower is better)
+            # Convert to: higher is better, normalized to ~0-1 range like QED
+            if score_func == 'sa':
+                if self.sa_transform == 'linear':
+                    score = (10.0 - np.array(score)) / 9.0  # range ~0-1, clipped at 0
+                    score = np.maximum(score, 0.0)
+                else:
+                    score = 1.0 / (1.0 + np.array(score))  # range ~0.09-0.5
+            scores.append(score)
+        # convert to numpy arrays with shape (num_sequences, num_functions)
+        scores = np.float32(scores).T
+        return scores
+    def __call__(self, input_seqs: list):
+        return self.forward(input_seqs)
+def unittest():
+    scoring = MolScoringFunctions(score_func_names=['qed', 'sa'])
+    smiles = ['CCOc1cc(ccc1NC(=O)N[C@@H]2CCCC[C@@H]2O)F']
+    scores = scoring(input_seqs=smiles)
+    print(scores)
+    print(len(scores))
+# Run the smoke test only when this file is executed as a script.
+if __name__ == '__main__':
+    unittest()

a2d2_mol/mol_utils/bracket_safe_converter.py ADDED Viewed

	@@ -0,0 +1,159 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Compatibility patch: if `transformers.utils` has no `auto_docstring`,
+# install a stub under that name before `safe` is imported below
+# (needed by safe.trainer.model in newer safe versions).
+import transformers.utils as _tu
+if not hasattr(_tu, 'auto_docstring'):
+    # The stub accepts any arguments and returns a decorator that hands the
+    # decorated function back unchanged.
+    _tu.auto_docstring = lambda *a, **kw: (lambda fn: fn)
+# NOTE(review): the wildcard import is presumably what brings SAFEConverter,
+# dm, np, re, itertools, Chem and the typing names used below into scope.
+from safe.converter import *
+class BracketSAFEConverter(SAFEConverter):
+    """SAFEConverter subclass whose ``encoder`` writes attachment points as ``<n>`` tokens.
+    The two lines tagged "bracket added" are where attachment numbers are
+    wrapped in angle brackets and where parentheses around such tokens are
+    stripped.
+    """
+    def encoder(
+        self,
+        inp: Union[str, dm.Mol],
+        canonical: bool = True,
+        randomize: Optional[bool] = False,
+        seed: Optional[int] = None,
+        constraints: Optional[List[dm.Mol]] = None,
+        allow_empty: bool = False,
+        rdkit_safe: bool = True,
+    ):
+        """Encode a molecule as a bracket-SAFE string.
+        Args:
+            inp: SMILES string or ``dm.Mol`` to encode.
+            canonical: Passed to ``dm.to_smiles`` for Mol inputs. When True,
+                attachment labels are numbered in sorted order and, unless
+                ``randomize`` is truthy, fragments are ordered by decreasing
+                atom count.
+            randomize: When truthy, fragments are shuffled with a NumPy
+                generator; if ``canonical`` is also False the input is first
+                passed through ``self.randomize``.
+            seed: Seed for that generator.
+            constraints: Optional molecules; a candidate bond whose two atoms
+                both lie inside one substructure match of a constraint is
+                not cut.
+            allow_empty: Forwarded to ``self._fragment``.
+            rdkit_safe: When True, parentheses around a lone one- or
+                two-digit ring-closure label (optionally preceded by up to
+                two bond characters) are removed.
+        Returns:
+            The fragments' SMILES joined by '.', with attachment points
+            written as ``<n>`` (``<%n>`` for n >= 10).
+        """
+        rng = None
+        if randomize:
+            rng = np.random.default_rng(seed)
+            if not canonical:
+                inp = dm.to_mol(inp, remove_hs=False)
+                inp = self.randomize(inp, rng)
+        # From here on the input is handled as a SMILES string.
+        if isinstance(inp, dm.Mol):
+            inp = dm.to_smiles(inp, canonical=canonical, randomize=False, ordered=False)
+        # NOTE: branch_numbers is not referenced again in this method.
+        branch_numbers = self._find_branch_number(inp)
+        mol = dm.to_mol(inp, remove_hs=False)
+        if self.ignore_stereo:
+            mol = dm.remove_stereochemistry(mol)
+        # Atoms with atomic number 0 that are already present get their atom-map
+        # number cleared and a sequential isotope label; bond_map_id ends up as
+        # the first label that is still free.
+        bond_map_id = 1
+        for atom in mol.GetAtoms():
+            if atom.GetAtomicNum() == 0:
+                atom.SetAtomMapNum(0)
+                atom.SetIsotope(bond_map_id)
+                bond_map_id += 1
+        if self.require_hs:
+            mol = dm.add_hs(mol)
+        # Candidate bonds to cut, as pairs of atom indices.
+        matching_bonds = self._fragment(mol, allow_empty=allow_empty)
+        # Atom-index tuples of every match of every constraint.
+        substructed_ignored = []
+        if constraints is not None:
+            substructed_ignored = list(
+                itertools.chain(
+                    *[
+                        mol.GetSubstructMatches(constraint, uniquify=True)
+                        for constraint in constraints
+                    ]
+                )
+            )
+        bonds = []
+        for i_a, i_b in matching_bonds:
+            # if both atoms of the bond are found in a disallowed substructure, we cannot consider them
+            # on the other end, a bond between two substructure to preserved independently is perfectly fine
+            if any((i_a in ignore_x and i_b in ignore_x) for ignore_x in substructed_ignored):
+                continue
+            obond = mol.GetBondBetweenAtoms(i_a, i_b)
+            bonds.append(obond.GetIdx())
+        # Cut the selected bonds; both dummy atoms of cut i carry the label
+        # i + bond_map_id, continuing after the labels assigned above.
+        if len(bonds) > 0:
+            mol = Chem.FragmentOnBonds(
+                mol,
+                bonds,
+                dummyLabels=[(i + bond_map_id, i + bond_map_id) for i in range(len(bonds))],
+            )
+        frags = list(Chem.GetMolFrags(mol, asMols=True))
+        # Fragment order: shuffled when randomizing, otherwise largest first
+        # when canonical, otherwise as returned by GetMolFrags.
+        if randomize:
+            frags = rng.permutation(frags).tolist()
+        elif canonical:
+            frags = sorted(
+                frags,
+                key=lambda x: x.GetNumAtoms(),
+                reverse=True,
+            )
+        frags_str = []
+        for frag in frags:
+            # Root each fragment's SMILES at its first atom that is not a dummy.
+            # NOTE(review): a fragment made only of dummy atoms would raise
+            # IndexError on non_map_atom_idxs[0] - confirm this cannot occur.
+            non_map_atom_idxs = [
+                atom.GetIdx() for atom in frag.GetAtoms() if atom.GetAtomicNum() != 0
+            ]
+            frags_str.append(
+                Chem.MolToSmiles(
+                    frag,
+                    isomericSmiles=True,
+                    canonical=True,  # needs to always be true
+                    rootedAtAtom=non_map_atom_idxs[0],
+                )
+            )
+        scaffold_str = ".".join(frags_str)
+        # don't capture atom mapping in the scaffold
+        attach_pos = set(re.findall(r"(\[\d+\*\]|!\[[^:]*:\d+\])", scaffold_str))
+        if canonical:
+            attach_pos = sorted(attach_pos)
+        # Replace every distinct attachment token by the next number, written
+        # as <n> (or <%n> from 10 upwards).
+        starting_num = 1
+        for attach in attach_pos:
+            val = str(starting_num) if starting_num < 10 else f"%{starting_num}"
+            val = '<' + val + '>'   # bracket added
+            # we cannot have anything of the form "\([@=-#-$/\]*\d+\)"
+            attach_regexp = re.compile(r"(" + re.escape(attach) + r")")
+            scaffold_str = attach_regexp.sub(val, scaffold_str)
+            starting_num += 1
+        # now we need to remove all the parenthesis around digit only number
+        wrong_attach = re.compile(r"\((<[\%\d+]*>)\)")   # bracket added
+        scaffold_str = wrong_attach.sub(r"\g<1>", scaffold_str)
+        # furthermore, we autoapply rdkit-compatible digit standardization.
+        if rdkit_safe:
+            pattern = r"\(([=-@#\/\\]{0,2})(%?\d{1,2})\)"
+            replacement = r"\g<1>\g<2>"
+            scaffold_str = re.sub(pattern, replacement, scaffold_str)
+        return scaffold_str
+def safe2bracketsafe(safe_str):
+    try:
+        return BracketSAFEConverter().encoder(Chem.MolFromSmiles(safe_str), allow_empty=True, canonical=False, randomize=True)
+    except:
+        return safe_str
+def bracketsafe2safe(safe_str):
+    intrafrag_points = [m.group(0) for m in re.finditer(r'(?<!%)\d(?!>)', safe_str)] + \
+        [m.group(0).lstrip('%') for m in re.finditer(r'%\d+', safe_str)]
+    starting_num = max([int(i) for i in intrafrag_points]) + 1 if intrafrag_points else 0
+    interfrag_points = [(m.start(0), m.end(0)) for m in re.finditer(r'<\d+>', safe_str)]
+    safe_str = list(safe_str)
+    for start, end in interfrag_points:
+        safe_str[start] = safe_str[end-1] = ' ' # '<', '>' -> ''
+        num_to_replace = int(''.join(safe_str[start+1 : end-1])) + starting_num
+        num_to_replace = '%' + str(num_to_replace) if num_to_replace >= 10 else str(num_to_replace)
+        safe_str[start+1 : end-1] = [num_to_replace] + [' '] * (end - start - 3)
+    safe_str = re.sub(' ', '', ''.join(safe_str))
+    return safe_str

a2d2_mol/mol_utils/utils.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""Console logger utilities.
+Copied from https://github.com/HazyResearch/transformers/blob/master/src/utils/utils.py
+Copied from https://docs.python.org/3/howto/logging-cookbook.html#using-a-context-manager-for-selective-logging
+"""
+import logging
+import fsspec
+import lightning
+import torch
+from timm.scheduler import CosineLRScheduler
+import argparse
+import numpy as np
+import random
+import os
+def sample_categorical_logits(logits, dtype=torch.float64):
+  # do not require logits to be log-softmaxed
+  gumbel_noise = -(1e-10 - (torch.rand_like(logits, dtype=dtype) + 1e-10).log()).log()
+  return (logits + gumbel_noise).argmax(dim=-1)
+def fsspec_exists(filename):
+  """Check if a file exists using fsspec."""
+  fs, _ = fsspec.core.url_to_fs(filename)
+  return fs.exists(filename)
+def fsspec_listdir(dirname):
+  """Listdir in manner compatible with fsspec."""
+  fs, _ = fsspec.core.url_to_fs(dirname)
+  return fs.ls(dirname)
+def fsspec_mkdirs(dirname, exist_ok=True):
+  """Mkdirs in manner compatible with fsspec."""
+  fs, _ = fsspec.core.url_to_fs(dirname)
+  fs.makedirs(dirname, exist_ok=exist_ok)
+def print_nans(tensor, name):
+  if torch.isnan(tensor).any():
+    print(name, tensor)
+class CosineDecayWarmupLRScheduler(
+  CosineLRScheduler,
+  torch.optim.lr_scheduler._LRScheduler):
+  def __init__(self, *args, **kwargs):
+    super().__init__(*args, **kwargs)
+    self._last_epoch = -1
+    self.step(epoch=0)
+  def step(self, epoch=None):
+    if epoch is None:
+      self._last_epoch += 1
+    else:
+      self._last_epoch = epoch
+    # We call either step or step_update, depending on
+    # whether we're using the scheduler every epoch or every
+    # step.
+    # Otherwise, lightning will always call step (i.e.,
+    # meant for each epoch), and if we set scheduler
+    # interval to "step", then the learning rate update will
+    # be wrong.
+    if self.t_in_epochs:
+      super().step(epoch=self._last_epoch)
+    else:
+      super().step_update(num_updates=self._last_epoch)
+class LoggingContext:
+  """Context manager for selective logging."""
+  def __init__(self, logger, level=None, handler=None, close=True):
+    self.logger = logger
+    self.level = level
+    self.handler = handler
+    self.close = close
+  def __enter__(self):
+    if self.level is not None:
+      self.old_level = self.logger.level
+      self.logger.setLevel(self.level)
+    if self.handler:
+      self.logger.addHandler(self.handler)
+  def __exit__(self, et, ev, tb):
+    if self.level is not None:
+      self.logger.setLevel(self.old_level)
+    if self.handler:
+      self.logger.removeHandler(self.handler)
+    if self.handler and self.close:
+      self.handler.close()
+def get_logger(name=__name__, level=logging.INFO) -> logging.Logger:
+  """Initializes multi-GPU-friendly python logger."""
+  logger = logging.getLogger(name)
+  logger.setLevel(level)
+  # this ensures all logging levels get marked with the rank zero decorator
+  # otherwise logs would get multiplied for each GPU process in multi-GPU setup
+  for level in ('debug', 'info', 'warning', 'error',
+                'exception', 'fatal', 'critical'):
+    setattr(logger,
+            level,
+            lightning.pytorch.utilities.rank_zero_only(
+              getattr(logger, level)))
+  return logger
+def str2bool(v):
+  if isinstance(v, bool):
+    return v
+  if v.lower() in ('yes', 'true', 't', 'y', '1'):
+    return True
+  elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+    return False
+  else:
+    raise argparse.ArgumentTypeError('Boolean value expected.')
+def set_seed(seed, use_cuda):
+  os.environ['PYTHONHASHSEED'] = str(seed)
+  np.random.seed(seed)
+  random.seed(seed)
+  torch.manual_seed(seed)
+  # torch.backends.cudnn.deterministic = True
+  if use_cuda:
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+  print(f'=> Seed of the run set to {seed}')

a2d2_mol/mol_utils/utils_chem.py ADDED Viewed

	@@ -0,0 +1,187 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import random
+import safe as sf
+import datamol as dm
+from contextlib import suppress
+from rdkit import Chem, RDLogger
+# Turn off RDKit log output for every channel matching 'rdApp.*'.
+RDLogger.DisableLog('rdApp.*')
+# Reference implementations for the helpers below:
+# https://github.com/datamol-io/safe/blob/main/safe/sample.py
+# https://github.com/jensengroup/GB_GA/blob/master/crossover.py
+def safe_to_smiles(safe_str, fix=True):
+    if fix:
+        safe_str = '.'.join([frag for frag in safe_str.split('.')
+                             if sf.decode(frag, ignore_errors=True) is not None])
+    return sf.decode(safe_str, canonical=True, ignore_errors=True)
+def _safe_to_smiles_worker(args):
+    """Worker function for parallel SAFE to SMILES conversion."""
+    safe_str, use_bracket_safe, fix = args
+    try:
+        from mol_utils.bracket_safe_converter import bracketsafe2safe
+        if use_bracket_safe:
+            safe_str = bracketsafe2safe(safe_str)
+        return safe_to_smiles(safe_str, fix=fix)
+    except Exception:
+        return None
+def batch_safe_to_smiles(safe_strings, use_bracket_safe=False, fix=True, num_workers=None):
+    """
+    Convert a batch of SAFE strings to SMILES in parallel using multiprocessing.
+    Args:
+        safe_strings: List of SAFE format strings
+        use_bracket_safe: Whether to convert from bracket SAFE format first
+        fix: Whether to fix invalid fragments
+        num_workers: Number of parallel workers (default: min(cpu_count, len(safe_strings), 8))
+    Returns:
+        List of SMILES strings (None for invalid molecules)
+    """
+    from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
+    import os
+    n = len(safe_strings)
+    if n == 0:
+        return []
+    # For small batches, use sequential processing (overhead not worth it)
+    if n <= 4:
+        if use_bracket_safe:
+            from mol_utils.bracket_safe_converter import bracketsafe2safe
+            return [safe_to_smiles(bracketsafe2safe(s), fix=fix) for s in safe_strings]
+        else:
+            return [safe_to_smiles(s, fix=fix) for s in safe_strings]
+    # Use ThreadPoolExecutor for I/O bound tasks (RDKit releases GIL)
+    # ProcessPoolExecutor has too much overhead for this use case
+    if num_workers is None:
+        num_workers = min(os.cpu_count() or 4, n, 8)
+    args_list = [(s, use_bracket_safe, fix) for s in safe_strings]
+    # ThreadPoolExecutor is faster here because:
+    # 1. No pickle serialization overhead
+    # 2. RDKit releases the GIL during computation
+    # 3. Lower startup cost
+    with ThreadPoolExecutor(max_workers=num_workers) as executor:
+        results = list(executor.map(_safe_to_smiles_worker, args_list))
+    return results
+def batch_validate_and_extract(smiles_list, samples_tensor, log_rnd_tensor):
+    """
+    Batch validate SMILES and extract valid samples efficiently.
+    Args:
+        smiles_list: List of SMILES strings (may contain None for invalid)
+        samples_tensor: Tensor of token IDs (B, L)
+        log_rnd_tensor: Tensor of log random values (B,)
+    Returns:
+        valid_sequences: List of valid SMILES (largest fragment)
+        valid_indices: List of indices of valid samples
+    """
+    valid_sequences = []
+    valid_indices = []
+    for idx, smiles in enumerate(smiles_list):
+        if smiles:  # Valid SMILES
+            # Take largest fragment if multiple
+            largest_fragment = sorted(smiles.split('.'), key=len)[-1]
+            valid_sequences.append(largest_fragment)
+            valid_indices.append(idx)
+    return valid_sequences, valid_indices
+def filter_by_substructure(sequences, substruct):
+    substruct = sf.utils.standardize_attach(substruct)
+    substruct = Chem.DeleteSubstructs(Chem.MolFromSmarts(substruct), Chem.MolFromSmiles('*'))
+    substruct = Chem.MolFromSmarts(Chem.MolToSmiles(substruct))
+    return sf.utils.filter_by_substructure_constraints(sequences, substruct)
+def mix_sequences(prefix_sequences, suffix_sequences, prefix, suffix, num_samples=1):
+    mol_linker_slicer = sf.utils.MolSlicer(require_ring_system=False)
+    prefix_linkers = []
+    suffix_linkers = []
+    prefix_query = dm.from_smarts(prefix)
+    suffix_query = dm.from_smarts(suffix)
+    for x in prefix_sequences:
+        with suppress(Exception):
+            x = dm.to_mol(x)
+            out = mol_linker_slicer(x, prefix_query)
+            prefix_linkers.append(out[1])
+    for x in suffix_sequences:
+        with suppress(Exception):
+            x = dm.to_mol(x)
+            out = mol_linker_slicer(x, suffix_query)
+            suffix_linkers.append(out[1])
+    n_linked = 0
+    linked = []
+    linkers = prefix_linkers + suffix_linkers
+    linkers = [x for x in linkers if x is not None]
+    for n_linked, linker in enumerate(linkers):
+        linked.extend(mol_linker_slicer.link_fragments(linker, prefix, suffix))
+        if n_linked > num_samples:
+            break
+        linked = [x for x in linked if x]
+    return linked[:num_samples]
+def cut(smiles):
+    def cut_nonring(mol):
+        if not mol.HasSubstructMatch(Chem.MolFromSmarts('[*]-;!@[*]')):
+            return None
+        bis = random.choice(mol.GetSubstructMatches(Chem.MolFromSmarts('[*]-;!@[*]')))  # single bond not in ring
+        bs = [mol.GetBondBetweenAtoms(bis[0], bis[1]).GetIdx()]
+        fragments_mol = Chem.FragmentOnBonds(mol, bs, addDummies=True, dummyLabels=[(1, 1)])
+        try:
+            return Chem.GetMolFrags(fragments_mol, asMols=True, sanitizeFrags=True)
+        except ValueError:
+            return None
+    mol = Chem.MolFromSmiles(smiles)
+    frags = set()
+    # non-ring cut
+    for _ in range(3):
+        frags_nonring = cut_nonring(mol)
+        if frags_nonring is not None:
+            frags |= set([Chem.MolToSmiles(f) for f in frags_nonring])
+    return frags
+class Slicer:
+    def __call__(self, mol):
+        if isinstance(mol, str):
+            mol = Chem.MolFromSmiles(mol)
+        # non-ring single bonds
+        bonds = mol.GetSubstructMatches(Chem.MolFromSmarts('[*]-;!@[*]'))
+        for bond in bonds:
+            yield bond

a2d2_mol/oracle/fpscores.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:24a4392f5c673e79c0446af3c4d8e458293b5fecaa244328e76741ead9d21dbf
+size 9048931

a2d2_mol/remasking_scheduleaware.py ADDED Viewed

	@@ -0,0 +1,177 @@

+"""
+Schedule-aware remasking and insertion logic that ensures the number of masked tokens
+follows the interpolant schedule.
+"""
+import torch
+import numpy as np
+def apply_schedule_aware_insertion(
+    model,
+    xt_tmp,
+    new_xt,
+    t,
+    dt,
+    ext,
+    mask,
+    pad,
+    max_length,
+    orig_mask,
+    new_pos_orig,
+    quality_threshold=1,
+):
+    """
+    Remove low-quality insertions based on insertion confidence while respecting
+    the interpolant schedule for expected sequence length.
+    Args:
+        model: Model with planner and interpolant
+        xt_tmp: Sequence after insertion [B, L]
+        new_xt: Sequence before insertion [B, L] (not used in this function)
+        t: Current time [B]
+        dt: Time step size
+        ext: Number of insertions per gap [B, L+1]
+        mask: Mask token ID
+        pad: Pad token ID
+        max_length: Maximum sequence length (not used in this function)
+        orig_mask: Mask of original token positions [B, L]
+        new_pos_orig: New positions of original tokens [B, L]
+        quality_threshold: Inserted mask tokens with confidence below this
+            value are candidates for removal
+    Returns:
+        xt_tmp: Modified sequence with low-quality insertions removed (respecting schedule).
+            The input is returned unchanged when nothing was inserted, when the
+            planner output has no "insertion_conf", or when nothing is selected
+            for removal.
+    """
+    device = xt_tmp.device
+    # batch_size is unpacked here but not used below.
+    batch_size, L = xt_tmp.shape
+    total_ext = ext.sum(dim=1)
+    # Only proceed if there were insertions
+    if total_ext.sum() == 0:
+        return xt_tmp
+    # Get planner predictions on inserted state. The insertion head is trained
+    # with the pre-step time t (see loss_insert_planner_flexible), so condition
+    # on t here too; t_next is still used below for the length schedule.
+    t_next = t + dt
+    planner_out = model.planner(xt_tmp, t)
+    insertion_conf = planner_out.get("insertion_conf", None)
+    if insertion_conf is None:
+        return xt_tmp
+    insertion_conf = insertion_conf.squeeze(-1)  # (B, L)
+    # Expected sequence length at next timestep according to schedule
+    current_length_after = xt_tmp.ne(pad).sum(dim=1).float()  # [B]
+    expected_progress = model.interpolant.insertion_schedule.at(t_next)  # [B]
+    # NOTE(review): dividing by clamp(progress, min=0.1) and multiplying by
+    # progress again gives back current_length_after (up to floating-point
+    # rounding) whenever progress >= 0.1, which leaves max_removable at about 0
+    # below - confirm this is the intended schedule.
+    estimated_final_length = current_length_after / (expected_progress.clamp(min=0.1))
+    expected_length = estimated_final_length * expected_progress  # [B]
+    # Mark positions in xt_tmp that came from new_xt (originals) vs. fresh insertions.
+    # Fancy-indexing scatter avoids the per-batch python loop.
+    valid_b, valid_l = orig_mask.nonzero(as_tuple=True)
+    valid_p = new_pos_orig[valid_b, valid_l].long().clamp_(0, L - 1)
+    is_original = torch.zeros_like(xt_tmp, dtype=torch.bool)
+    is_original[valid_b, valid_p] = True
+    # Fresh insertions are mask tokens that sit at a non-original position.
+    inserted_positions = (xt_tmp == mask) & ~is_original
+    # Removal candidates: inserted positions whose confidence is below
+    # `quality_threshold`. The number removed per row is capped so the length
+    # never falls below the scheduled minimum.
+    candidates = inserted_positions & (insertion_conf < quality_threshold)
+    num_bad = candidates.sum(dim=1)  # [B], long
+    min_length = expected_length.long().clamp(min=1)  # [B]
+    max_removable = (current_length_after.long() - min_length).clamp(min=0)
+    length_after_removal = current_length_after.long() - num_bad
+    schedule_violates = length_after_removal < min_length
+    # Rows that would undershoot the schedule remove only max_removable
+    # candidates; rows without candidates remove none.
+    k_per_row = torch.where(schedule_violates, max_removable, num_bad)
+    k_per_row = torch.where(num_bad > 0, k_per_row, torch.zeros_like(k_per_row))
+    if not candidates.any():
+        return xt_tmp
+    # Select the lowest-confidence candidates per row via a sort.
+    neg_inf = torch.tensor(float('-inf'), device=device, dtype=insertion_conf.dtype)
+    scores = torch.where(candidates, -insertion_conf, neg_inf)  # higher = worse
+    _, sorted_indices = scores.sort(dim=1, descending=True)
+    positions = torch.arange(L, device=device).unsqueeze(0)  # [1, L]
+    # The first k_per_row entries of each sorted row are the ones to drop;
+    # scatter maps that rank-based flag back to sequence positions.
+    keep_in_topk = positions < k_per_row.unsqueeze(1)  # [B, L]
+    final_bad = torch.zeros_like(candidates)
+    final_bad.scatter_(1, sorted_indices, keep_in_topk)
+    if not final_bad.any():
+        return xt_tmp
+    # Compact each row to the left (keep good, drop bad), then pad the tail.
+    # Stable sort by the bad flag pushes bad positions to the right.
+    sort_key = final_bad.long()
+    _, perm = torch.sort(sort_key, dim=1, stable=True)
+    xt_tmp = torch.gather(xt_tmp, 1, perm)
+    # Everything from index num_keep onward held a dropped token and is
+    # overwritten with the pad id.
+    num_keep = (~final_bad).sum(dim=1)  # [B]
+    tail_mask = positions >= num_keep.unsqueeze(1)  # [B, L]
+    xt_tmp = torch.where(tail_mask, torch.full_like(xt_tmp, pad), xt_tmp)
+    return xt_tmp
+def apply_schedule_aware_remasking(
+    model,
+    new_xt,
+    t,
+    dt,
+    remasking_conf,
+    clean_index,
+    mask,
+    neg_inf,
+    batch_size,
+    unmask_quality_threshold=None,
+):
+    """
+    Remask clean tokens, either by a confidence threshold or by the unmask schedule.
+    Args:
+        model: Object exposing `interpolant.unmask_schedule.at(t)`; only used in schedule mode
+        new_xt: Current sequence [B, L]
+        t: Current time [B]
+        dt: Time step size (added to `t` to obtain the next time)
+        remasking_conf: Confidence scores for tokens [B, L]
+        clean_index: Boolean mask of clean tokens (not mask, not pad) [B, L]
+        mask: Mask token ID
+        neg_inf: Negative infinity tensor, used as the score of non-clean positions
+        batch_size: Batch size (not read by this function)
+        unmask_quality_threshold: If not None, schedule mode is skipped and every
+            clean token with confidence strictly below this value is remasked
+    Returns:
+        Tensor shaped like `new_xt` with the selected clean tokens replaced by `mask`.
+        When schedule mode finds nothing to remask, `new_xt` itself is returned.
+    """
+    all_mask = torch.full_like(new_xt, mask)
+    # Threshold mode: a pure element-wise gate, no schedule lookup and no sorting.
+    if unmask_quality_threshold is not None:
+        low_conf = remasking_conf < unmask_quality_threshold
+        return torch.where(clean_index & low_conf, all_mask, new_xt)
+    # Schedule mode: compare the current clean count with the count expected at t + dt.
+    clean_count = clean_index.sum(dim=1)  # [B], long
+    masked_count = (new_xt == mask).sum(dim=1)  # [B], long
+    seq_len = (clean_count + masked_count).float()  # [B], pads excluded
+    target_frac = model.interpolant.unmask_schedule.at(t + dt)  # [B]
+    target_clean = target_frac * seq_len  # [B]
+    surplus = (clean_count.float() - target_clean).round().long()  # [B]
+    # Never remask a negative amount, and never more than the clean tokens available.
+    n_remask = torch.minimum(surplus.clamp(min=0), clean_count)  # [B]
+    if n_remask.sum() == 0:
+        return new_xt
+    # Rank positions so the least confident clean tokens come first; every
+    # non-clean position gets `neg_inf` and therefore sorts to the end.
+    ranking = torch.where(clean_index, -1.0 * remasking_conf, neg_inf)
+    order = torch.sort(ranking, dim=1, descending=True).indices
+    rank = torch.arange(ranking.shape[1], device=new_xt.device).unsqueeze(0)  # [1, L]
+    within_budget = rank < n_remask.unsqueeze(1)  # [B, L], True for the first k ranks
+    # Route the "first k ranks" flags back to the token positions they refer to.
+    selected = torch.zeros_like(clean_index)
+    selected.scatter_(1, order, within_budget)
+    return torch.where(selected, all_mask, new_xt)

a2d2_mol/sampling.py ADDED Viewed

	@@ -0,0 +1,1401 @@

+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # add repo root to path
+import torch
+from dataclasses import dataclass
+from typing import Any, Literal, Optional
+import numpy as np
+import pandas as pd
+from lightning_modules.mdm import MaskedDiffusionModule
+@dataclass
+class SamplingTraceDatapoint:
+    """One event recorded while sampling: a token change or a mask insertion."""
+    # Time value of the sampling step during which the event was recorded.
+    t: float
+    # "insertion" when a new slot was added, "change" when an existing token changed.
+    event_type: Literal["insertion", "change"]
+    # Index the event refers to (token position for changes, gap index for insertions).
+    position: int
+    # Token written by the event (the mask id for insertions).
+    token: Any
+@dataclass
+class SamplingResult:
+    """Sampler output: the sampled tensor plus an optional event trace."""
+    # Sampled sequences.
+    samples: torch.Tensor
+    # Events must be replayed in recorded order because the updates do not commute.
+    trace: Optional[list[SamplingTraceDatapoint]]
+    def __iter__(self):
+        # Supports tuple-style unpacking: `samples, trace = result`.
+        first, second = self.samples, self.trace
+        yield first
+        yield second
+# Sample from categorical distribution for each position using the transition probabilities
+def _sample_tokens(probs: torch.Tensor) -> torch.Tensor:
+    """Sample one token per position from a probability distribution.
+    Rows are passed to `torch.multinomial`, so they only need to be non-negative
+    with a positive sum; they do not have to be normalized.
+    Args:
+        probs: [batch_size, seq_len, vocab_size] transition probabilities
+    Returns:
+        [batch_size, seq_len] sampled token indices (int64)
+    """
+    batch_size, seq_len, vocab_size = probs.shape
+    # reshape (rather than view) so that non-contiguous inputs, e.g. a transposed
+    # or sliced tensor, are accepted instead of raising.
+    flat_probs = probs.reshape(-1, vocab_size)
+    samples = torch.multinomial(flat_probs, num_samples=1)
+    return samples.view(batch_size, seq_len)
+def _sample_batched_tokens(probs: torch.Tensor) -> torch.Tensor:
+    """Sample one token per position with the Gumbel-max trick.
+    Gumbel noise is added to the log-probabilities and the argmax over the
+    vocabulary axis is returned.
+    Args:
+        probs: [batch_size, seq_len, vocab_size] floating-point probabilities
+    Returns:
+        [batch_size, seq_len] sampled token indices (int64)
+    """
+    batch_size, seq_len, _ = probs.shape
+    eps = 1e-10  # keeps both logarithms finite at 0 and 1
+    # Draw the uniform noise directly on probs' device and in its dtype; the
+    # previous version allocated it on the CPU and copied it over on every call.
+    uniform = torch.rand_like(probs)
+    gumbel_noise = -torch.log(-torch.log(uniform + eps) + eps)
+    noisy_logits = torch.log(probs + eps) + gumbel_noise
+    # The highest perturbed score is the sampled category.
+    samples = noisy_logits.argmax(dim=-1).to(dtype=torch.long)
+    return samples.view(batch_size, seq_len)
+@torch.no_grad()
+def mdm_euler_sampling(
+    model: MaskedDiffusionModule,
+    steps: int,
+    mask: int,
+    pad: int,
+    batch_size: int,
+    max_length: int,
+    return_trace: bool = False,
+    temperature: float = 1.0,
+):
+    """Fixed-length Euler sampler: start from all masks and unmask over `steps` steps.
+    Args:
+        model: Called as `model(xt, t)`; its output is converted with
+            `model.interpolant.to_actual_rate` and must expose `unmask_rate` (B, L, V)
+        steps: Number of Euler steps; the step size is 1 / steps
+        mask: Mask token ID
+        pad: Pad token ID (not read by this sampler)
+        batch_size: Number of sequences to sample
+        max_length: Length of every sampled sequence
+        return_trace: Must be False; tracing is not implemented here
+        temperature: Values other than 1.0 rescale the log transition probabilities
+    Returns:
+        (xt, []) where xt is the [batch_size, max_length] int64 sample and the
+        empty list stands in for the trace.
+    """
+    assert not return_trace, "Trace is not yet implemented in MDM Euler sampling"
+    device = next(model.parameters()).device
+    xt = torch.full((batch_size, max_length), mask, dtype=torch.int64, device=device)
+    dt = 1.0 / steps
+    t = torch.zeros(batch_size, device=device)
+    for i in range(steps):
+        # The original message was a plain string and never showed the step index.
+        print(f"{i}-th sampling step")
+        # Predict and convert rates.
+        pred_rate = model(xt, t)
+        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+        unmask_rate = pred_rate.unmask_rate
+        # Unmask step (Euler): only masked positions may move; the mask entry is
+        # set to minus the total outgoing rate of that position.
+        mask_pos = (xt == mask).nonzero(as_tuple=True)
+        unmask_rate[xt != mask] = 0
+        unmask_rate[mask_pos + (mask,)] = 0
+        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+        # Add the "stay" mass of 1 at each position's current token.
+        trans_prob.scatter_add_(
+            2,
+            xt.unsqueeze(-1),
+            torch.ones_like(xt.unsqueeze(-1), dtype=trans_prob.dtype),
+        )
+        # Apply temperature scaling
+        if temperature != 1.0:
+            logits = torch.log(trans_prob + 1e-10) / temperature
+            trans_prob = torch.softmax(logits, dim=-1)
+        if i == steps - 1:
+            print("Final step, removing mask token from sampling")
+            trans_prob[mask_pos + (mask,)] = 0.0
+            # With the mask entry removed a row can be all zeros, which makes
+            # torch.multinomial raise. Fall back to a uniform draw over tokens
+            # 0..mask-1 for such rows, as the insertion samplers in this file do.
+            final_probs = trans_prob[mask_pos]  # (N, V)
+            dead = final_probs.sum(dim=-1) == 0.0
+            if dead.any():
+                uniform_prob = torch.zeros_like(final_probs[0])  # (V,)
+                uniform_prob[:mask] = 1.0 / mask
+                final_probs[dead] = uniform_prob
+                trans_prob[mask_pos] = final_probs
+        new_xt = _sample_tokens(trans_prob)
+        # Tokens that were already unmasked are never resampled.
+        xt = torch.where(xt != mask, xt, new_xt)
+        t = t + dt
+    return xt, []
+@torch.no_grad()
+def any_order_mask_insertion_euler_sampling(
+    model: torch.nn.Module,
+    steps: int,
+    mask: int,
+    pad: int,
+    batch_size: int,
+    max_length: int,
+    return_trace: bool = False,
+    temperature: float = 1.0,
+) -> tuple[torch.Tensor, Optional[list]]:
+    """Euler sampler that interleaves unmasking with gap-wise mask insertion.
+    Sampling starts from an all-pad (empty) sequence. Every step first resamples
+    masked positions, then (except on the final step) draws one Bernoulli
+    insertion per gap and rebuilds the row with new mask slots.
+    Args:
+        model: Called as `model(xt, t)`; its output is converted with
+            `model.interpolant.to_actual_rate` and must expose `unmask_rate`
+            (B, L, V) and `length_rate` (B, L+1)
+        steps: Number of Euler steps; the step size is 1 / steps
+        mask: Mask token ID
+        pad: Pad token ID
+        batch_size: Number of sequences to sample
+        max_length: Maximum sequence length (width of the returned tensor)
+        return_trace: If True, record per-row change / insertion events
+        temperature: Accepted for signature parity with the other samplers.
+            NOTE(review): it is not applied in this function - confirm whether
+            that is intended.
+    Returns:
+        (xt, trace): xt is [batch_size, max_length] int64; trace is a list with
+        one event list per row, or None when `return_trace` is False.
+    """
+    device = next(model.parameters()).device
+    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)
+    dt = 1.0 / steps
+    t = torch.zeros(batch_size, device=device)
+    # Row / column index grids used to scatter tokens after insertion.
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    sampling_trace = [[] for _ in range(batch_size)] if return_trace else None
+    for i in range(steps):
+        is_last = i == steps - 1
+        # Predict and convert rates.
+        pred_rate = model(xt, t)
+        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+        len_rate = pred_rate.length_rate  # (B, L+1)
+        # Unmask step (Euler): only masked positions may move; the mask entry is
+        # set to minus the total outgoing rate of that position.
+        mask_pos = (xt == mask).nonzero(as_tuple=True)
+        unmask_rate[xt != mask] = 0
+        unmask_rate[mask_pos + (mask,)] = 0
+        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+        # Add the "stay" mass; pad positions are routed to the mask column so the
+        # scatter index is valid (their samples are overwritten with pad below).
+        _xt = xt.clone()
+        _xt[xt == pad] = mask
+        trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+        )
+        if is_last:
+            print("Final step, removing mask token from sampling")
+            trans_prob[mask_pos + (mask,)] = 0.0  # no mask may survive the last step
+            final_probs = trans_prob[mask_pos]  # (N, V)
+            prob_sum = final_probs.sum(dim=-1, keepdim=True)
+            dead = prob_sum.squeeze(-1) == 0.0
+            if dead.any():
+                # Uniform over tokens 0..mask-1 for rows with no mass left. The
+                # fallback vector is built along the vocabulary axis (V,); the
+                # previous [L, V] construction sliced the position axis and could
+                # not be assigned to the selected rows.
+                uniform_prob = torch.zeros_like(final_probs[0])
+                uniform_prob[:mask] = 1.0 / mask
+                final_probs[dead] = uniform_prob
+                prob_sum = final_probs.sum(dim=-1, keepdim=True)
+            # Normalize every masked row, whether or not a fallback was needed.
+            trans_prob[mask_pos] = final_probs / prob_sum
+        new_xt = _sample_tokens(trans_prob)
+        new_xt[xt == pad] = pad
+        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+        if not is_last:
+            # Gap-wise insertion: one Bernoulli draw per gap.
+            ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)
+            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
+            # Only gaps inside the current sequence (0..len) can receive a slot.
+            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+            total_ext = ext.sum(dim=1)
+            # Reject all insertions of a row that would exceed max_length.
+            valid = xt_len + total_ext <= max_length
+            ext = ext * valid.view(batch_size, 1).long()
+            # Re-count after the overflow filter. Using the pre-filter count made
+            # rejected rows grow anyway (masks appended up to max_length).
+            total_ext = ext.sum(dim=1)
+            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+            new_len = xt_len + total_ext  # (B,)
+            # Fill the new length with masks, then drop the existing tokens at
+            # their shifted positions; what remains masked are the insertions.
+            xt_tmp = torch.full_like(xt, pad)
+            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
+            xt_tmp[mask_fill] = mask
+            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+            flat_b = batch_idx_L[orig_mask]
+            flat_p = new_pos_orig[orig_mask]
+            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+        else:
+            # No insertion happens on the final step, so there is nothing to trace.
+            ext = None
+            xt_tmp = new_xt
+        if return_trace:
+            changed = (xt != pad) & (xt != new_xt)  # (B, L)
+            step_times = t.tolist()
+            for batch_idx in range(batch_size):
+                # Token changes, in increasing position order.
+                for j in changed[batch_idx].nonzero(as_tuple=True)[0].tolist():
+                    sampling_trace[batch_idx].append(
+                        SamplingTraceDatapoint(
+                            t=step_times[batch_idx],
+                            event_type="change",
+                            position=j,
+                            token=new_xt[batch_idx, j].item(),
+                        )
+                    )
+                if ext is None:
+                    continue
+                # Insertions, emitted from the highest gap index down to gap 0.
+                inserted = ext[batch_idx, :max_length].nonzero(as_tuple=True)[0].tolist()
+                for gap in reversed(inserted):
+                    sampling_trace[batch_idx].append(
+                        SamplingTraceDatapoint(
+                            t=step_times[batch_idx],
+                            event_type="insertion",
+                            position=gap,
+                            token=mask,
+                        )
+                    )
+        xt = xt_tmp
+        t = t + dt
+    return xt, sampling_trace
+@torch.no_grad()
+def batch_mcts_reverse_step(
+    xt: torch.Tensor,
+    t: torch.Tensor,
+    dt: float,
+    model: torch.nn.Module,
+    pretrained: torch.nn.Module,
+    mask: int,
+    pad: int,
+    batch_size: int,
+    max_length: int,
+    last_step: bool = False,
+    temperature: float = 1.0,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Tile one state `batch_size` times and take one reverse step on every copy.
+    The step itself was a verbatim copy of `mcts_reverse_step`; this function
+    now tiles the input and delegates, so the two can no longer drift apart.
+    Args:
+        xt: State to expand. It is tiled with `xt.repeat(batch_size, 1)`, so the
+            result only has `batch_size` rows when a single sequence is passed
+        t: Time of the state; squeezed and expanded to `batch_size` entries
+        dt: Step size
+        model: Policy model used for sampling
+        pretrained: Reference model used for the log-probability ratio
+        mask: Mask token ID
+        pad: Pad token ID
+        batch_size: Number of copies (children) to sample
+        max_length: Maximum sequence length
+        last_step: If True, masks cannot survive and no insertion is performed
+        temperature: Forwarded unchanged to `mcts_reverse_step`
+    Returns:
+        (xt_next, log_rnd, log_policy_step, log_pretrained_step), exactly as
+        returned by `mcts_reverse_step` for the tiled batch.
+    """
+    tiled_xt = xt.repeat(batch_size, 1)
+    # squeeze to remove extra dimensions, then expand to batch_size
+    tiled_t = t.squeeze().expand(batch_size)
+    return mcts_reverse_step(
+        tiled_xt,
+        tiled_t,
+        dt,
+        model,
+        pretrained,
+        mask,
+        pad,
+        max_length,
+        last_step=last_step,
+        temperature=temperature,
+    )
+@torch.no_grad()
+def mcts_reverse_step(
+    xt: torch.Tensor,
+    t: torch.Tensor,
+    dt: float,
+    model: torch.nn.Module,
+    pretrained: torch.nn.Module,
+    mask: int,
+    pad: int,
+    max_length: int,
+    last_step: bool = False,
+    temperature: float = 1.0,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Take one reverse (unmask + insert) step and score it under two models.
+    Tokens and insertions are sampled from `model`; the same outcome is scored
+    under `pretrained` to obtain a per-row log-probability ratio.
+    Args:
+        xt: Current sequences [B, max_length]
+        t: Current time [B]
+        dt: Step size
+        model: Policy model; `model(xt, t)` is converted with
+            `model.interpolant.to_actual_rate` and must expose `unmask_rate`
+            (B, L, V) and `length_rate` (B, L+1)
+        pretrained: Reference model with the same interface
+        mask: Mask token ID
+        pad: Pad token ID
+        max_length: Maximum sequence length
+        last_step: If True, the mask token is removed from the policy
+            distribution and no insertion is performed
+        temperature: Accepted for signature parity; not applied in this function
+    Returns:
+        (xt_next, log_rnd, log_policy_step, log_pretrained_step). The three
+        log terms have shape [B]; log_rnd = log_pretrained_step - log_policy_step.
+    """
+    device = next(model.parameters()).device
+    batch_size = xt.size(0)
+    # Row / column index grids used to scatter tokens after insertion.
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    # Predict and convert rates for the policy.
+    pred_rate = model(xt, t)
+    pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+    unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+    len_rate = pred_rate.length_rate  # (B, L+1)
+    # Same quantities from the pretrained model, for the log ratio.
+    pretrained_pred = pretrained(xt, t)
+    pretrained_rate = pretrained.interpolant.to_actual_rate(xt, pretrained_pred, t)
+    pretrained_unmask_rate = pretrained_rate.unmask_rate.clone()  # (B, L, V)
+    pretrained_len_rate = pretrained_rate.length_rate  # (B, L+1)
+    # Unmask step (Euler): only masked positions may move; the mask entry is
+    # set to minus the total outgoing rate of that position.
+    mask_pos = (xt == mask).nonzero(as_tuple=True)
+    unmask_rate[xt != mask] = 0
+    unmask_rate[mask_pos + (mask,)] = 0
+    unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+    trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+    pretrained_unmask_rate[xt != mask] = 0
+    pretrained_unmask_rate[mask_pos + (mask,)] = 0
+    pretrained_unmask_rate[mask_pos + (mask,)] = -pretrained_unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+    pretrained_trans_prob = (pretrained_unmask_rate * dt).clamp(0.0, 1.0)
+    # Add the "stay" mass; pad positions are routed to the mask column so the
+    # scatter index is valid (their samples are overwritten with pad below).
+    _xt = xt.clone()
+    _xt[xt == pad] = mask
+    trans_prob.scatter_add_(
+        2,
+        _xt.unsqueeze(-1),
+        torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+    )
+    pretrained_trans_prob.scatter_add_(
+        2,
+        _xt.unsqueeze(-1),
+        torch.ones_like(_xt.unsqueeze(-1), dtype=pretrained_trans_prob.dtype),
+    )
+    if last_step:
+        print("Final step, removing mask token from sampling")
+        trans_prob[mask_pos + (mask,)] = 0.0  # no mask may survive the last step
+        final_probs = trans_prob[mask_pos]  # (N, V)
+        prob_sum = final_probs.sum(dim=-1, keepdim=True)
+        dead = prob_sum.squeeze(-1) == 0.0
+        if dead.any():
+            # Uniform over tokens 0..mask-1 for rows with no mass left. The
+            # fallback vector is built along the vocabulary axis (V,); the
+            # previous [L, V] construction sliced the position axis and could
+            # not be assigned to the selected rows.
+            uniform_prob = torch.zeros_like(final_probs[0])
+            uniform_prob[:mask] = 1.0 / mask
+            final_probs[dead] = uniform_prob
+            prob_sum = final_probs.sum(dim=-1, keepdim=True)
+        # Normalize every masked row, whether or not a fallback was needed, so
+        # the log-probabilities below are computed on the same footing.
+        trans_prob[mask_pos] = final_probs / prob_sum
+        # NOTE(review): pretrained_trans_prob is left untouched on the last step,
+        # as in the original code - confirm this asymmetry is intended.
+    new_xt = _sample_tokens(trans_prob)
+    new_xt[xt == pad] = pad
+    new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+    # Log-probability of the sampled tokens under both models, counted only at
+    # positions that were masked and have just been unmasked.
+    lp = torch.gather(torch.log(trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+    lp_pre = torch.gather(torch.log(pretrained_trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+    changed_mask = (xt == mask) & (new_xt != mask) & (new_xt != pad)
+    log_policy_step = (lp * changed_mask).sum(dim=1)
+    log_pretrained_step = (lp_pre * changed_mask).sum(dim=1)
+    log_rnd = log_pretrained_step - log_policy_step  # (B,)
+    if not last_step:
+        # Gap-wise insertion: one Bernoulli draw per gap.
+        ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)
+        insertion_rate = (len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+        pretrained_insertion_rate = (pretrained_len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+        # log P(ext; λ) = ext*log(λ) - λ, evaluated on the raw draws (before the
+        # gap / overflow filters below), as in the original code.
+        log_policy_insert = (ext * torch.log(insertion_rate) - insertion_rate).sum(dim=1)  # (B,)
+        log_pretrained_insert = (ext * torch.log(pretrained_insertion_rate) - pretrained_insertion_rate).sum(dim=1)  # (B,)
+        log_insert_diff = log_pretrained_insert - log_policy_insert  # (B,)
+        log_rnd += log_insert_diff
+        log_pretrained_step += log_pretrained_insert
+        log_policy_step += log_policy_insert
+        xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+        seq_dim = ext.size(1)  # Use actual ext dimension to avoid mismatch
+        gaps = torch.arange(seq_dim, device=device).view(1, -1)
+        # Only gaps inside the current sequence (0..len) can receive a slot.
+        ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+        total_ext = ext.sum(dim=1)
+        # Reject all insertions of a row that would exceed max_length.
+        valid = xt_len + total_ext <= max_length
+        ext = ext * valid.view(batch_size, 1).long()
+        # Re-count after the overflow filter. Using the pre-filter count made
+        # rejected rows grow anyway (masks appended up to max_length).
+        total_ext = ext.sum(dim=1)
+        ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+        new_len = xt_len + total_ext  # (B,)
+        # Fill the new length with masks, then drop the existing tokens at their
+        # shifted positions; what remains masked are the insertions.
+        xt_tmp = torch.full_like(xt, pad)
+        mask_fill = pos_idx_L < new_len.view(batch_size, 1)
+        xt_tmp[mask_fill] = mask
+        new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+        orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+        flat_b = batch_idx_L[orig_mask]
+        flat_p = new_pos_orig[orig_mask]
+        xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+    else:
+        xt_tmp = new_xt
+    return xt_tmp, log_rnd, log_policy_step, log_pretrained_step
+@torch.no_grad()
+def any_order_euler_sampling_with_schedule(
+    model: torch.nn.Module,
+    time_schedule: torch.Tensor,
+    mask: int,
+    pad: int,
+    batch_size: int,
+    max_length: int,
+    return_trace: bool = False,
+    temperature: float = 1.0,
+) -> tuple[torch.Tensor, Optional[list]]:
+    """Unmask + insertion Euler sampler driven by an explicit list of time points.
+    Consecutive entries of `time_schedule` define the steps; the step size is
+    the absolute difference between neighbouring entries.
+    Args:
+        model: Called as `model(xt, t)`; its output is converted with
+            `model.interpolant.to_actual_rate` and must expose `unmask_rate`
+            (B, L, V) and `length_rate` (B, L+1)
+        time_schedule: 1-D tensor of time points; len(time_schedule) - 1 steps
+            are run. An ascending schedule is flipped so that it is traversed
+            in descending order.
+            NOTE(review): the fixed-step samplers in this file advance t
+            upwards from 0 - confirm the descending convention with the caller.
+        mask: Mask token ID
+        pad: Pad token ID
+        batch_size: Number of sequences to sample
+        max_length: Maximum sequence length (width of the returned tensor)
+        return_trace: If True, record per-row change / insertion events
+        temperature: Values other than 1.0 rescale the log transition probabilities
+    Returns:
+        (xt, trace): xt is [batch_size, max_length] int64; trace is a list with
+        one event list per row, or None when `return_trace` is False.
+    """
+    device = next(model.parameters()).device
+    time_schedule = time_schedule.to(device)
+    if time_schedule[0] < time_schedule[-1]:
+        time_schedule = torch.flip(time_schedule, [0])  # descending order
+    steps = len(time_schedule) - 1
+    # Start from an all-pad (empty) sequence.
+    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)
+    # Row / column index grids used to scatter tokens after insertion.
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    sampling_trace = [[] for _ in range(batch_size)] if return_trace else None
+    for i in range(steps):
+        is_last = i == steps - 1
+        # Scheduled time of this step and the (per-row) step size.
+        t = time_schedule[i].repeat(batch_size)
+        t_next = time_schedule[i + 1]
+        dt = (t - t_next).abs()  # (B,)
+        # Predict and convert rates.
+        pred_rate = model(xt, t)
+        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+        len_rate = pred_rate.length_rate  # (B, L+1)
+        # Unmask step (Euler): only masked positions may move; the mask entry is
+        # set to minus the total outgoing rate of that position.
+        mask_pos = (xt == mask).nonzero(as_tuple=True)
+        unmask_rate[xt != mask] = 0
+        unmask_rate[mask_pos + (mask,)] = 0
+        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        trans_prob = (unmask_rate * dt[:, None, None]).clamp(0.0, 1.0)
+        # Add the "stay" mass; pad positions are routed to the mask column so the
+        # scatter index is valid (their samples are overwritten with pad below).
+        _xt = xt.clone()
+        _xt[xt == pad] = mask
+        trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+        )
+        # Apply temperature scaling
+        if temperature != 1.0:
+            logits = torch.log(trans_prob + 1e-10) / temperature
+            trans_prob = torch.softmax(logits, dim=-1)
+        if is_last:
+            print("Final step, removing mask token from sampling")
+            trans_prob[mask_pos + (mask,)] = 0.0  # no mask may survive the last step
+            final_probs = trans_prob[mask_pos]  # (N, V)
+            prob_sum = final_probs.sum(dim=-1, keepdim=True)
+            dead = prob_sum.squeeze(-1) == 0.0
+            if dead.any():
+                # Uniform over tokens 0..mask-1 for rows with no mass left. The
+                # fallback vector is built along the vocabulary axis (V,); the
+                # previous [L, V] construction sliced the position axis and could
+                # not be assigned to the selected rows.
+                uniform_prob = torch.zeros_like(final_probs[0])
+                uniform_prob[:mask] = 1.0 / mask
+                final_probs[dead] = uniform_prob
+                prob_sum = final_probs.sum(dim=-1, keepdim=True)
+            # Normalize every masked row, whether or not a fallback was needed.
+            trans_prob[mask_pos] = final_probs / prob_sum
+        new_xt = _sample_tokens(trans_prob)
+        new_xt[xt == pad] = pad
+        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+        if not is_last:
+            # Gap-wise insertion: one Bernoulli draw per gap.
+            ext = torch.bernoulli((len_rate * dt[:, None]).clamp(0.0, 1.0)).long()  # (B, L+1)
+            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
+            # Only gaps inside the current sequence (0..len) can receive a slot.
+            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+            total_ext = ext.sum(dim=1)
+            # Reject all insertions of a row that would exceed max_length.
+            valid = xt_len + total_ext <= max_length
+            ext = ext * valid.view(batch_size, 1).long()
+            # Re-count after the overflow filter. Using the pre-filter count made
+            # rejected rows grow anyway (masks appended up to max_length).
+            total_ext = ext.sum(dim=1)
+            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+            new_len = xt_len + total_ext  # (B,)
+            # Fill the new length with masks, then drop the existing tokens at
+            # their shifted positions; what remains masked are the insertions.
+            xt_tmp = torch.full_like(xt, pad)
+            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
+            xt_tmp[mask_fill] = mask
+            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+            flat_b = batch_idx_L[orig_mask]
+            flat_p = new_pos_orig[orig_mask]
+            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+        else:
+            # No insertion happens on the final step, so there is nothing to trace.
+            ext = None
+            xt_tmp = new_xt
+        if return_trace:
+            changed = (xt != pad) & (xt != new_xt)  # (B, L)
+            step_times = t.tolist()
+            for batch_idx in range(batch_size):
+                # Token changes, in increasing position order.
+                for j in changed[batch_idx].nonzero(as_tuple=True)[0].tolist():
+                    sampling_trace[batch_idx].append(
+                        SamplingTraceDatapoint(
+                            t=step_times[batch_idx],
+                            event_type="change",
+                            position=j,
+                            token=new_xt[batch_idx, j].item(),
+                        )
+                    )
+                if ext is None:
+                    continue
+                # Insertions, emitted from the highest gap index down to gap 0.
+                inserted = ext[batch_idx, :max_length].nonzero(as_tuple=True)[0].tolist()
+                for gap in reversed(inserted):
+                    sampling_trace[batch_idx].append(
+                        SamplingTraceDatapoint(
+                            t=step_times[batch_idx],
+                            event_type="insertion",
+                            position=gap,
+                            token=mask,
+                        )
+                    )
+        xt = xt_tmp
+    return xt, sampling_trace
+@torch.no_grad()
+def any_order_mask_insertion_euler_sampling_with_rnd(
+    model, pretrained, reward_model, analyzer,
+    tokenizer, steps,
+    mask,
+    pad,
+    batch_size,
+    max_length,
+    return_trace = False,
+    alpha = 0.1,
+    temperature: float = 1.0,
+):
+    device = next(model.parameters()).device
+    # initialize all‑pad sequence and trace
+    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)
+    sampling_trace = []
+    # initialize log_rnd to accumulate log probability ratios
+    log_rnd = torch.zeros(batch_size, device=device)
+    dt = 1.0 / steps
+    t = torch.zeros(batch_size, device=device)
+    # precompute row indices for scatter
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    sampling_trace = [[] for _ in range(batch_size)] if return_trace else None
+    for i in range(steps):
+        # ——— predict and convert rates ———
+        pred_rate = model(xt, t)
+        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+        len_rate = pred_rate.length_rate  # (B, L+1)
+        # ——— get pretrained model rates for log_rnd computation ———
+        pretrained_pred = pretrained(xt, t)
+        pretrained_rate = pretrained.interpolant.to_actual_rate(xt, pretrained_pred, t)
+        pretrained_unmask_rate = pretrained_rate.unmask_rate.clone()  # (B, L, V)
+        pretrained_len_rate = pretrained_rate.length_rate  # (B, L+1)
+        # ——— unmask step (Euler) ———
+        mask_pos = (xt == mask).nonzero(as_tuple=True)
+        unmask_rate[xt != mask] = 0
+        unmask_rate[mask_pos + (mask,)] = 0
+        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+        # Same for pretrained
+        pretrained_unmask_rate[xt != mask] = 0
+        pretrained_unmask_rate[mask_pos + (mask,)] = 0
+        pretrained_unmask_rate[mask_pos + (mask,)] = -pretrained_unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        pretrained_trans_prob = (pretrained_unmask_rate * dt).clamp(0.0, 1.0)
+        # add “stay” probability
+        _xt = xt.clone()
+        _xt[xt == pad] = mask
+        trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+        )
+        pretrained_trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=pretrained_trans_prob.dtype),
+        )
+        # Apply temperature scaling
+        if temperature != 1.0:
+            logits = torch.log(trans_prob + 1e-10) / temperature
+            trans_prob = torch.softmax(logits, dim=-1)
+        if i == steps - 1:
+            print("Final step, removing mask token from sampling")
+            trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step
+            # renormalize probabilities to ensure they sum to 1
+            prob_sum = trans_prob[mask_pos].sum(dim=-1, keepdim=True)
+            # avoid division by zero; if all probs are 0, use uniform distribution (excluding mask and pad)
+            mask_has_zero_prob = (prob_sum.squeeze(-1) == 0.0)
+            if mask_has_zero_prob.any():
+                # create uniform distribution over valid tokens (excluding mask and pad)
+                uniform_prob = torch.zeros_like(trans_prob[0])
+                uniform_prob[:mask] = 1.0 / mask  # Uniform over tokens 0 to mask-1
+                trans_prob[mask_pos[0][mask_has_zero_prob], mask_pos[1][mask_has_zero_prob]] = uniform_prob
+            else:
+                trans_prob[mask_pos] = trans_prob[mask_pos] / prob_sum
+        new_xt = _sample_tokens(trans_prob)
+        new_xt[xt == pad] = pad
+        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+        # ——— compute log probabilities for RND ———
+        lp = torch.gather(torch.log(trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+        lp_pre = torch.gather(torch.log(pretrained_trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+        changed_mask = (xt == mask) & (new_xt != mask) & (new_xt != pad)
+        log_policy_step = (lp * changed_mask).sum(dim=1)
+        log_pretrained_step = (lp_pre * changed_mask).sum(dim=1)
+        log_rnd = log_pretrained_step - log_policy_step  # (B,)
+        if i != steps - 1:
+            ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)
+            insertion_rate = (len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+            pretrained_insertion_rate = (pretrained_len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+            log_policy_insert = (ext * torch.log(insertion_rate) - insertion_rate).sum(dim=1)  # (B,)
+            log_pretrained_insert = (ext * torch.log(pretrained_insertion_rate) - pretrained_insertion_rate).sum(dim=1)  # (B,)
+            log_insert_diff = log_pretrained_insert - log_policy_insert  # (B,)
+            log_rnd += log_insert_diff
+            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
+            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+            total_ext = ext.sum(dim=1)
+            valid = xt_len + total_ext <= max_length
+            ext = ext * valid.view(batch_size, 1).long()
+            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+            new_len = xt_len + total_ext  # (B,)
+            xt_tmp = torch.full_like(xt, pad)
+            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
+            xt_tmp[mask_fill] = mask
+            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+            flat_b = batch_idx_L[orig_mask]
+            flat_p = new_pos_orig[orig_mask]
+            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+        else:
+            xt_tmp = new_xt
+        if return_trace:
+            # check if the token was changed
+            for i in range(batch_size):
+                for j in range(max_length):
+                    if xt[i, j] != pad and xt[i, j] != new_xt[i, j]:
+                        sampling_trace[i].append(
+                            SamplingTraceDatapoint(
+                                t=t[i].item(),
+                                event_type="change",
+                                position=j,
+                                token=new_xt[i, j].item(),
+                            )
+                        )
+                # check if a new token was inserted
+                for j in range(max_length):
+                    id = max_length - j - 1
+                    if ext[i, id]:
+                        sampling_trace[i].append(
+                            SamplingTraceDatapoint(
+                                t=t[i].item(),
+                                event_type="insertion",
+                                position=id,
+                                token=mask,
+                            )
+                        )
+        xt = xt_tmp
+        t = t + dt
+    # change rewards for peptides
+    samples = xt.to(device)
+    # store raw token IDs
+    # Decode and strip samples
+    decoded_samples = tokenizer.batch_decode(samples)
+    valid_x_final = []
+    validSequences = []
+    valid_log_rnd = []
+    for idx, seq in enumerate(decoded_samples):
+        # check if the peptide is valid
+        if analyzer.is_peptide(seq):
+            valid_x_final.append(xt[idx])
+            validSequences.append(seq)
+            valid_log_rnd.append(log_rnd[idx])
+    print("len valid sequences:", len(validSequences))
+    # compute multi-objective rewards
+    score_vectors = reward_model(input_seqs=validSequences)
+    scalar_rewards = np.sum(score_vectors, axis=-1)
+    scalar_rewards = torch.as_tensor(scalar_rewards, dtype=torch.float32, device=device)
+    print(f"scalar reward dim{len(scalar_rewards)}")
+    valid_log_rnd = torch.stack(valid_log_rnd, dim=0)
+    log_rnd = valid_log_rnd + (scalar_rewards / alpha) # scale down by alpha
+    valid_x_final = torch.stack(valid_x_final, dim=0)
+    return valid_x_final, log_rnd, scalar_rewards, sampling_trace
+@torch.no_grad()
+def any_order_finetuned_euler_sampler(
+        model, reward_model, analyzer,
+        tokenizer, steps,
+        mask,
+        pad,
+        batch_size,
+        max_length,
+        return_trace = False,
+        dataframe = False,
+        temperature: float = 1.0,
+    ):
+    device = next(model.parameters()).device
+    # initialize all‑pad sequence and trace
+    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)
+    sampling_trace = []
+    dt = 1.0 / steps
+    t = torch.zeros(batch_size, device=device)
+    # precompute row indices for scatter
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    sampling_trace = [[] for _ in range(batch_size)] if return_trace else None
+    for i in range(steps):
+        # ——— predict and convert rates ———
+        pred_rate = model(xt, t)
+        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+        len_rate = pred_rate.length_rate  # (B, L+1)
+        # ——— unmask step (Euler) ———
+        mask_pos = (xt == mask).nonzero(as_tuple=True)
+        unmask_rate[xt != mask] = 0
+        unmask_rate[mask_pos + (mask,)] = 0
+        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+        # add “stay” probability
+        _xt = xt.clone()
+        _xt[xt == pad] = mask
+        trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+        )
+        # Apply temperature scaling
+        if temperature != 1.0:
+            logits = torch.log(trans_prob + 1e-10) / temperature
+            trans_prob = torch.softmax(logits, dim=-1)
+        if i == steps - 1:
+            print("Final step, removing mask token from sampling")
+            trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step
+            # renormalize probabilities to ensure they sum to 1
+            prob_sum = trans_prob[mask_pos].sum(dim=-1, keepdim=True)
+            # avoid division by zero; if all probs are 0, use uniform distribution (excluding mask and pad)
+            mask_has_zero_prob = (prob_sum.squeeze(-1) == 0.0)
+            if mask_has_zero_prob.any():
+                # create uniform distribution over valid tokens (excluding mask and pad)
+                uniform_prob = torch.zeros_like(trans_prob[0])
+                uniform_prob[:mask] = 1.0 / mask  # Uniform over tokens 0 to mask-1
+                trans_prob[mask_pos[0][mask_has_zero_prob], mask_pos[1][mask_has_zero_prob]] = uniform_prob
+            else:
+                # normalize to sum to 1
+                trans_prob[mask_pos] = trans_prob[mask_pos] / prob_sum
+        new_xt = _sample_tokens(trans_prob)
+        new_xt[xt == pad] = pad
+        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+        if i != steps - 1:
+            # gap-wise insertion refactored — compute new length, fill masks, scatter tokens
+            ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)
+            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
+            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+            total_ext = ext.sum(dim=1)
+            valid = xt_len + total_ext <= max_length
+            ext = ext * valid.view(batch_size, 1).long()
+            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+            new_len = xt_len + total_ext  # (B,)
+            xt_tmp = torch.full_like(xt, pad)
+            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
+            xt_tmp[mask_fill] = mask
+            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+            flat_b = batch_idx_L[orig_mask]
+            flat_p = new_pos_orig[orig_mask]
+            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+        else:
+            xt_tmp = new_xt
+        if return_trace:
+            # check if the token was changed
+            for batch_idx in range(batch_size):
+                for j in range(max_length):
+                    if xt[batch_idx, j] != pad and xt[batch_idx, j] != new_xt[batch_idx, j]:
+                        sampling_trace[batch_idx].append(
+                            SamplingTraceDatapoint(
+                                t=t[batch_idx].item(),
+                                event_type="change",
+                                position=j,
+                                token=new_xt[batch_idx, j].item(),
+                            )
+                        )
+                # check if a new token was inserted
+                for j in range(max_length):
+                    id = max_length - j - 1
+                    if ext[batch_idx, id]:
+                        sampling_trace[batch_idx].append(
+                            SamplingTraceDatapoint(
+                                t=t[batch_idx].item(),
+                                event_type="insertion",
+                                position=id,
+                                token=mask,
+                            )
+                        )
+        xt = xt_tmp
+        t = t + dt
+    # start eval
+    samples = xt.to(device)
+    decoded_samples = tokenizer.batch_decode(samples)
+    valid_x_final = []
+    validSequences = []
+    for idx, seq in enumerate(decoded_samples):
+        if analyzer.is_peptide(seq):
+            valid_x_final.append(samples[idx])
+            validSequences.append(seq)
+    print("len valid sequences:", len(validSequences))
+    valid_fraction = len(validSequences) / batch_size
+    if (len(validSequences) != 0):
+        # add scores to log
+        score_vectors = reward_model(input_seqs=validSequences) # (num_children, num_objectives)
+        average_scores = score_vectors.T
+        affinity = average_scores[0]
+        sol = average_scores[1]
+        hemo = average_scores[2]
+        nf = average_scores[3]
+        permeability = average_scores[4]
+    else:
+        zeros = [0.0]
+        affinity = zeros
+        sol = zeros
+        hemo = zeros
+        nf = zeros
+        permeability = zeros
+    if dataframe:
+        df = pd.DataFrame({
+            "Peptide Sequence": validSequences,
+            "Binding Affinity": affinity if len(validSequences) else [0.0],
+            "Solubility": sol if len(validSequences) else [0.0],
+            "Hemolysis": hemo if len(validSequences) else [0.0],
+            "Nonfouling": nf if len(validSequences) else [0.0],
+            "Permeability": permeability if len(validSequences) else [0.0],
+        })
+        return samples, affinity, sol, hemo, nf, permeability, valid_fraction, df
+    return samples, affinity, sol, hemo, nf, permeability, valid_fraction
+@torch.no_grad()
+def mdm_tau_leaping_sampling(
+    model: MaskedDiffusionModule,
+    steps: int,
+    mask: int,
+    pad: int,
+    batch_size: int,
+    max_length: int,
+    return_trace: bool = False,
+    temperature: float = 1.0,
+):
+    assert not return_trace, "Trace is not yet supported"
+    device = next(model.parameters()).device
+    xt = torch.full((batch_size, max_length), mask, dtype=torch.int64, device=device)
+    dt = 1.0 / steps
+    t = torch.zeros(batch_size, device=device)
+    for i in range(steps):
+        # ——— predict and convert rates ———
+        pred = model(xt, t)
+        pred = model.interpolant.to_actual_rate(xt, pred, t)
+        unmask_rate = pred.unmask_rate  # (B, L, V)
+        if i == steps - 1:
+            # last step: deterministic unmask via argmax
+            mask_pos = xt == mask  # (B, L)
+            new_token = unmask_rate.argmax(dim=2)  # (B, L)
+            new_xt = xt.clone()
+            new_xt[mask_pos] = new_token[mask_pos]
+            new_xt = torch.where(xt != mask, xt, new_xt)
+            xt = new_xt
+            t = t + dt
+            continue
+        # tau-leaping via Poisson counts
+        counts = torch.poisson(unmask_rate * dt).long()
+        mask_pos = xt == mask  # (B, L)
+        # zero out non-mask positions and mask→mask
+        counts[~mask_pos.unsqueeze(-1).expand_as(counts)] = 0
+        counts[..., mask] = 0
+        # only accept exactly one event
+        sum_c = counts.sum(dim=2)  # (B, L)
+        one_event = sum_c == 1
+        new_token = counts.argmax(dim=2)  # (B, L)
+        # build new xt
+        new_xt = xt.clone()
+        new_xt[one_event] = new_token[one_event]
+        # keep pads and already-unmasked tokens
+        new_xt = torch.where(xt != mask, xt, new_xt)
+        xt = new_xt
+        t = t + dt
+    return xt, []
+# Not used in production, for debugging purposes
+# Maps a sequence length to its probability weight; calculate_rate and
+# calculate_rate_batch iterate over it as (length, probability) pairs.
+lengths = {4: 0.1, 16: 0.4, 32: 0.4, 64: 0.1}
+def binomial_mass(k, n, p):
+    """
+    Calculate the probability mass function (PMF) for a binomial distribution.
+    Args:
+        k (int): Number of successes
+        n (int): Number of trials
+        p (float): Probability of success in a single trial
+    Returns:
+        float: Probability mass P(X = k)
+    """
+    import math
+    # Calculate binomial coefficient (n choose k)
+    try:
+        binom_coef = math.factorial(n) / (math.factorial(k) * math.factorial(n - k))
+    except ValueError:
+        # Handle cases where k > n or negative values
+        return 0.0
+    # Calculate probability mass
+    return binom_coef * (p ** k) * ((1 - p) ** (n - k))
+def calculate_rate_batch(alpha_t, len_t):
+    """
+    Calculate rate for a batch of alpha_t and len_t values.
+    Args:
+        alpha_t (torch.Tensor): Tensor of shape (batch_size,)
+        len_t (torch.Tensor): Tensor of shape (batch_size,)
+    Returns:
+        torch.Tensor: Tensor of shape (batch_size,) containing calculated rates
+    """
+    batch_size = alpha_t.shape[0]
+    device = alpha_t.device
+    # Initialize tensors for numerator and denominator
+    nom = torch.zeros(batch_size, device=device)
+    denom = torch.zeros(batch_size, device=device)
+    for length, probability in lengths.items():
+        # Create mask for valid entries where len_t <= length
+        valid_mask = (len_t <= length) & (len_t >= 0)
+        if not valid_mask.any():
+            continue
+        valid_indices = valid_mask.nonzero(as_tuple=True)[0]
+        valid_len_t = len_t[valid_indices]
+        valid_alpha_t = alpha_t[valid_indices]
+        # Calculate binomial probabilities efficiently using torch distribution
+        binom_dist = torch.distributions.Binomial(total_count=length, probs=valid_alpha_t)
+        binom_probs = binom_dist.log_prob(valid_len_t).exp()
+        # Update numerator and denominator for valid indices
+        nom[valid_indices] += (length - valid_len_t) * probability * binom_probs
+        denom[valid_indices] += probability * binom_probs
+    # Handle division by zero in a vectorized way
+    result = torch.zeros_like(nom)
+    div_mask = denom > 0
+    result[div_mask] = nom[div_mask] / (denom[div_mask])
+    return result
+# Keep the original function for backward compatibility
+def calculate_rate(alpha_t, len_t):
+    """Legacy scalar version of calculate_rate"""
+    if isinstance(alpha_t, torch.Tensor) and alpha_t.ndim > 0:
+        return calculate_rate_batch(alpha_t, len_t)
+    nom, denom = 0, 0
+    for length, probability in lengths.items():
+        if length >= len_t:
+            nom += (length - len_t) * probability * binomial_mass(len_t, length, alpha_t)
+            denom += probability * binomial_mass(len_t, length, alpha_t)
+    if denom == 0:
+        return 0.0
+    return nom /denom
+@torch.no_grad()
+def any_order_mask_insertion_tau_leaping_sampling(
+    model: torch.nn.Module,
+    steps: int,
+    mask: int,
+    pad: int,
+    batch_size: int,
+    max_length: int,
+    return_trace: bool = False,
+    confidence_based_sampling: bool = True,  # whether to use confidence-based decoding
+    alpha: float = 5.0,  # hyperparameter for window size calculation
+    max_window: int = 32,  # Maximum window size for sliding window
+    confidence_method: str = "prob_diff",  # "position", "top_prob", "prob_diff", "entropy"
+    use_sliding_window: bool = False,  # whether to use sliding window for position selection
+    temperature: float = 1.0,
+) -> SamplingResult:
+    device = next(model.parameters()).device
+    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)
+    sampling_trace = []
+    dt = 1.0 / steps
+    t = torch.zeros(batch_size, device=device)
+    # Precompute row indices for scatter
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    for i in range(steps):
+        # --- predict rates ---
+        pred = model(xt, t)
+        xt_len = (xt != pad).sum(dim=1)
+        pred = model.interpolant.to_actual_rate(xt, pred, t)
+        unmask_rate = pred.unmask_rate  # (B, L, V)
+        len_rate = pred.length_rate  # (B, L+1)
+        if i == steps - 1:
+            # last step: deterministic unmask via argmax
+            mask_pos = xt == mask
+            new_token = unmask_rate.argmax(dim=2)
+            new_xt = xt.clone()
+            new_xt[mask_pos] = new_token[mask_pos]
+            new_xt = torch.where(xt == pad, pad, new_xt)
+            new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+            xt = new_xt
+            t = t + dt
+            continue
+        # --- confidence-based decoding ---
+        if confidence_based_sampling > 0.0:
+            # Confidence-based unmasking (vectorized)
+            mask_positions = (xt == mask)  # (B, L)
+            num_mask_positions = mask_positions.sum(dim=1)  # (B,)
+            # 1. Determine number of tokens to unmask using Poisson
+            unmask_counts = torch.poisson(num_mask_positions.float() * dt).long()  # (B,)
+            # 2. Calculate confidence based on selected method
+            if confidence_method == "position":
+                # Position-based confidence: position i / len(xt)
+                xt_len = (xt != pad).sum(dim=1)  # (B,) - current sequence lengths
+                position_indices = torch.arange(max_length, device=device).unsqueeze(0).expand(batch_size, -1)  # (B, L)
+                confidence = 1.0 - (position_indices.float() / xt_len.unsqueeze(1).float().clamp(min=1))  # (B, L)
+            elif confidence_method == "top_prob":
+                # Top probability confidence
+                import torch.nn.functional as F
+                token_logits = unmask_rate  # (B, L, V) - use the unmask_rate as logits
+                unmask_probs = F.softmax(token_logits, dim=-1)  # (B, L, V)
+                confidence = unmask_probs.max(dim=-1)[0]  # (B, L)
+            elif confidence_method == "prob_diff":
+                # Probability difference confidence (top - second top)
+                import torch.nn.functional as F
+                token_logits = unmask_rate  # (B, L, V)
+                unmask_probs = F.softmax(token_logits, dim=-1)  # (B, L, V)
+                top2_probs, _ = torch.topk(unmask_probs, k=2, dim=-1)  # (B, L, 2)
+                confidence = top2_probs[:, :, 0] - top2_probs[:, :, 1]  # (B, L)
+            elif confidence_method == "entropy":
+                # Entropy-based confidence (lower entropy = higher confidence)
+                import torch.nn.functional as F
+                token_logits = unmask_rate  # (B, L, V)
+                unmask_probs = F.softmax(token_logits, dim=-1)  # (B, L, V)
+                entropy = -torch.sum(unmask_probs * torch.log(unmask_probs + 1e-10), dim=-1)  # (B, L)
+                confidence = -entropy  # (B, L) - negative entropy so lower entropy gives higher confidence
+            else:
+                raise ValueError(f"Unknown confidence_method: {confidence_method}")
+            # 3. Apply window constraint if enabled
+            if use_sliding_window:
+                # Calculate dynamic k for each batch
+                k_values = torch.minimum(
+                    torch.minimum(
+                        (alpha * unmask_counts).long(),
+                        torch.tensor(max_window, device=device)
+                    ), num_mask_positions)  # (B,)
+                # Get cumulative count of mask positions
+                mask_cumsum = mask_positions.cumsum(dim=1)  # (B, L)
+                # Create window mask: position is eligible if it's a mask and within first k masks
+                is_within_window = mask_cumsum <= k_values.unsqueeze(1)  # (B, L)
+                window_mask = mask_positions & is_within_window  # (B, L)
+                # Set confidence to -inf for positions outside the window or non-mask positions
+                confidence = torch.where(window_mask, confidence, torch.tensor(-float('inf'), device=device))
+            else:
+                # No window constraint - only mask positions are eligible
+                confidence = torch.where(mask_positions, confidence, torch.tensor(-float('inf'), device=device))
+            new_xt = xt.clone()
+            # vectorized unmasking
+            max_unmask = unmask_counts.max().item()
+            if max_unmask > 0:
+                _, all_top_indices = torch.topk(confidence, k=max_unmask, dim=1, largest=True)  # (B, max_unmask)
+                # create mask for valid unmask operations
+                unmask_mask = torch.arange(max_unmask, device=device).unsqueeze(0) < unmask_counts.unsqueeze(1)  # (B, max_unmask)
+                most_likely_tokens = unmask_rate.argmax(dim=-1)  # (B, L)
+                selected_positions = all_top_indices[unmask_mask]
+                batch_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand(-1, max_unmask)[unmask_mask]
+                new_xt[batch_indices, selected_positions] = most_likely_tokens[batch_indices, selected_positions]
+        else:
+            # --- tau-leaping unmask via Poisson ---
+            counts = torch.poisson(unmask_rate * dt).long()
+            mask_pos = xt == mask
+            counts[~mask_pos.unsqueeze(-1).expand_as(counts)] = 0
+            counts[..., mask] = 0
+            sum_c = counts.sum(dim=2)
+            one_event = sum_c == 1
+            new_token = counts.argmax(dim=2)
+            new_xt = xt.clone()
+            new_xt[one_event] = new_token[one_event]
+            new_xt = torch.where(xt == pad, pad, new_xt)
+            new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+        # insertion only on non-last
+        if i != steps - 1:
+            # --- Poisson insertion, compute new lengths and fill masks ---
+            ext = torch.poisson(len_rate * dt).long()  # (B, L+1)
+            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
+            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+            total_ext = ext.sum(dim=1)
+            valid = xt_len + total_ext <= max_length
+            ext = ext * valid.view(batch_size, 1).long()
+            # compute prefix sums of insertions
+            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+            new_len = xt_len + total_ext  # (B,)
+            # initialize with pads, then fill mask up to new_len
+            xt_tmp = torch.full_like(xt, pad)
+            mask_pos = pos_idx_L < new_len.view(batch_size, 1)
+            xt_tmp[mask_pos] = mask
+            # shift and scatter original tokens
+            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+            flat_b = batch_idx_L[orig_mask]
+            flat_p = new_pos_orig[orig_mask]
+            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+        else:
+            xt_tmp = new_xt
+        xt = xt_tmp
+        t = t + dt
+        if return_trace:
+            sampling_trace.append(xt)
+    return xt, sampling_trace

a2d2_mol/scripts/run_mol_finetune.slurm ADDED Viewed

	@@ -0,0 +1,200 @@

+#!/bin/bash
+# NOTE: --partition and --qos below are specific to our cluster. Change them
+# (or remove them and pass `--partition` on the `sbatch` command line) to match
+# the partitions/QOS available on yours.
+#SBATCH --job-name=mol-finetune
+#SBATCH --partition=dgx-b200
+#SBATCH --nodes=1
+#SBATCH --gpus-per-node=1
+#SBATCH --cpus-per-task=8
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=80GB
+#SBATCH --time=02-00:00:00
+#SBATCH --output=logs/slurm-%A.%x.log
+# =====================================================================
+# run_mol_finetune.slurm
+#
+# Single-mode job (1 MIG GPU) running ONE finetune_mol experiment.
+# Select which mode to run via the MODE_ID variable below (or override
+# at submit time with `sbatch --export=ALL,MODE_ID=2 ...`):
+#   0) A2D2 (Ours)                – with full planner (alternating)
+#   1) A2D2 w/o quality           – --disable_planner
+#   2) A2D2 w/o insertion planner – --disable_insertion_planner
+#   3) A2D2 w/o unmasking planner – --disable_unmasking_planner
+#
+# The job trains the selected mode then evaluates the resulting
+# checkpoint on the same GPU.
+# =====================================================================
+set -e
+# --- Mode selection ---------------------------------------------------
+# Which experiment to run (0-3). Override with `--export=ALL,MODE_ID=N`.
+MODE_ID="${MODE_ID:-0}"
+# Run prefix
+PREFIX=${SLURM_JOB_ID:-$(date +%Y%m%d_%H%M%S)}
+# --- Paths ------------------------------------------------------------
+# Repo root is resolved at submit time so the job runs from any clone:
+#   - set A2D2_ROOT explicitly, OR
+#   - submit with `sbatch` from the repo root (SLURM sets SLURM_SUBMIT_DIR;
+#     note sbatch copies the script to a spool dir, so we can't rely on the
+#     script's own path here), OR
+#   - run the script directly, falling back to its location on disk.
+if [ -n "${A2D2_ROOT:-}" ]; then
+    HOME_LOC="$A2D2_ROOT"
+elif [ -n "${SLURM_SUBMIT_DIR:-}" ]; then
+    HOME_LOC="$SLURM_SUBMIT_DIR"
+else
+    # This script lives in a2d2_mol/scripts/, so the repo root is two levels up.
+    HOME_LOC="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+fi
+SCRIPT_LOC="$HOME_LOC/a2d2_mol"
+LOG_LOC=$HOME_LOC/logs
+SAVE_DIR=$HOME_LOC/checkpoints/finetune_mol
+RESULTS_DIR=$HOME_LOC/results/mol_ablation
+mkdir -p "$LOG_LOC" "$SAVE_DIR" "$RESULTS_DIR"
+# --- Environment setup ------------------------------------------------
+# Set WANDB_API_KEY in your shell/secret store before submitting (do NOT commit it):
+#   export WANDB_API_KEY=...     or   `wandb login`
+export WANDB_DIR=$HOME_LOC/.wandb
+export WANDB_CONFIG_DIR=$HOME_LOC/.config/wandb
+export WANDB_CACHE_DIR=$HOME_LOC/.cache/wandb
+mkdir -p "$WANDB_DIR" "$WANDB_CONFIG_DIR" "$WANDB_CACHE_DIR"
+export TRITON_CACHE_DIR=$HOME_LOC/.triton/cache
+mkdir -p "$TRITON_CACHE_DIR"
+export TORCHINDUCTOR_CACHE_DIR=$HOME_LOC/.torchinductor/cache
+mkdir -p "$TORCHINDUCTOR_CACHE_DIR"
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+# Force unbuffered stdout/stderr so live training output is flushed to the
+# redirected RUN_LOG (Python block-buffers stdout when it's a file, not a TTY).
+export PYTHONUNBUFFERED=1
+# Activate conda env. Override CONDA_ROOT to point at your conda/miniconda
+# install, or just have `conda` on PATH; override CONDA_ENV if your env name
+# differs from the one created by environment.yml.
+CONDA_ENV="${CONDA_ENV:-a2d2}"
+if [ -n "${CONDA_ROOT:-}" ]; then
+    source "$CONDA_ROOT/bin/activate" "$CONDA_ENV"
+elif command -v conda >/dev/null 2>&1; then
+    source "$(conda info --base)/bin/activate" "$CONDA_ENV"
+else
+    echo "ERROR: conda not found; set CONDA_ROOT to your miniconda install." >&2
+    exit 1
+fi
+PYTHON_EXECUTABLE=$(which python)
+cd "$SCRIPT_LOC"
+# Pretrained base checkpoint
+PRETRAINED_CKPT="$HOME_LOC/pretrained/anylength_mol.ckpt"
+# --- Shared training hyperparameters ----------------------------------
+COMMON_ARGS=(
+    --base_path "$HOME_LOC"
+    --use_quality_filter
+    --noise_removal
+    --wdce_num_replicates 16
+    --pool_size 1000
+    --pool_refresh_fraction 0.3
+    --buffer_size 100
+    --batch_size 200
+    --training_mini_batch_size 20
+    --max_length 256
+    --total_num_steps 256
+    --num_iter 20
+    --resample_every_n_step 10
+    --num_epochs 1000
+    --save_every_n_epochs 100
+    --reset_every_n_step 1
+    --alpha 0.01
+    --no_mcts
+    --schedule_warmup_epochs 20
+    --alternation_frequency 5
+    --num_remasking 3
+    --quality_threshold 0.3
+    --checkpoint_path "$PRETRAINED_CKPT"
+    --grad_clip
+    --qed_only
+    --seed 42
+    --num_training_steps_per_epoch 25
+)
+# --- Shared evaluation hyperparameters --------------------------------
+EVAL_COMMON_ARGS=(
+    --pretrained_ckpt "$PRETRAINED_CKPT"
+    --num_samples 1000
+    --batch_size 50
+    --max_length 256
+    --total_num_steps 256
+    --num_remasking 2
+    --quality_threshold 0.3
+    --seed 42
+)
+# =====================================================================
+# Pick experiment from $MODE_ID
+# =====================================================================
+case "$MODE_ID" in
+    0) MODE="with_planner";         EXTRA_ARGS=() ;;
+    1) MODE="no_planner";           EXTRA_ARGS=(--disable_planner) ;;
+    2) MODE="no_insertion_planner"; EXTRA_ARGS=(--disable_insertion_planner) ;;
+    3) MODE="no_unmasking_planner"; EXTRA_ARGS=(--disable_unmasking_planner) ;;
+    *) echo "Unknown MODE_ID=$MODE_ID (expected 0-3)"; exit 1 ;;
+esac
+RUN_NAME="${PREFIX}_mol_${MODE}"
+RUN_LOG="$LOG_LOC/${RUN_NAME}.log"
+RUN_SAVE_DIR="$SAVE_DIR/${RUN_NAME}"
+RESULTS_SUBDIR="$RESULTS_DIR/${MODE}"
+mkdir -p "$RUN_SAVE_DIR" "$RESULTS_SUBDIR"
+echo "=== Mol finetune (MODE_ID=$MODE_ID) ==="
+echo "Job: ${SLURM_JOB_ID}  Node: $SLURM_NODELIST"
+echo "Mode: $MODE"
+echo "Save dir: $RUN_SAVE_DIR"
+echo "Results dir: $RESULTS_SUBDIR"
+echo "Python: $PYTHON_EXECUTABLE"
+echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-(unset)}"
+# =====================================================================
+# Train
+# =====================================================================
+$PYTHON_EXECUTABLE $SCRIPT_LOC/finetune_mol.py \
+    "${COMMON_ARGS[@]}" \
+    --devices 1 \
+    "${EXTRA_ARGS[@]}" \
+    --save_path_dir "$RUN_SAVE_DIR" \
+    >> "$RUN_LOG" 2>&1
+echo "Training finished for $MODE. Log: $RUN_LOG"
+# =====================================================================
+# Evaluate
+# =====================================================================
+RUN_CKPT=$(ls -t "$RUN_SAVE_DIR"/*/last.ckpt "$RUN_SAVE_DIR"/last.ckpt 2>/dev/null | head -1)
+if [ -z "$RUN_CKPT" ]; then
+    echo "No checkpoint found in $RUN_SAVE_DIR — skipping eval."
+    exit 1
+fi
+echo "Evaluating checkpoint: $RUN_CKPT"
+$PYTHON_EXECUTABLE $SCRIPT_LOC/evaluate_mol_table.py \
+    --checkpoint_path "$RUN_CKPT" \
+    "${EVAL_COMMON_ARGS[@]}" \
+    "${EXTRA_ARGS[@]}" \
+    --output_dir "$RESULTS_SUBDIR" \
+    --device cuda:0 \
+    >> "$RESULTS_SUBDIR/eval.log" 2>&1
+echo "Eval finished for $MODE. CSV: $RESULTS_SUBDIR/eval_metrics_${MODE}.csv"
+conda deactivate

a2d2_mol/scripts/train_mol.sh ADDED Viewed

	@@ -0,0 +1,93 @@

+#!/bin/bash
+#SBATCH --job-name=a2d2-mol-pretrain
+#SBATCH --partition=dgx-b200
+#SBATCH --nodes=1
+#SBATCH --gpus-per-node=2
+#SBATCH --ntasks-per-node=2
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=512GB
+#SBATCH --time=7-00:00:00
+# SLURM's own catch-file (anything printed before the exec redirect below, plus
+# slurm-infra messages). Relative to the submit dir, so submit this script from
+# the a2d2_mol/ directory; the real run output is redirected via exec below.
+#SBATCH --output=logs/slurm/%x_%j.out
+#SBATCH --error=logs/slurm/%x_%j.err
+#
+# Pretrain the any-length insertion MDM on drug-like SAFE molecules on a dgx-b200 node.
+# Submit with:  sbatch scripts/train_mol.sh   (from the a2d2_mol/ directory).
+#
+# DDP is launched by SLURM: one srun task per GPU. --gpus-per-node and
+# --ntasks-per-node must match; change both together (and they override the
+# training.devices value baked into config_mol.yaml via the hydra override below).
+DATE=$(date +%Y%m%d)
+SPECIAL_PREFIX='a2d2-mol'
+# Resolve a2d2_mol/ (which holds train.py + config_mol.yaml) so paths are
+# repo-relative. This script lives in a2d2_mol/scripts/, so the direct-run
+# fallback goes one level up. Under sbatch, BASH_SOURCE points at the spooled
+# copy, so we rely on SLURM_SUBMIT_DIR (submit from the a2d2_mol/ directory).
+if [ -n "${SLURM_SUBMIT_DIR:-}" ]; then
+    SCRIPT_DIR="$SLURM_SUBMIT_DIR"
+else
+    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+fi
+cd "$SCRIPT_DIR"
+# Auto-detect GPUs from the SLURM allocation (falls back to 2 for `bash` runs).
+DEVICES=${SLURM_GPUS_ON_NODE:-${SLURM_GPUS_PER_NODE:-2}}
+NTASKS=${SLURM_NTASKS_PER_NODE:-$DEVICES}
+NODES=${SLURM_NNODES:-1}
+LOG_LOC="$SCRIPT_DIR/logs"
+mkdir -p "$LOG_LOC/slurm"
+exec > "${LOG_LOC}/${DATE}_${SPECIAL_PREFIX}_${SLURM_JOB_ID:-local}.log" 2>&1
+# ---------------------------------------------------------------------------
+# Weights & Biases: log in once on your machine before running this script with
+#   `wandb login`  (or `export WANDB_API_KEY=<your-key>`).
+# Do NOT hardcode your API key here. To disable W&B entirely, uncomment:
+# export WANDB_MODE=disabled
+# ---------------------------------------------------------------------------
+export PYTORCH_ALLOC_CONF=expandable_segments:True
+# Activate the conda env that has the deps (torch / pytorch_lightning / hydra).
+# The batch shell does NOT source ~/.bashrc, so conda is not on PATH. Override
+# CONDA_ROOT to point at your conda/miniconda install, or just have `conda` on
+# PATH; override CONDA_ENV if your env name differs from the one created by
+# environment.yml.
+CONDA_ENV="${CONDA_ENV:-a2d2}"
+if [ -n "${CONDA_ROOT:-}" ]; then
+    source "$CONDA_ROOT/bin/activate" "$CONDA_ENV"
+elif command -v conda >/dev/null 2>&1; then
+    source "$(conda info --base)/bin/activate" "$CONDA_ENV"
+else
+    echo "ERROR: conda not found; set CONDA_ROOT to your miniconda install." >&2
+    exit 1
+fi
+# --- Distributed / NCCL setup (single node, intra-node NVLink) --------------
+ETH_IFACE=$(ip -o -4 addr list | grep -v "127.0.0.1" | grep -E "ens|eth|enp|bond" | head -1 | awk '{print $2}')
+if [ -z "$ETH_IFACE" ]; then
+    ETH_IFACE=$(ip -o -4 addr list | grep -v "127.0.0.1" | grep -v "ibp" | head -1 | awk '{print $2}')
+fi
+export NCCL_IB_DISABLE=1
+export NCCL_SOCKET_FAMILY=AF_INET
+export NCCL_SOCKET_IFNAME=$ETH_IFACE
+export NCCL_P2P_LEVEL=NVL
+export MASTER_ADDR=$(scontrol show hostnames "${SLURM_NODELIST:-$(hostname)}" | head -n 1)
+export MASTER_PORT=$(shuf -i 15000-59999 -n 1)
+export NODE_RANK=${SLURM_NODEID:-0}
+echo "=== a2d2 molecule pretraining (dgx-b200) ==="
+echo "Job ID: ${SLURM_JOB_ID:-local}  Node: ${SLURM_NODELIST:-$(hostname)}  GPUs: $DEVICES  Tasks: $NTASKS"
+# --task mol makes train.py load config_mol.yaml; the hydra overrides pin
+# devices/nodes to the SLURM allocation so the two never drift apart.
+srun --ntasks-per-node=$NTASKS python train.py --task mol \
+    training.devices=$DEVICES \
+    training.nodes=$NODES
+conda deactivate

a2d2_mol/train.py ADDED Viewed

	@@ -0,0 +1,216 @@

+import torch
+import pytorch_lightning as pl
+from pytorch_lightning.loggers import WandbLogger
+from pytorch_lightning.callbacks import ModelCheckpoint
+import os
+import sys
+import argparse
+import hydra
+from omegaconf import OmegaConf
+from datetime import datetime
+# Directory containing this file and the config_*.yaml files (used by Hydra below).
+CONFIG_DIR = os.path.dirname(os.path.abspath(__file__))
+# Add the repo root (A2D2/) to sys.path so top-level packages like lightning_modules resolve.
+sys.path.insert(0, os.path.dirname(CONFIG_DIR))
+import wandb
+from lightning_modules import AnyOrderInsertionFlowModule
+torch.set_printoptions(threshold=10_000)
+torch.set_float32_matmul_precision("high")
+# Disable DDP optimizer due to incompatibility with flex_attention higher-order ops
+torch._dynamo.config.optimize_ddp = False
+def train(config):
+	wandb_logger = None
+	# set the random seed
+	pl.seed_everything(42)
+	torch.manual_seed(42)
+	# Only initialize wandb on rank 0 to avoid multiple runs
+	if int(os.environ.get("LOCAL_RANK", 0)) == 0:
+		wandb.init(
+			project=config.wandb.project,
+			name=config.wandb.name,
+			config=OmegaConf.to_container(config, resolve=True),  # Convert to dict
+			dir=config.wandb.path
+		)
+		wandb_logger = WandbLogger(
+				project=wandb.run.project,
+				name=wandb.run.name,
+				log_model=False,  # Disable checkpoint uploading to save disk space
+			)
+	# Modify config to add timestamp to checkpoint directory
+	OmegaConf.set_struct(config, False)
+	time_string = datetime.now().strftime("%Y%m%d-%H%M%S")
+	config.training.checkpoint_dir = os.path.join(
+		config.training.checkpoint_dir, time_string
+	)
+	OmegaConf.set_struct(config, True)
+	# Create checkpoint directory
+	os.makedirs(config.training.checkpoint_dir, exist_ok=True)
+	# Setup data module - check if using HuggingFace dataset
+	if hasattr(config, 'hf_dataset'):
+		# Imported lazily: the HF/SAFE path is only used by the molecule configs,
+		# which keep mol_dataset.py (and its `safe` dependency) in a2d2_mol/.
+		from mol_dataset import setup_hf_data_and_update_config
+		print(f"Using HuggingFace dataset: {config.hf_dataset.name}")
+		data_module = setup_hf_data_and_update_config(
+			config,
+			dataset_name=config.hf_dataset.name,
+			smiles_column=config.hf_dataset.get('smiles_column', 'smiles')
+		)
+	else:
+		# Imported lazily: the local (arrow) path is used by the peptide config,
+		# which keeps dataloading_for_dynamic_batching.py in a2d2_pep/.
+		from dataloading_for_dynamic_batching import setup_data_and_update_config
+		print("Using local dataset")
+		data_module = setup_data_and_update_config(config)
+	module = AnyOrderInsertionFlowModule(config)
+	# Initialize trainer
+	# Configure trainer arguments
+	# Map torch_dtype to Lightning precision
+	dtype_str = config.model.get('torch_dtype', 'bfloat16')
+	precision_map = {
+		'float32': '32-true',
+		'float16': '16-mixed',
+		'bfloat16': 'bf16-mixed'
+	}
+	precision = precision_map.get(dtype_str, 'bf16-mixed')
+	trainer_kwargs = dict(
+		num_nodes=config.training.nodes,
+		accelerator="gpu",
+		devices=config.training.devices,
+		strategy="ddp",
+		precision=precision,
+		accumulate_grad_batches=(
+			config.training.batch_size
+			// (
+				config.training.per_gpu_batch_size
+				* config.training.nodes
+				* config.training.devices
+			)
+		),
+		log_every_n_steps=10,
+		enable_checkpointing=True,
+		default_root_dir=config.training.checkpoint_dir,
+		gradient_clip_val=1.0,
+	)
+	# Only one of max_steps or max_epochs will be used
+	if config.training.max_steps is not None:
+		trainer_kwargs["max_steps"] = config.training.max_steps
+	elif config.training.num_epochs is not None:
+		trainer_kwargs["max_epochs"] = config.training.num_epochs
+		config.training.max_steps = config.training.max_steps
+	else:
+		raise ValueError(
+			"Either max_steps or num_epochs must be specified in the config"
+		)
+	if config.training.warmup_steps is None:
+		config.training.warmup_steps = int(config.training.max_steps * 0.01)
+	# Add ModelCheckpoint callback to save the checkpoint when validation loss is at a new low
+	checkpoint_callback = ModelCheckpoint(
+		monitor="train/total_loss",
+		mode="min",
+		save_top_k=config.training.save_top_k,
+		save_last=True,
+		filename="epoch-{epoch:02d}-train_loss-{train/total_loss:.4f}",
+		dirpath=config.training.checkpoint_dir,
+		# Don't use val_loss in filename for periodic saves - causes failures when val doesn't run
+		auto_insert_metric_name=False
+	)
+	# Add separate callback for periodic saves (no val_loss dependency). Use
+	# step-based saves for streaming datasets (save_every_n_steps) and epoch-based
+	# saves otherwise (save_every_n_epochs); whichever the config provides.
+	save_every_n_steps = config.training.get('save_every_n_steps', None)
+	save_every_n_epochs = config.training.get('save_every_n_epochs', None)
+	if save_every_n_steps is not None:
+		periodic_checkpoint_callback = ModelCheckpoint(
+			save_top_k=-1,  # Save all periodic checkpoints
+			filename="step-{step:08d}",
+			dirpath=config.training.checkpoint_dir,
+			every_n_train_steps=save_every_n_steps,
+			auto_insert_metric_name=False
+		)
+	elif save_every_n_epochs is not None:
+		periodic_checkpoint_callback = ModelCheckpoint(
+			save_top_k=-1,  # Save all periodic checkpoints
+			filename="epoch-{epoch:02d}",
+			dirpath=config.training.checkpoint_dir,
+			every_n_epochs=save_every_n_epochs,
+			auto_insert_metric_name=False
+		)
+	else:
+		raise ValueError(
+			"Either save_every_n_steps or save_every_n_epochs must be specified in the config"
+		)
+	trainer_kwargs["callbacks"] = [checkpoint_callback, periodic_checkpoint_callback]
+	if wandb_logger is not None:
+		trainer_kwargs["logger"] = wandb_logger
+	trainer = pl.Trainer(**trainer_kwargs)
+	# Train the model
+	ckpt_path = None
+	if "resume_path" in config.training:
+		ckpt_path = config.training.resume_path
+	trainer.fit(module,
+             datamodule=data_module,
+             ckpt_path=ckpt_path)
+	# Only finish wandb on rank 0
+	if int(os.environ.get("LOCAL_RANK", 0)) == 0:
+		wandb.finish()
+if __name__ == '__main__':
+	# Parse arguments to get config name
+	parser = argparse.ArgumentParser()
+	parser.add_argument('--config_name', type=str, default='config',
+	                   help='Name of the config file to use')
+	parser.add_argument('--task', type=str, default=None,
+	                   help='Task name (uses config_{task}.yaml)')
+	# Parse known args (hydra will handle the rest)
+	args, unknown = parser.parse_known_args()
+	# Determine config name from task or config_name
+	if args.task:
+		config_name = f'config_{args.task}'
+	else:
+		config_name = args.config_name
+	print(f"Using config: {config_name}.yaml")
+	# Add config name to Hydra overrides (this persists across DDP subprocesses)
+	if '--config-name' not in unknown and f'--config-name={config_name}' not in unknown:
+		unknown.insert(0, f'--config-name={config_name}')
+	# Reconstruct sys.argv for hydra
+	sys.argv = [sys.argv[0]] + unknown
+	# Define main function with default config (will be overridden by command line)
+	@hydra.main(version_base=None,
+	           config_path=CONFIG_DIR,
+	           config_name='config')
+	def main(config):
+		"""Main entry point for training"""
+		train(config)
+	main()

a2d2_pep/README.md ADDED Viewed

	@@ -0,0 +1,145 @@

+# A2D2 for Multi-Objective Therapeutic Peptide Generation 🧫
+This part of the code fine-tunes an **any-length masked diffusion model (MDM)** over peptide SMILES with **A2D2** (Fine-Tuning Any-Length Discrete Diffusion for Adaptive Decoding) to optimize **five therapeutic properties simultaneously**: binding affinity to a target protein, solubility, non-hemolysis, non-fouling, and cell-membrane permeability.
+A2D2 jointly fine-tunes the insertion and unmasking policies together with **insertion and unmasking quality predictors**, generating peptides via **Adaptive Joint Decoding (AJD)** that remasks low-quality tokens and drops low-quality insertions to sample from the reward-tilted distribution while preserving generation quality.
+Peptides are represented as **SMILES** strings and tokenized with the SMILES Pair Encoding tokenizer (vocabulary size `V = 587`) from [PeptideCLM](https://pubs.acs.org/doi/10.1021/acs.jcim.4c01443). Generated SMILES are decoded and validity-checked with the `SMILES2PEPTIDE` filter from [PepTune](https://arxiv.org/abs/2412.17780).
+The codebase is partially built upon [FlexMDM (Kim et al., 2025)](https://github.com/brianlck/FlexMDM/tree/main) and [TR2-D2 (Tang et al., 2025)](https://github.com/sophtang/TR2-D2/tree/main).
+## Environment Installation
+```
+# from the repository root
+conda env create -f environment.yml
+conda activate a2d2
+```
+The peptide scripts share the `a2d2` environment with the molecule and language experiments. See the root [`environment.yml`](../environment.yml) for the `flash-attn` install step.
+## Model Pretrained Weights
+A2D2 fine-tunes a pretrained any-length insertion MDM trained on ~11M peptide SMILES (7,451 sequences from CycPeptMPDB, 825,632 from SmProt, and ~10M modified peptides from CycloPs). Download the base checkpoint and place it at:
+```
+A2D2/pretrained/anylength_pep.ckpt
+```
+```bash
+# from the repository root
+pip install gdown
+mkdir -p pretrained
+gdown 1K8yxM-omh-MuPo0EG6UyxHZLk3HehoJc -O pretrained/anylength_pep.ckpt
+```
+(Or download manually from https://drive.google.com/file/d/1K8yxM-omh-MuPo0EG6UyxHZLk3HehoJc/view?usp=drive_link — a plain `wget`/`curl` of the link saves Google's HTML warning page, not the checkpoint.)
+This is the default `--checkpoint_path`; pass `--checkpoint_path` to override it.
+The reward classifiers (binding-affinity Transformer, plus XGBoost predictors for solubility, hemolysis, non-fouling, and permeability) and the SMILES PE tokenizer ship with the repo under [`pep_scoring/`](pep_scoring); no separate download is required. The PeptideCLM embedding model is fetched automatically from the Hugging Face Hub (`aaronfeller/PeptideCLM-23M-all`) on first run.
+## Pretraining the Any-Length Model
+If you only want to fine-tune with A2D2, download the released `anylength_pep.ckpt` above and skip this section. Follow these steps to reproduce the base checkpoint by pretraining the any-length insertion MDM from scratch.
+### 1. Download the pretraining dataset
+The pretraining corpus is ~11M peptide SMILES (7,451 from CycPeptMPDB, 825,632 from SmProt, and ~10M modified peptides from CycloPs), already tokenized with the in-repo SMILES PE tokenizer and saved as a Hugging Face `arrow` dataset (with `train`/`val` splits) via `save_to_disk`.
+Download the archive and unpack it into [`data/`](data):
+```bash
+# from a2d2_pep/
+pip install gdown
+gdown https://drive.google.com/uc?id=1yCDr641WVjCtECg3nbG0nsMNu8j7d7gp -O 11M_peptide_smiles.zip
+mkdir -p data
+unzip 11M_peptide_smiles.zip -d data/
+# result: a2d2_pep/data/11M_peptide_smiles/{train,val}/...
+```
+This is the default `training.data_path` in [`config_pep.yaml`](config_pep.yaml). To store the dataset elsewhere, set `training.data_path` (absolute, or relative to `a2d2_pep/`).
+### 2. Configure
+Pretraining is driven by [`config_pep.yaml`](config_pep.yaml). Key fields:
+| Field | Default | Notes |
+|-------|---------|-------|
+| `training.data_path` | `data/11M_peptide_smiles` | Preprocessed arrow dataset from step 1. |
+| `training.devices` | `4` | GPUs per node (DDP). |
+| `training.batch_size` | `1024` | Global batch; gradient accumulation is derived automatically from `per_gpu_batch_size`. |
+| `training.max_steps` | `1000000` | Total optimizer steps. |
+| `training.learning_rate` | `3e-4` | AdamW LR with `warmup_steps: 2000`. |
+| `training.checkpoint_dir` | `checkpoints/peptides` | A timestamped subdirectory is created per run. |
+| `interpolant.max_length` | `1024` | Max token length. |
+### 3. Pre-training Any-Length Peptide Model
+Log in to Weights & Biases once (`wandb login`), or set `export WANDB_MODE=disabled` to skip logging. Then submit the SLURM job:
+```bash
+# from a2d2_pep/
+sbatch scripts/train_pep.sh
+```
+`train_pep.sh` is a SLURM batch script that requests one `dgx-b200` node with 4 full B200 GPUs and launches DDP via `srun` (one task per GPU), running the equivalent of:
+```bash
+python train.py --task pep
+```
+It activates the conda env (`CONDA_ENV`, defaults to the `peptune` env) from `CONDA_ROOT` (defaults to the shared miniconda install) — the batch shell does not source `~/.bashrc`, so override these env vars if your install or env path differs. The GPU count is auto-detected from the SLURM allocation and passed to hydra as `training.devices`/`training.nodes`, so to scale just change `--gpus-per-node` and `--ntasks-per-node` together at the top of the script (they must match). `--task pep` makes `train.py` load `config_pep.yaml`.
+Checkpoints are written to `checkpoints/peptides/<timestamp>/` (use `last.ckpt` / the best `train_loss` checkpoint as the `--checkpoint_path` for fine-tuning); the run log goes to `logs/<date>_a2d2-peptide_<jobid>.log` and SLURM's catch-file to `logs/slurm/`. To resume, add a `training.resume_path: /path/to/last.ckpt` entry to the config.
+## Fine-Tune with A2D2
+All paths resolve relative to the repository, so the scripts run from any checkout. Before running, create the output directories `A2D2/checkpoints`, `A2D2/results`, and `A2D2/logs` (the script also creates them on demand). Fine-tuning curves and a `<prot_name>_generation_results.csv` are written to `<base_path>/results/<run_name>/`, and checkpoints to `--save_path_dir`.
+Choose a target protein with `--prot_name` (looked up in the built-in `PROTEINS` table — e.g. `glp1` for GLP-1R or `glast` for GLAST), or supply an arbitrary target with `--prot_seq <amino acid sequence>`.
+#### Available `--prot_name` targets
+The named targets and their amino-acid sequences are defined in the `PROTEINS` dict in [`finetune_quality.py`](finetune_quality.py) (search for `PROTEINS = {`). The default is `glast`; passing a name not in the table raises an error listing the valid keys. To add a new target, add a `'<key>': '<sequence>'` entry there, or skip the table entirely with `--prot_seq`.
+| `--prot_name` | Target |
+|---------------|--------|
+| `tfr` | Transferrin receptor (TfR) |
+| `glp1` | GLP-1 receptor (GLP-1R) |
+### Single run
+[`scripts/run_peptide_finetune.slurm`](scripts/run_peptide_finetune.slurm) runs a single `finetune_quality.py` experiment on one MIG GPU, then evaluates the resulting checkpoint. It bundles the hyperparameter set from the peptide column of the fine-tuning table in the paper — replicates `R = 8`, buffer size `B = 50`, resample interval `N_resample = 10`, gradient steps per iteration `N_update = 10`, alternation frequency `N_alt = 5`, warmup `N_warmup = 20`, sampling steps `N_steps = 256`, training mini-batch `10`, reward scaling `α = 0.1`, quality threshold `μ_min = 0.5`, and `--num_obj 5` — so you don't have to pass them by hand.
+The script resolves the repo root automatically — `$A2D2_ROOT` if set, else the `sbatch` submit directory, else the script's own location — so either submit from the repo root or export your clone path. Set `CONDA_ROOT` (your miniconda install) and, if needed, `CONDA_ENV` (defaults to `peptune`) and `WANDB_ENTITY`:
+```bash
+export A2D2_ROOT=/path/to/your/A2D2     # absolute path to your clone
+export CONDA_ROOT=/path/to/miniconda3   # or just have `conda` on PATH
+export WANDB_ENTITY=your_wandb_entity   # optional
+sbatch scripts/run_peptide_finetune.slurm
+```
+Select which variant to run with `MODE_ID` (default `0`): `0` = A2D2 (full planner), `1` = `--disable_planner`, `2` = `--disable_insertion_planner`, `3` = `--disable_unmasking_planner`. Override at submit time:
+```bash
+sbatch --export=ALL,MODE_ID=2 scripts/run_peptide_finetune.slurm
+```
+The target protein is set by the `PROT_NAME` variable near the top of the script (default `tfr`); edit it to one of the named targets above (or any key in the `PROTEINS` table). The pretrained base checkpoint is read from `$A2D2_ROOT/pretrained/anylength_pep.ckpt`. Outputs land in `checkpoints/finetune_test_peptides_<prot>/<job>_peptide_<prot>_<mode>/` and `results/peptide_test_ablation_<prot>/<mode>/`.
+### Key arguments
+- `--prot_name` / `--prot_seq` — target protein (named lookup, or a raw amino-acid sequence).
+- `--alternation_frequency` — epochs to train each of {policy, planner} before alternating.
+- `--alpha` — reward-tilting temperature (smaller = stronger reward optimization).
+- `--buffer_size`, `--resample_every_n_step` — replay-buffer size and how often it is regenerated.
+### Ablation flags
+| Flag | Variant |
+|------|---------|
+| *(none)* | A2D2 w/ insertion + unmasking quality (alternation) |
+| `--disable_planner` | A2D2 w/o quality (policy only, no remasking) |
+| `--disable_insertion_planner` | A2D2 w/o insertion quality |
+| `--disable_unmasking_planner` | A2D2 w/o unmasking/remasking quality |
+| `--joint_training` | train policy + quality heads jointly (no alternation) |
+During buffer generation only sequences passing the `SMILES2PEPTIDE` validity filter are retained; the scalarized multi-objective reward is added to the log Radon–Nikodym derivative of each sequence. Fine-tuning runs on a single GPU (`--devices 1`).
+## Evaluation
+Evaluation runs automatically every `--eval_every_n_epochs` epochs and at the end of training. It samples from the current model and reports the fraction of valid peptides along with the five therapeutic rewards (binding affinity, solubility, non-hemolysis, non-fouling, permeability), saving per-objective curves and `<prot_name>_generation_results.csv` under `<base_path>/results/<run_name>/`.
+To resume a run, pass `--resume_ckpt /path/to/last.ckpt` (restores epoch, optimizer, and planner state; new checkpoints continue in the same directory).

a2d2_pep/config_pep.yaml ADDED Viewed

	@@ -0,0 +1,50 @@

+trainer: "any-order-flow"
+dataset: "peptides"
+model:
+  hidden_size: 768
+  n_heads: 12
+  cond_dim: 128
+  dropout: 0.05
+  n_blocks: 12
+interpolant:
+  type: "any-order"
+  tokens: null # filled in automatically
+  pad_token: null # filled in automatically
+  mask_token: null # filled in automatically
+  max_length: 1024
+  insert_schedule:
+    type: "linear"
+  unmask_schedule:
+    type: "linear"
+training:
+  only_embed_insert: true
+  batch_size: 1024
+  per_gpu_batch_size: 64 # Gradient accumulation happens automatically
+  cpus: 4
+  learning_rate: 3e-4
+  nodes: 1
+  devices: 4
+  max_steps: 1000000
+  weight_decay: 0.03
+  # Path to the preprocessed (arrow) pretraining dataset; see README for the download link.
+  # Relative paths resolve against a2d2_pep/. Defaults to a2d2_pep/data/11M_peptide_smiles.
+  data_path: "data/11M_peptide_smiles"
+  checkpoint_dir: "checkpoints/peptides"
+  save_top_k: 1
+  save_every_n_epochs: 1
+  loss_fn:
+    unmask: "elbo"
+    insert: "expectation"
+  reset_lr: false
+  warmup_steps: 2000
+  ema_decay: 0.9999
+  filter_max_length: false
+wandb:
+  entity: null  # set to your W&B entity, or leave null to use the default
+  project: "a2d2-pep"
+  name: "a2d2-pep"
+  path: "./wandb"

a2d2_pep/data/dataloading_for_dynamic_batching.py ADDED Viewed

	@@ -0,0 +1,189 @@

+#!/usr/bin/env python
+import os
+import torch
+from torch.utils.data import Dataset, DataLoader
+from datasets import Dataset,load_from_disk
+import sys
+import pytorch_lightning as pl
+from pep_scoring.tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
+from functools import partial
+import re
+# Directory containing this file; used to resolve the in-repo tokenizer files.
+_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+class DynamicBatchingDataset(Dataset):
+    def __init__(self, dataset_dict, tokenizer):
+        print('Initializing dataset...')
+        self.dataset_dict = {
+            'attention_mask': [torch.tensor(item) for item in dataset_dict['attention_mask']],
+            'input_ids': [torch.tensor(item) for item in dataset_dict['input_ids']],
+            'labels': dataset_dict['labels']
+        }
+        self.tokenizer = tokenizer
+    def __len__(self):
+        return len(self.dataset_dict['attention_mask'])
+    def __getitem__(self, idx):
+        if isinstance(idx, int):
+            return {
+                'input_ids': self.dataset_dict['input_ids'][idx],
+                'attention_mask': self.dataset_dict['attention_mask'][idx],
+                'labels': self.dataset_dict['labels'][idx]
+            }
+        elif isinstance(idx, list):
+            return {
+                'input_ids': [self.dataset_dict['input_ids'][i] for i in idx],
+                'attention_mask': [self.dataset_dict['attention_mask'][i] for i in idx],
+                'labels': [self.dataset_dict['labels'][i] for i in idx]
+            }
+        else:
+            raise ValueError(f"Expected idx to be int or list, but got {type(idx)}")
+class CustomDataModule(pl.LightningDataModule):
+    def __init__(self, dataset_path, tokenizer):
+        super().__init__()
+        self.dataset = load_from_disk(dataset_path)
+        self.tokenizer = tokenizer
+    def peptide_bond_mask(self, smiles_list):
+        """
+        Returns a mask with shape (batch_size, seq_length) that has 1 at the locations
+        of recognized bonds in the positions dictionary and 0 elsewhere.
+        Args:
+            smiles_list: List of peptide SMILES strings (batch of SMILES strings).
+        Returns:
+            np.ndarray: A mask of shape (batch_size, seq_length) with 1s at bond positions.
+        """
+        # Initialize the batch mask
+        batch_size = len(smiles_list)
+        max_seq_length = 1035 #max(len(smiles) for smiles in smiles_list)  # Find the longest SMILES
+        mask = torch.zeros((batch_size, max_seq_length), dtype=torch.int)  # Mask filled with zeros
+        bond_patterns = [
+            (r'OC\(=O\)', 'ester'),
+            (r'N\(C\)C\(=O\)', 'n_methyl'),
+            (r'N[12]C\(=O\)', 'peptide'),  # Pro peptide bonds
+            (r'NC\(=O\)', 'peptide'),  # Regular peptide bonds
+            (r'C\(=O\)N\(C\)', 'n_methyl'),
+            (r'C\(=O\)N[12]?', 'peptide')
+        ]
+        for batch_idx, smiles in enumerate(smiles_list):
+            positions = []
+            used = set()
+            # Identify bonds
+            for pattern, bond_type in bond_patterns:
+                for match in re.finditer(pattern, smiles):
+                    if not any(p in range(match.start(), match.end()) for p in used):
+                        positions.append({
+                            'start': match.start(),
+                            'end': match.end(),
+                            'type': bond_type,
+                            'pattern': match.group()
+                        })
+                        used.update(range(match.start(), match.end()))
+            # Update the mask for the current SMILES
+            for pos in positions:
+                mask[batch_idx, pos['start']:pos['end']] = 1
+        return mask
+    def peptide_token_mask(self, smiles_list, token_lists):
+        """
+        Returns a mask with shape (batch_size, num_tokens) that has 1 for tokens
+        where any part of the token overlaps with a peptide bond, and 0 elsewhere.
+        Args:
+            smiles_list: List of peptide SMILES strings (batch of SMILES strings).
+            token_lists: List of tokenized SMILES strings (split into tokens).
+        Returns:
+            np.ndarray: A mask of shape (batch_size, num_tokens) with 1s for peptide bond tokens.
+        """
+        # Initialize the batch mask
+        batch_size = len(smiles_list)
+        token_seq_length = max(len(tokens) for tokens in token_lists)  # Find the longest tokenized sequence
+        tokenized_masks = torch.zeros((batch_size, token_seq_length), dtype=torch.int)  # Mask filled with zeros
+        atomwise_masks = self.peptide_bond_mask(smiles_list)
+        for batch_idx, atomwise_mask in enumerate(atomwise_masks):
+            token_seq = token_lists[batch_idx]
+            atom_idx = 0
+            for token_idx, token in enumerate(token_seq):
+                if token_idx != 0 and token_idx != len(token_seq) - 1:
+                    if torch.sum(atomwise_mask[atom_idx:atom_idx+len(token)]) >= 1:
+                        tokenized_masks[batch_idx][token_idx] = 1
+                    atom_idx += len(token)
+        return tokenized_masks
+    def collate_fn(self, batch):
+        item = batch[0]
+        token_array = self.tokenizer.get_token_split(item['input_ids'])
+        bond_mask = self.peptide_token_mask(item['labels'], token_array)
+        return {
+            'input_ids': item['input_ids'],
+            'attention_mask': item['attention_mask'],
+            'bond_mask': bond_mask
+        }
+    def train_dataloader(self):
+        train_dataset = DynamicBatchingDataset(self.dataset['train'], tokenizer=self.tokenizer)
+        return DataLoader(
+            train_dataset,
+            batch_size=1,
+            collate_fn=self.collate_fn,  # Use the instance method
+            shuffle=True,
+            num_workers=12,
+            pin_memory=True
+        )
+    def val_dataloader(self):
+        val_dataset = DynamicBatchingDataset(self.dataset['val'], tokenizer=self.tokenizer)
+        return DataLoader(
+            val_dataset,
+            batch_size=1,
+            collate_fn=self.collate_fn,  # Use the instance method
+            num_workers=8,
+            pin_memory=True
+        )
+def setup_data_and_update_config(config):
+    """
+    Get the dataset and update the config with token information for text datasets.
+    """
+    # SMILES Pair Encoding tokenizer ships with the repo under pep_scoring/tokenizer/.
+    tokenizer = SMILES_SPE_Tokenizer(
+        os.path.join(_THIS_DIR, 'pep_scoring', 'tokenizer', 'new_vocab.txt'),
+        os.path.join(_THIS_DIR, 'pep_scoring', 'tokenizer', 'new_splits.txt'),
+    )
+    config.interpolant.tokens = len(tokenizer)
+    config.interpolant.pad_token = tokenizer.pad_token_id
+    config.interpolant.mask_token = tokenizer.mask_token_id
+    # Path to the preprocessed (arrow) pretraining dataset saved via `save_to_disk`.
+    # Download instructions are in the README; override with `training.data_path` in the config.
+    data_path = config.training.get('data_path', os.path.join('data', '11M_peptide_smiles'))
+    if not os.path.isabs(data_path):
+        data_path = os.path.join(_THIS_DIR, data_path)
+    if not os.path.exists(data_path):
+        raise FileNotFoundError(
+            f"Pretraining dataset not found at '{data_path}'. Download it (see a2d2_pep/README.md, "
+            "'Pretraining the Any-Length Model') and set `training.data_path` in config_pep.yaml."
+        )
+    data_module = CustomDataModule(data_path, tokenizer)
+    return data_module

a2d2_pep/data/dataset.py ADDED Viewed

	@@ -0,0 +1,207 @@

+import re
+import torch
+import utils
+from torch.utils.data import Dataset, DataLoader
+import pytorch_lightning as pl
+from functools import partial
+import sys
+class CustomDataset(Dataset):
+    def __init__(self, dataset, indices):
+        self.dataset = dataset
+        self.indices = indices
+    def __len__(self):
+        return len(self.indices)
+    def __getitem__(self, idx):
+        actual_idx = int(self.indices[idx])
+        item = self.dataset[actual_idx]
+        return item
+# for weighting losses of peptide bonds
+def peptide_bond_mask(smiles_list):
+    """
+    Returns a mask with shape (batch_size, seq_length) that has 1 at the locations
+    of recognized bonds in the positions dictionary and 0 elsewhere.
+    Args:
+        smiles_list: List of peptide SMILES strings (batch of SMILES strings).
+    Returns:
+        np.ndarray: A mask of shape (batch_size, seq_length) with 1s at bond positions.
+    """
+    # Initialize the batch mask
+    batch_size = len(smiles_list)
+    max_seq_length = max(len(smiles) for smiles in smiles_list)  # Find the longest SMILES
+    mask = torch.zeros((batch_size, max_seq_length), dtype=torch.int)  # Mask filled with zeros
+    bond_patterns = [
+        (r'OC\(=O\)', 'ester'),
+        (r'N\(C\)C\(=O\)', 'n_methyl'),
+        (r'N[12]C\(=O\)', 'peptide'),  # Pro peptide bonds
+        (r'NC\(=O\)', 'peptide'),  # Regular peptide bonds
+        (r'C\(=O\)N\(C\)', 'n_methyl'),
+        (r'C\(=O\)N[12]?', 'peptide')
+    ]
+    for batch_idx, smiles in enumerate(smiles_list):
+        positions = []
+        used = set()
+        # Identify bonds
+        for pattern, bond_type in bond_patterns:
+            for match in re.finditer(pattern, smiles):
+                if not any(p in range(match.start(), match.end()) for p in used):
+                    positions.append({
+                        'start': match.start(),
+                        'end': match.end(),
+                        'type': bond_type,
+                        'pattern': match.group()
+                    })
+                    used.update(range(match.start(), match.end()))
+        # Update the mask for the current SMILES
+        for pos in positions:
+            mask[batch_idx, pos['start']:pos['end']] = 1
+    return mask
+def peptide_token_mask(smiles_list, token_lists):
+    """
+    Returns a mask with shape (batch_size, num_tokens) that has 1 for tokens
+    where any part of the token overlaps with a peptide bond, and 0 elsewhere.
+    Args:
+        smiles_list: List of peptide SMILES strings (batch of SMILES strings).
+        token_lists: List of tokenized SMILES strings (split into tokens).
+    Returns:
+        np.ndarray: A mask of shape (batch_size, num_tokens) with 1s for peptide bond tokens.
+    """
+    # Initialize the batch mask
+    batch_size = len(smiles_list)
+    token_seq_length = max(len(tokens) for tokens in token_lists)  # Find the longest tokenized sequence
+    tokenized_masks = torch.zeros((batch_size, token_seq_length), dtype=torch.int)  # Mask filled with zeros
+    atomwise_masks = peptide_bond_mask(smiles_list)
+    for batch_idx, atomwise_mask in enumerate(atomwise_masks):
+        token_seq = token_lists[batch_idx]
+        atom_idx = 0
+        for token_idx, token in enumerate(token_seq):
+            if token_idx != 0 and token_idx != len(token_seq) - 1:
+                if torch.sum(atomwise_mask[atom_idx:atom_idx+len(token)]) >= 1:
+                    tokenized_masks[batch_idx][token_idx] = 1
+                atom_idx += len(token)
+    return tokenized_masks
+def extract_amino_acid_sequence(helm_string):
+    """
+    Extracts the amino acid sequence from a HELM peptide notation and outputs it as an array,
+    removing any brackets around each amino acid.
+    Args:
+        helm_string (str): The HELM notation string for a peptide.
+    Returns:
+        list: A list containing each amino acid in sequence without brackets.
+    """
+    # Use regex to find the pattern within `{}` brackets following "PEPTIDE" followed by a number
+    matches = re.findall(r'PEPTIDE\d+\{([^}]+)\}', helm_string)
+    if matches:
+        # Join all matched sequences and split by dots to get individual amino acids
+        amino_acid_sequence = []
+        for match in matches:
+            sequence = match.replace('[', '').replace(']', '').split('.')
+            amino_acid_sequence.extend(sequence)
+        return amino_acid_sequence
+    else:
+        return "Invalid HELM notation or no peptide sequence found."
+def helm_collate_fn(batch, tokenizer):
+    sequences = [item['HELM'] for item in batch]
+    max_len = 0
+    for sequence in sequences:
+        seq_len = len(extract_amino_acid_sequence(sequence))
+        if seq_len > max_len:
+            max_len = seq_len
+    tokens = tokenizer(sequences, return_tensors='pt', padding=True, truncation=True, max_length=1024)
+    return {
+        'input_ids': tokens['input_ids'],
+        'attention_mask': tokens['attention_mask']
+    }
+def collate_fn(batch, tokenizer):
+    """Standard data collator that truncates/pad sequences based on max_length"""
+    valid_sequences = []
+    valid_items = []
+    for item in batch:
+        try:
+            test_tokens = tokenizer([item['SMILES']], return_tensors='pt', padding=False, truncation=True, max_length=1035)
+            valid_sequences.append(item['SMILES'])
+            valid_items.append(item)
+        except Exception as e:
+            print(f"Skipping sequence due to: {str(e)}")
+            continue
+    #sequences = [item['SMILES'] for item in batch]
+    #max_len = max([len(seq) for seq in sequences])
+    #labels = torch.tensor([item['labels'] for item in batch], dtype=torch.float32)
+    tokens = tokenizer(valid_sequences, return_tensors='pt', padding=True, truncation=True, max_length=1035)
+    token_array = tokenizer.get_token_split(tokens['input_ids'])
+    bond_mask = peptide_token_mask(valid_sequences, token_array)
+    #attention_masks = torch.ones(tokens.size()[:2], dtype=torch.bool)
+    return {
+        'input_ids': tokens['input_ids'],
+        'attention_mask': tokens['attention_mask'],
+        'bond_mask': bond_mask
+    }
+class CustomDataModule(pl.LightningDataModule):
+    def __init__(self, train_dataset, val_dataset, test_dataset, tokenizer, batch_size, collate_fn=collate_fn):
+        super().__init__()
+        self.train_dataset = train_dataset
+        self.val_dataset = val_dataset
+        #self.test_dataset = test_dataset
+        self.batch_size = batch_size
+        self.tokenizer = tokenizer
+        self.collate_fn = collate_fn
+    def train_dataloader(self):
+        return DataLoader(self.train_dataset,
+                          batch_size=self.batch_size,
+                          collate_fn=partial(self.collate_fn, tokenizer=self.tokenizer),
+                          num_workers=8,
+                          pin_memory=True
+                          )
+    def val_dataloader(self):
+        return DataLoader(self.val_dataset,
+                          batch_size=self.batch_size,
+                          collate_fn=partial(self.collate_fn, tokenizer=self.tokenizer),
+                          num_workers=8,
+                          pin_memory=True
+                          )
+    """def test_dataloader(self):
+        return DataLoader(self.test_dataset, batch_size=self.batch_size,
+                          collate_fn=partial(self.collate_fn, tokenizer=self.tokenizer),
+                          num_workers=8, pin_memory=True)"""

a2d2_pep/evaluate_peptide_table.py ADDED Viewed

	@@ -0,0 +1,326 @@

+"""
+Evaluate a finetuned peptide model checkpoint by sampling sequences
+and computing metrics for the De Novo Peptide Generation table:
+  Validity (%), Affinity (↑), Solubility (↑), Hemolysis (↑),
+  Nonfouling (↑), Permeability (↑), Sampling Time (↓)
+"""
+import os
+import sys
+import argparse
+import time
+import torch
+import numpy as np
+import pandas as pd
+# add repo root (A2D2/) to sys.path so top-level packages like lightning_modules resolve
+REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, REPO_ROOT)
+from lightning_modules.any_length_remask import AnyOrderInsertionFlowModuleFT
+from lightning_modules import AnyOrderInsertionFlowModule
+from inference_quality import sample_peptides_eval
+from pep_scoring.scoring_functions import ScoringFunctions
+from pep_utils.analyzer import PeptideAnalyzer
+from pep_scoring.tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
+from finetune_quality import PeptideFinetuner
+from pep_utils.utils import str2bool, set_seed
+from tdc import Evaluator
+# Target protein amino-acid sequences, keyed by the short names accepted by --prot_name.
+PROTEINS = {
+    'amhr': 'MLGSLGLWALLPTAVEAPPNRRTCVFFEAPGVRGSTKTLGELLDTGTELPRAIRCLYSRCCFGIWNLTQDRAQVEMQGCRDSDEPGCESLHCDPSPRAHPSPGSTLFTCSCGTDFCNANYSHLPPPGSPGTPGSQGPQAAPGESIWMALVLLGLFLLLLLLLGSIILALLQRKNYRVRGEPVPEPRPDSGRDWSVELQELPELCFSQVIREGGHAVVWAGQLQGKLVAIKAFPPRSVAQFQAERALYELPGLQHDHIVRFITASRGGPGRLLSGPLLVLELHPKGSLCHYLTQYTSDWGSSLRMALSLAQGLAFLHEERWQNGQYKPGIAHRDLSSQNVLIREDGSCAIGDLGLALVLPGLTQPPAWTPTQPQGPAAIMEAGTQRYMAPELLDKTLDLQDWGMALRRADIYSLALLLWEILSRCPDLRPDSSPPPFQLAYEAELGNTPTSDELWALAVQERRRPYIPSTWRCFATDPDGLRELLEDCWDADPEARLTAECVQQRLAALAHPQESHPFPESCPRGCPPLCPEDCTSIPAPTILPCRPQRSACHFSVQQGPCSRNPQPACTLSPV',
+    'tfr': 'MMDQARSAFSNLFGGEPLSYTRFSLARQVDGDNSHVEMKLAVDEEENADNNTKANVTKPKRCSGSICYGTIAVIVFFLIGFMIGYLGYCKGVEPKTECERLAGTESPVREEPGEDFPAARRLYWDDLKRKLSEKLDSTDFTGTIKLLNENSYVPREAGSQKDENLALYVENQFREFKLSKVWRDQHFVKIQVKDSAQNSVIIVDKNGRLVYLVENPGGYVAYSKAATVTGKLVHANFGTKKDFEDLYTPVNGSIVIVRAGKITFAEKVANAESLNAIGVLIYMDQTKFPIVNAELSFFGHAHLGTGDPYTPGFPSFNHTQFPPSRSSGLPNIPVQTISRAAAEKLFGNMEGDCPSDWKTDSTCRMVTSESKNVKLTVSNVLKEIKILNIFGVIKGFVEPDHYVVVGAQRDAWGPGAAKSGVGTALLLKLAQMFSDMVLKDGFQPSRSIIFASWSAGDFGSVGATEWLEGYLSSLHLKAFTYINLDKAVLGTSNFKVSASPLLYTLIEKTMQNVKHPVTGQFLYQDSNWASKVEKLTLDNAAFPFLAYSGIPAVSFCFCEDTDYPYLGTTMDTYKELIERIPELNKVARAAAEVAGQFVIKLTHDVELNLDYERYNSQLLSFVRDLNQYRADIKEMGLSLQWLYSARGDFFRATSRLTTDFGNAEKTDRFVMKKLNDRVMRVEYHFLSPYVSPKESPFRHVFWGSGSHTLPALLENLKLRKQNNGAFNETLFRNQLALATWTIQGAANALSGDVWDIDNEF',
+    'gfap': 'MERRRITSAARRSYVSSGEMMVGGLAPGRRLGPGTRLSLARMPPPLPTRVDFSLAGALNAGFKETRASERAEMMELNDRFASYIEKVRFLEQQNKALAAELNQLRAKEPTKLADVYQAELRELRLRLDQLTANSARLEVERDNLAQDLATVRQKLQDETNLRLEAENNLAAYRQEADEATLARLDLERKIESLEEEIRFLRKIHEEEVRELQEQLARQQVHVELDVAKPDLTAALKEIRTQYEAMASSNMHEAEEWYRSKFADLTDAAARNAELLRQAKHEANDYRRQLQSLTCDLESLRGTNESLERQMREQEERHVREAASYQEALARLEEEGQSLKDEMARHLQEYQDLLNVKLALDIEIATYRKLLEGEENRITIPVQTFSNLQIRETSLDTKSVSEGHLKRNIVVKTVEMRDGEVIKESKQEHKDVM',
+    'glp1': 'MAGAPGPLRLALLLLGMVGRAGPRPQGATVSLWETVQKWREYRRQCQRSLTEDPPPATDLFCNRTFDEYACWPDGEPGSFVNVSCPWYLPWASSVPQGHVYRFCTAEGLWLQKDNSSLPWRDLSECEESKRGERSSPEEQLLFLYIIYTVGYALSFSALVIASAILLGFRHLHCTRNYIHLNLFASFILRALSVFIKDAALKWMYSTAAQQHQWDGLLSYQDSLSCRLVFLLMQYCVAANYYWLLVEGVYLYTLLAFSVLSEQWIFRLYVSIGWGVPLLFVVPWGIVKYLYEDEGCWTRNSNMNYWLIIRLPILFAIGVNFLIFVRVICIVVSKLKANLMCKTDIKCRLAKSTLTLIPLLGTHEVIFAFVMDEHARGTLRFIKLFTELSFTSFQGLMVAILYCFVNNEVQLEFRKSWERWRLEHLHIQRDSSMKPLKCPTSSLSSGATAGSSMYTATCQASCS',
+    'glast': 'MTKSNGEEPKMGGRMERFQQGVRKRTLLAKKKVQNITKEDVKSYLFRNAFVLLTVTAVIVGTILGFTLRPYRMSYREVKYFSFPGELLMRMLQMLVLPLIISSLVTGMAALDSKASGKMGMRAVVYYMTTTIIAVVIGIIIVIIIHPGKGTKENMHREGKIVRVTAADAFLDLIRNMFPPNLVEACFKQFKTNYEKRSFKVPIQANETLVGAVINNVSEAMETLTRITEELVPVPGSVNGVNALGLVVFSMCFGFVIGNMKEQGQALREFFDSLNEAIMRLVAVIMWYAPVGILFLIAGKIVEMEDMGVIGGQLAMYTVTVIVGLLIHAVIVLPLLYFLVTRKNPWVFIGGLLQALITALGTSSSSATLPITFKCLEENNGVDKRVTRFVLPVGATINMDGTALYEALAAIFIAQVNNFELNFGQIITISITATAASIGAAGIPQAGLVTMVIVLTSVGLPTDDITLIIAVDWFLDRLRTTTNVLGDSLGAGIVEHLSRHELKNRDVEMGNSVIEENEMKKPYQLIAQDNETEKPIDSETKM',
+    'ncam': 'LQTKDLIWTLFFLGTAVSLQVDIVPSQGEISVGESKFFLCQVAGDAKDKDISWFSPNGEKLTPNQQRISVVWNDDSSSTLTIYNANIDDAGIYKCVVTGEDGSESEATVNVKIFQKLMFKNAPTPQEFREGEDAVIVCDVVSSLPPTIIWKHKGRDVILKKDVRFIVLSNNYLQIRGIKKTDEGTYRCEGRILARGEINFKDIQVIVNVPPTIQARQNIVNATANLGQSVTLVCDAEGFPEPTMSWTKDGEQIEQEEDDEKYIFSDDSSQLTIKKVDKNDEAEYICIAENKAGEQDATIHLKVFAKPKITYVENQTAMELEEQVTLTCEASGDPIPSITWRTSTRNISSEEKASWTRPEKQETLDGHMVVRSHARVSSLTLKSIQYTDAGEYICTASNTIGQDSQSMYLEVQYAPKLQGPVAVYTWEGNQVNITCEVFAYPSATISWFRDGQLLPSSNYSNIKIYNTPSASYLEVTPDSENDFGNYNCTAVNRIGQESLEFILVQADTPSSPSIDQVEPYSSTAQVQFDEPEATGGVPILKYKAEWRAVGEEVWHSKWYDAKEASMEGIVTIVGLKPETTYAVRLAALNGKGLGEISAASEF',
+    'cereblon': 'MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNIINFDTSLPTSHTYLGADMEEFHGRTLHDDDSCQVIPVLPQVMMILIPGQTLPLQLFHPQEVSMVRNLIQKDRTFAVLAYSNVQEREAQFGTTAEIYAYREEQDFGIEIVKVKAIGRQRFKVLELRTQSDGIQQAKVQILPECVLPSTMSAVQLESLNKCQIFPSKPVSREDQCSYKWWQKYQKRKFHCANLTSWPRWLYSLYDAETLMDRIKKQLREWDENLKDDSLPSNPIDFSYRVAACLPIDDVLRIQLLKIGSAIQRLRCELDIMNKCTSLCCKQCQETEITTKNEIFSLSLCGPMAAYVNPHGYVHETLTVYKACNLNLIGRPSTEHSWFPGYAWTVAQCKICASHIGWKFTATKKDMSPQKFWGLTRSALLPTIPDTEDEISPDKVILCL',
+    'ligase': 'MASQPPEDTAESQASDELECKICYNRYNLKQRKPKVLECCHRVCAKCLYKIIDFGDSPQGVIVCPFCRFETCLPDDEVSSLPDDNNILVNLTCGGKGKKCLPENPTELLLTPKRLASLVSPSHTSSNCLVITIMEVQRESSPSLSSTPVVEFYRPASFDSVTTVSHNWTVWNCTSLLFQTSIRVLVWLLGLLYFSSLPLGIYLLVSKKVTLGVVFVSLVPSSLVILMVYGFCQCVCHEFLDCMAPPS',
+    'skp2': 'MHRKHLQEIPDLSSNVATSFTWGWDSSKTSELLSGMGVSALEKEEPDSENIPQELLSNLGHPESPPRKRLKSKGSDKDFVIVRRPKLNRENFPGVSWDSLPDELLLGIFSCLCLPELLKVSGVCKRWYRLASDESLWQTLDLTGKNLHPDVTGRLLSQGVIAFRCPRSFMDQPLAEHFSPFRVQHMDLSNSVIEVSTLHGILSQCSKLQNLSLEGLRLSDPIVNTLAKNSNLVRLNLSGCSGFSEFALQTLLSSCSRLDELNLSWCFDFTEKHVQVAVAHVSETITQLNLSGYRKNLQKSDLSTLVRRCPNLVHLDLSDSVMLKNDCFQEFFQLNYLQHLSLSRCYDIIPETLLELGEIPTLKTLQVFGIVPDGTLQLLKEALPHLQINCSHFTTIARPTIGNKKNQEIWGIKCRLTLQKPSCL',
+}
+def load_finetuned_model(checkpoint_path, pretrained_ckpt_path, device='cuda'):
+    """Load a finetuned PeptideFinetuner from a Lightning checkpoint."""
+    ckpt = torch.load(checkpoint_path, map_location='cpu', weights_only=False)
+    hparams = ckpt.get('hyper_parameters', {})
+    args = hparams.get('args', None)
+    # Load pretrained base checkpoint to get config
+    base_ckpt = torch.load(pretrained_ckpt_path, map_location='cpu', weights_only=False)
+    if 'hyper_parameters' in base_ckpt:
+        config = base_ckpt['hyper_parameters']['config']
+    elif 'config' in base_ckpt:
+        config = base_ckpt['config']
+    else:
+        raise ValueError("Cannot find config in base checkpoint")
+    from omegaconf import OmegaConf, DictConfig
+    if not OmegaConf.is_config(config):
+        config = DictConfig(config)
+    OmegaConf.set_struct(config, False)
+    config.training.use_adaptive_schedule = getattr(args, 'use_adaptive_schedule', True)
+    config.training.schedule_hidden_dim = getattr(args, 'schedule_hidden_dim', 256)
+    config.training.schedule_num_layers = getattr(args, 'schedule_num_layers', 2)
+    config.training.schedule_loss_weight = getattr(args, 'schedule_loss_weight', 0.1)
+    config.training.freeze_base_model = getattr(args, 'freeze_base_model', False)
+    config.training.schedule_warmup_epochs = getattr(args, 'schedule_warmup_epochs', 0)
+    OmegaConf.set_struct(config, True)
+    disable_planner = getattr(args, 'disable_planner', False)
+    policy_model = AnyOrderInsertionFlowModuleFT(
+        config=config,
+        args=args,
+        pretrained_checkpoint=pretrained_ckpt_path,
+        insertion_planner=not disable_planner,
+    )
+    # Load finetuned weights
+    state_dict = ckpt['state_dict']
+    policy_state = {}
+    for k, v in state_dict.items():
+        if k.startswith('policy_model.'):
+            policy_state[k[len('policy_model.'):]] = v
+    policy_model.load_state_dict(policy_state, strict=False)
+    policy_model = policy_model.to(device)
+    policy_model.eval()
+    return policy_model, args, config
+@torch.no_grad()
+def evaluate_checkpoint(policy_model, tokenizer, reward_model, analyzer,
+                        num_samples=1000, batch_size=50, max_length=512,
+                        total_num_steps=256, quality_mode="both", num_remasking=3,
+                        quality_threshold=0.5, unmask_quality_threshold=None, device='cuda'):
+    """
+    Sample `num_samples` peptides and compute all table metrics.
+    Sampling runs in ceil(num_samples / batch_size) calls to `sample_peptides_eval`;
+    the last call may use a smaller batch. Gradients are disabled for the whole call.
+    Args:
+        policy_model: Model to sample from; its `interpolant.mask_token` and
+            `interpolant.pad_token` are forwarded to the sampler.
+        tokenizer, reward_model, analyzer: Forwarded unchanged to `sample_peptides_eval`.
+        num_samples: Total number of peptides to request.
+        batch_size: Maximum number of peptides requested per sampler call.
+        max_length, total_num_steps, quality_mode, num_remasking, quality_threshold,
+        unmask_quality_threshold: Forwarded to `sample_peptides_eval`.
+        device: Not referenced in the body of this function.
+    Returns:
+        dict keyed by 'Validity (%)', 'Uniqueness (%)', 'Diversity', the mean and
+        '<name> Std' of Affinity / Solubility / Hemolysis / Nonfouling / Permeability,
+        'Sampling Time (s)', 'Num Generated', 'Num Valid' and 'Num Unique'.
+    """
+    all_affinity = []
+    all_sol = []
+    all_hemo = []
+    all_nf = []
+    all_permeability = []
+    all_valid_seqs = []
+    total_valid = 0
+    total_generated = 0
+    total_time = 0.0
+    # Ceiling division: number of sampler calls needed to reach num_samples.
+    num_batches = (num_samples + batch_size - 1) // batch_size
+    remaining = num_samples
+    for b in range(num_batches):
+        # The final batch is shrunk so exactly num_samples are requested overall.
+        bs = min(batch_size, remaining)
+        remaining -= bs
+        # Only the sampler call itself is timed.
+        # NOTE(review): reward_model is passed in, so this may include scoring time — confirm.
+        t_start = time.time()
+        result = sample_peptides_eval(
+            model=policy_model,
+            reward_model=reward_model,
+            analyzer=analyzer,
+            tokenizer=tokenizer,
+            steps=total_num_steps,
+            mask=policy_model.interpolant.mask_token,
+            pad=policy_model.interpolant.pad_token,
+            batch_size=bs,
+            max_length=max_length,
+            quality_mode=quality_mode,
+            num_remasking=num_remasking,
+            quality_threshold=quality_threshold,
+            unmask_quality_threshold=unmask_quality_threshold,
+            return_valid=True,
+        )
+        t_end = time.time()
+        # Unpack: validSequences, affinity, sol, hemo, nf, permeability, valid_fraction
+        valid_seqs, affinity, sol, hemo, nf, permeability, valid_fraction = result
+        batch_valid = len(valid_seqs)
+        total_valid += batch_valid
+        total_generated += bs
+        total_time += (t_end - t_start)
+        all_valid_seqs.extend(valid_seqs)
+        # Scores are collected only when affinity is a non-empty list/ndarray;
+        # any other type (or an empty result) leaves this batch out of the score lists.
+        if isinstance(affinity, (list, np.ndarray)) and len(affinity) > 0:
+            all_affinity.extend(affinity if isinstance(affinity, list) else affinity.tolist())
+            all_sol.extend(sol if isinstance(sol, list) else sol.tolist())
+            all_hemo.extend(hemo if isinstance(hemo, list) else hemo.tolist())
+            all_nf.extend(nf if isinstance(nf, list) else nf.tolist())
+            all_permeability.extend(permeability if isinstance(permeability, list) else permeability.tolist())
+        print(f"  Batch {b+1}/{num_batches}: {batch_valid}/{bs} valid, "
+              f"time={t_end - t_start:.1f}s")
+    # Validity is measured against the number requested, not the number returned.
+    validity = total_valid / total_generated * 100.0 if total_generated > 0 else 0.0
+    # Uniqueness (% of valid sequences that are unique) and
+    # Diversity (1 - mean pairwise Tanimoto on Morgan FPs of unique sequences).
+    # Matches the convention used in evaluate_mol_table.py.
+    all_unique = list(set(all_valid_seqs))
+    num_unique = len(all_unique)
+    uniqueness = num_unique / total_valid * 100.0 if total_valid > 0 else 0.0
+    # Diversity is only evaluated with at least two unique sequences; otherwise 0.0.
+    if num_unique > 1:
+        diversity = Evaluator('diversity')(all_unique)
+    else:
+        diversity = 0.0
+    # Means and standard deviations fall back to 0.0 when no scores were collected.
+    metrics = {
+        'Validity (%)': validity,
+        'Uniqueness (%)': uniqueness,
+        'Diversity': diversity,
+        'Affinity': np.mean(all_affinity) if all_affinity else 0.0,
+        'Affinity Std': np.std(all_affinity) if all_affinity else 0.0,
+        'Solubility': np.mean(all_sol) if all_sol else 0.0,
+        'Solubility Std': np.std(all_sol) if all_sol else 0.0,
+        'Hemolysis': np.mean(all_hemo) if all_hemo else 0.0,
+        'Hemolysis Std': np.std(all_hemo) if all_hemo else 0.0,
+        'Nonfouling': np.mean(all_nf) if all_nf else 0.0,
+        'Nonfouling Std': np.std(all_nf) if all_nf else 0.0,
+        'Permeability': np.mean(all_permeability) if all_permeability else 0.0,
+        'Permeability Std': np.std(all_permeability) if all_permeability else 0.0,
+        'Sampling Time (s)': total_time,
+        'Num Generated': total_generated,
+        'Num Valid': total_valid,
+        'Num Unique': num_unique,
+    }
+    return metrics
+def main():
+    """
+    Command-line entry point: load a finetuned checkpoint, sample peptides,
+    print the metrics table and write it to a one-row CSV.
+    The CSV is written to --output_dir (default: the checkpoint's directory) as
+    `eval_metrics_<tag>_<prot_name>.csv`, where <tag> reflects the planner flags.
+    """
+    parser = argparse.ArgumentParser(description="Evaluate a finetuned peptide checkpoint")
+    parser.add_argument('--checkpoint_path', type=str, required=True,
+                        help='Path to the finetuned Lightning checkpoint (e.g., last.ckpt)')
+    parser.add_argument('--pretrained_ckpt', type=str,
+                        default=os.path.join(REPO_ROOT, 'pretrained', 'anylength_pep.ckpt'),
+                        help='Path to the pretrained base model checkpoint')
+    parser.add_argument('--num_samples', type=int, default=500,
+                        help='Number of peptides to sample')
+    parser.add_argument('--batch_size', type=int, default=50,
+                        help='Batch size for sampling')
+    parser.add_argument('--max_length', type=int, default=512)
+    parser.add_argument('--total_num_steps', type=int, default=256)
+    parser.add_argument('--num_remasking', type=int, default=3)
+    parser.add_argument('--quality_threshold', type=float, default=0.5,
+                        help='Threshold for insertion quality filtering during sampling')
+    parser.add_argument('--unmask_quality_threshold', type=float, default=None,
+                        help='If set, gate unmasking/remasking by confidence: remask '
+                             'ALL clean tokens whose unmasking confidence is below this '
+                             'threshold, regardless of the schedule budget. If unset '
+                             '(default), remasking is purely schedule-driven (count-based).')
+    parser.add_argument('--prot_name', type=str, default='glast',
+                        help='Target protein name (must be one of: ' + ', '.join(PROTEINS.keys()) + ')')
+    parser.add_argument('--prot_seq', type=str, default=None,
+                        help='Custom protein sequence (overrides --prot_name)')
+    parser.add_argument('--disable_planner', action='store_true',
+                        help='If set, disable remasking during evaluation')
+    parser.add_argument('--disable_insertion_planner', action='store_true',
+                        help='If set, disable insertion quality filtering during evaluation')
+    parser.add_argument('--disable_unmasking_planner', action='store_true',
+                        help='If set, disable unmasking confidence planner during evaluation')
+    parser.add_argument('--output_dir', type=str, default=None,
+                        help='Directory to save results CSV. Defaults to checkpoint directory.')
+    parser.add_argument('--device', type=str, default='cuda:0')
+    parser.add_argument('--seed', type=int, default=42)
+    args = parser.parse_args()
+    # NOTE(review): use_cuda=True is passed even when the CPU fallback below is taken — confirm set_seed tolerates that.
+    set_seed(args.seed, use_cuda=True)
+    # Fall back to CPU when CUDA is unavailable, whatever --device says.
+    device = torch.device(args.device if torch.cuda.is_available() else 'cpu')
+    # Map flags to quality_mode
+    if args.disable_planner:
+        quality_mode = "none"
+    elif args.disable_insertion_planner and args.disable_unmasking_planner:
+        quality_mode = "none"
+    elif args.disable_insertion_planner:
+        quality_mode = "unmasking_only"
+    elif args.disable_unmasking_planner:
+        quality_mode = "insertion_only"
+    else:
+        quality_mode = "both"
+    print(f"Loading checkpoint: {args.checkpoint_path}")
+    print(f"Pretrained base: {args.pretrained_ckpt}")
+    print(f"Quality mode: {quality_mode}")
+    # train_args and config are returned by the loader but not used further in this function.
+    policy_model, train_args, config = load_finetuned_model(
+        args.checkpoint_path, args.pretrained_ckpt, device=device
+    )
+    # Setup tokenizer, reward model, analyzer
+    tokenizer = SMILES_SPE_Tokenizer(
+        os.path.join(REPO_ROOT, 'a2d2_pep', 'pep_scoring', 'tokenizer', 'new_vocab.txt'),
+        os.path.join(REPO_ROOT, 'a2d2_pep', 'pep_scoring', 'tokenizer', 'new_splits.txt')
+    )
+    # NOTE(review): with --prot_seq, prot_name still comes from --prot_name (default 'glast')
+    # and is used to label the printed target and the output file.
+    if args.prot_seq is not None:
+        prot = args.prot_seq
+        prot_name = args.prot_name
+    else:
+        prot_name = args.prot_name
+        if prot_name not in PROTEINS:
+            raise ValueError(f"Unknown protein: {prot_name}. Choose from: {list(PROTEINS.keys())}")
+        prot = PROTEINS[prot_name]
+    score_func_names = ['binding_affinity1', 'solubility', 'hemolysis', 'nonfouling', 'permeability']
+    reward_model = ScoringFunctions(score_func_names, prot_seqs=[prot], device=device)
+    analyzer = PeptideAnalyzer()
+    print(f"\nSampling {args.num_samples} peptides (quality_mode={quality_mode}, target={prot_name})...")
+    metrics = evaluate_checkpoint(
+        policy_model=policy_model,
+        tokenizer=tokenizer,
+        reward_model=reward_model,
+        analyzer=analyzer,
+        num_samples=args.num_samples,
+        batch_size=args.batch_size,
+        max_length=args.max_length,
+        total_num_steps=args.total_num_steps,
+        quality_mode=quality_mode,
+        num_remasking=args.num_remasking,
+        quality_threshold=args.quality_threshold,
+        unmask_quality_threshold=args.unmask_quality_threshold,
+        device=device,
+    )
+    # Print summary table
+    print("\n" + "=" * 60)
+    print("  De Novo Peptide Generation Results")
+    print("=" * 60)
+    for k, v in metrics.items():
+        if isinstance(v, float):
+            print(f"  {k:<30s}: {v:.4f}")
+        else:
+            print(f"  {k:<30s}: {v}")
+    print("=" * 60)
+    # Save results
+    output_dir = args.output_dir or os.path.dirname(args.checkpoint_path)
+    os.makedirs(output_dir, exist_ok=True)
+    # NOTE(review): when both --disable_insertion_planner and --disable_unmasking_planner are set,
+    # quality_mode above is "none" but the tag here becomes "no_insertion_planner".
+    if args.disable_planner:
+        tag = "no_planner"
+    elif args.disable_insertion_planner:
+        tag = "no_insertion_planner"
+    elif args.disable_unmasking_planner:
+        tag = "no_unmasking_planner"
+    else:
+        tag = "with_planner"
+    if args.unmask_quality_threshold is not None:
+        tag += f"_ut{args.unmask_quality_threshold:g}"
+    # Record the sweep parameter in the saved row for traceability.
+    metrics['unmask_quality_threshold'] = args.unmask_quality_threshold
+    metrics['quality_threshold'] = args.quality_threshold
+    metrics_path = os.path.join(output_dir, f'eval_metrics_{tag}_{prot_name}.csv')
+    pd.DataFrame([metrics]).to_csv(metrics_path, index=False)
+    print(f"Metrics saved to: {metrics_path}")
+# Run the CLI only when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()

a2d2_pep/finetune_quality.py ADDED Viewed

	@@ -0,0 +1,892 @@

+# Distributed Data Parallel (DDP) finetuning for peptide generation using PyTorch Lightning
+import argparse
+import math
+from datetime import datetime
+import numpy as np
+import torch
+import pytorch_lightning as pl
+from pytorch_lightning.strategies import DDPStrategy
+from pytorch_lightning.callbacks import ModelCheckpoint
+from pytorch_lightning.loggers import WandbLogger
+import wandb
+import os
+import sys
+from tqdm import tqdm
+import pandas as pd
+# add repo root (A2D2/) to sys.path so top-level packages like lightning_modules resolve
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from inference_quality import sample_peptides_buffer, sample_peptides_eval
+from pep_utils.analyzer import PeptideAnalyzer
+from pep_utils.utils import str2bool, set_seed
+from pep_scoring.scoring_functions import ScoringFunctions
+from pep_scoring.tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
+from lightning_modules.any_length_remask import AnyOrderInsertionFlowModuleFT
+from lightning_modules import AnyOrderInsertionFlowModule
+from tdc import Evaluator
+# Repository root (two levels up from this file: A2D2/a2d2_pep/finetune_quality.py)
+REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+class PeptideFinetuner(pl.LightningModule):
+    """Lightning module for distributed peptide finetuning."""
+    def __init__(
+        self,
+        args,
+        policy_model,
+        reward_model,
+        tokenizer,
+        pretrained=None,
+        mcts=None,
+        filename=None,
+        prot_name=None,
+        eps=1e-5
+    ):
+        """
+        Store the collaborating objects and initialise buffers, logs and training state.
+        Args:
+            args: Options object; `min_peptide_bonds` (default 4) and
+                `alternation_frequency` (default 1) are read here via getattr.
+            policy_model: Model being finetuned (stored as-is).
+            reward_model: Scoring object (stored as-is).
+            tokenizer: Tokenizer (stored as-is).
+            pretrained: Stored as-is; presumably a reference model — TODO confirm.
+            mcts: Stored as-is; not otherwise used in this method.
+            filename: Stored as-is.
+            prot_name: Stored as-is.
+            eps: Stored as-is; presumably a numerical-stability constant — TODO confirm.
+        """
+        super().__init__()
+        self.args = args
+        self.policy_model = policy_model
+        self.reward_model = reward_model
+        self.tokenizer = tokenizer
+        self.pretrained = pretrained
+        self.mcts = mcts
+        self.filename = filename
+        self.prot_name = prot_name
+        self.eps = eps
+        # Length cutoff is tunable from the CLI: --min_peptide_bonds N enforces
+        # >=N peptide bonds (filters degenerate short reward-hacked molecules);
+        # --min_peptide_bonds 0 disables the cutoff.
+        min_bonds = getattr(args, 'min_peptide_bonds', 4)
+        # Negative values are clamped to 0 and also disable enforcement.
+        self.analyzer = PeptideAnalyzer(
+            min_peptide_bonds=max(0, min_bonds),
+            enforce_min_peptide_bonds=min_bonds > 0,
+        )
+        # Save hyperparameters
+        # The listed constructor arguments are excluded from the saved hyperparameters.
+        self.save_hyperparameters(ignore=['policy_model', 'reward_model', 'tokenizer', 'pretrained', 'mcts'])
+        # Buffer for sequences
+        self.x_saved = None
+        self.log_rnd_saved = None
+        self.final_rewards_saved = None
+        # Logs
+        self.valid_fraction_log = []
+        self.uniqueness_log = []
+        self.diversity_log = []
+        self.affinity_log = []
+        self.sol_log = []
+        self.hemo_log = []
+        self.nf_log = []
+        self.permeability_log = []
+        # Built once here and kept on the instance.
+        self._diversity_evaluator = Evaluator('diversity')
+        # Alternating training between policy and planner
+        self.train_policy = True  # Start by training policy
+        self.alternation_frequency = getattr(args, 'alternation_frequency', 1)  # Alternate every N epochs
+    def freeze_policy_model(self):
+        """Freeze policy model parameters (but not planner)."""
+        for name, param in self.policy_model.named_parameters():
+            if not name.startswith('planner.'):
+                param.requires_grad = False
+    def unfreeze_policy_model(self):
+        """Unfreeze policy model parameters (but not planner)."""
+        for name, param in self.policy_model.named_parameters():
+            if not name.startswith('planner.'):
+                param.requires_grad = True
+    def freeze_planner_model(self):
+        """Freeze planner parameters."""
+        if hasattr(self.policy_model, 'planner'):
+            for param in self.policy_model.planner.parameters():
+                param.requires_grad = False
+    def unfreeze_planner_model(self):
+        """Unfreeze planner parameters."""
+        if hasattr(self.policy_model, 'planner'):
+            for param in self.policy_model.planner.parameters():
+                param.requires_grad = True
+    def configure_optimizers(self):
+        # Separate parameter groups for policy backbone vs planner heads
+        planner_lr = getattr(self.args, 'planner_learning_rate', self.args.learning_rate)
+        planner_params = []
+        policy_params = []
+        for name, param in self.policy_model.named_parameters():
+            if name.startswith('planner.'):
+                planner_params.append(param)
+            else:
+                policy_params.append(param)
+        param_groups = [
+            {'params': policy_params, 'lr': self.args.learning_rate},
+            {'params': planner_params, 'lr': planner_lr},
+        ]
+        optimizer = torch.optim.AdamW(param_groups)
+        return optimizer
+    def _get_quality_mode(self):
+        """Map ablation flags + warmup state to quality_mode string."""
+        if self.args.disable_planner:
+            return "none"
+        if self.current_epoch < self.args.schedule_warmup_epochs:
+            return "none"
+        di = getattr(self.args, 'disable_insertion_planner', False)
+        du = getattr(self.args, 'disable_unmasking_planner', False)
+        if di and du:
+            return "none"
+        if di:
+            return "unmasking_only"
+        if du:
+            return "insertion_only"
+        return "both"
+    def on_save_checkpoint(self, checkpoint):
+        """
+        Save additional metadata to make loading easier.
+        Saves the config directly in the checkpoint so loading doesn't need to follow references.
+        """
+        # Save the config from the policy model directly in the checkpoint
+        if hasattr(self.policy_model, 'config'):
+            checkpoint['config'] = self.policy_model.config
+            print(f"Saved config to checkpoint for easier loading")
+        # Save EMA params if they exist in the policy model
+        if hasattr(self.policy_model, 'ema_params') and self.policy_model.ema_params:
+            checkpoint['ema_params'] = self.policy_model.ema_params
+            print(f"Saved EMA params to checkpoint")
+        # Save planner state if it exists
+        if hasattr(self.policy_model, 'planner'):
+            checkpoint['planner_state'] = self.policy_model.planner.state_dict()
+            print(f"Saved planner state to checkpoint")
+    def on_train_epoch_start(self):
+        """Called at the start of each training epoch."""
+        # If disable_planner mode, only train policy (no alternation)
+        if self.args.disable_planner:
+            self.train_policy = True
+            self.unfreeze_policy_model()
+            self.freeze_planner_model()
+            if self.global_rank == 0 and self.current_epoch == 0:
+                print(f"[FINETUNE_QUALITY] Training ONLY policy model (planner frozen, no remasking)")
+        elif getattr(self.args, 'joint_training', False):
+            # Joint mode: train policy + planner together every step (no alternation)
+            self.train_policy = True  # marker; training_step adds planner loss when joint_training is set
+            self.unfreeze_policy_model()
+            self.unfreeze_planner_model()
+            if self.global_rank == 0 and self.current_epoch == 0:
+                print(f"[FINETUNE_QUALITY] JOINT TRAINING: policy + planner trained together (no alternation)")
+        else:
+            # Alternate between training policy and planner from epoch 0
+            # Determine which model to train this epoch
+            cycle_position = (self.current_epoch // self.alternation_frequency) % 2
+            self.train_policy = (cycle_position == 0)
+            if self.train_policy:
+                # Train policy, freeze planner
+                self.unfreeze_policy_model()
+                self.freeze_planner_model()
+                if self.global_rank == 0:
+                    print(f"[ALTERNATION] Epoch {self.current_epoch}: Training POLICY model (planner frozen)")
+            else:
+                # Train planner, freeze policy
+                self.freeze_policy_model()
+                self.unfreeze_planner_model()
+                if self.global_rank == 0:
+                    print(f"[ALTERNATION] Epoch {self.current_epoch}: Training PLANNER model (policy frozen)")
+        # Resample buffer if needed
+        if self.x_saved is None or self.current_epoch % self.args.resample_every_n_step == 0:
+            self._generate_buffer()
+            # Synchronize all ranks after buffer generation to prevent NCCL timeout
+            if self.trainer and self.trainer.world_size > 1:
+                torch.distributed.barrier()
+    def _generate_buffer(self):
+        """Generate buffer of sequences for training - all ranks generate in parallel.
+        When pool_size > 0, maintains a persistent pool and refreshes a fraction
+        each time instead of regenerating the entire buffer from scratch. This
+        preserves diversity/uniqueness across training by avoiding wholesale
+        replacement with samples from an increasingly mode-collapsed policy.
+        Sampling is repeated until this rank holds ``samples_per_gpu`` valid
+        sequences, ``args.max_buffer_attempts`` (default 100) rounds have run, or
+        ``args.buffer_starvation_patience`` (default 10) rounds have produced
+        nothing. The result is stored in ``self.x_saved``, ``self.log_rnd_saved``
+        and ``self.final_rewards_saved``.
+        Raises:
+            RuntimeError: if no valid sequence was produced at all.
+        """
+        world_size = self.trainer.world_size if self.trainer else 1
+        rank = self.global_rank if self.trainer else 0
+        pool_size = getattr(self.args, 'pool_size', 0)
+        is_pool = pool_size > 0
+        is_init = self.x_saved is None
+        # Determine how many sequences to sample this call
+        if is_pool:
+            refresh_frac = getattr(self.args, 'pool_refresh_fraction', 0.2)
+            if is_init:
+                samples_per_gpu = pool_size
+            else:
+                samples_per_gpu = max(1, int(pool_size * refresh_frac))
+            if rank == 0:
+                if is_init:
+                    print(f"\n[POOL] Initializing pool with {pool_size} sequences at epoch {self.current_epoch}")
+                else:
+                    print(f"\n[POOL] Refreshing {samples_per_gpu}/{pool_size} sequences ({refresh_frac*100:.0f}%) at epoch {self.current_epoch}")
+        else:
+            # Classic mode: split buffer_size evenly across ranks; rank 0 also
+            # takes the remainder.
+            samples_per_gpu = self.args.buffer_size // world_size
+            if rank == 0:
+                samples_per_gpu += self.args.buffer_size % world_size
+        accumulated_x = []
+        accumulated_log_rnd = []
+        accumulated_rewards = []
+        total_accumulated = 0
+        if rank == 0:
+            print(f"\n[BUFFER] Starting buffer generation at epoch {self.current_epoch}")
+            print(f"[BUFFER] GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
+            print(f"[BUFFER] GPU memory reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
+            if not is_pool:
+                print(f"[BUFFER] Each of {world_size} ranks will generate {samples_per_gpu} samples")
+        max_attempts = getattr(self.args, 'max_buffer_attempts', 100)  # cap wasted GPU / infinite loop
+        starvation_patience = getattr(self.args, 'buffer_starvation_patience', 10)
+        attempts = 0
+        import time
+        while total_accumulated < samples_per_gpu and attempts < max_attempts:
+            attempts += 1
+            if rank == 0:
+                print(f"[BUFFER] rank={rank} starting sampling attempt {attempts} at {time.strftime('%H:%M:%S')}")
+            start_time = time.time()
+            # new elbo loss
+            if self.args.elbo_rnd:
+                # ELBO path: sample without log_rnd (compute_rnd=False), then
+                # estimate it below from the policy and pretrained losses on the
+                # same noised samples.
+                x_final, _, final_rewards, trace = \
+                    sample_peptides_buffer(
+                        self.policy_model,
+                        self.reward_model, self.analyzer,
+                        self.tokenizer,
+                        steps=self.args.total_num_steps,
+                        mask=self.policy_model.interpolant.mask_token,
+                        pad=self.policy_model.interpolant.pad_token,
+                        batch_size=self.args.batch_size,
+                        max_length=self.args.max_length,
+                        # Buffer generation never uses the quality heads (planner):
+                        # the backbone must train on raw policy samples so that a
+                        # poorly-trained planner can't corrupt the backbone's data.
+                        quality_mode="none",
+                        compute_rnd=False,
+                        alpha=self.args.alpha,
+                        num_remasking=self.args.num_remasking,
+                        min_length=self.args.min_length,
+                    )
+                if x_final.shape[0] > 0:
+                    with torch.no_grad():
+                        noised = self.policy_model.prepare_noised_sample(
+                            x_final, num_samples=self.args.elbo_rnd_num_samples)
+                        policy_loss = self.policy_model.compute_loss_from_noised(noised)
+                        pretrained_loss = self.pretrained.compute_loss_from_noised(noised)
+                        log_rnd = (pretrained_loss - policy_loss) + (final_rewards / self.args.alpha)
+                else:
+                    # Empty batch: keep log_rnd an empty tensor on the same device.
+                    log_rnd = torch.empty((0,), dtype=torch.float32, device=x_final.device)
+            else:
+                # Rollout path: the sampler returns log_rnd itself (compute_rnd=True)
+                # and is given the pretrained model.
+                x_final, log_rnd, final_rewards, trace = \
+                    sample_peptides_buffer(
+                        self.policy_model,
+                        self.reward_model, self.analyzer,
+                        self.tokenizer,
+                        steps=self.args.total_num_steps,
+                        mask=self.policy_model.interpolant.mask_token,
+                        pad=self.policy_model.interpolant.pad_token,
+                        batch_size=self.args.batch_size,
+                        max_length=self.args.max_length,
+                        # Buffer generation never uses the quality heads (planner):
+                        # the backbone must train on raw policy samples so that a
+                        # poorly-trained planner can't corrupt the backbone's data.
+                        quality_mode="none",
+                        compute_rnd=True,
+                        pretrained=self.pretrained,
+                        alpha=self.args.alpha,
+                        num_remasking=self.args.num_remasking,
+                        min_length=self.args.min_length,
+                    )
+            elapsed = time.time() - start_time
+            if rank == 0:
+                print(f"[BUFFER] rank={rank} sampling took {elapsed:.1f}s")
+            n_valid = x_final.shape[0]
+            if n_valid > 0:
+                accumulated_x.append(x_final)
+                accumulated_log_rnd.append(log_rnd)
+                accumulated_rewards.append(final_rewards)
+                total_accumulated += n_valid
+            if rank == 0:
+                print(f"[BUFFER] rank={rank} epoch={self.current_epoch} quality_mode=none (heads disabled for buffer gen) accumulated={total_accumulated} / {samples_per_gpu} (batch yielded {n_valid} valid) attempt={attempts}")
+            # Starvation guard: if nothing valid comes through (e.g. the length
+            # cutoff is too aggressive for a collapsed policy), stop grinding GPU
+            # hours and fail fast with an actionable message.
+            if attempts >= starvation_patience and total_accumulated == 0:
+                if rank == 0:
+                    print(f"[BUFFER STARVATION] 0 valid samples after {attempts} attempts "
+                          f"(min_peptide_bonds={getattr(self.args, 'min_peptide_bonds', 4)}). "
+                          f"Aborting refill early — lower --min_peptide_bonds or check the policy.")
+                break
+        if total_accumulated == 0:
+            raise RuntimeError(f"[BUFFER ERROR] Rank {rank}: No valid sequences generated after {attempts} attempts. Check sampling function and reward model.")
+        if total_accumulated < samples_per_gpu:
+            print(f"[BUFFER WARNING] Rank {rank}: Only generated {total_accumulated}/{samples_per_gpu} sequences after {attempts} attempts")
+        # Concatenate the batches and drop any surplus beyond the per-rank quota.
+        # NOTE(review): torch.cat along dim 0 needs every batch to share the same
+        # trailing dimensions - presumably sample_peptides_buffer pads to a fixed
+        # width; confirm.
+        new_x = torch.cat(accumulated_x, dim=0)[:samples_per_gpu]
+        new_log_rnd = torch.cat(accumulated_log_rnd, dim=0)[:samples_per_gpu]
+        new_rewards = torch.cat(accumulated_rewards, dim=0)[:samples_per_gpu]
+        del accumulated_x, accumulated_log_rnd, accumulated_rewards
+        torch.cuda.empty_cache()
+        # Pool mode (after init): replace a random subset of the existing pool.
+        # Classic mode / pool init: overwrite the buffer.
+        if is_pool and not is_init:
+            actual_new = min(new_x.shape[0], self.x_saved.shape[0])
+            indices = torch.randperm(self.x_saved.shape[0], device=self.x_saved.device)[:actual_new]
+            self.x_saved[indices] = new_x[:actual_new]
+            self.log_rnd_saved[indices] = new_log_rnd[:actual_new]
+            self.final_rewards_saved[indices] = new_rewards[:actual_new]
+            if rank == 0:
+                print(f"[POOL] Replaced {actual_new}/{self.x_saved.shape[0]} sequences, reward mean={self.final_rewards_saved.mean():.4f}")
+        else:
+            self.x_saved = new_x
+            self.log_rnd_saved = new_log_rnd
+            self.final_rewards_saved = new_rewards
+        # Sanity check: median length (non-pad tokens) of buffered peptides.
+        if rank == 0:
+            pad = self.policy_model.interpolant.pad_token
+            token_lens = (self.x_saved != pad).sum(dim=1)
+            print(f"[BUFFER] peptide token length: median={token_lens.median().item()} "
+                  f"min={token_lens.min().item()} max={token_lens.max().item()} "
+                  f"(n={token_lens.shape[0]})")
+    def training_step(self, batch, batch_idx):
+        """Training step - batch is ignored, we use saved buffer."""
+        # Use mini-batch sampling from buffer to avoid OOM
+        buffer_size = self.x_saved.shape[0]
+        mini_batch_size = getattr(self.args, 'training_mini_batch_size', 6)
+        # Randomly sample mini_batch_size sequences from buffer
+        if buffer_size > mini_batch_size:
+            indices = torch.randperm(buffer_size, device=self.x_saved.device)[:mini_batch_size]
+            x_final = self.x_saved[indices]
+            log_rnd = self.log_rnd_saved[indices]
+        else:
+            # If buffer is smaller than mini_batch_size, use all
+            x_final = self.x_saved
+            log_rnd = self.log_rnd_saved
+        joint = getattr(self.args, 'joint_training', False)
+        policy_loss = None
+        planner_loss = None
+        if self.train_policy:
+            # Train policy with WDCE loss
+            policy_loss = self.policy_model.loss_wdce_flexible(
+                log_rnd,
+                x_final,
+                num_replicates=self.args.wdce_num_replicates,
+                centering=self.args.centering,
+                centering_strength=self.args.centering_strength
+            )
+            self.log('train/policy_loss', policy_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+        if (not self.train_policy) or joint:
+            # Train planner with appropriate loss based on ablation flags
+            if self.args.disable_insertion_planner:
+                # Ablation: only train unmasking/remasking planner (no insertion head)
+                planner_loss = self.policy_model.loss_planner_flexible(
+                    log_rnd,
+                    x_final,
+                    num_replicates=self.args.wdce_num_replicates,
+                    centering=self.args.centering,
+                    centering_strength=self.args.centering_strength
+                )
+                self.log('train/planner_unmask_loss', planner_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+                self.log('train/planner_insert_loss', 0.0, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+                self.log('train/planner_loss', planner_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+            elif self.args.disable_unmasking_planner:
+                # Ablation: only train insertion planner (no remasking head)
+                unmask_loss, insert_loss, _ = self.policy_model.loss_insert_planner_flexible(
+                    log_rnd,
+                    x_final,
+                    num_replicates=self.args.wdce_num_replicates,
+                    centering=self.args.centering,
+                    centering_strength=self.args.centering_strength
+                )
+                # Zero out the unmasking component - only backprop insertion loss
+                planner_loss = insert_loss
+                self.log('train/planner_unmask_loss', 0.0, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+                self.log('train/planner_insert_loss', insert_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+                self.log('train/planner_loss', planner_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+            else:
+                # Full planner: train both remasking + insertion
+                unmask_loss, insert_loss, planner_loss = self.policy_model.loss_insert_planner_flexible(
+                    log_rnd,
+                    x_final,
+                    num_replicates=self.args.wdce_num_replicates,
+                    centering=self.args.centering,
+                    centering_strength=self.args.centering_strength
+                )
+                self.log('train/planner_unmask_loss', unmask_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+                self.log('train/planner_insert_loss', insert_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+                self.log('train/planner_loss', planner_loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+        # Combine losses depending on mode
+        if joint:
+            loss = policy_loss + planner_loss
+            mode_value = 0.5
+        elif self.train_policy:
+            loss = policy_loss
+            mode_value = 0.0
+        else:
+            loss = planner_loss
+            mode_value = 1.0
+        # Log overall loss and mode
+        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, sync_dist=True)
+        self.log('train/mode', mode_value, prog_bar=True, sync_dist=True)
+        return loss
+    def on_train_epoch_end(self):
+        """Called at the end of each training epoch - only rank 0 evaluates."""
+        # Only evaluate every N epochs to save time
+        eval_frequency = getattr(self.args, 'eval_every_n_epochs', 5)
+        is_last_epoch = (self.trainer and self.current_epoch == self.trainer.max_epochs - 1)
+        if self.global_rank == 0 and (self.current_epoch % eval_frequency == 0 or is_last_epoch):
+            # Sample eval batch with updated policy
+            valid_seqs, affinity, sol, hemo, nf, permeability, valid_fraction = \
+                sample_peptides_eval(
+                    self.policy_model, self.reward_model, self.analyzer,
+                    self.tokenizer,
+                    steps=self.args.total_num_steps,
+                    mask=self.policy_model.interpolant.mask_token,
+                    pad=self.policy_model.interpolant.pad_token,
+                    batch_size=50,
+                    max_length=self.args.max_length,
+                    quality_mode=self._get_quality_mode(),
+                    num_remasking=self.args.num_remasking,
+                    return_valid=True,
+                )
+            # Uniqueness (% of valid that are unique) and Diversity
+            # (1 - mean pairwise Tanimoto on Morgan FPs of unique sequences),
+            # matching evaluate_peptide_table.py / evaluate_mol_table.py.
+            num_valid = len(valid_seqs)
+            unique_seqs = list(set(valid_seqs))
+            num_unique = len(unique_seqs)
+            uniqueness = num_unique / num_valid * 100.0 if num_valid > 0 else 0.0
+            diversity = self._diversity_evaluator(unique_seqs) if num_unique > 1 else 0.0
+            # Append to logs
+            self.affinity_log.append(affinity)
+            self.sol_log.append(sol)
+            self.hemo_log.append(hemo)
+            self.nf_log.append(nf)
+            self.permeability_log.append(permeability)
+            self.valid_fraction_log.append(valid_fraction)
+            self.uniqueness_log.append(uniqueness)
+            self.diversity_log.append(diversity)
+            # Compute reward stats
+            mean_reward = self.final_rewards_saved.mean().item()
+            min_reward = self.final_rewards_saved.min().item()
+            max_reward = self.final_rewards_saved.max().item()
+            median_reward = self.final_rewards_saved.median().item()
+            # Log metrics
+            self.log_dict({
+                "eval/affinity": np.mean(affinity),
+                "eval/sol": np.mean(sol),
+                "eval/hemo": np.mean(hemo),
+                "eval/nf": np.mean(nf),
+                "eval/permeability": np.mean(permeability),
+                "eval/valid_fraction": valid_fraction,
+                "eval/uniqueness": uniqueness,
+                "eval/diversity": diversity,
+                "eval/mean_reward_search": mean_reward,
+                "eval/min_reward_search": min_reward,
+                "eval/max_reward_search": max_reward,
+                "eval/median_reward_search": median_reward
+            })
+            print(f"epoch {self.current_epoch} | affinity {np.mean(affinity):.4f} | "
+                  f"sol {np.mean(sol):.4f} | hemo {np.mean(hemo):.4f} | "
+                  f"nf {np.mean(nf):.4f} | permeability {np.mean(permeability):.4f} | "
+                  f"valid {valid_fraction:.4f} | uniq {uniqueness:.2f}% | div {diversity:.4f}")
+    def on_fit_end(self):
+        """Called at the end of training - save results."""
+        if self.global_rank == 0:
+            # Save logs and plot
+            base_path = self.args.base_path
+            plot_path = f'{base_path}/results/{self.args.run_name}'
+            os.makedirs(plot_path, exist_ok=True)
+            output_log_path = f'{plot_path}/log_{self.filename}.csv'
+            save_logs_to_file(self.valid_fraction_log, self.affinity_log,
+                              self.sol_log, self.hemo_log, self.nf_log,
+                              self.permeability_log, output_log_path,
+                              uniqueness_log=self.uniqueness_log,
+                              diversity_log=self.diversity_log)
+            # Final generation
+            x_eval, affinity, sol, hemo, nf, permeability, valid_fraction, df = \
+                sample_peptides_eval(
+                    self.policy_model, self.reward_model, self.analyzer,
+                    self.tokenizer,
+                    steps=self.args.total_num_steps,
+                    mask=self.policy_model.interpolant.mask_token,
+                    pad=self.policy_model.interpolant.pad_token,
+                    batch_size=50,
+                    max_length=self.args.max_length,
+                    quality_mode=self._get_quality_mode(),
+                    num_remasking=self.args.num_remasking,
+                    dataframe=True,
+                )
+            df.to_csv(f'{plot_path}/{self.prot_name}_generation_results.csv', index=False)
+def save_logs_to_file(valid_fraction_log, affinity_log,
+                      sol_log, hemo_log, nf_log,
+                      permeability_log, output_path,
+                      uniqueness_log=None, diversity_log=None):
+    """
+    Saves the logs to a CSV file.
+    """
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    log_data = {
+        "Iteration": list(range(1, len(valid_fraction_log) + 1)),
+        "Valid Fraction": valid_fraction_log,
+        "Binding Affinity": affinity_log,
+        "Solubility": sol_log,
+        "Hemolysis": hemo_log,
+        "Nonfouling": nf_log,
+        "Permeability": permeability_log,
+    }
+    if uniqueness_log is not None:
+        log_data["Uniqueness (%)"] = uniqueness_log
+    if diversity_log is not None:
+        log_data["Diversity"] = diversity_log
+    df = pd.DataFrame(log_data)
+    df.to_csv(output_path, index=False)
+class DummyDataset(torch.utils.data.Dataset):
+    """Dummy dataset for Lightning trainer (we use buffer instead)."""
+    def __init__(self, size=10):
+        self.size = size
+    def __len__(self):
+        return self.size
+    def __getitem__(self, idx):
+        return torch.zeros(1)  # Dummy data
+def main():
+    """Main entry point for distributed training."""
+    # Disable DDP optimizer for higher-order ops like flex_attention
+    import torch._dynamo
+    torch._dynamo.config.optimize_ddp = False
+    argparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    argparser.add_argument('--base_path', type=str, default=REPO_ROOT)
+    argparser.add_argument('--learning_rate', type=float, default=1e-4)
+    argparser.add_argument('--num_epochs', type=int, default=100)
+    argparser.add_argument('--num_accum_steps', type=int, default=4)
+    argparser.add_argument('--truncate_steps', type=int, default=50)
+    argparser.add_argument("--truncate_kl", type=str2bool, default=False)
+    argparser.add_argument('--gumbel_temp', type=float, default=1.0)
+    argparser.add_argument('--gradnorm_clip', type=float, default=1.0)
+    argparser.add_argument('--batch_size', type=int, default=50)
+    argparser.add_argument('--name', type=str, default='debug')
+    argparser.add_argument('--total_num_steps', type=int, default=128)
+    argparser.add_argument('--copy_flag_temp', type=float, default=None)
+    argparser.add_argument('--save_every_n_epochs', type=int, default=10)
+    argparser.add_argument('--alpha_schedule_warmup', type=int, default=0)
+    argparser.add_argument("--seed", type=int, default=0)
+    # new
+    argparser.add_argument('--run_name', type=str, default='peptides')
+    argparser.add_argument("--save_path_dir", default=os.path.join(REPO_ROOT, "checkpoints", "finetune_peptides"), type=str)
+    # mcts
+    argparser.add_argument('--num_sequences', type=int, default=10)
+    argparser.add_argument('--max_length', type=int, default=1024)
+    argparser.add_argument('--min_length', type=int, default=0,
+                           help='Minimum sequence length (in SMILES SPE tokens). '
+                                'Samples shorter than this are dropped from the buffer. 0 disables the filter.')
+    argparser.add_argument('--num_children', type=int, default=50)
+    argparser.add_argument('--num_iter', type=int, default=30)
+    argparser.add_argument('--seq_length', type=int, default=1024)
+    argparser.add_argument('--time_conditioning', action='store_true', default=False)
+    argparser.add_argument('--mcts_sampling', type=int, default=0) # for batched categorical sampling: '0' means gumbel noise
+    argparser.add_argument('--buffer_size', type=int, default=100)
+    argparser.add_argument('--wdce_num_replicates', type=int, default=16)
+    argparser.add_argument('--noise_removal', action='store_true', default=False)
+    argparser.add_argument('--grad_clip', action='store_true', default=False)
+    argparser.add_argument('--resample_every_n_step', type=int, default=10)
+    argparser.add_argument('--exploration', type=float, default=0.1)
+    argparser.add_argument('--reset_every_n_step', type=int, default=100)
+    argparser.add_argument('--alpha', type=float, default=0.01)
+    argparser.add_argument('--scalarization', type=str, default='sum')
+    argparser.add_argument('--no_mcts', action='store_true', default=False)
+    argparser.add_argument("--centering", action='store_true', default=False)
+    argparser.add_argument("--centering_strength", type=float, default=1.0)
+    # ELBO-based log_rnd estimation
+    argparser.add_argument('--elbo_rnd', action='store_true', default=False,
+        help='If set, compute log_rnd via ELBO instead of trajectory rollout')
+    argparser.add_argument('--elbo_rnd_num_samples', type=int, default=16,
+        help='Number of noisy time samples per sequence for ELBO-based log_rnd estimation')
+    # adaptive schedule parameters
+    argparser.add_argument('--use_adaptive_schedule', action='store_true', default=True)
+    argparser.add_argument('--schedule_hidden_dim', type=int, default=256)
+    argparser.add_argument('--schedule_num_layers', type=int, default=2)
+    argparser.add_argument('--schedule_loss_weight', type=float, default=0.1)
+    argparser.add_argument('--adaptive_threshold', type=float, default=0.5)
+    argparser.add_argument('--freeze_base_model', action='store_true', default=False)
+    argparser.add_argument('--schedule_warmup_epochs', type=int, default=0, help='Number of initial epochs to train WITHOUT remasking in buffer generation')
+    argparser.add_argument('--alternation_frequency', type=int, default=20, help='Number of epochs to train each model before alternating (1=alternate every epoch)')
+    argparser.add_argument('--planner_learning_rate', type=float, default=None, help='Separate learning rate for planner heads (defaults to --learning_rate if not set)')
+    # objectives
+    argparser.add_argument('--num_obj', type=int, default=5)
+    argparser.add_argument('--prot_seq', type=str, default=None)
+    argparser.add_argument('--prot_name', type=str, default='glast',
+                           help='Protein target name. Looked up in PROTEINS dict unless --prot_seq is given.')
+    argparser.add_argument('--devices', type=int, default=-1)
+    argparser.add_argument('--checkpoint_path', type=str, default=None)
+    argparser.add_argument('--resume_ckpt', type=str, default=None,
+        help='Path to a Lightning last.ckpt to resume training from (restores epoch/optimizer/planner state). '
+             'New checkpoints continue in the same directory as this checkpoint.')
+    # remasking
+    argparser.add_argument('--num_remasking', type=int, default=5)
+    argparser.add_argument('--quality_threshold', type=float, default=1)
+    # length cutoff (peptide-bond filter) + buffer starvation guard
+    argparser.add_argument('--min_peptide_bonds', type=int, default=4,
+        help='Minimum backbone peptide bonds for a sample to count as valid. '
+             '0 disables the cutoff. Filters degenerate short reward-hacked molecules.')
+    argparser.add_argument('--max_buffer_attempts', type=int, default=100,
+        help='Max sampling rounds per buffer refill before giving up (caps wasted GPU when validity is low).')
+    argparser.add_argument('--buffer_starvation_patience', type=int, default=10,
+        help='If 0 valid samples after this many rounds, abort the refill early (starvation guard).')
+    # planner ablation flags
+    argparser.add_argument('--disable_planner', action='store_true', help='If set, disable remasking completely and only train policy (not planner) for quality optimization')
+    argparser.add_argument('--disable_insertion_planner', action='store_true', help='Ablation: disable insertion quality filtering but keep unmasking/remasking planner')
+    argparser.add_argument('--disable_unmasking_planner', action='store_true', help='Ablation: disable unmasking/remasking planner but keep insertion quality filtering')
+    argparser.add_argument('--joint_training', action='store_true', help='Ablation: train policy and planner jointly each step (no alternation, both unfrozen, summed loss). Incompatible with --disable_planner.')
+    # performance optimization
+    argparser.add_argument('--eval_every_n_epochs', type=int, default=5, help='Evaluate only every N epochs to save time')
+    argparser.add_argument('--num_training_steps_per_epoch', type=int, default=10, help='Number of gradient updates per epoch')
+    argparser.add_argument('--training_mini_batch_size', type=int, default=6, help='Mini-batch size for training from buffer to avoid OOM')
+    argparser.add_argument('--pool_size', type=int, default=0,
+        help='If >0, maintain a persistent pool of this size and refresh a fraction each resample step (0=disabled, classic buffer). Helps preserve uniqueness/diversity over training.')
+    argparser.add_argument('--pool_refresh_fraction', type=float, default=0.2,
+        help='Fraction of pool to replace each resample step (only used when pool_size>0)')
+    args = argparser.parse_args()
+    # Default planner LR to policy LR if not specified
+    if args.planner_learning_rate is None:
+        args.planner_learning_rate = args.learning_rate
+    # Set seed
+    pl.seed_everything(args.seed)
+    # Load models
+    checkpoint_path = args.checkpoint_path if args.checkpoint_path else \
+        os.path.join(REPO_ROOT, 'pretrained', 'anylength_pep.ckpt')
+    # Update args.checkpoint_path to ensure it's saved in hyperparameters for later inference
+    args.checkpoint_path = checkpoint_path
+    PROTEINS = {
+        'amhr': 'MLGSLGLWALLPTAVEAPPNRRTCVFFEAPGVRGSTKTLGELLDTGTELPRAIRCLYSRCCFGIWNLTQDRAQVEMQGCRDSDEPGCESLHCDPSPRAHPSPGSTLFTCSCGTDFCNANYSHLPPPGSPGTPGSQGPQAAPGESIWMALVLLGLFLLLLLLLGSIILALLQRKNYRVRGEPVPEPRPDSGRDWSVELQELPELCFSQVIREGGHAVVWAGQLQGKLVAIKAFPPRSVAQFQAERALYELPGLQHDHIVRFITASRGGPGRLLSGPLLVLELHPKGSLCHYLTQYTSDWGSSLRMALSLAQGLAFLHEERWQNGQYKPGIAHRDLSSQNVLIREDGSCAIGDLGLALVLPGLTQPPAWTPTQPQGPAAIMEAGTQRYMAPELLDKTLDLQDWGMALRRADIYSLALLLWEILSRCPDLRPDSSPPPFQLAYEAELGNTPTSDELWALAVQERRRPYIPSTWRCFATDPDGLRELLEDCWDADPEARLTAECVQQRLAALAHPQESHPFPESCPRGCPPLCPEDCTSIPAPTILPCRPQRSACHFSVQQGPCSRNPQPACTLSPV',
+        'tfr':  'MMDQARSAFSNLFGGEPLSYTRFSLARQVDGDNSHVEMKLAVDEEENADNNTKANVTKPKRCSGSICYGTIAVIVFFLIGFMIGYLGYCKGVEPKTECERLAGTESPVREEPGEDFPAARRLYWDDLKRKLSEKLDSTDFTGTIKLLNENSYVPREAGSQKDENLALYVENQFREFKLSKVWRDQHFVKIQVKDSAQNSVIIVDKNGRLVYLVENPGGYVAYSKAATVTGKLVHANFGTKKDFEDLYTPVNGSIVIVRAGKITFAEKVANAESLNAIGVLIYMDQTKFPIVNAELSFFGHAHLGTGDPYTPGFPSFNHTQFPPSRSSGLPNIPVQTISRAAAEKLFGNMEGDCPSDWKTDSTCRMVTSESKNVKLTVSNVLKEIKILNIFGVIKGFVEPDHYVVVGAQRDAWGPGAAKSGVGTALLLKLAQMFSDMVLKDGFQPSRSIIFASWSAGDFGSVGATEWLEGYLSSLHLKAFTYINLDKAVLGTSNFKVSASPLLYTLIEKTMQNVKHPVTGQFLYQDSNWASKVEKLTLDNAAFPFLAYSGIPAVSFCFCEDTDYPYLGTTMDTYKELIERIPELNKVARAAAEVAGQFVIKLTHDVELNLDYERYNSQLLSFVRDLNQYRADIKEMGLSLQWLYSARGDFFRATSRLTTDFGNAEKTDRFVMKKLNDRVMRVEYHFLSPYVSPKESPFRHVFWGSGSHTLPALLENLKLRKQNNGAFNETLFRNQLALATWTIQGAANALSGDVWDIDNEF',
+        'gfap': 'MERRRITSAARRSYVSSGEMMVGGLAPGRRLGPGTRLSLARMPPPLPTRVDFSLAGALNAGFKETRASERAEMMELNDRFASYIEKVRFLEQQNKALAAELNQLRAKEPTKLADVYQAELRELRLRLDQLTANSARLEVERDNLAQDLATVRQKLQDETNLRLEAENNLAAYRQEADEATLARLDLERKIESLEEEIRFLRKIHEEEVRELQEQLARQQVHVELDVAKPDLTAALKEIRTQYEAMASSNMHEAEEWYRSKFADLTDAAARNAELLRQAKHEANDYRRQLQSLTCDLESLRGTNESLERQMREQEERHVREAASYQEALARLEEEGQSLKDEMARHLQEYQDLLNVKLALDIEIATYRKLLEGEENRITIPVQTFSNLQIRETSLDTKSVSEGHLKRNIVVKTVEMRDGEVIKESKQEHKDVM',
+        'glp1': 'MAGAPGPLRLALLLLGMVGRAGPRPQGATVSLWETVQKWREYRRQCQRSLTEDPPPATDLFCNRTFDEYACWPDGEPGSFVNVSCPWYLPWASSVPQGHVYRFCTAEGLWLQKDNSSLPWRDLSECEESKRGERSSPEEQLLFLYIIYTVGYALSFSALVIASAILLGFRHLHCTRNYIHLNLFASFILRALSVFIKDAALKWMYSTAAQQHQWDGLLSYQDSLSCRLVFLLMQYCVAANYYWLLVEGVYLYTLLAFSVLSEQWIFRLYVSIGWGVPLLFVVPWGIVKYLYEDEGCWTRNSNMNYWLIIRLPILFAIGVNFLIFVRVICIVVSKLKANLMCKTDIKCRLAKSTLTLIPLLGTHEVIFAFVMDEHARGTLRFIKLFTELSFTSFQGLMVAILYCFVNNEVQLEFRKSWERWRLEHLHIQRDSSMKPLKCPTSSLSSGATAGSSMYTATCQASCS',
+        'glast': 'MTKSNGEEPKMGGRMERFQQGVRKRTLLAKKKVQNITKEDVKSYLFRNAFVLLTVTAVIVGTILGFTLRPYRMSYREVKYFSFPGELLMRMLQMLVLPLIISSLVTGMAALDSKASGKMGMRAVVYYMTTTIIAVVIGIIIVIIIHPGKGTKENMHREGKIVRVTAADAFLDLIRNMFPPNLVEACFKQFKTNYEKRSFKVPIQANETLVGAVINNVSEAMETLTRITEELVPVPGSVNGVNALGLVVFSMCFGFVIGNMKEQGQALREFFDSLNEAIMRLVAVIMWYAPVGILFLIAGKIVEMEDMGVIGGQLAMYTVTVIVGLLIHAVIVLPLLYFLVTRKNPWVFIGGLLQALITALGTSSSSATLPITFKCLEENNGVDKRVTRFVLPVGATINMDGTALYEALAAIFIAQVNNFELNFGQIITISITATAASIGAAGIPQAGLVTMVIVLTSVGLPTDDITLIIAVDWFLDRLRTTTNVLGDSLGAGIVEHLSRHELKNRDVEMGNSVIEENEMKKPYQLIAQDNETEKPIDSETKM',
+        'ncam': 'LQTKDLIWTLFFLGTAVSLQVDIVPSQGEISVGESKFFLCQVAGDAKDKDISWFSPNGEKLTPNQQRISVVWNDDSSSTLTIYNANIDDAGIYKCVVTGEDGSESEATVNVKIFQKLMFKNAPTPQEFREGEDAVIVCDVVSSLPPTIIWKHKGRDVILKKDVRFIVLSNNYLQIRGIKKTDEGTYRCEGRILARGEINFKDIQVIVNVPPTIQARQNIVNATANLGQSVTLVCDAEGFPEPTMSWTKDGEQIEQEEDDEKYIFSDDSSQLTIKKVDKNDEAEYICIAENKAGEQDATIHLKVFAKPKITYVENQTAMELEEQVTLTCEASGDPIPSITWRTSTRNISSEEKASWTRPEKQETLDGHMVVRSHARVSSLTLKSIQYTDAGEYICTASNTIGQDSQSMYLEVQYAPKLQGPVAVYTWEGNQVNITCEVFAYPSATISWFRDGQLLPSSNYSNIKIYNTPSASYLEVTPDSENDFGNYNCTAVNRIGQESLEFILVQADTPSSPSIDQVEPYSSTAQVQFDEPEATGGVPILKYKAEWRAVGEEVWHSKWYDAKEASMEGIVTIVGLKPETTYAVRLAALNGKGLGEISAASEF',
+        'cereblon': 'MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNIINFDTSLPTSHTYLGADMEEFHGRTLHDDDSCQVIPVLPQVMMILIPGQTLPLQLFHPQEVSMVRNLIQKDRTFAVLAYSNVQEREAQFGTTAEIYAYREEQDFGIEIVKVKAIGRQRFKVLELRTQSDGIQQAKVQILPECVLPSTMSAVQLESLNKCQIFPSKPVSREDQCSYKWWQKYQKRKFHCANLTSWPRWLYSLYDAETLMDRIKKQLREWDENLKDDSLPSNPIDFSYRVAACLPIDDVLRIQLLKIGSAIQRLRCELDIMNKCTSLCCKQCQETEITTKNEIFSLSLCGPMAAYVNPHGYVHETLTVYKACNLNLIGRPSTEHSWFPGYAWTVAQCKICASHIGWKFTATKKDMSPQKFWGLTRSALLPTIPDTEDEISPDKVILCL',
+        'ligase': 'MASQPPEDTAESQASDELECKICYNRYNLKQRKPKVLECCHRVCAKCLYKIIDFGDSPQGVIVCPFCRFETCLPDDEVSSLPDDNNILVNLTCGGKGKKCLPENPTELLLTPKRLASLVSPSHTSSNCLVITIMEVQRESSPSLSSTPVVEFYRPASFDSVTTVSHNWTVWNCTSLLFQTSIRVLVWLLGLLYFSSLPLGIYLLVSKKVTLGVVFVSLVPSSLVILMVYGFCQCVCHEFLDCMAPPS',
+        'skp2': 'MHRKHLQEIPDLSSNVATSFTWGWDSSKTSELLSGMGVSALEKEEPDSENIPQELLSNLGHPESPPRKRLKSKGSDKDFVIVRRPKLNRENFPGVSWDSLPDELLLGIFSCLCLPELLKVSGVCKRWYRLASDESLWQTLDLTGKNLHPDVTGRLLSQGVIAFRCPRSFMDQPLAEHFSPFRVQHMDLSNSVIEVSTLHGILSQCSKLQNLSLEGLRLSDPIVNTLAKNSNLVRLNLSGCSGFSEFALQTLLSSCSRLDELNLSWCFDFTEKHVQVAVAHVSETITQLNLSGYRKNLQKSDLSTLVRRCPNLVHLDLSDSVMLKNDCFQEFFQLNYLQHLSLSRCYDIIPETLLELGEIPTLKTLQVFGIVPDGTLQLLKEALPHLQINCSHFTTIARPTIGNKKNQEIWGIKCRLTLQKPSCL',
+    }
+    if args.prot_seq is not None:
+        prot = args.prot_seq
+        prot_name = args.prot_name
+    else:
+        prot_name = args.prot_name
+        if prot_name not in PROTEINS:
+            raise ValueError(f"Unknown protein: {prot_name}. Choose from: {list(PROTEINS.keys())}")
+        prot = PROTEINS[prot_name]
+    filename = prot_name
+    curr_time = datetime.now().strftime("%Y%m%d_%H%M%S")
+    if args.no_mcts:
+        args.run_name = f'{curr_time}_adaptive_{prot_name}_resample{args.resample_every_n_step}_no-mcts'
+    else:
+        args.run_name = f'{curr_time}_adaptive_{prot_name}_resample{args.resample_every_n_step}_buffer{args.buffer_size}_numiter{args.num_iter}_children{args.num_children}'
+    # Append ablation tags to run name for easy identification
+    if args.disable_planner:
+        args.run_name += '_no_planner'
+    if args.disable_insertion_planner:
+        args.run_name += '_no_insertion_planner'
+    if args.disable_unmasking_planner:
+        args.run_name += '_no_unmasking_planner'
+    if args.joint_training:
+        if args.disable_planner:
+            raise ValueError("--joint_training is incompatible with --disable_planner (no planner to train)")
+        args.run_name += '_joint_training'
+    # When resuming, continue writing checkpoints into the SAME directory as the
+    # checkpoint we resume from (keeps model-{epoch}.ckpt contiguous) instead of
+    # spawning a fresh timestamped run directory.
+    if args.resume_ckpt:
+        args.save_path = os.path.dirname(os.path.abspath(args.resume_ckpt))
+        args.run_name = os.path.basename(args.save_path)
+    else:
+        args.save_path = os.path.join(args.save_path_dir, args.run_name)
+    os.makedirs(args.save_path, exist_ok=True)
+    set_seed(args.seed, use_cuda=False)  # Don't init CUDA before Lightning spawns DDP workers
+    # Initialize the model
+    print("Loading models..")
+    # Load pretrained model for reference (frozen)
+    pretrained = AnyOrderInsertionFlowModule.load_from_checkpoint(checkpoint_path,
+                                                map_location='cpu',
+                                                weights_only=False)
+    pretrained.eval()
+    for param in pretrained.parameters():
+        param.requires_grad = False
+    # Load checkpoint to extract config
+    checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)
+    if 'hyper_parameters' in checkpoint:
+        config = checkpoint['hyper_parameters']['config']
+    elif 'config' in checkpoint:
+        config = checkpoint['config']
+    else:
+        raise ValueError("Cannot find config in checkpoint")
+    # Update config for adaptive schedule
+    from omegaconf import OmegaConf
+    if not OmegaConf.is_config(config):
+        from omegaconf import DictConfig
+        config = DictConfig(config)
+    # Disable struct mode to allow adding new keys
+    OmegaConf.set_struct(config, False)
+    config.training.use_adaptive_schedule = args.use_adaptive_schedule
+    config.training.schedule_hidden_dim = args.schedule_hidden_dim
+    config.training.schedule_num_layers = args.schedule_num_layers
+    config.training.schedule_loss_weight = args.schedule_loss_weight
+    config.training.freeze_base_model = args.freeze_base_model
+    config.training.schedule_warmup_epochs = args.schedule_warmup_epochs
+    # Re-enable struct mode
+    OmegaConf.set_struct(config, True)
+    # Initialize policy model with adaptive schedule
+    policy_model = AnyOrderInsertionFlowModuleFT(
+        config=config,
+        args=args,
+        pretrained_checkpoint=checkpoint_path,
+        insertion_planner=True,
+    )
+    # define mcts
+    score_func_names = ['binding_affinity1', 'solubility', 'hemolysis', 'nonfouling', 'permeability']
+    tokenizer = SMILES_SPE_Tokenizer(
+        os.path.join(REPO_ROOT, 'a2d2_pep', 'pep_scoring', 'tokenizer', 'new_vocab.txt'),
+        os.path.join(REPO_ROOT, 'a2d2_pep', 'pep_scoring', 'tokenizer', 'new_splits.txt')
+    )
+    # Device will be set by Lightning automatically in DDP
+    reward_model = ScoringFunctions(score_func_names, prot_seqs=[prot], device='cpu')
+    model = PeptideFinetuner(
+        args=args,
+        policy_model=policy_model,
+        reward_model=reward_model,
+        tokenizer=tokenizer,
+        pretrained=pretrained,
+        mcts=None,
+        filename=filename,
+        prot_name=prot_name
+    )
+    # Setup checkpoint callback
+    checkpoint_callback = ModelCheckpoint(
+        dirpath=args.save_path,
+        filename='model-{epoch:02d}',
+        every_n_epochs=args.save_every_n_epochs,
+        save_top_k=-1,
+        save_last=True,  # Also save last.ckpt
+        auto_insert_metric_name=False
+    )
+    # Setup wandb logger - only on rank 0 to avoid multiple runs
+    # Check if we're in a spawned DDP process
+    rank = int(os.environ.get('LOCAL_RANK', 0))
+    if rank == 0:
+        # Defaults to your default wandb entity; override with the WANDB_ENTITY env var.
+        wandb_logger = WandbLogger(entity=os.environ.get('WANDB_ENTITY'), project='a2d2-pep', name=args.run_name)
+    else:
+        wandb_logger = None
+    # Create dummy dataloader
+    dataset = DummyDataset(size=args.num_training_steps_per_epoch)
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1)
+    # Setup trainer with DDP
+    trainer = pl.Trainer(
+        max_epochs=args.num_epochs,
+        accelerator='gpu',
+        devices=args.devices,
+        strategy=DDPStrategy(find_unused_parameters=True) if args.devices != 1 else 'auto',
+        gradient_clip_val=args.gradnorm_clip if args.grad_clip else None,
+        logger=wandb_logger,
+        callbacks=[checkpoint_callback],
+        enable_progress_bar=True,
+        log_every_n_steps=1
+    )
+    # Train (resume full training state from --resume_ckpt if provided).
+    # weights_only=False is required when resuming because these checkpoints
+    # store argparse.Namespace / OmegaConf objects in hyper_parameters, which
+    # PyTorch 2.6's default weights_only=True unpickler rejects.
+    if args.resume_ckpt:
+        trainer.fit(model, dataloader, ckpt_path=args.resume_ckpt, weights_only=False)
+    else:
+        trainer.fit(model, dataloader)
+# Script entry point: run the fine-tuning driver only when this file is
+# executed directly, not when it is imported as a module.
+if __name__ == '__main__':
+    main()

a2d2_pep/inference_quality.py ADDED Viewed

	@@ -0,0 +1,605 @@

+"""Unified peptide sampling with quality-guided planning.
+Supports 4 quality modes and optional RND (importance weight) computation.
+Quality modes:
+    "none"            - No planner, no remasking (policy-only)
+    "both"            - Both unmasking + insertion planners active
+    "unmasking_only"  - Only unmasking/remasking planner (insertion planner disabled)
+    "insertion_only"  - Only insertion planner (unmasking planner disabled)
+RND toggle:
+    compute_rnd=True  - Run pretrained model in parallel, compute step-wise log importance weights
+    compute_rnd=False - Run policy model only (use with ELBO-based RND or eval)
+"""
+import os
+import torch
+import numpy as np
+import pandas as pd
+import torch.nn.functional as F
+from sampling import SamplingResult, SamplingTraceDatapoint, _sample_tokens
+from remasking_scheduleaware import apply_schedule_aware_remasking, apply_schedule_aware_insertion
+# Accepted values for the `quality_mode` argument; `_diffusion_loop` asserts
+# membership in this set before sampling.
+QUALITY_MODES = {"none", "both", "unmasking_only", "insertion_only"}
+# When set (e.g. A2D2_QUALITY_DEBUG=1), the diffusion loop prints, per step, how
+# many already-unmasked tokens get remasked and how many proposed insertions get
+# filtered by the quality planner, plus a per-batch total. Off by default so it
+# never spams training/eval runs. The flag counts as off when the variable is
+# unset or equal to "", "0", "false" or "False"; any other value turns it on.
+_QUALITY_DEBUG = os.environ.get("A2D2_QUALITY_DEBUG", "") not in ("", "0", "false", "False")
+@torch.no_grad()
+def _diffusion_loop(
+    model, steps, mask, pad, batch_size, max_length,
+    quality_mode="both",
+    compute_rnd=False,
+    pretrained=None,
+    remasking_mode="schedule_aware",
+    num_remasking=1,
+    quality_threshold=1,
+    unmask_quality_threshold=None,
+    unmask_all=False,
+    freq_penalty=0.0,
+    return_trace=False,
+):
+    """Core discrete diffusion sampling loop for peptide generation.
+    Args:
+        model: Finetuned policy model.
+        steps: Number of diffusion steps.
+        mask: Mask token ID.
+        pad: Pad token ID.
+        batch_size: Number of sequences to generate.
+        max_length: Maximum sequence length.
+        quality_mode: One of "none", "both", "unmasking_only", "insertion_only".
+        compute_rnd: Whether to compute step-wise log importance weights.
+        pretrained: Frozen pretrained model (required if compute_rnd=True).
+        remasking_mode: Remasking strategy ("schedule_aware", "remdm", "remdm_conf").
+        num_remasking: Number of tokens to remask per step.
+        quality_threshold: Threshold for insertion quality filtering. None if schedule-driven.
+        return_trace: Whether to record sampling trace.
+    Returns:
+        (xt, log_rnd, sampling_trace)
+        log_rnd is None when compute_rnd=False.
+    """
+    assert quality_mode in QUALITY_MODES, f"quality_mode must be one of {QUALITY_MODES}"
+    if compute_rnd:
+        assert pretrained is not None, "pretrained model required when compute_rnd=True"
+    # Derive flags from quality_mode
+    use_remasking = quality_mode != "none"
+    disable_unmasking_planner = quality_mode in ("none", "insertion_only")
+    disable_insertion_planner = quality_mode in ("none", "unmasking_only")
+    device = next(model.parameters()).device
+    # Initialize all-pad sequence
+    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)
+    dt = 1.0 / steps
+    t = torch.zeros(batch_size, device=device)
+    # Precompute index tensors
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    sampling_trace = [[] for _ in range(batch_size)] if return_trace else None
+    neg_inf = torch.tensor(-np.inf, device=device)
+    if use_remasking and remasking_mode == "remdm_conf":
+        remasking_score = torch.zeros((batch_size, max_length), device=device)
+    log_rnd = None
+    dbg_total_remasked = 0
+    dbg_total_proposed_ins = 0
+    dbg_total_filtered = 0
+    for i in range(steps):
+        step_remasked = 0
+        step_proposed_ins = 0
+        step_filtered = 0
+        # --- Policy model forward ---
+        pred_rate = model(xt, t)
+        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+        len_rate = pred_rate.length_rate  # (B, L+1)
+        # --- Pretrained model forward (for RND) ---
+        if compute_rnd:
+            pretrained_pred = pretrained(xt, t)
+            pretrained_rate = pretrained.interpolant.to_actual_rate(xt, pretrained_pred, t)
+            pretrained_unmask_rate = pretrained_rate.unmask_rate.clone()  # (B, L, V)
+            pretrained_len_rate = pretrained_rate.length_rate  # (B, L+1)
+        # --- Unmask step (Euler) ---
+        mask_pos = (xt == mask).nonzero(as_tuple=True)
+        unmask_rate[xt != mask] = 0
+        unmask_rate[mask_pos + (mask,)] = 0
+        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+        if compute_rnd:
+            pretrained_unmask_rate[xt != mask] = 0
+            pretrained_unmask_rate[mask_pos + (mask,)] = 0
+            pretrained_unmask_rate[mask_pos + (mask,)] = -pretrained_unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+            pretrained_trans_prob = (pretrained_unmask_rate * dt).clamp(0.0, 1.0)
+        # Add "stay" probability
+        _xt = xt.clone()
+        _xt[xt == pad] = mask
+        trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+        )
+        if compute_rnd:
+            pretrained_trans_prob.scatter_add_(
+                2,
+                _xt.unsqueeze(-1),
+                torch.ones_like(_xt.unsqueeze(-1), dtype=pretrained_trans_prob.dtype),
+            )
+        # Remove mask token from sampling so every masked position is decoded.
+        # The final step always does this; unmask_all does it every step, so the
+        # schedule-aware remasking below re-masks the lowest-quality tokens back
+        # down to the schedule's expected mask count.
+        if i == steps - 1 or unmask_all:
+            if i == steps - 1:
+                print("Final step, removing mask token from sampling")
+            trans_prob[mask_pos + (mask,)] = 0.0
+            prob_sum = trans_prob[mask_pos].sum(dim=-1, keepdim=True)
+            mask_has_zero_prob = (prob_sum.squeeze(-1) == 0.0)
+            if mask_has_zero_prob.any():
+                num_zero_prob = mask_has_zero_prob.sum().item()
+                uniform_prob = torch.zeros((num_zero_prob, trans_prob.shape[-1]), device=device, dtype=trans_prob.dtype)
+                uniform_prob[:, :mask] = 1.0 / mask
+                trans_prob[mask_pos[0][mask_has_zero_prob], mask_pos[1][mask_has_zero_prob]] = uniform_prob
+            else:
+                trans_prob[mask_pos] = trans_prob[mask_pos] / prob_sum
+        # --- Frequency penalty: down-weight residues already abundant in the
+        # sequence so (re)decoded masked positions don't collapse onto the modal
+        # token (glycine). Only masked positions are sampled; clean positions are
+        # overwritten below, so penalizing the whole tensor is harmless. mask/pad
+        # never accumulate counts, so their entries stay untouched. Applied to a
+        # copy so trans_prob (used for RND log-probs) is unchanged.
+        sample_prob = trans_prob
+        if freq_penalty > 0.0:
+            V = trans_prob.shape[-1]
+            clean_tok = (xt != mask) & (xt != pad)  # (B, L)
+            counts = torch.zeros(batch_size, V, device=device, dtype=trans_prob.dtype)
+            counts.scatter_add_(1, torch.where(clean_tok, xt, torch.zeros_like(xt)),
+                                clean_tok.to(trans_prob.dtype))
+            sample_prob = trans_prob * torch.exp(-freq_penalty * counts).unsqueeze(1)
+        new_xt = _sample_tokens(sample_prob)
+        new_xt[xt == pad] = pad
+        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+        # Update remasking_score buffer for remdm_conf mode
+        if use_remasking and remasking_mode == "remdm_conf" and i < steps - 1:
+            token_probs = F.softmax(unmask_rate, dim=-1)  # (B, L, V)
+            chosen_probs = torch.gather(token_probs, dim=-1, index=new_xt.unsqueeze(-1)).squeeze(-1)  # (B, L)
+            changed_mask_to_token = (xt == mask) & (new_xt != mask) & (new_xt != pad)
+            remasking_score = torch.where(changed_mask_to_token, chosen_probs, remasking_score)
+        # --- Remasking step ---
+        if use_remasking and i < steps - 1:
+            if disable_unmasking_planner or not (hasattr(model, 'planner') and model.planner is not None):
+                remasking_conf = torch.zeros((batch_size, max_length), device=device)
+            else:
+                planner_out = model.planner(new_xt, t)
+                remasking_conf = planner_out["remasking_conf"].squeeze(-1)  # (B, L)
+            clean_index = (new_xt != mask) & (new_xt != pad)  # (B, L)
+            if remasking_mode == "remdm":
+                remasking_score_temp = torch.rand(remasking_conf.shape, device=device)
+            elif remasking_mode == "remdm_conf":
+                remasking_score_temp = -1.0 * remasking_conf
+            elif remasking_mode == "schedule_aware":
+                # Only remask when the unmasking planner is active. Otherwise
+                # (e.g. insertion_only / no_unmasking_planner) remasking_conf is
+                # all zeros, so this would remask schedule-excess tokens by
+                # position rather than by quality.
+                if not disable_unmasking_planner:
+                    new_xt = apply_schedule_aware_remasking(
+                        model, new_xt, t, dt, remasking_conf, clean_index,
+                        mask, neg_inf, batch_size,
+                        unmask_quality_threshold=unmask_quality_threshold,
+                    )
+                remasking_score_temp = None
+            else:
+                raise ValueError(f"Unknown remasking_mode: {remasking_mode}")
+            if remasking_score_temp is not None:
+                remasking_score_temp = torch.where(clean_index, remasking_score_temp, neg_inf)
+                for j in range(batch_size):
+                    k = min(num_remasking, int(clean_index[j].sum().item()))
+                    if k > 0:
+                        _, select_indices = torch.topk(remasking_score_temp[j], k=k)
+                        new_xt[j, select_indices] = mask
+            if _QUALITY_DEBUG:
+                # Positions that were clean before this remasking block and are
+                # now mask are exactly the unmasked tokens that got remasked.
+                step_remasked = int((clean_index & (new_xt == mask)).sum().item())
+            if return_trace:
+                for batch_idx in range(batch_size):
+                    for pos in range(max_length):
+                        if clean_index[batch_idx, pos] and new_xt[batch_idx, pos] == mask:
+                            sampling_trace[batch_idx].append(
+                                SamplingTraceDatapoint(
+                                    t=t[batch_idx].item(),
+                                    event_type="change",
+                                    position=pos,
+                                    token=mask,
+                                )
+                            )
+        # --- Compute log probabilities for RND ---
+        if compute_rnd:
+            lp = torch.gather(torch.log(trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+            lp_pre = torch.gather(torch.log(pretrained_trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+            changed_mask = (xt == mask) & (new_xt != mask) & (new_xt != pad)
+            log_policy_step = (lp * changed_mask).sum(dim=1)
+            log_pretrained_step = (lp_pre * changed_mask).sum(dim=1)
+            log_rnd = log_pretrained_step - log_policy_step  # (B,)
+        # --- Insertion step ---
+        if i != steps - 1:
+            ext = torch.poisson(len_rate * dt).long()  # (B, L+1)
+            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
+            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+            total_ext = ext.sum(dim=1)
+            valid = xt_len + total_ext <= max_length
+            ext = ext * valid.view(batch_size, 1).long()
+            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+            new_len = xt_len + total_ext  # (B,)
+            xt_tmp = torch.full_like(xt, pad)
+            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
+            xt_tmp[mask_fill] = mask
+            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+            flat_b = batch_idx_L[orig_mask]
+            flat_p = new_pos_orig[orig_mask]
+            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+            if _QUALITY_DEBUG:
+                # ext has been masked by the max-length validity check above, so
+                # this is the number of fresh mask tokens actually inserted.
+                step_proposed_ins = int(ext.sum().item())
+            # Schedule-aware insertion quality filtering
+            if use_remasking and not disable_insertion_planner:
+                if compute_rnd:
+                    xt_tmp_before = xt_tmp.clone()
+                dbg_nonpad_before = int((xt_tmp != pad).sum().item()) if _QUALITY_DEBUG else 0
+                xt_tmp = apply_schedule_aware_insertion(
+                    model, xt_tmp, new_xt, t, dt, ext, mask, pad, max_length,
+                    orig_mask, new_pos_orig, quality_threshold
+                )
+                if _QUALITY_DEBUG:
+                    # Filtering only drops/compacts tokens, so the drop in
+                    # non-pad count is the number of insertions filtered out.
+                    step_filtered = dbg_nonpad_before - int((xt_tmp != pad).sum().item())
+                if compute_rnd:
+                    # Compute corrected ext based on what actually stayed
+                    ext_corrected = torch.zeros_like(ext)
+                    for b in range(batch_size):
+                        after_len = xt_tmp[b].ne(pad).sum().item()
+                        orig_len = xt_len[b].item()
+                        surviving_insertions = after_len - orig_len
+                        if total_ext[b] > 0:
+                            ratio = surviving_insertions / total_ext[b].item()
+                            ext_corrected[b] = (ext[b].float() * ratio).long()
+                else:
+                    ext_corrected = ext
+            else:
+                ext_corrected = ext
+            # Compute insertion log_rnd
+            if compute_rnd:
+                insertion_rate = (len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+                pretrained_insertion_rate = (pretrained_len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+                log_policy_insert = (ext_corrected * torch.log(insertion_rate) - insertion_rate).sum(dim=1)
+                log_pretrained_insert = (ext_corrected * torch.log(pretrained_insertion_rate) - pretrained_insertion_rate).sum(dim=1)
+                log_insert_diff = log_pretrained_insert - log_policy_insert
+                log_rnd += log_insert_diff
+        else:
+            xt_tmp = new_xt
+        if return_trace:
+            for batch_idx in range(batch_size):
+                for j in range(max_length):
+                    if xt[batch_idx, j] != pad and xt[batch_idx, j] != new_xt[batch_idx, j]:
+                        sampling_trace[batch_idx].append(
+                            SamplingTraceDatapoint(
+                                t=t[batch_idx].item(),
+                                event_type="change",
+                                position=j,
+                                token=new_xt[batch_idx, j].item(),
+                            )
+                        )
+                if i != steps - 1:
+                    for j in range(max_length):
+                        id = max_length - j - 1
+                        if ext[batch_idx, id]:
+                            sampling_trace[batch_idx].append(
+                                SamplingTraceDatapoint(
+                                    t=t[batch_idx].item(),
+                                    event_type="insertion",
+                                    position=id,
+                                    token=mask,
+                                )
+                            )
+        if _QUALITY_DEBUG:
+            dbg_total_remasked += step_remasked
+            dbg_total_proposed_ins += step_proposed_ins
+            dbg_total_filtered += step_filtered
+            print(
+                f"[QUALITY {quality_mode}] step {i+1}/{steps}: "
+                f"remasked {step_remasked} unmasked tokens -> mask | "
+                f"insertions proposed {step_proposed_ins}, "
+                f"filtered {step_filtered}, kept {step_proposed_ins - step_filtered}"
+            )
+        xt = xt_tmp
+        t = t + dt
+    if _QUALITY_DEBUG:
+        print(
+            f"[QUALITY {quality_mode}] TOTAL over {steps} steps (batch_size={batch_size}): "
+            f"remasked {dbg_total_remasked} unmasked tokens | "
+            f"insertions proposed {dbg_total_proposed_ins}, "
+            f"filtered {dbg_total_filtered}, kept {dbg_total_proposed_ins - dbg_total_filtered}"
+        )
+    return xt, log_rnd, sampling_trace
+@torch.no_grad()
+def sample_peptides_buffer(
+    model, reward_model, analyzer, tokenizer,
+    steps, mask, pad, batch_size, max_length,
+    quality_mode="both",
+    compute_rnd=False,
+    pretrained=None,
+    alpha=0.1,
+    remasking_mode="schedule_aware",
+    num_remasking=1,
+    quality_threshold=1,
+    min_length=0,
+):
+    """Generate peptides for training buffer.
+    Args:
+        model: Finetuned policy model.
+        reward_model: Multi-objective scoring function.
+        analyzer: PeptideAnalyzer for validation.
+        tokenizer: Tokenizer for decoding.
+        steps: Number of diffusion steps.
+        mask: Mask token ID.
+        pad: Pad token ID.
+        batch_size: Number of sequences to generate.
+        max_length: Maximum sequence length.
+        quality_mode: "none", "both", "unmasking_only", or "insertion_only".
+        compute_rnd: If True, compute step-wise log importance weights (requires pretrained).
+                     If False, returns placeholder zero log_rnd (for ELBO-based RND).
+        pretrained: Frozen pretrained model (required when compute_rnd=True).
+        alpha: RND scaling factor.
+        remasking_mode: Remasking strategy.
+        num_remasking: Number of tokens to remask per step.
+        quality_threshold: Threshold for insertion quality filtering.
+    Returns:
+        (valid_x, log_rnd, scalar_rewards, sampling_trace)
+    """
+    xt, log_rnd, trace = _diffusion_loop(
+        model, steps, mask, pad, batch_size, max_length,
+        quality_mode=quality_mode,
+        compute_rnd=compute_rnd,
+        pretrained=pretrained,
+        remasking_mode=remasking_mode,
+        num_remasking=num_remasking,
+        quality_threshold=quality_threshold,
+    )
+    device = xt.device
+    decoded_samples = tokenizer.batch_decode(xt)
+    valid_x_final = []
+    validSequences = []
+    valid_log_rnd = []
+    for idx, seq in enumerate(decoded_samples):
+        if not analyzer.is_peptide(seq):
+            continue
+        token_len = int((xt[idx] != pad).sum().item())
+        if min_length > 0 and token_len < min_length:
+            continue
+        valid_x_final.append(xt[idx])
+        validSequences.append(seq)
+        if compute_rnd:
+            valid_log_rnd.append(log_rnd[idx])
+    print("len valid sequences:", len(validSequences))
+    if len(validSequences) == 0:
+        print("[WARNING] No valid peptides generated in this batch")
+        empty_x = torch.empty((0, max_length), dtype=torch.long, device=device)
+        empty_log_rnd = torch.empty((0,), dtype=torch.float32, device=device)
+        empty_rewards = torch.empty((0,), dtype=torch.float32, device=device)
+        return empty_x, empty_log_rnd, empty_rewards, trace
+    score_vectors = reward_model(input_seqs=validSequences)
+    scalar_rewards = np.sum(score_vectors, axis=-1)
+    scalar_rewards = torch.as_tensor(scalar_rewards, dtype=torch.float32, device=device)
+    print(f"scalar reward dim{len(scalar_rewards)}")
+    valid_x_final = torch.stack(valid_x_final, dim=0)
+    if compute_rnd:
+        valid_log_rnd = torch.stack(valid_log_rnd, dim=0)
+        log_rnd_out = valid_log_rnd + (scalar_rewards / alpha)
+    else:
+        log_rnd_out = torch.zeros(len(validSequences), dtype=torch.float32, device=device)
+    return valid_x_final, log_rnd_out, scalar_rewards, trace
+@torch.no_grad()
+def sample_peptides_eval(
+    model, reward_model, analyzer, tokenizer,
+    steps, mask, pad, batch_size, max_length,
+    quality_mode="both",
+    remasking_mode="schedule_aware",
+    num_remasking=1,
+    quality_threshold=1,
+    unmask_quality_threshold=None,
+    unmask_all=False,
+    freq_penalty=0.0,
+    dataframe=False,
+    return_valid=False,
+):
+    """Generate peptides for evaluation.
+    Args:
+        model: Finetuned policy model.
+        reward_model: Multi-objective scoring function.
+        analyzer: PeptideAnalyzer for validation.
+        tokenizer: Tokenizer for decoding.
+        steps: Number of diffusion steps.
+        mask: Mask token ID.
+        pad: Pad token ID.
+        batch_size: Number of sequences to generate.
+        max_length: Maximum sequence length.
+        quality_mode: "none", "both", "unmasking_only", or "insertion_only".
+        remasking_mode: Remasking strategy.
+        num_remasking: Number of tokens to remask per step.
+        quality_threshold: Threshold for insertion quality filtering.
+        dataframe: If True, include a pandas DataFrame in the return.
+        return_valid: If True, return decoded valid sequences instead of raw token tensors.
+    Returns:
+        For multi-objective (5 objectives):
+            (samples, affinity, sol, hemo, nf, permeability, valid_fraction[, df])
+        For single objective:
+            (samples, sol, valid_fraction[, df])
+        When return_valid=True, samples is replaced with validSequences list.
+    """
+    xt, _, trace = _diffusion_loop(
+        model, steps, mask, pad, batch_size, max_length,
+        quality_mode=quality_mode,
+        compute_rnd=False,
+        remasking_mode=remasking_mode,
+        num_remasking=num_remasking,
+        quality_threshold=quality_threshold,
+        unmask_quality_threshold=unmask_quality_threshold,
+        unmask_all=unmask_all,
+        freq_penalty=freq_penalty,
+    )
+    device = xt.device
+    samples = xt.to(device)
+    decoded_samples = tokenizer.batch_decode(samples)
+    valid_x_final = []
+    validSequences = []
+    for idx, seq in enumerate(decoded_samples):
+        if analyzer.is_peptide(seq):
+            valid_x_final.append(samples[idx])
+            validSequences.append(seq)
+    print("len valid sequences:", len(validSequences))
+    valid_fraction = len(validSequences) / batch_size
+    # Determine number of objectives from reward model
+    num_objectives = len(reward_model.score_func_names) if hasattr(reward_model, 'score_func_names') else 5
+    if len(validSequences) != 0:
+        score_vectors = reward_model(input_seqs=validSequences)  # (N, num_objectives)
+        average_scores = score_vectors.T
+        if num_objectives == 1:
+            sol = average_scores[0]
+        else:
+            affinity = average_scores[0]
+            sol = average_scores[1]
+            hemo = average_scores[2]
+            nf = average_scores[3]
+            permeability = average_scores[4]
+    else:
+        zeros = [0.0]
+        if num_objectives == 1:
+            sol = zeros
+        else:
+            affinity = zeros
+            sol = zeros
+            hemo = zeros
+            nf = zeros
+            permeability = zeros
+    if num_objectives == 1:
+        if dataframe:
+            df = pd.DataFrame({
+                "Peptide Sequence": validSequences,
+                "Solubility": sol if len(validSequences) else [0.0],
+            })
+            if return_valid:
+                return validSequences, sol, valid_fraction, df
+            return samples, sol, valid_fraction, df
+        if return_valid:
+            return validSequences, sol, valid_fraction
+        return samples, sol, valid_fraction
+    if dataframe:
+        df = pd.DataFrame({
+            "Peptide Sequence": validSequences,
+            "Binding Affinity": affinity if len(validSequences) else [0.0],
+            "Solubility": sol if len(validSequences) else [0.0],
+            "Hemolysis": hemo if len(validSequences) else [0.0],
+            "Nonfouling": nf if len(validSequences) else [0.0],
+            "Permeability": permeability if len(validSequences) else [0.0],
+        })
+        if return_valid:
+            return validSequences, affinity, sol, hemo, nf, permeability, valid_fraction, df
+        return samples, affinity, sol, hemo, nf, permeability, valid_fraction, df
+    if return_valid:
+        return validSequences, affinity, sol, hemo, nf, permeability, valid_fraction
+    return samples, affinity, sol, hemo, nf, permeability, valid_fraction

a2d2_pep/pep_scoring/functions/binding.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import sys
+import os, torch
+import numpy as np
+import torch
+import pandas as pd
+import torch.nn as nn
+import esm
+from transformers import AutoModelForMaskedLM
+class ImprovedBindingPredictor(nn.Module):
+    def __init__(self,
+                 esm_dim=1280,
+                 smiles_dim=768,
+                 hidden_dim=512,
+                 n_heads=8,
+                 n_layers=3,
+                 dropout=0.1):
+        super().__init__()
+        # Define binding thresholds
+        self.tight_threshold = 7.5    # Kd/Ki/IC50 ≤ ~30nM
+        self.weak_threshold = 6.0     # Kd/Ki/IC50 > 1μM
+        # Project to same dimension
+        self.smiles_projection = nn.Linear(smiles_dim, hidden_dim)
+        self.protein_projection = nn.Linear(esm_dim, hidden_dim)
+        self.protein_norm = nn.LayerNorm(hidden_dim)
+        self.smiles_norm = nn.LayerNorm(hidden_dim)
+        # Cross attention blocks with layer norm
+        self.cross_attention_layers = nn.ModuleList([
+            nn.ModuleDict({
+                'attention': nn.MultiheadAttention(hidden_dim, n_heads, dropout=dropout),
+                'norm1': nn.LayerNorm(hidden_dim),
+                'ffn': nn.Sequential(
+                    nn.Linear(hidden_dim, hidden_dim * 4),
+                    nn.ReLU(),
+                    nn.Dropout(dropout),
+                    nn.Linear(hidden_dim * 4, hidden_dim)
+                ),
+                'norm2': nn.LayerNorm(hidden_dim)
+            }) for _ in range(n_layers)
+        ])
+        # Prediction heads
+        self.shared_head = nn.Sequential(
+            nn.Linear(hidden_dim * 2, hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+        )
+        # Regression head
+        self.regression_head = nn.Linear(hidden_dim, 1)
+        # Classification head (3 classes: tight, medium, loose binding)
+        self.classification_head = nn.Linear(hidden_dim, 3)
+    def get_binding_class(self, affinity):
+        """Convert affinity values to class indices
+        0: tight binding (>= 7.5)
+        1: medium binding (6.0-7.5)
+        2: weak binding (< 6.0)
+        """
+        if isinstance(affinity, torch.Tensor):
+            tight_mask = affinity >= self.tight_threshold
+            weak_mask = affinity < self.weak_threshold
+            medium_mask = ~(tight_mask | weak_mask)
+            classes = torch.zeros_like(affinity, dtype=torch.long)
+            classes[medium_mask] = 1
+            classes[weak_mask] = 2
+            return classes
+        else:
+            if affinity >= self.tight_threshold:
+                return 0  # tight binding
+            elif affinity < self.weak_threshold:
+                return 2  # weak binding
+            else:
+                return 1  # medium binding
+    def forward(self, protein_emb, smiles_emb):
+        protein = self.protein_norm(self.protein_projection(protein_emb))
+        smiles = self.smiles_norm(self.smiles_projection(smiles_emb))
+        #protein = protein.transpose(0, 1)
+        #smiles = smiles.transpose(0, 1)
+        # Cross attention layers
+        for layer in self.cross_attention_layers:
+            # Protein attending to SMILES
+            attended_protein = layer['attention'](
+                protein, smiles, smiles
+            )[0]
+            protein = layer['norm1'](protein + attended_protein)
+            protein = layer['norm2'](protein + layer['ffn'](protein))
+            # SMILES attending to protein
+            attended_smiles = layer['attention'](
+                smiles, protein, protein
+            )[0]
+            smiles = layer['norm1'](smiles + attended_smiles)
+            smiles = layer['norm2'](smiles + layer['ffn'](smiles))
+        # Get sequence-level representations
+        protein_pool = torch.mean(protein, dim=0)
+        smiles_pool = torch.mean(smiles, dim=0)
+        # Concatenate both representations
+        combined = torch.cat([protein_pool, smiles_pool], dim=-1)
+        # Shared features
+        shared_features = self.shared_head(combined)
+        regression_output = self.regression_head(shared_features)
+        classification_logits = self.classification_head(shared_features)
+        return regression_output, classification_logits
+class BindingAffinity:
+    def __init__(self, prot_seq, tokenizer, base_path, device=None, emb_model=None):
+        super().__init__()
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
+        # peptide embeddings
+        if emb_model is not None:
+            self.pep_model = emb_model.to(self.device).eval()
+        else:
+            self.pep_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer.to(self.device).eval()
+        self.pep_tokenizer = tokenizer
+        self.model = ImprovedBindingPredictor().to(self.device)
+        checkpoint = torch.load(f'{base_path}/functions/classifiers/binding-affinity.pt',
+                                map_location=self.device,
+                                weights_only=False)
+        self.model.load_state_dict(checkpoint['model_state_dict'])
+        self.model.eval()
+        self.esm_model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()  # load ESM-2 model
+        self.esm_model = self.esm_model.to(self.device).eval()
+        self.prot_tokenizer = alphabet.get_batch_converter() # load esm tokenizer
+        data = [("target", prot_seq)]
+        # get tokenized protein
+        _, _, prot_tokens = self.prot_tokenizer(data)
+        prot_tokens = prot_tokens.to(self.device)
+        with torch.no_grad():
+            results = self.esm_model.forward(prot_tokens, repr_layers=[33])  # Example with ESM-2
+            prot_emb = results["representations"][33]
+        self.prot_emb = prot_emb[0].to(self.device)
+        self.prot_emb = torch.mean(self.prot_emb, dim=0, keepdim=True)
+    def forward(self, input_seqs):
+        with torch.no_grad():
+            scores = []
+            for seq in input_seqs:
+                pep_tokens = self.pep_tokenizer(seq, return_tensors='pt', padding=True)
+                pep_tokens = {k: v.to(self.device) for k, v in pep_tokens.items()}
+                with torch.no_grad():
+                    emb = self.pep_model(input_ids=pep_tokens['input_ids'],
+                                         attention_mask=pep_tokens['attention_mask'],
+                                         output_hidden_states=True)
+                #emb = self.pep_model(input_ids=pep_tokens['input_ids'], attention_mask=pep_tokens['attention_mask'])
+                pep_emb = emb.last_hidden_state.squeeze(0)
+                pep_emb = torch.mean(pep_emb, dim=0, keepdim=True)
+                score, logits = self.model.forward(self.prot_emb, pep_emb)
+                scores.append(score.item())
+        return scores
+    def __call__(self, input_seqs: list):
+        return self.forward(input_seqs)

a2d2_pep/pep_scoring/functions/binding_utils.py ADDED Viewed

	@@ -0,0 +1,290 @@

+from torch import nn
+import torch
+import numpy as np
+def to_var(x):
+    if torch.cuda.is_available():
+        x = x.cuda()
+    return x
+class MultiHeadAttentionSequence(nn.Module):
+    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
+        super().__init__()
+        self.n_head = n_head
+        self.d_model = d_model
+        self.d_k = d_k
+        self.d_v = d_v
+        self.W_Q = nn.Linear(d_model, n_head*d_k)
+        self.W_K = nn.Linear(d_model, n_head*d_k)
+        self.W_V = nn.Linear(d_model, n_head*d_v)
+        self.W_O = nn.Linear(n_head*d_v, d_model)
+        self.layer_norm = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, q, k, v):
+        batch, len_q, _ = q.size()
+        batch, len_k, _ = k.size()
+        batch, len_v, _ = v.size()
+        Q = self.W_Q(q).view([batch, len_q, self.n_head, self.d_k])
+        K = self.W_K(k).view([batch, len_k, self.n_head, self.d_k])
+        V = self.W_V(v).view([batch, len_v, self.n_head, self.d_v])
+        Q = Q.transpose(1, 2)
+        K = K.transpose(1, 2).transpose(2, 3)
+        V = V.transpose(1, 2)
+        attention = torch.matmul(Q, K)
+        attention = attention / np.sqrt(self.d_k)
+        attention = F.softmax(attention, dim=-1)
+        output = torch.matmul(attention, V)
+        output = output.transpose(1, 2).reshape([batch, len_q, self.d_v*self.n_head])
+        output = self.W_O(output)
+        output = self.dropout(output)
+        output = self.layer_norm(output + q)
+        return output, attention
+class MultiHeadAttentionReciprocal(nn.Module):
+    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
+        super().__init__()
+        self.n_head = n_head
+        self.d_model = d_model
+        self.d_k = d_k
+        self.d_v = d_v
+        self.W_Q = nn.Linear(d_model, n_head*d_k)
+        self.W_K = nn.Linear(d_model, n_head*d_k)
+        self.W_V = nn.Linear(d_model, n_head*d_v)
+        self.W_O = nn.Linear(n_head*d_v, d_model)
+        self.W_V_2 = nn.Linear(d_model, n_head*d_v)
+        self.W_O_2 = nn.Linear(n_head*d_v, d_model)
+        self.layer_norm = nn.LayerNorm(d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.layer_norm_2 = nn.LayerNorm(d_model)
+        self.dropout_2 = nn.Dropout(dropout)
+    def forward(self, q, k, v, v_2):
+        batch, len_q, _ = q.size()
+        batch, len_k, _ = k.size()
+        batch, len_v, _ = v.size()
+        batch, len_v_2, _ = v_2.size()
+        Q = self.W_Q(q).view([batch, len_q, self.n_head, self.d_k])
+        K = self.W_K(k).view([batch, len_k, self.n_head, self.d_k])
+        V = self.W_V(v).view([batch, len_v, self.n_head, self.d_v])
+        V_2 = self.W_V_2(v_2).view([batch, len_v_2, self.n_head, self.d_v])
+        Q = Q.transpose(1, 2)
+        K = K.transpose(1, 2).transpose(2, 3)
+        V = V.transpose(1, 2)
+        V_2 = V_2.transpose(1,2)
+        attention = torch.matmul(Q, K)
+        attention = attention /np.sqrt(self.d_k)
+        attention_2 = attention.transpose(-2, -1)
+        attention = F.softmax(attention, dim=-1)
+        attention_2 = F.softmax(attention_2, dim=-1)
+        output = torch.matmul(attention, V)
+        output_2 = torch.matmul(attention_2, V_2)
+        output = output.transpose(1, 2).reshape([batch, len_q, self.d_v*self.n_head])
+        output_2 = output_2.transpose(1, 2).reshape([batch, len_k, self.d_v*self.n_head])
+        output = self.W_O(output)
+        output_2 = self.W_O_2(output_2)
+        output = self.dropout(output)
+        output = self.layer_norm(output + q)
+        output_2 = self.dropout(output_2)
+        output_2 = self.layer_norm(output_2 + k)
+        return output, output_2, attention, attention_2
+class FFN(nn.Module):
+    def __init__(self, d_in, d_hid, dropout=0.1):
+        super().__init__()
+        self.layer_1 = nn.Conv1d(d_in, d_hid,1)
+        self.layer_2 = nn.Conv1d(d_hid, d_in,1)
+        self.relu = nn.ReLU()
+        self.layer_norm = nn.LayerNorm(d_in)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, x):
+        residual = x
+        output = self.layer_1(x.transpose(1, 2))
+        output = self.relu(output)
+        output = self.layer_2(output)
+        output = self.dropout(output)
+        output = self.layer_norm(output.transpose(1, 2)+residual)
+        return output
+class ConvLayer(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, padding, dilation):
+        super(ConvLayer, self).__init__()
+        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding, dilation=dilation)
+        self.relu = nn.ReLU()
+    def forward(self, x):
+        out = self.conv(x)
+        out = self.relu(out)
+        return out
+class DilatedCNN(nn.Module):
+    def __init__(self, d_model, d_hidden):
+        super(DilatedCNN, self).__init__()
+        self.first_ = nn.ModuleList()
+        self.second_ = nn.ModuleList()
+        self.third_ = nn.ModuleList()
+        dilation_tuple = (1, 2, 3)
+        dim_in_tuple = (d_model, d_hidden, d_hidden)
+        dim_out_tuple = (d_hidden, d_hidden, d_hidden)
+        for i, dilation_rate in enumerate(dilation_tuple):
+            self.first_.append(ConvLayer(dim_in_tuple[i], dim_out_tuple[i], kernel_size=3, padding=dilation_rate,
+                                         dilation=dilation_rate))
+        for i, dilation_rate in enumerate(dilation_tuple):
+            self.second_.append(ConvLayer(dim_in_tuple[i], dim_out_tuple[i], kernel_size=5, padding=2*dilation_rate,
+                                          dilation=dilation_rate))
+        for i, dilation_rate in enumerate(dilation_tuple):
+            self.third_.append(ConvLayer(dim_in_tuple[i], dim_out_tuple[i], kernel_size=7, padding=3*dilation_rate,
+                                         dilation=dilation_rate))
+    def forward(self, protein_seq_enc):
+        # pdb.set_trace()
+        protein_seq_enc = protein_seq_enc.transpose(1, 2)    # protein_seq_enc's shape: B*L*d_model -> B*d_model*L
+        first_embedding = protein_seq_enc
+        second_embedding = protein_seq_enc
+        third_embedding = protein_seq_enc
+        for i in range(len(self.first_)):
+            first_embedding = self.first_[i](first_embedding)
+        for i in range(len(self.second_)):
+            second_embedding = self.second_[i](second_embedding)
+        for i in range(len(self.third_)):
+            third_embedding = self.third_[i](third_embedding)
+        # pdb.set_trace()
+        protein_seq_enc = first_embedding + second_embedding + third_embedding
+        return protein_seq_enc.transpose(1, 2)
+class ReciprocalLayerwithCNN(nn.Module):
+    def __init__(self, d_model, d_inner, d_hidden, n_head, d_k, d_v):
+        super().__init__()
+        self.cnn = DilatedCNN(d_model, d_hidden)
+        self.sequence_attention_layer = MultiHeadAttentionSequence(n_head, d_hidden, d_k, d_v)
+        self.protein_attention_layer = MultiHeadAttentionSequence(n_head, d_hidden, d_k, d_v)
+        self.reciprocal_attention_layer = MultiHeadAttentionReciprocal(n_head, d_hidden, d_k, d_v)
+        self.ffn_seq = FFN(d_hidden, d_inner)
+        self.ffn_protein = FFN(d_hidden, d_inner)
+    def forward(self, sequence_enc, protein_seq_enc):
+        # pdb.set_trace()  # protein_seq_enc.shape = B * L * d_model
+        protein_seq_enc = self.cnn(protein_seq_enc)
+        prot_enc, prot_attention = self.protein_attention_layer(protein_seq_enc, protein_seq_enc, protein_seq_enc)
+        seq_enc, sequence_attention = self.sequence_attention_layer(sequence_enc, sequence_enc, sequence_enc)
+        prot_enc, seq_enc, prot_seq_attention, seq_prot_attention = self.reciprocal_attention_layer(prot_enc, seq_enc, seq_enc, prot_enc)
+        prot_enc = self.ffn_protein(prot_enc)
+        seq_enc = self.ffn_seq(seq_enc)
+        return prot_enc, seq_enc, prot_attention, sequence_attention, prot_seq_attention, seq_prot_attention
+class ReciprocalLayer(nn.Module):
+    def __init__(self, d_model, d_inner, n_head, d_k, d_v):
+        super().__init__()
+        self.sequence_attention_layer = MultiHeadAttentionSequence(n_head, d_model, d_k, d_v)
+        self.protein_attention_layer = MultiHeadAttentionSequence(n_head, d_model, d_k, d_v)
+        self.reciprocal_attention_layer = MultiHeadAttentionReciprocal(n_head, d_model, d_k, d_v)
+        self.ffn_seq = FFN(d_model, d_inner)
+        self.ffn_protein = FFN(d_model, d_inner)
+    def forward(self, sequence_enc, protein_seq_enc):
+        prot_enc, prot_attention = self.protein_attention_layer(protein_seq_enc, protein_seq_enc, protein_seq_enc)
+        seq_enc, sequence_attention = self.sequence_attention_layer(sequence_enc, sequence_enc, sequence_enc)
+        prot_enc, seq_enc, prot_seq_attention, seq_prot_attention = self.reciprocal_attention_layer(prot_enc, seq_enc, seq_enc, prot_enc)
+        prot_enc = self.ffn_protein(prot_enc)
+        seq_enc = self.ffn_seq(seq_enc)
+        return prot_enc, seq_enc, prot_attention, sequence_attention, prot_seq_attention, seq_prot_attention

a2d2_pep/pep_scoring/functions/hemolysis.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import xgboost as xgb
+import torch
+import numpy as np
+from transformers import AutoModelForMaskedLM
+import warnings
+import numpy as np
+from rdkit import rdBase
+rdBase.DisableLog('rdApp.error')
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+class Hemolysis:
+    def __init__(self, tokenizer, base_path, device=None, emb_model=None):
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
+        self.predictor = xgb.Booster(model_file=f'{base_path}/functions/classifiers/hemolysis-xgboost.json')
+        self.emb_model = emb_model if emb_model is not None else AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer.to(device).eval()
+        self.tokenizer = tokenizer
+    def generate_embeddings(self, sequences):
+        embeddings = []
+        for sequence in sequences:
+            tokenized = self.tokenizer(sequence, return_tensors='pt')
+            tokenized = {k: v.to(self.device) for k, v in tokenized.items()}
+            with torch.no_grad():
+                output = self.emb_model(**tokenized)
+            # Mean pooling across sequence length
+            embedding = output.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
+            embeddings.append(embedding)
+        return np.array(embeddings)
+    def get_scores(self, input_seqs: list):
+        scores = np.ones(len(input_seqs))
+        features = self.generate_embeddings(input_seqs)
+        if len(features) == 0:
+            return scores
+        features = np.nan_to_num(features, nan=0.)
+        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
+        features = xgb.DMatrix(features)
+        probs = self.predictor.predict(features)
+        # return the probability of it being not hemolytic
+        return scores - probs
+    def __call__(self, input_seqs: list):
+        scores = self.get_scores(input_seqs)
+        return scores
+def unittest():
+    hemo = Hemolysis()
+    seq = ["[te]NCC(=O)N[C@H](CS)C(=O)N[C@@H](CO)C(=O)NCC(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](c1ccc(cc1)F)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](CCCO)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CO)C(=O)O"]
+    print(hemo.tokenizer.vocab_size)
+    scores = hemo(input_seqs=seq)
+    print(scores)
+if __name__ == '__main__':
+    unittest()

a2d2_pep/pep_scoring/functions/nonfouling.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import sys
+import os
+import xgboost as xgb
+import torch
+import numpy as np
+from transformers import AutoModelForMaskedLM
+import warnings
+import numpy as np
+from rdkit import Chem, rdBase, DataStructs
+rdBase.DisableLog('rdApp.error')
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+class Nonfouling:
+    def __init__(self, tokenizer, base_path, device=None, emb_model=None):
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
+        self.predictor = xgb.Booster(model_file=f'{base_path}/functions/classifiers/nonfouling-xgboost.json')
+        self.emb_model = emb_model if emb_model is not None else AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer.to(device).eval()
+        self.tokenizer = tokenizer
+    def generate_embeddings(self, sequences):
+        embeddings = []
+        for sequence in sequences:
+            tokenized = self.tokenizer(sequence, return_tensors='pt')
+            tokenized = {k: v.to(self.device) for k, v in tokenized.items()}
+            with torch.no_grad():
+                output = self.emb_model(**tokenized)
+            # Mean pooling across sequence length
+            embedding = output.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
+            embeddings.append(embedding)
+        return np.array(embeddings)
+    def get_scores(self, input_seqs: list):
+        scores = np.zeros(len(input_seqs))
+        features = self.generate_embeddings(input_seqs)
+        if len(features) == 0:
+            return scores
+        features = np.nan_to_num(features, nan=0.)
+        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
+        features = xgb.DMatrix(features)
+        scores = self.predictor.predict(features)
+        # return the probability of it being not hemolytic
+        return scores
+    def __call__(self, input_seqs: list):
+        scores = self.get_scores(input_seqs)
+        return scores
+def unittest():
+    nf = Nonfouling()
+    seq = ["NCC(=O)N[C@H](CS)C(=O)N[C@@H](CO)C(=O)NCC(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](c1ccc(cc1)F)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](CCCO)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CO)C(=O)O"]
+    scores = nf(input_seqs=seq)
+    print(scores)
+if __name__ == '__main__':
+    unittest()

a2d2_pep/pep_scoring/functions/permeability.py ADDED Viewed

	@@ -0,0 +1,170 @@

+import sys
+import os
+import xgboost as xgb
+import torch
+import numpy as np
+from transformers import AutoModelForMaskedLM
+import warnings
+import numpy as np
+from rdkit.Chem import Descriptors, rdMolDescriptors
+from rdkit import Chem, rdBase, DataStructs
+from rdkit.Chem import AllChem
+from typing import List
+rdBase.DisableLog('rdApp.error')
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+def fingerprints_from_smiles(smiles: List, size=2048):
+    """ Create ECFP fingerprints of smiles, with validity check """
+    fps = []
+    valid_mask = []
+    for i, smile in enumerate(smiles):
+        mol = Chem.MolFromSmiles(smile)
+        valid_mask.append(int(mol is not None))
+        fp = fingerprints_from_mol(mol, size=size) if mol else np.zeros((1, size))
+        fps.append(fp)
+    fps = np.concatenate(fps, axis=0)
+    return fps, valid_mask
+def fingerprints_from_mol(molecule, radius=3, size=2048, hashed=False):
+    """ Create ECFP fingerprint of a molecule """
+    if hashed:
+        fp_bits = AllChem.GetHashedMorganFingerprint(molecule, radius, nBits=size)
+    else:
+        fp_bits = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=size)
+    fp_np = np.zeros((1,))
+    DataStructs.ConvertToNumpyArray(fp_bits, fp_np)
+    return fp_np.reshape(1, -1)
+def getMolDescriptors(mol, missingVal=0):
+    """ calculate the full list of descriptors for a molecule """
+    values, names = [], []
+    for nm, fn in Descriptors._descList:
+        try:
+            val = fn(mol)
+        except:
+            val = missingVal
+        values.append(val)
+        names.append(nm)
+    custom_descriptors = {'hydrogen-bond donors': rdMolDescriptors.CalcNumLipinskiHBD,
+                          'hydrogen-bond acceptors': rdMolDescriptors.CalcNumLipinskiHBA,
+                          'rotatable bonds': rdMolDescriptors.CalcNumRotatableBonds,}
+    for nm, fn in custom_descriptors.items():
+        try:
+            val = fn(mol)
+        except:
+            val = missingVal
+        values.append(val)
+        names.append(nm)
+    return values, names
+def get_pep_dps_from_smi(smi):
+    try:
+        mol = Chem.MolFromSmiles(smi)
+    except:
+        print(f"convert smi {smi} to molecule failed!")
+        mol = None
+    dps, _ = getMolDescriptors(mol)
+    return np.array(dps)
+def get_pep_dps(smi_list):
+    if len(smi_list) == 0:
+        return np.zeros((0, 213))
+    return np.array([get_pep_dps_from_smi(smi) for smi in smi_list])
+def check_smi_validity(smiles: list):
+    valid_smi, valid_idx = [], []
+    for idx, smi in enumerate(smiles):
+        try:
+            mol = Chem.MolFromSmiles(smi) if smi else None
+            if mol:
+                valid_smi.append(smi)
+                valid_idx.append(idx)
+        except Exception as e:
+            # logger.debug(f'Error: {e} in smiles {smi}')
+            pass
+    return valid_smi, valid_idx
+class Permeability:
+    def __init__(self, tokenizer, base_path, device=None, emb_model=None):
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
+        self.predictor = xgb.Booster(model_file=f'{base_path}/functions/classifiers/permeability-xgboost.json')
+        if emb_model is not None:
+            self.emb_model = emb_model.to(self.device).eval()
+        else:
+            self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer.to(device).eval()
+        self.tokenizer = tokenizer
+    def generate_embeddings(self, sequences):
+        embeddings = []
+        for sequence in sequences:
+            tokenized = self.tokenizer(sequence, return_tensors='pt')
+            tokenized = {k: v.to(self.device) for k, v in tokenized.items()}
+            with torch.no_grad():
+                output = self.emb_model(**tokenized)
+            # Mean pooling across sequence length
+            embedding = output.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
+            embeddings.append(embedding)
+        return np.array(embeddings)
+    def get_features(self, input_seqs: list, dps=False, fps=False):
+        #valid_smiles, valid_idxes = check_smi_validity(input_seqs)
+        if fps:
+            fingerprints = fingerprints_from_smiles(input_seqs)[0]
+        else:
+            fingerprints = torch.empty((len(input_seqs), 0))
+        if dps:
+            descriptors = get_pep_dps(input_seqs)
+        else:
+            descriptors = torch.empty((len(input_seqs), 0))
+        embeddings = self.generate_embeddings(input_seqs)
+        # logger.debug(f'X_fps.shape: {X_fps.shape}, X_dps.shape: {X_dps.shape}')
+        features = np.concatenate([fingerprints, descriptors, embeddings], axis=1)
+        return features
+    def get_scores(self, input_seqs: list):
+        scores = -10 * np.ones(len(input_seqs))
+        features = self.get_features(input_seqs)
+        if len(features) == 0:
+            return scores
+        features = np.nan_to_num(features, nan=0.)
+        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
+        features = xgb.DMatrix(features)
+        scores = self.predictor.predict(features)
+        return scores
+    def __call__(self, input_seqs: list):
+        scores = self.get_scores(input_seqs)
+        return scores
+def unittest():
+    """Ad-hoc smoke test: score one hard-coded SMILES string and print the result."""
+    # NOTE(review): Permeability.__init__ declares `tokenizer` and `base_path`
+    # as required parameters, so this no-argument call raises TypeError as
+    # written. TODO: pass a tokenizer and base path here.
+    permeability = Permeability()
+    seq = ['N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1cNc2c1cc(O)cc2)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H]([C@@H](O)C(C)C)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](CC(=CN2)C1=C2C=CC=C1)C(=O)O']
+    scores = permeability(input_seqs=seq)
+    print(scores)
+# Run the smoke test only when this module is executed as a script.
+if __name__ == '__main__':
+    unittest()

a2d2_pep/pep_scoring/functions/scoring_utils.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import warnings
+import numpy as np
+from loguru import logger
+from sklearn.ensemble import RandomForestRegressor
+from rdkit.Chem import Descriptors, rdMolDescriptors
+import joblib
+from rdkit import Chem, rdBase, DataStructs
+from rdkit.Chem import AllChem
+from typing import List
+def fingerprints_from_mol(molecule, radius=3, size=2048, hashed=False):
+    """
+        Create ECFP fingerprint of a molecule
+    """
+    if hashed:
+        fp_bits = AllChem.GetHashedMorganFingerprint(molecule, radius, nBits=size)
+    else:
+        fp_bits = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=size)
+    fp_np = np.zeros((1,))
+    DataStructs.ConvertToNumpyArray(fp_bits, fp_np)
+    return fp_np.reshape(1, -1)
+def fingerprints_from_smiles(smiles: List, size=2048):
+    """ Create ECFP fingerprints of smiles, with validity check """
+    fps = []
+    valid_mask = []
+    for i, smile in enumerate(smiles):
+        mol = Chem.MolFromSmiles(smile)
+        valid_mask.append(int(mol is not None))
+        fp = fingerprints_from_mol(mol, size=size) if mol else np.zeros((1, size))
+        fps.append(fp)
+    fps = np.concatenate(fps, axis=0) if len(fps) > 0 else np.zeros((0, size))
+    return fps, valid_mask
+def getMolDescriptors(mol, missingVal=0):
+    """ calculate the full list of descriptors for a molecule """
+    values, names = [], []
+    for nm, fn in Descriptors._descList:
+        try:
+            val = fn(mol)
+        except:
+            val = missingVal
+        values.append(val)
+        names.append(nm)
+    custom_descriptors = {'hydrogen-bond donors': rdMolDescriptors.CalcNumLipinskiHBD,
+                          'hydrogen-bond acceptors': rdMolDescriptors.CalcNumLipinskiHBA,
+                          'rotatable bonds': rdMolDescriptors.CalcNumRotatableBonds,}
+    for nm, fn in custom_descriptors.items():
+        try:
+            val = fn(mol)
+        except:
+            val = missingVal
+        values.append(val)
+        names.append(nm)
+    return values, names
+def get_pep_dps_from_smi(smi):
+    try:
+        mol = Chem.MolFromSmiles(smi)
+    except:
+        print(f"convert smi {smi} to molecule failed!")
+        mol = None
+    dps, _ = getMolDescriptors(mol)
+    return np.array(dps)
+def get_pep_dps(smi_list):
+    if len(smi_list) == 0:
+        return np.zeros((0, 211))
+    return np.array([get_pep_dps_from_smi(smi) for smi in smi_list])
+def check_smi_validity(smiles: list):
+    valid_smi, valid_idx = [], []
+    for idx, smi in enumerate(smiles):
+        try:
+            mol = Chem.MolFromSmiles(smi) if smi else None
+            if mol:
+                valid_smi.append(smi)
+                valid_idx.append(idx)
+        except Exception as e:
+            # logger.debug(f'Error: {e} in smiles {smi}')
+            pass
+    return valid_smi, valid_idx

a2d2_pep/pep_scoring/functions/solubility.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import xgboost as xgb
+import torch
+import numpy as np
+from transformers import AutoModelForMaskedLM
+import warnings
+import numpy as np
+from rdkit import rdBase
+rdBase.DisableLog('rdApp.error')
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+class Solubility:
+    def __init__(self, tokenizer, base_path, device=None, emb_model=None):
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
+        self.predictor = xgb.Booster(model_file=f'{base_path}/functions/classifiers/solubility-xgboost.json')
+        if emb_model is not None:
+            self.emb_model = emb_model.to(self.device).eval()
+        else:
+            self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer.to(self.device).eval()
+        self.tokenizer = tokenizer
+    def generate_embeddings(self, sequences):
+        embeddings = []
+        for sequence in sequences:
+            tokenized = self.tokenizer(sequence, return_tensors='pt')
+            tokenized = {k: v.to(self.device) for k, v in tokenized.items()}
+            with torch.no_grad():
+                output = self.emb_model(**tokenized)
+            # Mean pooling across sequence length
+            embedding = output.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
+            embeddings.append(embedding)
+        return np.array(embeddings)
+    def get_scores(self, input_seqs: list):
+        scores = np.zeros(len(input_seqs))
+        features = self.generate_embeddings(input_seqs)
+        if len(features) == 0:
+            return scores
+        features = np.nan_to_num(features, nan=0.)
+        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
+        features = xgb.DMatrix(features)
+        scores = self.predictor.predict(features)
+        return scores
+    def __call__(self, input_seqs: list):
+        scores = self.get_scores(input_seqs)
+        return scores
+def unittest():
+    """Ad-hoc smoke test: score one hard-coded SMILES string and print the result."""
+    # NOTE(review): Solubility.__init__ declares `tokenizer` and `base_path`
+    # as required parameters, so this no-argument call raises TypeError as
+    # written. TODO: pass a tokenizer and base path here.
+    solubility = Solubility()
+    seq = ["NCC(=O)N[C@H](CS)C(=O)N[C@@H](CO)C(=O)NCC(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](c1ccc(cc1)F)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](CCCO)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CO)C(=O)O"]
+    scores = solubility(input_seqs=seq)
+    print(scores)
+# Run the smoke test only when this module is executed as a script.
+if __name__ == '__main__':
+    unittest()

a2d2_pep/pep_scoring/scoring_functions.py ADDED Viewed

	@@ -0,0 +1,79 @@

+import os
+from .tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
+from transformers import AutoModelForMaskedLM
+import numpy as np
+from .functions.binding import BindingAffinity
+from .functions.permeability import Permeability
+from .functions.solubility import Solubility
+from .functions.hemolysis import Hemolysis
+from .functions.nonfouling import Nonfouling
+# base path: this package directory (holds tokenizer/ and functions/classifiers/)
+base_path = os.path.dirname(os.path.abspath(__file__))
+class ScoringFunctions:
+    def __init__(self, score_func_names=None, prot_seqs=None, device=None):
+        """
+        Class for generating score vectors given generated sequence
+        Args:
+            score_func_names: list of scoring function names to be evaluated
+            score_weights: weights to scale scores (default: 1)
+            target_protein: sequence of target protein binder
+        """
+        emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer.to(device).eval()
+        tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/tokenizer/new_vocab.txt',
+                                        f'{base_path}/tokenizer/new_splits.txt')
+        prot_seqs = prot_seqs if prot_seqs is not None else []
+        if score_func_names is None:
+            # just do unmasking based on validity of peptide bonds
+            self.score_func_names = []
+        else:
+            self.score_func_names = score_func_names
+        # self.weights = np.array([1] * len(self.score_func_names) if score_weights is None else score_weights)
+        # binding affinities
+        self.target_protein = prot_seqs
+        print(len(prot_seqs))
+        if ('binding_affinity1' in score_func_names) and (len(prot_seqs) == 1):
+            binding_affinity1 = BindingAffinity(prot_seqs[0], tokenizer=tokenizer, base_path=base_path, device=device)
+            binding_affinity2 = None
+        elif ('binding_affinity1' in score_func_names) and ('binding_affinity2' in score_func_names) and (len(prot_seqs) == 2):
+            binding_affinity1 = BindingAffinity(prot_seqs[0], tokenizer=tokenizer, base_path=base_path, device=device)
+            binding_affinity2 = BindingAffinity(prot_seqs[1], tokenizer=tokenizer, base_path=base_path, device=device)
+        else:
+            print("here")
+            binding_affinity1 = None
+            binding_affinity2 = None
+        permeability = Permeability(tokenizer=tokenizer, base_path=base_path, device=device, emb_model=emb_model)
+        sol = Solubility(tokenizer=tokenizer, base_path=base_path, device=device, emb_model=emb_model)
+        nonfouling = Nonfouling(tokenizer=tokenizer, base_path=base_path, device=device, emb_model=emb_model)
+        hemo = Hemolysis(tokenizer=tokenizer, base_path=base_path, device=device, emb_model=emb_model)
+        self.all_funcs = {'binding_affinity1': binding_affinity1,
+                          'binding_affinity2': binding_affinity2,
+                          'permeability': permeability,
+                          'nonfouling': nonfouling,
+                          'solubility': sol,
+                          'hemolysis': hemo
+                          }
+    def forward(self, input_seqs):
+        scores = []
+        for i, score_func in enumerate(self.score_func_names):
+            score = self.all_funcs[score_func](input_seqs = input_seqs)
+            scores.append(score)
+        # convert to numpy arrays with shape (num_sequences, num_functions)
+        scores = np.float32(scores).T
+        return scores
+    def __call__(self, input_seqs: list):
+        return self.forward(input_seqs)

a2d2_pep/pep_scoring/tokenizer/my_tokenizers.py ADDED Viewed

	@@ -0,0 +1,424 @@

+import collections
+import os
+import re
+from typing import List, Optional
+from transformers import PreTrainedTokenizer
+from SmilesPE.tokenizer import SPE_Tokenizer
+import torch
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):
+        token = token.rstrip("\n")
+        vocab[token] = index
+    return vocab
+class Atomwise_Tokenizer(object):
+    """Run atom-level SMILES tokenization"""
+    def __init__(self):
+        """ Constructs a atom-level Tokenizer.
+        """
+        # self.regex_pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
+        self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
+        self.regex = re.compile(self.regex_pattern)
+    def tokenize(self, text):
+        """ Basic Tokenization of a SMILES.
+        """
+        tokens = [token for token in self.regex.findall(text)]
+        return tokens
+class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
+    r"""
+    Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+    Args:
+        vocab_file (:obj:`string`):
+            File containing the vocabulary.
+        spe_file (:obj:`string`):
+            File containing the trained SMILES Pair Encoding vocabulary.
+        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+    """
+    def __init__(self, vocab_file, spe_file,
+                unk_token="[UNK]",
+                sep_token="[SEP]",
+                pad_token="[PAD]",
+                cls_token="[CLS]",
+                mask_token="[MASK]",
+                **kwargs):
+        if not os.path.isfile(vocab_file):
+            raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
+        if not os.path.isfile(spe_file):
+            raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file))
+        self.vocab = load_vocab(vocab_file)
+        self.spe_vocab = open(spe_file, 'r', encoding='utf-8')
+        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+        self.spe_tokenizer = SPE_Tokenizer(self.spe_vocab)
+        super().__init__(
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            **kwargs)
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+    def get_vocab(self):
+        return dict(self.vocab, **self.added_tokens_encoder)
+    def _tokenize(self, text):
+        return self.spe_tokenizer.tokenize(text).split(' ')
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+    # changed encode and decode functions
+    def encode(self, token_array):
+        token_ids = []
+        token_ids.append(2)
+        for token in token_array:
+            id = self._convert_token_to_id(token)
+            token_ids.append(id)
+        token_ids.append(3)
+        token_ids = torch.tensor([token_ids])
+        attn_mask = torch.ones_like(token_ids)
+        return {'input_ids': token_ids, 'attention_mask': attn_mask}
+    def decode(self, token_ids, skip_special_tokens=True):
+        token_ids = token_ids.squeeze(0).cpu().tolist()
+        token_array = []
+        for idx in token_ids:
+            if idx == 3:  # Stop decoding when token ID 3 is encountered
+                break
+            if skip_special_tokens and idx in self.all_special_ids:
+                continue
+            token = self._convert_id_to_token(idx)
+            token_array.append(token)
+        sequence = "".join(token_array)
+        return sequence
+    def batch_decode(self, batch_token_ids, skip_special_tokens=True):
+        sequences = []
+        for token_ids in batch_token_ids:
+            sequences.append(self.decode(token_ids))
+        return sequences
+    def get_token_split(self, token_ids):
+        if isinstance(token_ids, torch.Tensor):
+            token_ids = token_ids.cpu().tolist()
+        token_array = []
+        for seq_ids in token_ids:
+            seq_array = []
+            for id in seq_ids:
+                token = self._convert_id_to_token(id)
+                seq_array.append(token)
+            token_array.append(seq_array)
+        return token_array
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        out_string = " ".join(tokens).replace(" ##", "").strip()
+        return out_string
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A BERT sequence has the following format:
+        - single sequence: ``[CLS] X [SEP]``
+        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` method.
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True if the token list is already formatted with special tokens for the model
+        Returns:
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formated with special tokens for the model."
+                )
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A BERT sequence pair mask has the following format:
+        ::
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence    | second sequence |
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+    def save_vocabulary(self, vocab_path):
+        """
+        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
+        Args:
+            vocab_path (:obj:`str`):
+                The directory in which to save the vocabulary.
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
+        """
+        index = 0
+        vocab_file = vocab_path
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    index = token_index
+                writer.write(token + "\n")
+                index += 1
+        return (vocab_file,)
+class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
+    r"""
+    Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    should refer to the superclass for more information regarding methods.
+    Args:
+        vocab_file (:obj:`string`):
+            File containing the vocabulary.
+        unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
+            for sequence classification or for a text and a question for question answering.
+            It is also used as the last token of a sequence built with special tokens.
+        pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+            The classifier token which is used when doing sequence classification (classification of the whole
+            sequence instead of per-token classification). It is the first token of the sequence when built with
+            special tokens.
+        mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+    """
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        **kwargs
+    ):
+        super().__init__(
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            **kwargs,
+        )
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'.".format(vocab_file)
+            )
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+        self.tokenizer = Atomwise_Tokenizer()
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+    def get_vocab(self):
+        return dict(self.vocab, **self.added_tokens_encoder)
+    def _tokenize(self, text):
+        return self.tokenizer.tokenize(text)
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str) in an id using the vocab. """
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        out_string = " ".join(tokens).replace(" ##", "").strip()
+        return out_string
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A BERT sequence has the following format:
+        - single sequence: ``[CLS] X [SEP]``
+        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` method.
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Set to True if the token list is already formatted with special tokens for the model
+        Returns:
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formated with special tokens for the model."
+                )
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A BERT sequence pair mask has the following format:
+        ::
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence    | second sequence |
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of ids.
+            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            sequence(s).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+    def save_vocabulary(self, vocab_path):
+        """
+        Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
+        Args:
+            vocab_path (:obj:`str`):
+                The directory in which to save the vocabulary.
+        Returns:
+            :obj:`Tuple(str)`: Paths to the files saved.
+        """
+        index = 0
+        vocab_file = vocab_path
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    index = token_index
+                writer.write(token + "\n")
+                index += 1
+        return (vocab_file,)

a2d2_pep/pep_utils/analyzer.py ADDED Viewed

	@@ -0,0 +1,1274 @@

+import os
+import re
+import pandas as pd
+from io import StringIO
+import rdkit
+from rdkit import Chem
+from rdkit.Chem import AllChem, Draw
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from io import BytesIO
+import tempfile
+from rdkit import Chem
+class PeptideAnalyzer:
+    def __init__(self, min_peptide_bonds=2, enforce_min_peptide_bonds=True):
+        # length cutoff: minimum number of backbone residues (N-Cα-C(=O) units)
+        self.min_peptide_bonds = min_peptide_bonds
+        self.enforce_min_peptide_bonds = enforce_min_peptide_bonds
+        self.bond_patterns = [
+            (r'OC\(=O\)', 'ester'),  # Ester bond
+            (r'N\(C\)C\(=O\)', 'n_methyl'),  # N-methylated peptide bond
+            (r'N[0-9]C\(=O\)', 'proline'),  # Proline peptide bond
+            (r'NC\(=O\)', 'peptide'),  # Standard peptide bond
+            (r'C\(=O\)N\(C\)', 'n_methyl_reverse'),  # Reverse N-methylated
+            (r'C\(=O\)N[12]?', 'peptide_reverse')  # Reverse peptide bond
+        ]
+        # Three to one letter code mapping
+        self.three_to_one = {
+            'Ala': 'A', 'Cys': 'C', 'Asp': 'D', 'Glu': 'E',
+            'Phe': 'F', 'Gly': 'G', 'His': 'H', 'Ile': 'I',
+            'Lys': 'K', 'Leu': 'L', 'Met': 'M', 'Asn': 'N',
+            'Pro': 'P', 'Gln': 'Q', 'Arg': 'R', 'Ser': 'S',
+            'Thr': 'T', 'Val': 'V', 'Trp': 'W', 'Tyr': 'Y'
+        }
+    def count_peptide_bonds(self, smiles):
+        """Count backbone peptide residues via N-Cα-C(=O) units.
+        Matches the backbone pattern [NX3][CX4][CX3](=O): an amide nitrogen
+        bonded to an sp3 alpha-carbon bonded to a carbonyl. Requiring the sp3
+        Cα excludes non-backbone amides — ureas/biurets (N-C(=O)-N, no Cα),
+        sulfonamides, and side-chain amides (Asn/Gln) — and uniquify=True
+        avoids the multiple-mapping over-count of symmetric N-methyl groups.
+        Each match corresponds to one backbone residue.
+        """
+        mol = Chem.MolFromSmiles(smiles)
+        if mol is None:
+            return 0
+        backbone_pattern = Chem.MolFromSmarts('[NX3][CX4][CX3](=O)')
+        return len(mol.GetSubstructMatches(backbone_pattern, uniquify=True))
+    def is_peptide(self, smiles):
+        """Check if the SMILES represents a peptide structure"""
+        mol = Chem.MolFromSmiles(smiles)
+        if mol is None:
+            return False
+        # Count backbone residues (N-Cα-C(=O) units). Requiring a real backbone
+        # unit rejects ureas/biurets and side-chain-only amides outright.
+        n_residues = self.count_peptide_bonds(smiles)
+        if n_residues == 0:
+            return False
+        # length cutoff: reject molecules with too few backbone residues
+        if self.enforce_min_peptide_bonds and n_residues < self.min_peptide_bonds:
+            return False
+        return True
+    def is_cyclic(self, smiles):
+        """Improved cyclic peptide detection"""
+        # Check for C-terminal carboxyl
+        if smiles.endswith('C(=O)O'):
+            return False, [], []
+        # Find all numbers used in ring closures
+        ring_numbers = re.findall(r'(?:^|[^c])[0-9](?=[A-Z@\(\)])', smiles)
+        # Find aromatic ring numbers
+        aromatic_matches = re.findall(r'c[0-9](?:ccccc|c\[nH\]c)[0-9]', smiles)
+        aromatic_cycles = []
+        for match in aromatic_matches:
+            numbers = re.findall(r'[0-9]', match)
+            aromatic_cycles.extend(numbers)
+        # Numbers that aren't part of aromatic rings are peptide cycles
+        peptide_cycles = [n for n in ring_numbers if n not in aromatic_cycles]
+        is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
+        return is_cyclic, peptide_cycles, aromatic_cycles
+    def split_on_bonds(self, smiles):
+        """Split SMILES into segments with simplified Pro handling"""
+        positions = []
+        used = set()
+        # Find Gly pattern first
+        gly_pattern = r'NCC\(=O\)'
+        for match in re.finditer(gly_pattern, smiles):
+            if not any(p in range(match.start(), match.end()) for p in used):
+                positions.append({
+                    'start': match.start(),
+                    'end': match.end(),
+                    'type': 'gly',
+                    'pattern': match.group()
+                })
+                used.update(range(match.start(), match.end()))
+        for pattern, bond_type in self.bond_patterns:
+            for match in re.finditer(pattern, smiles):
+                if not any(p in range(match.start(), match.end()) for p in used):
+                    positions.append({
+                        'start': match.start(),
+                        'end': match.end(),
+                        'type': bond_type,
+                        'pattern': match.group()
+                    })
+                    used.update(range(match.start(), match.end()))
+        # Sort by position
+        positions.sort(key=lambda x: x['start'])
+        # Create segments
+        segments = []
+        if positions:
+            # First segment
+            if positions[0]['start'] > 0:
+                segments.append({
+                    'content': smiles[0:positions[0]['start']],
+                    'bond_after': positions[0]['pattern']
+                })
+            # Process segments
+            for i in range(len(positions)-1):
+                current = positions[i]
+                next_pos = positions[i+1]
+                if current['type'] == 'gly':
+                    segments.append({
+                        'content': 'NCC(=O)',
+                        'bond_before': positions[i-1]['pattern'] if i > 0 else None,
+                        'bond_after': next_pos['pattern']
+                    })
+                else:
+                    content = smiles[current['end']:next_pos['start']]
+                    if content:
+                        segments.append({
+                            'content': content,
+                            'bond_before': current['pattern'],
+                            'bond_after': next_pos['pattern']
+                        })
+            # Last segment
+            if positions[-1]['end'] < len(smiles):
+                segments.append({
+                    'content': smiles[positions[-1]['end']:],
+                    'bond_before': positions[-1]['pattern']
+                })
+        return segments
+    def clean_terminal_carboxyl(self, segment):
+        """Remove C-terminal carboxyl only if it's the true terminus"""
+        content = segment['content']
+        # Only clean if:
+        # 1. Contains C(=O)O
+        # 2. No bond_after exists (meaning it's the last segment)
+        # 3. C(=O)O is at the end of the content
+        if 'C(=O)O' in content and not segment.get('bond_after'):
+            print('recognized?')
+            # Remove C(=O)O pattern regardless of position
+            cleaned = re.sub(r'\(C\(=O\)O\)', '', content)
+            # Remove any leftover empty parentheses
+            cleaned = re.sub(r'\(\)', '', cleaned)
+            print(cleaned)
+            return cleaned
+        return content
+    def identify_residue(self, segment):
+        """Identify residue with Pro reconstruction"""
+        # Only clean terminal carboxyl if this is the last segment
+        content = self.clean_terminal_carboxyl(segment)
+        mods = self.get_modifications(segment)
+        # UAA pattern matching section - before regular residues
+        # Phenylglycine and derivatives
+        if 'c1ccccc1' in content:
+            if '[C@@H](c1ccccc1)' in content or '[C@H](c1ccccc1)' in content:
+                return '4', mods  # Base phenylglycine
+        # 4-substituted phenylalanines
+        if 'Cc1ccc' in content:
+            if 'OMe' in content or 'OCc1ccc' in content:
+                return '0A1', mods  # 4-methoxy-Phenylalanine
+            elif 'Clc1ccc' in content:
+                return '200', mods  # 4-chloro-Phenylalanine
+            elif 'Brc1ccc' in content:
+                return '4BF', mods  # 4-Bromo-phenylalanine
+            elif 'C#Nc1ccc' in content:
+                return '4CF', mods  # 4-cyano-phenylalanine
+            elif 'Ic1ccc' in content:
+                return 'PHI', mods  # 4-Iodo-phenylalanine
+            elif 'Fc1ccc' in content:
+                return 'PFF', mods  # 4-Fluoro-phenylalanine
+        # Modified tryptophans
+        if 'c[nH]c2' in content:
+            if 'Oc2cccc2' in content:
+                return '0AF', mods  # 7-hydroxy-tryptophan
+            elif 'Fc2cccc2' in content:
+                return '4FW', mods  # 4-fluoro-tryptophan
+            elif 'Clc2cccc2' in content:
+                return '6CW', mods  # 6-chloro-tryptophan
+            elif 'Brc2cccc2' in content:
+                return 'BTR', mods  # 6-bromo-tryptophan
+            elif 'COc2cccc2' in content:
+                return 'MOT5', mods  # 5-Methoxy-tryptophan
+            elif 'Cc2cccc2' in content:
+                return 'MTR5', mods  # 5-Methyl-tryptophan
+        # Special amino acids
+        if 'CC(C)(C)[C@@H]' in content or 'CC(C)(C)[C@H]' in content:
+            return 'BUG', mods  # Tertleucine
+        if 'CCCNC(=N)N' in content:
+            return 'CIR', mods  # Citrulline
+        if '[SeH]' in content:
+            return 'CSE', mods  # Selenocysteine
+        if '[NH3]CC[C@@H]' in content or '[NH3]CC[C@H]' in content:
+            return 'DAB', mods  # Diaminobutyric acid
+        if 'C1CCCCC1' in content:
+            if 'C1CCCCC1[C@@H]' in content or 'C1CCCCC1[C@H]' in content:
+                return 'CHG', mods  # Cyclohexylglycine
+            elif 'C1CCCCC1C[C@@H]' in content or 'C1CCCCC1C[C@H]' in content:
+                return 'ALC', mods  # 3-cyclohexyl-alanine
+        # Naphthalene derivatives
+        if 'c1cccc2c1cccc2' in content:
+            if 'c1cccc2c1cccc2[C@@H]' in content or 'c1cccc2c1cccc2[C@H]' in content:
+                return 'NAL', mods  # 2-Naphthyl-alanine
+        # Heteroaromatic derivatives
+        if 'c1cncc' in content:
+            return 'PYR4', mods  # 3-(4-Pyridyl)-alanine
+        if 'c1cscc' in content:
+            return 'THA3', mods  # 3-(3-thienyl)-alanine
+        if 'c1nnc' in content:
+            return 'TRZ4', mods  # 3-(1,2,4-Triazol-1-yl)-alanine
+        # Modified serines and threonines
+        if 'OP(O)(O)O' in content:
+            if '[C@@H](COP' in content or '[C@H](COP' in content:
+                return 'SEP', mods  # phosphoserine
+            elif '[C@@H](OP' in content or '[C@H](OP' in content:
+                return 'TPO', mods  # phosphothreonine
+        # Specialized ring systems
+        if 'c1c2ccccc2cc2c1cccc2' in content:
+            return 'ANTH', mods  # 3-(9-anthryl)-alanine
+        if 'c1csc2c1cccc2' in content:
+            return 'BTH3', mods  # 3-(3-benzothienyl)-alanine
+        if '[C@]12C[C@H]3C[C@@H](C2)C[C@@H](C1)C3' in content:
+            return 'ADAM', mods  # Adamanthane
+        # Fluorinated derivatives
+        if 'FC(F)(F)' in content:
+            if 'CC(F)(F)F' in content:
+                return 'FLA', mods  # Trifluoro-alanine
+            if 'C(F)(F)F)c1' in content:
+                if 'c1ccccc1C(F)(F)F' in content:
+                    return 'TFG2', mods  # 2-(Trifluoromethyl)-phenylglycine
+                if 'c1cccc(c1)C(F)(F)F' in content:
+                    return 'TFG3', mods  # 3-(Trifluoromethyl)-phenylglycine
+                if 'c1ccc(cc1)C(F)(F)F' in content:
+                    return 'TFG4', mods  # 4-(Trifluoromethyl)-phenylglycine
+        # Multiple halogen patterns
+        if 'F' in content and 'c1' in content:
+            if 'c1ccc(c(c1)F)F' in content:
+                return 'F2F', mods  # 3,4-Difluoro-phenylalanine
+            if 'cc(F)cc(c1)F' in content:
+                return 'WFP', mods  # 3,5-Difluoro-phenylalanine
+        if 'Cl' in content and 'c1' in content:
+            if 'c1ccc(cc1Cl)Cl' in content:
+                return 'CP24', mods  # 2,4-dichloro-phenylalanine
+            if 'c1ccc(c(c1)Cl)Cl' in content:
+                return 'CP34', mods  # 3,4-dichloro-phenylalanine
+        # Hydroxy and amino derivatives
+        if 'O' in content and 'c1' in content:
+            if 'c1cc(O)cc(c1)O' in content:
+                return '3FG', mods  # (2s)-amino(3,5-dihydroxyphenyl)-ethanoic acid
+            if 'c1ccc(c(c1)O)O' in content:
+                return 'DAH', mods  # 3,4-Dihydroxy-phenylalanine
+        # Cyclic amino acids
+        if 'C1CCCC1' in content:
+            return 'CPA3', mods  # 3-Cyclopentyl-alanine
+        if 'C1CCCCC1' in content:
+            if 'CC1CCCCC1' in content:
+                return 'ALC', mods  # 3-cyclohexyl-alanine
+            else:
+                return 'CHG', mods  # Cyclohexylglycine
+        # Chain-length variants
+        if 'CCC[C@@H]' in content or 'CCC[C@H]' in content:
+            return 'NLE', mods  # Norleucine
+        if 'CC[C@@H]' in content or 'CC[C@H]' in content:
+            if not any(x in content for x in ['CC(C)', 'COC', 'CN(']):
+                return 'ABA', mods  # 2-Aminobutyric acid
+        # Modified histidines
+        if 'c1cnc' in content:
+            if '[C@@H]1CN[C@@H](N1)F' in content:
+                return '2HF', mods  # 2-fluoro-l-histidine
+            if 'c1cnc([nH]1)F' in content:
+                return '2HF1', mods  # 2-fluoro-l-histidine variant
+            if 'c1c[nH]c(n1)F' in content:
+                return '2HF2', mods  # 2-fluoro-l-histidine variant
+        # Sulfur and selenium containing
+        if '[SeH]' in content:
+            return 'CSE', mods  # Selenocysteine
+        if 'S' in content:
+            if 'CSCc1ccccc1' in content:
+                return 'BCS', mods  # benzylcysteine
+            if 'CCSC' in content:
+                return 'ESC', mods  # Ethionine
+            if 'CCS' in content:
+                return 'HCS', mods  # homocysteine
+        # Additional modifications
+        if 'CN=[N]=N' in content:
+            return 'AZDA', mods  # azido-alanine
+        if '[NH]=[C](=[NH2])=[NH2]' in content:
+            if 'CCC[NH]=' in content:
+                return 'AGM', mods  # 5-methyl-arginine
+            if 'CC[NH]=' in content:
+                return 'GDPR', mods  # 2-Amino-3-guanidinopropionic acid
+        if 'CCON' in content:
+            return 'CAN', mods  # canaline
+        if '[C@@H]1C=C[C@@H](C=C1)' in content:
+            return 'ACZ', mods  # cis-amiclenomycin
+        if 'CCC(=O)[NH3]' in content:
+            return 'ONL', mods  # 5-oxo-l-norleucine
+        if 'c1ccncc1' in content:
+            return 'PYR4', mods  # 3-(4-Pyridyl)-alanine
+        if 'c1ccco1' in content:
+            return 'FUA2', mods  # (2-furyl)-alanine
+        if 'c1ccc' in content:
+            if 'c1ccc(cc1)c1ccccc1' in content:
+                return 'BIF', mods  # 4,4-biphenylalanine
+            if 'c1ccc(cc1)C(=O)c1ccccc1' in content:
+                return 'PBF', mods  # 4-benzoyl-phenylalanine
+            if 'c1ccc(cc1)C(C)(C)C' in content:
+                return 'TBP4', mods  # 4-tert-butyl-phenylalanine
+            if 'c1ccc(cc1)[C](=[NH2])=[NH2]' in content:
+                return '0BN', mods  # 4-carbamimidoyl-l-phenylalanine
+            if 'c1cccc(c1)[C](=[NH2])=[NH2]' in content:
+                return 'APM', mods  # m-amidinophenyl-3-alanine
+        # Multiple hydroxy patterns
+        if 'O' in content:
+            if '[C@H]([C@H](C)O)O' in content:
+                return 'ILX', mods  # 4,5-dihydroxy-isoleucine
+            if '[C@H]([C@@H](C)O)O' in content:
+                return 'ALO', mods  # Allo-threonine
+            if '[C@H](COP(O)(O)O)' in content:
+                return 'SEP', mods  # phosphoserine
+            if '[C@H]([C@@H](C)OP(O)(O)O)' in content:
+                return 'TPO', mods  # phosphothreonine
+            if '[C@H](c1ccc(O)cc1)O' in content:
+                return 'OMX', mods  # (betar)-beta-hydroxy-l-tyrosine
+            if '[C@H](c1ccc(c(Cl)c1)O)O' in content:
+                return 'OMY', mods  # (betar)-3-chloro-beta-hydroxy-l-tyrosine
+        # Heterocyclic patterns
+        if 'n1' in content:
+            if 'n1cccn1' in content:
+                return 'PYZ1', mods  # 3-(1-Pyrazolyl)-alanine
+            if 'n1nncn1' in content:
+                return 'TEZA', mods  # 3-(2-Tetrazolyl)-alanine
+            if 'c2c(n1)cccc2' in content:
+                return 'QU32', mods  # 3-(2-Quinolyl)-alanine
+            if 'c1cnc2c(c1)cccc2' in content:
+                return 'QU33', mods  # 3-(3-quinolyl)-alanine
+            if 'c1ccnc2c1cccc2' in content:
+                return 'QU34', mods  # 3-(4-quinolyl)-alanine
+            if 'c1ccc2c(c1)nccc2' in content:
+                return 'QU35', mods  # 3-(5-Quinolyl)-alanine
+            if 'c1ccc2c(c1)cncc2' in content:
+                return 'QU36', mods  # 3-(6-Quinolyl)-alanine
+            if 'c1cnc2c(n1)cccc2' in content:
+                return 'QX32', mods  # 3-(2-quinoxalyl)-alanine
+        # Multiple nitrogen patterns
+        if 'N' in content:
+            if '[NH3]CC[C@@H]' in content:
+                return 'DAB', mods  # Diaminobutyric acid
+            if '[NH3]C[C@@H]' in content:
+                return 'DPP', mods  # 2,3-Diaminopropanoic acid
+            if '[NH3]CCCCCC[C@@H]' in content:
+                return 'HHK', mods  # (2s)-2,8-diaminooctanoic acid
+            if 'CCC[NH]=[C](=[NH2])=[NH2]' in content:
+                return 'GBUT', mods  # 2-Amino-4-guanidinobutryric acid
+            if '[NH]=[C](=S)=[NH2]' in content:
+                return 'THIC', mods  # Thio-citrulline
+        # Chain modified amino acids
+        if 'CC' in content:
+            if 'CCCC[C@@H]' in content:
+                return 'AHP', mods  # 2-Aminoheptanoic acid
+            if 'CCC([C@@H])(C)C' in content:
+                return 'I2M', mods  # 3-methyl-l-alloisoleucine
+            if 'CC[C@H]([C@@H])C' in content:
+                return 'IIL', mods  # Allo-Isoleucine
+            if '[C@H](CCC(C)C)' in content:
+                return 'HLEU', mods  # Homoleucine
+            if '[C@@H]([C@@H](C)O)C' in content:
+                return 'HLU', mods  # beta-hydroxyleucine
+        # Modified glutamate/aspartate patterns
+        if '[C@@H]' in content:
+            if '[C@@H](C[C@@H](F))' in content:
+                return 'FGA4', mods  # 4-Fluoro-glutamic acid
+            if '[C@@H](C[C@@H](O))' in content:
+                return '3GL', mods  # 4-hydroxy-glutamic-acid
+            if '[C@@H](C[C@H](C))' in content:
+                return 'LME', mods  # (3r)-3-methyl-l-glutamic acid
+            if '[C@@H](CC[C@H](C))' in content:
+                return 'MEG', mods  # (3s)-3-methyl-l-glutamic acid
+        # Sulfur and selenium modifications
+        if 'S' in content:
+            if 'SCC[C@@H]' in content:
+                return 'HSER', mods  # homoserine
+            if 'SCCN' in content:
+                return 'SLZ', mods  # thialysine
+            if 'SC(=O)' in content:
+                return 'CSA', mods  # s-acetonylcysteine
+            if '[S@@](=O)' in content:
+                return 'SME', mods  # Methionine sulfoxide
+            if 'S(=O)(=O)' in content:
+                return 'OMT', mods  # Methionine sulfone
+        # Double bond containing
+        if 'C=' in content:
+            if 'C=C[C@@H]' in content:
+                return '2AG', mods  # 2-Allyl-glycine
+            if 'C=C[C@@H]' in content:
+                return 'LVG', mods  # vinylglycine
+            if 'C=Cc1ccccc1' in content:
+                return 'STYA', mods  # Styrylalanine
+        # Special cases
+        if '[C@@H]1Cc2c(C1)cccc2' in content:
+            return 'IGL', mods  # alpha-amino-2-indanacetic acid
+        if '[C](=[C](=O)=O)=O' in content:
+            return '26P', mods  # 2-amino-6-oxopimelic acid
+        if '[C](=[C](=O)=O)=C' in content:
+            return '2NP', mods  # l-2-amino-6-methylene-pimelic acid
+        if 'c2cnc[nH]2' in content:
+            return 'HIS', mods  # histidine core
+        if 'c1cccc2c1cc(O)cc2' in content:
+            return 'NAO1', mods  # 5-hydroxy-1-naphthalene
+        if 'c1ccc2c(c1)cc(O)cc2' in content:
+            return 'NAO2', mods  # 6-hydroxy-2-naphthalene
+        # Proline (P) - flexible ring numbers
+        if any([
+            # Check for any ring number in bond patterns
+            (segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
+            any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
+            for n in '123456789'
+        ]) or any([
+            # Check ending patterns with any ring number
+            (f'CCCN{n}' in content and content.endswith('=O') and
+            any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
+            for n in '123456789'
+        ]) or any([
+            # Handle CCC[C@H]n patterns
+            (content == f'CCC[C@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
+            (content == f'CCC[C@@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
+            # N-terminal Pro with any ring number
+            (f'N{n}CCC[C@H]{n}' in content) or
+            (f'N{n}CCC[C@@H]{n}' in content)
+            for n in '123456789'
+        ]):
+            return 'Pro', mods
+        # Tryptophan (W) - more specific indole pattern
+        if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
+        'c[nH]c' in content.replace(' ', ''):
+            return 'Trp', mods
+        # Lysine (K) - both patterns
+        if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
+            return 'Lys', mods
+        # Arginine (R) - both patterns
+        if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
+            return 'Arg', mods
+        if ('C[C@H](CCCC)' in content or 'C[C@@H](CCCC)' in content) and 'CC(C)' not in content:
+            return 'Nle', mods
+        # Ornithine (Orn) - 3-carbon chain with NH2
+        if ('C[C@H](CCCN)' in content or 'C[C@@H](CCCN)' in content) and 'CC(C)' not in content:
+            return 'Orn', mods
+        # 2-Naphthylalanine (2Nal) - distinct from Phe pattern
+        if ('Cc3cc2ccccc2c3' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
+            return '2Nal', mods
+        # Cyclohexylalanine (Cha) - already in your code but moved here for clarity
+        if 'N2CCCCC2' in content or 'CCCCC2' in content:
+            return 'Cha', mods
+        # Aminobutyric acid (Abu) - 2-carbon chain
+        if ('C[C@H](CC)' in content or 'C[C@@H](CC)' in content) and not any(p in content for p in ['CC(C)', 'CCCC', 'CCC(C)']):
+            return 'Abu', mods
+        # Pipecolic acid (Pip) - 6-membered ring like Pro
+        if ('N3CCCCC3' in content or 'CCCCC3' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
+            return 'Pip', mods
+        # Cyclohexylglycine (Chg) - direct cyclohexyl without CH2
+        if ('C[C@H](C1CCCCC1)' in content or 'C[C@@H](C1CCCCC1)' in content):
+            return 'Chg', mods
+        # 4-Fluorophenylalanine (4F-Phe)
+        if ('Cc2ccc(F)cc2' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
+            return '4F-Phe', mods
+        # Regular residue identification
+        if ('NCC(=O)' in content) or (content == 'C'):
+            # Middle case - between bonds
+            if segment.get('bond_before') and segment.get('bond_after'):
+                if ('C(=O)N' in segment['bond_before'] or 'C(=O)N(C)' in segment['bond_before']):
+                    return 'Gly', mods
+            # Terminal case - at the end
+            elif segment.get('bond_before') and segment.get('bond_before').startswith('C(=O)N'):
+                return 'Gly', mods
+        if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content:
+            return 'Leu', mods
+        if '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content:
+            return 'Leu', mods
+        if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content:
+            return 'Thr', mods
+        if '[C@H](Cc2ccccc2)' in content or '[C@@H](Cc2ccccc2)' in content:
+            return 'Phe', mods
+        if ('[C@H](C(C)C)' in content or       # With outer parentheses
+            '[C@@H](C(C)C)' in content or      # With outer parentheses
+            '[C@H]C(C)C' in content or         # Without outer parentheses
+            '[C@@H]C(C)C' in content):         # Without outer parentheses
+            if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]']):  # Still check not Leu
+                return 'Val', mods
+        if '[C@H](COC(C)(C)C)' in content or '[C@@H](COC(C)(C)C)' in content:
+            return 'O-tBu', mods
+        if any([
+            'CC[C@H](C)' in content,
+            'CC[C@@H](C)' in content,
+            'C(C)C[C@H]' in content and 'CC(C)C' not in content,
+            'C(C)C[C@@H]' in content and 'CC(C)C' not in content
+        ]):
+            return 'Ile', mods
+        if ('[C@H](C)' in content or '[C@@H](C)' in content):
+            if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
+                return 'Ala', mods
+        # Tyrosine (Tyr) - 4-hydroxybenzyl side chain
+        if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
+            return 'Tyr', mods
+        # Serine (Ser) - Hydroxymethyl side chain
+        if '[C@H](CO)' in content or '[C@@H](CO)' in content:
+            if not ('C(C)O' in content or 'COC' in content):
+                return 'Ser', mods
+        # Threonine (Thr) - 1-hydroxyethyl side chain
+        if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H](C)O' in content or '[C@H](C)O' in content:
+            return 'Thr', mods
+        # Cysteine (Cys) - Thiol side chain
+        if '[C@H](CS)' in content or '[C@@H](CS)' in content:
+            return 'Cys', mods
+        # Methionine (Met) - Methylthioethyl side chain
+        if ('C[C@H](CCSC)' in content or 'C[C@@H](CCSC)' in content):
+            return 'Met', mods
+        # Asparagine (Asn) - Carbamoylmethyl side chain
+        if ('CC(=O)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
+            return 'Asn', mods
+        # Glutamine (Gln) - Carbamoylethyl side chain
+        if ('CCC(=O)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
+            return 'Gln', mods
+        # Aspartic acid (Asp) - Carboxymethyl side chain
+        if ('CC(=O)O' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
+            return 'Asp', mods
+        # Glutamic acid (Glu) - Carboxyethyl side chain
+        if ('CCC(=O)O' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
+            return 'Glu', mods
+        # Arginine (Arg) - 3-guanidinopropyl side chain
+        if ('CCCNC(=N)N' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
+            return 'Arg', mods
+        # Histidine (His) - Imidazole side chain
+        if ('Cc2cnc[nH]2' in content) and ('C[C@H]' in content or 'C[C@@H]' in content):
+            return 'His', mods
+        return None, mods
+    def get_modifications(self, segment):
+        """Get modifications based on bond types"""
+        mods = []
+        if segment.get('bond_after'):
+            if 'N(C)' in segment['bond_after'] or segment['bond_after'].startswith('C(=O)N(C)'):
+                mods.append('N-Me')
+            if 'OC(=O)' in segment['bond_after']:
+                mods.append('O-linked')
+        return mods
+    def analyze_structure(self, smiles):
+        """Main analysis function with debug output"""
+        print("\nAnalyzing structure:", smiles)
+        # Split into segments
+        segments = self.split_on_bonds(smiles)
+        print("\nSegment Analysis:")
+        sequence = []
+        for i, segment in enumerate(segments):
+            print(f"\nSegment {i}:")
+            print(f"Content: {segment['content']}")
+            print(f"Bond before: {segment.get('bond_before', 'None')}")
+            print(f"Bond after: {segment.get('bond_after', 'None')}")
+            residue, mods = self.identify_residue(segment)
+            if residue:
+                if mods:
+                    sequence.append(f"{residue}({','.join(mods)})")
+                else:
+                    sequence.append(residue)
+                print(f"Identified as: {residue}")
+                print(f"Modifications: {mods}")
+            else:
+                print(f"Warning: Could not identify residue in segment: {segment['content']}")
+        # Check if cyclic
+        is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
+        three_letter = '-'.join(sequence)
+        one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
+        if is_cyclic:
+            three_letter = f"cyclo({three_letter})"
+            one_letter = f"cyclo({one_letter})"
+        print(f"\nFinal sequence: {three_letter}")
+        print(f"One-letter code: {one_letter}")
+        print(f"Is cyclic: {is_cyclic}")
+        #print(f"Peptide cycles: {peptide_cycles}")
+        #print(f"Aromatic cycles: {aromatic_cycles}")
+        return three_letter, len(segments)
+        """return {
+            'three_letter': three_letter,
+            #'one_letter': one_letter,
+            'is_cyclic': is_cyclic
+        }"""
+    def return_sequence(self, smiles):
+        """Main analysis function with debug output"""
+        print("\nAnalyzing structure:", smiles)
+        # Split into segments
+        segments = self.split_on_bonds(smiles)
+        print("\nSegment Analysis:")
+        sequence = []
+        for i, segment in enumerate(segments):
+            print(f"\nSegment {i}:")
+            print(f"Content: {segment['content']}")
+            print(f"Bond before: {segment.get('bond_before', 'None')}")
+            print(f"Bond after: {segment.get('bond_after', 'None')}")
+            residue, mods = self.identify_residue(segment)
+            if residue:
+                if mods:
+                    sequence.append(f"{residue}({','.join(mods)})")
+                else:
+                    sequence.append(residue)
+                print(f"Identified as: {residue}")
+                print(f"Modifications: {mods}")
+            else:
+                print(f"Warning: Could not identify residue in segment: {segment['content']}")
+        return sequence
+"""
+def annotate_cyclic_structure(mol, sequence):
+    '''Create annotated 2D structure with clear, non-overlapping residue labels'''
+    # Generate 2D coordinates
+    # Generate 2D coordinates
+    AllChem.Compute2DCoords(mol)
+    # Create drawer with larger size for annotations
+    drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000)  # Even larger size
+    # Get residue list and reverse it to match structural representation
+    if sequence.startswith('cyclo('):
+        residues = sequence[6:-1].split('-')
+    else:
+        residues = sequence.split('-')
+    residues = list(reversed(residues))  # Reverse the sequence
+    # Draw molecule first to get its bounds
+    drawer.drawOptions().addAtomIndices = False
+    drawer.DrawMolecule(mol)
+    drawer.FinishDrawing()
+    # Convert to PIL Image
+    img = Image.open(BytesIO(drawer.GetDrawingText()))
+    draw = ImageDraw.Draw(img)
+    try:
+        # Try to use DejaVuSans as it's commonly available on Linux systems
+        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 60)
+        small_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 60)
+    except OSError:
+        try:
+            # Fallback to Arial if available (common on Windows)
+            font = ImageFont.truetype("arial.ttf", 60)
+            small_font = ImageFont.truetype("arial.ttf", 60)
+        except OSError:
+            # If no TrueType fonts are available, fall back to default
+            print("Warning: TrueType fonts not available, using default font")
+            font = ImageFont.load_default()
+            small_font = ImageFont.load_default()
+    # Get molecule bounds
+    conf = mol.GetConformer()
+    positions = []
+    for i in range(mol.GetNumAtoms()):
+        pos = conf.GetAtomPosition(i)
+        positions.append((pos.x, pos.y))
+    x_coords = [p[0] for p in positions]
+    y_coords = [p[1] for p in positions]
+    min_x, max_x = min(x_coords), max(x_coords)
+    min_y, max_y = min(y_coords), max(y_coords)
+    # Calculate scaling factors
+    scale = 150  # Increased scale factor
+    center_x = 1000  # Image center
+    center_y = 1000
+    # Add residue labels in a circular arrangement around the structure
+    n_residues = len(residues)
+    radius = 700  # Distance of labels from center
+    # Start from the rightmost point (3 o'clock position) and go counterclockwise
+    # Offset by -3 positions to align with structure
+    offset = 0  # Adjust this value to match the structure alignment
+    for i, residue in enumerate(residues):
+        # Calculate position in a circle around the structure
+        # Start from 0 (3 o'clock) and go counterclockwise
+        angle = -(2 * np.pi * ((i + offset) % n_residues) / n_residues)
+        # Calculate label position
+        label_x = center_x + radius * np.cos(angle)
+        label_y = center_y + radius * np.sin(angle)
+        # Draw residue label
+        text = f"{i+1}. {residue}"
+        bbox = draw.textbbox((label_x, label_y), text, font=font)
+        padding = 10
+        draw.rectangle([bbox[0]-padding, bbox[1]-padding,
+                       bbox[2]+padding, bbox[3]+padding],
+                      fill='white', outline='white')
+        draw.text((label_x, label_y), text,
+                 font=font, fill='black', anchor="mm")
+    # Add sequence at the top with white background
+    seq_text = f"Sequence: {sequence}"
+    bbox = draw.textbbox((center_x, 100), seq_text, font=small_font)
+    padding = 10
+    draw.rectangle([bbox[0]-padding, bbox[1]-padding,
+                   bbox[2]+padding, bbox[3]+padding],
+                  fill='white', outline='white')
+    draw.text((center_x, 100), seq_text,
+             font=small_font, fill='black', anchor="mm")
+    return img
+"""
+def annotate_cyclic_structure(mol, sequence):
+    """Create structure visualization with just the sequence header"""
+    # Generate 2D coordinates
+    AllChem.Compute2DCoords(mol)
+    # Create drawer with larger size for annotations
+    drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000)
+    # Draw molecule first
+    drawer.drawOptions().addAtomIndices = False
+    drawer.DrawMolecule(mol)
+    drawer.FinishDrawing()
+    # Convert to PIL Image
+    img = Image.open(BytesIO(drawer.GetDrawingText()))
+    draw = ImageDraw.Draw(img)
+    try:
+        small_font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 60)
+    except OSError:
+        try:
+            small_font = ImageFont.truetype("arial.ttf", 60)
+        except OSError:
+            print("Warning: TrueType fonts not available, using default font")
+            small_font = ImageFont.load_default()
+    # Add just the sequence header at the top
+    seq_text = f"Sequence: {sequence}"
+    bbox = draw.textbbox((1000, 100), seq_text, font=small_font)
+    padding = 10
+    draw.rectangle([bbox[0]-padding, bbox[1]-padding,
+                   bbox[2]+padding, bbox[3]+padding],
+                  fill='white', outline='white')
+    draw.text((1000, 100), seq_text,
+             font=small_font, fill='black', anchor="mm")
+    return img
+def create_enhanced_linear_viz(sequence, smiles):
+    """Build a two-panel matplotlib figure describing a peptide linearly.
+    The top panel draws one box per residue parsed from ``sequence`` with the
+    connecting bonds between them; the bottom panel lists every segment that
+    ``PeptideAnalyzer.split_on_bonds(smiles)`` returns.
+    Args:
+        sequence: Dash-separated residue names, optionally wrapped in ``cyclo(...)``.
+        smiles: SMILES string passed to the analyzer for segmentation.
+    Returns:
+        The matplotlib ``Figure``; the caller is responsible for closing it.
+    """
+    analyzer = PeptideAnalyzer()  # Create analyzer instance
+    # Create figure with two subplots (detail panel is twice as tall)
+    fig = plt.figure(figsize=(15, 10))
+    gs = fig.add_gridspec(2, 1, height_ratios=[1, 2])
+    ax_struct = fig.add_subplot(gs[0])
+    ax_detail = fig.add_subplot(gs[1])
+    # Parse sequence and get residues; strip the 'cyclo(' prefix and ')' suffix
+    if sequence.startswith('cyclo('):
+        residues = sequence[6:-1].split('-')
+    else:
+        residues = sequence.split('-')
+    # Get segments using analyzer
+    # (segments are read below via .get('bond_after') / .get('content'),
+    # so they are presumably dicts -- verify against PeptideAnalyzer)
+    segments = analyzer.split_on_bonds(smiles)
+    # Debug print
+    print(f"Number of residues: {len(residues)}")
+    print(f"Number of segments: {len(segments)}")
+    # Top subplot - Basic structure
+    ax_struct.set_xlim(0, 10)
+    ax_struct.set_ylim(0, 2)
+    num_residues = len(residues)
+    # Spread boxes from x=0.5 to x=9.5; the guard avoids dividing by zero
+    # for a single residue
+    spacing = 9.0 / (num_residues - 1) if num_residues > 1 else 9.0
+    # Draw basic structure
+    y_pos = 1.5
+    for i in range(num_residues):
+        x_pos = 0.5 + i * spacing
+        # Draw amino acid box
+        rect = patches.Rectangle((x_pos-0.3, y_pos-0.2), 0.6, 0.4,
+                               facecolor='lightblue', edgecolor='black')
+        ax_struct.add_patch(rect)
+        # Draw connecting bonds if not the last residue
+        if i < num_residues - 1:
+            # NOTE(review): indexes segments by residue position, which assumes
+            # segments and residues line up one-to-one -- confirm.
+            segment = segments[i] if i < len(segments) else None
+            if segment:
+                # Determine bond type from segment info
+                bond_type = 'ester' if 'O-linked' in segment.get('bond_after', '') else 'peptide'
+                is_n_methylated = 'N-Me' in segment.get('bond_after', '')
+                bond_color = 'red' if bond_type == 'ester' else 'black'
+                linestyle = '--' if bond_type == 'ester' else '-'
+                # Draw bond line between the right edge of this box and the
+                # left edge of the next one
+                ax_struct.plot([x_pos+0.3, x_pos+spacing-0.3], [y_pos, y_pos],
+                             color=bond_color, linestyle=linestyle, linewidth=2)
+                # Add bond type label
+                mid_x = x_pos + spacing/2
+                bond_label = f"{bond_type}"
+                if is_n_methylated:
+                    bond_label += "\n(N-Me)"
+                ax_struct.text(mid_x, y_pos+0.1, bond_label,
+                             ha='center', va='bottom', fontsize=10,
+                             color=bond_color)
+        # Add residue label below the box
+        ax_struct.text(x_pos, y_pos-0.5, residues[i],
+                      ha='center', va='top', fontsize=14)
+    # Bottom subplot - Detailed breakdown (one text row per segment)
+    ax_detail.set_ylim(0, len(segments)+1)
+    ax_detail.set_xlim(0, 1)
+    # Create detailed breakdown
+    segment_y = len(segments)  # Start from top
+    for i, segment in enumerate(segments):
+        y = segment_y - i
+        # Check if this is a bond or residue
+        residue, mods = analyzer.identify_residue(segment)
+        if residue:
+            text = f"Residue {i+1}: {residue}"
+            if mods:
+                text += f" ({', '.join(mods)})"
+            color = 'blue'
+        else:
+            # No residue identified: the segment is labelled as a bond
+            text = f"Bond {i}: "
+            if 'O-linked' in segment.get('bond_after', ''):
+                text += "ester"
+            elif 'N-Me' in segment.get('bond_after', ''):
+                text += "peptide (N-methylated)"
+            else:
+                text += "peptide"
+            color = 'red'
+        # Add segment analysis
+        ax_detail.text(0.05, y, text, fontsize=12, color=color)
+        ax_detail.text(0.5, y, f"SMILES: {segment.get('content', '')}", fontsize=10, color='gray')
+    # If cyclic, add connection indicator spanning x=0.5 to x=9.5
+    if sequence.startswith('cyclo('):
+        ax_struct.annotate('', xy=(9.5, y_pos), xytext=(0.5, y_pos),
+                          arrowprops=dict(arrowstyle='<->', color='red', lw=2))
+        ax_struct.text(5, y_pos+0.3, 'Cyclic Connection',
+                      ha='center', color='red', fontsize=14)
+    # Add titles and adjust layout
+    ax_struct.set_title("Peptide Structure Overview", pad=20)
+    ax_detail.set_title("Segment Analysis Breakdown", pad=20)
+    # Remove axes
+    for ax in [ax_struct, ax_detail]:
+        ax.set_xticks([])
+        ax.set_yticks([])
+        ax.axis('off')
+    plt.tight_layout()
+    return fig
+class PeptideStructureGenerator:
+    """A class to generate 3D structures of peptides using different embedding methods"""
+    @staticmethod
+    def prepare_molecule(smiles):
+        """Prepare molecule with proper hydrogen handling"""
+        mol = Chem.MolFromSmiles(smiles, sanitize=False)
+        if mol is None:
+            raise ValueError("Failed to create molecule from SMILES")
+        # Calculate valence for each atom
+        for atom in mol.GetAtoms():
+            atom.UpdatePropertyCache(strict=False)
+        # Sanitize with reduced requirements
+        Chem.SanitizeMol(mol,
+                        sanitizeOps=Chem.SANITIZE_FINDRADICALS|
+                                  Chem.SANITIZE_KEKULIZE|
+                                  Chem.SANITIZE_SETAROMATICITY|
+                                  Chem.SANITIZE_SETCONJUGATION|
+                                  Chem.SANITIZE_SETHYBRIDIZATION|
+                                  Chem.SANITIZE_CLEANUPCHIRALITY)
+        mol = Chem.AddHs(mol)
+        return mol
+    @staticmethod
+    def get_etkdg_params(attempt=0):
+        """Get ETKDG parameters with optional modifications based on attempt number"""
+        params = AllChem.ETKDGv3()
+        params.randomSeed = -1
+        params.maxIterations = 200
+        params.numThreads = 4  # Reduced for web interface
+        params.useBasicKnowledge = True
+        params.enforceChirality = True
+        params.useExpTorsionAnglePrefs = True
+        params.useSmallRingTorsions = True
+        params.useMacrocycleTorsions = True
+        params.ETversion = 2
+        params.pruneRmsThresh = -1
+        params.embedRmsThresh = 0.5
+        if attempt > 10:
+            params.bondLength = 1.5 + (attempt - 10) * 0.02
+            params.useExpTorsionAnglePrefs = False
+        return params
+    def generate_structure_etkdg(self, smiles, max_attempts=20):
+        """Generate 3D structure using ETKDG without UFF optimization"""
+        success = False
+        mol = None
+        for attempt in range(max_attempts):
+            try:
+                mol = self.prepare_molecule(smiles)
+                params = self.get_etkdg_params(attempt)
+                if AllChem.EmbedMolecule(mol, params) == 0:
+                    success = True
+                    break
+            except Exception as e:
+                continue
+        if not success:
+            raise ValueError("Failed to generate structure with ETKDG")
+        return mol
+    def generate_structure_uff(self, smiles, max_attempts=20):
+        """Generate 3D structure using ETKDG followed by UFF optimization"""
+        best_mol = None
+        lowest_energy = float('inf')
+        for attempt in range(max_attempts):
+            try:
+                test_mol = self.prepare_molecule(smiles)
+                params = self.get_etkdg_params(attempt)
+                if AllChem.EmbedMolecule(test_mol, params) == 0:
+                    res = AllChem.UFFOptimizeMolecule(test_mol, maxIters=2000,
+                                                     vdwThresh=10.0, confId=0,
+                                                     ignoreInterfragInteractions=True)
+                    if res == 0:
+                        ff = AllChem.UFFGetMoleculeForceField(test_mol)
+                        if ff:
+                            current_energy = ff.CalcEnergy()
+                            if current_energy < lowest_energy:
+                                lowest_energy = current_energy
+                                best_mol = Chem.Mol(test_mol)
+            except Exception:
+                continue
+        if best_mol is None:
+            raise ValueError("Failed to generate optimized structure")
+        return best_mol
+    @staticmethod
+    def mol_to_sdf_bytes(mol):
+        """Convert RDKit molecule to SDF file bytes"""
+        # First write to StringIO in text mode
+        sio = StringIO()
+        writer = Chem.SDWriter(sio)
+        writer.write(mol)
+        writer.close()
+        # Convert the string to bytes
+        return sio.getvalue().encode('utf-8')
+def process_input(smiles_input=None, file_obj=None, show_linear=False,
+                 show_segment_details=False, generate_3d=False, use_uff=False):
+    """Process input and create visualizations using PeptideAnalyzer"""
+    analyzer = PeptideAnalyzer()
+    temp_dir = tempfile.mkdtemp() if generate_3d else None
+    structure_files = []
+    # Handle direct SMILES input
+    if smiles_input:
+        smiles = smiles_input.strip()
+        # First check if it's a peptide using analyzer's method
+        if not analyzer.is_peptide(smiles):
+            return "Error: Input SMILES does not appear to be a peptide structure.", None, None
+        try:
+            # Create molecule
+            mol = Chem.MolFromSmiles(smiles)
+            if mol is None:
+                return "Error: Invalid SMILES notation.", None, None
+            # Generate 3D structures if requested
+            if generate_3d:
+                generator = PeptideStructureGenerator()
+                try:
+                    # Generate ETKDG structure
+                    mol_etkdg = generator.generate_structure_etkdg(smiles)
+                    etkdg_path = os.path.join(temp_dir, "structure_etkdg.sdf")
+                    writer = Chem.SDWriter(etkdg_path)
+                    writer.write(mol_etkdg)
+                    writer.close()
+                    structure_files.append(etkdg_path)
+                    # Generate UFF structure if requested
+                    if use_uff:
+                        mol_uff = generator.generate_structure_uff(smiles)
+                        uff_path = os.path.join(temp_dir, "structure_uff.sdf")
+                        writer = Chem.SDWriter(uff_path)
+                        writer.write(mol_uff)
+                        writer.close()
+                        structure_files.append(uff_path)
+                except Exception as e:
+                    return f"Error generating 3D structures: {str(e)}", None, None, None
+            # Use analyzer to get sequence
+            segments = analyzer.split_on_bonds(smiles)
+            # Process segments and build sequence
+            sequence_parts = []
+            output_text = ""
+            # Only include segment analysis in output if requested
+            if show_segment_details:
+                output_text += "Segment Analysis:\n"
+                for i, segment in enumerate(segments):
+                    output_text += f"\nSegment {i}:\n"
+                    output_text += f"Content: {segment['content']}\n"
+                    output_text += f"Bond before: {segment.get('bond_before', 'None')}\n"
+                    output_text += f"Bond after: {segment.get('bond_after', 'None')}\n"
+                    residue, mods = analyzer.identify_residue(segment)
+                    if residue:
+                        if mods:
+                            sequence_parts.append(f"{residue}({','.join(mods)})")
+                        else:
+                            sequence_parts.append(residue)
+                        output_text += f"Identified as: {residue}\n"
+                        output_text += f"Modifications: {mods}\n"
+                    else:
+                        output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
+                output_text += "\n"
+            else:
+                # Just build sequence without detailed analysis in output
+                for segment in segments:
+                    residue, mods = analyzer.identify_residue(segment)
+                    if residue:
+                        if mods:
+                            sequence_parts.append(f"{residue}({','.join(mods)})")
+                        else:
+                            sequence_parts.append(residue)
+            # Check if cyclic using analyzer's method
+            is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
+            three_letter = '-'.join(sequence_parts)
+            one_letter = ''.join(analyzer.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence_parts)
+            if is_cyclic:
+                three_letter = f"cyclo({three_letter})"
+                one_letter = f"cyclo({one_letter})"
+            # Create cyclic structure visualization
+            img_cyclic = annotate_cyclic_structure(mol, three_letter)
+            # Create linear representation if requested
+            img_linear = None
+            if show_linear:
+                fig_linear = create_enhanced_linear_viz(three_letter, smiles)
+                buf = BytesIO()
+                fig_linear.savefig(buf, format='png', bbox_inches='tight', dpi=300)
+                buf.seek(0)
+                img_linear = Image.open(buf)
+                plt.close(fig_linear)
+            # Add summary to output
+            summary = "Summary:\n"
+            summary += f"Sequence: {three_letter}\n"
+            summary += f"One-letter code: {one_letter}\n"
+            summary += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
+            #if is_cyclic:
+                #summary += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
+                #summary += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
+            if structure_files:
+                summary += "\n3D Structures Generated:\n"
+                for filepath in structure_files:
+                    summary += f"- {os.path.basename(filepath)}\n"
+            return summary + output_text, img_cyclic, img_linear, structure_files if structure_files else None
+        except Exception as e:
+            return f"Error processing SMILES: {str(e)}", None, None, None
+    # Handle file input
+    if file_obj is not None:
+        try:
+            # Handle file content
+            if hasattr(file_obj, 'name'):
+                with open(file_obj.name, 'r') as f:
+                    content = f.read()
+            else:
+                content = file_obj.decode('utf-8') if isinstance(file_obj, bytes) else str(file_obj)
+            output_text = ""
+            for line in content.splitlines():
+                smiles = line.strip()
+                if smiles:
+                    # Check if it's a peptide
+                    if not analyzer.is_peptide(smiles):
+                        output_text += f"Skipping non-peptide SMILES: {smiles}\n"
+                        continue
+                    # Process this SMILES
+                    segments = analyzer.split_on_bonds(smiles)
+                    sequence_parts = []
+                    # Add segment details if requested
+                    if show_segment_details:
+                        output_text += f"\nSegment Analysis for SMILES: {smiles}\n"
+                        for i, segment in enumerate(segments):
+                            output_text += f"\nSegment {i}:\n"
+                            output_text += f"Content: {segment['content']}\n"
+                            output_text += f"Bond before: {segment.get('bond_before', 'None')}\n"
+                            output_text += f"Bond after: {segment.get('bond_after', 'None')}\n"
+                            residue, mods = analyzer.identify_residue(segment)
+                            if residue:
+                                if mods:
+                                    sequence_parts.append(f"{residue}({','.join(mods)})")
+                                else:
+                                    sequence_parts.append(residue)
+                                output_text += f"Identified as: {residue}\n"
+                                output_text += f"Modifications: {mods}\n"
+                    else:
+                        for segment in segments:
+                            residue, mods = analyzer.identify_residue(segment)
+                            if residue:
+                                if mods:
+                                    sequence_parts.append(f"{residue}({','.join(mods)})")
+                                else:
+                                    sequence_parts.append(residue)
+                    # Get cyclicity and create sequence
+                    is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
+                    sequence = f"cyclo({'-'.join(sequence_parts)})" if is_cyclic else '-'.join(sequence_parts)
+                    output_text += f"\nSummary for SMILES: {smiles}\n"
+                    output_text += f"Sequence: {sequence}\n"
+                    output_text += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
+                    if is_cyclic:
+                        output_text += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
+                        #output_text += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
+                    output_text += "-" * 50 + "\n"
+            return output_text, None, None
+        except Exception as e:
+            return f"Error processing file: {str(e)}", None, None
+    return "No input provided.", None, None

a2d2_pep/pep_utils/utils.py ADDED Viewed

	@@ -0,0 +1,135 @@

+"""Console logger utilities.
+Copied from https://github.com/HazyResearch/transformers/blob/master/src/utils/utils.py
+Copied from https://docs.python.org/3/howto/logging-cookbook.html#using-a-context-manager-for-selective-logging
+"""
+import logging
+import fsspec
+import lightning
+import torch
+from timm.scheduler import CosineLRScheduler
+import argparse
+import numpy as np
+import random
+import os
+def sample_categorical_logits(logits, dtype=torch.float64):
+  # do not require logits to be log-softmaxed
+  gumbel_noise = -(1e-10 - (torch.rand_like(logits, dtype=dtype) + 1e-10).log()).log()
+  return (logits + gumbel_noise).argmax(dim=-1)
+def fsspec_exists(filename):
+  """Check if a file exists using fsspec."""
+  fs, _ = fsspec.core.url_to_fs(filename)
+  return fs.exists(filename)
+def fsspec_listdir(dirname):
+  """Listdir in manner compatible with fsspec."""
+  fs, _ = fsspec.core.url_to_fs(dirname)
+  return fs.ls(dirname)
+def fsspec_mkdirs(dirname, exist_ok=True):
+  """Mkdirs in manner compatible with fsspec."""
+  fs, _ = fsspec.core.url_to_fs(dirname)
+  fs.makedirs(dirname, exist_ok=exist_ok)
+def print_nans(tensor, name):
+  if torch.isnan(tensor).any():
+    print(name, tensor)
+class CosineDecayWarmupLRScheduler(
+  CosineLRScheduler,
+  torch.optim.lr_scheduler._LRScheduler):
+  """timm ``CosineLRScheduler`` with a ``step(epoch=None)`` method.
+  ``step`` keeps its own counter and forwards it to timm's per-epoch ``step``
+  or per-update ``step_update`` depending on ``self.t_in_epochs``.
+  """
+  def __init__(self, *args, **kwargs):
+    """Forward all arguments to the parent initializer, then apply step 0."""
+    super().__init__(*args, **kwargs)
+    # Counter advanced by step(); -1 so the first argument-less call yields 0
+    self._last_epoch = -1
+    self.step(epoch=0)
+  def step(self, epoch=None):
+    """Advance the counter (or set it to ``epoch``) and update the learning rate."""
+    if epoch is None:
+      self._last_epoch += 1
+    else:
+      self._last_epoch = epoch
+    # We call either step or step_update, depending on
+    # whether we're using the scheduler every epoch or every
+    # step.
+    # Otherwise, lightning will always call step (i.e.,
+    # meant for each epoch), and if we set scheduler
+    # interval to "step", then the learning rate update will
+    # be wrong.
+    if self.t_in_epochs:
+      super().step(epoch=self._last_epoch)
+    else:
+      super().step_update(num_updates=self._last_epoch)
+class LoggingContext:
+  """Context manager for selective logging."""
+  def __init__(self, logger, level=None, handler=None, close=True):
+    self.logger = logger
+    self.level = level
+    self.handler = handler
+    self.close = close
+  def __enter__(self):
+    if self.level is not None:
+      self.old_level = self.logger.level
+      self.logger.setLevel(self.level)
+    if self.handler:
+      self.logger.addHandler(self.handler)
+  def __exit__(self, et, ev, tb):
+    if self.level is not None:
+      self.logger.setLevel(self.old_level)
+    if self.handler:
+      self.logger.removeHandler(self.handler)
+    if self.handler and self.close:
+      self.handler.close()
+def get_logger(name=__name__, level=logging.INFO) -> logging.Logger:
+  """Initializes multi-GPU-friendly python logger."""
+  logger = logging.getLogger(name)
+  logger.setLevel(level)
+  # this ensures all logging levels get marked with the rank zero decorator
+  # otherwise logs would get multiplied for each GPU process in multi-GPU setup
+  for level in ('debug', 'info', 'warning', 'error',
+                'exception', 'fatal', 'critical'):
+    setattr(logger,
+            level,
+            lightning.pytorch.utilities.rank_zero_only(
+              getattr(logger, level)))
+  return logger
+def str2bool(v):
+  if isinstance(v, bool):
+    return v
+  if v.lower() in ('yes', 'true', 't', 'y', '1'):
+    return True
+  elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+    return False
+  else:
+    raise argparse.ArgumentTypeError('Boolean value expected.')
+def set_seed(seed, use_cuda):
+  os.environ['PYTHONHASHSEED'] = str(seed)
+  np.random.seed(seed)
+  random.seed(seed)
+  torch.manual_seed(seed)
+  # torch.backends.cudnn.deterministic = True
+  if use_cuda:
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+  print(f'=> Seed of the run set to {seed}')

a2d2_pep/remasking_scheduleaware.py ADDED Viewed

	@@ -0,0 +1,181 @@

+"""
+Schedule-aware remasking and insertion logic that ensures the number of masked tokens
+follows the interpolant schedule.
+"""
+import torch
+import numpy as np
+def apply_schedule_aware_insertion(
+    model,
+    xt_tmp,
+    new_xt,
+    t,
+    dt,
+    ext,
+    mask,
+    pad,
+    max_length,
+    orig_mask,
+    new_pos_orig,
+    quality_threshold=1,
+):
+    """
+    Remove low-confidence freshly inserted mask tokens from ``xt_tmp``, capped so
+    the row length does not drop below a floor derived from the insertion schedule.
+    Removed positions are compacted away to the left and the tail is filled with
+    ``pad``.
+    Args:
+        model: Model exposing ``planner(xt, t)`` (returning a mapping that may
+            contain ``"insertion_conf"``) and ``interpolant.insertion_schedule.at``
+        xt_tmp: Sequence after insertion [B, L]
+        new_xt: Sequence before insertion [B, L] (not read in this function)
+        t: Current time [B]
+        dt: Time step size
+        ext: Number of insertions per gap [B, L+1]
+        mask: Mask token ID
+        pad: Pad token ID
+        max_length: Maximum sequence length (not read in this function)
+        orig_mask: Mask of original token positions [B, L]
+        new_pos_orig: New positions of original tokens [B, L]
+        quality_threshold: Insertions with confidence strictly below this value
+            are removal candidates.
+            NOTE(review): a ``None`` "schedule-driven" mode was described here
+            before, but this body compares ``insertion_conf < quality_threshold``
+            directly and has no ``None`` branch -- confirm before passing None.
+    Returns:
+        xt_tmp: Modified sequence with low-quality insertions removed (respecting
+        schedule); returned unchanged when there were no insertions, the planner
+        gives no ``insertion_conf``, or nothing qualifies for removal.
+    """
+    device = xt_tmp.device
+    batch_size, L = xt_tmp.shape
+    total_ext = ext.sum(dim=1)
+    # Only proceed if there were insertions
+    if total_ext.sum() == 0:
+        return xt_tmp
+    # Get planner predictions on inserted state. The insertion head is trained
+    # with the pre-step time t (see loss_insert_planner_flexible), so condition
+    # on t here too; t_next is still used below for the length schedule.
+    t_next = t + dt
+    planner_out = model.planner(xt_tmp, t)
+    insertion_conf = planner_out.get("insertion_conf", None)
+    if insertion_conf is None:
+        return xt_tmp
+    insertion_conf = insertion_conf.squeeze(-1)  # (B, L)
+    # Expected sequence length at next timestep according to schedule
+    current_length_after = xt_tmp.ne(pad).sum(dim=1).float()  # [B]
+    expected_progress = model.interpolant.insertion_schedule.at(t_next)  # [B]
+    # NOTE(review): for expected_progress >= 0.1 the clamp is inactive, so
+    # expected_length is algebraically current_length_after (up to float
+    # rounding) and the removal budget below is then (nearly) zero -- confirm
+    # this is the intended floor.
+    estimated_final_length = current_length_after / (expected_progress.clamp(min=0.1))
+    expected_length = estimated_final_length * expected_progress  # [B]
+    # Mark positions in xt_tmp that came from new_xt (originals) vs. fresh insertions.
+    # Fancy-indexing scatter avoids the per-batch python loop.
+    valid_b, valid_l = orig_mask.nonzero(as_tuple=True)
+    valid_p = new_pos_orig[valid_b, valid_l].long().clamp_(0, L - 1)
+    is_original = torch.zeros_like(xt_tmp, dtype=torch.bool)
+    is_original[valid_b, valid_p] = True
+    # A fresh insertion is a mask token sitting at a non-original position
+    inserted_positions = (xt_tmp == mask) & ~is_original
+    # Threshold-driven deletion (the only mode implemented here): drop insertions
+    # whose confidence is below the threshold, capped so the length never falls
+    # below the scheduled minimum.
+    candidates = inserted_positions & (insertion_conf < quality_threshold)
+    num_bad = candidates.sum(dim=1)  # [B], long
+    min_length = expected_length.long().clamp(min=1)  # [B]
+    max_removable = (current_length_after.long() - min_length).clamp(min=0)
+    length_after_removal = current_length_after.long() - num_bad
+    schedule_violates = length_after_removal < min_length
+    # Per-row removal count: every candidate, or only max_removable when taking
+    # all of them would undershoot min_length; zero for rows without candidates.
+    k_per_row = torch.where(schedule_violates, max_removable, num_bad)
+    k_per_row = torch.where(num_bad > 0, k_per_row, torch.zeros_like(k_per_row))
+    if not candidates.any():
+        return xt_tmp
+    # Select the lowest-confidence candidates per row via a sort.
+    neg_inf = torch.tensor(float('-inf'), device=device, dtype=insertion_conf.dtype)
+    scores = torch.where(candidates, -insertion_conf, neg_inf)  # higher = worse
+    _, sorted_indices = scores.sort(dim=1, descending=True)
+    positions = torch.arange(L, device=device).unsqueeze(0)  # [1, L]
+    # True for the first k sort ranks of each row; scattered back to token
+    # positions through sorted_indices.
+    keep_in_topk = positions < k_per_row.unsqueeze(1)  # [B, L]
+    final_bad = torch.zeros_like(candidates)
+    final_bad.scatter_(1, sorted_indices, keep_in_topk)
+    if not final_bad.any():
+        return xt_tmp
+    # Compact each row to the left (keep good, drop bad), then pad the tail.
+    # Stable sort by the bad flag pushes bad positions to the right.
+    sort_key = final_bad.long()
+    _, perm = torch.sort(sort_key, dim=1, stable=True)
+    xt_tmp = torch.gather(xt_tmp, 1, perm)
+    num_keep = (~final_bad).sum(dim=1)  # [B]
+    tail_mask = positions >= num_keep.unsqueeze(1)  # [B, L]
+    xt_tmp = torch.where(tail_mask, torch.full_like(xt_tmp, pad), xt_tmp)
+    return xt_tmp
+def apply_schedule_aware_remasking(
+    model,
+    new_xt,
+    t,
+    dt,
+    remasking_conf,
+    clean_index,
+    mask,
+    neg_inf,
+    batch_size,
+    unmask_quality_threshold=None,
+):
+    """
+    Apply schedule-aware remasking: adjust number of masks to match expected count from schedule.
+    Args:
+        model: Model with interpolant that has an unmask_schedule
+        new_xt: Current sequence [B, L]
+        t: Current time [B]
+        dt: Time step size
+        remasking_conf: Confidence scores for tokens [B, L]
+        clean_index: Boolean mask of clean tokens (not mask, not pad) [B, L]
+        mask: Mask token ID
+        neg_inf: Negative infinity tensor
+        batch_size: Batch size
+        unmask_quality_threshold: If None (default), remask exactly the schedule
+            excess (count-based). If a float, ignore the schedule budget entirely
+            and remask EVERY clean token whose unmasking-quality confidence is
+            below the threshold. Higher threshold => more aggressive remasking.
+    Returns:
+        new_xt: Modified sequence with schedule-aware remasking applied
+    """
+    # Threshold gate (overrides the schedule-driven count when set): remask every
+    # clean token whose unmasking-quality confidence is below the threshold,
+    # regardless of the schedule budget. Higher threshold => more remasking.
+    if unmask_quality_threshold is not None:
+        to_mask = clean_index & (remasking_conf < unmask_quality_threshold)
+        return torch.where(to_mask, torch.full_like(new_xt, mask), new_xt)
+    t_next = t + dt
+    num_clean = clean_index.sum(dim=1)  # [B], long
+    current_seq_len = (num_clean + (new_xt == mask).sum(dim=1)).float()  # [B]
+    expected_unmasked_frac = model.interpolant.unmask_schedule.at(t_next)  # [B]
+    expected_num_clean = expected_unmasked_frac * current_seq_len  # [B]
+    masks_to_add = (num_clean.float() - expected_num_clean).round().long()  # [B]
+    # Per-row k = min(masks_to_add, num_clean), clamped to >= 0.
+    k_per_row = torch.minimum(masks_to_add.clamp(min=0), num_clean)  # [B]
+    if k_per_row.sum() == 0:
+        return new_xt
+    # Use confidence to decide which clean tokens to remask: lowest conf first.
+    remasking_score_temp = -1.0 * remasking_conf  # low conf = high score
+    remasking_score_temp = torch.where(clean_index, remasking_score_temp, neg_inf)
+    _, sorted_indices = remasking_score_temp.sort(dim=1, descending=True)
+    L = remasking_score_temp.shape[1]
+    positions = torch.arange(L, device=new_xt.device).unsqueeze(0)  # [1, L]
+    keep_in_topk = positions < k_per_row.unsqueeze(1)  # [B, L]
+    to_mask = torch.zeros_like(clean_index)
+    to_mask.scatter_(1, sorted_indices, keep_in_topk)
+    new_xt = torch.where(to_mask, torch.full_like(new_xt, mask), new_xt)
+    return new_xt

a2d2_pep/sampling.py ADDED Viewed

	@@ -0,0 +1,1401 @@

+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # add repo root to path
+import torch
+from dataclasses import dataclass
+from typing import Any, Literal, Optional
+import numpy as np
+import pandas as pd
+from lightning_modules.mdm import MaskedDiffusionModule
+@dataclass
+class SamplingTraceDatapoint:
+    """One recorded event of a sampling trace."""
+    # Time value attached to the event (presumably the sampler's t -- confirm).
+    t: float
+    # Kind of event: either "insertion" or "change".
+    event_type: Literal["insertion", "change"]
+    # Integer position the event refers to (presumably a sequence index -- confirm).
+    position: int
+    # Token involved in the event; left untyped.
+    token: Any
+@dataclass
+class SamplingResult:
+    """Sampled token tensor plus an optional event trace; iterable so it can be unpacked."""
+    # Tensor holding the sampled sequences.
+    samples: torch.Tensor
+    # Trace is supposed to be processed sequentially as updates are not commutative
+    trace: Optional[list[SamplingTraceDatapoint]]
+    def __iter__(self):
+        # Yields `samples` first and `trace` second, so `samples, trace = result` works.
+        yield from [self.samples, self.trace]
+# Sample from categorical distribution for each position using the transition probabilities
+def _sample_tokens(probs: torch.Tensor) -> torch.Tensor:
+    """Sample one token per position from probability distribution.
+    Each position's row is handed to `torch.multinomial`, which treats the
+    entries as non-negative weights, so rows do not have to sum to one.
+    Args:
+        probs: [batch_size, seq_len, vocab_size] transition probabilities
+    Returns:
+        [batch_size, seq_len] sampled token indices
+    """
+    batch_size, seq_len, vocab_size = probs.shape
+    # Flatten to [batch_size * seq_len, vocab_size]: one multinomial row per position.
+    flat_probs = probs.view(-1, vocab_size)
+    samples = torch.multinomial(flat_probs, num_samples=1)
+    # Drop the trailing num_samples=1 axis and restore the batch layout.
+    return samples.view(batch_size, seq_len)
+def _sample_batched_tokens(probs: torch.Tensor) -> torch.Tensor:
+    """Sample one token per position by adding Gumbel noise to log-probabilities and taking the argmax.
+    Args:
+        probs: [batch_size, seq_len, vocab_size] probabilities
+    Returns:
+        [batch_size, seq_len] sampled token indices (torch.long)
+    """
+    batch_size, seq_len, vocab_size = probs.shape
+    # NOTE(review): the uniform noise is created without a device argument and only
+    # afterwards moved to probs.device; the 1e-10 terms keep both logs finite.
+    gumbel_noise = (-torch.log(-torch.log(torch.rand(batch_size, seq_len, vocab_size) + 1e-10) + 1e-10)).to(probs.device)
+    noisy_logits =  torch.log(probs + 1e-10) + gumbel_noise  # add Gumbel noise to log probabilities
+    # select the highest score (most likely category after Gumbel noise)
+    samples = noisy_logits.argmax(dim=-1).to(dtype=torch.long)
+    return samples.view(batch_size, seq_len)
+@torch.no_grad()
+def mdm_euler_sampling(
+    model: MaskedDiffusionModule,
+    steps: int,
+    mask: int,
+    pad: int,
+    batch_size: int,
+    max_length: int,
+    return_trace: bool = False,
+    temperature: float = 1.0,
+):
+    assert not return_trace, "Trace is not yet implemented in MDM Euler sampling"
+    device = next(model.parameters()).device
+    xt = torch.full((batch_size, max_length), mask, dtype=torch.int64, device=device)
+    dt = 1.0 / steps
+    t = torch.zeros(batch_size, device=device)
+    for i in range(steps):
+        print("i-th sampling step")
+        # ——— predict and convert rates ———
+        pred_rate = model(xt, t)
+        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+        unmask_rate = pred_rate.unmask_rate
+        # ——— unmask step (Euler) ———
+        mask_pos = (xt == mask).nonzero(as_tuple=True)
+        unmask_rate[xt != mask] = 0
+        unmask_rate[mask_pos + (mask,)] = 0
+        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+        _xt = xt.clone()
+        trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+        )
+        # Apply temperature scaling
+        if temperature != 1.0:
+            logits = torch.log(trans_prob + 1e-10) / temperature
+            trans_prob = torch.softmax(logits, dim=-1)
+        if i == steps - 1:
+            print("Final step, removing mask token from sampling")
+            trans_prob[mask_pos + (mask,)] = 0.0
+            print(trans_prob[mask_pos + (mask,)])
+        new_xt = _sample_tokens(trans_prob)
+        new_xt = torch.where(xt != mask, xt, new_xt)
+        xt = new_xt
+        t = t + dt
+    return xt, []
+@torch.no_grad()
+def any_order_mask_insertion_euler_sampling(
+    model: torch.nn.Module,
+    steps: int,
+    mask: int,
+    pad: int,
+    batch_size: int,
+    max_length: int,
+    return_trace: bool = False,
+    temperature: float = 1.0,
+) -> SamplingResult:
+    device = next(model.parameters()).device
+    # 1) Initialize all‑pad sequence and trace
+    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)
+    sampling_trace = []
+    dt = 1.0 / steps
+    t = torch.zeros(batch_size, device=device)
+    # Precompute row indices for scatter
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    sampling_trace = [[] for _ in range(batch_size)] if return_trace else None
+    for i in range(steps):
+        # ——— predict and convert rates ———
+        pred_rate = model(xt, t)
+        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+        len_rate = pred_rate.length_rate  # (B, L+1)
+        # ——— unmask step (Euler) ———
+        mask_pos = (xt == mask).nonzero(as_tuple=True)
+        unmask_rate[xt != mask] = 0
+        unmask_rate[mask_pos + (mask,)] = 0
+        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+        # add “stay” probability
+        _xt = xt.clone()
+        _xt[xt == pad] = mask
+        trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+        )
+        if i == steps - 1:
+            print("Final step, removing mask token from sampling")
+            trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step
+            # renormalize probabilities to ensure they sum to 1
+            prob_sum = trans_prob[mask_pos].sum(dim=-1, keepdim=True)
+            # avoid division by zero; if all probs are 0, use uniform distribution (excluding mask and pad)
+            mask_has_zero_prob = (prob_sum.squeeze(-1) == 0.0)
+            if mask_has_zero_prob.any():
+                # create uniform distribution over valid tokens (excluding mask and pad)
+                uniform_prob = torch.zeros_like(trans_prob[0])
+                uniform_prob[:mask] = 1.0 / mask  # Uniform over tokens 0 to mask-1
+                trans_prob[mask_pos[0][mask_has_zero_prob], mask_pos[1][mask_has_zero_prob]] = uniform_prob
+            else:
+                # normalize to sum to 1
+                trans_prob[mask_pos] = trans_prob[mask_pos] / prob_sum
+        new_xt = _sample_tokens(trans_prob)
+        new_xt[xt == pad] = pad
+        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+        if i != steps - 1:
+            # ——— gap-wise insertion refactored — compute new length, fill masks, scatter tokens ———
+            ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)
+            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
+            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+            total_ext = ext.sum(dim=1)
+            valid = xt_len + total_ext <= max_length
+            ext = ext * valid.view(batch_size, 1).long()
+            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+            new_len = xt_len + total_ext  # (B,)
+            xt_tmp = torch.full_like(xt, pad)
+            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
+            xt_tmp[mask_fill] = mask
+            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+            flat_b = batch_idx_L[orig_mask]
+            flat_p = new_pos_orig[orig_mask]
+            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+        else:
+            xt_tmp = new_xt
+        if return_trace:
+            # Check if the token was changed
+            for batch_idx in range(batch_size):
+                for j in range(max_length):
+                    if xt[batch_idx, j] != pad and xt[batch_idx, j] != new_xt[batch_idx, j]:
+                        sampling_trace[batch_idx].append(
+                            SamplingTraceDatapoint(
+                                t=t[batch_idx].item(),
+                                event_type="change",
+                                position=j,
+                                token=new_xt[batch_idx, j].item(),
+                            )
+                        )
+                # Check if a new token was inserted
+                for j in range(max_length):
+                    id = max_length - j - 1
+                    if ext[batch_idx, id]:
+                        sampling_trace[batch_idx].append(
+                            SamplingTraceDatapoint(
+                                t=t[batch_idx].item(),
+                                event_type="insertion",
+                                position=id,
+                                token=mask,
+                            )
+                        )
+        xt = xt_tmp
+        t = t + dt
+    return xt, sampling_trace
+@torch.no_grad()
+def batch_mcts_reverse_step(
+    xt: torch.Tensor,
+    t: torch.Tensor,
+    dt: float,
+    model: torch.nn.Module,
+    pretrained: torch.nn.Module,
+    mask: int,
+    pad: int,
+    batch_size: int,
+    max_length: int,
+    last_step: bool = False,
+    temperature: float = 1.0,
+) -> SamplingResult:
+    device = next(model.parameters()).device
+    xt = xt.repeat(batch_size, 1)
+    # squeeze to remove extra dimensions, then expand to batch_size
+    t = t.squeeze().expand(batch_size)
+    # precompute row indices for scatter
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    # ——— predict and convert rates ———
+    pred_rate = model(xt, t)
+    pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+    unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+    len_rate = pred_rate.length_rate  # (B, L+1)
+    # ——— get pretrained model rates for log_rnd computation ———
+    pretrained_pred = pretrained(xt, t)
+    pretrained_rate = pretrained.interpolant.to_actual_rate(xt, pretrained_pred, t)
+    pretrained_unmask_rate = pretrained_rate.unmask_rate.clone()  # (B, L, V)
+    pretrained_len_rate = pretrained_rate.length_rate  # (B, L+1)
+    # ——— unmask step (Euler) ———
+    mask_pos = (xt == mask).nonzero(as_tuple=True)
+    unmask_rate[xt != mask] = 0
+    unmask_rate[mask_pos + (mask,)] = 0
+    unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+    trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+    # Same for pretrained
+    pretrained_unmask_rate[xt != mask] = 0
+    pretrained_unmask_rate[mask_pos + (mask,)] = 0
+    pretrained_unmask_rate[mask_pos + (mask,)] = -pretrained_unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+    pretrained_trans_prob = (pretrained_unmask_rate * dt).clamp(0.0, 1.0)
+    # add “stay” probability
+    _xt = xt.clone()
+    _xt[xt == pad] = mask
+    trans_prob.scatter_add_(
+        2,
+        _xt.unsqueeze(-1),
+        torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+    )
+    pretrained_trans_prob.scatter_add_(
+        2,
+        _xt.unsqueeze(-1),
+        torch.ones_like(_xt.unsqueeze(-1), dtype=pretrained_trans_prob.dtype),
+    )
+    if last_step:
+        print("Final step, removing mask token from sampling")
+        trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step
+        # renormalize probabilities to ensure they sum to 1
+        prob_sum = trans_prob[mask_pos].sum(dim=-1, keepdim=True)
+        # avoid division by zero; if all probs are 0, use uniform distribution (excluding mask and pad)
+        mask_has_zero_prob = (prob_sum.squeeze(-1) == 0.0)
+        if mask_has_zero_prob.any():
+            # create uniform distribution over valid tokens (excluding mask and pad)
+            uniform_prob = torch.zeros_like(trans_prob[0])
+            uniform_prob[:mask] = 1.0 / mask  # Uniform over tokens 0 to mask-1
+            trans_prob[mask_pos[0][mask_has_zero_prob], mask_pos[1][mask_has_zero_prob]] = uniform_prob
+        else:
+            # normalize to sum to 1
+            trans_prob[mask_pos] = trans_prob[mask_pos] / prob_sum
+    new_xt = _sample_tokens(trans_prob)
+    new_xt[xt == pad] = pad
+    new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+    # ——— compute log probabilities for RND ———
+    lp = torch.gather(torch.log(trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+    lp_pre = torch.gather(torch.log(pretrained_trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+    changed_mask = (xt == mask) & (new_xt != mask) & (new_xt != pad)
+    log_policy_step = (lp * changed_mask).sum(dim=1)
+    log_pretrained_step = (lp_pre * changed_mask).sum(dim=1)
+    log_rnd = log_pretrained_step - log_policy_step  # (B,)
+    if not last_step:
+        # ——— gap-wise insertion refactored — compute new length, fill masks, scatter tokens ———
+        ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)
+        insertion_rate = (len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+        pretrained_insertion_rate = (pretrained_len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+        # log P(ext; λ) = ext*log(λ) - λ
+        log_policy_insert = (ext * torch.log(insertion_rate) - insertion_rate).sum(dim=1)  # (B,)
+        log_pretrained_insert = (ext * torch.log(pretrained_insertion_rate) - pretrained_insertion_rate).sum(dim=1)  # (B,)
+        log_insert_diff = log_pretrained_insert - log_policy_insert  # (B,)
+        log_rnd += log_insert_diff
+        log_pretrained_step += log_pretrained_insert
+        log_policy_step += log_policy_insert
+        xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+        seq_dim = ext.size(1)  # Use actual ext dimension to avoid mismatch
+        gaps = torch.arange(seq_dim, device=device).view(1, -1)
+        ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+        total_ext = ext.sum(dim=1)
+        valid = xt_len + total_ext <= max_length
+        ext = ext * valid.view(batch_size, 1).long()
+        ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+        new_len = xt_len + total_ext  # (B,)
+        xt_tmp = torch.full_like(xt, pad)
+        mask_fill = pos_idx_L < new_len.view(batch_size, 1)
+        xt_tmp[mask_fill] = mask
+        new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+        orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+        flat_b = batch_idx_L[orig_mask]
+        flat_p = new_pos_orig[orig_mask]
+        xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+    else:
+        xt_tmp = new_xt
+    return xt_tmp, log_rnd, log_policy_step, log_pretrained_step
+@torch.no_grad()
+def mcts_reverse_step(
+    xt: torch.Tensor,
+    t: torch.Tensor,
+    dt: float,
+    model: torch.nn.Module,
+    pretrained: torch.nn.Module,
+    mask: int,
+    pad: int,
+    max_length: int,
+    last_step: bool = False,
+    temperature: float = 1.0,
+) -> SamplingResult:
+    device = next(model.parameters()).device
+    batch_size = xt.size(0)
+    # precompute row indices for scatter
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    # ——— predict and convert rates ———
+    pred_rate = model(xt, t)
+    pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+    unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+    len_rate = pred_rate.length_rate  # (B, L+1)
+    # ——— get pretrained model rates for log_rnd computation ———
+    pretrained_pred = pretrained(xt, t)
+    pretrained_rate = pretrained.interpolant.to_actual_rate(xt, pretrained_pred, t)
+    pretrained_unmask_rate = pretrained_rate.unmask_rate.clone()  # (B, L, V)
+    pretrained_len_rate = pretrained_rate.length_rate  # (B, L+1)
+    # ——— unmask step (Euler) ———
+    mask_pos = (xt == mask).nonzero(as_tuple=True)
+    unmask_rate[xt != mask] = 0
+    unmask_rate[mask_pos + (mask,)] = 0
+    unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+    trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+    # same for pretrained
+    pretrained_unmask_rate[xt != mask] = 0
+    pretrained_unmask_rate[mask_pos + (mask,)] = 0
+    pretrained_unmask_rate[mask_pos + (mask,)] = -pretrained_unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+    pretrained_trans_prob = (pretrained_unmask_rate * dt).clamp(0.0, 1.0)
+    # add “stay” probability
+    _xt = xt.clone()
+    _xt[xt == pad] = mask
+    trans_prob.scatter_add_(
+        2,
+        _xt.unsqueeze(-1),
+        torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+    )
+    pretrained_trans_prob.scatter_add_(
+        2,
+        _xt.unsqueeze(-1),
+        torch.ones_like(_xt.unsqueeze(-1), dtype=pretrained_trans_prob.dtype),
+    )
+    if last_step:
+        print("Final step, removing mask token from sampling")
+        trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step
+        # renormalize probabilities to ensure they sum to 1
+        prob_sum = trans_prob[mask_pos].sum(dim=-1, keepdim=True)
+        # avoid division by zero - if all probs are 0, use uniform distribution (excluding mask and pad)
+        mask_has_zero_prob = (prob_sum.squeeze(-1) == 0.0)
+        if mask_has_zero_prob.any():
+            # create uniform distribution over valid tokens (excluding mask and pad)
+            uniform_prob = torch.zeros_like(trans_prob[0])
+            uniform_prob[:mask] = 1.0 / mask  # Uniform over tokens 0 to mask-1
+            trans_prob[mask_pos[0][mask_has_zero_prob], mask_pos[1][mask_has_zero_prob]] = uniform_prob
+        else:
+            # normalize to sum to 1
+            trans_prob[mask_pos] = trans_prob[mask_pos] / prob_sum
+    new_xt = _sample_tokens(trans_prob)
+    new_xt[xt == pad] = pad
+    new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+    # ——— compute log probabilities for RND ———
+    lp = torch.gather(torch.log(trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+    lp_pre = torch.gather(torch.log(pretrained_trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+    changed_mask = (xt == mask) & (new_xt != mask) & (new_xt != pad)
+    log_policy_step = (lp * changed_mask).sum(dim=1)
+    log_pretrained_step = (lp_pre * changed_mask).sum(dim=1)
+    log_rnd = log_pretrained_step - log_policy_step  # (B,)
+    if not last_step:
+        # ——— gap-wise insertion refactored — compute new length, fill masks, scatter tokens ———
+        ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)
+        insertion_rate = (len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+        pretrained_insertion_rate = (pretrained_len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+        # log P(ext; λ) = ext*log(λ) - λ
+        log_policy_insert = (ext * torch.log(insertion_rate) - insertion_rate).sum(dim=1)  # (B,)
+        log_pretrained_insert = (ext * torch.log(pretrained_insertion_rate) - pretrained_insertion_rate).sum(dim=1)  # (B,)
+        log_insert_diff = log_pretrained_insert - log_policy_insert  # (B,)
+        log_rnd += log_insert_diff
+        log_pretrained_step += log_pretrained_insert
+        log_policy_step += log_policy_insert
+        xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+        seq_dim = ext.size(1)  # Use actual ext dimension to avoid mismatch
+        gaps = torch.arange(seq_dim, device=device).view(1, -1)
+        ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+        total_ext = ext.sum(dim=1)
+        valid = xt_len + total_ext <= max_length
+        ext = ext * valid.view(batch_size, 1).long()
+        ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+        new_len = xt_len + total_ext  # (B,)
+        xt_tmp = torch.full_like(xt, pad)
+        mask_fill = pos_idx_L < new_len.view(batch_size, 1)
+        xt_tmp[mask_fill] = mask
+        new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+        orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+        flat_b = batch_idx_L[orig_mask]
+        flat_p = new_pos_orig[orig_mask]
+        xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+    else:
+        xt_tmp = new_xt
+    return xt_tmp, log_rnd, log_policy_step, log_pretrained_step
+@torch.no_grad()
+def any_order_euler_sampling_with_schedule(
+    model: torch.nn.Module,
+    time_schedule: torch.Tensor,
+    mask: int,
+    pad: int,
+    batch_size: int,
+    max_length: int,
+    return_trace: bool = False,
+    temperature: float = 1.0,
+) -> SamplingResult:
+    device = next(model.parameters()).device
+    time_schedule = time_schedule.to(device)
+    if time_schedule[0] < time_schedule[-1]:
+        time_schedule = torch.flip(time_schedule, [0]) # descending order
+    steps = len(time_schedule) - 1
+    # initialize all-pad sequence and trace
+    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)
+    # precompute row indices for scatter
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    sampling_trace = [[] for _ in range(batch_size)] if return_trace else None
+    for i in range(steps):
+        # use scheduled timesteps
+        t = time_schedule[i].repeat(batch_size)
+        t_next = time_schedule[i + 1]
+        dt = (t - t_next).abs()  # timestep difference
+        # ——— predict and convert rates ———
+        pred_rate = model(xt, t)
+        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+        len_rate = pred_rate.length_rate  # (B, L+1)
+        # ——— unmask step (Euler) ———
+        mask_pos = (xt == mask).nonzero(as_tuple=True)
+        unmask_rate[xt != mask] = 0
+        unmask_rate[mask_pos + (mask,)] = 0
+        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        trans_prob = (unmask_rate * dt[:, None, None]).clamp(0.0, 1.0)
+        # add "stay" probability
+        _xt = xt.clone()
+        _xt[xt == pad] = mask
+        trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+        )
+        # Apply temperature scaling
+        if temperature != 1.0:
+            logits = torch.log(trans_prob + 1e-10) / temperature
+            trans_prob = torch.softmax(logits, dim=-1)
+        if i == steps - 1:
+            print("Final step, removing mask token from sampling")
+            trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step
+            prob_sum = trans_prob[mask_pos].sum(dim=-1, keepdim=True)
+            mask_has_zero_prob = (prob_sum.squeeze(-1) == 0.0)
+            if mask_has_zero_prob.any():
+                uniform_prob = torch.zeros_like(trans_prob[0])
+                uniform_prob[:mask] = 1.0 / mask  # Uniform over tokens 0 to mask-1
+                trans_prob[mask_pos[0][mask_has_zero_prob], mask_pos[1][mask_has_zero_prob]] = uniform_prob
+            else:
+                trans_prob[mask_pos] = trans_prob[mask_pos] / prob_sum
+        new_xt = _sample_tokens(trans_prob)
+        new_xt[xt == pad] = pad
+        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+        if i != steps - 1:
+            # ——— gap-wise insertion refactored — compute new length, fill masks, scatter tokens ———
+            ext = torch.bernoulli((len_rate * dt[:, None]).clamp(0.0, 1.0)).long()  # (B, L+1)
+            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
+            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+            total_ext = ext.sum(dim=1)
+            valid = xt_len + total_ext <= max_length
+            ext = ext * valid.view(batch_size, 1).long()
+            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+            new_len = xt_len + total_ext  # (B,)
+            xt_tmp = torch.full_like(xt, pad)
+            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
+            xt_tmp[mask_fill] = mask
+            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+            flat_b = batch_idx_L[orig_mask]
+            flat_p = new_pos_orig[orig_mask]
+            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+        else:
+            xt_tmp = new_xt
+        if return_trace:
+            # Check if the token was changed
+            for batch_idx in range(batch_size):
+                for j in range(max_length):
+                    if xt[batch_idx, j] != pad and xt[batch_idx, j] != new_xt[batch_idx, j]:
+                        sampling_trace[batch_idx].append(
+                            SamplingTraceDatapoint(
+                                t=t[batch_idx].item(),
+                                event_type="change",
+                                position=j,
+                                token=new_xt[batch_idx, j].item(),
+                            )
+                        )
+                # Check if a new token was inserted
+                for j in range(max_length):
+                    id = max_length - j - 1
+                    if ext[batch_idx, id]:
+                        sampling_trace[batch_idx].append(
+                            SamplingTraceDatapoint(
+                                t=t[batch_idx].item(),
+                                event_type="insertion",
+                                position=id,
+                                token=mask,
+                            )
+                        )
+        xt = xt_tmp
+    return xt, sampling_trace
+@torch.no_grad()
+def any_order_mask_insertion_euler_sampling_with_rnd(
+    model, pretrained, reward_model, analyzer,
+    tokenizer, steps,
+    mask,
+    pad,
+    batch_size,
+    max_length,
+    return_trace = False,
+    alpha = 0.1,
+    temperature: float = 1.0,
+):
+    device = next(model.parameters()).device
+    # initialize all‑pad sequence and trace
+    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)
+    sampling_trace = []
+    # initialize log_rnd to accumulate log probability ratios
+    log_rnd = torch.zeros(batch_size, device=device)
+    dt = 1.0 / steps
+    t = torch.zeros(batch_size, device=device)
+    # precompute row indices for scatter
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    sampling_trace = [[] for _ in range(batch_size)] if return_trace else None
+    for i in range(steps):
+        # ——— predict and convert rates ———
+        pred_rate = model(xt, t)
+        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+        len_rate = pred_rate.length_rate  # (B, L+1)
+        # ——— get pretrained model rates for log_rnd computation ———
+        pretrained_pred = pretrained(xt, t)
+        pretrained_rate = pretrained.interpolant.to_actual_rate(xt, pretrained_pred, t)
+        pretrained_unmask_rate = pretrained_rate.unmask_rate.clone()  # (B, L, V)
+        pretrained_len_rate = pretrained_rate.length_rate  # (B, L+1)
+        # ——— unmask step (Euler) ———
+        mask_pos = (xt == mask).nonzero(as_tuple=True)
+        unmask_rate[xt != mask] = 0
+        unmask_rate[mask_pos + (mask,)] = 0
+        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+        # Same for pretrained
+        pretrained_unmask_rate[xt != mask] = 0
+        pretrained_unmask_rate[mask_pos + (mask,)] = 0
+        pretrained_unmask_rate[mask_pos + (mask,)] = -pretrained_unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        pretrained_trans_prob = (pretrained_unmask_rate * dt).clamp(0.0, 1.0)
+        # add “stay” probability
+        _xt = xt.clone()
+        _xt[xt == pad] = mask
+        trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+        )
+        pretrained_trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=pretrained_trans_prob.dtype),
+        )
+        # Apply temperature scaling
+        if temperature != 1.0:
+            logits = torch.log(trans_prob + 1e-10) / temperature
+            trans_prob = torch.softmax(logits, dim=-1)
+        if i == steps - 1:
+            print("Final step, removing mask token from sampling")
+            trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step
+            # renormalize probabilities to ensure they sum to 1
+            prob_sum = trans_prob[mask_pos].sum(dim=-1, keepdim=True)
+            # avoid division by zero; if all probs are 0, use uniform distribution (excluding mask and pad)
+            mask_has_zero_prob = (prob_sum.squeeze(-1) == 0.0)
+            if mask_has_zero_prob.any():
+                # create uniform distribution over valid tokens (excluding mask and pad)
+                uniform_prob = torch.zeros_like(trans_prob[0])
+                uniform_prob[:mask] = 1.0 / mask  # Uniform over tokens 0 to mask-1
+                trans_prob[mask_pos[0][mask_has_zero_prob], mask_pos[1][mask_has_zero_prob]] = uniform_prob
+            else:
+                trans_prob[mask_pos] = trans_prob[mask_pos] / prob_sum
+        new_xt = _sample_tokens(trans_prob)
+        new_xt[xt == pad] = pad
+        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+        # ——— compute log probabilities for RND ———
+        lp = torch.gather(torch.log(trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+        lp_pre = torch.gather(torch.log(pretrained_trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
+        changed_mask = (xt == mask) & (new_xt != mask) & (new_xt != pad)
+        log_policy_step = (lp * changed_mask).sum(dim=1)
+        log_pretrained_step = (lp_pre * changed_mask).sum(dim=1)
+        log_rnd = log_pretrained_step - log_policy_step  # (B,)
+        if i != steps - 1:
+            ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)
+            insertion_rate = (len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+            pretrained_insertion_rate = (pretrained_len_rate * dt).clamp(min=1e-10)  # (B, L+1)
+            log_policy_insert = (ext * torch.log(insertion_rate) - insertion_rate).sum(dim=1)  # (B,)
+            log_pretrained_insert = (ext * torch.log(pretrained_insertion_rate) - pretrained_insertion_rate).sum(dim=1)  # (B,)
+            log_insert_diff = log_pretrained_insert - log_policy_insert  # (B,)
+            log_rnd += log_insert_diff
+            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
+            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+            total_ext = ext.sum(dim=1)
+            valid = xt_len + total_ext <= max_length
+            ext = ext * valid.view(batch_size, 1).long()
+            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+            new_len = xt_len + total_ext  # (B,)
+            xt_tmp = torch.full_like(xt, pad)
+            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
+            xt_tmp[mask_fill] = mask
+            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+            flat_b = batch_idx_L[orig_mask]
+            flat_p = new_pos_orig[orig_mask]
+            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+        else:
+            xt_tmp = new_xt
+        if return_trace:
+            # check if the token was changed
+            for i in range(batch_size):
+                for j in range(max_length):
+                    if xt[i, j] != pad and xt[i, j] != new_xt[i, j]:
+                        sampling_trace[i].append(
+                            SamplingTraceDatapoint(
+                                t=t[i].item(),
+                                event_type="change",
+                                position=j,
+                                token=new_xt[i, j].item(),
+                            )
+                        )
+                # check if a new token was inserted
+                for j in range(max_length):
+                    id = max_length - j - 1
+                    if ext[i, id]:
+                        sampling_trace[i].append(
+                            SamplingTraceDatapoint(
+                                t=t[i].item(),
+                                event_type="insertion",
+                                position=id,
+                                token=mask,
+                            )
+                        )
+        xt = xt_tmp
+        t = t + dt
+    # change rewards for peptides
+    samples = xt.to(device)
+    # store raw token IDs
+    # Decode and strip samples
+    decoded_samples = tokenizer.batch_decode(samples)
+    valid_x_final = []
+    validSequences = []
+    valid_log_rnd = []
+    for idx, seq in enumerate(decoded_samples):
+        # check if the peptide is valid
+        if analyzer.is_peptide(seq):
+            valid_x_final.append(xt[idx])
+            validSequences.append(seq)
+            valid_log_rnd.append(log_rnd[idx])
+    print("len valid sequences:", len(validSequences))
+    # compute multi-objective rewards
+    score_vectors = reward_model(input_seqs=validSequences)
+    scalar_rewards = np.sum(score_vectors, axis=-1)
+    scalar_rewards = torch.as_tensor(scalar_rewards, dtype=torch.float32, device=device)
+    print(f"scalar reward dim{len(scalar_rewards)}")
+    valid_log_rnd = torch.stack(valid_log_rnd, dim=0)
+    log_rnd = valid_log_rnd + (scalar_rewards / alpha) # scale down by alpha
+    valid_x_final = torch.stack(valid_x_final, dim=0)
+    return valid_x_final, log_rnd, scalar_rewards, sampling_trace
+@torch.no_grad()
+def any_order_finetuned_euler_sampler(
+        model, reward_model, analyzer,
+        tokenizer, steps,
+        mask,
+        pad,
+        batch_size,
+        max_length,
+        return_trace = False,
+        dataframe = False,
+        temperature: float = 1.0,
+    ):
+    """
+    Sample sequences with Euler steps over unmasking and insertion rates, then score them.
+    Starts from an all-pad canvas. Every step unmasks tokens according to the
+    model's rates; every step except the last also inserts new mask tokens
+    into the gaps of the current sequence.
+    Args:
+        model: called as model(xt, t); must expose model.interpolant.to_actual_rate
+        reward_model: called as reward_model(input_seqs=...); the result is read as
+            .T[0..4] = affinity, solubility, hemolysis, nonfouling, permeability
+        analyzer: provides is_peptide(seq), used to filter the decoded samples
+        tokenizer: provides batch_decode(samples)
+        steps (int): number of Euler steps (dt = 1 / steps)
+        mask (int): mask token id
+        pad (int): pad token id
+        batch_size (int): number of sequences sampled in parallel
+        max_length (int): width of the token canvas
+        return_trace (bool): collect per-sample "change"/"insertion" events
+        dataframe (bool): additionally return a DataFrame of valid sequences and scores
+        temperature (float): softmax temperature applied to the per-step probabilities
+    Returns:
+        (samples, affinity, sol, hemo, nf, permeability, valid_fraction), with a
+        pandas DataFrame appended when dataframe is True. samples holds ALL
+        sampled rows; the score entries cover only the valid sequences and are
+        [0.0] placeholders when there are none.
+    NOTE(review): the trace collected under return_trace is not part of the
+    return value; confirm whether callers expect it.
+    """
+    device = next(model.parameters()).device
+    # initialize all-pad sequence
+    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)
+    dt = 1.0 / steps
+    t = torch.zeros(batch_size, device=device)
+    # precompute row indices for scatter
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    sampling_trace = [[] for _ in range(batch_size)] if return_trace else None
+    for i in range(steps):
+        is_last_step = i == steps - 1
+        # no insertion has happened yet in this step (and none ever happens on the last one)
+        ext = None
+        # --- predict and convert rates ---
+        pred_rate = model(xt, t)
+        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
+        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+        len_rate = pred_rate.length_rate  # (B, L+1)
+        # --- unmask step (Euler) ---
+        mask_pos = (xt == mask).nonzero(as_tuple=True)
+        unmask_rate[xt != mask] = 0
+        unmask_rate[mask_pos + (mask,)] = 0
+        # diagonal entry: minus the total outgoing rate of each masked position
+        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+        # add "stay" probability (pads are treated like masks here and restored below)
+        _xt = xt.clone()
+        _xt[xt == pad] = mask
+        trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+        )
+        # Apply temperature scaling
+        if temperature != 1.0:
+            logits = torch.log(trans_prob + 1e-10) / temperature
+            trans_prob = torch.softmax(logits, dim=-1)
+        if is_last_step:
+            print("Final step, removing mask token from sampling")
+            trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step
+            masked_probs = trans_prob[mask_pos]  # (K, V), one row per still-masked position
+            prob_sum = masked_probs.sum(dim=-1, keepdim=True)  # (K, 1)
+            # Fallback for rows with no mass left: uniform over token ids below `mask`,
+            # excluding pad. Built over the vocabulary axis so it broadcasts per row.
+            # NOTE(review): assumes ids 0..mask-1 are the regular tokens - confirm.
+            uniform_prob = torch.zeros(trans_prob.shape[-1], dtype=trans_prob.dtype, device=device)
+            uniform_prob[:mask] = 1.0
+            if 0 <= pad < uniform_prob.numel():
+                uniform_prob[pad] = 0.0
+            uniform_prob = uniform_prob / uniform_prob.sum().clamp(min=1e-10)
+            # Renormalise every row; rows that summed to zero take the fallback instead.
+            trans_prob[mask_pos] = torch.where(
+                prob_sum > 0,
+                masked_probs / prob_sum.clamp(min=1e-10),
+                uniform_prob.expand_as(masked_probs),
+            )
+        new_xt = _sample_tokens(trans_prob)
+        new_xt[xt == pad] = pad
+        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+        if not is_last_step:
+            # gap-wise insertion: compute new length, fill masks, scatter tokens
+            ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)
+            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
+            # only gaps inside (or right after) the current sequence may receive a token
+            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+            total_ext = ext.sum(dim=1)
+            # drop all insertions of a row that would overflow the canvas
+            valid = xt_len + total_ext <= max_length
+            ext = ext * valid.view(batch_size, 1).long()
+            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+            new_len = xt_len + total_ext  # (B,)
+            xt_tmp = torch.full_like(xt, pad)
+            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
+            xt_tmp[mask_fill] = mask
+            # shift each original token right by the number of insertions before it
+            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+            flat_b = batch_idx_L[orig_mask]
+            flat_p = new_pos_orig[orig_mask]
+            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+        else:
+            xt_tmp = new_xt
+        if return_trace:
+            t_now = t.tolist()
+            # token changes, in increasing position order per sample
+            changed = (xt != pad) & (xt != new_xt)
+            for b, j in changed.nonzero().tolist():
+                sampling_trace[b].append(
+                    SamplingTraceDatapoint(
+                        t=t_now[b],
+                        event_type="change",
+                        position=j,
+                        token=new_xt[b, j].item(),
+                    )
+                )
+            # insertions, in decreasing gap order per sample; skipped on the last
+            # step, where no insertion is performed
+            if ext is not None:
+                inserted = (ext[:, :max_length] != 0).nonzero().tolist()
+                for b, gap in reversed(inserted):
+                    sampling_trace[b].append(
+                        SamplingTraceDatapoint(
+                            t=t_now[b],
+                            event_type="insertion",
+                            position=gap,
+                            token=mask,
+                        )
+                    )
+        xt = xt_tmp
+        t = t + dt
+    # start eval
+    samples = xt
+    decoded_samples = tokenizer.batch_decode(samples)
+    validSequences = [seq for seq in decoded_samples if analyzer.is_peptide(seq)]
+    print("len valid sequences:", len(validSequences))
+    valid_fraction = len(validSequences) / batch_size
+    has_valid = bool(validSequences)
+    if has_valid:
+        # add scores to log
+        score_vectors = reward_model(input_seqs=validSequences) # (num_children, num_objectives)
+        average_scores = score_vectors.T
+        affinity = average_scores[0]
+        sol = average_scores[1]
+        hemo = average_scores[2]
+        nf = average_scores[3]
+        permeability = average_scores[4]
+    else:
+        # placeholders kept for callers that aggregate the returned scores
+        affinity = [0.0]
+        sol = [0.0]
+        hemo = [0.0]
+        nf = [0.0]
+        permeability = [0.0]
+    if dataframe:
+        # With no valid sequence every column must be empty: mixing an empty
+        # sequence column with [0.0] score columns makes pandas raise.
+        df = pd.DataFrame({
+            "Peptide Sequence": validSequences,
+            "Binding Affinity": affinity if has_valid else [],
+            "Solubility": sol if has_valid else [],
+            "Hemolysis": hemo if has_valid else [],
+            "Nonfouling": nf if has_valid else [],
+            "Permeability": permeability if has_valid else [],
+        })
+        return samples, affinity, sol, hemo, nf, permeability, valid_fraction, df
+    return samples, affinity, sol, hemo, nf, permeability, valid_fraction
+@torch.no_grad()
+def mdm_tau_leaping_sampling(
+    model: MaskedDiffusionModule,
+    steps: int,
+    mask: int,
+    pad: int,
+    batch_size: int,
+    max_length: int,
+    return_trace: bool = False,
+    temperature: float = 1.0,
+):
+    """
+    Tau-leaping sampler for a fixed-length masked diffusion model.
+    The canvas starts fully masked. On every step but the last, Poisson event
+    counts are drawn per (position, token) and a masked position is unmasked
+    only when exactly one event fired there. The last step fills whatever is
+    still masked with the argmax of the unmask rate.
+    Args:
+        model: called as model(xt, t); must expose model.interpolant.to_actual_rate
+        steps: number of tau-leaping steps (dt = 1 / steps)
+        mask: mask token id
+        pad: pad token id (not read by this sampler)
+        batch_size: number of sequences sampled in parallel
+        max_length: sequence length of the canvas
+        return_trace: must be False; tracing is not supported
+        temperature: accepted for signature parity, not read here
+    Returns:
+        (xt, []) - the final token tensor and an empty trace
+    """
+    assert not return_trace, "Trace is not yet supported"
+    device = next(model.parameters()).device
+    dt = 1.0 / steps
+    xt = torch.full((batch_size, max_length), mask, dtype=torch.int64, device=device)
+    t = torch.zeros(batch_size, device=device)
+    final_step = steps - 1
+    for step in range(steps):
+        # predict and convert rates
+        rates = model.interpolant.to_actual_rate(xt, model(xt, t), t).unmask_rate  # (B, L, V)
+        still_masked = xt == mask  # (B, L)
+        if step == final_step:
+            # last step: deterministic unmask via argmax at every masked position
+            proposal = rates.argmax(dim=2)  # (B, L)
+            accept = still_masked
+        else:
+            # tau-leaping via Poisson counts
+            events = torch.poisson(rates * dt).long()
+            # discard events at already-decoded positions and mask -> mask events
+            events[~still_masked.unsqueeze(-1).expand_as(events)] = 0
+            events[..., mask] = 0
+            # a position is updated only when exactly one event fired there
+            accept = events.sum(dim=2) == 1  # (B, L)
+            proposal = events.argmax(dim=2)  # (B, L)
+        # accepted positions take the proposed token, everything else is kept
+        xt = torch.where(accept, proposal, xt)
+        t = t + dt
+    return xt, []
+# Not used in production, for debugging purposes
+# Maps a candidate total sequence length to its probability weight (weights
+# sum to 1.0). Read by calculate_rate and calculate_rate_batch.
+lengths = {4: 0.1, 16: 0.4, 32: 0.4, 64: 0.1}
+def binomial_mass(k, n, p):
+    """
+    Calculate the probability mass function (PMF) for a binomial distribution.
+    Args:
+        k (int): Number of successes
+        n (int): Number of trials
+        p (float): Probability of success in a single trial
+    Returns:
+        float: Probability mass P(X = k); 0.0 when k lies outside 0..n
+    """
+    import math
+    # Outside the support the mass is zero. Checking this up front replaces the
+    # previous reliance on math.factorial raising ValueError for negative input.
+    if k < 0 or k > n:
+        return 0.0
+    # math.comb is exact; the float conversion keeps the result a float (as the
+    # former factorial quotient was) even when p is an int.
+    binom_coef = float(math.comb(n, k))
+    # Calculate probability mass
+    return binom_coef * (p ** k) * ((1 - p) ** (n - k))
+def calculate_rate_batch(alpha_t, len_t):
+    """
+    Batched rate: a weighted average of (length - len_t) over the module-level
+    `lengths` table, weighted by probability * Binomial(length, alpha_t).pmf(len_t).
+    Args:
+        alpha_t (torch.Tensor): Tensor of shape (batch_size,)
+        len_t (torch.Tensor): Tensor of shape (batch_size,)
+    Returns:
+        torch.Tensor: Tensor of shape (batch_size,); entries whose total weight
+        is zero are left at 0
+    """
+    device = alpha_t.device
+    n_items = alpha_t.shape[0]
+    # Running weighted sum (numerator) and total weight (denominator) per item
+    numerator = torch.zeros(n_items, device=device)
+    denominator = torch.zeros(n_items, device=device)
+    for total_len, weight in lengths.items():
+        # Only items with 0 <= len_t <= total_len are evaluated for this length
+        in_support = (len_t >= 0) & (len_t <= total_len)
+        if not in_support.any():
+            continue
+        rows = in_support.nonzero(as_tuple=True)[0]
+        k = len_t[rows]
+        success_prob = alpha_t[rows]
+        # Binomial pmf, evaluated through the torch distribution's log_prob
+        pmf = torch.distributions.Binomial(
+            total_count=total_len, probs=success_prob
+        ).log_prob(k).exp()
+        numerator[rows] += (total_len - k) * weight * pmf
+        denominator[rows] += weight * pmf
+    # Divide only where the total weight is positive; everything else stays 0
+    rate = torch.zeros_like(numerator)
+    has_weight = denominator > 0
+    rate[has_weight] = numerator[has_weight] / (denominator[has_weight])
+    return rate
+# Scalar entry point, kept for backward compatibility
+def calculate_rate(alpha_t, len_t):
+    """
+    Legacy scalar version of the rate computation.
+    Tensors with at least one dimension are forwarded to calculate_rate_batch;
+    anything else is evaluated with binomial_mass over the `lengths` table.
+    Returns 0.0 when no length contributes any weight.
+    """
+    is_batched = isinstance(alpha_t, torch.Tensor) and alpha_t.ndim > 0
+    if is_batched:
+        return calculate_rate_batch(alpha_t, len_t)
+    numerator = 0
+    denominator = 0
+    for total_len, weight in lengths.items():
+        # lengths shorter than the current length cannot contribute
+        if not (total_len >= len_t):
+            continue
+        mass = binomial_mass(len_t, total_len, alpha_t)
+        numerator += (total_len - len_t) * weight * mass
+        denominator += weight * mass
+    return 0.0 if denominator == 0 else numerator / denominator
+@torch.no_grad()
+def any_order_mask_insertion_tau_leaping_sampling(
+    model: torch.nn.Module,
+    steps: int,
+    mask: int,
+    pad: int,
+    batch_size: int,
+    max_length: int,
+    return_trace: bool = False,
+    confidence_based_sampling: bool = True,  # whether to use confidence-based decoding
+    alpha: float = 5.0,  # hyperparameter for window size calculation
+    max_window: int = 32,  # Maximum window size for sliding window
+    confidence_method: str = "prob_diff",  # "position", "top_prob", "prob_diff", "entropy"
+    use_sliding_window: bool = False,  # whether to use sliding window for position selection
+    temperature: float = 1.0,
+) -> SamplingResult:
+    """
+    Any-order sampler that alternates unmasking with Poisson mask insertion.
+    The canvas starts all-pad. Each non-final step (1) unmasks some masked
+    positions, either by confidence ranking or by per-token Poisson
+    tau-leaping, and then (2) inserts new mask tokens into the gaps of the
+    sequence. The final step fills every remaining mask with the argmax of
+    the unmask rate and performs no insertion.
+    Args:
+        model: called as model(xt, t); must expose model.interpolant.to_actual_rate
+        steps: number of steps (dt = 1 / steps)
+        mask: mask token id
+        pad: pad token id
+        batch_size: number of sequences sampled in parallel
+        max_length: width of the token canvas
+        return_trace: append the canvas after every non-final step to the trace
+        confidence_based_sampling: use confidence ranking instead of tau-leaping
+        alpha: window size multiplier (window = alpha * number of tokens to unmask)
+        max_window: upper bound on the sliding window size
+        confidence_method: "position", "top_prob", "prob_diff" or "entropy"
+        use_sliding_window: restrict unmasking to the first k masked positions
+        temperature: accepted for signature parity, not read here
+    Returns:
+        (xt, sampling_trace) - the final token tensor and the list of
+        intermediate canvases (empty unless return_trace is set)
+    Raises:
+        ValueError: if confidence_method is not one of the supported names
+    """
+    device = next(model.parameters()).device
+    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)
+    sampling_trace = []
+    dt = 1.0 / steps
+    t = torch.zeros(batch_size, device=device)
+    # Precompute row indices for scatter
+    batch_idx_L = (
+        torch.arange(batch_size, device=device)
+        .view(batch_size, 1)
+        .expand(batch_size, max_length)
+    )
+    pos_idx_L = (
+        torch.arange(max_length, device=device)
+        .view(1, max_length)
+        .expand(batch_size, max_length)
+    )
+    for i in range(steps):
+        # --- predict rates ---
+        pred = model(xt, t)
+        pred = model.interpolant.to_actual_rate(xt, pred, t)
+        unmask_rate = pred.unmask_rate  # (B, L, V)
+        len_rate = pred.length_rate  # (B, L+1)
+        xt_len = (xt != pad).sum(dim=1)  # (B,) current sequence lengths
+        mask_positions = xt == mask  # (B, L)
+        if i == steps - 1:
+            # last step: deterministic unmask via argmax, no insertion
+            xt = torch.where(mask_positions, unmask_rate.argmax(dim=2), xt)
+            t = t + dt
+            continue
+        if confidence_based_sampling > 0.0:
+            # --- confidence-based decoding (vectorized) ---
+            num_mask_positions = mask_positions.sum(dim=1)  # (B,)
+            # 1. Determine number of tokens to unmask using Poisson
+            unmask_counts = torch.poisson(num_mask_positions.float() * dt).long()  # (B,)
+            # 2. Calculate confidence based on selected method
+            if confidence_method == "position":
+                # Position-based confidence: earlier positions rank higher
+                position_indices = torch.arange(max_length, device=device).unsqueeze(0).expand(batch_size, -1)  # (B, L)
+                confidence = 1.0 - (position_indices.float() / xt_len.unsqueeze(1).float().clamp(min=1))  # (B, L)
+            elif confidence_method in ("top_prob", "prob_diff", "entropy"):
+                # the unmask rates are used as logits
+                unmask_probs = torch.softmax(unmask_rate, dim=-1)  # (B, L, V)
+                if confidence_method == "top_prob":
+                    confidence = unmask_probs.max(dim=-1)[0]  # (B, L)
+                elif confidence_method == "prob_diff":
+                    # top probability minus second-highest probability
+                    top2_probs, _ = torch.topk(unmask_probs, k=2, dim=-1)  # (B, L, 2)
+                    confidence = top2_probs[:, :, 0] - top2_probs[:, :, 1]  # (B, L)
+                else:
+                    # negative entropy, so lower entropy gives higher confidence
+                    entropy = -torch.sum(unmask_probs * torch.log(unmask_probs + 1e-10), dim=-1)  # (B, L)
+                    confidence = -entropy  # (B, L)
+            else:
+                raise ValueError(f"Unknown confidence_method: {confidence_method}")
+            # 3. Determine which positions may be unmasked
+            eligible = mask_positions  # (B, L)
+            if use_sliding_window:
+                # Calculate dynamic k for each batch
+                k_values = torch.minimum(
+                    torch.minimum(
+                        (alpha * unmask_counts).long(),
+                        torch.tensor(max_window, device=device)
+                    ), num_mask_positions)  # (B,)
+                # a mask is inside the window if it is among the first k masks
+                mask_cumsum = mask_positions.cumsum(dim=1)  # (B, L)
+                eligible = mask_positions & (mask_cumsum <= k_values.unsqueeze(1))
+            confidence = confidence.masked_fill(~eligible, float("-inf"))
+            # Never request more positions than are eligible. Otherwise topk
+            # falls through to -inf entries and overwrites pad or decoded
+            # tokens, or fails outright when the count exceeds max_length.
+            unmask_counts = torch.minimum(unmask_counts, eligible.sum(dim=1))
+            new_xt = xt.clone()
+            # vectorized unmasking
+            max_unmask = int(unmask_counts.max().item())
+            if max_unmask > 0:
+                _, all_top_indices = torch.topk(confidence, k=max_unmask, dim=1, largest=True)  # (B, max_unmask)
+                # row b keeps only its first unmask_counts[b] ranked positions
+                unmask_mask = torch.arange(max_unmask, device=device).unsqueeze(0) < unmask_counts.unsqueeze(1)  # (B, max_unmask)
+                most_likely_tokens = unmask_rate.argmax(dim=-1)  # (B, L)
+                selected_positions = all_top_indices[unmask_mask]
+                batch_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand(-1, max_unmask)[unmask_mask]
+                new_xt[batch_indices, selected_positions] = most_likely_tokens[batch_indices, selected_positions]
+        else:
+            # --- tau-leaping unmask via Poisson ---
+            counts = torch.poisson(unmask_rate * dt).long()
+            # discard events at non-mask positions and mask -> mask events
+            counts[~mask_positions.unsqueeze(-1).expand_as(counts)] = 0
+            counts[..., mask] = 0
+            # a position is updated only when exactly one event fired there
+            one_event = counts.sum(dim=2) == 1
+            new_xt = torch.where(one_event, counts.argmax(dim=2), xt)
+        # --- Poisson insertion, compute new lengths and fill masks ---
+        # (the last step never reaches this point)
+        ext = torch.poisson(len_rate * dt).long()  # (B, L+1)
+        gaps = torch.arange(max_length + 1, device=device).view(1, -1)
+        # only gaps inside (or right after) the current sequence may receive tokens
+        ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+        total_ext = ext.sum(dim=1)
+        # drop all insertions of a row that would overflow the canvas
+        valid = xt_len + total_ext <= max_length
+        ext = ext * valid.view(batch_size, 1).long()
+        # compute prefix sums of insertions
+        ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+        new_len = xt_len + total_ext  # (B,)
+        # initialize with pads, then fill mask up to new_len
+        xt_tmp = torch.full_like(xt, pad)
+        fill_pos = pos_idx_L < new_len.view(batch_size, 1)
+        xt_tmp[fill_pos] = mask
+        # shift and scatter original tokens
+        new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
+        orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+        flat_b = batch_idx_L[orig_mask]
+        flat_p = new_pos_orig[orig_mask]
+        xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+        xt = xt_tmp
+        t = t + dt
+        if return_trace:
+            # the final canvas is not appended; it is returned as xt
+            sampling_trace.append(xt)
+    return xt, sampling_trace

a2d2_pep/scripts/run_peptide_finetune.slurm ADDED Viewed

	@@ -0,0 +1,210 @@

+#!/bin/bash
+# NOTE: --partition and --qos below are specific to our cluster. Change them
+# (or remove them and pass `--partition` on the `sbatch` command line) to match
+# the partitions/QOS available on yours.
+#SBATCH --job-name=peptide-finetune-len256
+#SBATCH --partition=b200-mig90
+#SBATCH --qos=mig
+#SBATCH --nodes=1
+#SBATCH --gpus-per-node=1
+#SBATCH --cpus-per-task=8
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=80GB
+#SBATCH --time=02-00:00:00
+#SBATCH --output=logs/peptide_finetune_%A.log
+# =====================================================================
+# run_peptide_finetune.slurm
+#
+# Single-mode job (1 MIG GPU) running ONE finetune_quality (peptide)
+# experiment. Select which mode to run via the MODE_ID variable below
+# (or override at submit time with `sbatch --export=ALL,MODE_ID=2 ...`):
+#   0) A2D2 (Ours)                – with full planner (alternating)
+#   1) A2D2 w/o quality           – --disable_planner
+#   2) A2D2 w/o insertion planner – --disable_insertion_planner
+#   3) A2D2 w/o unmasking planner – --disable_unmasking_planner
+#
+# The job trains the selected mode then evaluates the resulting
+# checkpoint on the same GPU.
+# =====================================================================
+# Abort the job as soon as any command exits non-zero.
+set -e
+# --- Mode selection ---------------------------------------------------
+# Which experiment to run (0-3). Override with `--export=ALL,MODE_ID=N`.
+MODE_ID="${MODE_ID:-0}"
+# Run prefix: YYYYMMDD + SLURM job ID
+# (when SLURM_JOB_ID is unset the suffix falls back to "local<HHMMSS>")
+DATE_STAMP=$(date +%Y%m%d)
+PREFIX="${DATE_STAMP}_job${SLURM_JOB_ID:-local$(date +%H%M%S)}"
+# Default protein target (must be defined before path definitions below)
+PROT_NAME=tfr
+# --- Paths ------------------------------------------------------------
+# Repo root is resolved at submit time so the script works from any clone:
+#   - set A2D2_ROOT explicitly, OR
+#   - run `sbatch` from the repo root (SLURM sets SLURM_SUBMIT_DIR), OR
+#   - fall back to this script's location (a2d2_pep/scripts/ -> two levels up).
+if [ -n "${A2D2_ROOT:-}" ]; then
+    HOME_LOC="$A2D2_ROOT"
+elif [ -n "${SLURM_SUBMIT_DIR:-}" ]; then
+    HOME_LOC="$SLURM_SUBMIT_DIR"
+else
+    HOME_LOC="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+fi
+SCRIPT_LOC="$HOME_LOC/a2d2_pep"
+LOG_LOC="$HOME_LOC/logs"
+SAVE_DIR="$HOME_LOC/checkpoints/finetune_test_peptides_${PROT_NAME}"
+RESULTS_DIR="$HOME_LOC/results/peptide_test_ablation_${PROT_NAME}"
+# All later relative paths are resolved from the a2d2_pep/ directory.
+cd "$SCRIPT_LOC"
+# BASE_PATH is passed as --base_path to finetune_quality.py: it's used
+# to build the plot output path at $BASE_PATH/flexible/results/<run_name>
+# (see finetune_quality.py:421). The pretrained checkpoint is now passed
+# explicitly via --checkpoint_path below, so base_path no longer needs
+# to follow the legacy /scratch layout.
+BASE_PATH="${A2D2_BASE_PATH:-$HOME_LOC}"
+mkdir -p "$LOG_LOC" "$SAVE_DIR" "$RESULTS_DIR"
+# --- Environment setup ------------------------------------------------
+# Do NOT hardcode your W&B key. Either `wandb login` once on the cluster,
+# export WANDB_API_KEY in your shell/SLURM environment before submitting,
+# or set WANDB_MODE=offline to skip logging entirely.
+export WANDB_DIR=$HOME_LOC/.wandb
+export WANDB_CONFIG_DIR=$HOME_LOC/.config/wandb
+export WANDB_CACHE_DIR=$HOME_LOC/.cache/wandb
+# Stop wandb from hijacking stdout/stderr (its default fd-redirect mode sends
+# all output to wandb/run-*/files/output.log and freezes the RUN_LOG below).
+# With console off, everything flows to the `>> "$RUN_LOG" 2>&1` redirect.
+export WANDB_CONSOLE=off
+mkdir -p "$WANDB_DIR" "$WANDB_CONFIG_DIR" "$WANDB_CACHE_DIR"
+export TRITON_CACHE_DIR=$HOME_LOC/.triton/cache
+mkdir -p "$TRITON_CACHE_DIR"
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+# Activate conda env. Override CONDA_ROOT to point at your conda/miniconda
+# install, or just have `conda` on PATH; override CONDA_ENV if your env name
+# differs from the one created by environment.yml.
+CONDA_ENV="${CONDA_ENV:-a2d2}"
+if [ -n "${CONDA_ROOT:-}" ]; then
+    source "$CONDA_ROOT/bin/activate" "$CONDA_ENV"
+elif command -v conda >/dev/null 2>&1; then
+    source "$(conda info --base)/bin/activate" "$CONDA_ENV"
+else
+    echo "ERROR: conda not found; set CONDA_ROOT to your miniconda install." >&2
+    exit 1
+fi
+# Interpreter of the activated env; used for both the train and the eval command.
+PYTHON_EXECUTABLE=$(which python)
+# Pretrained base checkpoint
+PRETRAINED_CKPT="$HOME_LOC/pretrained/anylength_pep.ckpt"
+# --- Shared training hyperparameters ----------------------------------
+# Passed to finetune_quality.py in the Train step, for every mode.
+COMMON_ARGS=(
+    --base_path "$BASE_PATH"
+    --checkpoint_path "$PRETRAINED_CKPT"
+    --prot_name "$PROT_NAME"
+    --noise_removal
+    --wdce_num_replicates 8
+    --pool_size 100
+    --pool_refresh_fraction 1.0
+    --buffer_size 50
+    --batch_size 200
+    --total_num_steps 256
+    --num_iter 20
+    --resample_every_n_step 10
+    --num_epochs 1000
+    --save_every_n_epochs 50
+    --reset_every_n_step 1
+    --alpha 0.1
+    --no_mcts
+    --schedule_warmup_epochs 20
+    --alternation_frequency 5
+    --num_remasking 3
+    --quality_threshold 0.2
+    --training_mini_batch_size 10
+    --max_length 256
+    --eval_every_n_epochs 50
+    --min_peptide_bonds 4
+    --grad_clip
+    --seed 42
+)
+# --- Shared evaluation hyperparameters --------------------------------
+# Passed to evaluate_peptide_table.py in the Evaluate step, for every mode.
+EVAL_COMMON_ARGS=(
+    --pretrained_ckpt "$PRETRAINED_CKPT"
+    --num_samples 50
+    --batch_size 200
+    --max_length 256
+    --total_num_steps 256
+    --num_remasking 3
+    --quality_threshold 0.2
+    --prot_name "$PROT_NAME"
+    --seed 42
+)
+# =====================================================================
+# Pick experiment from $MODE_ID
+# =====================================================================
+# MODE names the run and its results subdirectory; EXTRA_ARGS carries the
+# ablation flag and is appended to both the train and the eval command.
+case "$MODE_ID" in
+    0) MODE="with_planner";         EXTRA_ARGS=() ;;
+    1) MODE="no_planner";           EXTRA_ARGS=(--disable_planner) ;;
+    2) MODE="no_insertion_planner"; EXTRA_ARGS=(--disable_insertion_planner) ;;
+    3) MODE="no_unmasking_planner"; EXTRA_ARGS=(--disable_unmasking_planner) ;;
+    *) echo "Unknown MODE_ID=$MODE_ID (expected 0-3)"; exit 1 ;;
+esac
+RUN_NAME="${PREFIX}_peptide_${PROT_NAME}_${MODE}"
+RUN_LOG="$LOG_LOC/${RUN_NAME}.log"
+RUN_SAVE_DIR="$SAVE_DIR/${RUN_NAME}"
+RESULTS_SUBDIR="$RESULTS_DIR/${MODE}"
+mkdir -p "$RUN_SAVE_DIR" "$RESULTS_SUBDIR"
+# Run summary, written to the job's stdout (the SLURM --output file under sbatch).
+echo "=== Peptide finetune (MODE_ID=$MODE_ID) ==="
+echo "Job: ${SLURM_JOB_ID}  Node: $SLURM_NODELIST"
+echo "Mode: $MODE"
+echo "Save dir: $RUN_SAVE_DIR"
+echo "Results dir: $RESULTS_SUBDIR"
+echo "Python: $PYTHON_EXECUTABLE"
+echo "CUDA_VISIBLE_DEVICES: ${CUDA_VISIBLE_DEVICES:-(unset)}"
+# =====================================================================
+# Train
+# =====================================================================
+# Interpreter and script paths are quoted so a repo or conda path containing
+# spaces is not word-split. Training output is appended to $RUN_LOG.
+"$PYTHON_EXECUTABLE" "$SCRIPT_LOC/finetune_quality.py" \
+    "${COMMON_ARGS[@]}" \
+    --devices 1 \
+    "${EXTRA_ARGS[@]}" \
+    --save_path_dir "$RUN_SAVE_DIR" \
+    >> "$RUN_LOG" 2>&1
+echo "Training finished for $MODE. Log: $RUN_LOG"
+# =====================================================================
+# Evaluate
+# =====================================================================
+# finetune_quality.py saves to $RUN_SAVE_DIR/<auto_run_name>/last.ckpt,
+# so glob the run_name subdir and take the most recently modified match.
+RUN_CKPT=$(ls -t "$RUN_SAVE_DIR"/*/last.ckpt 2>/dev/null | head -1)
+if [ -z "$RUN_CKPT" ]; then
+    # Reported on stderr; the job exits non-zero because there is nothing to evaluate.
+    echo "No checkpoint found in $RUN_SAVE_DIR — skipping eval." >&2
+    exit 1
+fi
+echo "Evaluating checkpoint: $RUN_CKPT"
+"$PYTHON_EXECUTABLE" "$SCRIPT_LOC/evaluate_peptide_table.py" \
+    --checkpoint_path "$RUN_CKPT" \
+    "${EVAL_COMMON_ARGS[@]}" \
+    "${EXTRA_ARGS[@]}" \
+    --output_dir "$RESULTS_SUBDIR" \
+    --device cuda:0 \
+    >> "$RESULTS_SUBDIR/${RUN_NAME}_eval.log" 2>&1
+echo "Eval finished for $MODE. CSV: $RESULTS_SUBDIR/eval_metrics_${MODE}_${PROT_NAME}.csv"
+conda deactivate

a2d2_pep/scripts/train_pep.sh ADDED Viewed

	@@ -0,0 +1,93 @@

+#!/bin/bash
+#SBATCH --job-name=a2d2-pep-pretrain
+#SBATCH --partition=dgx-b200
+#SBATCH --nodes=1
+#SBATCH --gpus-per-node=4
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=512GB
+#SBATCH --time=7-00:00:00
+# SLURM's own catch-file (anything printed before the exec redirect below, plus
+# slurm-infra messages). Relative to the submit dir, so submit this script from
+# the a2d2_pep/ directory; the real run output is redirected via exec below.
+#SBATCH --output=logs/slurm/%x_%j.out
+#SBATCH --error=logs/slurm/%x_%j.err
+#
+# Pretrain the any-length insertion MDM on ~11M peptide SMILES on a dgx-b200 node.
+# Submit with:  sbatch scripts/train_pep.sh   (from the a2d2_pep/ directory).
+#
+# DDP is launched by SLURM: one srun task per GPU. --gpus-per-node and
+# --ntasks-per-node must match; change both together (and they override the
+# training.devices value baked into config_pep.yaml via the hydra override below).
+DATE=$(date +%Y%m%d)
+SPECIAL_PREFIX='a2d2-peptide'
+# Resolve a2d2_pep/ (which holds train.py + config_pep.yaml) so paths are
+# repo-relative. This script lives in a2d2_pep/scripts/, so the direct-run
+# fallback goes one level up. Under sbatch, BASH_SOURCE points at the spooled
+# copy, so we rely on SLURM_SUBMIT_DIR (submit from the a2d2_pep/ directory).
+if [ -n "${SLURM_SUBMIT_DIR:-}" ]; then
+    SCRIPT_DIR="$SLURM_SUBMIT_DIR"
+else
+    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+fi
+cd "$SCRIPT_DIR"
+# Auto-detect GPUs from the SLURM allocation (falls back to 4 for `bash` runs).
+DEVICES=${SLURM_GPUS_ON_NODE:-${SLURM_GPUS_PER_NODE:-4}}
+NTASKS=${SLURM_NTASKS_PER_NODE:-$DEVICES}
+NODES=${SLURM_NNODES:-1}
+LOG_LOC="$SCRIPT_DIR/logs"
+mkdir -p "$LOG_LOC/slurm"
+exec > "${LOG_LOC}/${DATE}_${SPECIAL_PREFIX}_${SLURM_JOB_ID:-local}.log" 2>&1
+# ---------------------------------------------------------------------------
+# Weights & Biases: log in once on your machine before running this script with
+#   `wandb login`  (or `export WANDB_API_KEY=<your-key>`).
+# Do NOT hardcode your API key here. To disable W&B entirely, uncomment:
+# export WANDB_MODE=disabled
+# ---------------------------------------------------------------------------
+export PYTORCH_ALLOC_CONF=expandable_segments:True
+# Activate the conda env that has the deps (torch / pytorch_lightning / hydra).
+# The batch shell does NOT source ~/.bashrc, so conda is not on PATH. Override
+# CONDA_ROOT to point at your conda/miniconda install, or just have `conda` on
+# PATH; override CONDA_ENV if your env name differs from the one created by
+# environment.yml.
+CONDA_ENV="${CONDA_ENV:-a2d2}"
+if [ -n "${CONDA_ROOT:-}" ]; then
+    source "$CONDA_ROOT/bin/activate" "$CONDA_ENV"
+elif command -v conda >/dev/null 2>&1; then
+    source "$(conda info --base)/bin/activate" "$CONDA_ENV"
+else
+    echo "ERROR: conda not found; set CONDA_ROOT to your miniconda install." >&2
+    exit 1
+fi
+# --- Distributed / NCCL setup (single node, intra-node NVLink) --------------
+ETH_IFACE=$(ip -o -4 addr list | grep -v "127.0.0.1" | grep -E "ens|eth|enp|bond" | head -1 | awk '{print $2}')
+if [ -z "$ETH_IFACE" ]; then
+    ETH_IFACE=$(ip -o -4 addr list | grep -v "127.0.0.1" | grep -v "ibp" | head -1 | awk '{print $2}')
+fi
+export NCCL_IB_DISABLE=1
+export NCCL_SOCKET_FAMILY=AF_INET
+export NCCL_SOCKET_IFNAME=$ETH_IFACE
+export NCCL_P2P_LEVEL=NVL
+export MASTER_ADDR=$(scontrol show hostnames "${SLURM_NODELIST:-$(hostname)}" | head -n 1)
+export MASTER_PORT=$(shuf -i 15000-59999 -n 1)
+export NODE_RANK=${SLURM_NODEID:-0}
+echo "=== a2d2 peptide pretraining (dgx-b200) ==="
+echo "Job ID: ${SLURM_JOB_ID:-local}  Node: ${SLURM_NODELIST:-$(hostname)}  GPUs: $DEVICES  Tasks: $NTASKS"
+# --task pep makes train.py load config_pep.yaml; the hydra overrides pin
+# devices/nodes to the SLURM allocation so the two never drift apart.
+srun --ntasks-per-node=$NTASKS python train.py --task pep \
+    training.devices=$DEVICES \
+    training.nodes=$NODES
+conda deactivate

a2d2_pep/train.py ADDED Viewed

	@@ -0,0 +1,216 @@

+import torch
+import pytorch_lightning as pl
+from pytorch_lightning.loggers import WandbLogger
+from pytorch_lightning.callbacks import ModelCheckpoint
+import os
+import sys
+import argparse
+import hydra
+from omegaconf import OmegaConf
+from datetime import datetime
+# Directory containing this file and the config_*.yaml files (used by Hydra below).
+CONFIG_DIR = os.path.dirname(os.path.abspath(__file__))
+# Add the repo root (A2D2/) to sys.path so top-level packages like lightning_modules resolve.
+sys.path.insert(0, os.path.dirname(CONFIG_DIR))
+import wandb
+from lightning_modules import AnyOrderInsertionFlowModule
+torch.set_printoptions(threshold=10_000)
+torch.set_float32_matmul_precision("high")
+# Disable DDP optimizer due to incompatibility with flex_attention higher-order ops
+torch._dynamo.config.optimize_ddp = False
+def train(config):
+	wandb_logger = None
+	# set the random seed
+	pl.seed_everything(42)
+	torch.manual_seed(42)
+	# Only initialize wandb on rank 0 to avoid multiple runs
+	if int(os.environ.get("LOCAL_RANK", 0)) == 0:
+		wandb.init(
+			project=config.wandb.project,
+			name=config.wandb.name,
+			config=OmegaConf.to_container(config, resolve=True),  # Convert to dict
+			dir=config.wandb.path
+		)
+		wandb_logger = WandbLogger(
+				project=wandb.run.project,
+				name=wandb.run.name,
+				log_model=False,  # Disable checkpoint uploading to save disk space
+			)
+	# Modify config to add timestamp to checkpoint directory
+	OmegaConf.set_struct(config, False)
+	time_string = datetime.now().strftime("%Y%m%d-%H%M%S")
+	config.training.checkpoint_dir = os.path.join(
+		config.training.checkpoint_dir, time_string
+	)
+	OmegaConf.set_struct(config, True)
+	# Create checkpoint directory
+	os.makedirs(config.training.checkpoint_dir, exist_ok=True)
+	# Setup data module - check if using HuggingFace dataset
+	if hasattr(config, 'hf_dataset'):
+		# Imported lazily: the HF/SAFE path is only used by the molecule configs,
+		# which keep mol_dataset.py (and its `safe` dependency) in a2d2_mol/.
+		from mol_dataset import setup_hf_data_and_update_config
+		print(f"Using HuggingFace dataset: {config.hf_dataset.name}")
+		data_module = setup_hf_data_and_update_config(
+			config,
+			dataset_name=config.hf_dataset.name,
+			smiles_column=config.hf_dataset.get('smiles_column', 'smiles')
+		)
+	else:
+		# Imported lazily: the local (arrow) path is used by the peptide config,
+		# which keeps dataloading_for_dynamic_batching.py in a2d2_pep/.
+		from data.dataloading_for_dynamic_batching import setup_data_and_update_config
+		print("Using local dataset")
+		data_module = setup_data_and_update_config(config)
+	module = AnyOrderInsertionFlowModule(config)
+	# Initialize trainer
+	# Configure trainer arguments
+	# Map torch_dtype to Lightning precision
+	dtype_str = config.model.get('torch_dtype', 'bfloat16')
+	precision_map = {
+		'float32': '32-true',
+		'float16': '16-mixed',
+		'bfloat16': 'bf16-mixed'
+	}
+	precision = precision_map.get(dtype_str, 'bf16-mixed')
+	trainer_kwargs = dict(
+		num_nodes=config.training.nodes,
+		accelerator="gpu",
+		devices=config.training.devices,
+		strategy="ddp",
+		precision=precision,
+		accumulate_grad_batches=(
+			config.training.batch_size
+			// (
+				config.training.per_gpu_batch_size
+				* config.training.nodes
+				* config.training.devices
+			)
+		),
+		log_every_n_steps=10,
+		enable_checkpointing=True,
+		default_root_dir=config.training.checkpoint_dir,
+		gradient_clip_val=1.0,
+	)
+	# Only one of max_steps or max_epochs will be used
+	if config.training.max_steps is not None:
+		trainer_kwargs["max_steps"] = config.training.max_steps
+	elif config.training.num_epochs is not None:
+		trainer_kwargs["max_epochs"] = config.training.num_epochs
+		config.training.max_steps = config.training.max_steps
+	else:
+		raise ValueError(
+			"Either max_steps or num_epochs must be specified in the config"
+		)
+	if config.training.warmup_steps is None:
+		config.training.warmup_steps = int(config.training.max_steps * 0.01)
+	# Add ModelCheckpoint callback to save the checkpoint when validation loss is at a new low
+	checkpoint_callback = ModelCheckpoint(
+		monitor="train/total_loss",
+		mode="min",
+		save_top_k=config.training.save_top_k,
+		save_last=True,
+		filename="epoch-{epoch:02d}-train_loss-{train/total_loss:.4f}",
+		dirpath=config.training.checkpoint_dir,
+		# Don't use val_loss in filename for periodic saves - causes failures when val doesn't run
+		auto_insert_metric_name=False
+	)
+	# Add separate callback for periodic saves (no val_loss dependency). Use
+	# step-based saves for streaming datasets (save_every_n_steps) and epoch-based
+	# saves otherwise (save_every_n_epochs); whichever the config provides.
+	save_every_n_steps = config.training.get('save_every_n_steps', None)
+	save_every_n_epochs = config.training.get('save_every_n_epochs', None)
+	if save_every_n_steps is not None:
+		periodic_checkpoint_callback = ModelCheckpoint(
+			save_top_k=-1,  # Save all periodic checkpoints
+			filename="step-{step:08d}",
+			dirpath=config.training.checkpoint_dir,
+			every_n_train_steps=save_every_n_steps,
+			auto_insert_metric_name=False
+		)
+	elif save_every_n_epochs is not None:
+		periodic_checkpoint_callback = ModelCheckpoint(
+			save_top_k=-1,  # Save all periodic checkpoints
+			filename="epoch-{epoch:02d}",
+			dirpath=config.training.checkpoint_dir,
+			every_n_epochs=save_every_n_epochs,
+			auto_insert_metric_name=False
+		)
+	else:
+		raise ValueError(
+			"Either save_every_n_steps or save_every_n_epochs must be specified in the config"
+		)
+	trainer_kwargs["callbacks"] = [checkpoint_callback, periodic_checkpoint_callback]
+	if wandb_logger is not None:
+		trainer_kwargs["logger"] = wandb_logger
+	trainer = pl.Trainer(**trainer_kwargs)
+	# Train the model
+	ckpt_path = None
+	if "resume_path" in config.training:
+		ckpt_path = config.training.resume_path
+	trainer.fit(module,
+             datamodule=data_module,
+             ckpt_path=ckpt_path)
+	# Only finish wandb on rank 0
+	if int(os.environ.get("LOCAL_RANK", 0)) == 0:
+		wandb.finish()
+if __name__ == '__main__':
+	# Entry point: a thin argparse layer picks the config file name, then the
+	# remaining command-line tokens are handed to Hydra as overrides.
+	# Parse arguments to get config name
+	parser = argparse.ArgumentParser()
+	parser.add_argument('--config_name', type=str, default='config',
+	                   help='Name of the config file to use')
+	parser.add_argument('--task', type=str, default=None,
+	                   help='Task name (uses config_{task}.yaml)')
+	# Parse known args (hydra will handle the rest)
+	args, unknown = parser.parse_known_args()
+	# Determine config name from task or config_name (--task wins when both are given)
+	if args.task:
+		config_name = f'config_{args.task}'
+	else:
+		config_name = args.config_name
+	print(f"Using config: {config_name}.yaml")
+	# Add config name to Hydra overrides (this persists across DDP subprocesses)
+	# Skipped when the caller already passed a bare `--config-name` token or
+	# the exact same `--config-name=<name>` token.
+	if '--config-name' not in unknown and f'--config-name={config_name}' not in unknown:
+		unknown.insert(0, f'--config-name={config_name}')
+	# Reconstruct sys.argv for hydra (drops --task / --config_name, which Hydra does not know)
+	sys.argv = [sys.argv[0]] + unknown
+	# Define main function with default config (will be overridden by command line)
+	@hydra.main(version_base=None,
+	           config_path=CONFIG_DIR,
+	           config_name='config')
+	def main(config):
+		"""Main entry point for training"""
+		train(config)
+	main()

assets/a2d2.gif ADDED Viewed

Git LFS Details

SHA256: 178ca7850ca39365492fea70cfc5e4f2e8653ceeda9a13dcd0438af61e1a83bb
Pointer size: 132 Bytes
Size of remote file: 7.83 MB

demo/quality_inference_demo.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

environment.yml ADDED Viewed

	@@ -0,0 +1,57 @@

+# Conda environment shared across the molecule, peptide, and language experiments.
+# Create with:
+#     conda env create -f environment.yml
+#     conda activate a2d2
+#
+# NOTE: flash-attn is hardware-specific and must be built against your installed torch
+# and CUDA, so it is not listed below. It is imported by the shared transformer backbone
+# (model/casual_transformer.py, model/rotary.py) and is required for all experiments.
+# After creating the env, install it with:
+#     pip install flash-attn==2.8.3 --no-build-isolation
+# Adjust pytorch-cuda below to match your CUDA toolkit / GPU.
+name: a2d2
+channels:
+  - pytorch
+  - nvidia
+  - conda-forge
+dependencies:
+  - python=3.11
+  - pip
+  - pytorch
+  - pytorch-cuda=12.1
+  - rdkit=2023.9.6
+  - jupyterlab          # for demo/quality_inference_demo.ipynb
+  - pip:
+      # --- core scientific / DL stack ---
+      - numpy==1.26.4
+      - scipy==1.17.1
+      - pandas==2.1.4
+      - scikit-learn==1.8.0
+      - pytorch-lightning==2.6.0
+      - lightning==2.6.1
+      - transformers==4.55.4
+      - tokenizers==0.21.4
+      - safetensors==0.7.0
+      - accelerate==0.33.0
+      - peft==0.15.1            # LoRA adapters (language experiment)
+      - datasets==2.19.2
+      - huggingface-hub==0.36.2
+      - einops==0.8.2
+      - timm==1.0.26
+      - omegaconf==2.3.0
+      - wandb==0.26.1
+      # --- molecule experiment ---
+      - safe-mol==0.1.14
+      - datamol==0.12.5
+      - PyTDC==1.1.15
+      # --- peptide experiment ---
+      - SmilesPE==0.0.3
+      - fair-esm==2.0.0
+      - xgboost==3.2.0
+      # --- plotting / utilities ---
+      - matplotlib==3.10.6
+      - seaborn==0.13.2
+      - tqdm==4.67.1
+      - joblib==1.5.3
+      - loguru==0.7.3
+      - fsspec==2024.3.1

lightning_modules/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+# Public API of the lightning_modules package.
+# MaskedDiffusionModule and AnyOrderInsertionFlowModule are imported eagerly;
+# AutoregressiveModule is listed in __all__ but only imported on first access
+# through the module-level __getattr__ below.
+from .mdm import MaskedDiffusionModule
+from .any_order import AnyOrderInsertionFlowModule
+__all__ = [
+    "MaskedDiffusionModule",
+    "AutoregressiveModule",
+    "AnyOrderInsertionFlowModule",
+]
+def __getattr__(name):
+    """Lazily resolve ``AutoregressiveModule``; raise AttributeError for any other name."""
+    if name == "AutoregressiveModule":
+        # Deferred import: .autoregressive is loaded only when actually requested.
+        from .autoregressive import AutoregressiveModule
+        return AutoregressiveModule
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

lightning_modules/any_length_remask.py ADDED Viewed

	@@ -0,0 +1,801 @@

+import os
+import torch
+import torch.nn as nn
+import pytorch_lightning as pl
+from omegaconf import DictConfig
+import torch.nn.functional as F
+from model.transformer import AnyOrderMaskInsertionFlow
+from model.interpolant import AnyOrderMaskInsertionInterpolant, ModelPrediction
+from .bregman import jump_kernel_elbo, mse
+from .schedule import get_schedule_from_config
+from lightning_modules.any_order import AnyOrderInsertionFlowModule
+from model.model_wrapper import RemaskingAnyOrder
+from sampling import _sample_tokens
+import re
+from typing import Dict, Any
+from dataclasses import dataclass
+def strip_orig_mod_keys(state_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Returns a new state_dict where any key containing '._orig_mod.' is replaced
+    by removing the '_orig_mod' segment, e.g.
+      'model._orig_mod.vocab_embed.embedding'
+    becomes
+      'model.vocab_embed.embedding'
+    The input dict is not modified; values are carried over as-is (not copied).
+    Only segments with a dot on both sides are matched, so a key that *starts*
+    with '_orig_mod.' is left unchanged.
+    """
+    new_state_dict: Dict[str, Any] = {}
+    for key, value in state_dict.items():
+        # remove all occurrences of '._orig_mod.'
+        clean_key = re.sub(r"\._orig_mod\.", ".", key)
+        new_state_dict[clean_key] = value
+    return new_state_dict
+@torch.no_grad()
+def _binary_auc(scores: torch.Tensor, labels: torch.Tensor) -> float:
+    """Rank-based AUROC (Mann-Whitney U statistic).
+    AUC = P(score[pos] > score[neg]); 0.5 means no discrimination. Returns NaN
+    when only one class is present (AUC undefined). Ties are not averaged, which
+    is fine for continuous logits used here.
+    Both inputs are flattened and cast to float; entries with label == 1 are
+    the positives and every other entry counts as a negative.
+    """
+    scores = scores.float().reshape(-1)
+    labels = labels.float().reshape(-1)
+    n_pos = labels.sum()
+    n_neg = labels.numel() - n_pos
+    if n_pos == 0 or n_neg == 0:
+        return float("nan")
+    # Assign 1-based ranks in ascending score order.
+    order = torch.argsort(scores)
+    ranks = torch.empty_like(scores)
+    ranks[order] = torch.arange(1, scores.numel() + 1, device=scores.device, dtype=scores.dtype)
+    # U statistic of the positives, normalised by the number of (pos, neg) pairs.
+    auc = (ranks[labels == 1].sum() - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)
+    return auc.item()
+class AnyOrderInsertionFlowModuleFT(AnyOrderInsertionFlowModule):
+    """
+    Wrapper around AnyOrderInsertionFlowModule that adds adaptive schedule model
+    for fine-tuning. Can load a pretrained AnyOrderInsertionFlowModule checkpoint
+    and add the schedule model on top.
+    """
+    def __init__(self, config, args, pretrained_checkpoint, insertion_planner=False):
+        """Build the base module, optionally load pretrained weights, then attach the planner.
+
+        Args:
+            config: model/training config forwarded to the parent constructor.
+            args: run arguments stored on ``self.args`` (excluded from saved hyperparameters).
+            pretrained_checkpoint: path to a checkpoint to load, or None to skip loading.
+            insertion_planner: flag stored on the module and forwarded to ``RemaskingAnyOrder``.
+        """
+        # Initialize parent class first
+        super().__init__(config)
+        self.args = args
+        self.insertion_planner = insertion_planner
+        # Save hyperparameters for this class (overrides parent's save)
+        self.save_hyperparameters(ignore=['pretrained_checkpoint', 'args'])
+        # Load pretrained model weights BEFORE initializing planner to avoid circular reference
+        if pretrained_checkpoint is not None:
+            self.load_pretrained_model(pretrained_checkpoint)
+        # Initialize adaptive schedule model AFTER loading pretrained weights
+        # The planner receives this module itself as its backbone.
+        self.planner = RemaskingAnyOrder(
+            backbone=self,
+            d_model=self.config.model.hidden_size,
+            insertion_planner=insertion_planner)
+    def load_pretrained_model(self, checkpoint_path: str):
+        """
+        Load pretrained AnyOrderInsertionFlowModule weights.
+        Only loads the base model and interpolant, not the schedule model.
+
+        Keys starting with 'planner.' are dropped before loading, and the load is
+        non-strict, so mismatches are printed as warnings rather than raised.
+        If ``config.training.freeze_base_model`` is true, every parameter whose
+        name does not start with 'planner.' gets ``requires_grad = False``.
+
+        Args:
+            checkpoint_path: path to a checkpoint file; either a dict with a
+                'state_dict' entry or a bare state dict.
+        """
+        print(f"Loading pretrained model from {checkpoint_path}")
+        # NOTE: weights_only=False unpickles arbitrary objects -- only load trusted checkpoints.
+        checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)
+        # Extract state dict - handle different checkpoint formats
+        if 'state_dict' in checkpoint:
+            state_dict = checkpoint['state_dict']
+        else:
+            state_dict = checkpoint
+        # Strip _orig_mod keys if present
+        state_dict = strip_orig_mod_keys(state_dict)
+        # Filter out planner keys (if any exist from a previous FT checkpoint)
+        base_state_dict = {k: v for k, v in state_dict.items()
+                          if not k.startswith('planner.')}
+        # Load the base model weights
+        # Use strict=False to ignore missing schedule_model keys
+        incompatible_keys = self.load_state_dict(base_state_dict, strict=False)
+        # Filter out expected missing planner keys for cleaner output
+        unexpected_missing = [k for k in incompatible_keys.missing_keys
+                            if not k.startswith('planner.')]
+        planner_missing = [k for k in incompatible_keys.missing_keys
+                          if k.startswith('planner.')]
+        if unexpected_missing:
+            print(f"Warning: Unexpected missing keys from pretrained checkpoint: {unexpected_missing}")
+        if planner_missing:
+            # NOTE(review): __init__ calls this method before self.planner is created,
+            # so on that path no 'planner.' keys should be reported missing -- confirm.
+            print(f"Note: Planner will be trained from scratch ({len(planner_missing)} parameters)")
+        if incompatible_keys.unexpected_keys:
+            print(f"Warning: Unexpected keys in pretrained checkpoint: {incompatible_keys.unexpected_keys}")
+        # Freeze base model if specified
+        if self.config.training.get('freeze_base_model', False):
+            print("Freezing base model parameters")
+            for name, param in self.named_parameters():
+                if not name.startswith('planner.'):
+                    param.requires_grad = False
+    def forward(self, x, t, return_features=False):
+        """Delegate unchanged to the parent class forward (the planner is not involved here)."""
+        # Use parent class forward method
+        return super().forward(x, t, return_features=return_features)
+    def training_loss(self, x1, t):
+        """Return the parent's ``(unmask_loss, insertion_loss, total_loss)`` tuple unchanged."""
+        # Use parent class training_loss for base model loss
+        # Planner is trained separately via loss_planner_flexible with reward gradients
+        unmask_loss, insertion_loss, total_loss = super().training_loss(x1, t)
+        return unmask_loss, insertion_loss, total_loss
+    def training_step(self, batch, batch_idx):
+        """Compute and log the base-model loss for one training batch.
+
+        ``batch`` may be a tensor or a dict carrying the tensor under "input_ids".
+        Returns the total loss; the two component losses are only logged.
+        """
+        # Extract input data
+        if isinstance(batch, dict):
+            batch = batch["input_ids"]
+        x1 = batch
+        # One time value per sequence in the batch, on the batch's device.
+        t = self.sample_time(x1.shape[0], x1.device)
+        # Calculate the base model loss (planner trained separately, not here)
+        unmask_loss, len_loss, loss = self.training_loss(x1, t)
+        # Log component losses
+        self.log("train/unmask_loss", unmask_loss, prog_bar=True)
+        self.log("train/len_loss", len_loss, prog_bar=True)
+        self.log("train/total_loss", loss, prog_bar=True)
+        return loss
+    def validation_step(self, batch, batch_idx):
+        """Compute and log the base-model loss for one validation batch.
+
+        Mirrors ``training_step`` but logs under "val/..." and "val_loss" with
+        ``sync_dist=True``. Time values are freshly sampled on each call.
+        """
+        if isinstance(batch, dict):
+            batch = batch["input_ids"]
+        x1 = batch
+        t = self.sample_time(x1.shape[0], x1.device)
+        unmask_loss, len_loss, loss = self.training_loss(x1, t)
+        self.log("val/unmask_loss", unmask_loss, prog_bar=True, sync_dist=True)
+        self.log("val/len_loss", len_loss, prog_bar=True, sync_dist=True)
+        self.log("val_loss", loss, prog_bar=True, sync_dist=True)
+        return loss
+    @classmethod
+    def load_from_checkpoint(cls, checkpoint_path, map_location=None, strict=True, **kwargs):
+        """
+        Custom checkpoint loading that handles finetuned checkpoints wrapped by PeptideFinetuner.
+        Extracts config from original pretrained checkpoint and loads finetuned weights.
+
+        A checkpoint counts as "wrapped" when any state_dict key starts with
+        'policy_model.'. For those, the model is rebuilt from the stored config
+        and args, the 'policy_model.' prefix is stripped, and weights are loaded
+        non-strictly. Any other checkpoint raises NotImplementedError.
+
+        Args:
+            checkpoint_path: path to the finetuned checkpoint file.
+            map_location: forwarded to ``torch.load``; 'cpu' when None.
+            strict: accepted for signature compatibility; not read by this implementation.
+            **kwargs: accepted for signature compatibility; not read by this implementation.
+
+        Returns:
+            The constructed model with finetuned weights loaded.
+
+        Raises:
+            ValueError: if 'args' is missing from the hyperparameters, or the
+                fallback original checkpoint has no 'config'.
+            NotImplementedError: if the checkpoint has no 'policy_model.' keys.
+        """
+        print(f"Loading finetuned checkpoint from {checkpoint_path}")
+        # NOTE: weights_only=False unpickles arbitrary objects -- only load trusted checkpoints.
+        checkpoint = torch.load(checkpoint_path, map_location=map_location or 'cpu', weights_only=False)
+        # Check if this is a wrapped checkpoint (from PeptideFinetuner)
+        hparams = checkpoint.get('hyper_parameters', {})
+        state_dict = checkpoint.get('state_dict', {})
+        # Check for policy_model prefix in state_dict (indicates PeptideFinetuner wrapper)
+        has_policy_prefix = any(k.startswith('policy_model.') for k in state_dict.keys())
+        if has_policy_prefix:
+            # Detect model type (molecule vs peptide) based on vocab size in checkpoint
+            # Molecule models have vocab size ~1882, peptide models have ~587
+            vocab_size = None
+            for k, v in state_dict.items():
+                if 'vocab_embed.embedding' in k:
+                    vocab_size = v.shape[0]
+                    break
+            is_molecule_model = vocab_size is not None and vocab_size > 1000
+            model_type = "MolFinetuner" if is_molecule_model else "PeptideFinetuner"
+            print(f"Detected wrapped finetuned checkpoint ({model_type}, vocab_size={vocab_size})")
+            # Extract args from hyperparameters
+            if 'args' not in hparams:
+                raise ValueError(f"Cannot find 'args' in hyperparameters. This checkpoint may not be from {model_type}.")
+            args = hparams['args']
+            print(f"Found args in hyperparameters, type: {type(args)}")
+            # Get original checkpoint path from args
+            # Handle both Namespace (hasattr) and dict (get) access patterns
+            original_ckpt_path = None
+            if hasattr(args, 'checkpoint_path'):
+                original_ckpt_path = args.checkpoint_path
+            elif isinstance(args, dict) and 'checkpoint_path' in args:
+                original_ckpt_path = args['checkpoint_path']
+            # If checkpoint_path is not set or is None, use default pretrained checkpoint
+            # Select appropriate default based on detected model type
+            if original_ckpt_path is None:
+                # Two directory levels above this file, i.e. the parent of lightning_modules/.
+                _repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+                if is_molecule_model:
+                    original_ckpt_path = os.path.join(_repo_root, 'pretrained', 'anylength_mol.ckpt')
+                    print(f"Warning: checkpoint_path not found in args, using default molecule pretrained checkpoint")
+                else:
+                    original_ckpt_path = os.path.join(_repo_root, 'pretrained', 'anylength_pep.ckpt')
+                    print(f"Warning: checkpoint_path not found in args, using default peptide pretrained checkpoint")
+            # Try to load config directly from checkpoint first (new checkpoints)
+            # Fall back to loading from original checkpoint (old checkpoints)
+            if 'config' in checkpoint:
+                print("Found config directly in checkpoint")
+                config = checkpoint['config']
+            else:
+                print(f"Config not in checkpoint, loading from original checkpoint: {original_ckpt_path}")
+                # Load config from original pretrained checkpoint
+                orig_ckpt = torch.load(original_ckpt_path, map_location='cpu', weights_only=False)
+                if 'config' not in orig_ckpt:
+                    raise ValueError(f"Original checkpoint {original_ckpt_path} does not contain config")
+                config = orig_ckpt['config']
+            # Ensure adaptive schedule is enabled
+            # Need to disable struct mode to add new keys to OmegaConf config
+            from omegaconf import OmegaConf
+            if hasattr(config, 'training'):
+                OmegaConf.set_struct(config, False)
+                config.training.use_adaptive_schedule = True
+                OmegaConf.set_struct(config, True)
+            # Create args object if needed
+            if not hasattr(args, '__dict__'):
+                # Convert dict to object with attributes
+                class Args:
+                    pass
+                args_obj = Args()
+                for k, v in args.items():
+                    setattr(args_obj, k, v)
+                args = args_obj
+            # Initialize model with config and args
+            model = cls(
+                config=config,
+                args=args,
+                pretrained_checkpoint=None,  # Don't reload pretrained, weights already in checkpoint
+                insertion_planner=getattr(args, 'insertion_planner', False)
+            )
+            # Extract policy_model weights from state_dict
+            policy_state = {}
+            for k, v in state_dict.items():
+                if k.startswith('policy_model.'):
+                    # Strip 'policy_model.' prefix
+                    new_key = k[len('policy_model.'):]
+                    policy_state[new_key] = v
+            # Load the finetuned weights
+            incompatible = model.load_state_dict(policy_state, strict=False)
+            if incompatible.missing_keys or incompatible.unexpected_keys:
+                print(f"Warning: Incompatible keys when loading finetuned weights:")
+                # Only the first five keys of each list are shown.
+                if incompatible.missing_keys:
+                    print(f"  Missing: {incompatible.missing_keys[:5]}...")
+                if incompatible.unexpected_keys:
+                    print(f"  Unexpected: {incompatible.unexpected_keys[:5]}...")
+            # Initialize or load EMA params
+            if model.use_ema:
+                if "ema_params" in checkpoint:
+                    # Load EMA params from checkpoint
+                    model.ema_params = checkpoint["ema_params"]
+                    print("Loaded EMA params from checkpoint")
+                else:
+                    # No stored EMA: seed it with detached copies of the current parameters
+                    model.ema_params = {
+                        name: param.clone().detach()
+                        for name, param in model.named_parameters()
+                    }
+                    print("Initialized EMA params from current model state")
+            else:
+                model.ema_params = {}
+            # Load planner state if it exists
+            if "planner_state" in checkpoint and hasattr(model, 'planner'):
+                model.planner.load_state_dict(checkpoint["planner_state"], strict=False)
+                print("Loaded planner state from checkpoint")
+            return model
+        else:
+            # Not a wrapped checkpoint, use default Lightning loading
+            # But we still need to provide required __init__ arguments
+            # NOTE(review): the message below mentions kwargs, but this method never reads **kwargs.
+            raise NotImplementedError(
+                "Direct finetuned checkpoints (not wrapped by PeptideFinetuner) are not yet supported. "
+                "Please provide config and args as kwargs."
+            )
+    def on_save_checkpoint(self, checkpoint):
+        """Save config and EMA params, including planner state."""
+        # Call parent to save config and base model EMA
+        super().on_save_checkpoint(checkpoint)
+        # Explicitly save planner state under its own "planner_state" entry
+        if hasattr(self, 'planner'):
+            checkpoint["planner_state"] = self.planner.state_dict()
+    def on_load_checkpoint(self, checkpoint):
+        """Load config and reinitialize interpolant, including planner."""
+        # For finetuned checkpoints loaded via custom load_from_checkpoint,
+        # config may not be in checkpoint (it's loaded from original checkpoint)
+        if "config" in checkpoint:
+            # Call parent to restore config and interpolant
+            super().on_load_checkpoint(checkpoint)
+        else:
+            # Config already set during __init__ via load_from_checkpoint
+            # Just restore EMA params if they exist
+            if self.use_ema and "ema_params" in checkpoint:
+                self.ema_params = checkpoint["ema_params"]
+        # Restore planner state if it exists in checkpoint
+        # (strict load here, unlike the strict=False load in load_from_checkpoint)
+        if hasattr(self, 'planner') and "planner_state" in checkpoint:
+            self.planner.load_state_dict(checkpoint["planner_state"])
+            print("Loaded planner from checkpoint")
+    def loss_wdce_flexible(self, log_rnd, x, num_replicates=16, weight_func=lambda l: 1/l, eps=1e-3, centering=False, centering_strength=1.0, softmax_temperature=1.0):
+        r"""
+        Weighted denoising cross entropy loss
+        X_T ~ P^u_T and weights \log\frac{dP^*}{dP^u}(X)
+        log_rnd: [B] — pre-computed importance weights (already softmax-normalized over the full buffer)
+        x: [B, L] (no mask)
+        num_replicates: R, number of replicates of each row in x
+        weight_func: w(lambda) for each sample, 1/lambda by default
+        centering_strength: float, controls how much of the mean is subtracted (DMPO-style)
+        softmax_temperature: float, temperature for softmax on log_rnd (>1 smooths weights)
+        """
+        batch = x.repeat_interleave(num_replicates, dim=0) # [B*R, L]
+        batch_weights = (log_rnd.detach() / softmax_temperature).softmax(dim=-1)  # [B]
+        if centering:
+            batch_weights = batch_weights - centering_strength * batch_weights.mean()
+        batch_weights = batch_weights.repeat_interleave(num_replicates, dim=0)
+        lamda = torch.rand(batch.shape[0], device=batch.device) # [B*R]
+        lamda_weights = weight_func(lamda).clamp(max=1e5) # [B*R]
+        t = lamda
+        # compute unmasking and insertion loss
+        interpolant_sample = self.interpolant.sample_interpolant(t, batch)
+        unmask_weight, insert_weight = self.interpolant.elbo_weight(t, batch)
+        prediction: ModelPrediction = self(interpolant_sample.xt, t)
+        scale_factor = self.config.interpolant.max_length
+        match self.unmask_loss_fn:
+            case "elbo":
+                mask_indices = interpolant_sample.mask_indices
+                unmask_loss_all = torch.zeros_like(unmask_weight)  # [B*R, L]
+                unmask_loss_all[mask_indices] = unmask_weight[mask_indices] * F.cross_entropy(
+                    prediction.token_logits[mask_indices],
+                    interpolant_sample.unmasked[mask_indices],
+                    reduction="none",
+                )
+                unmask_loss = unmask_loss_all.sum(dim=1) / scale_factor  # [B*R]
+            case _:
+                raise ValueError(f"Invalid unmask loss type: {self.unmask_loss_fn}")
+        match self.insert_loss_fn:
+            case "expectation":
+                gaps, gaps_mask = interpolant_sample.gaps_and_mask
+                insertion_loss_all = torch.zeros_like(insert_weight)  # [B*R, L+1]
+                insertion_loss_all[gaps_mask] = insert_weight[gaps_mask] * jump_kernel_elbo(
+                    gaps[gaps_mask], prediction.expected_gaps[gaps_mask]
+                )
+                insertion_loss = insertion_loss_all.sum(dim=1) / scale_factor  # [B*R]
+            case "distribution":
+                gaps, gaps_mask = interpolant_sample.gaps_and_mask
+                insertion_loss_all = torch.zeros_like(insert_weight)  # [B*R, L+1]
+                insertion_loss_all[gaps_mask] = insert_weight[gaps_mask] * F.cross_entropy(
+                    prediction.length_posterior[gaps_mask], gaps[gaps_mask]
+                )
+                insertion_loss = insertion_loss_all.sum(dim=1) / scale_factor  # [B*R]
+        total_loss = unmask_loss + insertion_loss  # [B*R]
+        # end compute unmasking and insertion loss
+        weighted_loss = total_loss * batch_weights  # [B*R]
+        return weighted_loss.mean()
+    def one_step_sampler(self, xt, t, pred_rate=None):
+        """
+        Sample one step of unmasking using model predictions.
+        Args:
+            xt: Current state [B, L]
+            t: Time [B]
+            pred_rate: Optional pre-computed ModelPrediction. If None, will compute from model.
+        Returns:
+            new_xt: Next state [B, L]
+            update_ids: Boolean mask of updated positions [B, L]
+        """
+        mask = self.interpolant.mask_token
+        pad = self.interpolant.pad_token
+        batch_size, L = xt.shape
+        device = xt.device
+        steps = self.args.total_num_steps
+        dt = 1.0 / steps
+        max_length = self.interpolant.max_length
+        # Use actual tensor dimension L instead of max_length to handle replicated batches
+        batch_idx_L = (
+            torch.arange(batch_size, device=device)
+            .view(batch_size, 1)
+            .expand(batch_size, L)
+        )
+        pos_idx_L = (
+            torch.arange(L, device=device)
+            .view(1, L)
+            .expand(batch_size, L)
+        )
+        # ——— predict and convert rates ———
+        if pred_rate is None:
+            pred_rate = self(xt, t)
+        pred_rate = self.interpolant.to_actual_rate(xt, pred_rate, t)
+        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
+        len_rate = pred_rate.length_rate  # (B, L+1)
+        # ——— unmask step (Euler) ———
+        mask_pos = (xt == self.interpolant.mask_token).nonzero(as_tuple=True)
+        unmask_rate[xt != mask] = 0
+        unmask_rate[mask_pos + (mask,)] = 0
+        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
+        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)
+        # add "stay" probability
+        _xt = xt.clone()
+        _xt[xt == pad] = mask
+        trans_prob.scatter_add_(
+            2,
+            _xt.unsqueeze(-1),
+            torch.ones_like(_xt.unsqueeze(-1), dtype=trans_prob.dtype),
+        )
+        trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step
+        # Renormalize probabilities to ensure they sum to 1
+        prob_sum = trans_prob[mask_pos].sum(dim=-1, keepdim=True)
+        # Avoid division by zero; if all probs are 0, use uniform distribution (excluding mask and pad)
+        mask_has_zero_prob = (prob_sum.squeeze(-1) == 0.0)
+        if mask_has_zero_prob.any():
+            # Create uniform distribution over valid tokens (excluding mask and pad)
+            num_zero_prob = mask_has_zero_prob.sum().item()
+            uniform_prob = torch.zeros((num_zero_prob, trans_prob.shape[-1]), device=device, dtype=trans_prob.dtype)
+            uniform_prob[:, :mask] = 1.0 / mask  # Uniform over tokens 0 to mask-1
+            trans_prob[mask_pos[0][mask_has_zero_prob], mask_pos[1][mask_has_zero_prob]] = uniform_prob
+        else:
+            # Normalize to sum to 1
+            trans_prob[mask_pos] = trans_prob[mask_pos] / prob_sum
+        new_xt = _sample_tokens(trans_prob)
+        new_xt[xt == pad] = pad
+        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)
+        # update indices--boolean tensor of shape (B, max_length)
+        # A position is updated if:
+        # 1. The token changed (xt != new_xt)
+        # 2. It's not a pad position
+        # 3. It WAS a mask token that got unmasked (so we check xt == mask, not xt != mask)
+        # Debug before fix
+        old_update_ids = (xt != new_xt) & (xt != pad) & (xt != mask)
+        # Correct logic: updated positions are where mask tokens were changed
+        update_ids = (xt != new_xt) & (xt != pad)
+        if self.insertion_planner is False:
+            return new_xt, update_ids
+        # ——— Poisson insertion (tau-leaping) — can insert multiple masks per gap ———
+        ext = torch.poisson(len_rate * dt).long()  # (B, L+1)
+        xt_len = xt.ne(pad).sum(dim=1)  # (B,)
+        # Use ext.shape[1] to get the actual max_length dimension from the data
+        actual_max_length = ext.shape[1] - 1  # ext is (B, L+1), so L = ext.shape[1] - 1
+        gaps = torch.arange(ext.shape[1], device=device).view(1, -1)
+        ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
+        total_ext = ext.sum(dim=1)
+        valid = xt_len + total_ext <= actual_max_length
+        ext = ext * valid.view(batch_size, 1).long()
+        ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
+        new_len = xt_len + total_ext  # (B,)
+        xt_tmp = torch.full_like(xt, pad)
+        # Create position indices that match xt_tmp's shape
+        pos_idx_for_fill = torch.arange(xt_tmp.shape[1], device=device).view(1, -1).expand(batch_size, -1)
+        mask_fill = pos_idx_for_fill < new_len.view(batch_size, 1)
+        xt_tmp[mask_fill] = mask
+        new_pos_orig = pos_idx_L + ext_ex[:, :actual_max_length]  # (B, L)
+        orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
+        flat_b = batch_idx_L[orig_mask]
+        flat_p = new_pos_orig[orig_mask]
+        xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
+        new_ins_xt = xt_tmp
+        # Newly inserted masks: positions that are mask now but weren't before.
+        newly_inserted_masks = (new_ins_xt == mask) & (xt != mask) & (xt != pad)
+        update_ins_ids = newly_inserted_masks
+        return new_xt, update_ids, new_ins_xt, update_ins_ids
+    def loss_planner_flexible(self, log_rnd, x, num_replicates=16, weight_func=lambda l: 1/l, eps=1e-3, centering=False, centering_strength=1.0, softmax_temperature=1.0):
+        r"""
+        Weighted denoising cross entropy loss
+        X_T ~ P^u_T and weights \log\frac{dP^*}{dP^u}(X)
+        log_rnd: [B] — pre-computed importance weights (already softmax-normalized over the full buffer)
+        x: [B, L] (no mask)
+        num_replicates: R, number of replicates of each row in x
+        weight_func: w(lambda) for each sample, 1/lambda by default
+        centering_strength: float, controls how much of the mean is subtracted (DMPO-style)
+        softmax_temperature: float, temperature for softmax on log_rnd (>1 smooths weights)
+        """
+        batch = x.repeat_interleave(num_replicates, dim=0) # [B*R, L]
+        batch_size = batch.shape[0]
+        batch_weights = (log_rnd.detach() / softmax_temperature).softmax(dim=-1)  # [B]
+        if centering:
+            batch_weights = batch_weights - centering_strength * batch_weights.mean()
+        batch_weights = batch_weights.repeat_interleave(num_replicates, dim=0)
+        lamda = torch.rand(batch.shape[0], device=batch.device) # [B*R]
+        lamda_weights = weight_func(lamda).clamp(max=1e5) # [B*R]
+        t = lamda
+        scale_factor = self.config.interpolant.max_length
+        # compute unmasking and insertion loss
+        interpolant_sample = self.interpolant.sample_interpolant(t, batch)
+        unmask_weight, insert_weight = self.interpolant.elbo_weight(t, batch)
+        prediction: ModelPrediction = self(interpolant_sample.xt, t)
+        with torch.no_grad(): # no need to compute gradient in this step
+            sampler_out = self.one_step_sampler(interpolant_sample.xt, t, prediction)
+            # one_step_sampler returns (xs, update_ids) or (xs, update_ids, new_ins_xt, update_ins_ids)
+            xs, update_ids = sampler_out[0], sampler_out[1]
+        # The remasking head scores the freshly-decoded tokens to decide which to
+        # remask, so it reads the POST-unmask state xs (matching inference, which
+        # calls the planner on the decoded new_xt).
+        planner = self.planner(xs, t)
+        remasking_conf = planner["remasking_conf"]  # [B*R, L, 1]
+        # Compute per-sample loss
+        # IMPORTANT: interpolant_sample.xt has been reordered via st permutation
+        # We need to map back to the original positions to compare with batch
+        st = interpolant_sample.st  # [B*R, L] permutation indices
+        batch_reordered = torch.gather(batch, 1, st)  # Apply same permutation to ground truth
+        binary_label = (xs == batch_reordered).float()
+        # Only compute loss on positions that were updated
+        per_token_loss = F.binary_cross_entropy_with_logits(
+            remasking_conf.squeeze(-1),  # [B*R, L]
+            binary_label,  # [B*R, L]
+            reduction="none"  # [B*R, L]
+        )
+        per_token_loss = per_token_loss * update_ids.float()  # [B*R, L]
+        # Mask out non-updated positions and average per sample
+        per_sample_loss = per_token_loss.sum(dim=1) / (update_ids.sum(dim=1).float() + 1e-8)  # [B*R]
+        # Weight by importance sampling weights
+        weighted_loss = per_sample_loss * batch_weights  # [B*R]
+        # ——— AUC / label-balance diagnostics (see loss_insert_planner_flexible) ———
+        with torch.no_grad():
+            metrics = {}
+            sel_u = update_ids.bool()
+            if sel_u.any():
+                u_scores = remasking_conf.squeeze(-1)[sel_u]
+                u_labels = binary_label[sel_u]
+                metrics["unmask_auc"] = _binary_auc(u_scores, u_labels)
+                metrics["unmask_label_mean"] = u_labels.mean().item()
+                metrics["unmask_conf_mean"] = torch.sigmoid(u_scores).mean().item()
+                metrics["unmask_n"] = float(sel_u.sum().item())
+            self._last_planner_metrics = metrics
+        return weighted_loss.mean()
+    def loss_insert_planner_flexible(self, log_rnd, x, num_replicates=16, weight_func=lambda l: 1/l, eps=1e-3, centering=False, centering_strength=1.0, softmax_temperature=1.0):
+        r"""
+        Importance-weighted planner loss with a remasking term and an insertion-quality term.
+        X_T ~ P^u_T and weights \log\frac{dP^*}{dP^u}(X)
+        log_rnd: [B] — log importance weights; softmax(log_rnd / softmax_temperature)
+            is applied here to obtain the sample weights
+        x: [B, L] (no mask)
+        num_replicates: R, number of replicates of each row in x
+        weight_func: w(lambda) for each sample, 1/lambda by default (its result is not used in the loss)
+        eps: not read anywhere in this method
+        centering: if True, subtract centering_strength * mean(weights) from the weights
+        centering_strength: float, controls how much of the mean is subtracted (DMPO-style)
+        softmax_temperature: float, temperature for softmax on log_rnd (>1 smooths weights)
+        Returns: (mean remasking loss, mean insertion loss, mean importance-weighted total),
+            each a scalar tensor. The first two are NOT importance-weighted.
+        Side effect: stores diagnostics in ``self._last_planner_metrics``.
+        """
+        batch = x.repeat_interleave(num_replicates, dim=0) # [B*R, L]
+        batch_size = batch.shape[0]  # this is B*R, not B
+        batch_weights = (log_rnd.detach() / softmax_temperature).softmax(dim=-1)  # [B]
+        if centering:
+            batch_weights = batch_weights - centering_strength * batch_weights.mean()
+        batch_weights = batch_weights.repeat_interleave(num_replicates, dim=0)
+        lamda = torch.rand(batch.shape[0], device=batch.device) # [B*R]
+        # NOTE(review): lamda_weights and scale_factor are computed but never read below.
+        lamda_weights = weight_func(lamda).clamp(max=1e5) # [B*R]
+        t = lamda
+        scale_factor = self.config.interpolant.max_length
+        # compute unmasking and insertion loss
+        # deleted mask: binary tensor [B*R, L] where true tokens in batch were deleted
+        # gap_assignment: [B*R, max_gaps, L] maps x1 positions to gap indices
+        interpolant_sample, deleted_mask, gap_assignment = self.interpolant.sample_interpolant_plan(t, batch)
+        # NOTE(review): unmask_weight / insert_weight are not used by this loss.
+        unmask_weight, insert_weight = self.interpolant.elbo_weight(t, batch)
+        prediction: ModelPrediction = self(interpolant_sample.xt, t)
+        with torch.no_grad(): # no need to compute gradient in this step
+            # NOTE(review): this 4-way unpack requires one_step_sampler to take its
+            # insertion branch (it returns only 2 values when self.insertion_planner is False).
+            xs_unmask, update_unmask_ids, xs_insert, update_ins_ids = self.one_step_sampler(interpolant_sample.xt, t, prediction)
+        # The remasking head scores the freshly-decoded tokens to decide which to
+        # remask, so it must see the POST-unmask state xs_unmask (matching
+        # inference in inference_quality.py, which calls the planner on the
+        # decoded new_xt). Grad stays on here since this head is what we train.
+        planner = self.planner(xs_unmask, t)
+        remasking_conf = planner["remasking_conf"]  # [B*R, L, 1]
+        # The insertion-quality head scores the freshly-inserted mask tokens, so
+        # it must see the POST-insertion state xs_insert (aligned with
+        # update_ins_ids / insertion_quality below, and matching inference in
+        # remasking_scheduleaware.apply_schedule_aware_insertion). Grad stays on
+        # here since this head is what we are training.
+        if self.planner.insertion_planner:
+            insertion_conf = self.planner(xs_insert, t)["insertion_conf"]  # [B*R, L, 1]
+        else:
+            insertion_conf = None
+        # Compute per-sample loss
+        # IMPORTANT: interpolant_sample.xt has been reordered via st permutation
+        # We need to map back to the original positions to compare with batch
+        # Use the st (permutation) to get the ground truth in the reordered space
+        st = interpolant_sample.st  # [B*R, L] permutation indices
+        batch_reordered = torch.gather(batch, 1, st)  # Apply same permutation to ground truth
+        # Now compare in the reordered space: label is 1 where the decoded token is correct
+        binary_label = (xs_unmask == batch_reordered).float()
+        # BCE between the remasking logits and the correctness labels, per token
+        per_token_loss = F.binary_cross_entropy_with_logits(
+            remasking_conf.squeeze(-1),  # [B*R, L]
+            binary_label,  # [B*R, L]
+            reduction="none"  # [B*R, L]
+        )
+        # Only positions that were updated by the unmask step contribute
+        per_token_loss = per_token_loss * update_unmask_ids.float()  # [B*R, L]
+        # Average over updated positions per sample (1e-8 guards samples with none)
+        unmask_per_sample_loss = per_token_loss.sum(dim=1) / (update_unmask_ids.sum(dim=1).float() + 1e-8)  # [B*R]
+        # compute insertion planner loss
+        # For positions where masks were inserted, we evaluate the quality of insertion
+        # by computing the probability that the ground truth token would be predicted at that position
+        # IMPORTANT: We need to recompute predictions using xs_insert since that's where the masks were inserted
+        # The original prediction was computed from xt (before insertion)
+        with torch.no_grad():
+            prediction_after_insert: ModelPrediction = self(xs_insert, t)
+        # Get the token prediction probabilities at inserted mask positions
+        # prediction_after_insert.token_logits: [B*R, L, V] - logits for all positions in xs_insert
+        token_probs = F.softmax(prediction_after_insert.token_logits, dim=-1)  # [B*R, L, V]
+        # For each gap where masks were inserted, compute the sum of probabilities
+        # of the ground truth tokens that were deleted in that specific gap
+        # gap_assignment: [B*R, max_gaps, L] - maps x1 positions to gap indices
+        # batch: [B*R, L] - ground truth tokens in original space (before permutation)
+        vocab_size = token_probs.shape[-1]
+        L = token_probs.shape[1]
+        max_gaps = gap_assignment.shape[1]
+        # For each gap, create a vocabulary mask of tokens that belong to that gap
+        # gap_vocab_mask[b, gap_idx, token_id] = 1 if token_id was deleted in gap gap_idx
+        gap_vocab_mask = torch.zeros(batch_size, max_gaps, vocab_size, device=batch.device, dtype=torch.float)
+        # Vectorized: gather tokens from batch for all gaps at once
+        # tokens_expanded[b, gap_idx, pos] = batch[b, pos] for all positions
+        tokens_expanded = batch.unsqueeze(1).expand(batch_size, max_gaps, L)  # [B*R, max_gaps, L]
+        # valid_mask[b, gap_idx, pos] = 1 if position pos belongs to gap gap_idx and is not pad
+        valid_mask = (gap_assignment > 0) & (tokens_expanded != self.interpolant.pad_token)  # [B*R, max_gaps, L]
+        # Scatter tokens into vocabulary dimension: mark which tokens appear in each gap
+        # (token ids are clamped into [0, vocab_size - 1] so the scatter index is always in range)
+        gap_vocab_mask.scatter_add_(
+            2,  # scatter along vocabulary dimension
+            tokens_expanded.clamp(0, vocab_size - 1),  # token indices [B*R, max_gaps, L]
+            valid_mask.float()  # values to add [B*R, max_gaps, L]
+        )
+        # Binarize: a token either appears in the gap or not
+        gap_vocab_mask = (gap_vocab_mask > 0).float()  # [B*R, max_gaps, V]
+        # For each insertion position in xs_insert, determine which gap it corresponds to
+        # Position p in xs_insert corresponds to gap p (insertions occur between existing tokens)
+        # Vectorized: compute for all positions at once
+        # token_probs: [B*R, L, V]
+        # gap_vocab_mask[:, :L, :]: [B*R, L, V] - vocab mask for gaps 0 to L-1
+        # NOTE(review): the elementwise product assumes max_gaps >= L — confirm against sample_interpolant_plan.
+        insertion_quality_full = (token_probs * gap_vocab_mask[:, :L, :]).sum(dim=-1)  # [B*R, L]
+        # Only consider quality at positions where masks were actually inserted
+        insertion_quality = insertion_quality_full * update_ins_ids.float()  # [B*R, L]
+        # Compute insertion planner loss only if insertion_planner is enabled
+        if insertion_conf is not None:
+            # The planner predicts insertion confidence with insertion_conf
+            # We want to train it to predict high confidence when insertion_quality is high
+            # Use Bernoulli cross-entropy: treat insertion_quality as the "success probability"
+            # Binary cross-entropy with insertion_quality as continuous labels in [0,1]
+            ins_per_token_loss = F.binary_cross_entropy_with_logits(
+                insertion_conf.squeeze(-1),  # [B*R, L] - planner's insertion confidence logits
+                insertion_quality,  # [B*R, L] - ground truth token probability as quality metric
+                reduction="none"
+            )
+            # Only compute loss where masks were actually inserted
+            ins_per_token_loss = ins_per_token_loss * update_ins_ids.float()
+            # Average per sample (1e-8 guards samples without insertions)
+            ins_per_sample_loss = ins_per_token_loss.sum(dim=1) / (update_ins_ids.sum(dim=1).float() + 1e-8)
+        else:
+            # No insertion planner - set loss to zero
+            ins_per_sample_loss = torch.zeros_like(unmask_per_sample_loss)
+        # Add to total loss
+        per_sample_loss = unmask_per_sample_loss + ins_per_sample_loss
+        # Weight by importance sampling weights
+        weighted_loss = per_sample_loss * batch_weights  # [B*R]
+        # ——— AUC / label-balance diagnostics (the loss alone hides degenerate
+        # targets; near-0 BCE can mean "all labels one class", not "learned") ———
+        with torch.no_grad():
+            metrics = {}
+            sel_u = update_unmask_ids.bool()
+            if sel_u.any():
+                u_scores = remasking_conf.squeeze(-1)[sel_u]
+                u_labels = binary_label[sel_u]
+                metrics["unmask_auc"] = _binary_auc(u_scores, u_labels)
+                metrics["unmask_label_mean"] = u_labels.mean().item()
+                metrics["unmask_conf_mean"] = torch.sigmoid(u_scores).mean().item()
+                metrics["unmask_n"] = float(sel_u.sum().item())
+            if insertion_conf is not None:
+                sel_i = update_ins_ids.bool()
+                if sel_i.any():
+                    i_scores = insertion_conf.squeeze(-1)[sel_i]
+                    i_targets = insertion_quality[sel_i]
+                    # Soft targets are thresholded at 0.5 only for the AUC diagnostic
+                    i_labels = (i_targets > 0.5).float()
+                    metrics["insert_auc"] = _binary_auc(i_scores, i_labels)
+                    metrics["insert_target_mean"] = i_targets.mean().item()
+                    metrics["insert_conf_mean"] = torch.sigmoid(i_scores).mean().item()
+                    metrics["insert_n"] = float(sel_i.sum().item())
+            self._last_planner_metrics = metrics
+        return unmask_per_sample_loss.mean(), ins_per_sample_loss.mean(), weighted_loss.mean()

lightning_modules/any_order.py ADDED Viewed

	@@ -0,0 +1,417 @@

+import torch
+import pytorch_lightning as pl
+from omegaconf import DictConfig
+import torch.nn.functional as F
+from model.transformer import AnyOrderMaskInsertionFlow
+from model.interpolant import AnyOrderMaskInsertionInterpolant, ModelPrediction
+from .bregman import jump_kernel_elbo, mse
+from .schedule import get_schedule_from_config
+import re
+from typing import Dict, Any
+def strip_orig_mod_keys(state_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Returns a new state_dict where any key containing '._orig_mod.' is replaced
+    by removing the '_orig_mod' segment, e.g.
+      'model._orig_mod.vocab_embed.embedding'
+    becomes
+      'model.vocab_embed.embedding'
+    """
+    new_state_dict: Dict[str, Any] = {}
+    for key, value in state_dict.items():
+        # remove all occurrences of '._orig_mod.'
+        clean_key = re.sub(r"\._orig_mod\.", ".", key)
+        new_state_dict[clean_key] = value
+    return new_state_dict
+class AnyOrderInsertionFlowModule(pl.LightningModule):
+    def __init__(self, config: DictConfig):
+        """
+        Build the flow model, its two schedules and the interpolant from ``config``.
+        Args:
+            config: configuration object. This constructor reads
+                ``config.interpolant`` (type, tokens, mask_token, pad_token,
+                max_length, insert_schedule, unmask_schedule) and
+                ``config.training`` (learning_rate, loss_fn.unmask,
+                loss_fn.insert, ema_decay).
+        """
+        super().__init__()
+        self.config = config
+        self.model_type = config.interpolant.type
+        self.learning_rate = config.training.learning_rate
+        # Names of the loss variants dispatched on in the loss methods
+        self.unmask_loss_fn = config.training.loss_fn.unmask
+        self.insert_loss_fn = config.training.loss_fn.insert
+        # Initialize model based on type
+        self.model = AnyOrderMaskInsertionFlow(config)
+        # self.model = torch.compile(self.model)  # Disabled: incompatible with flex_attention nested functions
+        insert_schedule = get_schedule_from_config(config.interpolant.insert_schedule)
+        unmask_schedule = get_schedule_from_config(config.interpolant.unmask_schedule)
+        # Initialize interpolant
+        self.interpolant = AnyOrderMaskInsertionInterpolant(
+            insertion_schedule=insert_schedule,
+            unmask_schedule=unmask_schedule,
+            vocab_size=config.interpolant.tokens,
+            mask_token=config.interpolant.mask_token,
+            pad_token=config.interpolant.pad_token,
+            max_length=config.interpolant.max_length,
+        )
+        # Save hyperparameters
+        self.save_hyperparameters()
+        # A missing / falsy ema_decay is treated as 0.0, which disables EMA
+        self.ema_decay = config.training.ema_decay or 0.0
+        self.use_ema = self.ema_decay > 0
+        # Starts empty; presumably filled elsewhere in the class when parameters are swapped — TODO confirm
+        self._orig_params = {}
+    def forward(self, x, t, return_features: bool = False):
+        if self.config.training.only_embed_insert:
+            result = self.model(x, self.interpolant.insertion_schedule.at(t), return_features=return_features)
+        else:
+            result = self.model(x, t, return_features=return_features)
+        return result
+    def get_hidden_states(self, indices: torch.Tensor, t: torch.Tensor):
+        """
+        Delegate to backbone transformer for RemaskingAnyOrder compatibility.
+        Args:
+            indices: tensor forwarded unchanged to ``self.model.get_hidden_states``
+                (presumably token ids — TODO confirm against the backbone).
+            t: time tensor forwarded unchanged.
+        Returns:
+            Whatever ``self.model.get_hidden_states(indices, t)`` returns.
+        """
+        return self.model.get_hidden_states(indices, t)
+    def training_loss(self, x1, t):
+        interpolant_sample = self.interpolant.sample_interpolant(t, x1)
+        unmask_weight, insert_weight = self.interpolant.elbo_weight(t, x1)
+        prediction: ModelPrediction = self(interpolant_sample.xt, t)
+        scale_factor = x1.shape[0] * self.config.interpolant.max_length
+        match self.unmask_loss_fn:
+            case "elbo":
+                mask_indices = interpolant_sample.mask_indices
+                unmask_loss = unmask_weight[mask_indices] * F.cross_entropy(
+                    prediction.token_logits[mask_indices],
+                    interpolant_sample.unmasked[mask_indices],
+                    reduction="none",
+                )
+                unmask_loss = unmask_loss.sum() / scale_factor
+            case _:
+                raise ValueError(f"Invalid unmask loss type: {self.unmask_loss_fn}")
+        match self.insert_loss_fn:
+            case "expectation":
+                gaps, gaps_mask = interpolant_sample.gaps_and_mask
+                insertion_loss = insert_weight[gaps_mask] * jump_kernel_elbo(
+                    gaps[gaps_mask], prediction.expected_gaps[gaps_mask]
+                )
+                insertion_loss = insertion_loss.sum() / scale_factor
+            case "distribution":
+                gaps, gaps_mask = interpolant_sample.gaps_and_mask
+                insertion_loss = insert_weight[gaps_mask] * F.cross_entropy(
+                    prediction.length_posterior[gaps_mask], gaps[gaps_mask]
+                )
+                insertion_loss = insertion_loss.sum() / scale_factor
+        total_loss = unmask_loss + insertion_loss
+        return unmask_loss, insertion_loss, total_loss
+    def prepare_noised_sample(self, x, num_samples=1, t=None):
+        """
+        Run the forward noising process on clean sequences x.
+        Replicates each sequence num_samples times with independent random times
+        so that both policy and pretrained can evaluate the same noised data.
+        Args:
+            x: [B, L] clean token sequences (no mask tokens)
+            num_samples: K, number of noisy time samples per sequence
+            t: [B*K] optional time values. If None, sampled uniformly.
+        Returns:
+            dict with all artifacts needed by compute_loss_from_noised.
+        """
+        B = x.shape[0]
+        x_rep = x.repeat_interleave(num_samples, dim=0)  # [B*K, L]
+        if t is None:
+            t = torch.rand(B * num_samples, device=x.device)
+        interpolant_sample = self.interpolant.sample_interpolant(t, x_rep)
+        unmask_weight, insert_weight = self.interpolant.elbo_weight(t, x_rep)
+        scale_factor = self.config.interpolant.max_length
+        return {
+            "interpolant_sample": interpolant_sample,
+            "unmask_weight": unmask_weight,
+            "insert_weight": insert_weight,
+            "t": t,
+            "scale_factor": scale_factor,
+            "num_samples": num_samples,
+            "batch_size": B,
+        }
+    def compute_loss_from_noised(self, noised):
+        """
+        Compute per-sample denoising loss given pre-noised data.
+        Each model runs its own forward pass on the shared noised xt.
+        Args:
+            noised: dict from prepare_noised_sample()
+        Returns:
+            total_loss: [B] per-sample loss averaged over K noisy samples
+        """
+        interpolant_sample = noised["interpolant_sample"]
+        unmask_weight = noised["unmask_weight"]
+        insert_weight = noised["insert_weight"]
+        t = noised["t"]
+        scale_factor = noised["scale_factor"]
+        num_samples = noised["num_samples"]
+        B = noised["batch_size"]
+        prediction: ModelPrediction = self(interpolant_sample.xt, t)
+        match self.unmask_loss_fn:
+            case "elbo":
+                mask_indices = interpolant_sample.mask_indices
+                unmask_loss_all = torch.zeros_like(unmask_weight)  # [B*K, L]
+                unmask_loss_all[mask_indices] = unmask_weight[mask_indices] * F.cross_entropy(
+                    prediction.token_logits[mask_indices],
+                    interpolant_sample.unmasked[mask_indices],
+                    reduction="none",
+                )
+                unmask_loss = unmask_loss_all.sum(dim=1) / scale_factor  # [B*K]
+            case _:
+                raise ValueError(f"Invalid unmask loss type: {self.unmask_loss_fn}")
+        match self.insert_loss_fn:
+            case "expectation":
+                gaps, gaps_mask = interpolant_sample.gaps_and_mask
+                insertion_loss_all = torch.zeros_like(insert_weight)  # [B*K, L+1]
+                insertion_loss_all[gaps_mask] = insert_weight[gaps_mask] * jump_kernel_elbo(
+                    gaps[gaps_mask], prediction.expected_gaps[gaps_mask]
+                )
+                insertion_loss = insertion_loss_all.sum(dim=1) / scale_factor  # [B*K]
+            case "distribution":
+                gaps, gaps_mask = interpolant_sample.gaps_and_mask
+                insertion_loss_all = torch.zeros_like(insert_weight)  # [B*K, L+1]
+                insertion_loss_all[gaps_mask] = insert_weight[gaps_mask] * F.cross_entropy(
+                    prediction.length_posterior[gaps_mask], gaps[gaps_mask]
+                )
+                insertion_loss = insertion_loss_all.sum(dim=1) / scale_factor  # [B*K]
+        per_replicate_loss = unmask_loss + insertion_loss  # [B*K]
+        per_sample_loss = per_replicate_loss.view(B, num_samples).mean(dim=1)  # [B]
+        return per_sample_loss
+    def loss_wdce_flexible(self, log_rnd, x, num_replicates=16, weight_func=lambda l: 1/l, eps=1e-3, centering=False):
+        r"""
+        Weighted denoising cross entropy loss
+        X_T ~ P^u_T and weights \log\frac{dP^*}{dP^u}(X)
+        log_rnd: [B]; x: [B, L] (no mask)
+        num_replicates: R, number of replicates of each row in x
+        weight_func: w(lambda) for each sample, 1/lambda by default
+        """
+        print("logrnd shape:", log_rnd.shape)
+        print("x shape:", x.shape)
+        batch = x.repeat_interleave(num_replicates, dim=0) # [B*R, L]
+        batch_weights = log_rnd.detach().softmax(dim=-1) # [B*R]
+        if centering:
+            batch_weights = batch_weights - batch_weights.mean(dim=-1, keepdim=True)
+        batch_weights = batch_weights.repeat_interleave(num_replicates, dim=0)
+        lamda = torch.rand(batch.shape[0], device=batch.device) # [B*R]
+        lamda_weights = weight_func(lamda).clamp(max=1e5) # [B*R]
+        t = lamda
+        # compute unmasking and insertion loss
+        interpolant_sample = self.interpolant.sample_interpolant(t, batch)
+        unmask_weight, insert_weight = self.interpolant.elbo_weight(t, batch)
+        prediction: ModelPrediction = self(interpolant_sample.xt, t)
+        scale_factor = self.config.interpolant.max_length
+        match self.unmask_loss_fn:
+            case "elbo":
+                mask_indices = interpolant_sample.mask_indices
+                unmask_loss_all = torch.zeros_like(unmask_weight)  # [B*R, L]
+                unmask_loss_all[mask_indices] = unmask_weight[mask_indices] * F.cross_entropy(
+                    prediction.token_logits[mask_indices],
+                    interpolant_sample.unmasked[mask_indices],
+                    reduction="none",
+                )
+                unmask_loss = unmask_loss_all.sum(dim=1) / scale_factor  # [B*R]
+            case _:
+                raise ValueError(f"Invalid unmask loss type: {self.unmask_loss_fn}")
+        match self.insert_loss_fn:
+            case "expectation":
+                gaps, gaps_mask = interpolant_sample.gaps_and_mask
+                insertion_loss_all = torch.zeros_like(insert_weight)  # [B*R, L+1]
+                insertion_loss_all[gaps_mask] = insert_weight[gaps_mask] * jump_kernel_elbo(
+                    gaps[gaps_mask], prediction.expected_gaps[gaps_mask]
+                )
+                insertion_loss = insertion_loss_all.sum(dim=1) / scale_factor  # [B*R]
+            case "distribution":
+                gaps, gaps_mask = interpolant_sample.gaps_and_mask
+                insertion_loss_all = torch.zeros_like(insert_weight)  # [B*R, L+1]
+                insertion_loss_all[gaps_mask] = insert_weight[gaps_mask] * F.cross_entropy(
+                    prediction.length_posterior[gaps_mask], gaps[gaps_mask]
+                )
+                insertion_loss = insertion_loss_all.sum(dim=1) / scale_factor  # [B*R]
+        total_loss = unmask_loss + insertion_loss  # [B*R]
+        # end compute unmasking and insertion loss
+        weighted_loss = total_loss * batch_weights  # [B*R]
+        return weighted_loss.mean()
+    def sample_time(self, batch_size: int, device: torch.device) -> torch.Tensor:
+        eps = 1e-6
+        interval = 1.0 - eps
+        interval_size = interval / batch_size
+        u = torch.rand(batch_size, device=device)
+        return (torch.arange(batch_size, device=device, dtype=u.dtype) + u) * interval_size
+    def training_step(self, batch, batch_idx):
+        # Extract input data
+        if isinstance(batch, dict):
+            batch = batch["input_ids"]
+        x1 = batch
+        t = self.sample_time(x1.shape[0], x1.device)
+        # Calculate the combined loss normally
+        unmask_loss, len_loss, loss = self.training_loss(x1, t)
+        # Log component losses
+        self.log("train/unmask_loss", unmask_loss, prog_bar=True)
+        self.log("train/len_loss", len_loss, prog_bar=True)
+        self.log("train/total_loss", loss, prog_bar=True)
+        return loss
+    def validation_step(self, batch, batch_idx):
+        if isinstance(batch, dict):
+            batch = batch["input_ids"]
+        x1 = batch
+        t = self.sample_time(x1.shape[0], x1.device)
+        unmask_loss, len_loss, loss = self.training_loss(x1, t)
+        self.log("val/unmask_loss", unmask_loss, prog_bar=True, sync_dist=True)
+        self.log("val/len_loss", len_loss, prog_bar=True, sync_dist=True)
+        self.log("val_loss", loss, prog_bar=True, sync_dist=True)
+        return loss
+    def configure_optimizers(self):
+        optimizer = torch.optim.AdamW(
+            self.parameters(),
+            lr=self.learning_rate,
+            weight_decay=self.config.training.weight_decay,
+        )
+        warmup_steps = self.config.training.warmup_steps
+        max_steps = self.config.training.max_steps
+        # Always create a fresh schedule starting from step 0
+        # This allows extending training beyond original max_steps
+        linear_scheduler = torch.optim.lr_scheduler.LinearLR(
+            optimizer,
+            start_factor=1e-6,
+            end_factor=1.0,
+            total_iters=warmup_steps,
+            last_epoch=-1,
+        )
+        post_warmup = max_steps - warmup_steps
+        cosine_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
+            optimizer,
+            T_max=post_warmup,
+            eta_min=0.0,
+            last_epoch=-1,
+        )
+        scheduler = torch.optim.lr_scheduler.SequentialLR(
+            optimizer,
+            schedulers=[linear_scheduler, cosine_scheduler],
+            milestones=[warmup_steps],
+            last_epoch=-1,
+        )
+        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
+    def optimizer_step(
+        self,
+        epoch: int,
+        batch_idx: int,
+        optimizer,
+        optimizer_closure=None,
+    ):
+        super().optimizer_step(
+            epoch, batch_idx, optimizer, optimizer_closure=optimizer_closure
+        )
+        # log learning rate and gradient norm
+        lr = optimizer.param_groups[0]["lr"]
+        self.log("train/lr", lr, on_step=True, prog_bar=True)
+        grad_norm = torch.sqrt(
+            sum(p.grad.norm(2) ** 2 for p in self.parameters() if p.grad is not None)
+        )
+        self.log("train/grad_norm", grad_norm, on_step=True, prog_bar=True)
+        # update EMA
+        if self.use_ema:
+            for n, p in self.named_parameters():
+                self.ema_params[n].mul_(self.ema_decay).add_(
+                    p.data.clone().detach(), alpha=1 - self.ema_decay
+                )
+    def on_save_checkpoint(self, checkpoint):
+        checkpoint["config"] = self.config
+        # save EMA state
+        if self.use_ema:
+            checkpoint["ema_params"] = {
+                n: v.clone() for n, v in self.ema_params.items()
+            }
+    def on_load_checkpoint(self, checkpoint):
+        self.config = checkpoint["config"]
+        insert_schedule = get_schedule_from_config(
+            self.config.interpolant.insert_schedule
+        )
+        unmask_schedule = get_schedule_from_config(
+            self.config.interpolant.unmask_schedule
+        )
+        self.interpolant = AnyOrderMaskInsertionInterpolant(
+            insertion_schedule=insert_schedule,
+            unmask_schedule=unmask_schedule,
+            vocab_size=self.config.interpolant.tokens,
+            mask_token=self.config.interpolant.mask_token,
+            pad_token=self.config.interpolant.pad_token,
+            max_length=self.config.interpolant.max_length,
+        )
+        self.ema_params = checkpoint["ema_params"] if self.use_ema else {}
+    def swap_to_ema(self):
+        for name, p in self.named_parameters():
+            self._orig_params[name] = p.data.clone()
+            p.data.copy_(self.ema_params[name].to(p.device))
+    def restore_original(self):
+        for name, p in self.named_parameters():
+            p.data.copy_(self._orig_params[name])
+        self._orig_params.clear()
+    def on_train_start(self):
+        # initialize and move EMA buffers once model is on correct device
+        if self.use_ema:
+            self.ema_params = {
+                name: param.clone().detach().to(self.device)
+                for name, param in self.named_parameters()
+            }
+            for buf in self.ema_params.values():
+                buf.requires_grad = False