Sophia

initial commit

8019be0 2 days ago

55.6 kB

	import os
	import sys
	sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # add repo root to path

	import torch
	from dataclasses import dataclass
	from typing import Any, Literal, Optional
	import numpy as np
	import pandas as pd

	from lightning_modules.mdm import MaskedDiffusionModule


	@dataclass
	class SamplingTraceDatapoint:
	    """One event recorded while a sampler runs.

	    The samplers in this file record a ``"change"`` event when a non-pad
	    token is replaced and an ``"insertion"`` event when a mask token is
	    inserted into a gap.
	    """

	    # Value of the sampler's time variable when the event was recorded.
	    t: float
	    event_type: Literal["insertion", "change"]
	    # Token index for "change" events, gap index for "insertion" events.
	    position: int
	    # Token id written by the event.
	    token: Any


	@dataclass
	class SamplingResult:
	    """Samples plus an optional event trace.

	    Iterating yields ``samples`` then ``trace``, so an instance can be
	    unpacked like a 2-tuple: ``samples, trace = result``.
	    """

	    samples: torch.Tensor
	    # Trace is supposed to be processed sequentially as updates are not commutative
	    trace: Optional[list[SamplingTraceDatapoint]]

	    def __iter__(self):
	        """Yield ``samples`` and then ``trace``."""
	        yield self.samples
	        yield self.trace


	# Sample from categorical distribution for each position using the transition probabilities
	def _sample_tokens(probs: torch.Tensor) -> torch.Tensor:
	"""Sample one token per position from probability distribution.
	Args:
	probs: [batch_size, seq_len, vocab_size] transition probabilities
	Returns:
	[batch_size, seq_len] sampled token indices
	"""
	batch_size, seq_len, vocab_size = probs.shape
	flat_probs = probs.view(-1, vocab_size)
	samples = torch.multinomial(flat_probs, num_samples=1)
	return samples.view(batch_size, seq_len)


	def _sample_batched_tokens(probs: torch.Tensor) -> torch.Tensor:

	batch_size, seq_len, vocab_size = probs.shape

	gumbel_noise = (-torch.log(-torch.log(torch.rand(batch_size, seq_len, vocab_size) + 1e-10) + 1e-10)).to(probs.device)
	noisy_logits = torch.log(probs + 1e-10) + gumbel_noise # add Gumbel noise to log probabilities

	# select the highest score (most likely category after Gumbel noise)
	samples = noisy_logits.argmax(dim=-1).to(dtype=torch.long)

	return samples.view(batch_size, seq_len)

	@torch.no_grad()
	def mdm_euler_sampling(
	    model: MaskedDiffusionModule,
	    steps: int,
	    mask: int,
	    pad: int,
	    batch_size: int,
	    max_length: int,
	    return_trace: bool = False,
	    temperature: float = 1.0,
	):
	    """Sample from a masked diffusion model with fixed-size Euler steps.

	    Starts from an all-mask batch at t = 0 and advances t by ``1 / steps``
	    per iteration. Only masked positions can change; on the final step the
	    mask token is removed from the candidates.

	    Args:
	        model: called as ``model(xt, t)``; must provide ``parameters()`` and
	            ``interpolant.to_actual_rate(xt, pred, t)`` whose result has an
	            ``unmask_rate`` attribute indexed as [batch, position, token].
	        steps: number of Euler steps.
	        mask: id of the mask token.
	        pad: not used by this sampler.
	        batch_size: number of sequences to sample.
	        max_length: length of every sequence.
	        return_trace: must be False; tracing is not implemented here.
	        temperature: when not 1.0, the transition weights are re-scaled as
	            ``softmax(log(p + 1e-10) / temperature)``.

	    Returns:
	        ``(xt, [])`` where ``xt`` is the [batch_size, max_length] int64 sample
	        tensor and the empty list stands in for the trace.
	    """
	    assert not return_trace, "Trace is not yet implemented in MDM Euler sampling"
	    device = next(model.parameters()).device
	    xt = torch.full((batch_size, max_length), mask, dtype=torch.int64, device=device)

	    dt = 1.0 / steps
	    t = torch.zeros(batch_size, device=device)

	    for i in range(steps):
	        print(f"{i}-th sampling step")
	        # ——— predict and convert rates ———
	        pred_rate = model(xt, t)
	        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
	        unmask_rate = pred_rate.unmask_rate

	        # ——— unmask step (Euler) ———
	        # Only masked positions may change: zero every other row, then put
	        # minus the total outgoing rate on the mask entry.
	        mask_pos = (xt == mask).nonzero(as_tuple=True)
	        unmask_rate[xt != mask] = 0
	        unmask_rate[mask_pos + (mask,)] = 0
	        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
	        # The negative mask entry is clamped to 0 here.
	        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)

	        # Add weight 1 on the current token ("stay"). Rows are weights, not
	        # normalized probabilities; torch.multinomial accepts that.
	        stay_idx = xt.unsqueeze(-1)
	        trans_prob.scatter_add_(
	            2,
	            stay_idx,
	            torch.ones_like(stay_idx, dtype=trans_prob.dtype),
	        )

	        # Apply temperature scaling
	        if temperature != 1.0:
	            logits = torch.log(trans_prob + 1e-10) / temperature
	            trans_prob = torch.softmax(logits, dim=-1)

	        if i == steps - 1:
	            print("Final step, removing mask token from sampling")
	            trans_prob[mask_pos + (mask,)] = 0.0

	        new_xt = _sample_tokens(trans_prob)
	        # Already unmasked positions always keep their token.
	        xt = torch.where(xt != mask, xt, new_xt)
	        t = t + dt

	    return xt, []


	@torch.no_grad()
	def any_order_mask_insertion_euler_sampling(
	    model: torch.nn.Module,
	    steps: int,
	    mask: int,
	    pad: int,
	    batch_size: int,
	    max_length: int,
	    return_trace: bool = False,
	    temperature: float = 1.0,
	) -> tuple[torch.Tensor, Optional[list[list[SamplingTraceDatapoint]]]]:
	    """Sample variable-length sequences by unmasking tokens and inserting masks.

	    Starts from an all-pad batch at t = 0 and runs ``steps`` Euler steps of
	    size ``1 / steps``. Each step (1) samples a token for every masked
	    position and (2), except on the last step, inserts new mask tokens into
	    the gaps between existing tokens. On the last step the mask token is
	    removed from the candidates so that no mask survives.

	    Args:
	        model: called as ``model(xt, t)``; must provide ``parameters()`` and
	            ``interpolant.to_actual_rate(xt, pred, t)`` whose result has
	            ``unmask_rate`` (B, L, V) and ``length_rate`` (B, L+1).
	        steps: number of Euler steps.
	        mask: id of the mask token; the zero-weight fallback assumes that
	            the sampleable tokens are ids ``0 .. mask - 1``.
	        pad: id of the padding token.
	        batch_size: number of sequences (B).
	        max_length: maximum sequence length (L).
	        return_trace: when True, collect a per-sequence event list.
	        temperature: accepted for signature parity; not applied here.

	    Returns:
	        ``(xt, sampling_trace)``: the (B, L) int64 samples and, when
	        ``return_trace`` is True, one list of ``SamplingTraceDatapoint`` per
	        sequence (otherwise ``None``).
	    """
	    device = next(model.parameters()).device

	    # 1) Initialize all-pad sequence and trace
	    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)

	    dt = 1.0 / steps
	    t = torch.zeros(batch_size, device=device)

	    # Precompute row indices for scatter
	    batch_idx_L = (
	        torch.arange(batch_size, device=device)
	        .view(batch_size, 1)
	        .expand(batch_size, max_length)
	    )
	    pos_idx_L = (
	        torch.arange(max_length, device=device)
	        .view(1, max_length)
	        .expand(batch_size, max_length)
	    )
	    sampling_trace = [[] for _ in range(batch_size)] if return_trace else None

	    for i in range(steps):
	        is_last = i == steps - 1

	        # ——— predict and convert rates ———
	        pred_rate = model(xt, t)
	        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
	        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
	        len_rate = pred_rate.length_rate  # (B, L+1)

	        # ——— unmask step (Euler) ———
	        # Only masked positions may change: zero every other row, then put
	        # minus the total outgoing rate on the mask entry.
	        mask_pos = (xt == mask).nonzero(as_tuple=True)
	        unmask_rate[xt != mask] = 0
	        unmask_rate[mask_pos + (mask,)] = 0
	        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
	        # The negative mask entry is clamped to 0 here.
	        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)

	        # Add "stay" weight 1 on the current token. Pad rows use the mask
	        # slot; they are reset to pad right after sampling.
	        stay_idx = xt.clone()
	        stay_idx[xt == pad] = mask
	        stay_idx = stay_idx.unsqueeze(-1)
	        trans_prob.scatter_add_(
	            2,
	            stay_idx,
	            torch.ones_like(stay_idx, dtype=trans_prob.dtype),
	        )

	        if is_last:
	            print("Final step, removing mask token from sampling")
	            trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step

	            # Renormalize every masked row. Rows left with zero total weight
	            # get a uniform distribution over token ids 0 .. mask - 1.
	            masked_probs = trans_prob[mask_pos]  # (K, V)
	            prob_sum = masked_probs.sum(dim=-1, keepdim=True)  # (K, 1)
	            dead = prob_sum == 0.0
	            uniform_prob = trans_prob.new_zeros(trans_prob.size(-1))  # (V,)
	            uniform_prob[:mask] = 1.0 / mask
	            trans_prob[mask_pos] = torch.where(
	                dead,
	                uniform_prob,
	                masked_probs / prob_sum.masked_fill(dead, 1.0),
	            )

	        new_xt = _sample_tokens(trans_prob)
	        new_xt[xt == pad] = pad
	        # Already unmasked tokens are never resampled.
	        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)

	        if not is_last:
	            # ——— gap-wise insertion — compute new length, fill masks, scatter tokens ———
	            ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)
	            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
	            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
	            # Only gaps 0 .. xt_len exist in a sequence of length xt_len.
	            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
	            # Reject all insertions of a row that would exceed max_length.
	            valid = xt_len + ext.sum(dim=1) <= max_length
	            ext = ext * valid.view(batch_size, 1).long()
	            # Count after rejection so rejected rows keep their length.
	            total_ext = ext.sum(dim=1)

	            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
	            new_len = xt_len + total_ext  # (B,)

	            # Fill the new length with masks, then drop the existing tokens
	            # at their shifted positions.
	            xt_tmp = torch.full_like(xt, pad)
	            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
	            xt_tmp[mask_fill] = mask

	            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
	            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
	            flat_b = batch_idx_L[orig_mask]
	            flat_p = new_pos_orig[orig_mask]
	            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
	        else:
	            ext = None  # no insertion on the last step
	            xt_tmp = new_xt

	        if return_trace:
	            changed = (xt != pad) & (xt != new_xt)
	            for batch_idx in range(batch_size):
	                step_time = t[batch_idx].item()
	                new_tokens = new_xt[batch_idx].tolist()

	                # Token changes, in increasing position order.
	                for position in changed[batch_idx].nonzero().flatten().tolist():
	                    sampling_trace[batch_idx].append(
	                        SamplingTraceDatapoint(
	                            t=step_time,
	                            event_type="change",
	                            position=position,
	                            token=new_tokens[position],
	                        )
	                    )

	                # Insertions, in decreasing gap order.
	                if ext is not None:
	                    inserted = ext[batch_idx, :max_length].nonzero().flatten().flip(0)
	                    for gap in inserted.tolist():
	                        sampling_trace[batch_idx].append(
	                            SamplingTraceDatapoint(
	                                t=step_time,
	                                event_type="insertion",
	                                position=gap,
	                                token=mask,
	                            )
	                        )

	        xt = xt_tmp
	        t = t + dt

	    return xt, sampling_trace

	@torch.no_grad()
	def batch_mcts_reverse_step(
	    xt: torch.Tensor,
	    t: torch.Tensor,
	    dt: float,
	    model: torch.nn.Module,
	    pretrained: torch.nn.Module,
	    mask: int,
	    pad: int,
	    batch_size: int,
	    max_length: int,
	    last_step: bool = False,
	    temperature: float = 1.0,
	) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
	    """Run one sampling step on ``batch_size`` copies of ``xt``.

	    ``xt`` is tiled ``batch_size`` times along dim 0 and ``t`` is expanded to
	    ``batch_size`` entries; the rest of the function assumes the tiled batch
	    has exactly ``batch_size`` rows. Each copy takes one unmask step and,
	    unless ``last_step`` is set, one mask-insertion step under ``model``,
	    while the same moves are scored under ``pretrained``.

	    Args:
	        xt: current token ids; tiled with ``xt.repeat(batch_size, 1)``.
	        t: current time; squeezed and expanded to ``batch_size`` entries.
	        dt: step size.
	        model: policy; called as ``model(xt, t)`` and must provide
	            ``parameters()`` and ``interpolant.to_actual_rate``, whose result
	            has ``unmask_rate`` (B, L, V) and ``length_rate`` (B, L+1).
	        pretrained: reference model with the same interface.
	        mask: id of the mask token; the zero-weight fallback assumes that
	            the sampleable tokens are ids ``0 .. mask - 1``.
	        pad: id of the padding token.
	        batch_size: number of copies (B).
	        max_length: sequence length (L).
	        last_step: when True, the mask token cannot be sampled and no
	            insertion is performed.
	        temperature: accepted for signature parity; not applied here.

	    Returns:
	        ``(xt_next, log_rnd, log_policy_step, log_pretrained_step)`` where the
	        last three are (B,) tensors and
	        ``log_rnd = log_pretrained_step - log_policy_step``.
	    """
	    device = next(model.parameters()).device

	    xt = xt.repeat(batch_size, 1)

	    # squeeze to remove extra dimensions, then expand to batch_size
	    t = t.squeeze().expand(batch_size)

	    # precompute row indices for scatter
	    batch_idx_L = (
	        torch.arange(batch_size, device=device)
	        .view(batch_size, 1)
	        .expand(batch_size, max_length)
	    )
	    pos_idx_L = (
	        torch.arange(max_length, device=device)
	        .view(1, max_length)
	        .expand(batch_size, max_length)
	    )

	    # ——— predict and convert rates ———
	    pred_rate = model(xt, t)
	    pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
	    unmask_rate = pred_rate.unmask_rate  # (B, L, V)
	    len_rate = pred_rate.length_rate  # (B, L+1)

	    # ——— get pretrained model rates for log_rnd computation ———
	    pretrained_pred = pretrained(xt, t)
	    pretrained_rate = pretrained.interpolant.to_actual_rate(xt, pretrained_pred, t)
	    pretrained_unmask_rate = pretrained_rate.unmask_rate.clone()  # (B, L, V)
	    pretrained_len_rate = pretrained_rate.length_rate  # (B, L+1)

	    # ——— unmask step (Euler) ———
	    # Only masked positions may change: zero every other row, then put minus
	    # the total outgoing rate on the mask entry (clamped to 0 below).
	    mask_pos = (xt == mask).nonzero(as_tuple=True)
	    unmask_rate[xt != mask] = 0
	    unmask_rate[mask_pos + (mask,)] = 0
	    unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
	    trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)

	    # Same for pretrained
	    pretrained_unmask_rate[xt != mask] = 0
	    pretrained_unmask_rate[mask_pos + (mask,)] = 0
	    pretrained_unmask_rate[mask_pos + (mask,)] = -pretrained_unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
	    pretrained_trans_prob = (pretrained_unmask_rate * dt).clamp(0.0, 1.0)

	    # Add "stay" weight 1 on the current token. Pad rows use the mask slot;
	    # they are reset to pad right after sampling.
	    stay_idx = xt.clone()
	    stay_idx[xt == pad] = mask
	    stay_idx = stay_idx.unsqueeze(-1)
	    trans_prob.scatter_add_(
	        2,
	        stay_idx,
	        torch.ones_like(stay_idx, dtype=trans_prob.dtype),
	    )
	    pretrained_trans_prob.scatter_add_(
	        2,
	        stay_idx,
	        torch.ones_like(stay_idx, dtype=pretrained_trans_prob.dtype),
	    )

	    if last_step:
	        print("Final step, removing mask token from sampling")
	        trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step

	        # Renormalize every masked row. Rows left with zero total weight get
	        # a uniform distribution over token ids 0 .. mask - 1.
	        # NOTE(review): pretrained_trans_prob is not adjusted here — confirm
	        # that this asymmetry is intended for the log-ratio.
	        masked_probs = trans_prob[mask_pos]  # (K, V)
	        prob_sum = masked_probs.sum(dim=-1, keepdim=True)  # (K, 1)
	        dead = prob_sum == 0.0
	        uniform_prob = trans_prob.new_zeros(trans_prob.size(-1))  # (V,)
	        uniform_prob[:mask] = 1.0 / mask
	        trans_prob[mask_pos] = torch.where(
	            dead,
	            uniform_prob,
	            masked_probs / prob_sum.masked_fill(dead, 1.0),
	        )

	    new_xt = _sample_tokens(trans_prob)
	    new_xt[xt == pad] = pad
	    # Already unmasked tokens are never resampled.
	    new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)

	    # ——— compute log probabilities for RND ———
	    lp = torch.gather(torch.log(trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
	    lp_pre = torch.gather(torch.log(pretrained_trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)

	    # Only positions that were unmasked in this step contribute.
	    changed_mask = (xt == mask) & (new_xt != mask) & (new_xt != pad)

	    log_policy_step = (lp * changed_mask).sum(dim=1)
	    log_pretrained_step = (lp_pre * changed_mask).sum(dim=1)

	    log_rnd = log_pretrained_step - log_policy_step  # (B,)

	    if not last_step:
	        # ——— gap-wise insertion — compute new length, fill masks, scatter tokens ———
	        ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)

	        insertion_rate = (len_rate * dt).clamp(min=1e-10)  # (B, L+1)
	        pretrained_insertion_rate = (pretrained_len_rate * dt).clamp(min=1e-10)  # (B, L+1)

	        # log P(ext; λ) = ext*log(λ) - λ, evaluated on the raw draw (before
	        # gap masking and overflow rejection below).
	        log_policy_insert = (ext * torch.log(insertion_rate) - insertion_rate).sum(dim=1)  # (B,)
	        log_pretrained_insert = (ext * torch.log(pretrained_insertion_rate) - pretrained_insertion_rate).sum(dim=1)  # (B,)

	        log_insert_diff = log_pretrained_insert - log_policy_insert  # (B,)
	        log_rnd += log_insert_diff
	        log_pretrained_step += log_pretrained_insert
	        log_policy_step += log_policy_insert

	        xt_len = xt.ne(pad).sum(dim=1)  # (B,)
	        seq_dim = ext.size(1)  # Use actual ext dimension to avoid mismatch
	        gaps = torch.arange(seq_dim, device=device).view(1, -1)
	        # Only gaps 0 .. xt_len exist in a sequence of length xt_len.
	        ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
	        # Reject all insertions of a row that would exceed max_length.
	        valid = xt_len + ext.sum(dim=1) <= max_length
	        ext = ext * valid.view(batch_size, 1).long()
	        # Count after rejection so rejected rows keep their length.
	        total_ext = ext.sum(dim=1)

	        ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
	        new_len = xt_len + total_ext  # (B,)

	        # Fill the new length with masks, then drop the existing tokens at
	        # their shifted positions.
	        xt_tmp = torch.full_like(xt, pad)
	        mask_fill = pos_idx_L < new_len.view(batch_size, 1)
	        xt_tmp[mask_fill] = mask

	        new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
	        orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
	        flat_b = batch_idx_L[orig_mask]
	        flat_p = new_pos_orig[orig_mask]
	        xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
	    else:
	        xt_tmp = new_xt

	    return xt_tmp, log_rnd, log_policy_step, log_pretrained_step


	@torch.no_grad()
	def mcts_reverse_step(
	    xt: torch.Tensor,
	    t: torch.Tensor,
	    dt: float,
	    model: torch.nn.Module,
	    pretrained: torch.nn.Module,
	    mask: int,
	    pad: int,
	    max_length: int,
	    last_step: bool = False,
	    temperature: float = 1.0,
	) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
	    """Run one sampling step on an existing batch and score it under both models.

	    Every row of ``xt`` takes one unmask step and, unless ``last_step`` is
	    set, one mask-insertion step under ``model``; the same moves are scored
	    under ``pretrained``. The batch size is taken from ``xt.size(0)``.

	    Args:
	        xt: (B, L) current token ids.
	        t: current time, passed to both models unchanged.
	        dt: step size.
	        model: policy; called as ``model(xt, t)`` and must provide
	            ``parameters()`` and ``interpolant.to_actual_rate``, whose result
	            has ``unmask_rate`` (B, L, V) and ``length_rate`` (B, L+1).
	        pretrained: reference model with the same interface.
	        mask: id of the mask token; the zero-weight fallback assumes that
	            the sampleable tokens are ids ``0 .. mask - 1``.
	        pad: id of the padding token.
	        max_length: sequence length (L).
	        last_step: when True, the mask token cannot be sampled and no
	            insertion is performed.
	        temperature: accepted for signature parity; not applied here.

	    Returns:
	        ``(xt_next, log_rnd, log_policy_step, log_pretrained_step)`` where the
	        last three are (B,) tensors and
	        ``log_rnd = log_pretrained_step - log_policy_step``.
	    """
	    device = next(model.parameters()).device

	    batch_size = xt.size(0)

	    # precompute row indices for scatter
	    batch_idx_L = (
	        torch.arange(batch_size, device=device)
	        .view(batch_size, 1)
	        .expand(batch_size, max_length)
	    )
	    pos_idx_L = (
	        torch.arange(max_length, device=device)
	        .view(1, max_length)
	        .expand(batch_size, max_length)
	    )

	    # ——— predict and convert rates ———
	    pred_rate = model(xt, t)
	    pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
	    unmask_rate = pred_rate.unmask_rate  # (B, L, V)
	    len_rate = pred_rate.length_rate  # (B, L+1)

	    # ——— get pretrained model rates for log_rnd computation ———
	    pretrained_pred = pretrained(xt, t)
	    pretrained_rate = pretrained.interpolant.to_actual_rate(xt, pretrained_pred, t)
	    pretrained_unmask_rate = pretrained_rate.unmask_rate.clone()  # (B, L, V)
	    pretrained_len_rate = pretrained_rate.length_rate  # (B, L+1)

	    # ——— unmask step (Euler) ———
	    # Only masked positions may change: zero every other row, then put minus
	    # the total outgoing rate on the mask entry (clamped to 0 below).
	    mask_pos = (xt == mask).nonzero(as_tuple=True)
	    unmask_rate[xt != mask] = 0
	    unmask_rate[mask_pos + (mask,)] = 0
	    unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
	    trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)

	    # same for pretrained
	    pretrained_unmask_rate[xt != mask] = 0
	    pretrained_unmask_rate[mask_pos + (mask,)] = 0
	    pretrained_unmask_rate[mask_pos + (mask,)] = -pretrained_unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
	    pretrained_trans_prob = (pretrained_unmask_rate * dt).clamp(0.0, 1.0)

	    # Add "stay" weight 1 on the current token. Pad rows use the mask slot;
	    # they are reset to pad right after sampling.
	    stay_idx = xt.clone()
	    stay_idx[xt == pad] = mask
	    stay_idx = stay_idx.unsqueeze(-1)
	    trans_prob.scatter_add_(
	        2,
	        stay_idx,
	        torch.ones_like(stay_idx, dtype=trans_prob.dtype),
	    )
	    pretrained_trans_prob.scatter_add_(
	        2,
	        stay_idx,
	        torch.ones_like(stay_idx, dtype=pretrained_trans_prob.dtype),
	    )

	    if last_step:
	        print("Final step, removing mask token from sampling")
	        trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step

	        # Renormalize every masked row. Rows left with zero total weight get
	        # a uniform distribution over token ids 0 .. mask - 1.
	        # NOTE(review): pretrained_trans_prob is not adjusted here — confirm
	        # that this asymmetry is intended for the log-ratio.
	        masked_probs = trans_prob[mask_pos]  # (K, V)
	        prob_sum = masked_probs.sum(dim=-1, keepdim=True)  # (K, 1)
	        dead = prob_sum == 0.0
	        uniform_prob = trans_prob.new_zeros(trans_prob.size(-1))  # (V,)
	        uniform_prob[:mask] = 1.0 / mask
	        trans_prob[mask_pos] = torch.where(
	            dead,
	            uniform_prob,
	            masked_probs / prob_sum.masked_fill(dead, 1.0),
	        )

	    new_xt = _sample_tokens(trans_prob)
	    new_xt[xt == pad] = pad
	    # Already unmasked tokens are never resampled.
	    new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)

	    # ——— compute log probabilities for RND ———
	    lp = torch.gather(torch.log(trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
	    lp_pre = torch.gather(torch.log(pretrained_trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)

	    # Only positions that were unmasked in this step contribute.
	    changed_mask = (xt == mask) & (new_xt != mask) & (new_xt != pad)

	    log_policy_step = (lp * changed_mask).sum(dim=1)
	    log_pretrained_step = (lp_pre * changed_mask).sum(dim=1)

	    log_rnd = log_pretrained_step - log_policy_step  # (B,)

	    if not last_step:
	        # ——— gap-wise insertion — compute new length, fill masks, scatter tokens ———
	        ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)

	        insertion_rate = (len_rate * dt).clamp(min=1e-10)  # (B, L+1)
	        pretrained_insertion_rate = (pretrained_len_rate * dt).clamp(min=1e-10)  # (B, L+1)

	        # log P(ext; λ) = ext*log(λ) - λ, evaluated on the raw draw (before
	        # gap masking and overflow rejection below).
	        log_policy_insert = (ext * torch.log(insertion_rate) - insertion_rate).sum(dim=1)  # (B,)
	        log_pretrained_insert = (ext * torch.log(pretrained_insertion_rate) - pretrained_insertion_rate).sum(dim=1)  # (B,)

	        log_insert_diff = log_pretrained_insert - log_policy_insert  # (B,)
	        log_rnd += log_insert_diff
	        log_pretrained_step += log_pretrained_insert
	        log_policy_step += log_policy_insert

	        xt_len = xt.ne(pad).sum(dim=1)  # (B,)
	        seq_dim = ext.size(1)  # Use actual ext dimension to avoid mismatch
	        gaps = torch.arange(seq_dim, device=device).view(1, -1)
	        # Only gaps 0 .. xt_len exist in a sequence of length xt_len.
	        ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
	        # Reject all insertions of a row that would exceed max_length.
	        valid = xt_len + ext.sum(dim=1) <= max_length
	        ext = ext * valid.view(batch_size, 1).long()
	        # Count after rejection so rejected rows keep their length.
	        total_ext = ext.sum(dim=1)

	        ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
	        new_len = xt_len + total_ext  # (B,)

	        # Fill the new length with masks, then drop the existing tokens at
	        # their shifted positions.
	        xt_tmp = torch.full_like(xt, pad)
	        mask_fill = pos_idx_L < new_len.view(batch_size, 1)
	        xt_tmp[mask_fill] = mask

	        new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
	        orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
	        flat_b = batch_idx_L[orig_mask]
	        flat_p = new_pos_orig[orig_mask]
	        xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
	    else:
	        xt_tmp = new_xt

	    return xt_tmp, log_rnd, log_policy_step, log_pretrained_step

	@torch.no_grad()
	def any_order_euler_sampling_with_schedule(
	    model: torch.nn.Module,
	    time_schedule: torch.Tensor,
	    mask: int,
	    pad: int,
	    batch_size: int,
	    max_length: int,
	    return_trace: bool = False,
	    temperature: float = 1.0,
	) -> tuple[torch.Tensor, Optional[list[list[SamplingTraceDatapoint]]]]:
	    """Sample variable-length sequences on a caller-supplied time grid.

	    Starts from an all-pad batch and performs ``len(time_schedule) - 1``
	    Euler steps. Step ``i`` evaluates the model at ``time_schedule[i]`` and
	    uses ``|time_schedule[i] - time_schedule[i + 1]|`` as step size. Each
	    step samples a token for every masked position and, except on the last
	    step, inserts new mask tokens into the gaps between existing tokens.

	    Args:
	        model: called as ``model(xt, t)``; must provide ``parameters()`` and
	            ``interpolant.to_actual_rate(xt, pred, t)`` whose result has
	            ``unmask_rate`` (B, L, V) and ``length_rate`` (B, L+1).
	        time_schedule: 1-D tensor of time points; it is flipped when its
	            first entry is smaller than its last, so it is traversed in
	            descending order.
	        mask: id of the mask token; the zero-weight fallback assumes that
	            the sampleable tokens are ids ``0 .. mask - 1``.
	        pad: id of the padding token.
	        batch_size: number of sequences (B).
	        max_length: maximum sequence length (L).
	        return_trace: when True, collect a per-sequence event list.
	        temperature: when not 1.0, the transition weights are re-scaled as
	            ``softmax(log(p + 1e-10) / temperature)``.

	    Returns:
	        ``(xt, sampling_trace)``: the (B, L) int64 samples and, when
	        ``return_trace`` is True, one list of ``SamplingTraceDatapoint`` per
	        sequence (otherwise ``None``).
	    """
	    device = next(model.parameters()).device

	    time_schedule = time_schedule.to(device)
	    # NOTE(review): the schedule is traversed from large to small t, while
	    # the fixed-step samplers in this file start at t = 0 and increase —
	    # confirm that the descending direction is intended.
	    if time_schedule[0] < time_schedule[-1]:
	        time_schedule = torch.flip(time_schedule, [0])  # descending order

	    steps = len(time_schedule) - 1

	    # initialize all-pad sequence and trace
	    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)

	    # precompute row indices for scatter
	    batch_idx_L = (
	        torch.arange(batch_size, device=device)
	        .view(batch_size, 1)
	        .expand(batch_size, max_length)
	    )
	    pos_idx_L = (
	        torch.arange(max_length, device=device)
	        .view(1, max_length)
	        .expand(batch_size, max_length)
	    )
	    sampling_trace = [[] for _ in range(batch_size)] if return_trace else None

	    for i in range(steps):
	        is_last = i == steps - 1

	        # use scheduled timesteps
	        t = time_schedule[i].repeat(batch_size)
	        t_next = time_schedule[i + 1]
	        dt = (t - t_next).abs()  # (B,) timestep difference

	        # ——— predict and convert rates ———
	        pred_rate = model(xt, t)
	        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
	        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
	        len_rate = pred_rate.length_rate  # (B, L+1)

	        # ——— unmask step (Euler) ———
	        # Only masked positions may change: zero every other row, then put
	        # minus the total outgoing rate on the mask entry (clamped to 0 below).
	        mask_pos = (xt == mask).nonzero(as_tuple=True)
	        unmask_rate[xt != mask] = 0
	        unmask_rate[mask_pos + (mask,)] = 0
	        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
	        trans_prob = (unmask_rate * dt[:, None, None]).clamp(0.0, 1.0)

	        # Add "stay" weight 1 on the current token. Pad rows use the mask
	        # slot; they are reset to pad right after sampling.
	        stay_idx = xt.clone()
	        stay_idx[xt == pad] = mask
	        stay_idx = stay_idx.unsqueeze(-1)
	        trans_prob.scatter_add_(
	            2,
	            stay_idx,
	            torch.ones_like(stay_idx, dtype=trans_prob.dtype),
	        )

	        # Apply temperature scaling
	        if temperature != 1.0:
	            logits = torch.log(trans_prob + 1e-10) / temperature
	            trans_prob = torch.softmax(logits, dim=-1)

	        if is_last:
	            print("Final step, removing mask token from sampling")
	            trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step

	            # Renormalize every masked row. Rows left with zero total weight
	            # get a uniform distribution over token ids 0 .. mask - 1.
	            masked_probs = trans_prob[mask_pos]  # (K, V)
	            prob_sum = masked_probs.sum(dim=-1, keepdim=True)  # (K, 1)
	            dead = prob_sum == 0.0
	            uniform_prob = trans_prob.new_zeros(trans_prob.size(-1))  # (V,)
	            uniform_prob[:mask] = 1.0 / mask
	            trans_prob[mask_pos] = torch.where(
	                dead,
	                uniform_prob,
	                masked_probs / prob_sum.masked_fill(dead, 1.0),
	            )

	        new_xt = _sample_tokens(trans_prob)
	        new_xt[xt == pad] = pad
	        # Already unmasked tokens are never resampled.
	        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)

	        if not is_last:
	            # ——— gap-wise insertion — compute new length, fill masks, scatter tokens ———
	            ext = torch.bernoulli((len_rate * dt[:, None]).clamp(0.0, 1.0)).long()  # (B, L+1)
	            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
	            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
	            # Only gaps 0 .. xt_len exist in a sequence of length xt_len.
	            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
	            # Reject all insertions of a row that would exceed max_length.
	            valid = xt_len + ext.sum(dim=1) <= max_length
	            ext = ext * valid.view(batch_size, 1).long()
	            # Count after rejection so rejected rows keep their length.
	            total_ext = ext.sum(dim=1)

	            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
	            new_len = xt_len + total_ext  # (B,)

	            # Fill the new length with masks, then drop the existing tokens
	            # at their shifted positions.
	            xt_tmp = torch.full_like(xt, pad)
	            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
	            xt_tmp[mask_fill] = mask

	            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
	            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
	            flat_b = batch_idx_L[orig_mask]
	            flat_p = new_pos_orig[orig_mask]
	            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
	        else:
	            ext = None  # no insertion on the last step
	            xt_tmp = new_xt

	        if return_trace:
	            changed = (xt != pad) & (xt != new_xt)
	            for batch_idx in range(batch_size):
	                step_time = t[batch_idx].item()
	                new_tokens = new_xt[batch_idx].tolist()

	                # Token changes, in increasing position order.
	                for position in changed[batch_idx].nonzero().flatten().tolist():
	                    sampling_trace[batch_idx].append(
	                        SamplingTraceDatapoint(
	                            t=step_time,
	                            event_type="change",
	                            position=position,
	                            token=new_tokens[position],
	                        )
	                    )

	                # Insertions, in decreasing gap order.
	                if ext is not None:
	                    inserted = ext[batch_idx, :max_length].nonzero().flatten().flip(0)
	                    for gap in inserted.tolist():
	                        sampling_trace[batch_idx].append(
	                            SamplingTraceDatapoint(
	                                t=step_time,
	                                event_type="insertion",
	                                position=gap,
	                                token=mask,
	                            )
	                        )

	        xt = xt_tmp

	    return xt, sampling_trace


	@torch.no_grad()
	def any_order_mask_insertion_euler_sampling_with_rnd(
	    model, pretrained, reward_model, analyzer,
	    tokenizer, steps,
	    mask,
	    pad,
	    batch_size,
	    max_length,
	    return_trace = False,
	    alpha = 0.1,
	    temperature: float = 1.0,
	):
	    """Sample with ``model`` and accumulate the log-ratio against ``pretrained``.

	    Runs the unmask / mask-insertion Euler sampler from an all-pad batch at
	    t = 0 with step size ``1 / steps``. At every step the log-probability of
	    the sampled move under ``pretrained`` minus that under ``model`` is added
	    to a per-sequence accumulator. Afterwards the samples are decoded, only
	    sequences accepted by ``analyzer.is_peptide`` are kept, and the summed
	    reward divided by ``alpha`` is added to their accumulated log-ratio.

	    Args:
	        model: policy; called as ``model(xt, t)`` and must provide
	            ``parameters()`` and ``interpolant.to_actual_rate``, whose result
	            has ``unmask_rate`` (B, L, V) and ``length_rate`` (B, L+1).
	        pretrained: reference model with the same interface.
	        reward_model: called as ``reward_model(input_seqs=...)``; its result
	            is summed over the last axis with ``np.sum``.
	        analyzer: object providing ``is_peptide(seq)``.
	        tokenizer: object providing ``batch_decode(samples)``.
	        steps: number of Euler steps.
	        mask: id of the mask token; the zero-weight fallback assumes that
	            the sampleable tokens are ids ``0 .. mask - 1``.
	        pad: id of the padding token.
	        batch_size: number of sequences (B).
	        max_length: maximum sequence length (L).
	        return_trace: when True, collect a per-sequence event list.
	        alpha: divisor applied to the scalar reward.
	        temperature: when not 1.0, the policy's transition weights are
	            re-scaled as ``softmax(log(p + 1e-10) / temperature)``; the
	            pretrained weights are left unchanged.

	    Returns:
	        ``(valid_x_final, log_rnd, scalar_rewards, sampling_trace)`` restricted
	        to the valid sequences. ``sampling_trace`` still covers the whole
	        batch (or is ``None``).

	    Raises:
	        RuntimeError: if none of the sampled sequences is a valid peptide.
	    """
	    device = next(model.parameters()).device

	    # initialize all-pad sequence and trace
	    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)

	    # initialize log_rnd to accumulate log probability ratios
	    log_rnd = torch.zeros(batch_size, device=device)

	    dt = 1.0 / steps
	    t = torch.zeros(batch_size, device=device)

	    # precompute row indices for scatter
	    batch_idx_L = (
	        torch.arange(batch_size, device=device)
	        .view(batch_size, 1)
	        .expand(batch_size, max_length)
	    )
	    pos_idx_L = (
	        torch.arange(max_length, device=device)
	        .view(1, max_length)
	        .expand(batch_size, max_length)
	    )
	    sampling_trace = [[] for _ in range(batch_size)] if return_trace else None

	    for i in range(steps):
	        is_last = i == steps - 1

	        # ——— predict and convert rates ———
	        pred_rate = model(xt, t)
	        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
	        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
	        len_rate = pred_rate.length_rate  # (B, L+1)

	        # ——— get pretrained model rates for log_rnd computation ———
	        pretrained_pred = pretrained(xt, t)
	        pretrained_rate = pretrained.interpolant.to_actual_rate(xt, pretrained_pred, t)
	        pretrained_unmask_rate = pretrained_rate.unmask_rate.clone()  # (B, L, V)
	        pretrained_len_rate = pretrained_rate.length_rate  # (B, L+1)

	        # ——— unmask step (Euler) ———
	        # Only masked positions may change: zero every other row, then put
	        # minus the total outgoing rate on the mask entry (clamped to 0 below).
	        mask_pos = (xt == mask).nonzero(as_tuple=True)
	        unmask_rate[xt != mask] = 0
	        unmask_rate[mask_pos + (mask,)] = 0
	        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
	        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)

	        # Same for pretrained
	        pretrained_unmask_rate[xt != mask] = 0
	        pretrained_unmask_rate[mask_pos + (mask,)] = 0
	        pretrained_unmask_rate[mask_pos + (mask,)] = -pretrained_unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
	        pretrained_trans_prob = (pretrained_unmask_rate * dt).clamp(0.0, 1.0)

	        # Add "stay" weight 1 on the current token. Pad rows use the mask
	        # slot; they are reset to pad right after sampling.
	        stay_idx = xt.clone()
	        stay_idx[xt == pad] = mask
	        stay_idx = stay_idx.unsqueeze(-1)
	        trans_prob.scatter_add_(
	            2,
	            stay_idx,
	            torch.ones_like(stay_idx, dtype=trans_prob.dtype),
	        )
	        pretrained_trans_prob.scatter_add_(
	            2,
	            stay_idx,
	            torch.ones_like(stay_idx, dtype=pretrained_trans_prob.dtype),
	        )

	        # Apply temperature scaling
	        if temperature != 1.0:
	            logits = torch.log(trans_prob + 1e-10) / temperature
	            trans_prob = torch.softmax(logits, dim=-1)

	        if is_last:
	            print("Final step, removing mask token from sampling")
	            trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step

	            # Renormalize every masked row. Rows left with zero total weight
	            # get a uniform distribution over token ids 0 .. mask - 1.
	            # NOTE(review): pretrained_trans_prob is not adjusted here —
	            # confirm that this asymmetry is intended for the log-ratio.
	            masked_probs = trans_prob[mask_pos]  # (K, V)
	            prob_sum = masked_probs.sum(dim=-1, keepdim=True)  # (K, 1)
	            dead = prob_sum == 0.0
	            uniform_prob = trans_prob.new_zeros(trans_prob.size(-1))  # (V,)
	            uniform_prob[:mask] = 1.0 / mask
	            trans_prob[mask_pos] = torch.where(
	                dead,
	                uniform_prob,
	                masked_probs / prob_sum.masked_fill(dead, 1.0),
	            )

	        new_xt = _sample_tokens(trans_prob)
	        new_xt[xt == pad] = pad
	        # Already unmasked tokens are never resampled.
	        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)

	        # ——— compute log probabilities for RND ———
	        lp = torch.gather(torch.log(trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)
	        lp_pre = torch.gather(torch.log(pretrained_trans_prob + 1e-10), 2, new_xt.unsqueeze(-1)).squeeze(-1)

	        # Only positions that were unmasked in this step contribute.
	        changed_mask = (xt == mask) & (new_xt != mask) & (new_xt != pad)

	        log_policy_step = (lp * changed_mask).sum(dim=1)
	        log_pretrained_step = (lp_pre * changed_mask).sum(dim=1)

	        # Accumulate across steps instead of overwriting the running total.
	        log_rnd = log_rnd + (log_pretrained_step - log_policy_step)  # (B,)

	        if not is_last:
	            ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)

	            insertion_rate = (len_rate * dt).clamp(min=1e-10)  # (B, L+1)
	            pretrained_insertion_rate = (pretrained_len_rate * dt).clamp(min=1e-10)  # (B, L+1)

	            # log P(ext; λ) = ext*log(λ) - λ, evaluated on the raw draw
	            # (before gap masking and overflow rejection below).
	            log_policy_insert = (ext * torch.log(insertion_rate) - insertion_rate).sum(dim=1)  # (B,)
	            log_pretrained_insert = (ext * torch.log(pretrained_insertion_rate) - pretrained_insertion_rate).sum(dim=1)  # (B,)

	            log_rnd = log_rnd + (log_pretrained_insert - log_policy_insert)  # (B,)

	            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
	            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
	            # Only gaps 0 .. xt_len exist in a sequence of length xt_len.
	            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
	            # Reject all insertions of a row that would exceed max_length.
	            valid = xt_len + ext.sum(dim=1) <= max_length
	            ext = ext * valid.view(batch_size, 1).long()
	            # Count after rejection so rejected rows keep their length.
	            total_ext = ext.sum(dim=1)

	            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
	            new_len = xt_len + total_ext  # (B,)

	            # Fill the new length with masks, then drop the existing tokens
	            # at their shifted positions.
	            xt_tmp = torch.full_like(xt, pad)
	            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
	            xt_tmp[mask_fill] = mask

	            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
	            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
	            flat_b = batch_idx_L[orig_mask]
	            flat_p = new_pos_orig[orig_mask]
	            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
	        else:
	            ext = None  # no insertion on the last step
	            xt_tmp = new_xt

	        if return_trace:
	            changed = (xt != pad) & (xt != new_xt)
	            for batch_idx in range(batch_size):
	                step_time = t[batch_idx].item()
	                new_tokens = new_xt[batch_idx].tolist()

	                # Token changes, in increasing position order.
	                for position in changed[batch_idx].nonzero().flatten().tolist():
	                    sampling_trace[batch_idx].append(
	                        SamplingTraceDatapoint(
	                            t=step_time,
	                            event_type="change",
	                            position=position,
	                            token=new_tokens[position],
	                        )
	                    )

	                # Insertions, in decreasing gap order.
	                if ext is not None:
	                    inserted = ext[batch_idx, :max_length].nonzero().flatten().flip(0)
	                    for gap in inserted.tolist():
	                        sampling_trace[batch_idx].append(
	                            SamplingTraceDatapoint(
	                                t=step_time,
	                                event_type="insertion",
	                                position=gap,
	                                token=mask,
	                            )
	                        )

	        xt = xt_tmp
	        t = t + dt

	    # Decode the samples and keep only the valid peptides.
	    decoded_samples = tokenizer.batch_decode(xt)
	    keep = [idx for idx, seq in enumerate(decoded_samples) if analyzer.is_peptide(seq)]
	    valid_sequences = [decoded_samples[idx] for idx in keep]

	    print("len valid sequences:", len(valid_sequences))
	    if not keep:
	        # Fail with a clear message before calling the reward model on an
	        # empty list and before stacking zero tensors.
	        raise RuntimeError("no valid peptide sequences were sampled")

	    # compute multi-objective rewards
	    score_vectors = reward_model(input_seqs=valid_sequences)
	    scalar_rewards = np.sum(score_vectors, axis=-1)
	    scalar_rewards = torch.as_tensor(scalar_rewards, dtype=torch.float32, device=device)

	    print(f"scalar reward dim{len(scalar_rewards)}")
	    valid_log_rnd = log_rnd[keep]

	    log_rnd = valid_log_rnd + (scalar_rewards / alpha)  # scale down by alpha
	    valid_x_final = xt[keep]

	    return valid_x_final, log_rnd, scalar_rewards, sampling_trace

	@torch.no_grad()
	def any_order_finetuned_euler_sampler(
	    model, reward_model, analyzer,
	    tokenizer, steps,
	    mask,
	    pad,
	    batch_size,
	    max_length,
	    return_trace = False,
	    dataframe = False,
	    temperature: float = 1.0,
	):
	    """Sample sequences with ``model`` and score the valid peptides.

	    Runs the unmask / mask-insertion Euler sampler from an all-pad batch at
	    t = 0 with step size ``1 / steps``, decodes the result, keeps the
	    sequences accepted by ``analyzer.is_peptide`` and evaluates them with
	    ``reward_model``.

	    Args:
	        model: called as ``model(xt, t)``; must provide ``parameters()`` and
	            ``interpolant.to_actual_rate(xt, pred, t)`` whose result has
	            ``unmask_rate`` (B, L, V) and ``length_rate`` (B, L+1).
	        reward_model: called as ``reward_model(input_seqs=...)``; the result
	            is transposed and its first five rows are read as affinity,
	            solubility, hemolysis, nonfouling and permeability.
	        analyzer: object providing ``is_peptide(seq)``.
	        tokenizer: object providing ``batch_decode(samples)``.
	        steps: number of Euler steps.
	        mask: id of the mask token; the zero-weight fallback assumes that
	            the sampleable tokens are ids ``0 .. mask - 1``.
	        pad: id of the padding token.
	        batch_size: number of sequences (B).
	        max_length: maximum sequence length (L).
	        return_trace: accepted for signature parity; no trace is returned.
	        dataframe: when True, also return a ``pandas.DataFrame`` with one row
	            per valid sequence.
	        temperature: when not 1.0, the transition weights are re-scaled as
	            ``softmax(log(p + 1e-10) / temperature)``.

	    Returns:
	        ``(samples, affinity, sol, hemo, nf, permeability, valid_fraction)``,
	        followed by the DataFrame when ``dataframe`` is True. When no sequence
	        is valid, each score is ``[0.0]`` and the DataFrame is empty.
	    """
	    device = next(model.parameters()).device

	    # initialize all-pad sequence
	    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)

	    dt = 1.0 / steps
	    t = torch.zeros(batch_size, device=device)

	    # precompute row indices for scatter
	    batch_idx_L = (
	        torch.arange(batch_size, device=device)
	        .view(batch_size, 1)
	        .expand(batch_size, max_length)
	    )
	    pos_idx_L = (
	        torch.arange(max_length, device=device)
	        .view(1, max_length)
	        .expand(batch_size, max_length)
	    )

	    for i in range(steps):
	        is_last = i == steps - 1

	        # ——— predict and convert rates ———
	        pred_rate = model(xt, t)
	        pred_rate = model.interpolant.to_actual_rate(xt, pred_rate, t)
	        unmask_rate = pred_rate.unmask_rate  # (B, L, V)
	        len_rate = pred_rate.length_rate  # (B, L+1)

	        # ——— unmask step (Euler) ———
	        # Only masked positions may change: zero every other row, then put
	        # minus the total outgoing rate on the mask entry (clamped to 0 below).
	        mask_pos = (xt == mask).nonzero(as_tuple=True)
	        unmask_rate[xt != mask] = 0
	        unmask_rate[mask_pos + (mask,)] = 0
	        unmask_rate[mask_pos + (mask,)] = -unmask_rate[mask_pos + (slice(None),)].sum(dim=1)
	        trans_prob = (unmask_rate * dt).clamp(0.0, 1.0)

	        # Add "stay" weight 1 on the current token. Pad rows use the mask
	        # slot; they are reset to pad right after sampling.
	        stay_idx = xt.clone()
	        stay_idx[xt == pad] = mask
	        stay_idx = stay_idx.unsqueeze(-1)
	        trans_prob.scatter_add_(
	            2,
	            stay_idx,
	            torch.ones_like(stay_idx, dtype=trans_prob.dtype),
	        )

	        # Apply temperature scaling
	        if temperature != 1.0:
	            logits = torch.log(trans_prob + 1e-10) / temperature
	            trans_prob = torch.softmax(logits, dim=-1)

	        if is_last:
	            print("Final step, removing mask token from sampling")
	            trans_prob[mask_pos + (mask,)] = 0.0  # remove mask token from sampling at the last step

	            # Renormalize every masked row. Rows left with zero total weight
	            # get a uniform distribution over token ids 0 .. mask - 1.
	            masked_probs = trans_prob[mask_pos]  # (K, V)
	            prob_sum = masked_probs.sum(dim=-1, keepdim=True)  # (K, 1)
	            dead = prob_sum == 0.0
	            uniform_prob = trans_prob.new_zeros(trans_prob.size(-1))  # (V,)
	            uniform_prob[:mask] = 1.0 / mask
	            trans_prob[mask_pos] = torch.where(
	                dead,
	                uniform_prob,
	                masked_probs / prob_sum.masked_fill(dead, 1.0),
	            )

	        new_xt = _sample_tokens(trans_prob)
	        new_xt[xt == pad] = pad
	        # Already unmasked tokens are never resampled.
	        new_xt = torch.where((xt != mask) & (xt != pad), xt, new_xt)

	        if not is_last:
	            # gap-wise insertion — compute new length, fill masks, scatter tokens
	            ext = torch.bernoulli((len_rate * dt).clamp(0.0, 1.0)).long()  # (B, L+1)
	            xt_len = xt.ne(pad).sum(dim=1)  # (B,)
	            gaps = torch.arange(max_length + 1, device=device).view(1, -1)
	            # Only gaps 0 .. xt_len exist in a sequence of length xt_len.
	            ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
	            # Reject all insertions of a row that would exceed max_length.
	            valid = xt_len + ext.sum(dim=1) <= max_length
	            ext = ext * valid.view(batch_size, 1).long()
	            # Count after rejection so rejected rows keep their length.
	            total_ext = ext.sum(dim=1)

	            ext_ex = ext.int().cumsum(dim=1)  # (B, L+1)
	            new_len = xt_len + total_ext  # (B,)

	            # Fill the new length with masks, then drop the existing tokens
	            # at their shifted positions.
	            xt_tmp = torch.full_like(xt, pad)
	            mask_fill = pos_idx_L < new_len.view(batch_size, 1)
	            xt_tmp[mask_fill] = mask

	            new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
	            orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
	            flat_b = batch_idx_L[orig_mask]
	            flat_p = new_pos_orig[orig_mask]
	            xt_tmp[flat_b, flat_p] = new_xt[orig_mask]
	        else:
	            xt_tmp = new_xt

	        xt = xt_tmp
	        t = t + dt

	    # start eval
	    samples = xt.to(device)

	    decoded_samples = tokenizer.batch_decode(samples)
	    validSequences = [seq for seq in decoded_samples if analyzer.is_peptide(seq)]

	    print("len valid sequences:", len(validSequences))
	    valid_fraction = len(validSequences) / batch_size

	    if validSequences:
	        # one row per objective after the transpose
	        score_vectors = reward_model(input_seqs=validSequences)  # (num_children, num_objectives)
	        average_scores = score_vectors.T

	        affinity = average_scores[0]
	        sol = average_scores[1]
	        hemo = average_scores[2]
	        nf = average_scores[3]
	        permeability = average_scores[4]
	    else:
	        # Separate lists so that callers mutating one score do not change
	        # the others.
	        affinity, sol, hemo, nf, permeability = ([0.0] for _ in range(5))

	    if not dataframe:
	        return samples, affinity, sol, hemo, nf, permeability, valid_fraction

	    # With no valid sequence every column is empty so that all columns have
	    # the same length.
	    has_valid = bool(validSequences)
	    df = pd.DataFrame({
	        "Peptide Sequence": validSequences,
	        "Binding Affinity": affinity if has_valid else [],
	        "Solubility": sol if has_valid else [],
	        "Hemolysis": hemo if has_valid else [],
	        "Nonfouling": nf if has_valid else [],
	        "Permeability": permeability if has_valid else [],
	    })
	    return samples, affinity, sol, hemo, nf, permeability, valid_fraction, df

	return samples, affinity, sol, hemo, nf, permeability, valid_fraction

	@torch.no_grad()
	def mdm_tau_leaping_sampling(
	    model: MaskedDiffusionModule,
	    steps: int,
	    mask: int,
	    pad: int,
	    batch_size: int,
	    max_length: int,
	    return_trace: bool = False,
	    temperature: float = 1.0,
	):
	    """Sample fixed-length sequences from a masked diffusion model by tau-leaping.

	    Sampling starts from a ``(batch_size, max_length)`` tensor filled with
	    ``mask``. On every step except the last, per-token event counts are drawn
	    from ``Poisson(unmask_rate * dt)`` and a masked position is unmasked only
	    if exactly one event fired for it. On the last step every position that is
	    still masked receives the highest-rate token.

	    Args:
	        model: Callable as ``model(xt, t)``; must expose ``parameters()`` and
	            ``interpolant.to_actual_rate(xt, pred, t)`` whose result carries an
	            ``unmask_rate`` attribute (indexed as ``(B, L, V)`` here).
	        steps: Number of sampling steps; the step size is ``1 / steps``.
	        mask: Index of the mask token.
	        pad: Index of the padding token. Not used by this sampler.
	        batch_size: Number of sequences to sample.
	        max_length: Length of every sampled sequence.
	        return_trace: Must be falsy; traces are not supported.
	        temperature: Accepted for signature parity with the other samplers;
	            not used by this sampler.

	    Returns:
	        A tuple ``(xt, [])`` where ``xt`` is the ``(batch_size, max_length)``
	        int64 tensor of sampled tokens and the list is an empty trace.

	    Raises:
	        AssertionError: If ``return_trace`` is truthy.
	    """
	    assert not return_trace, "Trace is not yet supported"
	    device = next(model.parameters()).device
	    xt = torch.full((batch_size, max_length), mask, dtype=torch.int64, device=device)
	    dt = 1.0 / steps
	    t = torch.zeros(batch_size, device=device)

	    for i in range(steps):
	        # ——— predict and convert rates ———
	        pred = model(xt, t)
	        pred = model.interpolant.to_actual_rate(xt, pred, t)
	        unmask_rate = pred.unmask_rate  # (B, L, V)
	        update_pos = xt == mask  # (B, L)

	        if i == steps - 1:
	            # Last step: deterministic unmask via argmax. The mask column is
	            # excluded (as it is in the Poisson branch below) so the final
	            # sample can never keep a mask token.
	            final_rate = unmask_rate.clone()
	            final_rate[..., mask] = float("-inf")
	            new_token = final_rate.argmax(dim=2)  # (B, L)
	        else:
	            # Tau-leaping via Poisson counts; mask -> mask is not an event.
	            counts = torch.poisson(unmask_rate * dt).long()
	            counts[..., mask] = 0
	            new_token = counts.argmax(dim=2)  # (B, L)
	            # Only accept masked positions where exactly one event fired.
	            update_pos = update_pos & (counts.sum(dim=2) == 1)

	        # Already-unmasked tokens are always kept.
	        xt = torch.where(update_pos, new_token, xt)
	        t = t + dt

	    return xt, []

	# Not used in production, for debugging purposes.
	# Each key is a sequence length and each value is the probability weight
	# that calculate_rate / calculate_rate_batch assign to that length.
	lengths = {
	    4: 0.1,
	    16: 0.4,
	    32: 0.4,
	    64: 0.1,
	}

	def binomial_mass(k, n, p):
	    """Return the binomial probability mass P(X = k) for X ~ Binomial(n, p).

	    Args:
	        k (int): Number of successes.
	        n (int): Number of trials.
	        p (float): Probability of success in a single trial.

	    Returns:
	        float: ``C(n, k) * p**k * (1 - p)**(n - k)``, or ``0.0`` when ``k > n``
	        or when ``k`` or ``n`` is negative.
	    """
	    import math

	    try:
	        ways = math.comb(n, k)
	    except ValueError:
	        # math.comb rejects negative arguments; such outcomes have no mass.
	        return 0.0

	    if k > n:
	        # More successes than trials is impossible.
	        return 0.0

	    success_term = p ** k
	    failure_term = (1 - p) ** (n - k)
	    return float(ways) * success_term * failure_term

	def calculate_rate_batch(alpha_t, len_t):
	    """Vectorised rate computation over a batch of ``(alpha_t, len_t)`` pairs.

	    For every entry the result is the ratio of two sums taken over the
	    module-level ``lengths`` table, restricted to table lengths that are at
	    least ``len_t``: the numerator weights each binomial mass by
	    ``(length - len_t) * probability``, the denominator by ``probability``.

	    Args:
	        alpha_t (torch.Tensor): Tensor of shape (batch_size,), used as the
	            binomial success probability.
	        len_t (torch.Tensor): Tensor of shape (batch_size,), used as the
	            binomial outcome.

	    Returns:
	        torch.Tensor: Tensor of shape (batch_size,) with the rates; entries
	        whose denominator is not positive are left at zero.
	    """
	    n_rows = alpha_t.shape[0]
	    target_device = alpha_t.device

	    numerator = torch.zeros(n_rows, device=target_device)
	    denominator = torch.zeros(n_rows, device=target_device)

	    for max_len, weight in lengths.items():
	        # Only entries with 0 <= len_t <= max_len contribute for this length.
	        in_range = (len_t >= 0) & (len_t <= max_len)
	        if not in_range.any():
	            continue

	        rows = in_range.nonzero(as_tuple=True)[0]
	        cur_len = len_t[rows]
	        cur_alpha = alpha_t[rows]

	        # Binomial(max_len, alpha_t) mass evaluated at len_t.
	        pmf = (
	            torch.distributions.Binomial(total_count=max_len, probs=cur_alpha)
	            .log_prob(cur_len)
	            .exp()
	        )

	        numerator[rows] += (max_len - cur_len) * weight * pmf
	        denominator[rows] += weight * pmf

	    # Divide only where the denominator is positive; other entries stay 0.
	    rate = torch.zeros_like(numerator)
	    has_mass = denominator > 0
	    rate[has_mass] = numerator[has_mass] / (denominator[has_mass])

	    return rate

	# Keep the original function for backward compatibility
	def calculate_rate(alpha_t, len_t):
	    """Legacy scalar version of calculate_rate.

	    Tensors with at least one dimension are forwarded to
	    ``calculate_rate_batch``. Otherwise the rate is accumulated over the
	    module-level ``lengths`` table using ``binomial_mass``; ``0.0`` is
	    returned when the accumulated denominator is zero.
	    """
	    is_batched = isinstance(alpha_t, torch.Tensor) and alpha_t.ndim > 0
	    if is_batched:
	        return calculate_rate_batch(alpha_t, len_t)

	    nom = 0
	    denom = 0
	    for length, probability in lengths.items():
	        if len_t > length:
	            # Lengths shorter than the current length do not contribute.
	            continue
	        mass = binomial_mass(len_t, length, alpha_t)
	        nom += (length - len_t) * probability * mass
	        denom += probability * mass

	    if denom == 0:
	        return 0.0

	    return nom / denom


	@torch.no_grad()
	def any_order_mask_insertion_tau_leaping_sampling(
	    model: torch.nn.Module,
	    steps: int,
	    mask: int,
	    pad: int,
	    batch_size: int,
	    max_length: int,
	    return_trace: bool = False,
	    confidence_based_sampling: bool = True,  # whether to use confidence-based decoding
	    alpha: float = 5.0,  # hyperparameter for window size calculation
	    max_window: int = 32,  # Maximum window size for sliding window
	    confidence_method: str = "prob_diff",  # "position", "top_prob", "prob_diff", "entropy"
	    use_sliding_window: bool = False,  # whether to use sliding window for position selection
	    temperature: float = 1.0,
	) -> tuple[torch.Tensor, list[torch.Tensor]]:
	    """Sample variable-length sequences by alternating unmasking and mask insertion.

	    Sampling starts from a ``(batch_size, max_length)`` tensor filled with
	    ``pad``. Every step except the last one

	    1. unmasks some masked positions, either the most confident ones
	       (``confidence_based_sampling``) or via per-token Poisson tau-leaping, and
	    2. inserts new mask tokens into the gaps of each sequence, with counts
	       drawn from ``Poisson(length_rate * dt)``; a row whose insertions would
	       exceed ``max_length`` gets no insertion on that step.

	    The last step fills every remaining mask with its highest-rate token.

	    Args:
	        model: Callable as ``model(xt, t)``; must expose ``parameters()`` and
	            ``interpolant.to_actual_rate(xt, pred, t)`` whose result carries
	            ``unmask_rate`` (indexed as ``(B, L, V)``) and ``length_rate``
	            (indexed as ``(B, L + 1)``).
	        steps: Number of sampling steps; the step size is ``1 / steps``.
	        mask: Index of the mask token.
	        pad: Index of the padding token.
	        batch_size: Number of sequences to sample.
	        max_length: Maximum sequence length (width of the returned tensor).
	        return_trace: If true, the state after every non-final step is
	            appended to the returned trace.
	        confidence_based_sampling: Use confidence-ranked unmasking instead of
	            Poisson tau-leaping over tokens.
	        alpha: Multiplier applied to the drawn unmask count to size the
	            sliding window.
	        max_window: Upper bound on the sliding-window size.
	        confidence_method: One of ``"position"``, ``"top_prob"``,
	            ``"prob_diff"`` or ``"entropy"``.
	        use_sliding_window: Restrict unmasking to the first ``k`` masked
	            positions of each row.
	        temperature: Accepted for signature parity with the other samplers;
	            not used by this sampler.

	    Returns:
	        A tuple ``(xt, sampling_trace)``: the final ``(batch_size, max_length)``
	        int64 token tensor and the list of intermediate states (empty unless
	        ``return_trace`` is set; the final state is not appended).

	    Raises:
	        ValueError: If ``confidence_method`` is unknown when a
	            confidence-based step runs.
	    """
	    device = next(model.parameters()).device
	    xt = torch.full((batch_size, max_length), pad, dtype=torch.int64, device=device)
	    sampling_trace = []
	    dt = 1.0 / steps
	    t = torch.zeros(batch_size, device=device)

	    # Precompute row/column indices for the insertion scatter.
	    batch_idx_L = (
	        torch.arange(batch_size, device=device)
	        .view(batch_size, 1)
	        .expand(batch_size, max_length)
	    )
	    pos_idx_L = (
	        torch.arange(max_length, device=device)
	        .view(1, max_length)
	        .expand(batch_size, max_length)
	    )

	    for i in range(steps):
	        # --- predict rates ---
	        pred = model(xt, t)
	        pred = model.interpolant.to_actual_rate(xt, pred, t)
	        unmask_rate = pred.unmask_rate  # (B, L, V)
	        len_rate = pred.length_rate  # (B, L+1)
	        mask_positions = xt == mask  # (B, L)

	        if i == steps - 1:
	            # Last step: deterministic unmask via argmax; pads and already
	            # decoded tokens are kept, and no insertion happens.
	            xt = torch.where(mask_positions, _greedy_non_mask_tokens(unmask_rate, mask), xt)
	            t = t + dt
	            break

	        if confidence_based_sampling:
	            # --- confidence-based decoding (vectorized) ---
	            num_mask_positions = mask_positions.sum(dim=1)  # (B,)

	            # 1. Number of tokens to unmask, drawn from a Poisson.
	            unmask_counts = torch.poisson(num_mask_positions.float() * dt).long()  # (B,)

	            # 2. Confidence score per position.
	            confidence = _unmask_confidence(unmask_rate, xt, pad, confidence_method)

	            # 3. Positions that may be unmasked this step.
	            if use_sliding_window:
	                k_values = torch.minimum(
	                    torch.minimum(
	                        (alpha * unmask_counts).long(),
	                        torch.tensor(max_window, device=device),
	                    ),
	                    num_mask_positions,
	                )  # (B,)
	                # Eligible = masked and among the first k masks of the row.
	                within_window = mask_positions.cumsum(dim=1) <= k_values.unsqueeze(1)
	                eligible = mask_positions & within_window
	            else:
	                eligible = mask_positions
	            confidence = confidence.masked_fill(~eligible, float("-inf"))

	            # A Poisson draw can exceed the number of eligible positions.
	            # Without this clamp, top-k would return -inf entries (pads or
	            # already-decoded tokens) and overwrite them, or fail outright
	            # when the count exceeds the row width.
	            unmask_counts = torch.minimum(unmask_counts, eligible.sum(dim=1))

	            new_xt = xt.clone()
	            max_unmask = int(unmask_counts.max().item())
	            if max_unmask > 0:
	                _, top_indices = torch.topk(confidence, k=max_unmask, dim=1, largest=True)
	                # Row b only uses its first unmask_counts[b] candidates.
	                take = (
	                    torch.arange(max_unmask, device=device).unsqueeze(0)
	                    < unmask_counts.unsqueeze(1)
	                )  # (B, max_unmask)
	                rows = (
	                    torch.arange(batch_size, device=device)
	                    .unsqueeze(1)
	                    .expand(-1, max_unmask)[take]
	                )
	                cols = top_indices[take]
	                greedy_tokens = _greedy_non_mask_tokens(unmask_rate, mask)  # (B, L)
	                new_xt[rows, cols] = greedy_tokens[rows, cols]
	        else:
	            # --- tau-leaping unmask via Poisson ---
	            counts = torch.poisson(unmask_rate * dt).long()
	            counts[~mask_positions.unsqueeze(-1).expand_as(counts)] = 0
	            counts[..., mask] = 0
	            # Only accept positions where exactly one event fired.
	            one_event = counts.sum(dim=2) == 1
	            new_xt = torch.where(one_event, counts.argmax(dim=2), xt)

	        # --- Poisson insertion, compute new lengths and fill masks ---
	        ext = torch.poisson(len_rate * dt).long()  # (B, L+1)
	        xt_len = xt.ne(pad).sum(dim=1)  # (B,)
	        gaps = torch.arange(max_length + 1, device=device).view(1, -1)
	        # Only gaps inside (or right after) the current sequence can grow.
	        ext = ext * (gaps <= xt_len.view(batch_size, 1)).long()
	        # Reject all insertions of a row that would overflow max_length.
	        fits = xt_len + ext.sum(dim=1) <= max_length
	        ext = ext * fits.view(batch_size, 1).long()
	        # Recompute the total after rejection so a rejected row keeps its
	        # length instead of being filled with masks up to max_length.
	        total_ext = ext.sum(dim=1)

	        # Prefix sums of insertions give the shift of each original token.
	        ext_ex = ext.cumsum(dim=1)  # (B, L+1)
	        new_len = xt_len + total_ext  # (B,)

	        # Initialize with pads, then fill masks up to new_len.
	        xt_tmp = torch.full_like(xt, pad)
	        xt_tmp[pos_idx_L < new_len.view(batch_size, 1)] = mask

	        # Shift and scatter the (possibly unmasked) original tokens.
	        new_pos_orig = pos_idx_L + ext_ex[:, :max_length]  # (B, L)
	        orig_mask = pos_idx_L < xt_len.view(batch_size, 1)
	        xt_tmp[batch_idx_L[orig_mask], new_pos_orig[orig_mask]] = new_xt[orig_mask]

	        xt = xt_tmp
	        t = t + dt
	        if return_trace:
	            sampling_trace.append(xt)

	    return xt, sampling_trace


	def _greedy_non_mask_tokens(unmask_rate: torch.Tensor, mask: int) -> torch.Tensor:
	    """Return the highest-rate token per position, never the mask token."""
	    rate = unmask_rate.clone()
	    rate[..., mask] = float("-inf")
	    return rate.argmax(dim=-1)


	def _unmask_confidence(
	    unmask_rate: torch.Tensor,
	    xt: torch.Tensor,
	    pad: int,
	    confidence_method: str,
	) -> torch.Tensor:
	    """Score every position of ``xt``; higher means it is unmasked earlier."""
	    if confidence_method == "position":
	        # Position-based confidence: 1 - position / current length.
	        batch_size, max_length = xt.shape
	        xt_len = (xt != pad).sum(dim=1)  # (B,)
	        position = torch.arange(max_length, device=xt.device).unsqueeze(0).expand(batch_size, -1)
	        return 1.0 - (position.float() / xt_len.unsqueeze(1).float().clamp(min=1))

	    if confidence_method not in ("top_prob", "prob_diff", "entropy"):
	        raise ValueError(f"Unknown confidence_method: {confidence_method}")

	    # The remaining methods treat the unmask rates as logits.
	    unmask_probs = torch.softmax(unmask_rate, dim=-1)  # (B, L, V)

	    if confidence_method == "top_prob":
	        return unmask_probs.max(dim=-1)[0]

	    if confidence_method == "prob_diff":
	        # Margin between the best and the second-best token.
	        top2_probs, _ = torch.topk(unmask_probs, k=2, dim=-1)  # (B, L, 2)
	        return top2_probs[:, :, 0] - top2_probs[:, :, 1]

	    # "entropy": negative entropy, so lower entropy gives higher confidence.
	    entropy = -torch.sum(unmask_probs * torch.log(unmask_probs + 1e-10), dim=-1)
	    return -entropy