Spaces:

faceless-void
/

UNIStainNet

Running

App Files Files Community

UNIStainNet / src /models /trainer.py

faceless-void

Upload folder using huggingface_hub

4db9215 verified 6 days ago

raw

history blame contribute delete

43.3 kB

	"""
	UNIStainNet: Pixel-Space UNI-Guided Virtual Staining Network.

	Architecture:
	Generator: SPADE-UNet conditioned on UNI pathology features + stain/class embedding
	Discriminator: Multi-scale PatchGAN (512 + 256)
	Losses: LPIPS@128 + adversarial + DAB intensity + DAB contrast

	References:
	- Park et al., "Semantic Image Synthesis with SPADE" (CVPR 2019)
	- Chen et al., "A general-purpose self-supervised model for pathology" (Nature Medicine 2024)
	- Isola et al., "Image-to-Image Translation with pix2pix" (CVPR 2017)
	"""

	import copy
	from collections import OrderedDict

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import lpips
	import pytorch_lightning as pl
	import torchvision
	import wandb

	from src.models.discriminator import (
	PatchDiscriminator, MultiScaleDiscriminator,
	hinge_loss_d, hinge_loss_g, r1_gradient_penalty, feature_matching_loss,
	)
	from src.models.generator import SPADEUNetGenerator
	from src.models.losses import VGGFeatureExtractor, gram_matrix, PatchNCELoss
	from src.utils.dab import DABExtractor


	# ======================================================================
	# Training Module
	# ======================================================================

	class UNIStainNetTrainer(pl.LightningModule):
	"""PyTorch Lightning training module for UNIStainNet.

	Handles GAN training with manual optimization, CFG dropout, EMA, and
	all loss computations.
	"""

	def __init__(
	    self,
	    # Architecture
	    num_classes=5,
	    null_class=4,
	    class_dim=64,
	    uni_dim=1024,
	    ndf=64,
	    disc_n_layers=3,
	    input_skip=False,
	    # Optimizer
	    gen_lr=1e-4,
	    disc_lr=4e-4,
	    warmup_steps=1000,
	    # Loss weights
	    lpips_weight=1.0,
	    lpips_256_weight=0.5,
	    lpips_512_weight=0.0,
	    adversarial_weight=1.0,
	    dab_intensity_weight=0.1,
	    dab_contrast_weight=0.05,
	    dab_sharpness_weight=0.0,
	    gram_style_weight=0.0,
	    edge_weight=0.0,
	    he_edge_weight=0.0,
	    bg_white_weight=0.0,
	    bg_threshold=0.85,
	    l1_lowres_weight=0.0,
	    edge_encoder=False,
	    edge_base_ch=32,
	    uni_spatial_size=4,
	    uncond_disc_weight=0.0,
	    crop_disc_weight=0.0,
	    crop_size=128,
	    feat_match_weight=0.0,
	    patchnce_weight=0.0,
	    patchnce_layers=(2, 3, 4),
	    patchnce_n_patches=256,
	    patchnce_temperature=0.07,
	    # Ablation
	    disable_uni=False,
	    disable_class=False,
	    # GAN training
	    r1_weight=10.0,
	    r1_every=16,
	    adversarial_start_step=2000,
	    # CFG
	    cfg_drop_class_prob=0.10,
	    cfg_drop_uni_prob=0.10,
	    cfg_drop_both_prob=0.05,
	    # EMA
	    ema_decay=0.999,
	    # On-the-fly UNI extraction (for crop-based training)
	    extract_uni_on_the_fly=False,
	    uni_spatial_pool_size=32,
	    # Resolution
	    image_size=512,
	    # 1024 architecture: extend UNI SPADE to 512 level
	    uni_spade_at_512=False,
	    # Per-label names for multi-stain logging
	    label_names=None,
	):
	    """Build the generator, discriminators, EMA copy and loss modules.

	    All arguments are stored through ``save_hyperparameters()`` and are
	    read back elsewhere in the class via ``self.hparams``. Optional
	    sub-modules are only instantiated when the weight that uses them is
	    positive; otherwise the corresponding attribute is ``None``:

	    * ``crop_discriminator``   -- needs ``crop_disc_weight > 0``
	    * ``uncond_discriminator`` -- needs ``uncond_disc_weight > 0`` or
	      ``feat_match_weight > 0``
	    * ``patchnce_loss``        -- needs ``patchnce_weight > 0``
	    * ``vgg_extractor``        -- needs ``gram_style_weight > 0``

	    Optimisation is manual (``automatic_optimization = False``).
	    """
	    super().__init__()
	    self.save_hyperparameters()
	    self.automatic_optimization = False

	    self.null_class = null_class

	    # The UNI backbone is created lazily on first use.
	    self._uni_model = None
	    self._uni_extract_on_the_fly = extract_uni_on_the_fly

	    # Generator
	    self.generator = SPADEUNetGenerator(
	        num_classes=num_classes,
	        class_dim=class_dim,
	        uni_dim=uni_dim,
	        input_skip=input_skip,
	        edge_encoder=edge_encoder,
	        edge_base_ch=edge_base_ch,
	        uni_spatial_size=uni_spatial_size,
	        image_size=image_size,
	        uni_spade_at_512=uni_spade_at_512,
	    )

	    # Global multi-scale discriminator; 6 input channels because the
	    # training step concatenates an IHC image with the H&E image.
	    self.discriminator = MultiScaleDiscriminator(
	        in_channels=6, ndf=ndf, n_layers=disc_n_layers,
	    )

	    # Crop discriminator (local full-res detail)
	    self.crop_discriminator = (
	        PatchDiscriminator(in_channels=6, ndf=ndf, n_layers=disc_n_layers)
	        if crop_disc_weight > 0
	        else None
	    )

	    # Unconditional discriminator (IHC image only). It is also created
	    # when only feature matching is enabled, since that loss reads its
	    # intermediate features.
	    wants_uncond = uncond_disc_weight > 0 or feat_match_weight > 0
	    self.uncond_discriminator = (
	        PatchDiscriminator(in_channels=3, ndf=ndf, n_layers=disc_n_layers)
	        if wants_uncond
	        else None
	    )

	    # PatchNCE loss (contrastive: H&E input vs generated)
	    if patchnce_weight > 0:
	        # Channel width of each encoder level that may be selected.
	        enc_channels = {1: 64, 2: 128, 3: 256, 4: 512}
	        self.patchnce_loss = PatchNCELoss(
	            layer_channels={layer: enc_channels[layer] for layer in patchnce_layers},
	            num_patches=patchnce_n_patches,
	            temperature=patchnce_temperature,
	        )
	    else:
	        self.patchnce_loss = None

	    # EMA generator: frozen deep copy, updated by _update_ema().
	    ema_copy = copy.deepcopy(self.generator)
	    ema_copy.requires_grad_(False)
	    self.generator_ema = ema_copy

	    # Frozen perceptual metric.
	    self.lpips_fn = lpips.LPIPS(net='alex').requires_grad_(False).eval()

	    self.dab_extractor = DABExtractor(device='cpu')

	    # VGG feature extractor for the Gram-matrix style loss.
	    vgg = None
	    if gram_style_weight > 0:
	        vgg = VGGFeatureExtractor()
	        vgg.requires_grad_(False)
	        vgg.eval()
	    self.vgg_extractor = vgg

	    # Parameter counts, printed once for information.
	    def _n_params(module):
	        return sum(p.numel() for p in module.parameters())

	    n_gen = _n_params(self.generator)
	    n_disc = _n_params(self.discriminator)
	    n_crop = _n_params(self.crop_discriminator) if self.crop_discriminator else 0
	    n_uncond = _n_params(self.uncond_discriminator) if self.uncond_discriminator else 0
	    print(f"Generator: {n_gen:,} params")
	    print(f"Discriminator: {n_disc:,} params (global) + {n_crop:,} (crop) + {n_uncond:,} (uncond)")

	def configure_optimizers(self):
	    """Create the two Adam optimizers used by the manual training loop.

	    Returns:
	        ``[opt_g, opt_d]``. ``opt_g`` owns the generator parameters plus
	        the PatchNCE parameters when that loss exists. ``opt_d`` owns the
	        parameters of every discriminator that was instantiated.
	    """
	    adam_betas = (0.0, 0.999)

	    g_params = [*self.generator.parameters()]
	    if self.patchnce_loss is not None:
	        g_params.extend(self.patchnce_loss.parameters())

	    # One optimizer drives all discriminators.
	    d_params = [*self.discriminator.parameters()]
	    for optional_disc in (self.crop_discriminator, self.uncond_discriminator):
	        if optional_disc is not None:
	            d_params.extend(optional_disc.parameters())

	    gen_optimizer = torch.optim.Adam(g_params, lr=self.hparams.gen_lr, betas=adam_betas)
	    disc_optimizer = torch.optim.Adam(d_params, lr=self.hparams.disc_lr, betas=adam_betas)
	    return [gen_optimizer, disc_optimizer]

	def _get_lr_scale(self):
	"""Linear warmup."""
	if self.global_step < self.hparams.warmup_steps:
	return self.global_step / max(1, self.hparams.warmup_steps)
	return 1.0

	@torch.no_grad()
	def _update_ema(self):
	"""Update EMA generator weights."""
	decay = self.hparams.ema_decay
	for p_ema, p in zip(self.generator_ema.parameters(), self.generator.parameters()):
	p_ema.data.mul_(decay).add_(p.data, alpha=1 - decay)

	def on_save_checkpoint(self, checkpoint):
	    """Strip the UNI backbone weights from a checkpoint about to be saved.

	    Every ``state_dict`` entry whose key starts with ``'_uni_model.'`` is
	    deleted in place; the backbone is re-created by ``_load_uni_model``.
	    """
	    weights = checkpoint.get('state_dict', {})
	    # Iterate over a snapshot of the keys so deletion is safe.
	    for name in tuple(weights):
	        if name.startswith('_uni_model.'):
	            del weights[name]

	def on_load_checkpoint(self, checkpoint):
	    """Drop UNI backbone weights from a checkpoint before it is loaded.

	    Older checkpoints may still carry ``'_uni_model.*'`` entries; they
	    are removed in place from ``checkpoint['state_dict']``.
	    """
	    weights = checkpoint.get('state_dict', {})
	    stale = [name for name in weights if name.startswith('_uni_model.')]
	    for name in stale:
	        weights.pop(name)

	def _load_uni_model(self):
	"""Lazily load UNI ViT-L/16 for on-the-fly feature extraction."""
	if self._uni_model is None:
	import timm
	self._uni_model = timm.create_model(
	"hf-hub:MahmoodLab/uni",
	pretrained=True,
	init_values=1e-5,
	dynamic_img_size=True,
	)
	self._uni_model.eval()
	self._uni_model.requires_grad_(False)
	self._uni_model = self._uni_model.to(self.device)
	n_params = sum(p.numel() for p in self._uni_model.parameters())
	print(f"UNI model loaded for on-the-fly extraction: {n_params:,} params")
	return self._uni_model

	@torch.no_grad()
	def _extract_uni_from_sub_crops(self, uni_sub_crops):
	"""Extract UNI features from pre-prepared sub-crops on GPU.

	Args:
	uni_sub_crops: [B, 16, 3, 224, 224] — batch of 4x4 sub-crop grids,
	already normalized with ImageNet stats.

	Returns:
	uni_features: [B, S*S, 1024] where S = uni_spatial_pool_size (default 32)
	"""
	uni_model = self._load_uni_model()
	B = uni_sub_crops.shape[0]
	spatial_size = self.hparams.uni_spatial_pool_size
	num_crops = 4 # 4x4 grid
	patches_per_side = 14 # 224/16

	# Batched UNI forward: [B, 16, 3, 224, 224] -> [B*16, 3, 224, 224]
	all_crops = uni_sub_crops.reshape(B * 16, 3, 224, 224).to(self.device)
	all_feats = uni_model.forward_features(all_crops) # [B*16, 197, 1024]
	patch_tokens = all_feats[:, 1:, :] # [B*16, 196, 1024]

	# Reshape back to per-sample grids: [B, 4, 4, 14, 14, 1024]
	patch_tokens = patch_tokens.reshape(
	B, num_crops, num_crops,
	patches_per_side, patches_per_side, 1024
	)
	# Interleave to spatial grid: [B, 56, 56, 1024]
	full_size = num_crops * patches_per_side # 56
	full_grid = patch_tokens.permute(0, 1, 3, 2, 4, 5)
	full_grid = full_grid.reshape(B, full_size, full_size, 1024)

	# Pool to target spatial size (batched)
	if spatial_size < full_size:
	grid_bchw = full_grid.permute(0, 3, 1, 2) # [B, 1024, 56, 56]
	pooled = F.adaptive_avg_pool2d(grid_bchw, spatial_size) # [B, 1024, S, S]
	result = pooled.permute(0, 2, 3, 1) # [B, S, S, 1024]
	else:
	result = full_grid

	S = result.shape[1]
	return result.reshape(B, S * S, 1024) # [B, S*S, 1024]

	def _apply_cfg_dropout(self, labels, uni_features):
	"""Apply classifier-free guidance dropout during training (vectorized)."""
	B = labels.shape[0]
	device = labels.device

	new_labels = labels.clone()
	new_uni = uni_features.clone()

	r = torch.rand(B, device=device)
	p_both = self.hparams.cfg_drop_both_prob
	p_class = p_both + self.hparams.cfg_drop_class_prob
	p_uni = p_class + self.hparams.cfg_drop_uni_prob

	drop_both = r < p_both
	drop_class = (r >= p_both) & (r < p_class)
	drop_uni = (r >= p_class) & (r < p_uni)

	new_labels[drop_both \| drop_class] = self.null_class
	new_uni[drop_both \| drop_uni] = 0.0

	return new_labels, new_uni

	def compute_dab_intensity_loss(self, generated, target):
	    """L1 distance between per-image top-decile DAB scores.

	    For each image the DAB map is flattened, clamped at its 99th
	    percentile, and the mean of all values at or above the (post-clamp)
	    90th percentile is taken as the score. Computation runs in float32
	    with CUDA autocast disabled.

	    Returns:
	        Scalar tensor: mean absolute difference of the two score vectors.
	    """
	    def top_decile_mean(dab_map):
	        # [B, ...] -> [B, N]
	        rows = dab_map.reshape(dab_map.shape[0], -1)
	        # Clamp at p99 so a handful of extreme pixels cannot dominate.
	        rows = rows.clamp(max=torch.quantile(rows, 0.99, dim=1, keepdim=True))
	        cutoff = torch.quantile(rows, 0.9, dim=1, keepdim=True)
	        keep = rows >= cutoff
	        # Masked mean; the clamp on the count guards against division by 0.
	        return (rows * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1)

	    with torch.amp.autocast('cuda', enabled=False):
	        extract = self.dab_extractor.extract_dab_intensity
	        dab_generated = extract(generated.float(), normalize="none")
	        dab_target = extract(target.float(), normalize="none")
	        return F.l1_loss(top_decile_mean(dab_generated), top_decile_mean(dab_target))

	def compute_dab_contrast_loss(self, generated, labels):
	    """Hinge loss that orders mean DAB scores between label classes.

	    Samples whose label is ``>= null_class`` are ignored. Each remaining
	    image gets the same top-decile DAB score used by the intensity loss
	    (p99 clamp, then mean of values >= p90). For every
	    ``(high, low, margin)`` pair that has both classes present in the
	    batch, ``relu(margin - (mean_high - mean_low))`` is accumulated and
	    the terms are averaged.

	    Returns:
	        Scalar tensor; a zero tensor (``requires_grad=True``) when fewer
	        than two valid samples exist or no class pair is present.
	    """
	    with torch.amp.autocast('cuda', enabled=False):
	        images = generated.float()
	        keep = labels < self.null_class
	        if keep.sum() < 2:
	            return torch.tensor(0.0, device=self.device, requires_grad=True)

	        kept_images = images[keep]
	        kept_labels = labels[keep]

	        dab = self.dab_extractor.extract_dab_intensity(kept_images, normalize="none")

	        rows = dab.reshape(dab.shape[0], -1)
	        rows = rows.clamp(max=torch.quantile(rows, 0.99, dim=1, keepdim=True))
	        cutoff = torch.quantile(rows, 0.9, dim=1, keepdim=True)
	        top = rows >= cutoff
	        scores = (rows * top).sum(dim=1) / top.sum(dim=1).clamp(min=1)

	        # (higher class, lower class, required score gap)
	        ordered_pairs = (
	            (3, 0, 0.20), (3, 1, 0.15),
	            (2, 0, 0.08), (3, 2, 0.10),
	        )

	        hinge_terms = []
	        for high_cls, low_cls, margin in ordered_pairs:
	            in_high = kept_labels == high_cls
	            in_low = kept_labels == low_cls
	            if not (in_high.sum() > 0 and in_low.sum() > 0):
	                continue
	            gap = scores[in_high].mean() - scores[in_low].mean()
	            hinge_terms.append(F.relu(margin - gap))

	        if not hinge_terms:
	            return torch.tensor(0.0, device=self.device, requires_grad=True)
	        return torch.stack(hinge_terms).mean()

	def compute_edge_loss(self, generated, target):
	    """Fourier spectral loss at 256x256 for boundary sharpness.

	    Both images are resized to 256x256, averaged to grayscale and
	    transformed with a 2D FFT. The L1 distance between the
	    ``log1p``-magnitudes is taken over the high-frequency band only.
	    Fourier magnitudes do not change under (circular) translation, which
	    makes the comparison tolerant of small misalignment between the pair.

	    The spectrum is ``fftshift``-ed before masking. ``torch.fft.fft2``
	    places the zero-frequency bin at index ``[0, 0]``, whereas the radial
	    mask below is measured from the array centre; without the shift the
	    mask would keep the low frequencies in the corners and discard the
	    highest ones at the centre.

	    Args:
	        generated: ``[B, C, H, W]`` generated images.
	        target: ``[B, C, H, W]`` reference images.

	    Returns:
	        Scalar tensor with the masked L1 spectral distance.
	    """
	    with torch.amp.autocast('cuda', enabled=False):
	        gen = F.interpolate(generated.float(), size=256, mode='bilinear', align_corners=False)
	        tgt = F.interpolate(target.float(), size=256, mode='bilinear', align_corners=False)

	        # Grayscale
	        gen_gray = gen.mean(dim=1, keepdim=True)
	        tgt_gray = tgt.mean(dim=1, keepdim=True)

	        # 2D FFT with DC moved to the centre, then log-magnitude.
	        gen_fft = torch.fft.fftshift(torch.fft.fft2(gen_gray), dim=(-2, -1))
	        tgt_fft = torch.fft.fftshift(torch.fft.fft2(tgt_gray), dim=(-2, -1))
	        gen_mag = torch.log1p(gen_fft.abs())
	        tgt_mag = torch.log1p(tgt_fft.abs())

	        # High-frequency mask: everything farther than 25% of the maximum
	        # radius from the (now centred) zero-frequency bin.
	        H, W = gen_mag.shape[-2], gen_mag.shape[-1]
	        cy, cx = H // 2, W // 2
	        y = torch.arange(H, device=gen.device).float() - cy
	        x = torch.arange(W, device=gen.device).float() - cx
	        dist = (y[:, None] ** 2 + x[None, :] ** 2).sqrt()
	        max_dist = (cy ** 2 + cx ** 2) ** 0.5
	        hf_mask = (dist > 0.25 * max_dist).float()

	        # L1 on high-frequency magnitudes
	        return F.l1_loss(gen_mag * hf_mask, tgt_mag * hf_mask)

	def compute_dab_sharpness_loss(self, generated, target):
	    """DAB spatial sharpness loss (gradient magnitude + local variance).

	    Two terms are summed, both computed on the DAB intensity maps:

	    1. Gradient magnitude: L1 between the per-image mean Sobel gradient
	       magnitude of generated and target.
	    2. Local variance distribution: each map is cut into non-overlapping
	       16x16 patches (trailing rows/columns that do not fill a patch are
	       dropped), the variance of every patch is computed, the variances
	       are sorted per image, and the sorted vectors are compared with L1.
	       The target side of this term is detached.

	    Args:
	        generated: ``[B, C, H, W]`` generated images.
	        target: ``[B, C, H, W]`` reference images.

	    Returns:
	        Scalar tensor ``grad_loss + var_loss``.
	    """
	    with torch.amp.autocast('cuda', enabled=False):
	        extract = self.dab_extractor.extract_dab_intensity
	        dab_gen = extract(generated.float(), normalize="none")
	        dab_tgt = extract(target.float(), normalize="none")

	        # Ensure [B, 1, H, W]
	        if dab_gen.dim() == 3:
	            dab_gen = dab_gen.unsqueeze(1)
	        if dab_tgt.dim() == 3:
	            dab_tgt = dab_tgt.unsqueeze(1)

	        # --- Component 1: gradient magnitude ---
	        sobel_x = torch.tensor(
	            [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]],
	            dtype=torch.float32, device=dab_gen.device,
	        ).view(1, 1, 3, 3)
	        sobel_y = sobel_x.transpose(-1, -2)

	        def _grad_magnitude(dab):
	            gx = F.conv2d(dab, sobel_x, padding=1)
	            gy = F.conv2d(dab, sobel_y, padding=1)
	            # The epsilon keeps sqrt differentiable where the gradient is 0.
	            return (gx ** 2 + gy ** 2 + 1e-8).sqrt()

	        grad_loss = F.l1_loss(
	            _grad_magnitude(dab_gen).mean(dim=[1, 2, 3]),
	            _grad_magnitude(dab_tgt).mean(dim=[1, 2, 3]),
	        )

	        # --- Component 2: sorted per-patch variance ---
	        ps = 16  # patch size

	        def _sorted_patch_variance(dab):
	            b, _, h, w = dab.shape
	            n_h, n_w = h // ps, w // ps
	            tiles = dab[:, 0, :n_h * ps, :n_w * ps].reshape(b, n_h, ps, n_w, ps)
	            tiles = tiles.permute(0, 1, 3, 2, 4).reshape(b, n_h * n_w, ps * ps)
	            return tiles.var(dim=2).sort(dim=1).values

	        # All images yield the same number of patches, so one batched L1
	        # equals the mean of the per-image L1 values.
	        var_loss = F.l1_loss(
	            _sorted_patch_variance(dab_gen),
	            _sorted_patch_variance(dab_tgt).detach(),
	        )

	        return grad_loss + var_loss

	def compute_he_edge_loss(self, generated, he_input):
	    """H&E edge structure preservation loss.

	    Both images are mapped with ``(x + 1) / 2``, averaged to grayscale,
	    and their Sobel edge magnitudes are compared with L1 at full
	    resolution and at half resolution. The H&E edge map is detached. The
	    result is the mean over the scales.

	    Args:
	        generated: ``[B, C, H, W]`` generated images.
	        he_input: ``[B, C, H, W]`` H&E images of the same size.

	    Returns:
	        Scalar tensor with the multi-scale edge distance.
	    """
	    with torch.amp.autocast('cuda', enabled=False):
	        gen_gray = ((generated.float() + 1) / 2).mean(dim=1, keepdim=True)
	        he_gray = ((he_input.float() + 1) / 2).mean(dim=1, keepdim=True)

	        sobel_x = torch.tensor(
	            [[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]],
	            dtype=torch.float32, device=gen_gray.device,
	        ).view(1, 1, 3, 3)
	        sobel_y = sobel_x.transpose(-1, -2)

	        def _edge_magnitude(gray):
	            gx = F.conv2d(gray, sobel_x, padding=1)
	            gy = F.conv2d(gray, sobel_y, padding=1)
	            # The epsilon keeps sqrt differentiable on flat regions.
	            return (gx ** 2 + gy ** 2 + 1e-8).sqrt()

	        full_size = gen_gray.shape[-1]
	        scales = [full_size, full_size // 2]

	        loss = 0.0
	        for size in scales:
	            if size < full_size:
	                g = F.interpolate(gen_gray, size=size, mode='bilinear', align_corners=False)
	                h = F.interpolate(he_gray, size=size, mode='bilinear', align_corners=False)
	            else:
	                g, h = gen_gray, he_gray
	            loss = loss + F.l1_loss(_edge_magnitude(g), _edge_magnitude(h).detach())

	        # Average over the scales actually evaluated.
	        return loss / len(scales)

	def compute_background_loss(self, generated, he_input):
	    """Penalise non-white output where the H&E input is bright.

	    A soft background weight is derived from the H&E brightness
	    (``(he + 1) / 2`` averaged over channels) via
	    ``sigmoid((brightness - bg_threshold) * 20)``. The loss is the
	    weight-normalised mean absolute difference between the generated
	    image and an all-ones image.

	    Returns:
	        Scalar tensor; a zero tensor (``requires_grad=True``) if the
	        total weight is not positive.
	    """
	    with torch.amp.autocast('cuda', enabled=False):
	        out = generated.float()
	        brightness = ((he_input.float() + 1) / 2).mean(dim=1, keepdim=True)

	        # Soft mask in (0, 1); the factor 20 sets the transition steepness.
	        bg_weight = torch.sigmoid((brightness - self.hparams.bg_threshold) * 20.0)

	        distance_to_white = (out - torch.ones_like(out)).abs()
	        penalty = distance_to_white * bg_weight

	        # The single-channel weight is applied to 3 colour channels.
	        normaliser = bg_weight.sum() * 3
	        if normaliser > 0:
	            return penalty.sum() / normaliser
	        return torch.tensor(0.0, device=out.device, requires_grad=True)

	def compute_gram_style_loss(self, generated, target):
	    """Gram-matrix style loss on VGG features at 256x256.

	    Both images are resized to 256x256 and passed through
	    ``self.vgg_extractor``. For every returned feature level the L1
	    distance between the Gram matrices is taken (target side detached),
	    and the distances are averaged over the levels.

	    Returns:
	        Scalar tensor with the mean per-level Gram distance.
	    """
	    with torch.amp.autocast('cuda', enabled=False):
	        def _to_256(image):
	            return F.interpolate(image.float(), size=256, mode='bilinear', align_corners=False)

	        small_gen = _to_256(generated)
	        small_tgt = _to_256(target)

	        feats_gen = self.vgg_extractor(small_gen)
	        feats_tgt = self.vgg_extractor(small_tgt)

	        per_level = [
	            F.l1_loss(gram_matrix(f_gen), gram_matrix(f_tgt).detach())
	            for f_gen, f_tgt in zip(feats_gen, feats_tgt)
	        ]
	        return sum(per_level, 0.0) / len(feats_gen)

	def training_step(self, batch, batch_idx):
	    """Run one manual-optimisation iteration: generator, EMA, discriminators.

	    Args:
	        batch: ``(he, her2, uni_or_crops, labels, fnames)``. The third item
	            holds UNI features, or sub-crops when on-the-fly extraction is
	            enabled.
	        batch_idx: unused.

	    Sequence:
	        1. Obtain UNI features, apply CFG dropout and ablation switches.
	        2. Generator forward; sum every enabled reconstruction, DAB, edge,
	           style, background and PatchNCE term; add the adversarial terms
	           once the step reaches ``adversarial_start_step``.
	        3. Generator backward, gradient clipping, optimizer step, EMA update.
	        4. Regenerate the fake under ``no_grad`` with the updated generator
	           and update all discriminators, with an R1 penalty every
	           ``r1_every`` steps.

	    The step counter is read once at the top. NOTE(review): depending on
	    the Lightning version, ``global_step`` may advance on every optimizer
	    ``step()`` under manual optimisation; re-reading it after
	    ``opt_g.step()`` would then make the discriminator gate and the R1
	    modulo test see a different value than the generator gate. Confirm
	    against the installed version.
	    """
	    he, her2, uni_or_crops, labels, fnames = batch
	    opt_g, opt_d = self.optimizers()
	    hp = self.hparams

	    # Single snapshot so both halves of the iteration agree.
	    step = self.global_step

	    def _random_crop_origin(height, width, size):
	        # torch.randint's upper bound is exclusive; +1 makes the last valid
	        # offset reachable and allows size == height / width.
	        top = torch.randint(0, height - size + 1, (1,)).item()
	        left = torch.randint(0, width - size + 1, (1,)).item()
	        return top, left

	    # On-the-fly UNI extraction: dataset returns sub-crops instead of features
	    if self._uni_extract_on_the_fly:
	        uni = self._extract_uni_from_sub_crops(uni_or_crops)
	    else:
	        uni = uni_or_crops

	    # Apply CFG dropout
	    labels_dropped, uni_dropped = self._apply_cfg_dropout(labels, uni)

	    # Ablation: zero out UNI features
	    if hp.disable_uni:
	        uni_dropped = torch.zeros_like(uni_dropped)

	    # Ablation: force all labels to null class
	    if hp.disable_class:
	        labels_dropped = torch.full_like(labels_dropped, hp.null_class)

	    # ----------------------------------------------------------------
	    # Generator step
	    # ----------------------------------------------------------------
	    generated = self.generator(he, uni_dropped, labels_dropped)

	    # LPIPS main: 4x downsample
	    lpips_main_size = hp.image_size // 4
	    gen_lpips = F.interpolate(generated, size=lpips_main_size, mode='bilinear', align_corners=False)
	    her2_lpips = F.interpolate(her2, size=lpips_main_size, mode='bilinear', align_corners=False)
	    loss_lpips = self.lpips_fn(gen_lpips, her2_lpips).mean()

	    loss_g = hp.lpips_weight * loss_lpips

	    # LPIPS fine: 2x downsample
	    if hp.lpips_256_weight > 0:
	        lpips_fine_size = hp.image_size // 2
	        gen_fine = F.interpolate(generated, size=lpips_fine_size, mode='bilinear', align_corners=False)
	        her2_fine = F.interpolate(her2, size=lpips_fine_size, mode='bilinear', align_corners=False)
	        loss_lpips_256 = self.lpips_fn(gen_fine, her2_fine).mean()
	        loss_g = loss_g + hp.lpips_256_weight * loss_lpips_256
	        self.log('train/lpips_fine', loss_lpips_256, prog_bar=False)

	    # LPIPS at full resolution (expensive)
	    if hp.lpips_512_weight > 0:
	        loss_lpips_512 = self.lpips_fn(generated, her2).mean()
	        loss_g = loss_g + hp.lpips_512_weight * loss_lpips_512
	        self.log('train/lpips_fullres', loss_lpips_512, prog_bar=False)

	    # Low-resolution L1 at 64x64
	    if hp.l1_lowres_weight > 0:
	        gen_64 = F.interpolate(generated, size=64, mode='bilinear', align_corners=False)
	        her2_64 = F.interpolate(her2, size=64, mode='bilinear', align_corners=False)
	        loss_l1_lowres = F.l1_loss(gen_64, her2_64)
	        loss_g = loss_g + hp.l1_lowres_weight * loss_l1_lowres
	        self.log('train/l1_lowres', loss_l1_lowres, prog_bar=False)

	    # DAB losses (use original labels, not dropped)
	    if hp.dab_intensity_weight > 0:
	        loss_dab = self.compute_dab_intensity_loss(generated, her2)
	        loss_g = loss_g + hp.dab_intensity_weight * loss_dab
	        self.log('train/dab_intensity', loss_dab, prog_bar=False)

	    if hp.dab_contrast_weight > 0:
	        loss_dab_contrast = self.compute_dab_contrast_loss(generated, labels)
	        loss_g = loss_g + hp.dab_contrast_weight * loss_dab_contrast
	        self.log('train/dab_contrast', loss_dab_contrast, prog_bar=False)

	    # Edge loss (boundary sharpness)
	    if hp.edge_weight > 0:
	        loss_edge = self.compute_edge_loss(generated, her2)
	        loss_g = loss_g + hp.edge_weight * loss_edge
	        self.log('train/edge_loss', loss_edge, prog_bar=False)

	    # DAB sharpness loss
	    if hp.dab_sharpness_weight > 0:
	        loss_dab_sharp = self.compute_dab_sharpness_loss(generated, her2)
	        loss_g = loss_g + hp.dab_sharpness_weight * loss_dab_sharp
	        self.log('train/dab_sharpness', loss_dab_sharp, prog_bar=False)

	    # Gram-matrix style loss
	    if hp.gram_style_weight > 0 and self.vgg_extractor is not None:
	        loss_gram = self.compute_gram_style_loss(generated, her2)
	        loss_g = loss_g + hp.gram_style_weight * loss_gram
	        self.log('train/gram_style', loss_gram, prog_bar=False)

	    # H&E edge structure preservation
	    if hp.he_edge_weight > 0:
	        loss_he_edge = self.compute_he_edge_loss(generated, he)
	        loss_g = loss_g + hp.he_edge_weight * loss_he_edge
	        self.log('train/he_edge', loss_he_edge, prog_bar=False)

	    # Background white loss
	    if hp.bg_white_weight > 0:
	        loss_bg = self.compute_background_loss(generated, he)
	        loss_g = loss_g + hp.bg_white_weight * loss_bg
	        self.log('train/bg_white', loss_bg, prog_bar=False)

	    # PatchNCE loss (contrastive: H&E input vs generated)
	    if hp.patchnce_weight > 0 and self.patchnce_loss is not None:
	        feats_he = self.generator.encode(he)
	        feats_gen = self.generator.encode(generated)
	        loss_nce = self.patchnce_loss(feats_he, feats_gen)
	        loss_g = loss_g + hp.patchnce_weight * loss_nce
	        self.log('train/patchnce', loss_nce, prog_bar=False)

	    # Adversarial losses (only once the start step is reached)
	    loss_adv = torch.tensor(0.0, device=self.device)
	    loss_feat_match = torch.tensor(0.0, device=self.device)
	    loss_crop_adv = torch.tensor(0.0, device=self.device)
	    loss_uncond_adv = torch.tensor(0.0, device=self.device)
	    any_adv = (hp.adversarial_weight > 0 or
	               hp.uncond_disc_weight > 0 or
	               hp.crop_disc_weight > 0 or
	               hp.feat_match_weight > 0)
	    adv_active = step >= hp.adversarial_start_step and any_adv
	    img_sz = hp.image_size
	    # Discriminators see 512x512: downsample for 1024 input, identity otherwise
	    if img_sz == 1024:
	        he_for_disc = F.interpolate(he, size=512, mode='bilinear', align_corners=False)
	        her2_for_disc = F.interpolate(her2, size=512, mode='bilinear', align_corners=False)
	    else:
	        he_for_disc = he
	        her2_for_disc = her2
	    if adv_active:
	        if img_sz == 1024:
	            gen_for_disc = F.interpolate(generated, size=512, mode='bilinear', align_corners=False)
	        else:
	            gen_for_disc = generated

	        # Conditional discriminator (paired: generated+HE vs real+HE)
	        if hp.adversarial_weight > 0:
	            fake_input = torch.cat([gen_for_disc, he_for_disc], dim=1)
	            disc_outputs = self.discriminator(fake_input)
	            loss_adv = sum(hinge_loss_g(out) for out in disc_outputs) / len(disc_outputs)
	            loss_g = loss_g + hp.adversarial_weight * loss_adv

	        # Feature matching from unconditional disc
	        if hp.feat_match_weight > 0 and self.uncond_discriminator is not None:
	            _, fake_feats = self.uncond_discriminator(gen_for_disc, return_features=True)
	            with torch.no_grad():
	                _, real_feats = self.uncond_discriminator(her2_for_disc, return_features=True)
	            loss_feat_match = feature_matching_loss(fake_feats, real_feats)
	            loss_g = loss_g + hp.feat_match_weight * loss_feat_match

	        # Crop discriminator: one random crop at full resolution
	        if self.crop_discriminator is not None and hp.crop_disc_weight > 0:
	            fake_input_crop = torch.cat([generated, he], dim=1)
	            cs = hp.crop_size
	            top, left = _random_crop_origin(
	                fake_input_crop.shape[-2], fake_input_crop.shape[-1], cs)
	            fake_crop = fake_input_crop[:, :, top:top+cs, left:left+cs]
	            loss_crop_adv = hinge_loss_g(self.crop_discriminator(fake_crop))
	            loss_g = loss_g + hp.crop_disc_weight * loss_crop_adv

	        # Unconditional discriminator: IHC-only adversarial
	        if self.uncond_discriminator is not None and hp.uncond_disc_weight > 0:
	            loss_uncond_adv = hinge_loss_g(self.uncond_discriminator(gen_for_disc))
	            loss_g = loss_g + hp.uncond_disc_weight * loss_uncond_adv

	    # Generator backward + step (LR warmup applied by hand)
	    lr_scale = self._get_lr_scale()
	    for pg in opt_g.param_groups:
	        pg['lr'] = hp.gen_lr * lr_scale

	    opt_g.zero_grad()
	    self.manual_backward(loss_g)
	    torch.nn.utils.clip_grad_norm_(self.generator.parameters(), 1.0)
	    opt_g.step()

	    # Update EMA
	    self._update_ema()

	    # ----------------------------------------------------------------
	    # Discriminator step
	    # ----------------------------------------------------------------
	    loss_d = torch.tensor(0.0, device=self.device)
	    loss_crop_d = torch.tensor(0.0, device=self.device)
	    loss_uncond_d = torch.tensor(0.0, device=self.device)
	    if adv_active:
	        # Fresh fake from the just-updated generator, without a graph.
	        with torch.no_grad():
	            fake_detached = self.generator(he, uni_dropped, labels_dropped)

	        # For 1024, downsample for disc
	        if img_sz == 1024:
	            fake_det_disc = F.interpolate(fake_detached, size=512, mode='bilinear', align_corners=False)
	        else:
	            fake_det_disc = fake_detached

	        # Conditional discriminator
	        if hp.adversarial_weight > 0:
	            real_input = torch.cat([her2_for_disc, he_for_disc], dim=1)
	            fake_input = torch.cat([fake_det_disc, he_for_disc], dim=1)

	            disc_real = self.discriminator(real_input)
	            disc_fake = self.discriminator(fake_input)

	            loss_d = sum(
	                hinge_loss_d(dr, df)
	                for dr, df in zip(disc_real, disc_fake)
	            ) / len(disc_real)

	        # Crop discriminator (same window for real and fake)
	        if self.crop_discriminator is not None and hp.crop_disc_weight > 0:
	            real_input_c = torch.cat([her2, he], dim=1)
	            fake_input_c = torch.cat([fake_detached, he], dim=1)
	            cs = hp.crop_size
	            top, left = _random_crop_origin(
	                real_input_c.shape[-2], real_input_c.shape[-1], cs)
	            real_crop = real_input_c[:, :, top:top+cs, left:left+cs]
	            fake_crop = fake_input_c[:, :, top:top+cs, left:left+cs]
	            loss_crop_d = hinge_loss_d(
	                self.crop_discriminator(real_crop),
	                self.crop_discriminator(fake_crop),
	            )
	            loss_d = loss_d + hp.crop_disc_weight * loss_crop_d

	        uses_uncond = self.uncond_discriminator is not None and (
	            hp.uncond_disc_weight > 0 or hp.feat_match_weight > 0)

	        # Unconditional discriminator
	        if uses_uncond:
	            uncond_real_out = self.uncond_discriminator(her2_for_disc)
	            uncond_fake_out = self.uncond_discriminator(fake_det_disc)
	            loss_uncond_d = hinge_loss_d(uncond_real_out, uncond_fake_out)
	            # Weight is floored at 1.0, so the disc still trains when it is
	            # enabled for feature matching only (uncond_disc_weight == 0).
	            loss_d = loss_d + max(hp.uncond_disc_weight, 1.0) * loss_uncond_d

	        # R1 gradient penalty on real inputs, every r1_every steps
	        loss_r1 = torch.tensor(0.0, device=self.device)
	        if step % hp.r1_every == 0:
	            with torch.amp.autocast('cuda', enabled=False):
	                if hp.adversarial_weight > 0:
	                    real_input_r1 = torch.cat(
	                        [her2_for_disc, he_for_disc], dim=1
	                    ).float().detach().requires_grad_(True)
	                    # Only the 512-scale branch of the global disc is penalised.
	                    for disc in [self.discriminator.disc_512]:
	                        d_real = disc(real_input_r1)
	                        grad_real = torch.autograd.grad(
	                            outputs=d_real.sum(), inputs=real_input_r1,
	                            create_graph=True,
	                        )[0]
	                        loss_r1 = loss_r1 + hp.r1_weight * grad_real.pow(2).mean()
	                if uses_uncond:
	                    her2_r1 = her2_for_disc.float().detach().requires_grad_(True)
	                    d_real_uncond = self.uncond_discriminator(her2_r1)
	                    grad_uncond = torch.autograd.grad(
	                        outputs=d_real_uncond.sum(), inputs=her2_r1,
	                        create_graph=True,
	                    )[0]
	                    loss_r1 = loss_r1 + hp.r1_weight * grad_uncond.pow(2).mean()
	            loss_d = loss_d + loss_r1
	            self.log('train/r1_penalty', loss_r1, prog_bar=False)

	        opt_d.zero_grad()
	        self.manual_backward(loss_d)
	        if hp.adversarial_weight > 0:
	            torch.nn.utils.clip_grad_norm_(self.discriminator.parameters(), 1.0)
	        if self.crop_discriminator is not None:
	            torch.nn.utils.clip_grad_norm_(self.crop_discriminator.parameters(), 1.0)
	        if self.uncond_discriminator is not None:
	            torch.nn.utils.clip_grad_norm_(self.uncond_discriminator.parameters(), 1.0)
	        opt_d.step()

	    # Logging
	    self.log('train/loss_g', loss_g, prog_bar=True)
	    self.log('train/loss_d', loss_d, prog_bar=True)
	    self.log('train/lpips', loss_lpips, prog_bar=True)
	    self.log('train/adversarial', loss_adv, prog_bar=False)
	    self.log('train/lr_scale', lr_scale, prog_bar=False)
	    if self.crop_discriminator is not None:
	        self.log('train/crop_adv_g', loss_crop_adv, prog_bar=False)
	        self.log('train/crop_adv_d', loss_crop_d, prog_bar=False)
	    if self.uncond_discriminator is not None:
	        self.log('train/uncond_adv_g', loss_uncond_adv, prog_bar=False)
	        self.log('train/uncond_adv_d', loss_uncond_d, prog_bar=False)
	    if hp.feat_match_weight > 0:
	        self.log('train/feat_match', loss_feat_match, prog_bar=False)

	def on_validation_epoch_start(self):
	    """Reset per-epoch validation state.

	    Picks the batch index used for the second ("random") sample grid and
	    clears the per-label sample buckets. The index is drawn from
	    ``[1, max(2, n))`` where ``n = len(trainer.val_dataloaders)``.
	    NOTE(review): whether that length is a batch count or a count of
	    dataloaders depends on what the trainer exposes -- confirm.
	    """
	    count = max(1, len(self.trainer.val_dataloaders))
	    upper_bound = max(2, count)
	    self._random_val_batch_idx = torch.randint(1, upper_bound, (1,)).item()
	    # label -> {'he': [...], 'real': [...], 'gen': [...]}
	    self._val_per_label_samples = {}

	def _log_sample_grid(self, he, her2_01, gen_01, key):
	"""Log H&E \| Real \| Gen grid to wandb."""
	n = min(4, len(he))
	he_01 = ((he[:n].cpu() + 1) / 2).clamp(0, 1)
	grid_images = []
	for i in range(n):
	grid_images.extend([
	he_01[i],
	her2_01[i].cpu(),
	gen_01[i].cpu(),
	])
	grid = torchvision.utils.make_grid(grid_images, nrow=3, padding=2)
	if self.logger:
	self.logger.experiment.log({
	key: [wandb.Image(grid, caption='H&E \| Real \| Gen')],
	'global_step': self.global_step,
	})

	def validation_step(self, batch, batch_idx):
	    """Evaluate the EMA generator on one validation batch.

	    Logs ``val/lpips`` (at ``image_size // 4``), ``val/ssim`` (images
	    mapped to ``[0, 1]``) and ``val/dab_mae`` (mean absolute difference
	    of per-image top-10% DAB means). Also stores up to four samples per
	    non-null label for the epoch-end grids, and logs a sample grid for
	    batch 0 and for the batch index chosen at epoch start.
	    """
	    he, her2, uni_or_crops, labels, fnames = batch

	    # UNI features: extracted here or supplied by the dataset.
	    uni = (
	        self._extract_uni_from_sub_crops(uni_or_crops)
	        if self._uni_extract_on_the_fly
	        else uni_or_crops
	    )

	    # Ablation switches mirror the training step.
	    if self.hparams.disable_uni:
	        uni = torch.zeros_like(uni)
	    if self.hparams.disable_class:
	        labels = torch.full_like(labels, self.hparams.null_class)

	    # Use EMA generator
	    with torch.no_grad():
	        generated = self.generator_ema(he, uni, labels)

	    # LPIPS at 4x downsample
	    side = self.hparams.image_size // 4
	    small_gen = F.interpolate(generated, size=side, mode='bilinear', align_corners=False)
	    small_real = F.interpolate(her2, size=side, mode='bilinear', align_corners=False)
	    lpips_val = self.lpips_fn(small_gen, small_real).mean()

	    # SSIM on [0, 1] images
	    gen_01 = ((generated + 1) / 2).clamp(0, 1)
	    her2_01 = ((her2 + 1) / 2).clamp(0, 1)
	    from torchmetrics.functional.image import structural_similarity_index_measure
	    ssim_val = structural_similarity_index_measure(gen_01, her2_01, data_range=1.0)

	    # DAB MAE: the extractor is fed CPU float tensors here.
	    dab_gen = self.dab_extractor.extract_dab_intensity(generated.float().cpu(), normalize="none")
	    dab_real = self.dab_extractor.extract_dab_intensity(her2.float().cpu(), normalize="none")

	    def p90_score(dab):
	        # Mean of the values at or above the 90th percentile of one image.
	        values = dab.flatten()
	        selected = values >= torch.quantile(values, 0.9)
	        if selected.sum() > 0:
	            return values[selected].mean().item()
	        return values.mean().item()

	    dab_mae = sum(
	        abs(p90_score(g) - p90_score(r))
	        for g, r in zip(dab_gen, dab_real)
	    ) / len(dab_gen)

	    self.log('val/lpips', lpips_val, prog_bar=True, sync_dist=True)
	    self.log('val/ssim', ssim_val, prog_bar=True, sync_dist=True)
	    self.log('val/dab_mae', dab_mae, prog_bar=True, sync_dist=True)

	    # Collect per-label samples for the epoch-end grids (max 4 per label)
	    if hasattr(self, '_val_per_label_samples'):
	        for i, label_tensor in enumerate(labels):
	            lbl = label_tensor.item()
	            if lbl == self.hparams.null_class:
	                continue
	            bucket = self._val_per_label_samples.setdefault(
	                lbl, {'he': [], 'real': [], 'gen': []})
	            if len(bucket['he']) >= 4:
	                continue
	            bucket['he'].append(he[i].cpu())
	            bucket['real'].append(her2_01[i].cpu())
	            bucket['gen'].append(gen_01[i].cpu())

	    # Sample grids: first batch (fixed) + one random batch
	    if batch_idx == 0:
	        self._log_sample_grid(he, her2_01, gen_01, 'val/samples_fixed')
	    elif batch_idx == self._random_val_batch_idx:
	        self._log_sample_grid(he, her2_01, gen_01, 'val/samples_random')

	def on_validation_epoch_end(self):
	    """Log one sample grid per label, then clear the collected samples.

	    Does nothing (and leaves the buckets untouched) unless samples for
	    more than one label were collected. Grids are logged in ascending
	    label order under ``val/samples_<name>``, where ``<name>`` comes from
	    ``hparams.label_names`` when it covers the label and is the label's
	    string form otherwise.
	    """
	    has_buckets = hasattr(self, '_val_per_label_samples')
	    if not has_buckets or len(self._val_per_label_samples) <= 1:
	        return

	    label_names = getattr(self.hparams, 'label_names', None)

	    for lbl in sorted(self._val_per_label_samples):
	        bucket = self._val_per_label_samples[lbl]
	        if not bucket['he'] or not self.logger:
	            continue

	        if label_names and lbl < len(label_names):
	            name = label_names[lbl]
	        else:
	            name = str(lbl)

	        he_stack = torch.stack(bucket['he'])
	        real_stack = torch.stack(bucket['real'])
	        gen_stack = torch.stack(bucket['gen'])
	        self._log_sample_grid(he_stack, real_stack, gen_stack, f'val/samples_{name}')

	    self._val_per_label_samples = {}

	@torch.no_grad()
	def generate(self, he_images, uni_features, labels,
	             num_inference_steps=None, guidance_scale=1.0, seed=None):
	    """Generate images from H&E input, optionally with class guidance.

	    Args:
	        he_images: H&E image batch passed to the generator.
	        uni_features: UNI feature batch passed to the generator.
	        labels: ``[B]`` class/stain labels.
	        num_inference_steps: accepted but not used.
	        guidance_scale: values ``<= 1.0`` return the plain conditional
	            output. Larger values return
	            ``uncond + scale * (cond - uncond)`` clamped to ``[-1, 1]``,
	            where ``uncond`` uses ``null_class`` labels with the same
	            UNI features.
	        seed: if given, passed to ``torch.manual_seed``.

	    Returns:
	        Generated image batch. The EMA generator is used when the
	        attribute exists, the live generator otherwise.
	    """
	    if seed is not None:
	        torch.manual_seed(seed)

	    net = self.generator if not hasattr(self, 'generator_ema') else self.generator_ema

	    conditional = net(he_images, uni_features, labels)
	    if guidance_scale <= 1.0:
	        return conditional

	    # Classifier-free guidance: extrapolate away from the null-label output.
	    null_labels = torch.full_like(labels, self.null_class)
	    unconditional = net(he_images, uni_features, null_labels)
	    guided = unconditional + guidance_scale * (conditional - unconditional)
	    return guided.clamp(-1, 1)