| | |
| | |
| |
|
class CapacityHead(nn.Module):
    """Evidence-capacity gate.

    Measures scalar "evidence" in the input against a learned positive
    capacity, then splits a feature projection into a retained part
    (scaled by how full the capacity is) and an overflow part (scaled by
    how far evidence exceeds the capacity).
    """

    def __init__(self, in_dim, feat_dim, init_capacity=1.0):
        super().__init__()
        # Capacity is stored pre-softplus so it remains strictly positive
        # during training; initialize with the softplus inverse so that
        # self.capacity == init_capacity at step 0. Requires init_capacity > 0.
        inv_softplus = math.log(math.exp(init_capacity) - 1)
        self._raw_capacity = nn.Parameter(torch.tensor(inv_softplus))

        # NOTE: submodule creation order matters for RNG-reproducible init.
        self.evidence_net = nn.Sequential(
            nn.Linear(in_dim, feat_dim), nn.GELU(), nn.Linear(feat_dim, 1))
        self.feature_net = nn.Sequential(
            nn.Linear(in_dim, feat_dim), nn.GELU(), nn.Linear(feat_dim, feat_dim))
        self.retain_gate = nn.Sequential(
            nn.Linear(feat_dim + 1, feat_dim), nn.Sigmoid())
        self.overflow_gate = nn.Sequential(
            nn.Linear(feat_dim + 1, feat_dim), nn.Sigmoid())

    @property
    def capacity(self):
        # Softplus keeps the effective capacity strictly positive.
        return F.softplus(self._raw_capacity)

    def forward(self, x):
        """Returns (fill, overflow, retained, capacity, raw_evidence)."""
        cap = self.capacity
        evidence = F.relu(self.evidence_net(x))
        denom = cap + 1e-8
        # fill in [0, 1]: fraction of capacity consumed.
        fill = (evidence / denom).clamp(max=1.0)
        # sat >= 0: relative excess above capacity (unbounded above).
        sat = ((evidence - cap) / denom).clamp(min=0.0)
        feat = self.feature_net(x)
        retained = self.retain_gate(torch.cat([feat, fill], dim=-1)) * feat * fill
        overflow = (self.overflow_gate(torch.cat([feat, sat], dim=-1))
                    * feat * sat.clamp(max=1.0))
        return fill, overflow, retained, cap, evidence
| |
|
| |
|
| | |
| |
|
class DifferentiationGate(nn.Module):
    """
    Curvature direction analysis via occupancy field differentiation.

    Computes gradient and Laplacian of the 3D occupancy field to determine:
    - Curvature direction: convex (normals point outward) vs concave (inward)
    - Curvature alternation: where sign flips (saddle points, torus inner/outer)
    - Perturbation robustness: smoothed gradient features survive noise

    The key insight: a hemisphere and bowl occupy nearly identical voxels,
    but their occupancy gradients point in opposite directions relative
    to the center of mass. The Laplacian's sign distinguishes them.

    Outputs gate signals that modulate curvature features:
    - direction_gate: learned weighting based on gradient analysis
    - alternation_score: how much curvature sign varies spatially
    - directional_features: rich features encoding curvature orientation
    """

    def __init__(self, embed_dim=64):
        super().__init__()

        # Fixed (non-learned) finite-difference stencils evaluated by one
        # conv3d call. Channels 0-2: central differences along the three
        # spatial axes; channel 3: 6-neighbour discrete Laplacian.
        diff_kernels = torch.zeros(4, 1, 3, 3, 3)
        # d/dx (first spatial axis)
        diff_kernels[0, 0, 0, 1, 1] = -1; diff_kernels[0, 0, 2, 1, 1] = 1
        # d/dy
        diff_kernels[1, 0, 1, 0, 1] = -1; diff_kernels[1, 0, 1, 2, 1] = 1
        # d/dz
        diff_kernels[2, 0, 1, 1, 0] = -1; diff_kernels[2, 0, 1, 1, 2] = 1
        # Laplacian: -6 at center, +1 at each of the six face neighbours.
        diff_kernels[3, 0, 1, 1, 1] = -6
        diff_kernels[3, 0, 0, 1, 1] = 1; diff_kernels[3, 0, 2, 1, 1] = 1
        diff_kernels[3, 0, 1, 0, 1] = 1; diff_kernels[3, 0, 1, 2, 1] = 1
        diff_kernels[3, 0, 1, 1, 0] = 1; diff_kernels[3, 0, 1, 1, 2] = 1
        self.register_buffer("diff_kernels", diff_kernels)

        # Integer voxel coordinates of the GS^3 grid. GS is a module-level
        # constant; the forward docstring suggests GS == 5 — TODO confirm.
        coords = torch.stack(torch.meshgrid(
            torch.arange(GS, dtype=torch.float32),
            torch.arange(GS, dtype=torch.float32),
            torch.arange(GS, dtype=torch.float32),
            indexing="ij"), dim=-1)
        self.register_buffer("coords", coords)

        # Hand-crafted scalar features assembled in forward():
        # direction histogram (3) + Laplacian-sign histogram (3) +
        # alternation (1) + gradient asymmetry (3) + radial gradient (5).
        raw_feat_dim = 3 + 3 + 1 + 3 + 5

        # Learned conv summary of the Laplacian field (1 channel in).
        self.lap_conv = nn.Sequential(
            nn.Conv3d(1, 16, 3, padding=1), nn.GELU(),
            nn.Conv3d(16, 16, 3, padding=1), nn.GELU(),
            nn.AdaptiveAvgPool3d(2))
        lap_conv_dim = 16 * 8  # 16 channels x 2x2x2 pooled cells

        # Learned conv summary of the 3-channel gradient field.
        self.grad_conv = nn.Sequential(
            nn.Conv3d(3, 16, 3, padding=1), nn.GELU(),
            nn.Conv3d(16, 16, 3, padding=1), nn.GELU(),
            nn.AdaptiveAvgPool3d(2))
        grad_conv_dim = 16 * 8

        total_feat_dim = raw_feat_dim + lap_conv_dim + grad_conv_dim

        # Sigmoid gate applied multiplicatively to curvature features.
        self.direction_net = nn.Sequential(
            SwiGLU(total_feat_dim, embed_dim),
            nn.Linear(embed_dim, embed_dim), nn.Sigmoid())

        # Additive directional feature branch.
        self.direction_feat_net = nn.Sequential(
            SwiGLU(total_feat_dim, embed_dim),
            nn.Linear(embed_dim, embed_dim))

    def forward(self, grid):
        """
        grid: (B, 5, 5, 5) binary occupancy

        Returns:
            direction_gate: (B, embed_dim) sigmoid gate for curvature features
            direction_feat: (B, embed_dim) additive directional features
            alternation_score: (B, 1) how much curvature alternates
        """
        B = grid.shape[0]
        device = grid.device
        vox = grid.unsqueeze(1)  # (B, 1, GS, GS, GS) for conv/pool ops

        # 3x3x3 box smoothing with replicate padding so the finite
        # differences below are robust to single-voxel perturbations.
        vox_smooth = F.avg_pool3d(
            F.pad(vox, (1,1,1,1,1,1), mode='replicate'),
            kernel_size=3, stride=1, padding=0)

        # Single conv evaluates all four stencils: gradient (ch 0-2) and
        # Laplacian (ch 3).
        diff = F.conv3d(vox_smooth, self.diff_kernels, padding=1)
        grad_field = diff[:, :3]
        gx, gy, gz = diff[:, 0:1], diff[:, 1:2], diff[:, 2:3]
        lap = diff[:, 3:4]

        # Per-sample occupancy centroid (clamp avoids div-by-zero on empty grids).
        flat_grid = grid.reshape(B, -1)
        flat_coords = self.coords.reshape(-1, 3)
        total_occ = flat_grid.sum(dim=-1, keepdim=True).clamp(min=1)
        centroids = (flat_grid.unsqueeze(-1) * flat_coords.unsqueeze(0)).sum(dim=1) / total_occ

        # Project each voxel's gradient onto the unit vector away from the
        # centroid: positive dot product = outward-pointing (convex-like).
        grad_flat = grad_field.reshape(B, 3, -1).permute(0, 2, 1)
        diff_from_center = flat_coords.unsqueeze(0) - centroids.unsqueeze(1)
        diff_norm = diff_from_center / (diff_from_center.norm(dim=-1, keepdim=True) + 1e-8)
        dot_products = (grad_flat * diff_norm).sum(dim=-1)
        grad_mag = grad_flat.norm(dim=-1)
        # Only occupied voxels with a non-negligible gradient participate.
        active = (flat_grid > 0.5) & (grad_mag > 0.01)

        # 3-bin histogram of outward / inward / neutral gradient directions
        # over the active voxels (thresholds +-0.1 define "neutral").
        n_active = active.float().sum(-1).clamp(min=1)
        frac_outward = ((dot_products > 0.1) & active).float().sum(-1) / n_active
        frac_inward = ((dot_products < -0.1) & active).float().sum(-1) / n_active
        frac_neutral = 1.0 - frac_outward - frac_inward
        direction_hist = torch.stack([frac_outward, frac_inward, frac_neutral], dim=-1)

        # 3-bin histogram of Laplacian signs over occupied voxels.
        lap_flat = lap.reshape(B, -1)
        lap_active = flat_grid > 0.5
        n_lap_active = lap_active.float().sum(-1).clamp(min=1)
        frac_pos_lap = ((lap_flat > 0.1) & lap_active).float().sum(-1) / n_lap_active
        frac_neg_lap = ((lap_flat < -0.1) & lap_active).float().sum(-1) / n_lap_active
        frac_zero_lap = 1.0 - frac_pos_lap - frac_neg_lap
        lap_hist = torch.stack([frac_pos_lap, frac_neg_lap, frac_zero_lap], dim=-1)

        # Curvature alternation: fraction of axis-adjacent voxel pairs,
        # restricted to the 1-voxel dilation of the shape, whose Laplacian
        # changes sign — high for saddles / alternating-curvature shapes.
        lap_3d = lap.squeeze(1)
        # max_pool3d dilates the occupancy by one voxel in every direction.
        boundary_mask = F.max_pool3d(vox, kernel_size=3, stride=1, padding=1).squeeze(1)

        # x-adjacent pairs: both endpoints must be inside the dilated mask.
        bm_x = boundary_mask[:, 1:, :, :] * boundary_mask[:, :-1, :, :]
        flip_x = (torch.sign(lap_3d[:, 1:, :, :]) * torch.sign(lap_3d[:, :-1, :, :]) < 0).float()
        active_flips_x = (flip_x * bm_x).sum(dim=(1, 2, 3))
        active_pairs_x = bm_x.sum(dim=(1, 2, 3)).clamp(min=1)

        # y-adjacent pairs.
        bm_y = boundary_mask[:, :, 1:, :] * boundary_mask[:, :, :-1, :]
        flip_y = (torch.sign(lap_3d[:, :, 1:, :]) * torch.sign(lap_3d[:, :, :-1, :]) < 0).float()
        active_flips_y = (flip_y * bm_y).sum(dim=(1, 2, 3))
        active_pairs_y = bm_y.sum(dim=(1, 2, 3)).clamp(min=1)

        # z-adjacent pairs.
        bm_z = boundary_mask[:, :, :, 1:] * boundary_mask[:, :, :, :-1]
        flip_z = (torch.sign(lap_3d[:, :, :, 1:]) * torch.sign(lap_3d[:, :, :, :-1]) < 0).float()
        active_flips_z = (flip_z * bm_z).sum(dim=(1, 2, 3))
        active_pairs_z = bm_z.sum(dim=(1, 2, 3)).clamp(min=1)

        # Average flip rate across the three axes -> (B, 1).
        alternation = ((active_flips_x / active_pairs_x +
                        active_flips_y / active_pairs_y +
                        active_flips_z / active_pairs_z) / 3.0).unsqueeze(-1)

        # Mean gradient over occupied voxels: a nonzero mean indicates a
        # directional asymmetry of the occupancy along that axis.
        gx_mean = (gx.squeeze(1) * grid).sum(dim=(1, 2, 3)) / total_occ.squeeze(-1)
        gy_mean = (gy.squeeze(1) * grid).sum(dim=(1, 2, 3)) / total_occ.squeeze(-1)
        gz_mean = (gz.squeeze(1) * grid).sum(dim=(1, 2, 3)) / total_occ.squeeze(-1)
        grad_asym = torch.stack([gx_mean, gy_mean, gz_mean], dim=-1)

        # Mean gradient magnitude in 5 radial bins from the centroid.
        # The 5/3.5 scale maps distances up to 3.5 voxels onto the bins —
        # presumably matched to GS == 5; verify if GS changes.
        dists = diff_from_center.norm(dim=-1)
        # nan_to_num is a defensive guard; dists should always be finite here.
        bin_idx = torch.nan_to_num(dists * (5.0 / 3.5), nan=0.0).long().clamp(0, 4)
        active_mask = (flat_grid > 0.5)
        # NOTE(review): this zeros tensor is immediately overwritten below —
        # dead initialization kept for byte-compatibility.
        radial_grad = torch.zeros(B, 5, device=device)
        # Scatter-free binned mean via one-hot masks.
        weighted_mag = grad_mag * active_mask.float()
        one_hot = F.one_hot(bin_idx, 5).float()
        active_oh = one_hot * active_mask.float().unsqueeze(-1)
        counts = active_oh.sum(dim=1).clamp(min=1)
        radial_grad = (weighted_mag.unsqueeze(-1) * active_oh).sum(dim=1) / counts

        # Learned conv summaries of the Laplacian and gradient fields.
        lap_feat = self.lap_conv(lap).reshape(B, -1)

        grad_feat = self.grad_conv(grad_field).reshape(B, -1)

        # Concatenate the 15 hand-crafted scalars.
        raw_feat = torch.cat([
            direction_hist,
            lap_hist,
            alternation,
            grad_asym,
            radial_grad,
        ], dim=-1)

        all_feat = torch.cat([raw_feat, lap_feat, grad_feat], dim=-1)

        direction_gate = self.direction_net(all_feat)
        direction_feat = self.direction_feat_net(all_feat)

        return direction_gate, direction_feat, alternation
| |
|
| |
|
| | |
| |
|
| | def deform_grid(grid, p_dropout=0.1, p_add=0.1, p_shift=0.15): |
| | """Fully vectorized voxel augmentation — zero CPU-GPU sync points.""" |
| | B = grid.shape[0] |
| | device = grid.device |
| | r = torch.rand(B, 3, device=device) |
| | out = grid.clone() |
| |
|
| | |
| | drop_sel = (r[:, 0] < p_dropout).view(B, 1, 1, 1) |
| | keep = torch.rand_like(out) > 0.15 |
| | out = torch.where(drop_sel, out * keep.float(), out) |
| |
|
| | |
| | add_sel = (r[:, 1] < p_add).view(B, 1, 1, 1).float() |
| | dilated = F.max_pool3d(out.unsqueeze(1), kernel_size=3, stride=1, padding=1).squeeze(1) |
| | boundary = ((dilated > 0.5) & (out < 0.5)).float() |
| | add_noise = (torch.rand_like(out) < 0.3).float() |
| | out = (out + boundary * add_noise * add_sel).clamp(max=1.0) |
| |
|
| | |
| | shift_sel = (r[:, 2] < p_shift) |
| | axes = torch.randint(3, (B,), device=device) |
| | dirs = torch.randint(0, 2, (B,), device=device) * 2 - 1 |
| |
|
| | |
| | |
| | versions = [] |
| | for ax in range(3): |
| | for d in [-1, 1]: |
| | s = torch.roll(out, shifts=d, dims=ax + 1) |
| | |
| | if d == 1: |
| | if ax == 0: s[:, 0, :, :] = 0 |
| | elif ax == 1: s[:, :, 0, :] = 0 |
| | else: s[:, :, :, 0] = 0 |
| | else: |
| | if ax == 0: s[:, -1, :, :] = 0 |
| | elif ax == 1: s[:, :, -1, :] = 0 |
| | else: s[:, :, :, -1] = 0 |
| | versions.append(s) |
| | versions.append(out) |
| | stacked = torch.stack(versions, dim=0) |
| |
|
| | |
| | assign = torch.where(shift_sel, axes * 2 + (dirs == 1).long(), torch.full_like(axes, 6)) |
| | |
| | out = stacked[assign, torch.arange(B, device=device)] |
| |
|
| | return out |
| |
|
| |
|
| | |
| |
|
class CurvatureHead(nn.Module):
    """
    Axis-aware curvature detection with differentiation gating.

    1. Per-axis max projections -> 2D conv (keeps 2×2 spatial)
    2. Radial occupancy profile from centroid
    3. Axial symmetry + translation invariance scores
    4. 3D conv with spatial preservation (2×2×2)
    5. DifferentiationGate: gradient/Laplacian analysis for direction detection

    The DifferentiationGate modulates curvature features so that
    convex and concave shapes get distinct representations even when
    their occupancy patterns are nearly identical.
    """

    def __init__(self, rigid_feat_dim, fill_dim, embed_dim):
        super().__init__()

        # 2D conv applied to each of the three axis projections; weights are
        # shared across axes by batching the projections in forward().
        self.plane_conv = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1), nn.GELU(),
            nn.Conv2d(16, 16, 3, padding=1), nn.GELU(),
            nn.AdaptiveAvgPool2d(2))
        plane_feat_dim = 3 * 16 * 4  # 3 axes x 16 channels x 2x2 pooled

        # MLP over the 5-bin radial occupancy profile.
        n_radial = 5
        self.radial_net = nn.Sequential(
            nn.Linear(n_radial, 32), nn.GELU(), nn.Linear(32, 16))
        radial_feat_dim = 16

        # (symmetry, translation-invariance) per axis -> 6 scalars.
        symmetry_feat_dim = 6

        # 3D conv over the raw voxel grid, pooled to 2x2x2 cells.
        self.voxel_conv = nn.Sequential(
            nn.Conv3d(1, 16, 3, padding=1), nn.GELU(),
            nn.Conv3d(16, 32, 3, padding=1), nn.GELU(),
            nn.AdaptiveAvgPool3d(2))
        voxel3d_feat_dim = 32 * 8

        # Gradient/Laplacian direction analysis (convex vs concave).
        self.diff_gate = DifferentiationGate(embed_dim)

        # All descriptors plus upstream rigid/fill features feed the gate.
        pre_gate_dim = (plane_feat_dim + radial_feat_dim + symmetry_feat_dim +
                        voxel3d_feat_dim + rigid_feat_dim + fill_dim)

        # Projection applied before the direction gate multiplies in.
        self.pre_gate_proj = nn.Sequential(
            SwiGLU(pre_gate_dim, embed_dim * 2),
            nn.Linear(embed_dim * 2, embed_dim))

        # gated (embed) + dir_feat (embed) + alternation (1) + raw skip.
        post_gate_dim = embed_dim + embed_dim + 1 + pre_gate_dim

        # Output heads: curved-ness probability, curvature-type logits, and
        # a feature embedding consumed downstream.
        self.curved_head = nn.Sequential(
            SwiGLU(post_gate_dim, embed_dim),
            nn.Linear(embed_dim, 1), nn.Sigmoid())
        self.curv_type_head = nn.Sequential(
            SwiGLU(post_gate_dim, embed_dim),
            nn.Linear(embed_dim, NUM_CURVATURES))
        self.curv_features = nn.Sequential(
            SwiGLU(post_gate_dim, embed_dim * 2),
            nn.Linear(embed_dim * 2, embed_dim))

    def forward(self, grid, rigid_retained, fill_ratios):
        """
        grid: (B, GS, GS, GS) occupancy; rigid_retained: (B, rigid_feat_dim);
        fill_ratios: (B, fill_dim).
        Returns (is_curved, curv_logits, curv_feat, alternation).
        """
        B = grid.shape[0]

        # Max-projection along each spatial axis -> three 2D silhouettes.
        proj_x = grid.max(dim=1).values
        proj_y = grid.max(dim=2).values
        proj_z = grid.max(dim=3).values

        # Run all three projections through plane_conv in one batched call
        # (stacked along the batch dim, then unstacked).
        projs_batched = torch.cat([
            proj_x.unsqueeze(1), proj_y.unsqueeze(1), proj_z.unsqueeze(1)
        ], dim=0)
        plane_all = self.plane_conv(projs_batched).reshape(3, B, -1)
        plane_feat = plane_all.permute(1, 0, 2).reshape(B, -1)

        radial = self._radial_profile(grid)
        radial_feat = self.radial_net(radial)

        sym_feat = self._symmetry_features(proj_x, proj_y, proj_z)

        vox3d_feat = self.voxel_conv(grid.unsqueeze(1)).reshape(B, -1)

        # Concatenate all descriptors with the upstream features.
        raw_combined = torch.cat([
            plane_feat, radial_feat, sym_feat, vox3d_feat,
            rigid_retained, fill_ratios], dim=-1)

        pre_gate = self.pre_gate_proj(raw_combined)

        # Direction analysis distinguishes convex vs concave occupancy.
        dir_gate, dir_feat, alternation = self.diff_gate(grid)

        # Multiplicative gating by curvature direction.
        gated = pre_gate * dir_gate

        # Keep raw_combined as a skip path so gating cannot destroy signal.
        combined = torch.cat([gated, dir_feat, alternation, raw_combined], dim=-1)

        is_curved = self.curved_head(combined)
        curv_logits = self.curv_type_head(combined)
        curv_feat = self.curv_features(combined)
        return is_curved, curv_logits, curv_feat, alternation

    def _radial_profile(self, grid):
        # 5-bin histogram of occupancy vs distance from the per-sample
        # centroid, normalized by total occupancy. max_dist = 3.5 presumably
        # matches the GS == 5 grid radius — verify if GS changes.
        B = grid.shape[0]
        device = grid.device
        coords = torch.stack(torch.meshgrid(
            torch.arange(GS, device=device, dtype=torch.float32),
            torch.arange(GS, device=device, dtype=torch.float32),
            torch.arange(GS, device=device, dtype=torch.float32),
            indexing="ij"), dim=-1)
        flat_grid = grid.reshape(B, -1)
        flat_coords = coords.reshape(-1, 3)
        total_occ = flat_grid.sum(dim=-1, keepdim=True).clamp(min=1)
        centroids = (flat_grid.unsqueeze(-1) * flat_coords.unsqueeze(0)).sum(dim=1) / total_occ
        diffs = flat_coords.unsqueeze(0) - centroids.unsqueeze(1)
        dists = diffs.norm(dim=-1)
        max_dist = 3.5
        n_bins = 5
        # nan_to_num is a defensive guard; dists should be finite here.
        bin_idx = torch.nan_to_num(dists * (float(n_bins) / max_dist), nan=0.0).long().clamp(0, n_bins - 1)
        one_hot = F.one_hot(bin_idx, n_bins).float()
        weighted = flat_grid.unsqueeze(-1) * one_hot
        profile = weighted.sum(dim=1) / total_occ
        return profile

    def _symmetry_features(self, proj_x, proj_y, proj_z):
        # Per projection: mirror symmetry along both in-plane axes (averaged)
        # and translation invariance (1 - mean one-step shift difference).
        projs = torch.stack([proj_x, proj_y, proj_z], dim=1)
        fh = torch.flip(projs, dims=[2])
        fv = torch.flip(projs, dims=[3])
        sym = 1.0 - ((projs - fh).abs().mean(dim=(2, 3)) +
                     (projs - fv).abs().mean(dim=(2, 3))) / 2
        shift_diff = (projs[:, :, 1:, :] - projs[:, :, :-1, :]).abs().mean(dim=(2, 3))
        trans_inv = 1.0 - shift_diff
        # Interleave (sym, trans_inv) per axis -> (B, 6).
        return torch.stack([sym[:, 0], trans_inv[:, 0],
                            sym[:, 1], trans_inv[:, 1],
                            sym[:, 2], trans_inv[:, 2]], dim=-1)
| |
|
| |
|
| | |
| |
|
def compute_confidence(logits):
    """
    Compute real calibrated confidence metrics from logits.

    Args:
        logits: (B, C) unnormalized class scores; requires C >= 2
            (topk(2) raises otherwise).

    Returns dict with:
        max_prob: max(softmax(logits)) — calibrated top-class probability
        margin: top1_prob - top2_prob — disambiguation strength
        entropy: normalized entropy in [0, 1] (entropy / log C);
            lower = more confident
        confidence: margin — primary confidence signal for gating
    """
    probs = F.softmax(logits, dim=-1)

    # A single topk(2) call yields both the top probability and the
    # runner-up — no need for a separate max() pass over probs.
    top2 = probs.topk(2, dim=-1).values
    max_prob = top2[:, 0]
    margin = top2[:, 0] - top2[:, 1]

    # Entropy via log_softmax for numerical stability; normalize by the
    # maximum possible entropy log(C) so values are comparable across
    # heads with different class counts.
    log_probs = F.log_softmax(logits, dim=-1)
    entropy = -(probs * log_probs).sum(dim=-1)
    max_entropy = math.log(logits.shape[-1])
    norm_entropy = entropy / max_entropy

    return {
        "max_prob": max_prob,
        "margin": margin,
        "entropy": norm_entropy,
        "confidence": margin,
    }
| |
|
| |
|
| | |
| |
|
class RectifiedFlowArbiter(nn.Module):
    """
    Rectified flow matching for ambiguous classification refinement.

    Real flow matching requires a target endpoint to define the velocity field.
    We learn class prototypes in latent space as targets: for a sample of class c,
    the target is prototype[c]. The velocity field learns to transport the
    encoded feature z0 toward the correct prototype z1 in straight lines:

        v_target = z1 - z0 (rectified: straight path from source to target)
        loss = ||v_predicted - v_target||^2 (flow matching objective)

    At inference, the arbiter integrates the learned velocity field from z0,
    landing near the correct class prototype. Classification reads off the
    nearest prototype.

    Confidence gating: velocity magnitude is scaled by (1 - margin), so
    confident first-pass predictions receive minimal correction.
    """

    def __init__(self, feat_dim, n_classes, n_steps=4, latent_dim=128, embed_dim=64):
        super().__init__()
        self.n_steps = n_steps       # Euler integration steps at inference
        self.n_classes = n_classes
        self.dt = 1.0 / n_steps      # uniform step size over t in [0, 1]
        self.latent_dim = latent_dim

        # Feature -> latent encoder (produces z0, the flow source).
        self.encode = nn.Sequential(
            nn.Linear(feat_dim, latent_dim * 2), nn.GELU(),
            nn.Linear(latent_dim * 2, latent_dim))

        # Learned per-class latent prototypes (z1, the flow targets).
        self.prototypes = nn.Parameter(torch.randn(n_classes, latent_dim) * 0.05)

        # Maps the 16-dim sinusoidal time encoding to an embedding.
        self.time_embed = nn.Sequential(
            nn.Linear(16, embed_dim), nn.GELU(),
            nn.Linear(embed_dim, embed_dim))

        # Embeds the (max_prob, margin, entropy) triple of the first pass.
        self.conf_embed = nn.Sequential(
            nn.Linear(3, embed_dim), nn.GELU(),
            nn.Linear(embed_dim, embed_dim))

        # Velocity field v(z, t, conf).
        vel_in = latent_dim + embed_dim + embed_dim
        self.velocity = nn.Sequential(
            SwiGLU(vel_in, latent_dim),
            nn.Linear(latent_dim, latent_dim),
            SwiGLU(latent_dim, latent_dim),
            nn.Linear(latent_dim, latent_dim))

        # Per-dimension sigmoid gate on the velocity, driven by confidence.
        self.vel_gate = nn.Sequential(
            nn.Linear(embed_dim, latent_dim), nn.Sigmoid())

        # Reads class logits from the latent plus prototype distances.
        self.classifier_head = nn.Sequential(
            SwiGLU(latent_dim + n_classes, 96),
            nn.Linear(96, n_classes))

        # Scalar in (0, 1): how much to trust initial vs refined logits.
        self.blend_head = nn.Sequential(
            nn.Linear(feat_dim, 64), nn.GELU(),
            nn.Linear(64, 1), nn.Sigmoid())

        # Confidence of the refined prediction, read from the final latent.
        self.refined_confidence = nn.Sequential(
            SwiGLU(latent_dim, 32),
            nn.Linear(32, 1), nn.Sigmoid())

    def _time_encoding(self, t, device):
        # 8 log-spaced frequencies -> 16-dim sin/cos features of t.
        freqs = torch.exp(torch.linspace(0, -4, 8, device=device))
        args = t.unsqueeze(-1) * freqs.unsqueeze(0)
        return torch.cat([args.sin(), args.cos()], dim=-1)

    def _proto_logits(self, z):
        """Classify by negative distance to prototypes."""
        # cdist over a fake batch dim -> (B, n_classes) pairwise distances.
        dists = torch.cdist(z.unsqueeze(0), self.prototypes.unsqueeze(0)).squeeze(0)
        # Feed both the latent and -dists so the head can mix both signals.
        combined = torch.cat([z, -dists], dim=-1)
        return self.classifier_head(combined)

    def forward(self, features, initial_logits, labels=None):
        """
        features: (B, feat_dim)
        initial_logits: (B, n_classes)
        labels: (B,) — only during training, for flow matching target

        Returns:
            refined_logits, refined_conf, initial_conf, trajectory_logits,
            flow_loss, blend_weight
        """
        B = features.shape[0]
        device = features.device

        # First-pass confidence drives how much correction is applied.
        initial_conf = compute_confidence(initial_logits)
        conf_input = torch.stack([
            initial_conf["max_prob"],
            initial_conf["margin"],
            initial_conf["entropy"]], dim=-1)
        conf_emb = self.conf_embed(conf_input)

        # Velocity gate scaled by (1 - margin): confident samples barely move.
        gate = self.vel_gate(conf_emb)
        inv_conf = (1.0 - initial_conf["margin"]).unsqueeze(-1)
        adaptive_gate = gate * inv_conf

        # Flow source.
        z0 = self.encode(features)

        # Flow-matching loss (training only): regress v(z_t, t) onto
        # v_target = z1 - z0 at a random t along the straight path.
        flow_loss = torch.tensor(0.0, device=device)
        if labels is not None:
            # Target endpoint: the prototype of the true class.
            z1 = self.prototypes[labels]
            v_target = z1 - z0

            # Sample one random time per sample.
            t_rand = torch.rand(B, device=device)
            t_emb = self.time_embed(self._time_encoding(t_rand, device))

            # Point on the straight path z0 -> z1 at time t.
            z_t = z0 + t_rand.unsqueeze(-1) * v_target

            vel_input = torch.cat([z_t, t_emb, conf_emb], dim=-1)
            v_pred = self.velocity(vel_input) * adaptive_gate
            v_pred = v_pred.clamp(-20, 20)  # guard against velocity blow-up

            # Target clamped symmetrically so the loss is bounded too.
            flow_loss = F.mse_loss(v_pred, v_target.clamp(-20, 20))

        # Inference path: Euler-integrate the velocity field from z0,
        # recording per-step prototype logits.
        z = z0
        trajectory_logits = []
        for step in range(self.n_steps):
            t_val = torch.full((B,), step * self.dt, device=device)
            t_emb = self.time_embed(self._time_encoding(t_val, device))

            vel_input = torch.cat([z, t_emb, conf_emb], dim=-1)
            v = self.velocity(vel_input) * adaptive_gate
            # Same clamp as in training for consistency.
            v = v.clamp(-20, 20)

            z = z + self.dt * v
            trajectory_logits.append(self._proto_logits(z))

        refined_logits = trajectory_logits[-1]
        refined_conf = self.refined_confidence(z)

        # Mixing coefficient between initial and refined logits.
        blend_weight = self.blend_head(features)

        return refined_logits, refined_conf, initial_conf, trajectory_logits, flow_loss, blend_weight
| |
|
| |
|
| | |
| |
|
class GeometricShapeClassifier(nn.Module):
    """Top-level voxel-shape classifier.

    Pipeline: per-voxel embeddings (occupancy + normalized position) ->
    tracer-token cross-attention -> pairwise tracer interactions ->
    stacked CapacityHeads (overflow cascades 0->1->2->3) -> CurvatureHead ->
    linear classifier, refined by a RectifiedFlowArbiter and blended.

    NOTE(review): depends on module-level constants NUM_CLASSES and GS
    (grid size) defined elsewhere in this file.
    """

    def __init__(self, n_classes=NUM_CLASSES, embed_dim=64, n_tracers=5):
        super().__init__()
        self.n_tracers = n_tracers
        self.embed_dim = embed_dim

        # Embeds (occupancy, x, y, z) per voxel.
        self.voxel_embed = nn.Sequential(
            nn.Linear(4, embed_dim), nn.GELU(), nn.Linear(embed_dim, embed_dim))

        # Normalized voxel coordinates in [0, 1]^3, registered as a buffer.
        coords = torch.stack(torch.meshgrid(
            torch.arange(GS, dtype=torch.float32),
            torch.arange(GS, dtype=torch.float32),
            torch.arange(GS, dtype=torch.float32),
            indexing="ij"), dim=-1) / (GS - 1)
        self.register_buffer("pos_grid", coords)

        # Learned tracer queries that cross-attend over the voxel embeddings.
        self.tracer_tokens = nn.Parameter(torch.randn(n_tracers, embed_dim) * 0.02)
        self.tracer_attn = nn.MultiheadAttention(embed_dim, num_heads=4, batch_first=True)
        self.tracer_gate = nn.Sequential(nn.Linear(embed_dim * 2, embed_dim), nn.Sigmoid())
        self.tracer_interact = nn.Sequential(
            nn.Linear(embed_dim * 2, embed_dim), nn.GELU(), nn.Linear(embed_dim, embed_dim))
        # Predicts a scalar "edge length" per tracer pair (auxiliary output).
        self.edge_head = nn.Sequential(
            SwiGLU(embed_dim * 2, 32), nn.Linear(32, 1))

        # Precomputed index pairs (i < j) over tracers.
        _pi, _pj = [], []
        for i in range(n_tracers):
            for j in range(i + 1, n_tracers):
                _pi.append(i); _pj.append(j)
        self.register_buffer("_pair_i", torch.tensor(_pi, dtype=torch.long))
        self.register_buffer("_pair_j", torch.tensor(_pj, dtype=torch.long))
        self.n_pairs = len(_pi)

        pool_dim = embed_dim * n_tracers

        # Capacity cascade: each head consumes the previous head's overflow.
        self.dim0 = CapacityHead(pool_dim, embed_dim, init_capacity=0.5)
        self.dim1 = CapacityHead(pool_dim + embed_dim, embed_dim, init_capacity=1.0)
        self.dim2 = CapacityHead(pool_dim + embed_dim, embed_dim, init_capacity=1.5)
        self.dim3 = CapacityHead(pool_dim + embed_dim, embed_dim, init_capacity=2.0)

        # Four retained vectors concatenated.
        rigid_feat_dim = embed_dim * 4
        self.curvature = CurvatureHead(rigid_feat_dim, fill_dim=4, embed_dim=embed_dim)

        # pooled + 4 fill ratios + retained + curvature features + is_curved.
        class_in = pool_dim + 4 + rigid_feat_dim + embed_dim + 1
        self.class_in = class_in
        self.classifier = nn.Sequential(
            nn.Linear(class_in, 256), nn.GELU(), nn.Dropout(0.1),
            nn.Linear(256, 128), nn.GELU(), nn.Linear(128, n_classes))

        # Auxiliary prediction heads (peaks, volume, center of mass).
        self.peak_head = nn.Sequential(
            SwiGLU(class_in, 32), nn.Linear(32, 4))
        self.volume_head = nn.Sequential(
            nn.Linear(class_in, 64), nn.GELU(), nn.Linear(64, 1))
        self.cm_head = nn.Sequential(
            SwiGLU(class_in, 64), nn.Linear(64, 1), nn.Tanh())

        # Flow-based refinement of ambiguous first-pass predictions.
        self.arbiter = RectifiedFlowArbiter(
            feat_dim=class_in, n_classes=n_classes,
            n_steps=4, latent_dim=128, embed_dim=embed_dim)

    def forward(self, grid, labels=None):
        """grid: (B, GS, GS, GS) occupancy; labels only during training
        (forwarded to the arbiter's flow-matching loss)."""
        B = grid.shape[0]
        occ = grid.reshape(B, GS**3, 1)
        pos = self.pos_grid.reshape(1, GS**3, 3).expand(B, -1, -1)
        voxel_emb = self.voxel_embed(torch.cat([occ, pos], dim=-1))

        # Tracer tokens attend over all voxel embeddings.
        tracers = self.tracer_tokens.unsqueeze(0).expand(B, -1, -1)
        tracers, _ = self.tracer_attn(tracers, voxel_emb, voxel_emb)

        # Gather the (i, j) tracer pairs and concatenate along features.
        left = tracers[:, self._pair_i]
        right = tracers[:, self._pair_j]
        pairs = torch.cat([left, right], dim=-1)

        # Run all pair MLPs in one flattened batch.
        flat_pairs = pairs.reshape(B * self.n_pairs, -1)
        gate = self.tracer_gate(flat_pairs).reshape(B, self.n_pairs, -1)
        interaction = self.tracer_interact(flat_pairs).reshape(B, self.n_pairs, -1)
        edge_lengths = self.edge_head(flat_pairs).reshape(B, self.n_pairs)

        # Scatter each gated interaction back onto BOTH of its endpoints
        # (clone keeps the attention output intact for autograd).
        gated = gate * interaction
        tracer_out = tracers.clone()
        pi_exp = self._pair_i.view(1, self.n_pairs, 1).expand(B, -1, self.embed_dim)
        pj_exp = self._pair_j.view(1, self.n_pairs, 1).expand(B, -1, self.embed_dim)
        tracer_out.scatter_add_(1, pi_exp, gated)
        tracer_out.scatter_add_(1, pj_exp, gated)
        pooled = tracer_out.reshape(B, -1)

        # Capacity cascade: each level sees pooled + previous overflow.
        fill0, ovf0, ret0, cap0, _ = self.dim0(pooled)
        fill1, ovf1, ret1, cap1, _ = self.dim1(torch.cat([pooled, ovf0], -1))
        fill2, ovf2, ret2, cap2, _ = self.dim2(torch.cat([pooled, ovf1], -1))
        fill3, ovf3, ret3, cap3, _ = self.dim3(torch.cat([pooled, ovf2], -1))

        fill_ratios = torch.cat([fill0, fill1, fill2, fill3], dim=-1)
        rigid_retained = torch.cat([ret0, ret1, ret2, ret3], dim=-1)
        ovf_norms = torch.stack([
            ovf0.norm(dim=-1), ovf1.norm(dim=-1),
            ovf2.norm(dim=-1), ovf3.norm(dim=-1)], dim=-1)

        is_curved, curv_logits, curv_feat, alternation = self.curvature(grid, rigid_retained, fill_ratios)
        full = torch.cat([pooled, fill_ratios, rigid_retained, curv_feat, is_curved], dim=-1)

        # First-pass classification.
        initial_logits = self.classifier(full)

        # Flow-based refinement (also returns the flow-matching loss).
        refined_logits, refined_conf, initial_conf, trajectory_logits, flow_loss, blend_weight = \
            self.arbiter(full, initial_logits, labels=labels)

        # Learned blend between first-pass and refined logits.
        final_logits = blend_weight * initial_logits + (1.0 - blend_weight) * refined_logits

        return {
            # Classification outputs.
            "class_logits": final_logits,
            "initial_logits": initial_logits,
            "refined_logits": refined_logits,
            "trajectory_logits": trajectory_logits,
            # Training signal from the arbiter.
            "flow_loss": flow_loss,
            # Confidence diagnostics.
            "confidence": initial_conf["confidence"],
            "max_prob": initial_conf["max_prob"],
            "entropy": initial_conf["entropy"],
            "refined_confidence": refined_conf,
            "blend_weight": blend_weight.squeeze(-1),
            # Auxiliary heads and intermediate signals.
            "peak_logits": self.peak_head(full),
            "volume_pred": self.volume_head(full).squeeze(-1),
            "cm_pred": self.cm_head(full).squeeze(-1),
            "edge_lengths": edge_lengths,
            "fill_ratios": fill_ratios,
            "overflows": ovf_norms,
            "capacities": torch.stack([cap0, cap1, cap2, cap3]),
            "is_curved_pred": is_curved,
            "curv_type_logits": curv_logits,
            "alternation": alternation,
            # Full feature vector, exposed for downstream probing.
            "features": full,
        }
| |
|
| |
|
| | |
# Import-time smoke test: instantiate the model once and report its size.
_model = GeometricShapeClassifier()
_n_params = sum(p.numel() for p in _model.parameters())
print(f'GeometricShapeClassifier: {_n_params:,} params')
del _model, _n_params