theapemachine
/

cortex

Model card Files Files and versions

xet

Community

theapemachine committed on 22 days ago

Commit

c7bee4d

verified ·

1 Parent(s): 82900ee

Add cortex/adaptive_depth.py

Browse files

Files changed (1) hide show

cortex/adaptive_depth.py +142 -0

cortex/adaptive_depth.py ADDED Viewed

	@@ -0,0 +1,142 @@

+"""
+AdaptiveDepth: Dynamic layer skipping with learned gates.
+Inspired by GateSkip (2025), Mixture of Depths (Raposo et al. 2024), and
+Router-Tuning (2024).
+Architecture:
+    - Each transformer layer gets a lightweight learned gate: a soft value g ∈ (0, 1), optionally binarized to {0, 1}
+    - The gate decides per-token whether to execute the layer or skip it
+    - Skip = identity (hidden states pass through unchanged)
+    - Execute = normal layer forward + gated residual
+    - Gates are trained to minimize computation while maintaining quality
+    - A budget constraint ensures the model uses a target % of layers per token
+Failure mode addressed:
+    - Fixed compute: All tokens get the same computation depth regardless of difficulty.
+      "The" doesn't need 32 layers of processing, but a complex reasoning step might need all of them.
+    - Wasted compute: Many layers are near-identity for "easy" tokens.
+    - Latency: Dynamic depth enables significant speedup on average.
+    - Overthinking: Too many layers can sometimes HURT performance (representation collapse).
+      Adaptive depth protects against this.
+Injection point: POST_FFN
+    - Rationale: The gate wraps the entire layer's contribution to the residual stream.
+      It decides: "Was this layer's update useful for this token?"
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Union, List
+from cortex.core import CortexModule, InjectionPoint
+class AdaptiveDepth(CortexModule):
+    """
+    Token-wise layer gating for dynamic computation depth.
+    Uses a sigmoid-linear gate: the gate output is in (0, 1) and directly
+    scales the layer's residual update. During inference, gates below a
+    threshold can be rounded to 0 for actual compute savings.
+    Training uses a straight-through estimator for the hard gate, plus a
+    budget regularization loss.
+    Args:
+        hidden_dim: Model hidden dimension
+        target_budget: Target fraction of layers to use per token (0-1)
+        gate_type: "sigmoid" (soft), "straight_through" (hard during forward, soft backward)
+        temperature: Temperature for gating (lower = more binary)
+        budget_loss_weight: Weight for the budget regularization loss
+    """
+    def __init__(
+        self,
+        hidden_dim: int,
+        target_budget: float = 0.7,
+        gate_type: str = "sigmoid",
+        temperature: float = 1.0,
+        budget_loss_weight: float = 0.01,
+        target_layers: Union[List[int], str] = "all",
+    ):
+        super().__init__(InjectionPoint.POST_FFN, target_layers)
+        self.hidden_dim = hidden_dim
+        self.target_budget = target_budget
+        self.gate_type = gate_type
+        self.temperature = temperature
+        self.budget_loss_weight = budget_loss_weight
+        # Gate network: maps hidden state to a scalar gate per token
+        self.gate_net = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim // 4),
+            nn.GELU(),
+            nn.Linear(hidden_dim // 4, 1),
+        )
+        # Initialize gate to be "open" (execute layer) by default
+        nn.init.constant_(self.gate_net[-1].bias, 2.0)  # sigmoid(2) ≈ 0.88
+        # Buffers for monitoring
+        self.register_buffer("_pre_layer_hidden", None, persistent=False)
+        self.register_buffer("_gate_values", None, persistent=False)
+        self.register_buffer("_budget_loss", torch.tensor(0.0), persistent=False)
+    def store_input(self, hidden_states: torch.Tensor):
+        """Store the input to the layer (called via pre-hook)."""
+        self._pre_layer_hidden = hidden_states.detach()
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        layer_idx: int,
+        **kwargs
+    ) -> torch.Tensor:
+        """
+        Gate the layer's residual contribution.
+        post_layer = pre_layer + gate * (post_layer - pre_layer)
+        When gate = 1: post_layer (use full layer output)
+        When gate = 0: pre_layer (skip layer entirely)
+        """
+        # Compute gate value per token
+        gate_logit = self.gate_net(hidden_states) / self.temperature  # [B, T, 1]
+        gate = torch.sigmoid(gate_logit)
+        # Straight-through estimator for hard gating
+        if self.gate_type == "straight_through" and self.training:
+            hard_gate = (gate > 0.5).float()
+            gate = hard_gate - gate.detach() + gate  # STE
+        self._gate_values = gate.detach()
+        # Gate the output: scale by gate, preserve gradients
+        gated_output = gate * hidden_states + (1 - gate) * hidden_states.detach()
+        # Budget regularization loss
+        avg_gate = gate.mean()
+        budget_loss = self.budget_loss_weight * (avg_gate - self.target_budget).pow(2)
+        self._budget_loss = budget_loss.detach()
+        return gated_output
+    def get_gate_stats(self) -> dict:
+        """Return statistics about gate usage."""
+        if self._gate_values is None:
+            return {"mean": 0.0, "std": 0.0, "skip_frac": 0.0}
+        g = self._gate_values
+        return {
+            "mean": g.mean().item(),
+            "std": g.std().item(),
+            "skip_frac": (g < 0.5).float().mean().item(),
+        }
+    def get_budget_loss(self) -> torch.Tensor:
+        """Return the budget regularization loss (add to main loss)."""
+        return self._budget_loss
+    def extra_repr(self):
+        return (f"hidden_dim={self.hidden_dim}, target_budget={self.target_budget}, "
+                f"gate_type={self.gate_type}, {super().extra_repr()}")