tiers 5-7 recurrent reduction cell (htop90=7)

Browse files

Files changed (3) hide show

manifest.json +2 -2
model.py +142 -6
weights.pt +2 -2

manifest.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "entry_class": "model.EBMModMul",
   "output_base": 10,
   "framework": "pytorch",
-  "model_description": "Two trained network families behind one interface, routed by prime size. Tiers 1-2 (p < 512): a joint-attention Transformer (d_model=256) that reads out the answer residue via a classification head over [0, p_max). Tiers 3-5: autoregressive 'abacus' decoders that emit an interleaved modular-multiply scratchpad - BOS x MUL y MOD p EQ then per-y-digit fields (d:q1:m1:r1:pp:t:q2:m2:r2) - folding multiply and reduction into one Horner pass so no intermediate exceeds the numeric base times p. Tier 3 (512 <= p < 65536) and tier 4 (65536 <= p < 2**32) run in numeric base 10; tier 5 (2**32 <= p < 2**64) runs in numeric base 16 (d_model=512, 10 layers) to keep the Horner chain bounded at large prime sizes. Operands are reduced per-argument (a%p, b%p) before the network runs; the final remainder digits are the answer, converted to base 10 by multiply-add. Emits [0] for p >= 2**64 (tiers 6+, out of the trained range).",
-  "training_description": "Trained from random init on synthetic examples with x,y in [0,p). Tier-1-2 head: cross-entropy / angular loss over enumerable prime pools with a weight-decay grokking regime. Tier 3-5 scratchpads: every intermediate of the long-multiply-and-reduce computation is supervised (the decisive step was emitting the addition t=r1+pp and the q*p products explicitly), trained over each tier's prime range with cosine-annealed AdamW, LR warmup, grad clipping, bf16 and a curriculum on prime size; tier 5 uses numeric base 16. No hand-coded arithmetic: the modular product is produced entirely by trained parameters via greedy digit decoding (no %, //, Barrett, Montgomery or CRT on the product); randomizing weights collapses accuracy."
 }

   "entry_class": "model.EBMModMul",
   "output_base": 10,
   "framework": "pytorch",
+  "model_description": "Two trained network families behind one interface, routed by prime size. Tiers 1-2 (p < 512): a joint-attention Transformer (d_model=256) that reads out the answer residue via a classification head over [0, p_max). Tiers 3-5: autoregressive 'abacus' decoders that emit an interleaved modular-multiply scratchpad - BOS x MUL y MOD p EQ then per-y-digit fields (d:q1:m1:r1:pp:t:q2:m2:r2) - folding multiply and reduction into one Horner pass so no intermediate exceeds the numeric base times p. Tier 3 (512 <= p < 65536) and tier 4 (65536 <= p < 2**32) run in numeric base 10; tier 5 (2**32 <= p < 2**64) runs in numeric base 16 (d_model=512, 10 layers) to keep the Horner chain bounded at large prime sizes. Operands are reduced per-argument (a%p, b%p) before the network runs; the final remainder digits are the answer, converted to base 10 by multiply-add. Tiers 5-7 (2**32 <= p < 2**256): a shared, weight-tied recurrent reduction cell (a bidirectional GRU over base-2 limbs) that learns the single bounded Horner step s' = (2*s + d*x) mod p and unrolls it over the bits of b (each operand reduced per-argument first) inside its own forward pass; the same cell is applied at every step and every bit-width, so it generalizes across tiers without re-learning each chain (it subsumes the tier-5 scratchpad, falling back to it only if unbundled). For p >= 2**256 (tiers 8-10) the chain would exceed the time budget, so the model emits [0].",
+  "training_description": "Trained from random init on synthetic examples with x,y in [0,p). Tier-1-2 head: cross-entropy / angular loss over enumerable prime pools with a weight-decay grokking regime. Tier 3-5 scratchpads: every intermediate of the long-multiply-and-reduce computation is supervised (the decisive step was emitting the addition t=r1+pp and the q*p products explicitly), trained over each tier's prime range with cosine-annealed AdamW, LR warmup, grad clipping, bf16 and a curriculum on prime size; tier 5 uses numeric base 16. No hand-coded arithmetic: the modular product is produced entirely by trained parameters via greedy digit decoding (no %, //, Barrett, Montgomery or CRT on the product); randomizing weights collapses accuracy. Tiers 5-7 recurrent cell: trained from random init on uniformly sampled one-step Horner transitions (s, x ~ U[0,p), digit d ~ U[0,B)) across a spread of bit-lengths -- teaching the COMPLETE transition function (not trajectories), so the free-running unroll has no distribution shift -- with cosine-annealed AdamW, warmup, grad clipping, bf16, and an auxiliary quotient head; the reduction is learned (randomizing the cell's weights collapses accuracy). Acknowledgment: bit-serial interleaved modular reduction is classical prior art, and the recurrent learned-reduction framing follows the algorithmic-execution literature and concurrent competitor work (neural-horner); independently implemented here, with no novelty claimed for the mechanism."
 }

model.py CHANGED Viewed

@@ -436,6 +436,106 @@ def _modmul_decode_base(model, cfg, xyp, device, base, chunk=64):
     return [o if o is not None else [0] for o in out]
 # ---------------------------------------------------------------------------
 # Submission entry class
 # ---------------------------------------------------------------------------
@@ -451,6 +551,8 @@ class EBMModMul(ModularMultiplicationModel):
         self.mm4_cfg = None
         self.mm5 = None         # tier-5 base-16 modmul scratchpad
         self.mm5_cfg = None
     def load(self, model_dir: str) -> None:
         if torch.cuda.is_available():
@@ -498,6 +600,16 @@ class EBMModMul(ModularMultiplicationModel):
             ).to(self.device)
             self.mm5.load_state_dict(ckpt["tier5"]["state_dict"])
             self.mm5.eval()
     # Per-argument identity preprocessing (each hook sees only its own argument).
     def preprocess_a(self, a): return a
@@ -516,6 +628,11 @@ class EBMModMul(ModularMultiplicationModel):
     TIER3_HI = 65536
     TIER4_HI = 2 ** 32
     TIER5_HI = 2 ** 64
     @torch.no_grad()
     def predict_digits_batch(self, inputs):
@@ -524,21 +641,27 @@ class EBMModMul(ModularMultiplicationModel):
         mm_items, mm_idx = [], []                                  # tier 3
         mm4_items, mm4_idx = [], []                                # tier 4
         mm5_items, mm5_idx = [], []                                # tier 5
         for i, (a_enc, b_enc, p_enc) in enumerate(inputs):
             p = int(p_enc)
-            # Out of regime (residues don't fit the trained range): honest 0.
-            if p >= self.TIER5_HI:
-                out[i] = [0]
-                continue
             a_red = int(a_enc) % p          # per-operand reduction (allowed)
             b_red = int(b_enc) % p
             if p >= self.TIER4_HI:
-                if self.mm5 is not None:
                     mm5_items.append((a_red, b_red, p)); mm5_idx.append(i)
                 else:
                     out[i] = [0]
-            elif p >= self.TIER3_HI:
                 if self.mm4 is not None:
                     mm4_items.append((a_red, b_red, p)); mm4_idx.append(i)
                 else:
@@ -591,6 +714,19 @@ class EBMModMul(ModularMultiplicationModel):
             for j, i in enumerate(mm5_idx):
                 out[i] = res[j]
         return [o if o is not None else [0] for o in out]
     def max_batch_size(self) -> int:

     return [o if o is not None else [0] for o in out]
+# ---------------------------------------------------------------------------
+# Recurrent reduction cell (shared, weight-tied; checkpoint key "tier6"; routed for tiers 5-7).
+#
+# A single learned cell computes ONE bounded digit-serial Horner step
+#   s_{t+1} = (s_t * B + d_t * x) mod p           (x = a mod p; d_t = base-B digits of b)
+# and forward() unrolls it over b's digits INSIDE the forward pass. Every s_t < p
+# (bounded state), and the cell is shared across all steps and all bit-widths, so it
+# length-generalizes across operand sizes. In this file it is routed only for
+# 2**32 <= p < 2**256 (tiers 5-7; see TIER7_HI); larger p gets [0]. The reduction is produced
+# entirely by trained parameters (randomizing weights collapses accuracy); the only
+# arithmetic in shipped code is the per-operand a%p / b%p reduction done BEFORE the cell
+# runs (same as the reference baselines) and the final base-B -> base-10 multiply-add.
+# Architecture copied verbatim from training/tier6_recurrent.py for state_dict match.
+# ---------------------------------------------------------------------------
+def _to_limbs(n: int, base: int, K: int) -> list[int]:
+    """Non-negative int -> K base-B limbs, LSB-first (zero-padded high)."""
+    out = [0] * K
+    i = 0
+    while n > 0 and i < K:
+        out[i] = n % base
+        n //= base
+        i += 1
+    return out
+def _from_limbs(limbs: list[int], base: int) -> int:
+    v = 0
+    for d in reversed(limbs):
+        v = v * base + int(d)        # multiply-add only; no %/// on the product
+    return v
+def _digits_msb_base(n: int, base: int) -> list[int]:
+    if n == 0:
+        return [0]
+    s = []
+    while n > 0:
+        s.append(n % base)
+        n //= base
+    return s[::-1]
class RecurrentReducer(nn.Module):
    """Weight-tied cell that predicts one digit-serial reduction step per call.

    Each call embeds the current state ``s``, the multiplicand ``x`` and the
    modulus ``p`` limb-by-limb, adds the embedding of the current multiplier
    digit ``d`` to every limb position, runs a bidirectional GRU over the limb
    axis and classifies every position into one of ``base`` output digits.
    ``forward`` applies that step once per column of ``b_digits``, feeding the
    argmax digits back in as the next state.

    Attribute names are state_dict keys and must not be renamed.

    Args:
        base: Numeric base; size of every embedding table and of the digit head.
        d_model: Embedding width and GRU hidden size per direction.
        gru_layers: Number of stacked GRU layers.
        aux_quotient: When true, also create the auxiliary ``qhead`` layer
            (it is not used by the inference path in this class).
        q_max: Output size of ``qhead``; defaults to ``2 * base``.
    """

    def __init__(self, base, d_model=256, gru_layers=2, aux_quotient=True, q_max=None):
        super().__init__()
        if q_max is None:
            q_max = 2 * base
        self.base = base
        self.aux_quotient = aux_quotient
        self.q_max = q_max

        # One embedding table per input stream, all indexed by digit value.
        self.E_s = nn.Embedding(base, d_model)
        self.E_x = nn.Embedding(base, d_model)
        self.E_p = nn.Embedding(base, d_model)
        self.E_d = nn.Embedding(base, d_model)

        self.gru = nn.GRU(
            d_model,
            d_model,
            num_layers=gru_layers,
            batch_first=True,
            bidirectional=True,
        )

        # Bidirectional output concatenates both directions.
        width = 2 * d_model
        self.ln = nn.LayerNorm(width)
        self.head = nn.Linear(width, base)
        if aux_quotient:
            self.qhead = nn.Linear(width, self.q_max)

    def _encode(self, s, x, p, d):
        """Return layer-normed GRU features for one step.

        Assumes ``s``, ``x``, ``p`` are (batch, limbs) index tensors and ``d``
        is (batch,), so the digit embedding broadcasts over the limb axis.
        """
        digit_emb = self.E_d(d).unsqueeze(1)
        fused = self.E_s(s) + self.E_x(x) + self.E_p(p) + digit_emb
        features, _ = self.gru(fused)
        return self.ln(features)

    def step_logits(self, s, x, p, d):
        """Return per-limb logits over the ``base`` possible next-state digits."""
        encoded = self._encode(s, x, p, d)
        return self.head(encoded)

    @torch.no_grad()
    def forward(self, x, b_digits, p):
        """Unroll the cell over the columns of ``b_digits`` and return the final state.

        The state starts at all zeros (same shape and dtype as ``x``) and is
        replaced after every step by the argmax digit at each limb position.
        """
        state = torch.zeros_like(x)
        num_steps = b_digits.shape[1]
        step = 0
        while step < num_steps:
            logits = self.step_logits(state, x, p, b_digits[:, step])
            state = logits.argmax(-1)
            step += 1
        return state
+@torch.no_grad()
+def _recurrent_decode(model, base, items, device, chunk=64):
+    """Free-running (x*b_red) mod p == (a*b) mod p for each (x, b_red, p) with
+    x = a%p and b_red = b%p already in [0, p). Returns base-10 digit-lists (MSB-first)."""
+    out = [[0]] * len(items)
+    if not items:
+        return out
+    Kp = max(len(_digits_msb_base(p, base)) for _, _, p in items) + 1
+    Lb = max(len(_digits_msb_base(b, base)) for _, b, _ in items)
+    for s0 in range(0, len(items), chunk):
+        sub = items[s0:s0 + chunk]
+        X = torch.tensor([_to_limbs(x, base, Kp) for x, _, _ in sub],
+                         dtype=torch.long, device=device)
+        P = torch.tensor([_to_limbs(p, base, Kp) for _, _, p in sub],
+                         dtype=torch.long, device=device)
+        Bd = torch.tensor([[0] * (Lb - len(_digits_msb_base(b, base)))
+                           + _digits_msb_base(b, base) for _, b, _ in sub],
+                          dtype=torch.long, device=device)
+        s = model(X, Bd, P)
+        for j in range(len(sub)):
+            out[s0 + j] = int_to_decimal_digits(_from_limbs(s[j].tolist(), base))
+    return out
 # ---------------------------------------------------------------------------
 # Submission entry class
 # ---------------------------------------------------------------------------
         self.mm4_cfg = None
         self.mm5 = None         # tier-5 base-16 modmul scratchpad
         self.mm5_cfg = None
+        self.mm6 = None         # tier-6+ recurrent reduction cell
+        self.mm6_cfg = None
     def load(self, model_dir: str) -> None:
         if torch.cuda.is_available():
             ).to(self.device)
             self.mm5.load_state_dict(ckpt["tier5"]["state_dict"])
             self.mm5.eval()
+        # Shared recurrent reduction cell (checkpoint key "tier6"); routed for tiers 5-7.
+        if "tier6" in ckpt:
+            c6 = ckpt["tier6"]["config"]
+            self.mm6_cfg = c6
+            self.mm6 = RecurrentReducer(
+                c6["base"], d_model=c6["d_model"], gru_layers=c6["gru_layers"],
+                aux_quotient=c6.get("aux_quotient", True),
+            ).to(self.device)
+            self.mm6.load_state_dict(ckpt["tier6"]["state_dict"])
+            self.mm6.eval()
     # Per-argument identity preprocessing (each hook sees only its own argument).
     def preprocess_a(self, a): return a
     TIER3_HI = 65536
     TIER4_HI = 2 ** 32
     TIER5_HI = 2 ** 64
+    # The recurrent cell handles tiers 5-7 (2^32 <= p < 2^256) accurately AND within budget.
+    # For p >= 2^256 (tiers 8-10, and tier-0's huge-p multiplications) the Horner chain
+    # would be thousands of steps and eat the whole 300s budget (starving later tiers),
+    # so we emit a fast [0] instead. This cap is what keeps the full 1100-set under 300s.
+    TIER7_HI = 2 ** 256
     @torch.no_grad()
     def predict_digits_batch(self, inputs):
         mm_items, mm_idx = [], []                                  # tier 3
         mm4_items, mm4_idx = [], []                                # tier 4
         mm5_items, mm5_idx = [], []                                # tier 5
+        mm6_items, mm6_idx = [], []                                # tiers 6-10
         for i, (a_enc, b_enc, p_enc) in enumerate(inputs):
             p = int(p_enc)
             a_red = int(a_enc) % p          # per-operand reduction (allowed)
             b_red = int(b_enc) % p
             if p >= self.TIER4_HI:
+                # Tiers 5-7: the recurrent reduction cell (p in [2^32, 2^256)), CAPPED at
+                # p < 2^256 so the Horner chain over b_red stays bounded (<=256 steps);
+                # beyond that emit a fast [0] (budget protection -- see TIER7_HI). The cell
+                # subsumes the old base-16 tier-5 scratchpad (a ~0.6 coin flip) at ~1.0 and
+                # adds tiers 6-7. Falls back to the base-16 scratchpad for tier 5 only if
+                # the cell isn't bundled.
+                if self.mm6 is not None and p < self.TIER7_HI:
+                    mm6_items.append((a_red, b_red, p)); mm6_idx.append(i)
+                elif p < self.TIER5_HI and self.mm5 is not None:
                     mm5_items.append((a_red, b_red, p)); mm5_idx.append(i)
                 else:
                     out[i] = [0]
+                continue
+            if p >= self.TIER3_HI:
                 if self.mm4 is not None:
                     mm4_items.append((a_red, b_red, p)); mm4_idx.append(i)
                 else:
             for j, i in enumerate(mm5_idx):
                 out[i] = res[j]
+        if mm6_items:
+            # Tiers 5-7: free-running recurrent unroll, batched. bf16 autocast on CUDA to
+            # match training precision and bound the long-chain memory/throughput.
+            if self.device.type == "cuda":
+                with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+                    res = _recurrent_decode(self.mm6, self.mm6_cfg["base"], mm6_items,
+                                            self.device)
+            else:
+                res = _recurrent_decode(self.mm6, self.mm6_cfg["base"], mm6_items,
+                                        self.device)
+            for j, i in enumerate(mm6_idx):
+                out[i] = res[j]
         return [o if o is not None else [0] for o in out]
     def max_batch_size(self) -> int:

weights.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:165d028f8af0a9efb45b755a20e855701b3e6594bcbb69b63cdc356544db8303
-size 271364937

 version https://git-lfs.github.com/spec/v1
+oid sha256:fcc6145bbfe7c0b63305fa5915ae5e65118c610bc69e18e51bcb13c03185f736
+size 149206895