// NOTE(review): this copy of the file looks damaged by extraction — confirm against version control before editing.
//   * Line breaks are collapsed, so each physical line mixes `//` comments with the code that used to follow them.
//   * Text between a `<` and the next `>` appears to be missing: the WGSL declarations read `var p : P`, `array`, `vec3`
//     with no address space / element type, and several `for(let l=0; l…` loop headers run straight into unrelated text.
// Visible layout: three WGSL sources (WGSL_RELAX: pass_layer + init_state; WGSL_GRAD: grad_W + grad_B;
// WGSL_AUX: compute_reward + adapt_layer), makeGPUDeep(), class GPUTrainerDeep, then optimizer classes (Adam is the
// only one whose header is visible).
// NOTE(review): the comment inside WGSL_GRAD's struct GP says "2=identity", but rho_mode() returns identity for m > 2.5
// and the prism soft-clip for m > 1.5 — presumably the comment predates prism mode; verify.
// Multi-hidden-layer WebGPU EqProp trainer. // sizes = [D, H1, H2, ..., Hk, O] // Modes: // 0 = adaptive (σ everywhere, u-nudge at output) // 1 = fhn (clip everywhere, ρ-nudge at output) // 2 = prism (soft-clip everywhere, ρ-nudge at output) // // Key design: // * ONE generic compute pipeline `pass_layer` updates any layer using uniforms (ni, no, nxt, ...). // * 3 phases (free / plus / minus): each has its own per-layer state buffer + per-layer bind group. // * Uniform buffers are pre-written once per pass (one per (phase, layer) pair), so the encoder // records dispatches that each pick up the right uniforms. // * Gradient kernels are also generic — per layer transition, compute outer product reduction over batch. import { orth as orthCPU } from './eqprop_lib.js'; // ----- WGSL: relax (one kernel handles any layer) ----- const WGSL_RELAX = ` struct P { ni : u32, no : u32, nxt : u32, B : u32, dt : f32, beta : f32, gamma : f32, mode : f32, // mode: 0=adaptive σ, 1=fhn(clip+cubic), 2=prism (soft-clip via softplus) has_topdown : u32, has_target : u32, noise_scale : f32, iter_seed : u32, // sEqProp: noise_scale > 0 injects per-iter per-neuron Gaussian-ish noise into drive. // Bio-faithful (real synapses are stochastic). At test, run M passes & average outputs. clamp_lo : f32, clamp_hi : f32, _p_t1 : f32, _p_t2 : f32, // Tier A — pre-σ drive clamp (algorithmic, uniform-driven). Bounds the pre-activation // value c before σ(c) to prevent saturation runaway. When clamp_hi <= clamp_lo the // kernel treats it as DISABLED (no-op). Default in caller = clamp_lo=clamp_hi=0 → disabled. 
}; @group(0) @binding(0) var p : P; @group(0) @binding(1) var Win : array; @group(0) @binding(2) var W0 : array; // [no x ni] @group(0) @binding(3) var b0 : array; // [no] @group(0) @binding(4) var W1 : array; // [nxt x no] (top-down) @group(0) @binding(5) var Uh : array; // [B*no] @group(0) @binding(6) var Uo : array; // [B*nxt] @group(0) @binding(7) var Tgt : array; // [B*no] // HPSN: heterogeneous time constants — per-neuron multiplier on drive integration. // Tau[i] replaces the global p.dt. Constant Tau[i]=p.dt → behavior identical to scalar-dt EqProp. // Sampled from Uniform[tau_min, tau_max] → diverse temporal scales like real cortical neurons. @group(0) @binding(8) var Tau : array; // [no] const A1 : f32 = 0.07407407407407407; const PRISM_K : f32 = 10.0; // sharpness; higher k → harder clip // PCG-style cheap hash → uniform [0,1). Per-thread, per-iter, per-neuron stochasticity. fn pcg_hash(seed_in: u32) -> u32 { var state : u32 = seed_in * 747796405u + 2891336453u; let word : u32 = ((state >> ((state >> 28u) + 4u)) ^ state) * 277803737u; return (word >> 22u) ^ word; } fn unif_noise(b: u32, i: u32, t: u32) -> f32 { // Triangular distribution (sum of 2 uniforms - 1) ≈ Gaussian-ish, mean=0, variance=1/6. let h1 = pcg_hash(b * 65537u + i * 257u + t * 31u); let h2 = pcg_hash(b * 31337u + i * 1031u + t * 17u + 12345u); let u1 = f32(h1) / 4294967296.0; let u2 = f32(h2) / 4294967296.0; return (u1 + u2) - 1.0; // range [-1, 1], roughly triangular } fn sg(u: f32) -> f32 { return 1.0 / (1.0 + exp(-4.0 * (u - 0.5))); } fn fhn_rho(u: f32) -> f32 { return clamp(u, 0.0, 1.0); } fn fhn_rho_p(u: f32) -> f32 { return select(0.0, 1.0, u > 0.0 && u < 1.0); } fn fhn_f(u: f32) -> f32 { return A1 * u - u*u*u; } // PRISM activation: ρ(u) = (softplus(k·u) - softplus(k·(u-1))) / k // Smooth approximation of clip(u,0,1). Derivative: σ(k·u) - σ(k·(u-1)). // "Prism" = splits drive into a smooth-yet-saturating activation with gradient flow on both sides. 
fn softplus(x: f32) -> f32 { return select(x + log(1.0 + exp(-x)), log(1.0 + exp(x)), x <= 0.0); } fn prism_rho(u: f32) -> f32 { return (softplus(PRISM_K * u) - softplus(PRISM_K * (u - 1.0))) / PRISM_K; } fn prism_rho_p(u: f32) -> f32 { let k = PRISM_K; return 1.0/(1.0+exp(-k*u)) - 1.0/(1.0+exp(-k*(u-1.0))); } fn rho(u: f32) -> f32 { if (p.mode > 1.5) { return prism_rho(u); } if (p.mode > 0.5) { return fhn_rho(u); } return sg(u); } @compute @workgroup_size(8, 8) fn pass_layer(@builtin(global_invocation_id) gid: vec3) { let b = gid.y; let i = gid.x; if (b >= p.B || i >= p.no) { return; } // bottom-up: c = b0[i] + sum_j W0[i,j] * rho(Win[b,j]) var c : f32 = b0[i]; let row0 = i * p.ni; let xoff = b * p.ni; for (var j: u32 = 0u; j < p.ni; j = j + 1u) { c = c + W0[row0 + j] * rho(Win[xoff + j]); } // top-down: gamma * sum_k W1[k,i] * rho(Uo[b,k]) (if next layer exists) if (p.has_topdown != 0u) { var td : f32 = 0.0; let uo_off = b * p.nxt; for (var k: u32 = 0u; k < p.nxt; k = k + 1u) { td = td + W1[k * p.no + i] * rho(Uo[uo_off + k]); } if (p.mode > 0.5) { c = c + td; } else { c = c + p.gamma * td; } } // Tier A — pre-σ drive clamp (algorithmic). Active iff clamp_hi > clamp_lo. if (p.clamp_hi > p.clamp_lo) { c = clamp(c, p.clamp_lo, p.clamp_hi); } let idx = b * p.no + i; let u_old = Uh[idx]; // sEqProp noise: per-(b, i, iter_seed) triangular noise added to drive. Zero by default. let noise = select(0.0, p.noise_scale * unif_noise(b, i, p.iter_seed), p.noise_scale > 0.0); var u_new : f32; if (p.mode > 1.5) { // PRISM: u̇ = ρ'(u)·c + (linear pull) ; ρ-nudge for output. Smooth saturating dynamics. 
var drive : f32 = prism_rho_p(u_old) * c - 0.1 * (u_old - 0.5) + noise; if (p.has_target != 0u && p.beta != 0.0) { drive = drive + p.beta * (Tgt[idx] - prism_rho(u_old)); } u_new = u_old + Tau[i] * drive; u_new = clamp(u_new, -0.3, 1.3); } else if (p.mode > 0.5) { // FHN var drive : f32 = fhn_rho_p(u_old) * c + fhn_f(u_old) + noise; if (p.has_target != 0u && p.beta != 0.0) { drive = drive + p.beta * (Tgt[idx] - fhn_rho(u_old)); } u_new = u_old + Tau[i] * drive; u_new = clamp(u_new, -0.2, 1.2); } else { // Adaptive var drive : f32 = -u_old + sg(c) + noise; if (p.has_target != 0u && p.beta != 0.0) { drive = drive + p.beta * (Tgt[idx] - u_old); } u_new = u_old + Tau[i] * drive; } Uh[idx] = u_new; } // 2D dispatch to handle big buffers (B*no can exceed the 65535 per-dim workgroup limit). @compute @workgroup_size(64) fn init_state(@builtin(global_invocation_id) gid: vec3) { let stride = 65535u * 64u; // workgroups_per_X * threads_per_workgroup let g = gid.y * stride + gid.x; let n = p.B * p.no; if (g < n) { Uh[g] = 0.1; } } `; // ----- WGSL: gradient (one kernel handles any layer transition) ----- const WGSL_GRAD = ` struct GP { ni : u32, no : u32, _pad : u32, B : u32, c : f32, two_beta : f32, mode_pre : f32, mode_post : f32, // mode_pre/post: 0=σ, 1=clip, 2=identity }; @group(0) @binding(0) var p : GP; @group(0) @binding(1) var UpreP : array; // [B*ni] - "input" layer state, plus phase @group(0) @binding(2) var UpreM : array; // [B*ni] - minus @group(0) @binding(3) var UpostP: array; // [B*no] @group(0) @binding(4) var UpostM: array; // [B*no] @group(0) @binding(5) var R : array; // [B] @group(0) @binding(6) var gW : array; // [no*ni] @group(0) @binding(7) var gB : array; // [no] fn sg(u: f32) -> f32 { return 1.0 / (1.0 + exp(-4.0 * (u - 0.5))); } const PRISM_K2 : f32 = 10.0; fn softplus2(x: f32) -> f32 { return select(x + log(1.0 + exp(-x)), log(1.0 + exp(x)), x <= 0.0); } fn prism_rho_g(u: f32) -> f32 { return (softplus2(PRISM_K2*u) - softplus2(PRISM_K2*(u-1.0))) / 
PRISM_K2; } fn rho_mode(u: f32, m: f32) -> f32 { if (m > 2.5) { return u; } // identity (linear) if (m > 1.5) { return prism_rho_g(u); } // prism soft-clip if (m > 0.5) { return clamp(u, 0.0, 1.0); } // hard-clip (FHN) return sg(u); // σ (adaptive) } @compute @workgroup_size(8, 8) fn grad_W(@builtin(global_invocation_id) gid: vec3) { let i = gid.y; let j = gid.x; if (i >= p.no || j >= p.ni) { return; } var acc : f32 = 0.0; for (var b: u32 = 0u; b < p.B; b = b + 1u) { let rh = R[b]; let ip = rho_mode(UpostP[b * p.no + i], p.mode_post); let im = rho_mode(UpostM[b * p.no + i], p.mode_post); let jp = rho_mode(UpreP[b * p.ni + j], p.mode_pre); let jm = rho_mode(UpreM[b * p.ni + j], p.mode_pre); acc = acc + rh * (ip * jp - im * jm); } gW[i * p.ni + j] = acc / p.two_beta; } @compute @workgroup_size(64) fn grad_B(@builtin(global_invocation_id) gid: vec3) { let i = gid.x; if (i >= p.no) { return; } var acc : f32 = 0.0; for (var b: u32 = 0u; b < p.B; b = b + 1u) { let rh = R[b]; let ip = rho_mode(UpostP[b * p.no + i], p.mode_post); let im = rho_mode(UpostM[b * p.no + i], p.mode_post); acc = acc + rh * (ip - im); } gB[i] = acc / p.two_beta; } `; // ----- WGSL: reward + adaptation (depends on output layer state) ----- const WGSL_AUX = ` struct AP { B : u32, O : u32, H_max : u32, n_hidden : u32, c : f32, mode : f32, _p0 : f32, _p1 : f32, }; @group(0) @binding(0) var p : AP; @group(0) @binding(1) var UoF : array; @group(0) @binding(2) var Tgt : array; @group(0) @binding(3) var R : array; // adaptation buffers (variable size; we pass single layer at a time via separate bind groups) @group(0) @binding(4) var Uf : array; @group(0) @binding(5) var Up : array; @group(0) @binding(6) var Um : array; fn sg(u: f32) -> f32 { return 1.0 / (1.0 + exp(-4.0 * (u - 0.5))); } fn rho_out(u: f32) -> f32 { if (p.mode > 0.5) { return clamp(u, 0.0, 1.0); } return sg(u); } @compute @workgroup_size(64) fn compute_reward(@builtin(global_invocation_id) gid: vec3) { let b = gid.x; if (b >= p.B) { return; 
} var loss : f32 = 0.0; let off = b * p.O; for (var i: u32 = 0u; i < p.O; i = i + 1u) { let d = rho_out(UoF[off + i]) - Tgt[off + i]; loss = loss + d * d; } let escale : f32 = 0.4; let rmin : f32 = 0.1; var r : f32 = loss / escale; if (r > 1.0) { r = 1.0; } R[b] = rmin + (1.0 - rmin) * r; } // Adjusted Adaptation per layer: Up,Um ← (1-c)*Up + c*Uf. 2D dispatch safe for large buffers. @compute @workgroup_size(64) fn adapt_layer(@builtin(global_invocation_id) gid: vec3) { let stride = 65535u * 64u; let g = gid.y * stride + gid.x; if (g >= arrayLength(&Uf)) { return; } let f = Uf[g]; Up[g] = (1.0 - p.c) * Up[g] + p.c * f; Um[g] = (1.0 - p.c) * Um[g] + p.c * f; } `; // ----- JS: trainer class ----- export async function makeGPUDeep({powerPreference='high-performance'}={}){ if(!navigator.gpu) throw new Error('no webgpu'); const adapter = await navigator.gpu.requestAdapter({powerPreference}); if(!adapter) throw new Error('no adapter'); const want = {}; const tryKeys = ['maxStorageBuffersPerShaderStage','maxBufferSize','maxStorageBufferBindingSize', 'maxComputeInvocationsPerWorkgroup','maxComputeWorkgroupSizeX','maxComputeWorkgroupStorageSize','maxBindGroups']; for(const k of tryKeys){ const v=adapter.limits[k]; if(typeof v==='number') want[k]=v; } const dev = await adapter.requestDevice({requiredLimits: want}); return {adapter, dev, info: adapter.info||{}}; } const PHASE_F = 0, PHASE_P = 1, PHASE_M = 2; export class GPUTrainerDeep { // sizes: [D, H1, H2, ..., Hk, O] — len L+1; L = number of weight matrices = sizes.length-1 // mode: 'adaptive' | 'fhn' | 'prism' // driveClampLo, driveClampHi: Tier A — pre-σ drive clamp; ACTIVE iff hi > lo. Default 0,0 = disabled. 
constructor({dev, sizes, B, mode='adaptive', gamma=0.6, hpsnTauMin=0, hpsnTauMax=0, hpsnSeed=42, driveClampLo=0, driveClampHi=0}={}){ this.dev = dev; this.sizes = sizes; this.L = sizes.length - 1; // number of weight matrices (transitions) this.B = B; this.O = sizes[sizes.length-1]; this.mode = mode; this.modeFlag = (mode==='prism') ? 2.0 : (mode==='fhn' ? 1.0 : 0.0); this.gamma = gamma; this.hpsnTauMin = hpsnTauMin; this.hpsnTauMax = hpsnTauMax; this.hpsnSeed = hpsnSeed; this.useHPSN = (hpsnTauMax > hpsnTauMin && hpsnTauMin > 0); this.driveClampLo = driveClampLo; this.driveClampHi = driveClampHi; this._build(); // Initialize Tau buffers — either constant=0.7 (backward compat) or per-neuron Uniform[hpsnTauMin, hpsnTauMax]. if(this.useHPSN){ this.setAllTau(0.7, hpsnTauMin, hpsnTauMax, hpsnSeed); } else { this.setAllTau(0.7); } } _F32buf(n, usage){ if(!Number.isFinite(n) || n <= 0){ console.error('BAD _F32buf size', {n, sizes:JSON.stringify(this.sizes), sizesArr:this.sizes, B:this.B, L:this.L, S0:this.sizes&&this.sizes[0], S0type:typeof (this.sizes&&this.sizes[0])}); throw new Error('_F32buf called with non-finite n=' + n + ' sizes=' + JSON.stringify(this.sizes)); } return this.dev.createBuffer({size:Math.max(4,n*4), usage}); } _build(){ const dev = this.dev, S = this.sizes, B = this.B, L = this.L; const RW = GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST; const R = GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST|GPUBufferUsage.COPY_SRC; const UNI = GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST; const RDS = GPUBufferUsage.COPY_DST|GPUBufferUsage.MAP_READ; // input + target (shared across phases) this.bufWin = this._F32buf(B * S[0], R); this.bufTgt = this._F32buf(B * S[L], R); // weights & biases (one per transition) this.bufW = []; this.bufB = []; for(let l=0; l({binding:i, visibility:GPUShaderStage.COMPUTE, buffer:{type:'read-only-storage'}}); const sRW= (i)=>({binding:i, visibility:GPUShaderStage.COMPUTE, buffer:{type:'storage'}}); const uN 
= (i)=>({binding:i, visibility:GPUShaderStage.COMPUTE, buffer:{type:'uniform'}}); this.bglR = dev.createBindGroupLayout({entries:[uN(0), sR(1), sR(2), sR(3), sR(4), sRW(5), sR(6), sR(7), sR(8)]}); this.plR = dev.createPipelineLayout({bindGroupLayouts:[this.bglR]}); this.pipeRelax = dev.createComputePipeline({layout:this.plR, compute:{module:modR, entryPoint:'pass_layer'}}); this.pipeInit = dev.createComputePipeline({layout:this.plR, compute:{module:modR, entryPoint:'init_state'}}); // Grad pipeline (generic) const modG = dev.createShaderModule({code: WGSL_GRAD}); this.bglG = dev.createBindGroupLayout({entries:[uN(0), sR(1), sR(2), sR(3), sR(4), sR(5), sRW(6), sRW(7)]}); this.plG = dev.createPipelineLayout({bindGroupLayouts:[this.bglG]}); this.pipeGW = dev.createComputePipeline({layout:this.plG, compute:{module:modG, entryPoint:'grad_W'}}); this.pipeGB = dev.createComputePipeline({layout:this.plG, compute:{module:modG, entryPoint:'grad_B'}}); // Aux pipeline (reward + adaptation) const modA = dev.createShaderModule({code: WGSL_AUX}); this.bglA = dev.createBindGroupLayout({entries:[uN(0), sR(1), sR(2), sRW(3), sR(4), sRW(5), sRW(6)]}); this.plA = dev.createPipelineLayout({bindGroupLayouts:[this.bglA]}); this.pipeReward = dev.createComputePipeline({layout:this.plA, compute:{module:modA, entryPoint:'compute_reward'}}); this.pipeAdapt = dev.createComputePipeline({layout:this.plA, compute:{module:modA, entryPoint:'adapt_layer'}}); // ---- uniform buffers (one per (phase, layer)) ---- // Each entry: 64 bytes (16 u32/f32 slots, matching struct P in WGSL_RELAX) this.bufP_relax = []; for(let phase=0; phase<3; phase++){ this.bufP_relax.push([]); for(let l=1; l<=L; l++){ this.bufP_relax[phase].push(dev.createBuffer({size: 64, usage: UNI})); } } // init uniform (one per layer, beta=0) this.bufP_init = []; for(let l=1; l<=L; l++) this.bufP_init.push(dev.createBuffer({size: 64, usage: UNI})); // Grad uniform (one per layer transition) this.bufP_grad = []; for(let l=0; l1 else bufWin // W0 = 
bufB[l-1] // W1 = bufW[l] (top-down weights to layer l+1), used if l0: Pre = state[l-1] (sizes[l]), Post = state[l] (sizes[l+1]) this.bgG = []; for(let l=0; l tauMin > 0, samples Uniform[tauMin, tauMax]. // Otherwise fills with constant scalarTau (backward-compat with old fixed-dt EqProp). setTau(layerIdx, scalarTau, tauMin=0, tauMax=0, seed=42){ const no = this.sizes[layerIdx+1]; const arr = new Float32Array(no); if(tauMax > tauMin && tauMin > 0){ // Deterministic LCG for reproducible per-neuron tau distribution. let s = (seed>>>0) || 1; const rng = ()=>{ s = (Math.imul(s, 1664525) + 1013904223) >>> 0; return s/4294967296; }; for(let i=0;i 0 enables sEqProp; seedBase is added to iteration counter for per-call variation. _writeAllUniformsForPass(dt, beta, noiseScale=0, seedBase=0){ const S=this.sizes, L=this.L, B=this.B, gam=this.gamma, mf=this.modeFlag; const phaseBetas = [0, +beta, -beta]; // free, plus, minus const ns = (typeof this.noiseScale === 'number') ? this.noiseScale : noiseScale; const sb = (typeof this.noiseSeedBase === 'number') ? this.noiseSeedBase : seedBase; const cLo = this.driveClampLo || 0; const cHi = this.driveClampHi || 0; // Relax uniforms (per phase, per layer). iter_seed is incremented per call below. for(let phase=0; phase<3; phase++){ for(let l=1; l<=L; l++){ const isOut = (l === L); const isHid = !isOut; const ni = S[l-1], no = S[l], nxt = isHid ? S[l+1] : 0; const phaseBeta = (isOut) ? phaseBetas[phase] : 0; this._writeRelaxParams(this.bufP_relax[phase][l-1], { ni, no, nxt, B, dt, beta: phaseBeta, gamma: gam, mode: mf, has_topdown: isHid ? 1 : 0, has_target: isOut ? 
1 : 0, noise_scale: ns, iter_seed: (sb + phase * 7919 + (l-1) * 1009) >>> 0, clamp_lo: cLo, clamp_hi: cHi, }); } } for(let l=1; l<=L; l++){ this._writeRelaxParams(this.bufP_init[l-1], { ni: S[l-1], no: S[l], nxt: 0, B, dt, beta: 0, gamma: gam, mode: mf, has_topdown: 0, has_target: 0, noise_scale: 0, iter_seed: 0, // init kernel doesn't use noise clamp_lo: 0, clamp_hi: 0, // init kernel doesn't run drive — clamp irrelevant }); } } // Tier A — runtime setter for drive clamp. Pass (0,0) to disable. setDriveClamp(lo, hi){ this.driveClampLo = lo; this.driveClampHi = hi; } // sEqProp: set per-pass noise scale and seed base. Call before runFreeAndReadOutputs / runOnePass. setSEqPropNoise(noiseScale, seedBase){ this.noiseScale = noiseScale; this.noiseSeedBase = (seedBase >>> 0) || 0; } _runReward(enc){ this._writeAuxParams(this.bufP_rew, {B: this.B, O: this.O, c: 0, mode: this.modeFlag}); const pass = enc.beginComputePass(); pass.setPipeline(this.pipeReward); pass.setBindGroup(0, this.bgRew); pass.dispatchWorkgroups(Math.ceil(this.B/64)); pass.end(); } _runAdaptation(enc, adpC, adpSteps){ if(adpSteps <= 0 || this.mode === 'fhn') return; // skip adaptation in FHN mode const L = this.L; for(let l=1; l<=L; l++){ this._writeAuxParams(this.bufP_adapt[l-1], {B: this.B, O: this.O, c: adpC, mode: this.modeFlag}); } const MAX_WG_X = 65535; for(let a=0; a this.L) throw new Error('layerIdx out of range'); this._writeAllUniformsForPass(dt, 0); const enc = this.dev.createCommandEncoder(); this._initAllPhases(enc); this._runPhaseRelax(enc, PHASE_F, iters); const size = this.B * this.sizes[layerIdx] * 4; const rb = this.dev.createBuffer({size, usage: GPUBufferUsage.COPY_DST|GPUBufferUsage.MAP_READ}); enc.copyBufferToBuffer(this.bufU[PHASE_F][layerIdx-1], 0, rb, 0, size); this.dev.queue.submit([enc.finish()]); await rb.mapAsync(GPUMapMode.READ); const r = new Float32Array(rb.getMappedRange().slice(0)); rb.unmap(); rb.destroy?.(); return r; } async runOnePassGetGradients({itF=8, itN=5, 
dt=0.7, beta=0.5, adpC=0.15, adpSteps=3}={}){ if(this.mode === 'fhn') adpSteps = 0; // HPSN backward-compat: when not using heterogeneous-τ, refresh Tau to match runtime dt. // When useHPSN=true, the user-set heterogeneous distribution is preserved (Tau not overwritten). if(!this.useHPSN){ if(this._lastTauDt !== dt){ this.setAllTau(dt); this._lastTauDt = dt; } } this._writeAllUniformsForPass(dt, beta); const enc = this.dev.createCommandEncoder(); this._initAllPhases(enc); this._runPhaseRelax(enc, PHASE_F, itF); this._runPhaseRelax(enc, PHASE_P, itN); this._runPhaseRelax(enc, PHASE_M, itN); this._runReward(enc); this._runAdaptation(enc, adpC, adpSteps); this._runGrad(enc, beta); // Readback all gradients (separate buffers per layer) + Uo_free for(let l=0; l0?this.MB[l][k]/bn:0); } endBatch(){ this.bc++; } } // Adam (with optional weight decay → AdamW). EqProp gives an ascent direction, so we += step. export class Adam { constructor(sizes, {beta1=0.9, beta2=0.999, eps=1e-8, weightDecay=0}={}){ this.sizes=sizes; this.L=sizes.length-1; this.beta1=beta1; this.beta2=beta2; this.eps=eps; this.wd = weightDecay; // AdamW-style decoupled weight decay (applied to W only, not bias) this.mW=[]; this.vW=[]; this.mB=[]; this.vB=[]; this.t=0; for(let l=0; l + m̂/√v̂ if(wd > 0) W[k] *= (1 - lr * wd); W[k] += lr * m_hat / (Math.sqrt(v_hat) + eps); } for(let k=0;km, transpose. const transp = (n > m); let R = m, C = n; if(transp){ // swap to make R ≥ C const T = new Float64Array(n*m); for(let i=0;i0) W[k] *= (1 - lr*wd); W[k] += scale * O[k]; } // Bias: plain momentum (Muon spec says biases get separate Adam-like; here just SGD-momentum for simplicity) for(let k=0;k= 0 ? 1 : -1; if(wd>0) W[k] *= (1 - lr*wd); W[k] += lr * u; // momentum update with b2 this.mW[l][k] = b2*this.mW[l][k] + (1-b2)*g; } for(let k=0;k= 0 ? 1 : -1; B[k] += lr * u; this.mB[l][k] = b2*this.mB[l][k] + (1-b2)*g; } } endBatch(){} }