| | """ |
| | MonoidForCausalLM โ Causal Monoid Language Model (HuggingFace Compatible) |
| | MonoidForCausalLM โ ๅนบๅ็พคๅ ๆ่ฏญ่จๆจกๅ (ๅ
ผๅฎน HuggingFace) |
| | |
| | Architecture / ๆถๆๆฆ่ฆ: |
| | Replace softmax attention with a monoid parallel-scan recurrence. |
| | ็จๅนบๅ็พคๅนถ่กๆซๆ้ๆจๆฟไปฃ softmax ๆณจๆๅใ |
| | |
| | Core idea / ๆ ธๅฟๆๆณ: |
| | Softmax attention computes o_t = ฮฃ_{iโคt} softmax(q_tยทk_i) v_i |
| | โ requires O(T) KV-cache per layer at inference. |
| | Softmax ๆณจๆๅ่ฎก็ฎ o_t = ฮฃ_{iโคt} softmax(q_tยทk_i) v_i |
| | โ ๆจ็ๆถๆฏๅฑ้่ฆ O(T) ็ KV ็ผๅญใ |
| | |
| | Monoid attention compresses the entire causal history into a |
| | fixed-size state matrix S_t โ โ^{dรd} per head: |
| | S_t = diag(ฮฑ_t) ยท S_{t-1} + k_t โ v_t (vector decay recurrence) |
| | o_t = q_t ยท S_t (state readout) |
| | where ฮฑ_t โ โ^d is a per-dimension vector decay gate. |
| | ๅนบๅ็พคๆณจๆๅๅฐๅฎๆดๅ ๆๅๅฒๅ็ผฉๅฐๆฏไธชๅคดไธไธชๅบๅฎๅคงๅฐ็็ถๆ็ฉ้ต S_t: |
| | S_t = diag(ฮฑ_t) ยท S_{t-1} + k_t โ v_t (ๅ้่กฐๅ้ๆจ) |
| | o_t = q_t ยท S_t (็ถๆ่ฏปๅบ) |
| | ๅ
ถไธญ ฮฑ_t โ โ^d ๆฏ้็ปดๅบฆ็ๅ้่กฐๅ้จใ |
| | |
| | This is a monoid because the binary operator: |
| | (ฮฑ, S) โ (ฮฒ, X) = (ฮฑยทฮฒ, diag(ฮฒ)ยทS + X) |
| | is associative โ enables parallel prefix scan for training, |
| | and O(1) sequential update for inference. |
| | ่ฟๆฏไธไธชๅนบๅ็พค๏ผๅ ไธบไบๅ
็ฎๅญ: |
| | (ฮฑ, S) โ (ฮฒ, X) = (ฮฑยทฮฒ, diag(ฮฒ)ยทS + X) |
| | ๆปก่ถณ็ปๅๅพ โ ่ฎญ็ปๆถๅฏ็จๅนถ่กๅ็ผๆซๆ๏ผๆจ็ๆถ O(1) ้ๆญฅ้ๆจใ |
| | |
| | Key properties / ๅ
ณ้ฎ็นๆง: |
| | โ Explicit causal modeling โ ฮฑ_t gate explicitly controls how fast |
| | past information decays, making causality a first-class citizen. |
| | ๆพๅผๅ ๆๅปบๆจก โ ฮฑ_t ่กฐๅ้จๆพๅผๆงๅถๅๅฒไฟกๆฏ็้ๅฟ้็๏ผ |
| | ๅ ๆๆงๆฏไธ็ญๅ
ฌๆฐ่้้ mask ๆฝๅ ็็บฆๆใ |
| | |
| | โ Monoid state compression โ the full causal prefix x_{1:t} is |
| | lossily compressed into a fixed-size (dรd) state matrix per head. |
| | No O(T) KV-cache needed; inference is O(1) per token per layer. |
| | ๅนบๅ็พค็ถๆๅ็ผฉ โ ๅฎๆดๅ ๆๅ็ผ x_{1:t} ่ขซๆๆๅ็ผฉๅฐๆฏไธชๅคด |
| | ๅบๅฎๅคงๅฐ็ (dรd) ็ถๆ็ฉ้ตไธญใๆ ้ O(T) KV ็ผๅญ๏ผ |
| | ๆจ็ๆถๆฏๅฑๆฏ token O(1)ใ |
| | |
| | โ Parallel training โ associativity of โ enables O(T) parallel |
| | prefix scan (vs O(Tยฒ) for softmax attention). |
| | ๅนถ่ก่ฎญ็ป โ โ ็็ปๅๅพไฝฟ O(T) ๅนถ่กๅ็ผๆซๆๆไธบๅฏ่ฝ |
| | (ๅฏนๆฏ softmax ๆณจๆๅ็ O(Tยฒ))ใ |
| | |
| | Reuses LlamaMLP + LlamaRMSNorm from HuggingFace Transformers. |
| | ๅค็จ HuggingFace Transformers ็ LlamaMLP + LlamaRMSNormใ |
| | """ |

from __future__ import annotations

from typing import Optional, Union

import torch
import torch.nn as nn
from torch import Tensor

from transformers import PretrainedConfig, PreTrainedModel, GenerationMixin, AutoConfig, AutoModelForCausalLM
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm

try:
    from monoid_scan_cuda import parallel_scan, parallel_scan_with_state
except ImportError:
    # CUDA kernel unavailable: fall back to a pure-PyTorch sequential scan.
|
    def parallel_scan(alpha: Tensor, kv: Tensor) -> Tensor:
        """Sequential prefix-scan fallback: S_t[i, :] = α_t[i]·S_{t-1}[i, :] + kv_t[i, :]."""
        B, H, T, d1, d2 = kv.shape
        states = torch.zeros(B, H, T, d1, d2, device=kv.device, dtype=kv.dtype)
        S = torch.zeros(B, H, d1, d2, device=kv.device, dtype=kv.dtype)
        for t in range(T):
            decay = alpha[:, :, t]
            while decay.dim() < S.dim():
                decay = decay.unsqueeze(-1)
            S = S * decay + kv[:, :, t]
            states[:, :, t] = S
        return states

    def parallel_scan_with_state(alpha: Tensor, kv: Tensor):
        """Sequential prefix scan that also returns the final (decay_acc, S) state."""
        B, H, T, d1, d2 = kv.shape
        states = torch.zeros(B, H, T, d1, d2, device=kv.device, dtype=kv.dtype)
        S = torch.zeros(B, H, d1, d2, device=kv.device, dtype=kv.dtype)
        decay_acc = torch.ones(B, H, d1, device=alpha.device, dtype=alpha.dtype)
        for t in range(T):
            decay = alpha[:, :, t]
            while decay.dim() < S.dim():
                decay = decay.unsqueeze(-1)
            S = S * decay + kv[:, :, t]
            states[:, :, t] = S
            decay_acc = decay_acc * alpha[:, :, t]
        return states, (decay_acc, S)
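

# A minimal sketch (illustrative only, not used by the model) of the composition
# property the scan relies on: scanning a sequence in one pass matches scanning two
# chunks and stitching them together with the second chunk's cumulative decay.
# Assumes the pure-PyTorch fallback above (or a kernel that accepts CPU tensors).
def _chunked_scan_sketch(T: int = 8, split: int = 3, d: int = 4) -> float:
    B, H = 1, 1
    alpha = torch.sigmoid(torch.randn(B, H, T, d))                 # per-dimension decay gates in (0, 1)
    kv = torch.randn(B, H, T, d, d)                                # per-step k_t ⊗ v_t outer products

    full = parallel_scan(alpha, kv)                                # scan the whole sequence at once

    left = parallel_scan(alpha[:, :, :split], kv[:, :, :split])    # scan chunk 1 from S = 0
    right = parallel_scan(alpha[:, :, split:], kv[:, :, split:])   # scan chunk 2 from S = 0
    cum = torch.cumprod(alpha[:, :, split:], dim=2).unsqueeze(-1)  # cumulative decay inside chunk 2
    stitched = right + cum * left[:, :, -1:]                       # carry chunk 1's final state forward

    return (full[:, :, split:] - stitched).abs().max().item()     # ~0 up to floating-point error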
|
class MonoidConfig(PretrainedConfig):
    """
    Configuration for the Monoid causal language model.

    Mirrors LlamaConfig for the shared components (MLP, RMSNorm, embedding)
    so that weights can be transferred directly from Llama checkpoints.
    """
    model_type = "monoid"

    def __init__(
        self,
        vocab_size: int = 32000,
        hidden_size: int = 576,
        intermediate_size: int = 1536,
        num_hidden_layers: int = 30,
        num_attention_heads: int = 9,
        head_dim: int = 64,
        max_position_embeddings: int = 2048,
        rms_norm_eps: float = 1e-5,
        hidden_act: str = "silu",
        mlp_bias: bool = False,
        attention_bias: bool = False,
        tie_word_embeddings: bool = True,
        initializer_range: float = 0.041666666666666664,
        pad_token_id: Optional[int] = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.max_position_embeddings = max_position_embeddings
        self.rms_norm_eps = rms_norm_eps
        self.hidden_act = hidden_act
        self.mlp_bias = mlp_bias
        self.attention_bias = attention_bias
        self.initializer_range = initializer_range
|
class MonoidCache:
    """
    Per-layer monoid state cache for autoregressive inference.

    Unlike a Transformer KV-cache, which stores all past keys and values
    (O(T) memory), each layer here stores exactly ONE state tuple:
        (decay_acc, S)  where  S ∈ ℝ^{B×H×d×d}
    This is the monoid "sum" of all past (α_i, k_i ⊗ v_i) under ⊕.
    Memory is O(1) per layer regardless of sequence length.
    """

    def __init__(self):
        self.states: list[tuple[Tensor, Tensor] | None] = []
        self.seen_tokens: int = 0

    def get_seq_length(self, layer_idx: int = 0) -> int:
        return self.seen_tokens

    def update(self, layer_idx: int, state: tuple[Tensor, Tensor]):
        """Store the accumulated monoid state for a given layer."""
        while len(self.states) <= layer_idx:
            self.states.append(None)
        self.states[layer_idx] = state

    def get_state(self, layer_idx: int) -> tuple[Tensor, Tensor] | None:
        """Retrieve the accumulated monoid state for a given layer."""
        if layer_idx < len(self.states):
            return self.states[layer_idx]
        return None

    def reorder_cache(self, beam_idx: torch.LongTensor):
        """Reorder the cached states along the batch dimension for beam search."""
        for i, state in enumerate(self.states):
            if state is not None:
                log_d, kv = state
                self.states[i] = (log_d[beam_idx], kv[beam_idx])
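

# A minimal usage sketch (illustrative only, not part of the model): no matter how many
# tokens are folded in, the cache keeps a single fixed-size (decay_acc, S) tuple per layer.
# Shapes and the inline recurrence below mirror MonoidAttention's decode path.
def _monoid_cache_sketch(steps: int = 100, H: int = 2, d: int = 4) -> MonoidCache:
    cache = MonoidCache()
    decay_acc = torch.ones(1, H, d)
    S = torch.zeros(1, H, d, d)
    for _ in range(steps):
        alpha = torch.sigmoid(torch.randn(1, H, d))                       # decay gate α_t
        k, v = torch.randn(1, H, d), torch.randn(1, H, d)
        S = alpha.unsqueeze(-1) * S + torch.einsum('bhd, bhe -> bhde', k, v)  # S_t = diag(α_t)·S_{t-1} + k_t ⊗ v_t
        decay_acc = decay_acc * alpha
        cache.update(0, (decay_acc, S))                                   # overwrite, never append
        cache.seen_tokens += 1
    return cache  # len(cache.states) == 1 for layer 0, independent of `steps`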
|
def monoid_op(
    a: tuple[Tensor, Tensor],
    b: tuple[Tensor, Tensor],
) -> tuple[Tensor, Tensor]:
    """
    The monoid binary operator ⊕ on (vector decay, state matrix) pairs.

    Definition:
        (α, S) ⊕ (β, X) = (α·β, diag(β)·S + X)
    where α, β ∈ (0,1)^d are per-dimension vector decay gates (sigmoid outputs).

    Why this is a monoid:
        • Associativity: (a ⊕ b) ⊕ c = a ⊕ (b ⊕ c).
          This enables a parallel prefix scan for training (reduction tree)
          and an O(1) left fold for inference (sequential append).
        • Identity: e = (1, 0), with e ⊕ a = a ⊕ e = a.

    Causal semantics:
        S_t = α_t · S_{t-1} + k_t ⊗ v_t
        The decay α_t ∈ (0,1) explicitly controls how much of the past the
        model retains. This is *explicit causal modeling*: the model must
        learn to balance retention vs. novelty at every timestep.
    """
    decay_a, kv_a = a
    decay_b, kv_b = b

    new_decay = decay_a * decay_b
    while decay_b.dim() < kv_a.dim():
        decay_b = decay_b.unsqueeze(-1)

    return new_decay, kv_a * decay_b + kv_b
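

# A small sketch (illustrative only) of why associativity matters: combining
# (α_t, k_t ⊗ v_t) elements with ⊕ gives the same result whether they are folded
# sequentially left-to-right (the inference order) or pairwise in a balanced tree
# (the shape of a parallel scan's reduction), which is what licenses O(T) training.
def _monoid_fold_sketch(T: int = 6, d: int = 4) -> float:
    elems = []
    for _ in range(T):
        alpha = torch.sigmoid(torch.randn(1, 1, d))                        # decay gate in (0, 1)
        k, v = torch.randn(1, 1, d), torch.randn(1, 1, d)
        elems.append((alpha, torch.einsum('bhd, bhe -> bhde', k, v)))      # (α_t, k_t ⊗ v_t)

    # Sequential left fold: (((e_1 ⊕ e_2) ⊕ e_3) ⊕ ...)
    left = elems[0]
    for e in elems[1:]:
        left = monoid_op(left, e)

    # Balanced fold: combine adjacent pairs repeatedly, preserving order.
    level = elems
    while len(level) > 1:
        nxt = [monoid_op(level[i], level[i + 1]) for i in range(0, len(level) - 1, 2)]
        if len(level) % 2 == 1:
            nxt.append(level[-1])
        level = nxt
    balanced = level[0]

    return (left[1] - balanced[1]).abs().max().item()  # ~0 up to floating-point error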
|
class MonoidAttention(nn.Module):
    """
    Monoid causal attention: replaces softmax attention entirely.

    Key differences from standard attention:
    • No RoPE / positional encoding: position is implicitly encoded by the
      causal decay gate α_t. The model learns *when* to forget rather than
      encoding *where* tokens are.
    • No KV-cache: replaced by a MonoidCache holding an O(1) state per layer.
      Each state S ∈ ℝ^{H×d×d} is a compressed summary of ALL past tokens.
    • No attention mask: causality is built into the recurrence itself.
      S_t depends only on S_{t-1} and the current token by construction.

    Computation:
        Training (parallel scan, O(T)):
            k_t = SiLU(k_proj(x_t))            # gated keys for a well-behaved state
            S_t = α_t · S_{t-1} + k_t ⊗ v_t    # monoid recurrence via prefix scan
            o_t = q_t · S_t                    # linear readout from the state

        Inference (RNN mode, O(1) per token):
            The same recurrence, applied one token at a time.
    """

    def __init__(self, config: MonoidConfig, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = config.head_dim
        self.scaling = self.head_dim ** -0.5

        # Query / key / value / output projections (same shapes as standard multi-head attention).
        self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=config.attention_bias)

        # Output gate: a SiLU gate applied to the state readout before o_proj.
        self.gate_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=False)

        # Decay gate projection: produces the per-dimension decay α_t = sigmoid(decay_proj(x_t)).
        self.decay_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=True)

        # Per-head RMS norms for queries, keys, and the state readout.
        self.q_norm = LlamaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = LlamaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.o_norm = LlamaRMSNorm(self.head_dim, eps=config.rms_norm_eps)

        # Learnable initial state h0, shared across the batch: [1, H, d, d].
        self.h0 = nn.Parameter(torch.zeros(1, self.num_heads, self.head_dim, self.head_dim))
|
    def forward(
        self,
        hidden_states: Tensor,
        attention_mask: Tensor | None = None,
        monoid_cache: MonoidCache | None = None,
        use_cache: bool = False,
    ) -> tuple[Tensor, tuple[Tensor, Tensor] | None]:
        """
        Args:
            hidden_states: [B, T, hidden_size]
            attention_mask: [B, T] with 1 = real token, 0 = padding.
                For PAD positions: α = 1 (preserve state), kv = 0 (no contribution).
            monoid_cache: O(1) state cache for inference.
            use_cache: whether to use/update the cache.

        Returns:
            output: [B, T, hidden_size]
            final_state: (decay_acc, S) or None
        """
        B, T, _ = hidden_states.shape
        H, d = self.num_heads, self.head_dim

        # Project to per-head queries, keys, values: [B, H, T, d].
        q = self.q_proj(hidden_states).view(B, T, H, d).transpose(1, 2)
        k = self.k_proj(hidden_states).view(B, T, H, d).transpose(1, 2)
        v = self.v_proj(hidden_states).view(B, T, H, d).transpose(1, 2)

        # Output gate, applied to the state readout before o_proj.
        gate = torch.nn.functional.silu(self.gate_proj(hidden_states))

        # Normalize queries and keys per head; scale queries by 1/sqrt(d).
        q = self.q_norm(q) * self.scaling
        k = self.k_norm(k)

        # SiLU keys (predominantly non-negative) keep the accumulated state well-scaled.
        k = torch.nn.functional.silu(k)

        # Per-dimension decay gate α_t ∈ (0, 1): [B, H, T, d].
        raw = self.decay_proj(hidden_states)
        alpha = torch.sigmoid(raw)
        alpha = alpha.view(B, T, H, d).transpose(1, 2)

        # Padding positions: α = 1 (state passes through unchanged), k = v = 0 (no contribution).
        if attention_mask is not None:
            mask = attention_mask[:, None, :, None].to(alpha.dtype)
            alpha = alpha * mask + (1 - mask)
            k = k * mask
            v = v * mask

        # ---- Single-token decode (RNN mode, O(1) per token) ----
        if use_cache and T == 1:
            kv_t = torch.einsum('bhd, bhe -> bhde', k[:, :, 0], v[:, :, 0])
            alpha_t = alpha[:, :, 0]

            prev = monoid_cache.get_state(self.layer_idx) if monoid_cache else None
            if prev is None:
                # First token: decay the learnable initial state h0 and add the new outer product.
                decay_t = alpha_t
                while decay_t.dim() < self.h0.dim():
                    decay_t = decay_t.unsqueeze(-1)
                new_state = (alpha_t, self.h0.expand(B, -1, -1, -1) * decay_t + kv_t)
            else:
                # Subsequent tokens: one monoid step, prev ⊕ (α_t, k_t ⊗ v_t).
                new_state = monoid_op(prev, (alpha_t, kv_t))

            if monoid_cache is not None:
                monoid_cache.update(self.layer_idx, new_state)

            # Readout: o_t = q_t · S_t.
            o = torch.einsum('bhd, bhde -> bhe', q[:, :, 0], new_state[1])
            o = self.o_norm(o)
            o = o.contiguous().view(B, 1, -1)
            return self.o_proj(gate * o), new_state

        # ---- Prefill with cache (parallel scan + final state) ----
        if use_cache:
            kv = torch.einsum('bhtd, bhte -> bhtde', k, v)
            states, (decay_acc, S_T) = parallel_scan_with_state(alpha, kv)

            # Add the decayed learnable initial state h0 to every prefix state.
            cum_alpha = torch.exp(torch.cumsum(torch.log(alpha + 1e-8), dim=2))
            h0_decay = cum_alpha.unsqueeze(-1)
            states = states + h0_decay * self.h0.unsqueeze(2)

            # Fold h0 into the final state stored in the cache.
            total_h0_decay = decay_acc.unsqueeze(-1)
            S_final = S_T + total_h0_decay * self.h0.squeeze(0)
            final_state = (decay_acc, S_final)

            if monoid_cache is not None:
                monoid_cache.update(self.layer_idx, final_state)

            # Readout: o_t = q_t · S_t for every position.
            o = torch.einsum('bhtd, bhtde -> bhte', q, states)
            o = self.o_norm(o)
            o = o.transpose(1, 2).contiguous().view(B, T, -1)
            return self.o_proj(gate * o), final_state

        # ---- Training (parallel scan over the full sequence, no cache) ----
        kv = torch.einsum('bhtd, bhte -> bhtde', k, v)
        states = parallel_scan(alpha, kv)

        # Add the decayed learnable initial state h0 to every prefix state.
        cum_alpha = torch.exp(torch.cumsum(torch.log(alpha + 1e-8), dim=2))
        h0_decay = cum_alpha.unsqueeze(-1)
        states = states + h0_decay * self.h0.unsqueeze(2)

        # Readout: o_t = q_t · S_t.
        o = torch.einsum('bhtd, bhtde -> bhte', q, states)
        o = self.o_norm(o)
        o = o.transpose(1, 2).contiguous().view(B, T, -1)
        return self.o_proj(gate * o), None
|
class MonoidDecoderLayer(nn.Module):
    """
    Pre-Norm Transformer block with Monoid attention.

    Data flow:
        x → RMSNorm → MonoidAttn → +residual → RMSNorm → LlamaMLP → +residual → out

    The MLP and RMSNorm are identical to Llama (weights transfer directly);
    only MonoidAttention is the novel component.
    """
    gradient_checkpointing = False

    def __init__(self, config: MonoidConfig, layer_idx: int):
        super().__init__()
        self.self_attn = MonoidAttention(config, layer_idx)
        self.mlp = LlamaMLP(config)
        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: Tensor,
        attention_mask: Tensor | None = None,
        monoid_cache: MonoidCache | None = None,
        use_cache: bool = False,
    ) -> Tensor:
        # Attention sub-block (pre-norm + residual).
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states, _ = self.self_attn(hidden_states, attention_mask=attention_mask, monoid_cache=monoid_cache, use_cache=use_cache)
        hidden_states = residual + hidden_states

        # MLP sub-block (pre-norm + residual).
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states
|
class MonoidPreTrainedModel(PreTrainedModel):
    config_class = MonoidConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MonoidDecoderLayer"]

    def _init_weights(self, module: nn.Module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

        if isinstance(module, MonoidAttention):
            # Bias the decay gate towards slow forgetting: sigmoid(3.0) ≈ 0.95.
            nn.init.constant_(module.decay_proj.bias, 3.0)
            # Small gate weights so the attention branch starts with a near-zero output gate.
            nn.init.normal_(module.gate_proj.weight, mean=0.0, std=0.01)
            # Identity-scale readout norm.
            nn.init.ones_(module.o_norm.weight)

| | class MonoidModel(MonoidPreTrainedModel): |
| | """ |
| | Stack of MonoidDecoderLayers with token embedding and final norm. |
| | ๅนบๅ็พค่งฃ็ ๅฑๅ ๅ , ๅธฆ token ๅตๅ
ฅๅๆ็ปๅฝไธๅใ |
| | |
| | Forward: embed_tokens โ N ร MonoidDecoderLayer โ final_norm |
| | ๅๅ: embed_tokens โ N ร MonoidDecoderLayer โ final_norm |
| | """ |
| |
|
| | def __init__(self, config: MonoidConfig): |
| | super().__init__(config) |
| | self.padding_idx = config.pad_token_id |
| | self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) |
| | self.layers = nn.ModuleList( |
| | [MonoidDecoderLayer(config, i) for i in range(config.num_hidden_layers)] |
| | ) |
| | self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) |
| | self.gradient_checkpointing = False |
| | self.post_init() |
| |
|
| | def forward( |
| | self, |
| | input_ids: Tensor | None = None, |
| | attention_mask: Tensor | None = None, |
| | inputs_embeds: Tensor | None = None, |
| | monoid_cache: MonoidCache | None = None, |
| | use_cache: bool = False, |
| | ) -> BaseModelOutputWithPast: |
| | if inputs_embeds is None: |
| | inputs_embeds = self.embed_tokens(input_ids) |
| |
|
| | hidden_states = inputs_embeds |
| | for layer in self.layers: |
| | if self.gradient_checkpointing and self.training and not use_cache: |
| | hidden_states = self._gradient_checkpointing_func( |
| | layer.__call__, |
| | hidden_states, |
| | attention_mask, |
| | monoid_cache, |
| | use_cache, |
| | ) |
| | else: |
| | hidden_states = layer(hidden_states, attention_mask=attention_mask, monoid_cache=monoid_cache, use_cache=use_cache) |
| |
|
| | hidden_states = self.norm(hidden_states) |
| |
|
| | return BaseModelOutputWithPast( |
| | last_hidden_state=hidden_states, |
| | past_key_values=monoid_cache, |
| | ) |
| |
|
| |
|
| | |
| | |
| | |
| | |
| |
|
class MonoidForCausalLM(MonoidPreTrainedModel, GenerationMixin):
    """
    Monoid-based causal language model with LM head.

    The architecture in one sentence: "Llama body + Monoid mind".
    Reuse Llama's proven MLP/embeddings and replace attention with
    monoid state compression for O(1) inference.
    """
    _tied_weights_keys = ["lm_head.weight"]

    # Signal to generation utilities that this model carries recurrent state
    # rather than a standard KV-cache.
    _is_stateful = True

    def __init__(self, config: MonoidConfig):
        super().__init__(config)
        self.model = MonoidModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(
        self,
        input_ids: Tensor,
        past_key_values=None,
        attention_mask: Tensor | None = None,
        inputs_embeds: Tensor | None = None,
        **kwargs,
    ) -> dict:
        """
        Called by GenerationMixin at each decoding step.

        HuggingFace may pass a DynamicCache; we intercept and replace it,
        since this model uses MonoidCache instead of a standard KV-cache.
        """
        # Drop any non-Monoid cache handed over by the generation loop.
        if past_key_values is not None and not isinstance(past_key_values, MonoidCache):
            past_key_values = None

        if past_key_values is not None and past_key_values.seen_tokens > 0:
            # All past tokens are already folded into the monoid state:
            # feed only the newest token and drop the attention mask.
            input_ids = input_ids[:, -1:]
            attention_mask = None

        model_inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "monoid_cache": past_key_values,
            "use_cache": True,
        }
        return model_inputs

    def forward(
        self,
        input_ids: Tensor | None = None,
        attention_mask: Tensor | None = None,
        position_ids: Tensor | None = None,  # accepted for API compatibility; unused (no RoPE)
        past_key_values: MonoidCache | None = None,
        inputs_embeds: Tensor | None = None,
        labels: Tensor | None = None,
        use_cache: bool | None = None,
        monoid_cache: MonoidCache | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        logits_to_keep: int | Tensor = 0,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        # Accept the cache under either name (HF passes past_key_values).
        cache = monoid_cache or past_key_values

        # Ignore any non-Monoid cache (e.g. a DynamicCache created by generate()).
        if cache is not None and not isinstance(cache, MonoidCache):
            cache = None

        if use_cache and cache is None:
            cache = MonoidCache()

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            monoid_cache=cache,
            use_cache=bool(use_cache),
        )

        hidden_states = outputs.last_hidden_state

        # Optionally compute logits only for the last `logits_to_keep` positions.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) and logits_to_keep > 0 else slice(None)
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        # Standard shifted cross-entropy loss.
        loss = None
        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = nn.functional.cross_entropy(
                shift_logits.view(-1, self.vocab_size),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        if cache is not None:
            cache.seen_tokens += (input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1])

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=cache,
        )
|
AutoConfig.register("monoid", MonoidConfig)
AutoModelForCausalLM.register(MonoidConfig, MonoidForCausalLM)
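

# A quick sketch (illustrative only, not executed at import time) of what the
# registration above enables: building the model through the Auto* factories
# from a registered "monoid" config.
def _auto_factory_sketch() -> "MonoidForCausalLM":
    cfg = AutoConfig.for_model("monoid", num_hidden_layers=2, vocab_size=1000)
    return AutoModelForCausalLM.from_config(cfg)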
|
if __name__ == '__main__':
    device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
    print(f'Device: {device}')

    config = MonoidConfig(
        vocab_size=49152,
        hidden_size=576,
        intermediate_size=1536,
        num_hidden_layers=30,
        num_attention_heads=9,
        head_dim=64,
        rms_norm_eps=1e-5,
        hidden_act="silu",
        tie_word_embeddings=True,
    )
    model = MonoidForCausalLM(config).to(device)
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'Parameters: {n_params:,}')

    # Training-mode forward (parallel scan) on a dummy batch.
    B, T = 2, 64
    ids = torch.randint(0, config.vocab_size, (B, T), device=device)
    out = model(ids, labels=ids)
    print(f'Train → logits: {out.logits.shape}, loss: {out.loss:.4f}')

    # Inference: prefill the monoid cache over a prompt, then decode one token in O(1).
    prompt = torch.randint(0, config.vocab_size, (1, 8), device=device)
    cache = MonoidCache()
    prefill_out = model(prompt, use_cache=True, monoid_cache=cache)
    print(f'Prefill → logits: {prefill_out.logits.shape}, cache seen: {cache.seen_tokens}')

    next_tok = prefill_out.logits[:, -1:].argmax(dim=-1)
    step_out = model(next_tok, use_cache=True, monoid_cache=cache)
    print(f'Decode → logits: {step_out.logits.shape}, cache seen: {cache.seen_tokens}')
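
    # Consistency check (a sketch; small differences are expected from floating-point
    # accumulation): the O(1) cached decode of the 9th token should match running the
    # parallel scan over all 9 tokens at once.
    full_ids = torch.cat([prompt, next_tok], dim=1)
    full_out = model(full_ids)
    diff = (full_out.logits[:, -1] - step_out.logits[:, -1]).abs().max().item()
    print(f'Cache vs. parallel scan, max |Δlogits| = {diff:.2e}')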

    # Sanity check: the monoid operator is associative.
    print('\nMonoid associativity check:')
    a = (torch.randn(1, 1, 1), torch.randn(1, 1, 4, 4))
    b = (torch.randn(1, 1, 1), torch.randn(1, 1, 4, 4))
    c = (torch.randn(1, 1, 1), torch.randn(1, 1, 4, 4))
    ab_c = monoid_op(monoid_op(a, b), c)
    a_bc = monoid_op(a, monoid_op(b, c))
    err = (ab_c[1] - a_bc[1]).abs().max().item()
    print(f' |(a⊕b)⊕c - a⊕(b⊕c)| = {err:.2e}')
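
    # Generation through the standard HF API (a sketch; assumes GenerationMixin routes the
    # MonoidCache back in via past_key_values as outlined in prepare_inputs_for_generation).
    print('\nGreedy generation via GenerationMixin:')
    gen = model.generate(prompt, max_new_tokens=16, do_sample=False)
    print(f' prompt {tuple(prompt.shape)} → generated {tuple(gen.shape)}')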

    print('\nDone.')
|