# File size: 16,115 Bytes

import math
from typing import Optional

import torch
import torch.nn as nn
from torch import Tensor
from torch.nn import functional as F
from transformers import PreTrainedModel
from transformers.cache_utils import DynamicCache
from transformers.generation.utils import GenerationMixin
from transformers.modeling_outputs import CausalLMOutputWithPast

from .config import GPTConfig


# Substrings that mark "control" parameters: restore_fp32_params() keeps any
# parameter whose qualified name contains one of these in FP32.
CONTROL_TENSOR_NAME_PATTERNS = (
    "scale",
    "gate",
    "gain",
    "norm",
    "ln_",
    "rms",
)
# Role ID that derive_features_from_input_ids() assigns to digit runs located
# after the equals token.
RESULT_ROLE_ID = 10
# Role ID that derive_features_from_input_ids() assigns to space tokens.
SPACE_ROLE_ID = 11


class CastedLinear(nn.Linear):
    """Linear layer whose parameters are cast to the input dtype on the fly.

    The stored ``weight``/``bias`` are left untouched (they are meant to stay
    in FP32); only per-call copies in the activation dtype are used for the
    matmul.
    """

    def forward(self, x: Tensor) -> Tensor:
        """Apply the affine map using parameters cast to ``x.dtype``."""
        target_dtype = x.dtype
        if self.bias is None:
            casted_bias = None
        else:
            casted_bias = self.bias.to(dtype=target_dtype)
        casted_weight = self.weight.to(dtype=target_dtype)
        return F.linear(x, casted_weight, casted_bias)


def restore_fp32_params(model: nn.Module) -> None:
    """Promote precision-sensitive parameters of ``model`` back to FP32 in place.

    Two passes are made:

    1. Every ``CastedLinear`` submodule is converted with ``.float()``.
    2. Any parameter that is not already FP32 and either has fewer than two
       dimensions or has a name containing one of the
       ``CONTROL_TENSOR_NAME_PATTERNS`` substrings gets its ``.data`` replaced
       by an FP32 copy.
    """
    casted_layers = [m for m in model.modules() if isinstance(m, CastedLinear)]
    for layer in casted_layers:
        layer.float()

    for param_name, param in model.named_parameters():
        if param.dtype == torch.float32:
            continue
        is_vector_or_scalar = param.ndim < 2
        if is_vector_or_scalar or any(
            pattern in param_name for pattern in CONTROL_TENSOR_NAME_PATTERNS
        ):
            param.data = param.data.float()


class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension with a learned gain.

    Statistics are computed in FP32; the result is returned in the input dtype.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x):
        """Return ``x / rms(x) * weight`` where ``rms`` is taken over the last axis."""
        in_dtype = x.dtype
        x32 = x.float()
        mean_square = x32.pow(2).mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        normalized = (x32 * inv_rms).to(dtype=in_dtype)
        gain = self.weight.to(dtype=in_dtype)
        return normalized * gain


def build_rope_inv_freq(head_dim, theta=2500.0):
    """Return the FP32 RoPE inverse frequencies ``theta ** (-2i / head_dim)``.

    One frequency is produced per even index ``0, 2, ...`` below ``head_dim``.
    """
    even_indices = torch.arange(0, head_dim, 2, dtype=torch.float32)
    exponents = even_indices / head_dim
    return 1.0 / (theta ** exponents)


def precompute_rope_cos_sin(head_dim, seq_len, theta=2500.0):
    """Return ``(cos, sin)`` RoPE tables for positions ``0 .. seq_len - 1``.

    Each table is the cosine/sine of the outer product between the position
    indices and the inverse frequencies from ``build_rope_inv_freq``.
    """
    positions = torch.arange(seq_len, dtype=torch.float32)
    angles = torch.outer(positions, build_rope_inv_freq(head_dim, theta))
    return angles.cos(), angles.sin()


def _apply_rope(x, cos, sin):
    """Rotate adjacent (even, odd) feature pairs of ``x`` by the given angles.

    The last dimension of ``x`` is split into consecutive pairs; ``cos`` and
    ``sin`` get two leading singleton axes so they broadcast against ``x``.
    The rotation is computed in FP32 and cast back to the type of ``x``.
    """
    x32 = x.float()
    pairs = x32.reshape(*x32.shape[:-1], -1, 2)
    first, second = pairs[..., 0], pairs[..., 1]
    cos_b = cos[None, None]
    sin_b = sin[None, None]
    rotated_first = first * cos_b - second * sin_b
    rotated_second = first * sin_b + second * cos_b
    rotated = torch.stack((rotated_first, rotated_second), dim=-1)
    return rotated.flatten(-2).type_as(x)


def apply_rotary_emb(q, k, freqs_cis):
    """Apply the same rotary embedding to queries and keys.

    ``freqs_cis`` is a ``(cos, sin)`` pair; returns ``(q_rotated, k_rotated)``.
    """
    cos, sin = freqs_cis
    rotated_q = _apply_rope(q, cos, sin)
    rotated_k = _apply_rope(k, cos, sin)
    return rotated_q, rotated_k

class GPTAttention(nn.Module):
    """Self-attention layer with rotary embeddings and grouped key/value heads.

    Queries use ``num_attention_heads`` heads; keys and values use
    ``num_key_value_heads`` heads, all of size ``head_dim``. When
    ``config.xsa_projection`` is truthy, each head's attention output has its
    component along the (normalized) current value vector removed before the
    output projection.
    """

    def __init__(self, config, layer_idx):
        super().__init__()
        self.layer_idx = layer_idx
        self.n_head = config.num_attention_heads
        self.n_kv_heads = config.num_key_value_heads
        self.head_dim = config.head_dim
        self.n_rep = self.n_head // self.n_kv_heads
        self.xsa_projection = config.xsa_projection

        self.q_proj = CastedLinear(config.hidden_size, self.n_head * self.head_dim, bias=False)
        self.k_proj = CastedLinear(config.hidden_size, self.n_kv_heads * self.head_dim, bias=False)
        self.v_proj = CastedLinear(config.hidden_size, self.n_kv_heads * self.head_dim, bias=False)
        self.o_proj = CastedLinear(self.n_head * self.head_dim, config.hidden_size, bias=False)

    def _xsa_efficient(self, y: Tensor, v_current: Tensor) -> Tensor:
        """Remove from ``y`` its projection onto the normalized ``v_current``.

        Args:
            y: attention output, ``[B, H, T, D]``.
            v_current: values of the current tokens, ``[B, Hkv, T, D]``.

        Returns:
            Tensor shaped like ``y``; query heads are grouped so that each
            group of ``H // Hkv`` consecutive heads shares one value head.
        """
        B, H, T, D = y.shape
        Hkv = v_current.size(1)
        group = H // Hkv

        y_g = y.reshape(B, Hkv, group, T, D)
        # Unit value vectors, with a singleton axis to broadcast over the group.
        v_n = F.normalize(v_current, dim=-1).unsqueeze(2)

        proj = (y_g * v_n).sum(dim=-1, keepdim=True) * v_n
        return (y_g - proj).reshape(B, H, T, D)

    @staticmethod
    def _build_attn_mask(attention_mask, batch, q_len, kv_len, device):
        """Build the ``(attn_mask, is_causal)`` pair for SDPA.

        Args:
            attention_mask: optional ``[B, kv_len]`` key-padding mask
                (non-zero = attend), or ``None``.
            batch: batch size used when expanding a padding-only mask.
            q_len: number of query positions in this call.
            kv_len: number of key positions (cached + current).
            device: device for any mask created here.

        Returns:
            ``(attn_mask, is_causal)``. ``attn_mask`` is a boolean tensor
            (``True`` = attend) or ``None``; ``is_causal`` is only ``True`` when
            no explicit mask is needed and queries and keys cover the same
            positions.
        """
        causal = None
        if q_len > 1 and (attention_mask is not None or kv_len != q_len):
            # Bottom-right aligned causal mask: query i sits at absolute
            # position (kv_len - q_len + i). Needed whenever several queries
            # are processed at once and the built-in flag cannot be used
            # (padding mask present, or keys include cached positions).
            causal = torch.ones(q_len, kv_len, dtype=torch.bool, device=device).tril(
                diagonal=kv_len - q_len
            )

        if attention_mask is None:
            if causal is not None:
                return causal[None, None, :, :], False
            # Either a plain prefill (square, use the fused causal path) or a
            # single-token decode step that may see every cached key.
            return None, kv_len == q_len

        key_pad = attention_mask.to(torch.bool)[:, None, None, :]
        if causal is not None:
            return key_pad & causal[None, None, :, :], False
        return key_pad.expand(batch, 1, q_len, kv_len), False

    def forward(self, x, freqs_cis, past_key_value=None, use_cache=False, attention_mask=None):
        """Run attention over ``x``.

        Args:
            x: hidden states, ``[B, T, hidden_size]``.
            freqs_cis: ``(cos, sin)`` rotary tables for the ``T`` current positions.
            past_key_value: optional cache object; its
                ``update(k, v, layer_idx)`` is called and the returned
                keys/values are attended over.
            use_cache: unused here; accepted so callers can pass it positionally.
            attention_mask: optional ``[B, S]`` key-padding mask where ``S`` is
                the total number of key positions.

        Returns:
            Tensor of shape ``[B, T, hidden_size]``.
        """
        B, T, _ = x.size()

        q = self.q_proj(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k_current = self.k_proj(x).view(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2)
        v_current = self.v_proj(x).view(B, T, self.n_kv_heads, self.head_dim).transpose(1, 2)

        q, k_current = apply_rotary_emb(q, k_current, freqs_cis)

        if past_key_value is not None:
            k, v = past_key_value.update(k_current, v_current, self.layer_idx)
        else:
            k, v = k_current, v_current

        attn_mask, is_causal = self._build_attn_mask(attention_mask, B, T, k.size(2), x.device)

        y = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=attn_mask,
            is_causal=is_causal,
            enable_gqa=(self.n_kv_heads != self.n_head),
        )

        if self.xsa_projection:
            y = self._xsa_efficient(y, v_current)

        y = y.transpose(1, 2).contiguous().view(B, T, self.n_head * self.head_dim)
        return self.o_proj(y)


class GPTMLP(nn.Module):
    """Gated feed-forward block: ``w_down(silu(w_gate(x)) * w_up(x))``.

    All three projections are bias-free ``CastedLinear`` layers mapping
    ``hidden_size -> intermediate_size -> hidden_size``.
    """

    def __init__(self, config):
        super().__init__()
        hidden, inner = config.hidden_size, config.intermediate_size
        self.w_gate = CastedLinear(hidden, inner, bias=False)
        self.w_up = CastedLinear(hidden, inner, bias=False)
        self.w_down = CastedLinear(inner, hidden, bias=False)

    def forward(self, x):
        """Apply the SiLU-gated projection to ``x``."""
        gate = F.silu(self.w_gate(x))
        up = self.w_up(x)
        return self.w_down(gate * up)


class GPTBlock(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual add."""

    def __init__(self, config, layer_idx):
        super().__init__()
        norm_eps = config.rms_norm_eps
        self.ln_1 = RMSNorm(config.hidden_size, eps=norm_eps)
        self.attn = GPTAttention(config, layer_idx)
        self.ln_2 = RMSNorm(config.hidden_size, eps=norm_eps)
        self.mlp = GPTMLP(config)

    def forward(self, x, freqs_cis, past_key_value=None, use_cache=False, attention_mask=None):
        """Return ``x`` after the attention and MLP residual branches.

        ``freqs_cis``, ``past_key_value``, ``use_cache`` and ``attention_mask``
        are forwarded unchanged to the attention sub-layer.
        """
        attn_out = self.attn(
            self.ln_1(x),
            freqs_cis,
            past_key_value,
            use_cache,
            attention_mask=attention_mask,
        )
        hidden = x + attn_out
        mlp_out = self.mlp(self.ln_2(hidden))
        return hidden + mlp_out


class GPTPreTrainedModel(PreTrainedModel):
    """Base class wiring ``GPTConfig`` into the ``PreTrainedModel`` machinery."""

    config_class = GPTConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = False

    def _init_weights(self, module):
        """Initialize linear and embedding weights from N(0, hidden_size ** -0.5).

        Modules of any other type are left untouched.
        """
        init_std = self.config.hidden_size ** -0.5
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return
        torch.nn.init.normal_(module.weight, mean=0.0, std=init_std)


class GPTForCausalLM(GPTPreTrainedModel, GenerationMixin):
    """Causal language model with optional place/role feature embeddings.

    The token embedding is optionally summed with a ``place_embeddings`` and a
    ``role_embeddings`` lookup. Their IDs are either supplied by the caller or
    derived from ``input_ids`` (see ``derive_features_from_input_ids``).
    """

    _tied_weights_keys = {"lm_head.weight": "transformer.wte.weight"}

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.hidden_size),
            h=nn.ModuleList([GPTBlock(config, i) for i in range(config.num_hidden_layers)]),
            ln_f=RMSNorm(config.hidden_size, eps=config.rms_norm_eps),
        ))
        self.lm_head = CastedLinear(config.hidden_size, config.vocab_size, bias=False)
        if config.tie_word_embeddings:
            self.lm_head.weight = self.transformer["wte"].weight
        if getattr(config, "use_place_embeddings", True):
            self.place_embeddings = nn.Embedding(
                config.place_vocab_size,
                config.hidden_size,
                padding_idx=0,
            )
        else:
            self.place_embeddings = None
        if getattr(config, "use_role_embeddings", True):
            self.role_embeddings = nn.Embedding(
                config.role_vocab_size,
                config.hidden_size,
                padding_idx=0,
            )
        else:
            self.role_embeddings = None
        # (cos, sin) RoPE tables, built lazily by _get_freqs_cis().
        self._freqs_cis_cache = None
        self.post_init()
        # _init_weights() re-randomizes every embedding row, so the padding
        # rows (index 0) are zeroed again afterwards.
        with torch.no_grad():
            if self.place_embeddings is not None:
                self.place_embeddings.weight[0].zero_()
            if self.role_embeddings is not None:
                self.role_embeddings.weight[0].zero_()
        restore_fp32_params(self)

    def _apply(self, fn):
        """Apply ``fn`` as usual, then restore the FP32-pinned parameters."""
        module = super()._apply(fn)
        restore_fp32_params(self)
        return module

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.transformer["wte"]

    def set_input_embeddings(self, value):
        """Replace the token embedding module."""
        self.transformer["wte"] = value

    def get_output_embeddings(self):
        """Return the LM head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the LM head."""
        self.lm_head = new_embeddings

    def derive_features_from_input_ids(self, input_ids: Tensor) -> tuple[Tensor, Tensor]:
        """Derive arithmetic auxiliary streams from token IDs.

        This is the default-compatible path for HF/leaderboard callers that only
        provide ``input_ids``. Training and specialized benchmarks may still pass
        precomputed streams, which remain authoritative.

        Place IDs: every token in a maximal run of digit tokens gets its
        1-based offset from the start of the run, capped at
        ``place_vocab_size - 1``. Other tokens get 0.

        Role IDs (only for rows with exactly one equals token, at least one and
        at most nine digit runs ending at or before it): space tokens get
        ``SPACE_ROLE_ID``, the i-th operand run gets role ``i`` (1-based), and
        digit runs after the equals token get ``RESULT_ROLE_ID``. Each role is
        only written when it is smaller than ``role_vocab_size``.

        Args:
            input_ids: integer tensor indexed as ``[row, position]``.

        Returns:
            ``(place_ids, role_ids)``, each shaped like ``input_ids`` and on
            its device. Both are all zeros when no digit token IDs are
            configured.
        """
        config = self.config
        # ``or ()`` also tolerates an attribute explicitly set to None.
        digit_ids = {int(token_id) for token_id in (getattr(config, "feature_digit_token_ids", None) or ())}
        space_ids = {int(token_id) for token_id in (getattr(config, "feature_space_token_ids", None) or ())}
        equals_id = getattr(config, "feature_equals_token_id", None)
        equals_token = None if equals_id is None else int(equals_id)
        place_overflow_id = int(getattr(config, "place_vocab_size", 1)) - 1
        role_vocab_size = int(getattr(config, "role_vocab_size", 0))

        if not digit_ids:
            return torch.zeros_like(input_ids), torch.zeros_like(input_ids)

        # The scan is plain Python, so it runs on a CPU copy of the IDs.
        input_cpu = input_ids.detach().to("cpu")
        place_cpu = torch.zeros_like(input_cpu)
        role_cpu = torch.zeros_like(input_cpu)

        for row in range(input_cpu.size(0)):
            ids = [int(token_id) for token_id in input_cpu[row].tolist()]

            # Pass 1: locate digit runs and number the digits inside each run.
            index = 0
            digit_runs: list[tuple[int, int]] = []
            while index < len(ids):
                if ids[index] not in digit_ids:
                    index += 1
                    continue
                run_start = index
                offset = 1
                while index < len(ids) and ids[index] in digit_ids:
                    place_cpu[row, index] = min(offset, place_overflow_id)
                    index += 1
                    offset += 1
                digit_runs.append((run_start, index))

            # Pass 2: roles need exactly one equals token to split on.
            if equals_token is None or not digit_runs:
                continue
            equals_positions = [pos for pos, token_id in enumerate(ids) if token_id == equals_token]
            if len(equals_positions) != 1:
                continue
            equals_position = equals_positions[0]
            operand_runs = [(start, end) for start, end in digit_runs if end <= equals_position]
            result_runs = [(start, end) for start, end in digit_runs if start > equals_position]
            if not operand_runs or len(operand_runs) > 9:
                continue

            if role_vocab_size > SPACE_ROLE_ID:
                for pos, token_id in enumerate(ids):
                    if token_id in space_ids:
                        role_cpu[row, pos] = SPACE_ROLE_ID
            for role, (start, end) in enumerate(operand_runs, start=1):
                if role >= role_vocab_size:
                    break
                role_cpu[row, start:end] = role
            if role_vocab_size > RESULT_ROLE_ID:
                for start, end in result_runs:
                    role_cpu[row, start:end] = RESULT_ROLE_ID

        return (
            place_cpu.to(device=input_ids.device, non_blocking=True),
            role_cpu.to(device=input_ids.device, non_blocking=True),
        )

    def embed_tokens(self, input_ids, *, place_ids=None, role_ids=None, **kwargs):
        """Embed tokens and add the enabled place/role embeddings.

        Missing ``place_ids``/``role_ids`` are derived from ``input_ids`` when
        the corresponding embedding table exists. Extra keyword arguments are
        accepted and ignored.

        Raises:
            ValueError: if a used feature tensor does not match
                ``input_ids.shape``.
        """
        if (place_ids is None and self.place_embeddings is not None) or (
            role_ids is None and self.role_embeddings is not None
        ):
            derived_place_ids, derived_role_ids = self.derive_features_from_input_ids(input_ids)
            if place_ids is None:
                place_ids = derived_place_ids
            if role_ids is None:
                role_ids = derived_role_ids

        embeddings = self.transformer["wte"](input_ids)
        if self.place_embeddings is not None:
            if place_ids.shape != input_ids.shape:
                raise ValueError("place_ids must match input_ids shape")
            embeddings = embeddings + self.place_embeddings(place_ids)
        if self.role_embeddings is not None:
            if role_ids.shape != input_ids.shape:
                raise ValueError("role_ids must match input_ids shape")
            embeddings = embeddings + self.role_embeddings(role_ids)
        return embeddings

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **kwargs):
        """Assemble the ``forward`` kwargs for one generation step.

        With a non-empty cache only the last token is fed to the model. The
        derived place/role features depend on the tokens to the left (digit-run
        offset, equals position), so when the caller did not supply them they
        are computed from the FULL ``input_ids`` here, before slicing;
        deriving them later from the single remaining token would give every
        generated digit place 1 and role 0.

        Caller-supplied ``place_ids``/``role_ids`` are sliced to their last
        column exactly as before.
        """
        place_ids = kwargs.get("place_ids")
        role_ids = kwargs.get("role_ids")

        if past_key_values is not None and past_key_values.get_seq_length() > 0:
            need_place = place_ids is None and self.place_embeddings is not None
            need_role = role_ids is None and self.role_embeddings is not None
            if need_place or need_role:
                derived_place_ids, derived_role_ids = self.derive_features_from_input_ids(input_ids)
                if need_place:
                    place_ids = derived_place_ids
                if need_role:
                    role_ids = derived_role_ids

            input_ids = input_ids[:, -1:]
            if place_ids is not None:
                place_ids = place_ids[:, -1:]
            if role_ids is not None:
                role_ids = role_ids[:, -1:]

        return {
            "input_ids": input_ids,
            "place_ids": place_ids,
            "role_ids": role_ids,
            "attention_mask": attention_mask,
            "past_key_values": past_key_values,
            "use_cache": True,
        }

    def _get_freqs_cis(self, seq_len, device):
        """Return ``(cos, sin)`` RoPE tables for the first ``seq_len`` positions.

        Tables are rebuilt when missing, on another device, or too short. A
        table built while inference mode is enabled is returned but not stored
        on the module.
        """
        cache = self._freqs_cis_cache
        if cache is None or cache[0].device != device or cache[0].size(0) < seq_len:
            cache = tuple(
                tensor.to(device)
                for tensor in precompute_rope_cos_sin(self.config.head_dim, seq_len, self.config.rope_theta)
            )
            if torch.is_inference_mode_enabled():
                return cache[0][:seq_len], cache[1][:seq_len]
            self._freqs_cis_cache = cache
        return cache[0][:seq_len], cache[1][:seq_len]

    def forward(
        self,
        input_ids,
        attention_mask=None,
        labels=None,
        place_ids=None,
        role_ids=None,
        past_key_values: Optional[DynamicCache] = None,
        use_cache=False,
        **kwargs,
    ):
        """Compute logits (and optionally the LM loss) for ``input_ids``.

        Args:
            input_ids: token IDs, ``[B, T]``.
            attention_mask: optional key-padding mask passed to every layer.
            labels: optional targets. They are shifted by one position here
                unless ``config.labels_are_shifted`` is truthy.
            place_ids, role_ids: optional precomputed feature IDs shaped like
                ``input_ids``; derived from ``input_ids`` when omitted.
            past_key_values: optional KV cache. A new ``DynamicCache`` is
                created when ``use_cache`` is set and none is given.
            use_cache: whether to return the cache in the output.
            **kwargs: forwarded to ``embed_tokens`` (which ignores them).

        Returns:
            ``CausalLMOutputWithPast`` with ``loss``, ``logits`` and, when
            ``use_cache`` is set, ``past_key_values``.
        """
        B, T = input_ids.size()
        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        past_len = past_key_values.get_seq_length() if past_key_values is not None else 0
        x = self.embed_tokens(
            input_ids,
            place_ids=place_ids,
            role_ids=role_ids,
            **kwargs,
        )
        cos, sin = self._get_freqs_cis(past_len + T, input_ids.device)
        freqs_cis = (
            cos[past_len:past_len + T],
            sin[past_len:past_len + T],
        )

        # A supplied cache is always handed to the layers: the RoPE positions
        # above are already offset by its length, so attending without the
        # cached keys would be inconsistent. ``use_cache`` only controls
        # whether the cache is returned.
        for block in self.transformer["h"]:
            x = block(x, freqs_cis, past_key_values, use_cache, attention_mask=attention_mask)

        x = self.transformer["ln_f"](x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            if getattr(self.config, "labels_are_shifted", False):
                loss = F.cross_entropy(logits.float().reshape(-1, logits.size(-1)), labels.reshape(-1))
            else:
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
                loss = F.cross_entropy(shift_logits.float().view(-1, shift_logits.size(-1)), shift_labels.reshape(-1))

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=past_key_values if use_cache else None,
        )