whenxuan committed on
Commit
62d1028
·
verified ·
1 Parent(s): 7c4d438

whenxuan: add the patching for time series

Browse files
Files changed (4) hide show
  1. config.json +24 -23
  2. configuration_symtime.py +66 -64
  3. layers.py +427 -401
  4. model.py +212 -142
config.json CHANGED
@@ -1,24 +1,25 @@
1
-
2
- {
3
- "architectures": [
4
- "SymTimeModel"
5
- ],
6
- "_name_or_path": "FlowVortex/SymTime",
7
- "auto_map": {
8
- "AutoConfig": "configuration_symtime.SymTimeConfig",
9
- "AutoModel": "model.SymTimeModel"
10
- },
11
- "patch_size": 16,
12
- "num_layers": 6,
13
- "d_model": 512,
14
- "d_ff": 2048,
15
- "num_heads": 8,
16
- "norm": "BatchNorm",
17
- "dropout": 0.1,
18
- "act": "gelu",
19
- "pre_norm": false,
20
- "initializer_factor": 0.05,
21
- "model_type": "symtime",
22
- "torch_dtype": "float32",
23
- "transformers_version": "5.5.4"
 
24
  }
 
1
+
2
+ {
3
+ "architectures": [
4
+ "SymTimeModel"
5
+ ],
6
+ "_name_or_path": "FlowVortex/SymTime",
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_symtime.SymTimeConfig",
9
+ "AutoModel": "model.SymTimeModel"
10
+ },
11
+ "patch_size": 16,
12
+ "stride": 16,
13
+ "num_layers": 6,
14
+ "d_model": 512,
15
+ "d_ff": 2048,
16
+ "num_heads": 8,
17
+ "norm": "BatchNorm",
18
+ "dropout": 0.1,
19
+ "act": "gelu",
20
+ "pre_norm": false,
21
+ "initializer_factor": 0.05,
22
+ "model_type": "symtime",
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "5.5.4"
25
  }
configuration_symtime.py CHANGED
@@ -1,64 +1,66 @@
1
- from dataclasses import dataclass
2
- from typing import List, Literal, Optional, Dict
3
- from enum import Enum
4
-
5
- from transformers.configuration_utils import PretrainedConfig
6
-
7
-
8
- @dataclass
9
- class SymTimeConfig(PretrainedConfig):
10
- """
11
- Time series encoder configuration for SymTime Model.
12
-
13
- Parameters
14
- -----------
15
- patch_size
16
- The size of the patch to be used for the input data.
17
- num_layers
18
- The number of layers to be used for the encoder.
19
- d_model
20
- The dimension of the model.
21
- d_ff
22
- The dimension of the feedforward network.
23
- num_heads
24
- The number of heads to be used for the attention mechanism.
25
- norm
26
- The normalization to be used for the encoder.
27
- attn_dropout
28
- The dropout rate to be used for the attention mechanism.
29
- dropout
30
- The dropout rate to be used for the encoder.
31
- act
32
- The activation function to be used for the encoder.
33
- pre_norm
34
- Whether to use pre-norm for the encoder.
35
- """
36
-
37
- model_type = "symtime"
38
-
39
- def __init__(
40
- self,
41
- patch_size: int = 16,
42
- num_layers: int = 6,
43
- d_model: int = 512,
44
- d_ff: int = 2048,
45
- num_heads: int = 8,
46
- norm: str = "BatchNorm",
47
- dropout: float = 0.1,
48
- act: str = "gelu",
49
- pre_norm: bool = False,
50
- initializer_factor: float = 0.05,
51
- **kwargs,
52
- ) -> None:
53
- self.patch_size = patch_size
54
- self.num_layers = num_layers
55
- self.d_model = d_model
56
- self.num_heads = num_heads
57
- self.d_ff = d_ff
58
- self.norm = norm
59
- self.dropout = dropout
60
- self.act = act
61
- self.pre_norm = pre_norm
62
- self.initializer_factor = initializer_factor
63
-
64
- super().__init__(**kwargs)
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+
5
+
6
# NOTE: intentionally NOT decorated with @dataclass. This class declares no
# dataclass fields (``model_type`` is unannotated and ``__init__`` is written
# by hand), so a field-less @dataclass would inject generated ``__eq__`` and
# ``__repr__`` that compare/print zero fields — making any two SymTimeConfig
# instances spuriously equal and hiding every setting from the repr.
class SymTimeConfig(PretrainedConfig):
    """
    Time series encoder configuration for SymTime Model.

    Parameters
    -----------
    num_layers
        The number of layers to be used for the encoder.
    d_model
        The dimension of the model.
    d_ff
        The dimension of the feedforward network.
    num_heads
        The number of heads to be used for the attention mechanism.
    norm
        The normalization to be used for the encoder.
    dropout
        The dropout rate to be used for the encoder.
    act
        The activation function to be used for the encoder.
    pre_norm
        Whether to use pre-norm for the encoder.
    patch_size
        The size of the patch to be used for the input data.
    stride
        The stride of the patch to be used for the input data.
    initializer_factor
        The scaling factor used when initializing model weights.
    """

    model_type = "symtime"

    def __init__(
        self,
        num_layers: int = 6,
        d_model: int = 512,
        d_ff: int = 2048,
        num_heads: int = 8,
        norm: str = "BatchNorm",
        dropout: float = 0.1,
        act: str = "gelu",
        pre_norm: bool = False,
        patch_size: int = 16,
        stride: int = 16,
        initializer_factor: float = 0.05,
        **kwargs,
    ) -> None:
        self.patch_size = patch_size
        self.stride = stride
        self.num_layers = num_layers
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_ff = d_ff
        self.norm = norm
        self.dropout = dropout
        self.act = act
        self.pre_norm = pre_norm
        self.initializer_factor = initializer_factor

        # Forward any remaining keyword arguments (architectures, auto_map,
        # torch_dtype, ...) to the PretrainedConfig base class.
        super().__init__(**kwargs)
layers.py CHANGED
@@ -1,401 +1,427 @@
1
- from typing import Optional, Union, Tuple, Callable
2
- import math
3
-
4
- import numpy as np
5
- import torch
6
- from torch import nn
7
- from torch import Tensor
8
- import torch.nn.functional as F
9
- from einops import rearrange
10
-
11
-
12
- def get_activation_fn(activation: Union[str, Callable]) -> nn.Module:
13
- """Select the activation function to use."""
14
- if callable(activation):
15
- return activation()
16
- elif activation.lower() == "relu":
17
- return nn.ReLU()
18
- elif activation.lower() == "gelu":
19
- return nn.GELU()
20
- raise ValueError(
21
- f'{activation} is not available. You can use "relu", "gelu", or a callable'
22
- )
23
-
24
-
25
- class Transpose(nn.Module):
26
- """Transpose the dimensions of the input tensor"""
27
-
28
- def __init__(self, *dims, contiguous=False) -> None:
29
- super().__init__()
30
- self.dims, self.contiguous = dims, contiguous
31
-
32
- def forward(self, x: Tensor) -> Tensor:
33
- if self.contiguous:
34
- return x.transpose(*self.dims).contiguous()
35
- else:
36
- return x.transpose(*self.dims)
37
-
38
-
39
- class PositionalEmbedding(nn.Module):
40
- """Adding the positional encoding to the input for Transformer"""
41
-
42
- def __init__(self, hidden_size: int, max_len: int = 5000) -> None:
43
- super(PositionalEmbedding, self).__init__()
44
-
45
- # Calculate the positional encoding once in the logarithmic space.
46
- pe = torch.zeros(
47
- max_len, hidden_size
48
- ).float() # Initialize a tensor of zeros with shape (max_len, hidden_size) to store positional encodings
49
- pe.requires_grad = (
50
- False # Positional encodings do not require gradients as they are fixed
51
- )
52
-
53
- position = (
54
- torch.arange(0, max_len).float().unsqueeze(1)
55
- ) # Generate a sequence from 0 to max_len-1 and add a dimension at the 1st axis
56
- div_term = (
57
- torch.arange(0, hidden_size, 2).float() * -(math.log(10000.0) / hidden_size)
58
- ).exp() # Calculate the divisor term in the positional encoding formula
59
-
60
- pe[:, 0::2] = torch.sin(
61
- position * div_term
62
- ) # Apply the sine function to the even columns of the positional encoding matrix
63
- pe[:, 1::2] = torch.cos(
64
- position * div_term
65
- ) # Apply the cosine function to the odd columns of the positional encoding matrix
66
-
67
- pe = pe.unsqueeze(
68
- 0
69
- ) # Add a batch dimension, changing the shape to (1, max_len, hidden_size)
70
- self.register_buffer(
71
- "pe", pe
72
- ) # Register the positional encodings as a buffer, which will not be updated as model parameters
73
-
74
- def forward(self, x: Tensor) -> Tensor:
75
- # Return the first max_len positional encodings that match the length of input x
76
- return x + self.pe[:, : x.size(1)]
77
-
78
-
79
- class TSTEncoder(nn.Module):
80
- """Time series encoder backbone of SymTime"""
81
-
82
- def __init__(
83
- self,
84
- patch_size: int = 16,
85
- num_layers: int = 3,
86
- hidden_size: int = 128,
87
- num_heads: int = 16,
88
- d_k: int = None,
89
- d_v: int = None,
90
- d_ff: int = 256,
91
- norm: str = "BatchNorm",
92
- attn_dropout: float = 0.0,
93
- dropout: float = 0.0,
94
- act: str = "gelu",
95
- store_attn: bool = False,
96
- pre_norm: bool = False,
97
- ) -> None:
98
- super().__init__()
99
- # The Linear layer to project the input patches to the model dimension
100
- self.W_P = nn.Linear(patch_size, hidden_size)
101
-
102
- # Positional encoding
103
- self.pe = PositionalEmbedding(hidden_size=hidden_size)
104
-
105
- # Residual dropout
106
- self.dropout = nn.Dropout(dropout)
107
-
108
- # Create the [CLS] token
109
- self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size))
110
- self.cls_mask = nn.Parameter(torch.ones(1, 1).bool(), requires_grad=False)
111
-
112
- # Create the encoder layer of the model backbone
113
- self.layers = nn.ModuleList(
114
- [
115
- TSTEncoderLayer(
116
- hidden_size=hidden_size,
117
- num_heads=num_heads,
118
- d_k=d_k,
119
- d_v=d_v,
120
- d_ff=d_ff,
121
- norm=norm,
122
- attn_dropout=attn_dropout,
123
- dropout=dropout,
124
- activation=act,
125
- pre_norm=pre_norm,
126
- store_attn=store_attn,
127
- )
128
- for _ in range(num_layers)
129
- ]
130
- )
131
-
132
- # model params init
133
- self.apply(self._init_weights)
134
-
135
- def _init_weights(self, m: nn.Module) -> None:
136
- """model params init through apply methods"""
137
- if isinstance(m, nn.Linear):
138
- nn.init.xavier_uniform_(m.weight)
139
- if isinstance(m, nn.Linear) and m.bias is not None:
140
- nn.init.constant_(m.bias, 0)
141
- elif isinstance(m, nn.LayerNorm):
142
- nn.init.constant_(m.bias, 0)
143
- nn.init.constant_(m.weight, 1.0)
144
-
145
- def forward(
146
- self,
147
- x: Tensor, # x: [batch_size, patch_num, patch_size]
148
- attn_mask: Optional[Tensor] = None, # attn_mask: [batch, num_patch]
149
- return_cls_token: bool = True, # whether to return the CLS token
150
- ) -> Tensor:
151
- """ """
152
- batch_size = x.size(0)
153
-
154
- # Input patching embedding
155
- x = self.W_P(x) # x: [batch_size, patch_num, model_dim]
156
-
157
- # Add the [CLS] token
158
- cls_token = self.cls_token.expand(batch_size, -1, -1)
159
- x = torch.cat([cls_token, x], dim=1)
160
- # adjust the attn mask
161
- if attn_mask is not None:
162
- attn_mask = torch.cat(
163
- [self.cls_mask.expand(batch_size, -1), attn_mask], dim=1
164
- )
165
-
166
- # Add the positional embedding
167
- x = self.pe(x)
168
- x = self.dropout(x) # x: [batch_size, patch_num, hidden_size]
169
-
170
- for mod in self.layers:
171
- x = mod(x, attn_mask=attn_mask)
172
-
173
- if not return_cls_token:
174
- # If not returning the CLS token, remove it from the output
175
- return x[:, 1:, :]
176
-
177
- return x
178
-
179
-
180
- class TSTEncoderLayer(nn.Module):
181
- """Patch-based Transformer module sublayer"""
182
-
183
- def __init__(
184
- self,
185
- hidden_size: int,
186
- num_heads: int,
187
- d_k: int = None,
188
- d_v: int = None,
189
- d_ff: int = 256,
190
- store_attn: int = False,
191
- norm: str = "BatchNorm",
192
- attn_dropout: float = 0.0,
193
- dropout: float = 0.0,
194
- bias: bool = True,
195
- activation: str = "gelu",
196
- pre_norm: bool = False,
197
- ) -> None:
198
- super(TSTEncoderLayer, self).__init__()
199
-
200
- assert (
201
- not hidden_size % num_heads
202
- ), f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
203
- # If not specified, the number of heads is divided
204
- d_k = hidden_size // num_heads if d_k is None else d_k
205
- d_v = hidden_size // num_heads if d_v is None else d_v
206
-
207
- # Create the multi-head attention
208
- self.self_attn = MultiHeadAttention(
209
- hidden_size,
210
- num_heads,
211
- d_k,
212
- d_v,
213
- attn_dropout=attn_dropout,
214
- proj_dropout=dropout,
215
- )
216
-
217
- # Add & Norm
218
- self.dropout_attn = nn.Dropout(dropout)
219
- if "batch" in norm.lower():
220
- self.norm_attn = nn.Sequential(
221
- Transpose(1, 2), nn.BatchNorm1d(hidden_size), Transpose(1, 2)
222
- )
223
- else:
224
- self.norm_attn = nn.LayerNorm(hidden_size)
225
-
226
- # Position-wise Feed-Forward
227
- self.ff = nn.Sequential(
228
- nn.Linear(hidden_size, d_ff, bias=bias),
229
- get_activation_fn(activation),
230
- nn.Dropout(dropout),
231
- nn.Linear(d_ff, hidden_size, bias=bias),
232
- )
233
-
234
- # Add & Norm
235
- self.dropout_ffn = nn.Dropout(dropout)
236
- if "batch" in norm.lower():
237
- self.norm_ffn = nn.Sequential(
238
- Transpose(1, 2), nn.BatchNorm1d(hidden_size), Transpose(1, 2)
239
- )
240
- else:
241
- self.norm_ffn = nn.LayerNorm(hidden_size)
242
-
243
- # use pre-norm or not
244
- self.pre_norm = pre_norm
245
- self.store_attn = store_attn
246
- self.attn = None
247
-
248
- def forward(
249
- self, src: Tensor, attn_mask: Optional[Tensor] = None
250
- ) -> Union[Tuple[Tensor, Tensor], Tensor]:
251
- """Multi-Head attention sublayer"""
252
-
253
- # Whether to use pre-norm for attention layer
254
- if self.pre_norm:
255
- src = self.norm_attn(src)
256
-
257
- # Multi-Head attention
258
- src2, attn = self.self_attn(src, src, src, attn_mask=attn_mask)
259
- if self.store_attn:
260
- self.attn = attn
261
-
262
- # Add: residual connection with residual dropout
263
- src = src + self.dropout_attn(src2)
264
- if not self.pre_norm:
265
- src = self.norm_attn(src)
266
-
267
- # Whether to use pre-norm for ffn layer
268
- if self.pre_norm:
269
- src = self.norm_ffn(src)
270
-
271
- # Position-wise Feed-Forward
272
- src2 = self.ff(src)
273
-
274
- # Add: residual connection with residual dropout
275
- src = src + self.dropout_ffn(src2)
276
- if not self.pre_norm:
277
- src = self.norm_ffn(src)
278
-
279
- return src
280
-
281
-
282
- class MultiHeadAttention(nn.Module):
283
- """Multi-head attention mechanism layer"""
284
-
285
- def __init__(
286
- self,
287
- hidden_size: int,
288
- num_heads: int,
289
- d_k: int = None,
290
- d_v: int = None,
291
- attn_dropout: float = 0.0,
292
- proj_dropout: float = 0.0,
293
- qkv_bias: bool = True,
294
- ) -> None:
295
- """Multi Head Attention Layer
296
- Input shape:
297
- Q: [batch_size (bs) x max_q_len x hidden_size]
298
- K, V: [batch_size (bs) x q_len x hidden_size]
299
- mask: [q_len x q_len]
300
- """
301
- super().__init__()
302
- d_k = hidden_size // num_heads if d_k is None else d_k
303
- d_v = hidden_size // num_heads if d_v is None else d_v
304
-
305
- self.num_heads, self.d_k, self.d_v = num_heads, d_k, d_v
306
-
307
- self.W_Q = nn.Linear(hidden_size, d_k * num_heads, bias=qkv_bias)
308
- self.W_K = nn.Linear(hidden_size, d_k * num_heads, bias=qkv_bias)
309
- self.W_V = nn.Linear(hidden_size, d_v * num_heads, bias=qkv_bias)
310
-
311
- # Scaled Dot-Product Attention (multiple heads)
312
- self.sdp_attn = _ScaledDotProductAttention(
313
- hidden_size, num_heads, attn_dropout=attn_dropout
314
- )
315
-
316
- # Project output
317
- self.to_out = nn.Sequential(
318
- nn.Linear(num_heads * d_v, hidden_size), nn.Dropout(proj_dropout)
319
- )
320
-
321
- def forward(
322
- self,
323
- q: Tensor,
324
- k: Optional[Tensor] = None,
325
- v: Optional[Tensor] = None,
326
- attn_mask: Optional[Tensor] = None,
327
- ):
328
- bs = q.size(0)
329
- if k is None:
330
- k = q
331
- if v is None:
332
- v = q
333
-
334
- # Linear (+ split in multiple heads)
335
- q_s = self.W_Q(q).view(bs, -1, self.num_heads, self.d_k).transpose(1, 2)
336
- k_s = self.W_K(k).view(bs, -1, self.num_heads, self.d_k).permute(0, 2, 3, 1)
337
- v_s = self.W_V(v).view(bs, -1, self.num_heads, self.d_v).transpose(1, 2)
338
-
339
- # Apply Scaled Dot-Product Attention (multiple heads)
340
- output, attn_weights = self.sdp_attn(q_s, k_s, v_s, attn_mask=attn_mask)
341
-
342
- # back to the original inputs dimensions
343
- output = (
344
- output.transpose(1, 2).contiguous().view(bs, -1, self.num_heads * self.d_v)
345
- )
346
- output = self.to_out(output)
347
-
348
- return output, attn_weights
349
-
350
-
351
- class _ScaledDotProductAttention(nn.Module):
352
- r"""Scaled Dot-Product Attention module (Attention is all you need by Vaswani et al., 2017) with optional residual attention from previous layer
353
- (Realformer: Transformer likes residual attention by He et al, 2020) and locality self sttention (Vision Transformer for Small-Size Datasets
354
- by Lee et al, 2021)"""
355
-
356
- def __init__(
357
- self,
358
- hidden_size: int,
359
- num_heads: int,
360
- attn_dropout: float = 0.0,
361
- res_attention: bool = False,
362
- ):
363
- super().__init__()
364
- self.attn_dropout = nn.Dropout(attn_dropout)
365
- self.res_attention = res_attention
366
- head_dim = hidden_size // num_heads
367
- self.scale = nn.Parameter(torch.tensor(head_dim**-0.5), requires_grad=False)
368
-
369
- def forward(
370
- self, q: Tensor, k: Tensor, v: Tensor, attn_mask: Optional[Tensor] = None
371
- ) -> Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]:
372
- """
373
- :param q: [batch_size, num_heads, num_token, d_k]
374
- :param k: [batch_size, num_heads, d_k, num_token]
375
- :param v: [batch_size, num_heads, num_token, d_k]
376
- :param attn_mask: [batch_size, num_heads, num_token]
377
- """
378
-
379
- # Scaled MatMul (q, k) - similarity scores for all pairs of positions in an input sequence
380
- attn_scores = torch.matmul(q, k) * self.scale
381
-
382
- # Attention mask (optional)
383
- if (
384
- attn_mask is not None
385
- ): # attn_mask with shape [q_len x seq_len] - only used when q_len == seq_len
386
- attn_mask = rearrange(attn_mask, "b i -> b 1 i 1") * rearrange(
387
- attn_mask, "b i -> b 1 1 i"
388
- )
389
- if attn_mask.dtype == torch.bool:
390
- attn_scores.masked_fill_(attn_mask, -np.inf)
391
- else:
392
- attn_scores += attn_mask
393
-
394
- # normalize the attention weights
395
- attn_weights = F.softmax(attn_scores, dim=-1)
396
- attn_weights = self.attn_dropout(attn_weights)
397
-
398
- # compute the new values given the attention weights
399
- output = torch.matmul(attn_weights, v)
400
-
401
- return output, attn_weights
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union, Tuple, Callable
2
+ import math
3
+
4
+ import numpy as np
5
+ import torch
6
+ from torch import nn
7
+ from torch import Tensor
8
+ import torch.nn.functional as F
9
+ from einops import rearrange
10
+
11
+
12
def get_activation_fn(activation: Union[str, Callable]) -> nn.Module:
    """
    Resolve an activation specification to a freshly built activation module.

    Parameters
    ----------
    activation : Union[str, Callable]
        Either the case-insensitive name of a supported activation
        ("relu" or "gelu"), or a zero-argument callable that constructs
        the activation module.

    Return
    ------
    nn.Module
        The corresponding activation module instance.

    Raises
    ------
    ValueError
        If the string does not name a supported activation.
    """
    # A callable is treated as a factory and invoked directly.
    if callable(activation):
        return activation()

    name = activation.lower()
    if name == "relu":
        return nn.ReLU()
    if name == "gelu":
        return nn.GELU()

    raise ValueError(
        f'{activation} is not available. You can use "relu", "gelu", or a callable'
    )
36
+
37
+
38
class Transpose(nn.Module):
    """Swap two dimensions of the input tensor.

    Parameters
    ----------
    *dims : int
        The dimensions passed to `torch.Tensor.transpose`.
    contiguous : bool, optional
        Whether to return a contiguous tensor after transposing, by default False.

    Return
    ------
    Tensor
        The transposed tensor.
    """

    def __init__(self, *dims, contiguous=False) -> None:
        super().__init__()
        self.dims = dims
        self.contiguous = contiguous

    def forward(self, x: Tensor) -> Tensor:
        out = x.transpose(*self.dims)
        # Optionally force a contiguous memory layout (needed e.g. before view()).
        return out.contiguous() if self.contiguous else out
63
+
64
+
65
class PositionalEmbedding(nn.Module):
    """Add fixed sinusoidal positional encodings to the input.

    Implements the sine/cosine scheme of "Attention Is All You Need":
    even channels carry sin(pos * w_i), odd channels the matching cos(pos * w_i),
    with frequencies w_i decaying geometrically from 1 down to ~1/10000.

    Parameters
    ----------
    hidden_size : int
        Channel dimension of the encoding; must equal the input's last dim.
    max_len : int, optional
        Maximum sequence length precomputed in the table, by default 5000.
    """

    def __init__(self, hidden_size: int, max_len: int = 5000) -> None:
        super(PositionalEmbedding, self).__init__()

        # Positions 0..max_len-1 as a column vector: (max_len, 1).
        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        # Per-channel frequencies, computed in log space for stability.
        div_term = torch.exp(
            torch.arange(0, hidden_size, 2, dtype=torch.float)
            * -(math.log(10000.0) / hidden_size)
        )

        # Build the full (max_len, hidden_size) table once; it is fixed,
        # so it never needs gradients.
        pe = torch.zeros(max_len, hidden_size, requires_grad=False)
        pe[:, 0::2] = torch.sin(position * div_term)  # even channels
        pe[:, 1::2] = torch.cos(position * div_term)  # odd channels

        # Register with a leading batch dim as a buffer: saved with the module
        # and moved across devices, but never treated as a trainable parameter.
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x: Tensor) -> Tensor:
        """Add the first seq_len rows of the table to x: [batch, seq, hidden]."""
        return x + self.pe[:, : x.size(1)]
103
+
104
+
105
class TSTEncoder(nn.Module):
    """Time series encoder backbone of SymTime.

    Pipeline: linear patch embedding -> prepend a learnable [CLS] token ->
    add sinusoidal positional encoding -> dropout -> stack of
    ``TSTEncoderLayer`` blocks.

    Parameters
    ----------
    patch_size
        Length of each input patch (last dim of the input).
    num_layers
        Number of stacked encoder layers.
    hidden_size
        Model (embedding) dimension.
    num_heads
        Number of attention heads per layer.
    d_k, d_v
        Per-head key/value dimensions; when None each layer defaults them
        to hidden_size // num_heads.
    d_ff
        Hidden dimension of each layer's feed-forward network.
    norm
        Normalization style; any string containing "batch" selects BatchNorm,
        otherwise LayerNorm is used.
    attn_dropout
        Dropout applied to the attention weights.
    dropout
        Dropout applied to embeddings, residuals and projections.
    act
        Activation of the feed-forward network ("relu" or "gelu").
    store_attn
        Whether each layer keeps its last attention map in ``layer.attn``.
    pre_norm
        Whether to normalize before (True) or after (False) each sublayer.
    """

    def __init__(
        self,
        patch_size: int = 16,
        num_layers: int = 3,
        hidden_size: int = 128,
        num_heads: int = 16,
        d_k: Optional[int] = None,
        d_v: Optional[int] = None,
        d_ff: int = 256,
        norm: str = "BatchNorm",
        attn_dropout: float = 0.0,
        dropout: float = 0.0,
        act: str = "gelu",
        store_attn: bool = False,
        pre_norm: bool = False,
    ) -> None:
        super().__init__()
        # The Linear layer to project the input patches to the model dimension
        self.W_P = nn.Linear(patch_size, hidden_size)

        # Positional encoding
        self.pe = PositionalEmbedding(hidden_size=hidden_size)

        # Residual dropout
        self.dropout = nn.Dropout(dropout)

        # Create the [CLS] token
        self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size))
        # Frozen all-ones mask entry for the [CLS] position. NOTE(review): kept
        # as a Parameter (not a buffer), so it lives in the state_dict under
        # this key — changing it to register_buffer would break checkpoints.
        self.cls_mask = nn.Parameter(torch.ones(1, 1).bool(), requires_grad=False)

        # Create the encoder layer of the model backbone
        self.layers = nn.ModuleList(
            [
                TSTEncoderLayer(
                    hidden_size=hidden_size,
                    num_heads=num_heads,
                    d_k=d_k,
                    d_v=d_v,
                    d_ff=d_ff,
                    norm=norm,
                    attn_dropout=attn_dropout,
                    dropout=dropout,
                    activation=act,
                    pre_norm=pre_norm,
                    store_attn=store_attn,
                )
                for _ in range(num_layers)
            ]
        )

        # model params init
        self.apply(self._init_weights)

    def _init_weights(self, m: nn.Module) -> None:
        """model params init through apply methods: Xavier-uniform for Linear
        weights, zeros for biases, identity-style init for LayerNorm."""
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
        if isinstance(m, nn.Linear) and m.bias is not None:
            nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(
        self,
        x: Tensor,  # x: [batch_size, patch_num, patch_size]
        attn_mask: Optional[Tensor] = None,  # attn_mask: [batch, num_patch]
        return_cls_token: bool = True,  # whether to return the CLS token
    ) -> Tensor:
        """Encode a batch of patched series.

        Parameters
        ----------
        x
            Input patches, shape [batch_size, patch_num, patch_size].
        attn_mask
            Optional per-patch mask of shape [batch, num_patch]; one entry for
            the [CLS] token is prepended before it reaches the layers.
        return_cls_token
            If False, the [CLS] position is stripped from the output.

        Return
        ------
        Tensor
            [batch_size, patch_num + 1, hidden_size], or without the leading
            [CLS] position when return_cls_token is False.
        """
        batch_size = x.size(0)

        # Input patching embedding
        x = self.W_P(x)  # x: [batch_size, patch_num, model_dim]

        # Add the [CLS] token
        cls_token = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat([cls_token, x], dim=1)
        # adjust the attn mask
        # NOTE(review): cls_mask is all True, and the attention module fills
        # True positions of a boolean mask with -inf — that polarity would
        # mask the [CLS] position out. Confirm the mask convention expected
        # by callers before relying on boolean masks here.
        if attn_mask is not None:
            attn_mask = torch.cat(
                [self.cls_mask.expand(batch_size, -1), attn_mask], dim=1
            )

        # Add the positional embedding
        x = self.pe(x)
        x = self.dropout(x)  # x: [batch_size, patch_num, hidden_size]

        for mod in self.layers:
            x = mod(x, attn_mask=attn_mask)

        if not return_cls_token:
            # If not returning the CLS token, remove it from the output
            return x[:, 1:, :]

        return x
204
+
205
+
206
class TSTEncoderLayer(nn.Module):
    """Patch-based Transformer encoder sublayer.

    One standard Transformer block: multi-head self-attention with residual
    connection and normalization, followed by a position-wise feed-forward
    network with residual connection and normalization. Supports both
    pre-norm and post-norm ordering.

    Parameters
    ----------
    hidden_size
        Model dimension; must be divisible by num_heads.
    num_heads
        Number of attention heads.
    d_k, d_v
        Per-head key/value dimensions; default to hidden_size // num_heads.
    d_ff
        Hidden dimension of the feed-forward network.
    store_attn
        Whether to keep the last attention map in ``self.attn``.
    norm
        Any string containing "batch" selects BatchNorm; otherwise LayerNorm.
    attn_dropout
        Dropout on the attention weights.
    dropout
        Dropout on residual branches and projections.
    bias
        Whether the feed-forward Linear layers use a bias term.
    activation
        Feed-forward activation name ("relu" or "gelu").
    pre_norm
        Normalize before (True) or after (False) each sublayer.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        d_k: Optional[int] = None,
        d_v: Optional[int] = None,
        d_ff: int = 256,
        store_attn: bool = False,
        norm: str = "BatchNorm",
        attn_dropout: float = 0.0,
        dropout: float = 0.0,
        bias: bool = True,
        activation: str = "gelu",
        pre_norm: bool = False,
    ) -> None:
        super(TSTEncoderLayer, self).__init__()

        assert (
            not hidden_size % num_heads
        ), f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
        # If not specified, the number of heads is divided
        d_k = hidden_size // num_heads if d_k is None else d_k
        d_v = hidden_size // num_heads if d_v is None else d_v

        # Create the multi-head attention
        self.self_attn = MultiHeadAttention(
            hidden_size,
            num_heads,
            d_k,
            d_v,
            attn_dropout=attn_dropout,
            proj_dropout=dropout,
        )

        # Add & Norm
        self.dropout_attn = nn.Dropout(dropout)
        if "batch" in norm.lower():
            # BatchNorm1d normalizes over the channel dim, so the sequence is
            # transposed [batch, seq, hidden] -> [batch, hidden, seq] and back.
            self.norm_attn = nn.Sequential(
                Transpose(1, 2), nn.BatchNorm1d(hidden_size), Transpose(1, 2)
            )
        else:
            self.norm_attn = nn.LayerNorm(hidden_size)

        # Position-wise Feed-Forward
        self.ff = nn.Sequential(
            nn.Linear(hidden_size, d_ff, bias=bias),
            get_activation_fn(activation),
            nn.Dropout(dropout),
            nn.Linear(d_ff, hidden_size, bias=bias),
        )

        # Add & Norm
        self.dropout_ffn = nn.Dropout(dropout)
        if "batch" in norm.lower():
            self.norm_ffn = nn.Sequential(
                Transpose(1, 2), nn.BatchNorm1d(hidden_size), Transpose(1, 2)
            )
        else:
            self.norm_ffn = nn.LayerNorm(hidden_size)

        # use pre-norm or not
        self.pre_norm = pre_norm
        self.store_attn = store_attn
        # Last attention map, populated only when store_attn is True.
        self.attn = None

    def forward(
        self, src: Tensor, attn_mask: Optional[Tensor] = None
    ) -> Union[Tuple[Tensor, Tensor], Tensor]:
        """Apply attention and feed-forward sublayers to src
        ([batch, seq, hidden]); returns a tensor of the same shape."""

        # Whether to use pre-norm for attention layer
        if self.pre_norm:
            src = self.norm_attn(src)

        # Multi-Head attention (self-attention: q = k = v = src)
        src2, attn = self.self_attn(src, src, src, attn_mask=attn_mask)
        if self.store_attn:
            self.attn = attn

        # Add: residual connection with residual dropout
        src = src + self.dropout_attn(src2)
        if not self.pre_norm:
            src = self.norm_attn(src)

        # Whether to use pre-norm for ffn layer
        if self.pre_norm:
            src = self.norm_ffn(src)

        # Position-wise Feed-Forward
        src2 = self.ff(src)

        # Add: residual connection with residual dropout
        src = src + self.dropout_ffn(src2)
        if not self.pre_norm:
            src = self.norm_ffn(src)

        return src
306
+
307
+
308
class MultiHeadAttention(nn.Module):
    """Multi-head attention mechanism layer.

    Projects queries, keys and values into ``num_heads`` per-head subspaces,
    runs scaled dot-product attention in each head, then concatenates the
    head outputs and projects them back to ``hidden_size``.

    Input shape:
        Q: [batch_size (bs) x max_q_len x hidden_size]
        K, V: [batch_size (bs) x q_len x hidden_size]
        mask: [q_len x q_len]
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        d_k: int = None,
        d_v: int = None,
        attn_dropout: float = 0.0,
        proj_dropout: float = 0.0,
        qkv_bias: bool = True,
    ) -> None:
        super().__init__()
        # Default per-head dimensions: split the hidden size evenly.
        if d_k is None:
            d_k = hidden_size // num_heads
        if d_v is None:
            d_v = hidden_size // num_heads

        self.num_heads = num_heads
        self.d_k = d_k
        self.d_v = d_v

        # Query/key/value projections (all heads fused into one Linear each).
        self.W_Q = nn.Linear(hidden_size, d_k * num_heads, bias=qkv_bias)
        self.W_K = nn.Linear(hidden_size, d_k * num_heads, bias=qkv_bias)
        self.W_V = nn.Linear(hidden_size, d_v * num_heads, bias=qkv_bias)

        # Scaled Dot-Product Attention (multiple heads)
        self.sdp_attn = _ScaledDotProductAttention(
            hidden_size, num_heads, attn_dropout=attn_dropout
        )

        # Output projection back to the model dimension, with dropout.
        self.to_out = nn.Sequential(
            nn.Linear(num_heads * d_v, hidden_size), nn.Dropout(proj_dropout)
        )

    def forward(
        self,
        q: Tensor,
        k: Optional[Tensor] = None,
        v: Optional[Tensor] = None,
        attn_mask: Optional[Tensor] = None,
    ):
        """Return (output, attn_weights); omitted k/v default to q (self-attention)."""
        bs = q.size(0)
        k = q if k is None else k
        v = q if v is None else v

        # Project, split into heads, and move the head axis next to the batch.
        q_s = self.W_Q(q).view(bs, -1, self.num_heads, self.d_k).transpose(1, 2)
        # Keys are laid out pre-transposed ([bs, heads, d_k, len]) so the
        # attention score computation is a plain q @ k matmul downstream.
        k_s = self.W_K(k).view(bs, -1, self.num_heads, self.d_k).permute(0, 2, 3, 1)
        v_s = self.W_V(v).view(bs, -1, self.num_heads, self.d_v).transpose(1, 2)

        # Apply Scaled Dot-Product Attention (multiple heads)
        output, attn_weights = self.sdp_attn(q_s, k_s, v_s, attn_mask=attn_mask)

        # Merge the heads back: [bs, q_len, num_heads * d_v], then project out.
        merged = (
            output.transpose(1, 2).contiguous().view(bs, -1, self.num_heads * self.d_v)
        )
        return self.to_out(merged), attn_weights
375
+
376
+
377
class _ScaledDotProductAttention(nn.Module):
    r"""Scaled dot-product attention (Attention Is All You Need, Vaswani et
    al., 2017), with hooks for residual attention from a previous layer
    (Realformer, He et al., 2020) and locality self-attention (Vision
    Transformer for Small-Size Datasets, Lee et al., 2021)."""

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        attn_dropout: float = 0.0,
        res_attention: bool = False,
    ):
        super().__init__()
        self.attn_dropout = nn.Dropout(attn_dropout)
        self.res_attention = res_attention
        # 1/sqrt(head_dim) scaling, stored as a frozen Parameter so it
        # follows the module across devices/dtypes without being trained.
        head_dim = hidden_size // num_heads
        self.scale = nn.Parameter(torch.tensor(head_dim**-0.5), requires_grad=False)

    def forward(
        self, q: Tensor, k: Tensor, v: Tensor, attn_mask: Optional[Tensor] = None
    ) -> Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]:
        """
        :param q: [batch_size, num_heads, num_token, d_k]
        :param k: [batch_size, num_heads, d_k, num_token]
        :param v: [batch_size, num_heads, num_token, d_k]
        :param attn_mask: [batch_size, num_heads, num_token]
        """
        # Pairwise similarity scores, scaled by 1/sqrt(d_k). Note k arrives
        # pre-transposed, so a plain matmul suffices.
        attn_scores = torch.matmul(q, k) * self.scale

        if attn_mask is not None:
            # Lift the per-token mask to a pairwise [b, 1, i, j] mask via an
            # outer product (broadcasting equivalent of the einops rearrange).
            attn_mask = attn_mask[:, None, :, None] * attn_mask[:, None, None, :]
            if attn_mask.dtype == torch.bool:
                # NOTE(review): True positions receive -inf, i.e. True means
                # "masked out" here — confirm callers use that polarity.
                attn_scores.masked_fill_(attn_mask, float("-inf"))
            else:
                # Additive mask: added directly to the raw scores.
                attn_scores += attn_mask

        # Normalize to attention weights, then apply attention dropout.
        attn_weights = self.attn_dropout(F.softmax(attn_scores, dim=-1))

        # Weighted sum of the values.
        return torch.matmul(attn_weights, v), attn_weights
model.py CHANGED
@@ -1,142 +1,212 @@
1
- from typing import Tuple
2
-
3
- import torch
4
- import torch.nn as nn
5
- from torch import Tensor
6
- from torch.nn import functional as F
7
- from einops import rearrange, repeat
8
- from transformers.modeling_utils import PreTrainedModel
9
-
10
- from configuration_symtime import SymTimeConfig
11
- from layers import MultiHeadAttention, TSTEncoder, TSTEncoderLayer
12
-
13
-
14
- class SymTimeModel(PreTrainedModel):
15
- """
16
- SymTime Model for Huggingface.
17
-
18
- Parameters
19
- ----------
20
- config: SymTimeConfig
21
- The configuration of the SymTime model.
22
-
23
- Attributes
24
- ----------
25
- config: SymTimeConfig
26
- The configuration of the SymTime model.
27
- encoder: TSTEncoder
28
- The encoder of the SymTime model.
29
-
30
- Methods
31
- -------
32
- forward(x: Tensor) -> Tuple[Tensor, Tensor]:
33
- Forward pass of the SymTime model.
34
-
35
- _init_weights(module: nn.Module) -> None:
36
- Initialize weights for the SymTime encoder stack.
37
- """
38
-
39
- config_class = SymTimeConfig
40
-
41
- def __init__(self, config: SymTimeConfig):
42
- super().__init__(config)
43
- self.config = config
44
- self.encoder = TSTEncoder(
45
- patch_size=config.patch_size,
46
- num_layers=config.num_layers,
47
- hidden_size=config.d_model,
48
- num_heads=config.num_heads,
49
- d_ff=config.d_ff,
50
- norm=config.norm,
51
- attn_dropout=config.dropout,
52
- dropout=config.dropout,
53
- act=config.act,
54
- pre_norm=config.pre_norm,
55
- )
56
-
57
- # Initialize weights and apply final processing
58
- self.post_init()
59
-
60
- def _init_weights(self, module) -> None:
61
- """Initialize weights for the SymTime encoder stack.
62
-
63
- The model is built on top of Hugging Face `PreTrainedModel`, so this method
64
- is called recursively via `post_init()`. We keep the initialization aligned
65
- with the current backbone structure in `layers.py`:
66
-
67
- - `TSTEncoder.W_P`: patch projection linear layer
68
- - `TSTEncoder.cls_token`: learnable CLS token
69
- - `TSTEncoderLayer.self_attn`: Q/K/V and output projections
70
- - `TSTEncoderLayer.ff`: feed-forward linear layers
71
- - `LayerNorm` / `BatchNorm1d`: normalization layers
72
- """
73
- super()._init_weights(module)
74
-
75
- factor = self.config.initializer_factor
76
- d_model = self.config.d_model
77
- num_heads = self.config.num_heads
78
- d_k = d_model // num_heads
79
- d_v = d_k
80
-
81
- if isinstance(module, nn.Linear):
82
- nn.init.normal_(
83
- module.weight, mean=0.0, std=factor * (module.in_features**-0.5)
84
- )
85
- if module.bias is not None:
86
- nn.init.zeros_(module.bias)
87
-
88
- elif isinstance(module, nn.LayerNorm):
89
- nn.init.ones_(module.weight)
90
- nn.init.zeros_(module.bias)
91
-
92
- elif isinstance(module, nn.BatchNorm1d):
93
- if module.weight is not None:
94
- nn.init.ones_(module.weight)
95
- if module.bias is not None:
96
- nn.init.zeros_(module.bias)
97
-
98
- elif isinstance(module, TSTEncoder):
99
- if hasattr(module, "cls_token") and module.cls_token is not None:
100
- nn.init.normal_(module.cls_token, mean=0.0, std=factor)
101
- if hasattr(module, "W_P") and isinstance(module.W_P, nn.Linear):
102
- nn.init.normal_(
103
- module.W_P.weight,
104
- mean=0.0,
105
- std=factor * (module.W_P.in_features**-0.5),
106
- )
107
- if module.W_P.bias is not None:
108
- nn.init.zeros_(module.W_P.bias)
109
-
110
- elif isinstance(module, MultiHeadAttention):
111
- nn.init.normal_(module.W_Q.weight, mean=0.0, std=factor * (d_model**-0.5))
112
- nn.init.normal_(module.W_K.weight, mean=0.0, std=factor * (d_model**-0.5))
113
- nn.init.normal_(module.W_V.weight, mean=0.0, std=factor * (d_model**-0.5))
114
- if module.W_Q.bias is not None:
115
- nn.init.zeros_(module.W_Q.bias)
116
- if module.W_K.bias is not None:
117
- nn.init.zeros_(module.W_K.bias)
118
- if module.W_V.bias is not None:
119
- nn.init.zeros_(module.W_V.bias)
120
-
121
- out_proj = module.to_out[0]
122
- nn.init.normal_(
123
- out_proj.weight, mean=0.0, std=factor * ((num_heads * d_v) ** -0.5)
124
- )
125
- if out_proj.bias is not None:
126
- nn.init.zeros_(out_proj.bias)
127
-
128
- elif isinstance(module, TSTEncoderLayer):
129
- for submodule in module.ff:
130
- if isinstance(submodule, nn.Linear):
131
- nn.init.normal_(
132
- submodule.weight,
133
- mean=0.0,
134
- std=factor * (submodule.in_features**-0.5),
135
- )
136
- if submodule.bias is not None:
137
- nn.init.zeros_(submodule.bias)
138
-
139
- def forward(
140
- self, x: Tensor, return_cls_token: bool = True
141
- ) -> Tuple[Tensor, Tensor]:
142
- return self.encoder(x, return_cls_token=return_cls_token)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch import Tensor
6
+ from torch.nn import functional as F
7
+ from einops import rearrange, repeat
8
+ from transformers.modeling_utils import PreTrainedModel
9
+
10
+ from configuration_symtime import SymTimeConfig
11
+ from layers import MultiHeadAttention, TSTEncoder, TSTEncoderLayer
12
+
13
+
14
class SymTimeModel(PreTrainedModel):
    """
    SymTime Model for Huggingface.

    Parameters
    ----------
    config: SymTimeConfig
        The configuration of the SymTime model.

    Attributes
    ----------
    config: SymTimeConfig
        The configuration of the SymTime model.
    encoder: TSTEncoder
        The encoder of the SymTime model.

    Methods
    -------
    forward(x: Tensor) -> Tuple[Tensor, Tensor]:
        Forward pass of the SymTime model.

    patching(time_series: Tensor) -> Tensor:
        Split a raw time series into (possibly overlapping) patches.

    _init_weights(module: nn.Module) -> None:
        Initialize weights for the SymTime encoder stack.
    """

    config_class = SymTimeConfig

    def __init__(self, config: SymTimeConfig):
        super().__init__(config)
        self.config = config

        self.patch_size = config.patch_size
        # Configs saved before the `stride` field was introduced do not carry
        # it; fall back to non-overlapping patches (stride == patch_size).
        self.stride = getattr(config, "stride", config.patch_size)

        # Right-side replication padding used by `patching` when the sliding
        # window would otherwise drop the tail of the sequence.
        self.padding_patch_layer = nn.ReplicationPad1d((0, self.stride))
        self.encoder = TSTEncoder(
            patch_size=config.patch_size,
            num_layers=config.num_layers,
            hidden_size=config.d_model,
            num_heads=config.num_heads,
            d_ff=config.d_ff,
            norm=config.norm,
            attn_dropout=config.dropout,
            dropout=config.dropout,
            act=config.act,
            pre_norm=config.pre_norm,
        )

        # Initialize weights and apply final processing
        self.post_init()

    def _init_weights(self, module) -> None:
        """Initialize weights for the SymTime encoder stack.

        The model is built on top of Hugging Face `PreTrainedModel`, so this method
        is called recursively via `post_init()`. We keep the initialization aligned
        with the current backbone structure in `layers.py`:

        - `TSTEncoder.W_P`: patch projection linear layer
        - `TSTEncoder.cls_token`: learnable CLS token
        - `TSTEncoderLayer.self_attn`: Q/K/V and output projections
        - `TSTEncoderLayer.ff`: feed-forward linear layers
        - `LayerNorm` / `BatchNorm1d`: normalization layers
        """
        super()._init_weights(module)

        factor = self.config.initializer_factor
        d_model = self.config.d_model
        num_heads = self.config.num_heads
        d_k = d_model // num_heads
        d_v = d_k

        if isinstance(module, nn.Linear):
            # Scale the std by fan-in so deeper projections start smaller.
            nn.init.normal_(
                module.weight, mean=0.0, std=factor * (module.in_features**-0.5)
            )
            if module.bias is not None:
                nn.init.zeros_(module.bias)

        elif isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)

        elif isinstance(module, nn.BatchNorm1d):
            # Affine parameters are optional on BatchNorm1d.
            if module.weight is not None:
                nn.init.ones_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)

        elif isinstance(module, TSTEncoder):
            if hasattr(module, "cls_token") and module.cls_token is not None:
                nn.init.normal_(module.cls_token, mean=0.0, std=factor)
            if hasattr(module, "W_P") and isinstance(module.W_P, nn.Linear):
                nn.init.normal_(
                    module.W_P.weight,
                    mean=0.0,
                    std=factor * (module.W_P.in_features**-0.5),
                )
                if module.W_P.bias is not None:
                    nn.init.zeros_(module.W_P.bias)

        elif isinstance(module, MultiHeadAttention):
            nn.init.normal_(module.W_Q.weight, mean=0.0, std=factor * (d_model**-0.5))
            nn.init.normal_(module.W_K.weight, mean=0.0, std=factor * (d_model**-0.5))
            nn.init.normal_(module.W_V.weight, mean=0.0, std=factor * (d_model**-0.5))
            if module.W_Q.bias is not None:
                nn.init.zeros_(module.W_Q.bias)
            if module.W_K.bias is not None:
                nn.init.zeros_(module.W_K.bias)
            if module.W_V.bias is not None:
                nn.init.zeros_(module.W_V.bias)

            out_proj = module.to_out[0]
            nn.init.normal_(
                out_proj.weight, mean=0.0, std=factor * ((num_heads * d_v) ** -0.5)
            )
            if out_proj.bias is not None:
                nn.init.zeros_(out_proj.bias)

        elif isinstance(module, TSTEncoderLayer):
            for submodule in module.ff:
                if isinstance(submodule, nn.Linear):
                    nn.init.normal_(
                        submodule.weight,
                        mean=0.0,
                        std=factor * (submodule.in_features**-0.5),
                    )
                    if submodule.bias is not None:
                        nn.init.zeros_(submodule.bias)

    def patching(self, time_series: torch.Tensor) -> torch.Tensor:
        """Split a raw 1D time series into overlapping or non-overlapping patches.

        The encoder does not operate directly on the full sequence. Instead, it
        first converts the input into a sequence of local windows, where each
        window has length ``self.patch_size`` and consecutive windows are shifted
        by ``self.stride``. This patch-based representation reduces the temporal
        resolution while preserving local patterns that are useful for attention
        layers.

        If the sequence length is not compatible with the patch size and stride,
        we pad the sequence on the right using replication padding so that the
        final patch extraction remains well-defined and no trailing samples are
        silently dropped by ``unfold``.
        """

        # Only the temporal length matters for the padding decision.
        seq_length = time_series.size(-1)

        # `unfold` keeps only full windows, so the tail is lost whenever the
        # last window cannot land exactly on the end of the sequence. Pad with
        # replicated boundary values in that case. When patch_size == stride
        # (the default config) this reduces to `seq_length % patch_size != 0`,
        # i.e. the previous behaviour is preserved.
        if (seq_length - self.patch_size) % self.stride != 0:
            time_series = self.padding_patch_layer(time_series)

        # Slide a window of length patch_size with step stride along the last
        # dimension; the result is consumed by the transformer encoder.
        return time_series.unfold(dimension=-1, size=self.patch_size, step=self.stride)

    def forward(
        self, x: Tensor, return_cls_token: bool = True
    ) -> Tuple[Tensor, Tensor]:
        """Run the full SymTime inference pipeline.

        The forward pass expects a 2D tensor of shape ``[batch_size, seq_length]``
        containing a batch of univariate time series. The input is first converted
        into patch embeddings through :meth:`patching`, and the resulting patch
        sequence is then passed into the transformer encoder.

        Parameters
        ----------
        x : Tensor
            Batched input time series with shape ``[batch_size, seq_length]``.
        return_cls_token : bool, optional
            If ``True``, the encoder also returns the learned CLS token output
            alongside the patch-level representations. This is useful when the
            downstream task needs a global sequence summary.

        Returns
        -------
        Tuple[Tensor, Tensor]
            The encoded patch sequence and, optionally, the CLS token output.

        Raises
        ------
        ValueError
            If ``x`` is not a 2D tensor.
        """

        # Explicit validation instead of `assert`: asserts are stripped when
        # Python runs with -O, which would silently skip this check.
        if x.dim() != 2:
            raise ValueError(
                "Input time series must be a 2D tensor with shape of [batch_size, seq_length]."
            )

        # Convert the raw signal into a patch-based representation before encoding.
        time_series = self.patching(x)

        # Feed the patch sequence into the transformer encoder and return its output.
        return self.encoder(time_series, return_cls_token=return_cls_token)