whenxuan committed on
Commit
62d1028
·
verified ·
1 Parent(s): 7c4d438

whenxuan: add the patching for time series

Browse files
Files changed (4) hide show
  1. config.json +24 -23
  2. configuration_symtime.py +66 -64
  3. layers.py +427 -401
  4. model.py +212 -142
config.json CHANGED
@@ -1,24 +1,25 @@
1
-
2
- {
3
- "architectures": [
4
- "SymTimeModel"
5
- ],
6
- "_name_or_path": "FlowVortex/SymTime",
7
- "auto_map": {
8
- "AutoConfig": "configuration_symtime.SymTimeConfig",
9
- "AutoModel": "model.SymTimeModel"
10
- },
11
- "patch_size": 16,
12
- "num_layers": 6,
13
- "d_model": 512,
14
- "d_ff": 2048,
15
- "num_heads": 8,
16
- "norm": "BatchNorm",
17
- "dropout": 0.1,
18
- "act": "gelu",
19
- "pre_norm": false,
20
- "initializer_factor": 0.05,
21
- "model_type": "symtime",
22
- "torch_dtype": "float32",
23
- "transformers_version": "5.5.4"
 
24
  }
 
1
+
2
+ {
3
+ "architectures": [
4
+ "SymTimeModel"
5
+ ],
6
+ "_name_or_path": "FlowVortex/SymTime",
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_symtime.SymTimeConfig",
9
+ "AutoModel": "model.SymTimeModel"
10
+ },
11
+ "patch_size": 16,
12
+ "stride": 16,
13
+ "num_layers": 6,
14
+ "d_model": 512,
15
+ "d_ff": 2048,
16
+ "num_heads": 8,
17
+ "norm": "BatchNorm",
18
+ "dropout": 0.1,
19
+ "act": "gelu",
20
+ "pre_norm": false,
21
+ "initializer_factor": 0.05,
22
+ "model_type": "symtime",
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "5.5.4"
25
  }
configuration_symtime.py CHANGED
@@ -1,64 +1,66 @@
1
- from dataclasses import dataclass
2
- from typing import List, Literal, Optional, Dict
3
- from enum import Enum
4
-
5
- from transformers.configuration_utils import PretrainedConfig
6
-
7
-
8
- @dataclass
9
- class SymTimeConfig(PretrainedConfig):
10
- """
11
- Time series encoder configuration for SymTime Model.
12
-
13
- Parameters
14
- -----------
15
- patch_size
16
- The size of the patch to be used for the input data.
17
- num_layers
18
- The number of layers to be used for the encoder.
19
- d_model
20
- The dimension of the model.
21
- d_ff
22
- The dimension of the feedforward network.
23
- num_heads
24
- The number of heads to be used for the attention mechanism.
25
- norm
26
- The normalization to be used for the encoder.
27
- attn_dropout
28
- The dropout rate to be used for the attention mechanism.
29
- dropout
30
- The dropout rate to be used for the encoder.
31
- act
32
- The activation function to be used for the encoder.
33
- pre_norm
34
- Whether to use pre-norm for the encoder.
35
- """
36
-
37
- model_type = "symtime"
38
-
39
- def __init__(
40
- self,
41
- patch_size: int = 16,
42
- num_layers: int = 6,
43
- d_model: int = 512,
44
- d_ff: int = 2048,
45
- num_heads: int = 8,
46
- norm: str = "BatchNorm",
47
- dropout: float = 0.1,
48
- act: str = "gelu",
49
- pre_norm: bool = False,
50
- initializer_factor: float = 0.05,
51
- **kwargs,
52
- ) -> None:
53
- self.patch_size = patch_size
54
- self.num_layers = num_layers
55
- self.d_model = d_model
56
- self.num_heads = num_heads
57
- self.d_ff = d_ff
58
- self.norm = norm
59
- self.dropout = dropout
60
- self.act = act
61
- self.pre_norm = pre_norm
62
- self.initializer_factor = initializer_factor
63
-
64
- super().__init__(**kwargs)
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+
5
+
6
# NOTE: intentionally NOT decorated with @dataclass. This class declares no
# dataclass fields (``model_type`` is unannotated and ``__init__`` is written
# by hand), so a field-less @dataclass would inject generated ``__eq__`` and
# ``__repr__`` that compare/print zero fields — making any two SymTimeConfig
# instances spuriously equal and hiding every setting from the repr.
class SymTimeConfig(PretrainedConfig):
    """
    Time series encoder configuration for SymTime Model.

    Parameters
    -----------
    num_layers
        The number of layers to be used for the encoder.
    d_model
        The dimension of the model.
    d_ff
        The dimension of the feedforward network.
    num_heads
        The number of heads to be used for the attention mechanism.
    norm
        The normalization to be used for the encoder.
    dropout
        The dropout rate to be used for the encoder.
    act
        The activation function to be used for the encoder.
    pre_norm
        Whether to use pre-norm for the encoder.
    patch_size
        The size of the patch to be used for the input data.
    stride
        The stride of the patch to be used for the input data.
    initializer_factor
        The scaling factor used when initializing model weights.
    """

    model_type = "symtime"

    def __init__(
        self,
        num_layers: int = 6,
        d_model: int = 512,
        d_ff: int = 2048,
        num_heads: int = 8,
        norm: str = "BatchNorm",
        dropout: float = 0.1,
        act: str = "gelu",
        pre_norm: bool = False,
        patch_size: int = 16,
        stride: int = 16,
        initializer_factor: float = 0.05,
        **kwargs,
    ) -> None:
        self.patch_size = patch_size
        self.stride = stride
        self.num_layers = num_layers
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_ff = d_ff
        self.norm = norm
        self.dropout = dropout
        self.act = act
        self.pre_norm = pre_norm
        self.initializer_factor = initializer_factor

        # Forward any remaining keyword arguments (architectures, auto_map,
        # torch_dtype, ...) to the PretrainedConfig base class.
        super().__init__(**kwargs)
layers.py CHANGED
@@ -1,401 +1,427 @@
1
- from typing import Optional, Union, Tuple, Callable
2
- import math
3
-
4
- import numpy as np
5
- import torch
6
- from torch import nn
7
- from torch import Tensor
8
- import torch.nn.functional as F
9
- from einops import rearrange
10
-
11
-
12
- def get_activation_fn(activation: Union[str, Callable]) -> nn.Module:
13
- """Select the activation function to use."""
14
- if callable(activation):
15
- return activation()
16
- elif activation.lower() == "relu":
17
- return nn.ReLU()
18
- elif activation.lower() == "gelu":
19
- return nn.GELU()
20
- raise ValueError(
21
- f'{activation} is not available. You can use "relu", "gelu", or a callable'
22
- )
23
-
24
-
25
- class Transpose(nn.Module):
26
- """Transpose the dimensions of the input tensor"""
27
-
28
- def __init__(self, *dims, contiguous=False) -> None:
29
- super().__init__()
30
- self.dims, self.contiguous = dims, contiguous
31
-
32
- def forward(self, x: Tensor) -> Tensor:
33
- if self.contiguous:
34
- return x.transpose(*self.dims).contiguous()
35
- else:
36
- return x.transpose(*self.dims)
37
-
38
-
39
- class PositionalEmbedding(nn.Module):
40
- """Adding the positional encoding to the input for Transformer"""
41
-
42
- def __init__(self, hidden_size: int, max_len: int = 5000) -> None:
43
- super(PositionalEmbedding, self).__init__()
44
-
45
- # Calculate the positional encoding once in the logarithmic space.
46
- pe = torch.zeros(
47
- max_len, hidden_size
48
- ).float() # Initialize a tensor of zeros with shape (max_len, hidden_size) to store positional encodings
49
- pe.requires_grad = (
50
- False # Positional encodings do not require gradients as they are fixed
51
- )
52
-
53
- position = (
54
- torch.arange(0, max_len).float().unsqueeze(1)
55
- ) # Generate a sequence from 0 to max_len-1 and add a dimension at the 1st axis
56
- div_term = (
57
- torch.arange(0, hidden_size, 2).float() * -(math.log(10000.0) / hidden_size)
58
- ).exp() # Calculate the divisor term in the positional encoding formula
59
-
60
- pe[:, 0::2] = torch.sin(
61
- position * div_term
62
- ) # Apply the sine function to the even columns of the positional encoding matrix
63
- pe[:, 1::2] = torch.cos(
64
- position * div_term
65
- ) # Apply the cosine function to the odd columns of the positional encoding matrix
66
-
67
- pe = pe.unsqueeze(
68
- 0
69
- ) # Add a batch dimension, changing the shape to (1, max_len, hidden_size)
70
- self.register_buffer(
71
- "pe", pe
72
- ) # Register the positional encodings as a buffer, which will not be updated as model parameters
73
-
74
- def forward(self, x: Tensor) -> Tensor:
75
- # Return the first max_len positional encodings that match the length of input x
76
- return x + self.pe[:, : x.size(1)]
77
-
78
-
79
- class TSTEncoder(nn.Module):
80
- """Time series encoder backbone of SymTime"""
81
-
82
- def __init__(
83
- self,
84
- patch_size: int = 16,
85
- num_layers: int = 3,
86
- hidden_size: int = 128,
87
- num_heads: int = 16,
88
- d_k: int = None,
89
- d_v: int = None,
90
- d_ff: int = 256,
91
- norm: str = "BatchNorm",
92
- attn_dropout: float = 0.0,
93
- dropout: float = 0.0,
94
- act: str = "gelu",
95
- store_attn: bool = False,
96
- pre_norm: bool = False,
97
- ) -> None:
98
- super().__init__()
99
- # The Linear layer to project the input patches to the model dimension
100
- self.W_P = nn.Linear(patch_size, hidden_size)
101
-
102
- # Positional encoding
103
- self.pe = PositionalEmbedding(hidden_size=hidden_size)
104
-
105
- # Residual dropout
106
- self.dropout = nn.Dropout(dropout)
107
-
108
- # Create the [CLS] token
109
- self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size))
110
- self.cls_mask = nn.Parameter(torch.ones(1, 1).bool(), requires_grad=False)
111
-
112
- # Create the encoder layer of the model backbone
113
- self.layers = nn.ModuleList(
114
- [
115
- TSTEncoderLayer(
116
- hidden_size=hidden_size,
117
- num_heads=num_heads,
118
- d_k=d_k,
119
- d_v=d_v,
120
- d_ff=d_ff,
121
- norm=norm,
122
- attn_dropout=attn_dropout,
123
- dropout=dropout,
124
- activation=act,
125
- pre_norm=pre_norm,
126
- store_attn=store_attn,
127
- )
128
- for _ in range(num_layers)
129
- ]
130
- )
131
-
132
- # model params init
133
- self.apply(self._init_weights)
134
-
135
- def _init_weights(self, m: nn.Module) -> None:
136
- """model params init through apply methods"""
137
- if isinstance(m, nn.Linear):
138
- nn.init.xavier_uniform_(m.weight)
139
- if isinstance(m, nn.Linear) and m.bias is not None:
140
- nn.init.constant_(m.bias, 0)
141
- elif isinstance(m, nn.LayerNorm):
142
- nn.init.constant_(m.bias, 0)
143
- nn.init.constant_(m.weight, 1.0)
144
-
145
- def forward(
146
- self,
147
- x: Tensor, # x: [batch_size, patch_num, patch_size]
148
- attn_mask: Optional[Tensor] = None, # attn_mask: [batch, num_patch]
149
- return_cls_token: bool = True, # whether to return the CLS token
150
- ) -> Tensor:
151
- """ """
152
- batch_size = x.size(0)
153
-
154
- # Input patching embedding
155
- x = self.W_P(x) # x: [batch_size, patch_num, model_dim]
156
-
157
- # Add the [CLS] token
158
- cls_token = self.cls_token.expand(batch_size, -1, -1)
159
- x = torch.cat([cls_token, x], dim=1)
160
- # adjust the attn mask
161
- if attn_mask is not None:
162
- attn_mask = torch.cat(
163
- [self.cls_mask.expand(batch_size, -1), attn_mask], dim=1
164
- )
165
-
166
- # Add the positional embedding
167
- x = self.pe(x)
168
- x = self.dropout(x) # x: [batch_size, patch_num, hidden_size]
169
-
170
- for mod in self.layers:
171
- x = mod(x, attn_mask=attn_mask)
172
-
173
- if not return_cls_token:
174
- # If not returning the CLS token, remove it from the output
175
- return x[:, 1:, :]
176
-
177
- return x
178
-
179
-
180
- class TSTEncoderLayer(nn.Module):
181
- """Patch-based Transformer module sublayer"""
182
-
183
- def __init__(
184
- self,
185
- hidden_size: int,
186
- num_heads: int,
187
- d_k: int = None,
188
- d_v: int = None,
189
- d_ff: int = 256,
190
- store_attn: int = False,
191
- norm: str = "BatchNorm",
192
- attn_dropout: float = 0.0,
193
- dropout: float = 0.0,
194
- bias: bool = True,
195
- activation: str = "gelu",
196
- pre_norm: bool = False,
197
- ) -> None:
198
- super(TSTEncoderLayer, self).__init__()
199
-
200
- assert (
201
- not hidden_size % num_heads
202
- ), f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
203
- # If not specified, the number of heads is divided
204
- d_k = hidden_size // num_heads if d_k is None else d_k
205
- d_v = hidden_size // num_heads if d_v is None else d_v
206
-
207
- # Create the multi-head attention
208
- self.self_attn = MultiHeadAttention(
209
- hidden_size,
210
- num_heads,
211
- d_k,
212
- d_v,
213
- attn_dropout=attn_dropout,
214
- proj_dropout=dropout,
215
- )
216
-
217
- # Add & Norm
218
- self.dropout_attn = nn.Dropout(dropout)
219
- if "batch" in norm.lower():
220
- self.norm_attn = nn.Sequential(
221
- Transpose(1, 2), nn.BatchNorm1d(hidden_size), Transpose(1, 2)
222
- )
223
- else:
224
- self.norm_attn = nn.LayerNorm(hidden_size)
225
-
226
- # Position-wise Feed-Forward
227
- self.ff = nn.Sequential(
228
- nn.Linear(hidden_size, d_ff, bias=bias),
229
- get_activation_fn(activation),
230
- nn.Dropout(dropout),
231
- nn.Linear(d_ff, hidden_size, bias=bias),
232
- )
233
-
234
- # Add & Norm
235
- self.dropout_ffn = nn.Dropout(dropout)
236
- if "batch" in norm.lower():
237
- self.norm_ffn = nn.Sequential(
238
- Transpose(1, 2), nn.BatchNorm1d(hidden_size), Transpose(1, 2)
239
- )
240
- else:
241
- self.norm_ffn = nn.LayerNorm(hidden_size)
242
-
243
- # use pre-norm or not
244
- self.pre_norm = pre_norm
245
- self.store_attn = store_attn
246
- self.attn = None
247
-
248
- def forward(
249
- self, src: Tensor, attn_mask: Optional[Tensor] = None
250
- ) -> Union[Tuple[Tensor, Tensor], Tensor]:
251
- """Multi-Head attention sublayer"""
252
-
253
- # Whether to use pre-norm for attention layer
254
- if self.pre_norm:
255
- src = self.norm_attn(src)
256
-
257
- # Multi-Head attention
258
- src2, attn = self.self_attn(src, src, src, attn_mask=attn_mask)
259
- if self.store_attn:
260
- self.attn = attn
261
-
262
- # Add: residual connection with residual dropout
263
- src = src + self.dropout_attn(src2)
264
- if not self.pre_norm:
265
- src = self.norm_attn(src)
266
-
267
- # Whether to use pre-norm for ffn layer
268
- if self.pre_norm:
269
- src = self.norm_ffn(src)
270
-
271
- # Position-wise Feed-Forward
272
- src2 = self.ff(src)
273
-
274
- # Add: residual connection with residual dropout
275
- src = src + self.dropout_ffn(src2)
276
- if not self.pre_norm:
277
- src = self.norm_ffn(src)
278
-
279
- return src
280
-
281
-
282
- class MultiHeadAttention(nn.Module):
283
- """Multi-head attention mechanism layer"""
284
-
285
- def __init__(
286
- self,
287
- hidden_size: int,
288
- num_heads: int,
289
- d_k: int = None,
290
- d_v: int = None,
291
- attn_dropout: float = 0.0,
292
- proj_dropout: float = 0.0,
293
- qkv_bias: bool = True,
294
- ) -> None:
295
- """Multi Head Attention Layer
296
- Input shape:
297
- Q: [batch_size (bs) x max_q_len x hidden_size]
298
- K, V: [batch_size (bs) x q_len x hidden_size]
299
- mask: [q_len x q_len]
300
- """
301
- super().__init__()
302
- d_k = hidden_size // num_heads if d_k is None else d_k
303
- d_v = hidden_size // num_heads if d_v is None else d_v
304
-
305
- self.num_heads, self.d_k, self.d_v = num_heads, d_k, d_v
306
-
307
- self.W_Q = nn.Linear(hidden_size, d_k * num_heads, bias=qkv_bias)
308
- self.W_K = nn.Linear(hidden_size, d_k * num_heads, bias=qkv_bias)
309
- self.W_V = nn.Linear(hidden_size, d_v * num_heads, bias=qkv_bias)
310
-
311
- # Scaled Dot-Product Attention (multiple heads)
312
- self.sdp_attn = _ScaledDotProductAttention(
313
- hidden_size, num_heads, attn_dropout=attn_dropout
314
- )
315
-
316
- # Project output
317
- self.to_out = nn.Sequential(
318
- nn.Linear(num_heads * d_v, hidden_size), nn.Dropout(proj_dropout)
319
- )
320
-
321
- def forward(
322
- self,
323
- q: Tensor,
324
- k: Optional[Tensor] = None,
325
- v: Optional[Tensor] = None,
326
- attn_mask: Optional[Tensor] = None,
327
- ):
328
- bs = q.size(0)
329
- if k is None:
330
- k = q
331
- if v is None:
332
- v = q
333
-
334
- # Linear (+ split in multiple heads)
335
- q_s = self.W_Q(q).view(bs, -1, self.num_heads, self.d_k).transpose(1, 2)
336
- k_s = self.W_K(k).view(bs, -1, self.num_heads, self.d_k).permute(0, 2, 3, 1)
337
- v_s = self.W_V(v).view(bs, -1, self.num_heads, self.d_v).transpose(1, 2)
338
-
339
- # Apply Scaled Dot-Product Attention (multiple heads)
340
- output, attn_weights = self.sdp_attn(q_s, k_s, v_s, attn_mask=attn_mask)
341
-
342
- # back to the original inputs dimensions
343
- output = (
344
- output.transpose(1, 2).contiguous().view(bs, -1, self.num_heads * self.d_v)
345
- )
346
- output = self.to_out(output)
347
-
348
- return output, attn_weights
349
-
350
-
351
- class _ScaledDotProductAttention(nn.Module):
352
- r"""Scaled Dot-Product Attention module (Attention is all you need by Vaswani et al., 2017) with optional residual attention from previous layer
353
- (Realformer: Transformer likes residual attention by He et al, 2020) and locality self sttention (Vision Transformer for Small-Size Datasets
354
- by Lee et al, 2021)"""
355
-
356
- def __init__(
357
- self,
358
- hidden_size: int,
359
- num_heads: int,
360
- attn_dropout: float = 0.0,
361
- res_attention: bool = False,
362
- ):
363
- super().__init__()
364
- self.attn_dropout = nn.Dropout(attn_dropout)
365
- self.res_attention = res_attention
366
- head_dim = hidden_size // num_heads
367
- self.scale = nn.Parameter(torch.tensor(head_dim**-0.5), requires_grad=False)
368
-
369
- def forward(
370
- self, q: Tensor, k: Tensor, v: Tensor, attn_mask: Optional[Tensor] = None
371
- ) -> Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]:
372
- """
373
- :param q: [batch_size, num_heads, num_token, d_k]
374
- :param k: [batch_size, num_heads, d_k, num_token]
375
- :param v: [batch_size, num_heads, num_token, d_k]
376
- :param attn_mask: [batch_size, num_heads, num_token]
377
- """
378
-
379
- # Scaled MatMul (q, k) - similarity scores for all pairs of positions in an input sequence
380
- attn_scores = torch.matmul(q, k) * self.scale
381
-
382
- # Attention mask (optional)
383
- if (
384
- attn_mask is not None
385
- ): # attn_mask with shape [q_len x seq_len] - only used when q_len == seq_len
386
- attn_mask = rearrange(attn_mask, "b i -> b 1 i 1") * rearrange(
387
- attn_mask, "b i -> b 1 1 i"
388
- )
389
- if attn_mask.dtype == torch.bool:
390
- attn_scores.masked_fill_(attn_mask, -np.inf)
391
- else:
392
- attn_scores += attn_mask
393
-
394
- # normalize the attention weights
395
- attn_weights = F.softmax(attn_scores, dim=-1)
396
- attn_weights = self.attn_dropout(attn_weights)
397
-
398
- # compute the new values given the attention weights
399
- output = torch.matmul(attn_weights, v)
400
-
401
- return output, attn_weights
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union, Tuple, Callable
2
+ import math
3
+
4
+ import numpy as np
5
+ import torch
6
+ from torch import nn
7
+ from torch import Tensor
8
+ import torch.nn.functional as F
9
+ from einops import rearrange
10
+
11
+
12
def get_activation_fn(activation: Union[str, Callable]) -> nn.Module:
    """
    Resolve an activation specification to a freshly built activation module.

    Parameters
    ----------
    activation : Union[str, Callable]
        Either the case-insensitive name of a supported activation
        ("relu" or "gelu"), or a zero-argument callable that constructs
        the activation module.

    Return
    ------
    nn.Module
        The corresponding activation module instance.

    Raises
    ------
    ValueError
        If the string does not name a supported activation.
    """
    # A callable is treated as a factory and invoked directly.
    if callable(activation):
        return activation()

    name = activation.lower()
    if name == "relu":
        return nn.ReLU()
    if name == "gelu":
        return nn.GELU()

    raise ValueError(
        f'{activation} is not available. You can use "relu", "gelu", or a callable'
    )
36
+
37
+
38
class Transpose(nn.Module):
    """Swap two dimensions of the input tensor.

    Parameters
    ----------
    *dims : int
        The dimensions passed to `torch.Tensor.transpose`.
    contiguous : bool, optional
        Whether to return a contiguous tensor after transposing, by default False.

    Return
    ------
    Tensor
        The transposed tensor.
    """

    def __init__(self, *dims, contiguous=False) -> None:
        super().__init__()
        self.dims = dims
        self.contiguous = contiguous

    def forward(self, x: Tensor) -> Tensor:
        out = x.transpose(*self.dims)
        # Optionally force a contiguous memory layout (needed e.g. before view()).
        return out.contiguous() if self.contiguous else out
63
+
64
+
65
class PositionalEmbedding(nn.Module):
    """Add fixed sinusoidal positional encodings to the input.

    Implements the sine/cosine scheme of "Attention Is All You Need":
    even channels carry sin(pos * w_i), odd channels the matching cos(pos * w_i),
    with frequencies w_i decaying geometrically from 1 down to ~1/10000.

    Parameters
    ----------
    hidden_size : int
        Channel dimension of the encoding; must equal the input's last dim.
    max_len : int, optional
        Maximum sequence length precomputed in the table, by default 5000.
    """

    def __init__(self, hidden_size: int, max_len: int = 5000) -> None:
        super(PositionalEmbedding, self).__init__()

        # Positions 0..max_len-1 as a column vector: (max_len, 1).
        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        # Per-channel frequencies, computed in log space for stability.
        div_term = torch.exp(
            torch.arange(0, hidden_size, 2, dtype=torch.float)
            * -(math.log(10000.0) / hidden_size)
        )

        # Build the full (max_len, hidden_size) table once; it is fixed,
        # so it never needs gradients.
        pe = torch.zeros(max_len, hidden_size, requires_grad=False)
        pe[:, 0::2] = torch.sin(position * div_term)  # even channels
        pe[:, 1::2] = torch.cos(position * div_term)  # odd channels

        # Register with a leading batch dim as a buffer: saved with the module
        # and moved across devices, but never treated as a trainable parameter.
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x: Tensor) -> Tensor:
        """Add the first seq_len rows of the table to x: [batch, seq, hidden]."""
        return x + self.pe[:, : x.size(1)]
103
+
104
+
105
class TSTEncoder(nn.Module):
    """Time series encoder backbone of SymTime.

    Pipeline: linear patch embedding -> prepend a learnable [CLS] token ->
    add sinusoidal positional encoding -> dropout -> stack of
    ``TSTEncoderLayer`` blocks.

    Parameters
    ----------
    patch_size
        Length of each input patch (last dim of the input).
    num_layers
        Number of stacked encoder layers.
    hidden_size
        Model (embedding) dimension.
    num_heads
        Number of attention heads per layer.
    d_k, d_v
        Per-head key/value dimensions; when None each layer defaults them
        to hidden_size // num_heads.
    d_ff
        Hidden dimension of each layer's feed-forward network.
    norm
        Normalization style; any string containing "batch" selects BatchNorm,
        otherwise LayerNorm is used.
    attn_dropout
        Dropout applied to the attention weights.
    dropout
        Dropout applied to embeddings, residuals and projections.
    act
        Activation of the feed-forward network ("relu" or "gelu").
    store_attn
        Whether each layer keeps its last attention map in ``layer.attn``.
    pre_norm
        Whether to normalize before (True) or after (False) each sublayer.
    """

    def __init__(
        self,
        patch_size: int = 16,
        num_layers: int = 3,
        hidden_size: int = 128,
        num_heads: int = 16,
        d_k: Optional[int] = None,
        d_v: Optional[int] = None,
        d_ff: int = 256,
        norm: str = "BatchNorm",
        attn_dropout: float = 0.0,
        dropout: float = 0.0,
        act: str = "gelu",
        store_attn: bool = False,
        pre_norm: bool = False,
    ) -> None:
        super().__init__()
        # The Linear layer to project the input patches to the model dimension
        self.W_P = nn.Linear(patch_size, hidden_size)

        # Positional encoding
        self.pe = PositionalEmbedding(hidden_size=hidden_size)

        # Residual dropout
        self.dropout = nn.Dropout(dropout)

        # Create the [CLS] token
        self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size))
        # Frozen all-ones mask entry for the [CLS] position. NOTE(review): kept
        # as a Parameter (not a buffer), so it lives in the state_dict under
        # this key — changing it to register_buffer would break checkpoints.
        self.cls_mask = nn.Parameter(torch.ones(1, 1).bool(), requires_grad=False)

        # Create the encoder layer of the model backbone
        self.layers = nn.ModuleList(
            [
                TSTEncoderLayer(
                    hidden_size=hidden_size,
                    num_heads=num_heads,
                    d_k=d_k,
                    d_v=d_v,
                    d_ff=d_ff,
                    norm=norm,
                    attn_dropout=attn_dropout,
                    dropout=dropout,
                    activation=act,
                    pre_norm=pre_norm,
                    store_attn=store_attn,
                )
                for _ in range(num_layers)
            ]
        )

        # model params init
        self.apply(self._init_weights)

    def _init_weights(self, m: nn.Module) -> None:
        """model params init through apply methods: Xavier-uniform for Linear
        weights, zeros for biases, identity-style init for LayerNorm."""
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
        if isinstance(m, nn.Linear) and m.bias is not None:
            nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(
        self,
        x: Tensor,  # x: [batch_size, patch_num, patch_size]
        attn_mask: Optional[Tensor] = None,  # attn_mask: [batch, num_patch]
        return_cls_token: bool = True,  # whether to return the CLS token
    ) -> Tensor:
        """Encode a batch of patched series.

        Parameters
        ----------
        x
            Input patches, shape [batch_size, patch_num, patch_size].
        attn_mask
            Optional per-patch mask of shape [batch, num_patch]; one entry for
            the [CLS] token is prepended before it reaches the layers.
        return_cls_token
            If False, the [CLS] position is stripped from the output.

        Return
        ------
        Tensor
            [batch_size, patch_num + 1, hidden_size], or without the leading
            [CLS] position when return_cls_token is False.
        """
        batch_size = x.size(0)

        # Input patching embedding
        x = self.W_P(x)  # x: [batch_size, patch_num, model_dim]

        # Add the [CLS] token
        cls_token = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat([cls_token, x], dim=1)
        # adjust the attn mask
        # NOTE(review): cls_mask is all True, and the attention module fills
        # True positions of a boolean mask with -inf — that polarity would
        # mask the [CLS] position out. Confirm the mask convention expected
        # by callers before relying on boolean masks here.
        if attn_mask is not None:
            attn_mask = torch.cat(
                [self.cls_mask.expand(batch_size, -1), attn_mask], dim=1
            )

        # Add the positional embedding
        x = self.pe(x)
        x = self.dropout(x)  # x: [batch_size, patch_num, hidden_size]

        for mod in self.layers:
            x = mod(x, attn_mask=attn_mask)

        if not return_cls_token:
            # If not returning the CLS token, remove it from the output
            return x[:, 1:, :]

        return x
204
+
205
+
206
class TSTEncoderLayer(nn.Module):
    """Patch-based Transformer encoder sublayer.

    One standard Transformer block: multi-head self-attention with residual
    connection and normalization, followed by a position-wise feed-forward
    network with residual connection and normalization. Supports both
    pre-norm and post-norm ordering.

    Parameters
    ----------
    hidden_size
        Model dimension; must be divisible by num_heads.
    num_heads
        Number of attention heads.
    d_k, d_v
        Per-head key/value dimensions; default to hidden_size // num_heads.
    d_ff
        Hidden dimension of the feed-forward network.
    store_attn
        Whether to keep the last attention map in ``self.attn``.
    norm
        Any string containing "batch" selects BatchNorm; otherwise LayerNorm.
    attn_dropout
        Dropout on the attention weights.
    dropout
        Dropout on residual branches and projections.
    bias
        Whether the feed-forward Linear layers use a bias term.
    activation
        Feed-forward activation name ("relu" or "gelu").
    pre_norm
        Normalize before (True) or after (False) each sublayer.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        d_k: Optional[int] = None,
        d_v: Optional[int] = None,
        d_ff: int = 256,
        store_attn: bool = False,
        norm: str = "BatchNorm",
        attn_dropout: float = 0.0,
        dropout: float = 0.0,
        bias: bool = True,
        activation: str = "gelu",
        pre_norm: bool = False,
    ) -> None:
        super(TSTEncoderLayer, self).__init__()

        assert (
            not hidden_size % num_heads
        ), f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
        # If not specified, the number of heads is divided
        d_k = hidden_size // num_heads if d_k is None else d_k
        d_v = hidden_size // num_heads if d_v is None else d_v

        # Create the multi-head attention
        self.self_attn = MultiHeadAttention(
            hidden_size,
            num_heads,
            d_k,
            d_v,
            attn_dropout=attn_dropout,
            proj_dropout=dropout,
        )

        # Add & Norm
        self.dropout_attn = nn.Dropout(dropout)
        if "batch" in norm.lower():
            # BatchNorm1d normalizes over the channel dim, so the sequence is
            # transposed [batch, seq, hidden] -> [batch, hidden, seq] and back.
            self.norm_attn = nn.Sequential(
                Transpose(1, 2), nn.BatchNorm1d(hidden_size), Transpose(1, 2)
            )
        else:
            self.norm_attn = nn.LayerNorm(hidden_size)

        # Position-wise Feed-Forward
        self.ff = nn.Sequential(
            nn.Linear(hidden_size, d_ff, bias=bias),
            get_activation_fn(activation),
            nn.Dropout(dropout),
            nn.Linear(d_ff, hidden_size, bias=bias),
        )

        # Add & Norm
        self.dropout_ffn = nn.Dropout(dropout)
        if "batch" in norm.lower():
            self.norm_ffn = nn.Sequential(
                Transpose(1, 2), nn.BatchNorm1d(hidden_size), Transpose(1, 2)
            )
        else:
            self.norm_ffn = nn.LayerNorm(hidden_size)

        # use pre-norm or not
        self.pre_norm = pre_norm
        self.store_attn = store_attn
        # Last attention map, populated only when store_attn is True.
        self.attn = None

    def forward(
        self, src: Tensor, attn_mask: Optional[Tensor] = None
    ) -> Union[Tuple[Tensor, Tensor], Tensor]:
        """Apply attention and feed-forward sublayers to src
        ([batch, seq, hidden]); returns a tensor of the same shape."""

        # Whether to use pre-norm for attention layer
        if self.pre_norm:
            src = self.norm_attn(src)

        # Multi-Head attention (self-attention: q = k = v = src)
        src2, attn = self.self_attn(src, src, src, attn_mask=attn_mask)
        if self.store_attn:
            self.attn = attn

        # Add: residual connection with residual dropout
        src = src + self.dropout_attn(src2)
        if not self.pre_norm:
            src = self.norm_attn(src)

        # Whether to use pre-norm for ffn layer
        if self.pre_norm:
            src = self.norm_ffn(src)

        # Position-wise Feed-Forward
        src2 = self.ff(src)

        # Add: residual connection with residual dropout
        src = src + self.dropout_ffn(src2)
        if not self.pre_norm:
            src = self.norm_ffn(src)

        return src
306
+
307
+
308
class MultiHeadAttention(nn.Module):
    """Multi-head attention mechanism layer.

    Projects queries, keys and values into ``num_heads`` per-head subspaces,
    runs scaled dot-product attention in each head, then concatenates the
    head outputs and projects them back to ``hidden_size``.

    Input shape:
        Q: [batch_size (bs) x max_q_len x hidden_size]
        K, V: [batch_size (bs) x q_len x hidden_size]
        mask: [q_len x q_len]
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        d_k: int = None,
        d_v: int = None,
        attn_dropout: float = 0.0,
        proj_dropout: float = 0.0,
        qkv_bias: bool = True,
    ) -> None:
        super().__init__()
        # Default per-head dimensions: split the hidden size evenly.
        if d_k is None:
            d_k = hidden_size // num_heads
        if d_v is None:
            d_v = hidden_size // num_heads

        self.num_heads = num_heads
        self.d_k = d_k
        self.d_v = d_v

        # Query/key/value projections (all heads fused into one Linear each).
        self.W_Q = nn.Linear(hidden_size, d_k * num_heads, bias=qkv_bias)
        self.W_K = nn.Linear(hidden_size, d_k * num_heads, bias=qkv_bias)
        self.W_V = nn.Linear(hidden_size, d_v * num_heads, bias=qkv_bias)

        # Scaled Dot-Product Attention (multiple heads)
        self.sdp_attn = _ScaledDotProductAttention(
            hidden_size, num_heads, attn_dropout=attn_dropout
        )

        # Output projection back to the model dimension, with dropout.
        self.to_out = nn.Sequential(
            nn.Linear(num_heads * d_v, hidden_size), nn.Dropout(proj_dropout)
        )

    def forward(
        self,
        q: Tensor,
        k: Optional[Tensor] = None,
        v: Optional[Tensor] = None,
        attn_mask: Optional[Tensor] = None,
    ):
        """Return (output, attn_weights); omitted k/v default to q (self-attention)."""
        bs = q.size(0)
        k = q if k is None else k
        v = q if v is None else v

        # Project, split into heads, and move the head axis next to the batch.
        q_s = self.W_Q(q).view(bs, -1, self.num_heads, self.d_k).transpose(1, 2)
        # Keys are laid out pre-transposed ([bs, heads, d_k, len]) so the
        # attention score computation is a plain q @ k matmul downstream.
        k_s = self.W_K(k).view(bs, -1, self.num_heads, self.d_k).permute(0, 2, 3, 1)
        v_s = self.W_V(v).view(bs, -1, self.num_heads, self.d_v).transpose(1, 2)

        # Apply Scaled Dot-Product Attention (multiple heads)
        output, attn_weights = self.sdp_attn(q_s, k_s, v_s, attn_mask=attn_mask)

        # Merge the heads back: [bs, q_len, num_heads * d_v], then project out.
        merged = (
            output.transpose(1, 2).contiguous().view(bs, -1, self.num_heads * self.d_v)
        )
        return self.to_out(merged), attn_weights
375
+
376
+
377
class _ScaledDotProductAttention(nn.Module):
    r"""Scaled dot-product attention (Attention Is All You Need, Vaswani et
    al., 2017), with hooks for residual attention from a previous layer
    (Realformer, He et al., 2020) and locality self-attention (Vision
    Transformer for Small-Size Datasets, Lee et al., 2021)."""

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        attn_dropout: float = 0.0,
        res_attention: bool = False,
    ):
        super().__init__()
        self.attn_dropout = nn.Dropout(attn_dropout)
        self.res_attention = res_attention
        # 1/sqrt(head_dim) scaling, stored as a frozen Parameter so it
        # follows the module across devices/dtypes without being trained.
        head_dim = hidden_size // num_heads
        self.scale = nn.Parameter(torch.tensor(head_dim**-0.5), requires_grad=False)

    def forward(
        self, q: Tensor, k: Tensor, v: Tensor, attn_mask: Optional[Tensor] = None
    ) -> Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]:
        """
        :param q: [batch_size, num_heads, num_token, d_k]
        :param k: [batch_size, num_heads, d_k, num_token]
        :param v: [batch_size, num_heads, num_token, d_k]
        :param attn_mask: [batch_size, num_heads, num_token]
        """
        # Pairwise similarity scores, scaled by 1/sqrt(d_k). Note k arrives
        # pre-transposed, so a plain matmul suffices.
        attn_scores = torch.matmul(q, k) * self.scale

        if attn_mask is not None:
            # Lift the per-token mask to a pairwise [b, 1, i, j] mask via an
            # outer product (broadcasting equivalent of the einops rearrange).
            attn_mask = attn_mask[:, None, :, None] * attn_mask[:, None, None, :]
            if attn_mask.dtype == torch.bool:
                # NOTE(review): True positions receive -inf, i.e. True means
                # "masked out" here — confirm callers use that polarity.
                attn_scores.masked_fill_(attn_mask, float("-inf"))
            else:
                # Additive mask: added directly to the raw scores.
                attn_scores += attn_mask

        # Normalize to attention weights, then apply attention dropout.
        attn_weights = self.attn_dropout(F.softmax(attn_scores, dim=-1))

        # Weighted sum of the values.
        return torch.matmul(attn_weights, v), attn_weights
model.py CHANGED
@@ -1,142 +1,212 @@
1
- from typing import Tuple
2
-
3
- import torch
4
- import torch.nn as nn
5
- from torch import Tensor
6
- from torch.nn import functional as F
7
- from einops import rearrange, repeat
8
- from transformers.modeling_utils import PreTrainedModel
9
-
10
- from configuration_symtime import SymTimeConfig
11
- from layers import MultiHeadAttention, TSTEncoder, TSTEncoderLayer
12
-
13
-
14
- class SymTimeModel(PreTrainedModel):
15
- """
16
- SymTime Model for Huggingface.
17
-
18
- Parameters
19
- ----------
20
- config: SymTimeConfig
21
- The configuration of the SymTime model.
22
-
23
- Attributes
24
- ----------
25
- config: SymTimeConfig
26
- The configuration of the SymTime model.
27
- encoder: TSTEncoder
28
- The encoder of the SymTime model.
29
-
30
- Methods
31
- -------
32
- forward(x: Tensor) -> Tuple[Tensor, Tensor]:
33
- Forward pass of the SymTime model.
34
-
35
- _init_weights(module: nn.Module) -> None:
36
- Initialize weights for the SymTime encoder stack.
37
- """
38
-
39
- config_class = SymTimeConfig
40
-
41
- def __init__(self, config: SymTimeConfig):
42
- super().__init__(config)
43
- self.config = config
44
- self.encoder = TSTEncoder(
45
- patch_size=config.patch_size,
46
- num_layers=config.num_layers,
47
- hidden_size=config.d_model,
48
- num_heads=config.num_heads,
49
- d_ff=config.d_ff,
50
- norm=config.norm,
51
- attn_dropout=config.dropout,
52
- dropout=config.dropout,
53
- act=config.act,
54
- pre_norm=config.pre_norm,
55
- )
56
-
57
- # Initialize weights and apply final processing
58
- self.post_init()
59
-
60
- def _init_weights(self, module) -> None:
61
- """Initialize weights for the SymTime encoder stack.
62
-
63
- The model is built on top of Hugging Face `PreTrainedModel`, so this method
64
- is called recursively via `post_init()`. We keep the initialization aligned
65
- with the current backbone structure in `layers.py`:
66
-
67
- - `TSTEncoder.W_P`: patch projection linear layer
68
- - `TSTEncoder.cls_token`: learnable CLS token
69
- - `TSTEncoderLayer.self_attn`: Q/K/V and output projections
70
- - `TSTEncoderLayer.ff`: feed-forward linear layers
71
- - `LayerNorm` / `BatchNorm1d`: normalization layers
72
- """
73
- super()._init_weights(module)
74
-
75
- factor = self.config.initializer_factor
76
- d_model = self.config.d_model
77
- num_heads = self.config.num_heads
78
- d_k = d_model // num_heads
79
- d_v = d_k
80
-
81
- if isinstance(module, nn.Linear):
82
- nn.init.normal_(
83
- module.weight, mean=0.0, std=factor * (module.in_features**-0.5)
84
- )
85
- if module.bias is not None:
86
- nn.init.zeros_(module.bias)
87
-
88
- elif isinstance(module, nn.LayerNorm):
89
- nn.init.ones_(module.weight)
90
- nn.init.zeros_(module.bias)
91
-
92
- elif isinstance(module, nn.BatchNorm1d):
93
- if module.weight is not None:
94
- nn.init.ones_(module.weight)
95
- if module.bias is not None:
96
- nn.init.zeros_(module.bias)
97
-
98
- elif isinstance(module, TSTEncoder):
99
- if hasattr(module, "cls_token") and module.cls_token is not None:
100
- nn.init.normal_(module.cls_token, mean=0.0, std=factor)
101
- if hasattr(module, "W_P") and isinstance(module.W_P, nn.Linear):
102
- nn.init.normal_(
103
- module.W_P.weight,
104
- mean=0.0,
105
- std=factor * (module.W_P.in_features**-0.5),
106
- )
107
- if module.W_P.bias is not None:
108
- nn.init.zeros_(module.W_P.bias)
109
-
110
- elif isinstance(module, MultiHeadAttention):
111
- nn.init.normal_(module.W_Q.weight, mean=0.0, std=factor * (d_model**-0.5))
112
- nn.init.normal_(module.W_K.weight, mean=0.0, std=factor * (d_model**-0.5))
113
- nn.init.normal_(module.W_V.weight, mean=0.0, std=factor * (d_model**-0.5))
114
- if module.W_Q.bias is not None:
115
- nn.init.zeros_(module.W_Q.bias)
116
- if module.W_K.bias is not None:
117
- nn.init.zeros_(module.W_K.bias)
118
- if module.W_V.bias is not None:
119
- nn.init.zeros_(module.W_V.bias)
120
-
121
- out_proj = module.to_out[0]
122
- nn.init.normal_(
123
- out_proj.weight, mean=0.0, std=factor * ((num_heads * d_v) ** -0.5)
124
- )
125
- if out_proj.bias is not None:
126
- nn.init.zeros_(out_proj.bias)
127
-
128
- elif isinstance(module, TSTEncoderLayer):
129
- for submodule in module.ff:
130
- if isinstance(submodule, nn.Linear):
131
- nn.init.normal_(
132
- submodule.weight,
133
- mean=0.0,
134
- std=factor * (submodule.in_features**-0.5),
135
- )
136
- if submodule.bias is not None:
137
- nn.init.zeros_(submodule.bias)
138
-
139
- def forward(
140
- self, x: Tensor, return_cls_token: bool = True
141
- ) -> Tuple[Tensor, Tensor]:
142
- return self.encoder(x, return_cls_token=return_cls_token)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch import Tensor
6
+ from torch.nn import functional as F
7
+ from einops import rearrange, repeat
8
+ from transformers.modeling_utils import PreTrainedModel
9
+
10
+ from configuration_symtime import SymTimeConfig
11
+ from layers import MultiHeadAttention, TSTEncoder, TSTEncoderLayer
12
+
13
+
14
class SymTimeModel(PreTrainedModel):
    """
    SymTime Model for Huggingface.

    Parameters
    ----------
    config: SymTimeConfig
        The configuration of the SymTime model.

    Attributes
    ----------
    config: SymTimeConfig
        The configuration of the SymTime model.
    encoder: TSTEncoder
        The encoder of the SymTime model.

    Methods
    -------
    forward(x: Tensor) -> Tuple[Tensor, Tensor]:
        Forward pass of the SymTime model.

    patching(time_series: Tensor) -> Tensor:
        Split a raw time series into (possibly overlapping) patches.

    _init_weights(module: nn.Module) -> None:
        Initialize weights for the SymTime encoder stack.
    """

    config_class = SymTimeConfig

    def __init__(self, config: SymTimeConfig):
        super().__init__(config)
        self.config = config

        self.patch_size = config.patch_size
        # Configs saved before the `stride` field was introduced do not carry
        # it; fall back to non-overlapping patches (stride == patch_size).
        self.stride = getattr(config, "stride", config.patch_size)

        # Right-side replication padding used by `patching` when the sliding
        # window would otherwise drop the tail of the sequence.
        self.padding_patch_layer = nn.ReplicationPad1d((0, self.stride))
        self.encoder = TSTEncoder(
            patch_size=config.patch_size,
            num_layers=config.num_layers,
            hidden_size=config.d_model,
            num_heads=config.num_heads,
            d_ff=config.d_ff,
            norm=config.norm,
            attn_dropout=config.dropout,
            dropout=config.dropout,
            act=config.act,
            pre_norm=config.pre_norm,
        )

        # Initialize weights and apply final processing
        self.post_init()

    def _init_weights(self, module) -> None:
        """Initialize weights for the SymTime encoder stack.

        The model is built on top of Hugging Face `PreTrainedModel`, so this method
        is called recursively via `post_init()`. We keep the initialization aligned
        with the current backbone structure in `layers.py`:

        - `TSTEncoder.W_P`: patch projection linear layer
        - `TSTEncoder.cls_token`: learnable CLS token
        - `TSTEncoderLayer.self_attn`: Q/K/V and output projections
        - `TSTEncoderLayer.ff`: feed-forward linear layers
        - `LayerNorm` / `BatchNorm1d`: normalization layers
        """
        super()._init_weights(module)

        factor = self.config.initializer_factor
        d_model = self.config.d_model
        num_heads = self.config.num_heads
        d_k = d_model // num_heads
        d_v = d_k

        if isinstance(module, nn.Linear):
            # Scale the std by fan-in so deeper projections start smaller.
            nn.init.normal_(
                module.weight, mean=0.0, std=factor * (module.in_features**-0.5)
            )
            if module.bias is not None:
                nn.init.zeros_(module.bias)

        elif isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)

        elif isinstance(module, nn.BatchNorm1d):
            # Affine parameters are optional on BatchNorm1d.
            if module.weight is not None:
                nn.init.ones_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)

        elif isinstance(module, TSTEncoder):
            if hasattr(module, "cls_token") and module.cls_token is not None:
                nn.init.normal_(module.cls_token, mean=0.0, std=factor)
            if hasattr(module, "W_P") and isinstance(module.W_P, nn.Linear):
                nn.init.normal_(
                    module.W_P.weight,
                    mean=0.0,
                    std=factor * (module.W_P.in_features**-0.5),
                )
                if module.W_P.bias is not None:
                    nn.init.zeros_(module.W_P.bias)

        elif isinstance(module, MultiHeadAttention):
            nn.init.normal_(module.W_Q.weight, mean=0.0, std=factor * (d_model**-0.5))
            nn.init.normal_(module.W_K.weight, mean=0.0, std=factor * (d_model**-0.5))
            nn.init.normal_(module.W_V.weight, mean=0.0, std=factor * (d_model**-0.5))
            if module.W_Q.bias is not None:
                nn.init.zeros_(module.W_Q.bias)
            if module.W_K.bias is not None:
                nn.init.zeros_(module.W_K.bias)
            if module.W_V.bias is not None:
                nn.init.zeros_(module.W_V.bias)

            out_proj = module.to_out[0]
            nn.init.normal_(
                out_proj.weight, mean=0.0, std=factor * ((num_heads * d_v) ** -0.5)
            )
            if out_proj.bias is not None:
                nn.init.zeros_(out_proj.bias)

        elif isinstance(module, TSTEncoderLayer):
            for submodule in module.ff:
                if isinstance(submodule, nn.Linear):
                    nn.init.normal_(
                        submodule.weight,
                        mean=0.0,
                        std=factor * (submodule.in_features**-0.5),
                    )
                    if submodule.bias is not None:
                        nn.init.zeros_(submodule.bias)

    def patching(self, time_series: torch.Tensor) -> torch.Tensor:
        """Split a raw 1D time series into overlapping or non-overlapping patches.

        The encoder does not operate directly on the full sequence. Instead, it
        first converts the input into a sequence of local windows, where each
        window has length ``self.patch_size`` and consecutive windows are shifted
        by ``self.stride``. This patch-based representation reduces the temporal
        resolution while preserving local patterns that are useful for attention
        layers.

        If the sequence length is not compatible with the patch size and stride,
        we pad the sequence on the right using replication padding so that the
        final patch extraction remains well-defined and no trailing samples are
        silently dropped by ``unfold``.
        """

        # Only the temporal length matters for the padding decision.
        seq_length = time_series.size(-1)

        # `unfold` keeps only full windows, so the tail is lost whenever the
        # last window cannot land exactly on the end of the sequence. Pad with
        # replicated boundary values in that case. When patch_size == stride
        # (the default config) this reduces to `seq_length % patch_size != 0`,
        # i.e. the previous behaviour is preserved.
        if (seq_length - self.patch_size) % self.stride != 0:
            time_series = self.padding_patch_layer(time_series)

        # Slide a window of length patch_size with step stride along the last
        # dimension; the result is consumed by the transformer encoder.
        return time_series.unfold(dimension=-1, size=self.patch_size, step=self.stride)

    def forward(
        self, x: Tensor, return_cls_token: bool = True
    ) -> Tuple[Tensor, Tensor]:
        """Run the full SymTime inference pipeline.

        The forward pass expects a 2D tensor of shape ``[batch_size, seq_length]``
        containing a batch of univariate time series. The input is first converted
        into patch embeddings through :meth:`patching`, and the resulting patch
        sequence is then passed into the transformer encoder.

        Parameters
        ----------
        x : Tensor
            Batched input time series with shape ``[batch_size, seq_length]``.
        return_cls_token : bool, optional
            If ``True``, the encoder also returns the learned CLS token output
            alongside the patch-level representations. This is useful when the
            downstream task needs a global sequence summary.

        Returns
        -------
        Tuple[Tensor, Tensor]
            The encoded patch sequence and, optionally, the CLS token output.

        Raises
        ------
        ValueError
            If ``x`` is not a 2D tensor.
        """

        # Explicit validation instead of `assert`: asserts are stripped when
        # Python runs with -O, which would silently skip this check.
        if x.dim() != 2:
            raise ValueError(
                "Input time series must be a 2D tensor with shape of [batch_size, seq_length]."
            )

        # Convert the raw signal into a patch-based representation before encoding.
        time_series = self.patching(x)

        # Feed the patch sequence into the transformer encoder and return its output.
        return self.encoder(time_series, return_cls_token=return_cls_token)