from typing import Optional

import torch
from torch import nn, Tensor
import torch.nn.functional as F

from huggingface_hub import PyTorchModelHubMixin


class Transpose(nn.Module):
    """Module form of ``Tensor.transpose`` so it can sit inside ``nn.Sequential``.

    Args:
        *dims: Positional arguments forwarded unchanged to ``Tensor.transpose``.
        contiguous: When true, ``.contiguous()`` is called on the transposed
            tensor before it is returned.
    """

    def __init__(self, *dims, contiguous=False):
        super().__init__()
        self.dims = dims
        self.contiguous = contiguous

    def forward(self, x):
        """Return ``x`` with the configured dimensions swapped."""
        swapped = x.transpose(*self.dims)
        return swapped.contiguous() if self.contiguous else swapped

    def __repr__(self):
        cls_name = type(self).__name__
        dims_text = ", ".join(map(str, self.dims))
        # The two textual forms intentionally differ (``dims=`` prefix only in
        # the contiguous variant); they are kept exactly as they were.
        if self.contiguous:
            return f"{cls_name}(dims={dims_text}).contiguous()"
        return f"{cls_name}({dims_text})"


# Activation classes that can be selected by name.
pytorch_acts = [
    nn.ELU,
    nn.LeakyReLU,
    nn.PReLU,
    nn.ReLU,
    nn.ReLU6,
    nn.SELU,
    nn.CELU,
    nn.GELU,
    nn.Sigmoid,
    nn.Softplus,
    nn.Tanh,
    nn.Softmax,
]
# Lower-cased class names, index-aligned with ``pytorch_acts``.
pytorch_act_names = [act_cls.__name__.lower() for act_cls in pytorch_acts]


def get_act_fn(act, **act_kwargs):
    """Resolve ``act`` into an activation module.

    Args:
        act: ``None``, an ``nn.Module`` instance, a callable (for example an
            activation class), or a case-insensitive name listed in
            ``pytorch_act_names``.
        **act_kwargs: Keyword arguments used whenever ``act`` has to be
            called/instantiated.

    Returns:
        ``None`` when ``act`` is ``None``; ``act`` itself when it is already a
        module; otherwise the object produced by calling the resolved callable
        with ``act_kwargs``.

    Raises:
        ValueError: If ``act`` is a name that is not in ``pytorch_act_names``.
    """
    if act is None:
        return None
    if isinstance(act, nn.Module):
        return act
    if callable(act):
        return act(**act_kwargs)
    # Anything left is treated as a name and looked up case-insensitively.
    position = pytorch_act_names.index(act.lower())
    act_cls = pytorch_acts[position]
    return act_cls(**act_kwargs)


class RevIN(nn.Module):
    """Reversible instance normalisation along the last dimension.

    ``normalize`` records the offset and scale it removed on the module
    (``self.sub`` / ``self.std``); ``denormalize`` re-applies the most recently
    recorded values, so it must be called after ``normalize``.

    Args:
        c_in: Number of channels; sizes the affine parameters ``(1, c_in, 1)``.
        affine: Whether to create and apply a learnable ``weight`` and ``bias``.
        subtract_last: Subtract the last element along the final dimension
            instead of the mean along that dimension.
        dim: Stored on the module only; the statistics here are always taken
            over ``dim=-1``.
        eps: Constant added to the standard deviation.
    """

    def __init__(
        self,
        c_in: int,
        affine: bool = True,
        subtract_last: bool = False,
        dim: int = 2,
        eps: float = 1e-5,
    ):
        super().__init__()
        self.c_in = c_in
        self.affine = affine
        self.subtract_last = subtract_last
        self.dim = dim
        self.eps = eps
        if affine:
            self.weight = nn.Parameter(torch.ones(1, c_in, 1))
            self.bias = nn.Parameter(torch.zeros(1, c_in, 1))

    def forward(self, x: Tensor, mode: Tensor):
        """Normalise ``x`` when ``mode`` is truthy, otherwise denormalise it."""
        return self.normalize(x) if mode else self.denormalize(x)

    def normalize(self, x):
        """Remove offset and scale from ``x`` and remember them for later."""
        if self.subtract_last:
            offset = x[..., -1].unsqueeze(-1)
        else:
            offset = torch.mean(x, dim=-1, keepdim=True)
        # Statistics are detached so no gradient flows through them.
        self.sub = offset.detach()
        spread = torch.std(x, dim=-1, keepdim=True, unbiased=False)
        self.std = spread.detach() + self.eps

        out = (x - self.sub) / self.std
        if self.affine:
            out = out * self.weight + self.bias
        return out

    def denormalize(self, x):
        """Undo ``normalize`` using the statistics it stored last."""
        if self.affine:
            x = (x - self.bias) / self.weight
        return x * self.std + self.sub


class MovingAverage(nn.Module):
    """Stride-1 average pooling preceded by replication padding.

    The padding adds ``kernel_size - 1`` samples in total (the smaller half on
    the left), so the pooled output has the same length as the input.

    Args:
        kernel_size: Width of the averaging window.
    """

    def __init__(
        self,
        kernel_size: int,
    ):
        super().__init__()
        pad_total = kernel_size - 1
        left = pad_total // 2
        right = pad_total - left
        self.padding = nn.ReplicationPad1d((left, right))
        self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=1)

    def forward(self, x: Tensor):
        """Return the moving average of ``x`` along its last dimension."""
        padded = self.padding(x)
        return self.avg(padded)


class SeriesDecomposition(nn.Module):
    """Split a series into its moving average and what is left after removing it.

    Args:
        kernel_size: The size of the moving-average window.
    """

    def __init__(
        self,
        kernel_size: int,  # the size of the window
    ):
        super().__init__()
        self.moving_avg = MovingAverage(kernel_size)

    def forward(self, x: Tensor):
        """Return ``(residual, moving_mean)`` where ``residual = x - moving_mean``."""
        trend = self.moving_avg(x)
        return x - trend, trend


class _ScaledDotProductAttention(nn.Module):
    def __init__(self, d_model, n_heads, attn_dropout=0.0, res_attention=False):
        super().__init__()
        self.attn_dropout = nn.Dropout(attn_dropout)
        self.res_attention = res_attention
        head_dim = d_model // n_heads
        self.scale = nn.Parameter(torch.tensor(head_dim**-0.5), requires_grad=False)

    def forward(self, q: Tensor, k: Tensor, v: Tensor, prev: Optional[Tensor] = None):
        attn_scores = torch.matmul(q, k) * self.scale

        if prev is not None:
            attn_scores = attn_scores + prev

        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_weights = self.attn_dropout(attn_weights)

        output = torch.matmul(attn_weights, v)

        if self.res_attention:
            return output, attn_weights, attn_scores
        else:
            return output, attn_weights


class _MultiheadAttention(nn.Module):
    def __init__(
        self,
        d_model,
        n_heads,
        d_k=None,
        d_v=None,
        res_attention=False,
        attn_dropout=0.0,
        proj_dropout=0.0,
        qkv_bias=True,
    ):
        """Multi Head Attention Layer

        Args:
            d_model: Width of the input and of the projected output.
            n_heads: Number of attention heads.
            d_k: Per-head width of the query/key projections. Falls back to
                ``d_model // n_heads`` when ``None``.
            d_v: Per-head width of the value projection. Falls back to
                ``d_model // n_heads`` when ``None``.
            res_attention: When true, ``prev`` scores are forwarded to the
                attention module and the new scores are returned as well.
            attn_dropout: Dropout probability used inside the attention module.
            proj_dropout: Dropout probability applied after the output
                projection.
            qkv_bias: Whether the Q/K/V linear projections carry a bias.
        """

        super().__init__()
        # Explicit d_k / d_v are honoured; only a missing value falls back to
        # the default head width. (They used to be overwritten unconditionally,
        # which made both arguments dead.) Note that the attention module's
        # scaling factor is still derived from d_model // n_heads.
        head_dim = d_model // n_heads
        d_k = head_dim if d_k is None else d_k
        d_v = head_dim if d_v is None else d_v

        self.n_heads, self.d_k, self.d_v = n_heads, d_k, d_v

        self.W_Q = nn.Linear(d_model, d_k * n_heads, bias=qkv_bias)
        self.W_K = nn.Linear(d_model, d_k * n_heads, bias=qkv_bias)
        self.W_V = nn.Linear(d_model, d_v * n_heads, bias=qkv_bias)

        # Scaled Dot-Product Attention (multiple heads)
        self.res_attention = res_attention
        self.sdp_attn = _ScaledDotProductAttention(
            d_model,
            n_heads,
            attn_dropout=attn_dropout,
            res_attention=self.res_attention,
        )

        # Project output
        self.to_out = nn.Sequential(
            nn.Linear(n_heads * d_v, d_model), nn.Dropout(proj_dropout)
        )

    def forward(
        self,
        Q: Tensor,
        K: Optional[Tensor] = None,
        V: Optional[Tensor] = None,
        prev: Optional[Tensor] = None,
    ):
        """Run multi-head attention.

        Args:
            Q: Query input; its first dimension is taken as the batch size.
            K: Key input; ``Q`` is used when ``None``.
            V: Value input; ``Q`` is used when ``None``.
            prev: Scores from a previous layer; only forwarded to the attention
                module when ``res_attention`` is enabled.

        Returns:
            ``(output, attn_weights)`` or, with ``res_attention``,
            ``(output, attn_weights, attn_scores)``.
        """
        bs = Q.size(0)
        if K is None:
            K = Q
        if V is None:
            V = Q

        # Linear (+ split in multiple heads)
        q_s = (
            self.W_Q(Q).view(bs, -1, self.n_heads, self.d_k).transpose(1, 2)
        )  # q_s: [bs x n_heads x max_q_len x d_k]
        # Keys are laid out pre-transposed so that q_s @ k_s yields the scores.
        k_s = (
            self.W_K(K).view(bs, -1, self.n_heads, self.d_k).permute(0, 2, 3, 1)
        )  # k_s: [bs x n_heads x d_k x q_len] - transpose(1,2) + transpose(2,3)
        v_s = (
            self.W_V(V).view(bs, -1, self.n_heads, self.d_v).transpose(1, 2)
        )  # v_s: [bs x n_heads x q_len x d_v]

        # Apply Scaled Dot-Product Attention (multiple heads)
        if self.res_attention:
            output, attn_weights, attn_scores = self.sdp_attn(q_s, k_s, v_s, prev=prev)
        else:
            output, attn_weights = self.sdp_attn(q_s, k_s, v_s)
        # output: [bs x n_heads x q_len x d_v], attn: [bs x n_heads x q_len x q_len], scores: [bs x n_heads x max_q_len x q_len]

        # back to the original inputs dimensions
        output = (
            output.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * self.d_v)
        )  # output: [bs x q_len x n_heads * d_v]
        output = self.to_out(output)

        if self.res_attention:
            return output, attn_weights, attn_scores
        else:
            return output, attn_weights


class Flatten_Head(nn.Module):
    """Flatten the last two dimensions and project them linearly to ``pred_dim``.

    Args:
        individual: When truthy, one projection per variable is created and
            applied to that variable's slice; otherwise a single projection is
            shared by all variables.
        n_vars: Number of variables (only relevant when ``individual``).
        nf: Input width of the linear layer, i.e. the flattened size.
        pred_dim: Output width; for a tuple/list its last element is used.
    """

    def __init__(self, individual, n_vars, nf, pred_dim):
        super().__init__()

        if isinstance(pred_dim, (tuple, list)):
            pred_dim = pred_dim[-1]
        self.individual = individual
        self.n = n_vars if individual else 1
        self.nf = nf
        self.pred_dim = pred_dim

        def build_projection():
            return nn.Sequential(nn.Flatten(start_dim=-2), nn.Linear(nf, pred_dim))

        if individual:
            self.layers = nn.ModuleList([build_projection() for _ in range(self.n)])
        else:
            self.layer = build_projection()

    def forward(self, x: Tensor):
        """
        Args:
            x: [bs x nvars x d_model x n_patch]
            output: [bs x nvars x pred_dim]
        """
        if not self.individual:
            return self.layer(x)
        # One projection per variable, re-assembled along the variable axis.
        per_variable = [proj(x[:, idx]) for idx, proj in enumerate(self.layers)]
        return torch.stack(per_variable, dim=1)


class _TSTiEncoderLayer(nn.Module):
    """One encoder layer: a self-attention sublayer followed by a feed-forward one.

    Each sublayer output is passed through dropout, added to its input and
    normalised. With ``pre_norm`` the normalisation runs before the sublayer,
    otherwise after the residual add.

    Args:
        q_len: Accepted for interface compatibility; not used in this class.
        d_model: Feature width; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        d_k, d_v: Per-head widths; default to ``d_model // n_heads``.
        d_ff: Hidden width of the feed-forward sublayer.
        store_attn: Keep the last attention weights on ``self.attn``.
        norm: Any string containing "batch" (case-insensitive) selects
            ``BatchNorm1d``; everything else selects ``LayerNorm``.
        attn_dropout: Dropout probability inside the attention.
        dropout: Dropout probability used everywhere else in the layer.
        bias: Whether the feed-forward linear layers have a bias.
        activation: Activation spec resolved through ``get_act_fn``.
        res_attention: Pass attention scores from layer to layer.
        pre_norm: Normalise before instead of after each sublayer.
    """

    def __init__(
        self,
        q_len,
        d_model,
        n_heads,
        d_k=None,
        d_v=None,
        d_ff=256,
        store_attn=False,
        norm="BatchNorm",
        attn_dropout=0,
        dropout=0.0,
        bias=True,
        activation="gelu",
        res_attention=False,
        pre_norm=False,
    ):
        super().__init__()
        assert (
            d_model % n_heads == 0
        ), f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
        head_dim = d_model // n_heads
        if d_k is None:
            d_k = head_dim
        if d_v is None:
            d_v = head_dim

        # Multi-Head attention
        self.res_attention = res_attention
        self.self_attn = _MultiheadAttention(
            d_model,
            n_heads,
            d_k,
            d_v,
            attn_dropout=attn_dropout,
            proj_dropout=dropout,
            res_attention=res_attention,
        )

        use_batch_norm = "batch" in norm.lower()

        def build_norm():
            if use_batch_norm:
                # BatchNorm1d normalises dim 1, so the feature dim is moved
                # there and back again.
                return nn.Sequential(
                    Transpose(1, 2), nn.BatchNorm1d(d_model), Transpose(1, 2)
                )
            return nn.LayerNorm(d_model)

        # Add & Norm (attention sublayer)
        self.dropout_attn = nn.Dropout(dropout)
        self.norm_attn = build_norm()

        # Position-wise Feed-Forward
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff, bias=bias),
            get_act_fn(activation),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model, bias=bias),
        )

        # Add & Norm (feed-forward sublayer)
        self.dropout_ffn = nn.Dropout(dropout)
        self.norm_ffn = build_norm()

        self.pre_norm = pre_norm
        self.store_attn = store_attn

    def forward(self, src: Tensor, prev: Optional[Tensor] = None):
        """
        Args:
            src: [bs x q_len x d_model]
            prev: attention scores of the previous layer; only used when
                ``res_attention`` is enabled.

        Returns:
            ``src`` after both sublayers, plus the attention scores when
            ``res_attention`` is enabled.
        """

        # ---- Multi-Head attention sublayer ----
        if self.pre_norm:
            src = self.norm_attn(src)
        if self.res_attention:
            attn_out, attn, scores = self.self_attn(src, src, src, prev)
        else:
            attn_out, attn = self.self_attn(src, src, src)
        if self.store_attn:
            self.attn = attn
        # Residual connection with residual dropout.
        src = src + self.dropout_attn(attn_out)

        # Post-norm finishes the attention sublayer here; pre-norm instead
        # normalises the input of the feed-forward sublayer.
        if self.pre_norm:
            src = self.norm_ffn(src)
        else:
            src = self.norm_attn(src)

        # ---- Feed-forward sublayer ----
        ff_out = self.ff(src)
        # Residual connection with residual dropout.
        src = src + self.dropout_ffn(ff_out)
        if not self.pre_norm:
            src = self.norm_ffn(src)

        if self.res_attention:
            return src, scores
        return src


class _TSTiEncoder(nn.Module):  # i means channel-independent
    """Patch embedding, learned positional encoding and a stack of encoder layers.

    Every variable is processed independently: the batch and variable
    dimensions are merged before the encoder layers and split again afterwards.

    Args:
        c_in: Accepted for interface compatibility; not used in this class.
        patch_num: Number of patches per series (the encoder sequence length).
        patch_len: Length of one patch (input width of the embedding).
        n_layers: Number of stacked ``_TSTiEncoderLayer`` modules.
        d_model, n_heads, d_k, d_v, d_ff, norm, attn_dropout, dropout,
        store_attn, res_attention, pre_norm: Forwarded to every encoder layer
            (``dropout`` is also used for the embedding dropout).
        act: Forwarded to the layers as ``activation``.
    """

    def __init__(
        self,
        c_in,
        patch_num,
        patch_len,
        n_layers=3,
        d_model=128,
        n_heads=16,
        d_k=None,
        d_v=None,
        d_ff=256,
        norm="BatchNorm",
        attn_dropout=0.0,
        dropout=0.0,
        act="gelu",
        store_attn=False,
        res_attention=True,
        pre_norm=False,
    ):

        super().__init__()

        self.patch_num = patch_num
        self.patch_len = patch_len

        # Input encoding
        # Eq 1: projection of feature vectors onto a d-dim vector space
        self.W_P = nn.Linear(patch_len, d_model)
        q_len = patch_num
        self.seq_len = q_len

        # Positional encoding: learned, initialised uniformly in [-0.02, 0.02]
        self.W_pos = nn.Parameter(
            nn.init.uniform_(torch.empty((q_len, d_model)), -0.02, 0.02)
        )

        # Residual dropout
        self.dropout = nn.Dropout(dropout)

        # Encoder
        layer_kwargs = dict(
            n_heads=n_heads,
            d_k=d_k,
            d_v=d_v,
            d_ff=d_ff,
            norm=norm,
            attn_dropout=attn_dropout,
            dropout=dropout,
            activation=act,
            res_attention=res_attention,
            pre_norm=pre_norm,
            store_attn=store_attn,
        )
        self.layers = nn.ModuleList(
            [_TSTiEncoderLayer(q_len, d_model, **layer_kwargs) for _ in range(n_layers)]
        )
        self.res_attention = res_attention

    def forward(self, x: Tensor):
        """
        Args:
            x: [bs x nvars x patch_len x patch_num]

        Returns:
            Tensor laid out as [bs x nvars x d_model x patch_num].
        """

        n_vars = x.shape[1]

        # Input encoding
        tokens = x.permute(0, 1, 3, 2)  # [bs x nvars x patch_num x patch_len]
        tokens = self.W_P(tokens)  # [bs x nvars x patch_num x d_model]

        # Merge batch and variable dims so each variable is encoded on its own.
        merged_shape = (
            tokens.shape[0] * tokens.shape[1],
            tokens.shape[2],
            tokens.shape[3],
        )
        tokens = torch.reshape(tokens, merged_shape)  # [bs * nvars x patch_num x d_model]
        tokens = self.dropout(tokens + self.W_pos)

        # Encoder
        if self.res_attention:
            # Scores produced by one layer are handed to the next one.
            scores = None
            for layer in self.layers:
                tokens, scores = layer(tokens, prev=scores)
        else:
            for layer in self.layers:
                tokens = layer(tokens)

        # Split the merged leading dim back into batch and variables.
        split_shape = (-1, n_vars, tokens.shape[-2], tokens.shape[-1])
        tokens = torch.reshape(tokens, split_shape)  # [bs x nvars x patch_num x d_model]
        return tokens.permute(0, 1, 3, 2)  # [bs x nvars x d_model x patch_num]


class _PatchTST_backbone(nn.Module):
    """RevIN -> patching -> channel-independent encoder -> flatten head.

    Args:
        c_in: Number of input variables.
        seq_len: Length of the input series.
        pred_dim: Output width of the head.
        patch_len: Length of one patch.
        stride: Step between the starts of consecutive patches.
        n_layers, d_model, n_heads, d_k, d_v, d_ff, norm, attn_dropout,
        dropout, act, res_attention, pre_norm, store_attn: Forwarded to
            ``_TSTiEncoder``.
        padding_patch: When truthy, the series is left-padded by ``stride``
            replicated samples before patching, which yields one extra patch.
        individual: Forwarded to ``Flatten_Head``.
        revin: Apply ``RevIN`` normalisation on the way in and denormalisation
            on the way out.
        affine, subtract_last: Forwarded to ``RevIN``.
    """

    def __init__(
        self,
        c_in,
        seq_len,
        pred_dim,
        patch_len,
        stride,
        n_layers=3,
        d_model=128,
        n_heads=16,
        d_k=None,
        d_v=None,
        d_ff=256,
        norm="BatchNorm",
        attn_dropout=0.0,
        dropout=0.0,
        act="gelu",
        res_attention=True,
        pre_norm=False,
        store_attn=False,
        padding_patch=True,
        individual=False,
        revin=True,
        affine=True,
        subtract_last=False,
    ):

        super().__init__()

        # The RevIN layer is always constructed (so the module layout does not
        # depend on the flag); `revin` only decides whether it is applied.
        self.revin = revin
        self.revin_layer = RevIN(c_in, affine=affine, subtract_last=subtract_last)

        # Patching
        self.patch_len = patch_len
        self.stride = stride
        self.padding_patch = padding_patch
        patch_num = int((seq_len - patch_len) / stride + 1)
        if padding_patch:
            # Padding by `stride` samples adds exactly one more patch.
            patch_num += 1
        self.patch_num = patch_num
        self.padding_patch_layer = nn.ReplicationPad1d((stride, 0))

        self.unfold = nn.Unfold(kernel_size=(1, patch_len), stride=stride)

        # Backbone. `norm` is forwarded so the requested normalisation type is
        # actually used (it was previously dropped, leaving the encoder on its
        # own default).
        self.backbone = _TSTiEncoder(
            c_in,
            patch_num=patch_num,
            patch_len=patch_len,
            n_layers=n_layers,
            d_model=d_model,
            n_heads=n_heads,
            d_k=d_k,
            d_v=d_v,
            d_ff=d_ff,
            norm=norm,
            attn_dropout=attn_dropout,
            dropout=dropout,
            act=act,
            res_attention=res_attention,
            pre_norm=pre_norm,
            store_attn=store_attn,
        )

        # Head
        self.head_nf = d_model * patch_num
        self.n_vars = c_in
        self.individual = individual
        self.head = Flatten_Head(self.individual, self.n_vars, self.head_nf, pred_dim)

    def forward(self, z: Tensor):
        """
        Args:
            z: [bs x c_in x seq_len]

        Returns:
            The head output, denormalised when ``revin`` is enabled.
        """

        if self.revin:
            z = self.revin_layer(z, torch.tensor(True, dtype=torch.bool))

        # Patching: optional left padding, then sliding windows of `patch_len`
        # taken every `stride` samples.
        if self.padding_patch:
            z = self.padding_patch_layer(z)
        b, c, s = z.size()
        # nn.Unfold works on 4-D input, so every series becomes a 1 x s "image".
        z = z.reshape(-1, 1, 1, s)
        z = self.unfold(z)
        # -> [bs x c_in x patch_len x patch_num], the layout the encoder expects
        z = z.permute(0, 2, 1).reshape(b, c, -1, self.patch_len).permute(0, 1, 3, 2)

        z = self.backbone(z)
        z = self.head(z)

        if self.revin:
            z = self.revin_layer(z, torch.tensor(False, dtype=torch.bool))
        return z


class PatchTST(nn.Module, PyTorchModelHubMixin):
    """PatchTST model, optionally applied separately to trend and residual.

    With ``decomposition`` the input is split by ``SeriesDecomposition`` and two
    identically configured backbones process the residual and the trend; their
    outputs are summed. Otherwise a single backbone processes the input.

    Args:
        c_in: Number of input variables.
        c_out: Accepted for interface compatibility; not used in this class.
        seq_len: Length of the input series.
        pred_dim: Output width of the head; defaults to ``seq_len``.
        kernel_size: Moving-average window used when ``decomposition`` is set.
        activation: Forwarded to the backbone as ``act``.
        classification: When true, ``squeeze(-2)`` is applied to the output.
        n_layers, n_heads, d_model, d_ff, dropout, attn_dropout, patch_len,
        stride, padding_patch, revin, affine, individual, subtract_last, norm,
        pre_norm, res_attention, store_attn: Forwarded to ``_PatchTST_backbone``.
    """

    def __init__(
        self,
        c_in,
        c_out,
        seq_len,
        pred_dim=None,
        n_layers=2,
        n_heads=8,
        d_model=512,
        d_ff=2048,
        dropout=0.05,
        attn_dropout=0.0,
        patch_len=16,
        stride=8,
        padding_patch=True,
        revin=True,
        affine=False,
        individual=False,
        subtract_last=False,
        decomposition=False,
        kernel_size=25,
        activation="gelu",
        norm="BatchNorm",
        pre_norm=False,
        res_attention=True,
        store_attn=False,
        classification=False,
    ):

        super().__init__()

        if pred_dim is None:
            pred_dim = seq_len

        # Every backbone in this model is built from the same configuration.
        backbone_kwargs = dict(
            c_in=c_in,
            seq_len=seq_len,
            pred_dim=pred_dim,
            patch_len=patch_len,
            stride=stride,
            n_layers=n_layers,
            d_model=d_model,
            n_heads=n_heads,
            d_ff=d_ff,
            norm=norm,
            attn_dropout=attn_dropout,
            dropout=dropout,
            act=activation,
            res_attention=res_attention,
            pre_norm=pre_norm,
            store_attn=store_attn,
            padding_patch=padding_patch,
            individual=individual,
            revin=revin,
            affine=affine,
            subtract_last=subtract_last,
        )

        self.decomposition = decomposition
        if decomposition:
            self.decomp_module = SeriesDecomposition(kernel_size)
            self.model_trend = _PatchTST_backbone(**backbone_kwargs)
            self.model_res = _PatchTST_backbone(**backbone_kwargs)
            self.patch_num = self.model_trend.patch_num
        else:
            self.model = _PatchTST_backbone(**backbone_kwargs)
            self.patch_num = self.model.patch_num
        self.classification = classification

    def forward(self, x):
        """Run the model on ``x`` and return the backbone output(s)."""
        if self.decomposition:
            residual_in, trend_in = self.decomp_module(x)
            residual_out = self.model_res(residual_in)
            trend_out = self.model_trend(trend_in)
            out = residual_out + trend_out
        else:
            out = self.model(x)

        if self.classification:
            out = out.squeeze(-2)
        return out