sqfoo committed on
Commit
be9a67e
·
1 Parent(s): 805021f

Add Application file

Browse files
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 sqfoo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import gradio as gr
4
+
5
+ from stldm import InferenceHub
6
+ from stldm.config import STLDM_HKO
7
+ from data.dutils import resize
8
+ from utilspp import gradio_visualize, gradio_gif
9
+
10
+ def nowcasting(file, cfg_str, ensemble_no):
11
+ # Model Setup
12
+ Forecastor = InferenceHub(
13
+ model_config=STLDM_HKO,
14
+ cfg_str=cfg_str,
15
+ model_type='HF'
16
+ )
17
+
18
+ # Data Preparation
19
+ x = torch.tensor(np.load(file.name))
20
+ if x.ndim not in (5, 4):
21
+ raise ValueError("Please specify the input has the format of (T C H W)")
22
+
23
+ if x.max() > 1:
24
+ x = x / 255.0
25
+ x = x.clamp(0, 1)
26
+ if x.ndim == 4:
27
+ x = x.unsqueeze(0)
28
+ x = resize(x, 128) # resize the data to 128 x 128
29
+
30
+ if x.shape[1] < 5:
31
+ raise ValueError("The input should have at least 5 frames for STLDM to predict")
32
+ x = x[0, -5:]
33
+
34
+ out = {}
35
+ for i in range(ensemble_no):
36
+ y_pred = Forecastor(input_x=x, include_mu=False)
37
+ out[f'Ensemble {i+1}'] = torch.cat((x, y_pred), dim=0)
38
+
39
+ figure = gradio_gif(out, len(out['Ensemble 1']))
40
+
41
+ return figure
42
+
43
+ with gr.Blocks() as demo:
44
+ gr.Markdown("# STLDM official demo for nowcasting")
45
+ gr.Markdown("Please upload the radar sequences with **at least 5 frames** in the format of .npy file, and **STLDM** will predict the future 20 frames based on the past 5 frames.")
46
+ gr.Markdown('Please refer to [paper](https://arxiv.org/abs/2512.21118) and [code](https://github.com/sqfoo/stldm_official) for more details about STLDM.')
47
+
48
+ file_input = gr.File(label="Upload the input radar squences", file_types=[".npy"])
49
+ cfg_str = gr.Slider(0.0, 2.0, value=1.0, step=0.1, label="Classifier Free Guidance Scale")
50
+ ensemble_no = gr.Slider(1, 10, value=2, step=1, label="How many ensemble predictions?")
51
+
52
+ output = gr.Image(label="Nowcasting Results")
53
+ btn = gr.Button("Forecast Now!")
54
+ btn.click(fn=nowcasting, inputs=[file_input, cfg_str, ensemble_no], outputs=output)
55
+
56
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.0.0
2
+ antlr4-python3-runtime==4.9.3
3
+ anyio==4.12.0
4
+ argon2-cffi==25.1.0
5
+ argon2-cffi-bindings==25.1.0
6
+ arrow==1.4.0
7
+ asttokens==3.0.1
8
+ async-lru==2.0.5
9
+ attrs==25.4.0
10
+ babel==2.17.0
11
+ beautifulsoup4==4.14.3
12
+ bleach==6.2.0
13
+ cachetools==5.3.2
14
+ certifi==2023.11.17
15
+ cffi==2.0.0
16
+ charset-normalizer==3.3.2
17
+ comm==0.2.3
18
+ contourpy==1.3.0
19
+ cycler==0.12.1
20
+ debugpy==1.8.17
21
+ decorator==5.2.1
22
+ defusedxml==0.7.1
23
+ einops==0.8.1
24
+ exceptiongroup==1.3.1
25
+ executing==2.2.1
26
+ fastjsonschema==2.21.2
27
+ fonttools==4.45.0
28
+ fqdn==1.5.1
29
+ google-auth==2.23.4
30
+ google-auth-oauthlib==0.4.6
31
+ grpcio==1.59.3
32
+ h11==0.16.0
33
+ h5py==3.7.0
34
+ httpcore==1.0.9
35
+ httpx==0.28.1
36
+ idna==3.4
37
+ imageio==2.33.0
38
+ importlib-metadata==6.8.0
39
+ importlib_resources==6.5.2
40
+ ipykernel==6.31.0
41
+ ipython==8.18.1
42
+ ipywidgets==8.1.8
43
+ isoduration==20.11.0
44
+ jedi==0.19.2
45
+ Jinja2==3.1.6
46
+ joblib==1.3.2
47
+ json5==0.12.1
48
+ jsonpointer==3.0.0
49
+ jsonschema==4.25.1
50
+ jsonschema-specifications==2025.9.1
51
+ jupyter==1.1.1
52
+ jupyter-console==6.6.3
53
+ jupyter-events==0.12.0
54
+ jupyter-lsp==2.3.0
55
+ jupyter_client==8.6.3
56
+ jupyter_core==5.8.1
57
+ jupyter_server==2.17.0
58
+ jupyter_server_terminals==0.5.3
59
+ jupyterlab==4.5.0
60
+ jupyterlab_pygments==0.3.0
61
+ jupyterlab_server==2.28.0
62
+ jupyterlab_widgets==3.0.16
63
+ kiwisolver==1.4.5
64
+ lark==1.3.1
65
+ lpips==0.1.4
66
+ Markdown==3.5.1
67
+ MarkupSafe==2.1.3
68
+ matplotlib==3.9.4
69
+ matplotlib-inline==0.2.1
70
+ mistune==3.1.4
71
+ nbclient==0.10.2
72
+ nbconvert==7.16.6
73
+ nbformat==5.10.4
74
+ nest-asyncio==1.6.0
75
+ networkx==3.2.1
76
+ notebook==7.5.0
77
+ notebook_shim==0.2.4
78
+ numpy==1.24.4
79
+ oauthlib==3.2.2
80
+ omegaconf==2.3.0
81
+ opencv-python==4.8.0.74
82
+ overrides==7.7.0
83
+ packaging==23.2
84
+ pandas==1.4.3
85
+ pandocfilters==1.5.1
86
+ parso==0.8.5
87
+ pexpect==4.9.0
88
+ Pillow==10.1.0
89
+ platformdirs==4.4.0
90
+ prometheus_client==0.23.1
91
+ prompt_toolkit==3.0.52
92
+ protobuf==3.19.6
93
+ psutil==7.1.3
94
+ ptyprocess==0.7.0
95
+ pure_eval==0.2.3
96
+ pyasn1==0.5.1
97
+ pyasn1-modules==0.3.0
98
+ pycparser==2.23
99
+ Pygments==2.19.2
100
+ pyparsing==3.1.1
101
+ python-dateutil==2.8.2
102
+ python-json-logger==4.0.0
103
+ pytz==2023.3.post1
104
+ PyWavelets==1.5.0
105
+ PyYAML==6.0
106
+ pyzmq==27.1.0
107
+ referencing==0.36.2
108
+ requests==2.31.0
109
+ requests-oauthlib==1.3.1
110
+ rfc3339-validator==0.1.4
111
+ rfc3986-validator==0.1.1
112
+ rfc3987-syntax==1.1.0
113
+ rpds-py==0.27.1
114
+ rsa==4.9
115
+ SciencePlots==2.2.0
116
+ scikit-image==0.19.3
117
+ scikit-learn==1.1.2
118
+ scipy==1.9.1
119
+ Send2Trash==1.8.3
120
+ six==1.16.0
121
+ soupsieve==2.8
122
+ stack-data==0.6.3
123
+ tensorboard==2.9.0
124
+ tensorboard-data-server==0.6.1
125
+ tensorboard-plugin-wit==1.8.1
126
+ terminado==0.18.1
127
+ threadpoolctl==3.2.0
128
+ tifffile==2023.9.26
129
+ tinycss2==1.4.0
130
+ tomli==2.3.0
131
+ torch==1.12.1+cu116
132
+ torchmetrics==0.11.0
133
+ torchvision==0.13.1+cu116
134
+ tornado==6.5.2
135
+ tqdm==4.66.1
136
+ traitlets==5.14.3
137
+ typing_extensions==4.8.0
138
+ tzdata==2025.2
139
+ uri-template==1.3.0
140
+ urllib3==2.1.0
141
+ wcwidth==0.2.14
142
+ webcolors==24.11.1
143
+ webencodings==0.5.1
144
+ websocket-client==1.9.0
145
+ Werkzeug==3.0.1
146
+ widgetsnbextension==4.0.15
147
+ zipp==3.17.0
stldm/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from stldm.stldm import model_setup
2
+ from stldm.stldm_spatial import model_setup as spatial_setup
3
+ from stldm.inference import InferenceHub
4
+
5
+ n2n_setup = {'2D': spatial_setup, '3D': model_setup}
stldm/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (343 Bytes). View file
 
stldm/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (1.15 kB). View file
 
stldm/__pycache__/config.cpython-38.pyc ADDED
Binary file (1.08 kB). View file
 
stldm/__pycache__/inference.cpython-38.pyc ADDED
Binary file (3.26 kB). View file
 
stldm/__pycache__/modules.cpython-38.pyc ADDED
Binary file (5.45 kB). View file
 
stldm/__pycache__/modules.cpython-39.pyc ADDED
Binary file (5.4 kB). View file
 
stldm/__pycache__/simvpv2.cpython-38.pyc ADDED
Binary file (15.4 kB). View file
 
stldm/__pycache__/simvpv2.cpython-39.pyc ADDED
Binary file (15.2 kB). View file
 
stldm/__pycache__/stldm.cpython-38.pyc ADDED
Binary file (18.4 kB). View file
 
stldm/__pycache__/stldm.cpython-39.pyc ADDED
Binary file (18.4 kB). View file
 
stldm/__pycache__/stldm_hf.cpython-38.pyc ADDED
Binary file (18.6 kB). View file
 
stldm/__pycache__/stldm_spatial.cpython-38.pyc ADDED
Binary file (18.2 kB). View file
 
stldm/__pycache__/stldm_spatial.cpython-39.pyc ADDED
Binary file (18.2 kB). View file
 
stldm/__pycache__/submodules.cpython-38.pyc ADDED
Binary file (15.4 kB). View file
 
stldm/__pycache__/submodules.cpython-39.pyc ADDED
Binary file (15.4 kB). View file
 
stldm/config.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ STLDM_SEVIR = {
2
+ 'model': "stldm",
3
+ 'pre': None,
4
+ 'post': None,
5
+ 'vp_param': {
6
+ 'shape_in': (13, 1, 128, 128),
7
+ 'shape_out': (12, 1, 128, 128),
8
+ 'hid_S': 32,
9
+ 'hid_T': 512,
10
+ 'N_S': 4,
11
+ 'N_T': 8,
12
+ 'groups': 8,
13
+ 'last_activation': 'sigmoid',
14
+ },
15
+ 'stldm_param': {
16
+ 'in_ch': 32,
17
+ 'chs_mult': [1,2,4,8],
18
+ 'num_groups': 8,
19
+ 'heads': 4,
20
+ 'dim_head': 32,
21
+ 'base_ch': 64,
22
+ 'patch_size': 16
23
+ },
24
+ 'param': {
25
+ 'timesteps': 50,
26
+ 'sampling_timesteps': 20,
27
+ 'objective': 'pred_v'
28
+ }
29
+ }
30
+
31
+ STLDM_HKO = {
32
+ 'model': "stldm",
33
+ 'pre': None,
34
+ 'post': None,
35
+ 'vp_param': {
36
+ 'shape_in': (5, 1, 128, 128),
37
+ 'shape_out': (20, 1, 128, 128),
38
+ 'hid_S': 32,
39
+ 'hid_T': 512,
40
+ 'N_S': 4,
41
+ 'N_T': 8,
42
+ 'groups': 8,
43
+ 'last_activation': 'sigmoid',
44
+ },
45
+ 'stldm_param': {
46
+ 'in_ch': 32,
47
+ 'chs_mult': [1,2,4,8],
48
+ 'num_groups': 8,
49
+ 'heads': 4,
50
+ 'dim_head': 32,
51
+ 'base_ch': 64,
52
+ 'patch_size': 16
53
+ },
54
+ 'param': {
55
+ 'timesteps': 50,
56
+ 'sampling_timesteps': 20,
57
+ 'objective': 'pred_v'
58
+ }
59
+ }
60
+
61
+ STLDM_METEO = {
62
+ 'model': "stldm",
63
+ 'pre': None,
64
+ 'post': None,
65
+ 'vp_param': {
66
+ 'shape_in': (5, 1, 128, 128),
67
+ 'shape_out': (20, 1, 128, 128),
68
+ 'hid_S': 32,
69
+ 'hid_T': 512,
70
+ 'N_S': 4,
71
+ 'N_T': 8,
72
+ 'groups': 8,
73
+ 'last_activation': 'sigmoid',
74
+ },
75
+ 'stldm_param': {
76
+ 'in_ch': 32,
77
+ 'chs_mult': [1,2,4,8],
78
+ 'num_groups': 8,
79
+ 'heads': 4,
80
+ 'dim_head': 32,
81
+ 'base_ch': 64,
82
+ 'patch_size': 16
83
+ },
84
+ 'param': {
85
+ 'timesteps': 50,
86
+ 'sampling_timesteps': 20,
87
+ 'objective': 'pred_v'
88
+ }
89
+ }
90
+
91
+
92
+ STLDM_HKO_HF = {
93
+ 'vp_param': {
94
+ 'shape_in': (5, 1, 128, 128),
95
+ 'shape_out': (20, 1, 128, 128),
96
+ 'hid_S': 32,
97
+ 'hid_T': 512,
98
+ 'N_S': 4,
99
+ 'N_T': 8,
100
+ 'groups': 8,
101
+ 'last_activation': 'sigmoid',
102
+ },
103
+ 'stldm_param': {
104
+ 'in_ch': 32,
105
+ 'chs_mult': [1,2,4,8],
106
+ 'num_groups': 8,
107
+ 'heads': 4,
108
+ 'dim_head': 32,
109
+ 'base_ch': 64,
110
+ 'patch_size': 16
111
+ },
112
+ 'timesteps': 50,
113
+ 'sampling_timesteps': 20,
114
+ 'objective': 'pred_v'
115
+ }
stldm/inference.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Tuple
3
+
4
+ from stldm.stldm import model_setup, guidance_scheduler
5
+ from stldm.stldm_spatial import model_setup as spatial_setup
6
+ from stldm.stldm_hf import GaussianDiffusion as hf_setup
7
+
8
+ n2n_setup = {'2D': spatial_setup, '3D': model_setup, 'HF': hf_setup}
9
+
10
+ class InferenceHub:
11
+ """
12
+ Unified inference interface for STLDM models
13
+
14
+ Support local checkpoints and the checkpoint uploaded to Hugging Face.
15
+
16
+ Params:
17
+ - model_config: dict, the model configuration found in "stldm/model_config.py"
18
+ - model_ckpt: str, the path to the model checkpoint. For 'HF' model_type, this can be None.
19
+ - cfg_str: float, the classifier-free guidance strength. If None, no CFG is applied.
20
+ - model_type: str, the type of the model. Options are '2D', '3D', and 'HF'.
21
+ """
22
+
23
+ def __init__(self, model_config, model_ckpt:str=None, cfg_str:float=None, model_type:str='3D', gpu='auto'):
24
+ self.input_size = model_config['vp_param']['shape_in']
25
+ self.sampling_steps = model_config['param']['timesteps']
26
+ self.model_config = self.setup_config(model_config, model_type)
27
+
28
+ self.model = self.setup_model(model_type, self.model_config, model_ckpt)
29
+ self.setup_cfg(cfg_str)
30
+
31
+ if gpu is not None:
32
+ if gpu == 'auto':
33
+ if torch.cuda.device_count() > 0:
34
+ self.model.to(device="cuda")
35
+ else:
36
+ self.model.to(device=f"cuda:{gpu}")
37
+
38
+ def setup_config(self, model_config, model_type):
39
+ if model_type == 'HF':
40
+ HF_config = {
41
+ 'vp_param': model_config['vp_param'],
42
+ 'stldm_param': model_config['stldm_param'],
43
+ **model_config['param'],
44
+ }
45
+ return HF_config
46
+ else:
47
+ return model_config
48
+
49
+ def setup_model(self, model_type, model_config, model_ckpt):
50
+ if model_type not in n2n_setup:
51
+ raise ValueError(f"model_type should be one of {str(list(n2n_setup.keys()))}")
52
+
53
+ if model_type == 'HF':
54
+ model = n2n_setup[model_type](**model_config).from_pretrained("sqfoo/STLDM_official")
55
+ else:
56
+ model = n2n_setup[model_type](model_config)
57
+ model.load_state_dict(torch.load(model_ckpt))
58
+ model.eval()
59
+ return model
60
+
61
+ def setup_cfg(self, cfg_str):
62
+ guidance = guidance_scheduler(sampling_step=self.sampling_steps, const=cfg_str) if cfg_str is not None else None
63
+ self.model.setup_guidance(guidance)
64
+
65
+ """
66
+ This method performs inference on the input tensor.
67
+
68
+ Params:
69
+ - input_x: torch.tensor, the input tensor with shape (B T C H W) or (T C H W)
70
+ - include_mu: bool, whether to return the intermediate representation 'mu' along with the final prediction
71
+ """
72
+ @torch.no_grad()
73
+ def __call__(self, input_x: torch.tensor, include_mu: bool = False):
74
+ ndim = input_x.ndim
75
+ if ndim not in (5, 4):
76
+ raise ValueError("Please specify the input has the either format of (B T C H W) or (T C H W)")
77
+ input_device = input_x.device
78
+
79
+ if ndim == 4:
80
+ input_x = input_x.unsqueeze(0)
81
+
82
+ if input_x.shape[1:] != self.input_size:
83
+ raise ValueError(f"Ensure that the input has the shape of {str(self.input_size)}")
84
+
85
+ input_x = input_x.to(self.model.device)
86
+ if include_mu:
87
+ y_pred, mu = self.model(input_x, includ_mu=include_mu)
88
+ else:
89
+ y_pred = self.model(input_x, includ_mu=include_mu)
90
+ mu = None
91
+
92
+ if mu is not None:
93
+ mu = mu.to(input_device)
94
+ y_pred = y_pred.to(input_device)
95
+
96
+ if ndim == 4:
97
+ y_pred = y_pred[0]
98
+ mu = mu if mu is None else mu[0]
99
+ return (y_pred, mu) if include_mu else y_pred
stldm/modules.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+
4
+ from stldm.submodules import ChannelConversion
5
+ from stldm.simvpv2 import stride_generator, ConvSC, MidMetaNet
6
+
7
+ class Encoder(nn.Module):
8
+ def __init__(self, C_in, C_hid, N_S):
9
+ super(Encoder, self).__init__()
10
+ strides = stride_generator(N_S)
11
+ self.enc = nn.Sequential(
12
+ ConvSC(C_in, C_hid, stride=strides[0]),
13
+ *[ConvSC(C_hid, C_hid, stride=s) for s in strides[1:]],
14
+ ChannelConversion(C_hid, 2*C_hid)
15
+ )
16
+
17
+ def forward(self, x):
18
+ for encoder in self.enc:
19
+ x = encoder(x)
20
+ (mean, log_var) = torch.chunk(x, 2, dim=1)
21
+ return mean, log_var
22
+
23
+ class Decoder(nn.Module):
24
+ def __init__(self, C_hid, C_out, N_S, last_activation='sigmoid'):
25
+ super(Decoder,self).__init__()
26
+ strides = stride_generator(N_S, reverse=True)
27
+ self.dec = nn.Sequential(
28
+ ChannelConversion(C_hid, C_hid),
29
+ *[ConvSC(C_hid, C_hid, stride=s, transpose=True) for s in strides[:-1]],
30
+ ConvSC(C_hid, C_hid, stride=strides[-1], transpose=True)# Modify HERE
31
+ )
32
+ self.readout = nn.Conv2d(C_hid, C_out, 1)
33
+ if last_activation=='sigmoid':
34
+ self.last = nn.Sigmoid()
35
+ else:
36
+ self.last = nn.Identity()
37
+
38
+ def forward(self, x):
39
+ for decoder in self.dec:
40
+ x = decoder(x)
41
+ Y = self.readout(x)
42
+ return self.last(Y)
43
+
44
+
45
+ class VAE(nn.Module):
46
+ def __init__(self, C_in, hid_S, N_S, last_activation='none'):
47
+ super(VAE, self).__init__()
48
+ self.encoder = Encoder(C_in, hid_S, N_S)
49
+ self.decoder = Decoder(hid_S, C_in, N_S, last_activation)
50
+
51
+ def sample_from_standard_normal(self, mean, log_var):
52
+ std = (0.5 * log_var).exp()
53
+ return mean + std * torch.randn_like(mean)
54
+
55
+ def encode(self, x):
56
+ assert x.ndim==4
57
+ mean, log_var = self.encoder(x)
58
+ return mean, log_var
59
+
60
+ def decode(self, z):
61
+ assert z.ndim==4
62
+ dec = self.decoder(z)
63
+ return dec
64
+
65
+ def kl_from_standard_normal(self, mean, log_var):
66
+ kl = 0.5 * (log_var.exp() + mean.square() - 1.0 - log_var)
67
+ return kl.mean()
68
+
69
+ def _losses_(self, x, y):
70
+ mean, log_var = self.encode(x)
71
+ kl_loss = self.kl_from_standard_normal(mean, log_var)
72
+
73
+ y_pred = self.forward(x)
74
+ recon_loss = nn.MSELoss()(y_pred, y)
75
+ return recon_loss, kl_loss
76
+
77
+ def forward(self, x):
78
+ mu_z, log_var = self.encode(x)
79
+
80
+ z = self.sample_from_standard_normal(mu_z, log_var)
81
+ recon = self.decode(z)
82
+ return recon
83
+
84
+ class SimVPV2_Model(nn.Module):
85
+ def __init__(self, shape_in, shape_out, hid_S=16, hid_T=256, N_S=4, N_T=4,
86
+ mlp_ratio=8., drop=0.0, drop_path=0.0, spatio_kernel_enc=3,
87
+ spatio_kernel_dec=3, last_activation='none', act_inplace=True, **kwargs):
88
+ super(SimVPV2_Model, self).__init__()
89
+ T, C, H, W = shape_in # T is pre_seq_length
90
+ T2, C2, H2, W2 = shape_out # T2 is output length
91
+ assert C==C2 and H==H2 and W==W2, 'Need to be the same image shape for input and output'
92
+ self.T2 = T2
93
+ self.T = T
94
+
95
+ H, W = int(H / 2**(N_S/2)), int(W / 2**(N_S/2)) # downsample 1 / 2**(N_S/2)
96
+
97
+ self.vae = VAE(C_in=C, hid_S=hid_S, N_S=N_S, last_activation=last_activation)
98
+ self.hid = MidMetaNet(T*hid_S, T2*hid_S*2, hid_T, N_T,
99
+ input_resolution=(H, W), model_type='gsta',
100
+ mlp_ratio=mlp_ratio, drop=drop, drop_path=drop_path)
101
+
102
+ def forward(self, x_raw):
103
+ B, T, C, H, W = x_raw.shape
104
+ x = x_raw.reshape(B*T, C, H, W)
105
+
106
+ embed, log_var = self.vae.encode(x)
107
+ embed = self.vae.sample_from_standard_normal(embed, log_var)
108
+ *_, C_, H_, W_ = embed.shape
109
+ z = embed.view(B, T, C_, H_, W_)
110
+
111
+ hid, *_ = self.hid(z)
112
+ hid_mu, log_var_hid = torch.chunk(hid, 2, dim=1)
113
+ hid = self.vae.sample_from_standard_normal(hid_mu, log_var_hid)
114
+
115
+ hid = hid.reshape(B*self.T2, C_, H_, W_)
116
+ # conds_ = hid
117
+ conds_ = hid_mu.reshape(B*self.T2, C_, H_, W_)
118
+
119
+ Y = self.vae.decode(hid)
120
+ Y = Y.reshape(B, self.T2, C, H, W)
121
+ return Y, conds_
122
+
123
+ def _losses_(self, x, y):
124
+ y_pred, *_ = self.forward(x)
125
+ recon_loss = nn.MSELoss()(y_pred, y)
126
+ return recon_loss
stldm/simvpv2.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch import nn
2
+ import torch, math
3
+
4
+ # from torchmodels.simvp import ConvSC, stride_generator
5
+
6
+ class SimVPV2_Model(nn.Module):
7
+ r"""SimVP Model
8
+
9
+ Implementation of `SimVP: Simpler yet Better Video Prediction
10
+ Just Remove The Skip Connection
11
+ <https://arxiv.org/abs/2206.05099>`_.
12
+
13
+ """
14
+ def __init__(self, shape_in, shape_out, hid_S=16, hid_T=256, N_S=4, N_T=4,
15
+ mlp_ratio=8., drop=0.0, drop_path=0.0, spatio_kernel_enc=3,
16
+ spatio_kernel_dec=3, last_activation='none', act_inplace=True, recursive=False, **kwargs):
17
+ super(SimVPV2_Model, self).__init__()
18
+ T, C, H, W = shape_in # T is pre_seq_length
19
+ T2, C2, H2, W2 = shape_out # T2 is output length
20
+ assert C==C2 and H==H2 and W==W2, 'Need to be the same image shape for input and output'
21
+ self.T2 = T2
22
+ self.T = T
23
+
24
+ H, W = int(H / 2**(N_S/2)), int(W / 2**(N_S/2)) # downsample 1 / 2**(N_S/2)
25
+ act_inplace = False
26
+
27
+ self.enc = Encoder(C, hid_S, N_S)#, spatio_kernel_enc, act_inplace=act_inplace)
28
+ self.dec = Decoder(hid_S, C, N_S, last_activation)#, spatio_kernel_dec, act_inplace=act_inplace)
29
+
30
+ # Modify HERE
31
+ self.recursive = recursive
32
+ if not self.recursive:
33
+ self.hid = MidMetaNet(T*hid_S, T2*hid_S, hid_T, N_T,
34
+ input_resolution=(H, W), model_type='gsta',
35
+ mlp_ratio=mlp_ratio, drop=drop, drop_path=drop_path)
36
+ else:
37
+ self.hid = MidMetaNet(T*hid_S, T*hid_S, hid_T, N_T,
38
+ input_resolution=(H, W), model_type='gsta',
39
+ mlp_ratio=mlp_ratio, drop=drop, drop_path=drop_path)
40
+ self.last_activation = last_activation
41
+
42
+ def forward(self, x_raw, **kwargs):
43
+ B, T, C, H, W = x_raw.shape
44
+ # x = x_raw.view(B*T, C, H, W)
45
+ x = x_raw.reshape(B*T, C, H, W)
46
+
47
+ embed = self.enc(x)
48
+ _, C_, H_, W_ = embed.shape
49
+
50
+ z = embed.view(B, T, C_, H_, W_)
51
+
52
+ if not self.recursive:
53
+ hid, conds_ = self.hid(z)
54
+ else:
55
+ no = self.T2//self.T
56
+ if self.T2%self.T != 0:
57
+ no += 1
58
+ hid = []
59
+ for i in range(no):
60
+ z, _ = self.hid(z)
61
+ hid.append(z)
62
+ hid = torch.cat(hid, dim=1)
63
+ hid = hid[:, :self.T2]
64
+ conds_ = hid.reshape(-1, C_, H_, W_)
65
+ # print(hid.shape, conds_.shape)
66
+
67
+ hid = hid.reshape(B*self.T2, C_, H_, W_)
68
+
69
+ Y = self.dec(hid)
70
+ Y = Y.reshape(B, self.T2, C, H, W)
71
+ return Y, conds_, hid.reshape(B, -1, C_, H_, W_)
72
+
73
+ def recon_loss(self, x, y):
74
+ X = torch.cat((x, y), dim=1)
75
+ B, T, C, H, W = X.shape
76
+ X = X.reshape(-1, C, H, W)
77
+ recon = self.dec(self.enc(X))
78
+ return nn.MSELoss()(recon, X)
79
+
80
+
81
+ class MidMetaNet(nn.Module):
82
+ """The hidden Translator of MetaFormer for SimVP"""
83
+ # Modify HERE with an additional param: channel_out
84
+ def __init__(self, channel_in, channel_out, channel_hid, N2,
85
+ input_resolution=None, model_type=None,
86
+ mlp_ratio=4., drop=0.0, drop_path=0.1):
87
+ super(MidMetaNet, self).__init__()
88
+ assert N2 >= 2 and mlp_ratio > 1
89
+ self.N2 = N2
90
+ dpr = [ # stochastic depth decay rule
91
+ x.item() for x in torch.linspace(1e-2, drop_path, self.N2)]
92
+
93
+ # downsample
94
+ enc_layers = [MetaBlock(
95
+ channel_in, channel_hid, input_resolution, model_type,
96
+ mlp_ratio, drop, drop_path=dpr[0], layer_i=0)]
97
+ # middle layers
98
+ for i in range(1, N2-1):
99
+ enc_layers.append(MetaBlock(
100
+ channel_hid, channel_hid, input_resolution, model_type,
101
+ mlp_ratio, drop, drop_path=dpr[i], layer_i=i))
102
+
103
+ # upsample
104
+ # Modify HERE
105
+ enc_layers.append(MetaBlock(
106
+ channel_hid, channel_out, input_resolution, model_type,
107
+ mlp_ratio, drop, drop_path=drop_path, layer_i=N2-1))
108
+ self.enc = nn.Sequential(*enc_layers)
109
+
110
+ def forward(self, x):
111
+ B, T, C, H, W = x.shape
112
+ x = x.reshape(B, T*C, H, W)
113
+
114
+ z = x
115
+ conds = [z]
116
+ for i in range(self.N2):
117
+ z = self.enc[i](z)
118
+ conds.append(z)
119
+
120
+ y = z.reshape(B, -1, C, H, W)
121
+ return y, y.reshape(-1, C, H, W) #conds #conds[:-1]
122
+
123
+ class MetaBlock(nn.Module):
124
+ """The hidden Translator of MetaFormer for SimVP"""
125
+
126
+ def __init__(self, in_channels, out_channels, input_resolution=None, model_type=None,
127
+ mlp_ratio=8., drop=0.0, drop_path=0.0, layer_i=0):
128
+ super(MetaBlock, self).__init__()
129
+ self.in_channels = in_channels
130
+ self.out_channels = out_channels
131
+ model_type = model_type.lower() if model_type is not None else 'gsta'
132
+
133
+ if model_type == 'gsta':
134
+ self.block = GASubBlock(
135
+ in_channels, kernel_size=21, mlp_ratio=mlp_ratio,
136
+ drop=drop, drop_path=drop_path, act_layer=nn.GELU)
137
+ else:
138
+ assert False and "Invalid model_type in SimVP"
139
+
140
+ if in_channels != out_channels:
141
+ self.reduction = nn.Conv2d(
142
+ in_channels, out_channels, kernel_size=1, stride=1, padding=0)
143
+
144
+ def forward(self, x):
145
+ z = self.block(x)
146
+ return z if self.in_channels == self.out_channels else self.reduction(z)
147
+
148
+ class GASubBlock(nn.Module):
149
+ """A GABlock (gSTA) for SimVP"""
150
+
151
+ def __init__(self, dim, kernel_size=21, mlp_ratio=4.,
152
+ drop=0., drop_path=0.1, init_value=1e-2, act_layer=nn.GELU):
153
+ super().__init__()
154
+ self.norm1 = nn.BatchNorm2d(dim)
155
+ self.attn = SpatialAttention(dim, kernel_size)
156
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
157
+
158
+ self.norm2 = nn.BatchNorm2d(dim)
159
+ mlp_hidden_dim = int(dim * mlp_ratio)
160
+ self.mlp = MixMlp(
161
+ in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
162
+
163
+ self.layer_scale_1 = nn.Parameter(init_value * torch.ones((dim)), requires_grad=True)
164
+ self.layer_scale_2 = nn.Parameter(init_value * torch.ones((dim)), requires_grad=True)
165
+
166
+ self.apply(self._init_weights)
167
+
168
+ def _init_weights(self, m):
169
+ if isinstance(m, nn.Linear):
170
+ trunc_normal_(m.weight, std=.02)
171
+ if isinstance(m, nn.Linear) and m.bias is not None:
172
+ nn.init.constant_(m.bias, 0)
173
+ elif isinstance(m, nn.LayerNorm):
174
+ nn.init.constant_(m.bias, 0)
175
+ nn.init.constant_(m.weight, 1.0)
176
+ elif isinstance(m, nn.Conv2d):
177
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
178
+ fan_out //= m.groups
179
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
180
+ if m.bias is not None:
181
+ m.bias.data.zero_()
182
+
183
+ @torch.jit.ignore
184
+ def no_weight_decay(self):
185
+ return {'layer_scale_1', 'layer_scale_2'}
186
+
187
+ def forward(self, x):
188
+ x = x + self.drop_path(
189
+ self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * self.attn(self.norm1(x)))
190
+ x = x + self.drop_path(
191
+ self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(self.norm2(x)))
192
+ return x
193
+
194
+ class SpatialAttention(nn.Module):
195
+ """A Spatial Attention block for SimVP"""
196
+
197
+ def __init__(self, d_model, kernel_size=21, attn_shortcut=True):
198
+ super().__init__()
199
+
200
+ self.proj_1 = nn.Conv2d(d_model, d_model, 1) # 1x1 conv
201
+ self.activation = nn.GELU() # GELU
202
+ self.spatial_gating_unit = AttentionModule(d_model, kernel_size)
203
+ self.proj_2 = nn.Conv2d(d_model, d_model, 1) # 1x1 conv
204
+ self.attn_shortcut = attn_shortcut
205
+
206
+ def forward(self, x):
207
+ if self.attn_shortcut:
208
+ shortcut = x.clone()
209
+ x = self.proj_1(x)
210
+ x = self.activation(x)
211
+ x = self.spatial_gating_unit(x)
212
+ x = self.proj_2(x)
213
+ if self.attn_shortcut:
214
+ x = x + shortcut
215
+ return x
216
+
217
+ class AttentionModule(nn.Module):
218
+ """Large Kernel Attention for SimVP"""
219
+
220
+ def __init__(self, dim, kernel_size, dilation=3):
221
+ super().__init__()
222
+ d_k = 2 * dilation - 1
223
+ d_p = (d_k - 1) // 2
224
+ dd_k = kernel_size // dilation + ((kernel_size // dilation) % 2 - 1)
225
+ dd_p = (dilation * (dd_k - 1) // 2)
226
+
227
+ self.conv0 = nn.Conv2d(dim, dim, d_k, padding=d_p, groups=dim)
228
+ self.conv_spatial = nn.Conv2d(
229
+ dim, dim, dd_k, stride=1, padding=dd_p, groups=dim, dilation=dilation)
230
+ self.conv1 = nn.Conv2d(dim, 2*dim, 1)
231
+
232
+ def forward(self, x):
233
+ u = x.clone()
234
+ attn = self.conv0(x) # depth-wise conv
235
+ attn = self.conv_spatial(attn) # depth-wise dilation convolution
236
+
237
+ f_g = self.conv1(attn)
238
+ split_dim = f_g.shape[1] // 2
239
+ f_x, g_x = torch.split(f_g, split_dim, dim=1)
240
+ return torch.sigmoid(g_x) * f_x
241
+
242
+ class DWConv(nn.Module):
243
+ def __init__(self, dim=768):
244
+ super(DWConv, self).__init__()
245
+ self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
246
+
247
+ def forward(self, x):
248
+ x = self.dwconv(x)
249
+ return x
250
+
251
+
252
+ class MixMlp(nn.Module):
253
+ def __init__(self,
254
+ in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
255
+ super().__init__()
256
+ out_features = out_features or in_features
257
+ hidden_features = hidden_features or in_features
258
+ self.fc1 = nn.Conv2d(in_features, hidden_features, 1) # 1x1
259
+ self.dwconv = DWConv(hidden_features) # CFF: Convlutional feed-forward network
260
+ self.act = act_layer() # GELU
261
+ self.fc2 = nn.Conv2d(hidden_features, out_features, 1) # 1x1
262
+ self.drop = nn.Dropout(drop)
263
+ self.apply(self._init_weights)
264
+
265
+ def _init_weights(self, m):
266
+ if isinstance(m, nn.Linear):
267
+ trunc_normal_(m.weight, std=.02)
268
+ if isinstance(m, nn.Linear) and m.bias is not None:
269
+ nn.init.constant_(m.bias, 0)
270
+ elif isinstance(m, nn.LayerNorm):
271
+ nn.init.constant_(m.bias, 0)
272
+ nn.init.constant_(m.weight, 1.0)
273
+ elif isinstance(m, nn.Conv2d):
274
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
275
+ fan_out //= m.groups
276
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
277
+ if m.bias is not None:
278
+ m.bias.data.zero_()
279
+
280
+ def forward(self, x):
281
+ x = self.fc1(x)
282
+ x = self.dwconv(x)
283
+ x = self.act(x)
284
+ x = self.drop(x)
285
+ x = self.fc2(x)
286
+ x = self.drop(x)
287
+ return x
288
+
289
+
290
+ """
291
+ From TIMM repo: https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py
292
+ """
293
def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
    """Stochastic depth: randomly zero entire samples of a residual branch.

    Adapted from the TIMM implementation. A per-sample Bernoulli mask of
    shape (B, 1, ..., 1) is drawn with keep probability 1 - drop_prob and
    multiplied into `x`; kept samples are optionally rescaled by
    1 / keep_prob so the expected value is unchanged.

    No-op when drop_prob == 0 or when not in training mode.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    # One mask value per sample, broadcast over all remaining dims.
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        mask.div_(keep_prob)
    return x * mask
311
+
312
+
313
class DropPath(nn.Module):
    """Stochastic-depth module: randomly skips this branch per sample.

    Thin nn.Module wrapper around the functional `drop_path`; only active
    when the module is in training mode.
    """
    def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
        # drop_prob: probability of zeroing a sample's branch output.
        # scale_by_keep: rescale kept samples by 1/keep_prob.
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        # self.training toggles the drop; eval mode is always identity.
        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)

    def extra_repr(self):
        # Shown in the module repr, e.g. "DropPath(drop_prob=0.100)".
        return f'drop_prob={round(self.drop_prob,3):0.3f}'
326
+
327
+ def _trunc_normal_(tensor, mean, std, a, b):
328
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
329
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
330
+ def norm_cdf(x):
331
+ # Computes standard normal cumulative distribution function
332
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
333
+
334
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
335
+ warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
336
+ "The distribution of values may be incorrect.",
337
+ stacklevel=2)
338
+
339
+ # Values are generated by using a truncated uniform distribution and
340
+ # then using the inverse CDF for the normal distribution.
341
+ # Get upper and lower cdf values
342
+ l = norm_cdf((a - mean) / std)
343
+ u = norm_cdf((b - mean) / std)
344
+
345
+ # Uniformly fill tensor with values from [l, u], then translate to
346
+ # [2l-1, 2u-1].
347
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
348
+
349
+ # Use inverse cdf transform for normal distribution to get truncated
350
+ # standard normal
351
+ tensor.erfinv_()
352
+
353
+ # Transform to proper mean, std
354
+ tensor.mul_(std * math.sqrt(2.))
355
+ tensor.add_(mean)
356
+
357
+ # Clamp to ensure it's in the proper range
358
+ tensor.clamp_(min=a, max=b)
359
+ return tensor
360
+
361
class Encoder(nn.Module):
    """Spatial encoder: a stack of N_S ConvSC layers.

    Strides come from stride_generator(N_S) (alternating 1, 2); the first
    layer maps C_in -> C_hid, the rest keep C_hid channels.
    """
    def __init__(self, C_in, C_hid, N_S):
        super(Encoder, self).__init__()
        strides = stride_generator(N_S)
        layers = [ConvSC(C_in, C_hid, stride=strides[0])]
        layers.extend(ConvSC(C_hid, C_hid, stride=s) for s in strides[1:])
        self.enc = nn.Sequential(*layers)

    def forward(self, x):  # x: (B*T, C_in, H, W)
        # Run every layer explicitly; equivalent to self.enc(x).
        latent = x
        for layer in self.enc:
            latent = layer(latent)
        return latent
376
+
377
class Decoder(nn.Module):
    """Spatial decoder mirroring Encoder: N_S transposed ConvSC layers,
    a 1x1 readout conv to C_out channels, then Sigmoid (or Identity).
    """
    def __init__(self, C_hid, C_out, N_S, last_activation='sigmoid'):
        super(Decoder, self).__init__()
        # Reversed stride schedule undoes the encoder's downsampling.
        strides = stride_generator(N_S, reverse=True)
        self.dec = nn.Sequential(
            *(ConvSC(C_hid, C_hid, stride=s, transpose=True) for s in strides)
        )
        self.readout = nn.Conv2d(C_hid, C_out, 1)
        self.last = nn.Sigmoid() if last_activation == 'sigmoid' else nn.Identity()

    def forward(self, hid):
        out = hid
        for layer in self.dec:
            out = layer(out)
        return self.last(self.readout(out))
397
+
398
class BasicConv2d(nn.Module):
    """Conv2d or ConvTranspose2d, optionally followed by GroupNorm + LeakyReLU.

    transpose=True selects ConvTranspose2d with output_padding=stride//2 so
    a stride-2 layer exactly doubles the spatial size.
    act_norm=True applies GroupNorm(2 groups) then LeakyReLU(0.2) after the conv.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, transpose=False, act_norm=False):
        super(BasicConv2d, self).__init__()
        self.act_norm = act_norm
        if transpose:
            self.conv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=kernel_size,
                                           stride=stride, padding=padding, output_padding=stride // 2)
        else:
            self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
                                  stride=stride, padding=padding)
        # Built unconditionally (matching original), applied only when act_norm.
        self.norm = nn.GroupNorm(2, out_channels)
        self.act = nn.LeakyReLU(0.2, inplace=True)

    def forward(self, x):
        out = self.conv(x)
        return self.act(self.norm(out)) if self.act_norm else out
414
+
415
+
416
class ConvSC(nn.Module):
    """3x3 BasicConv2d wrapper (padding 1, act+norm on by default).

    A stride-1 layer never needs a transposed conv, so transpose is
    forced off in that case.
    """
    def __init__(self, C_in, C_out, stride, transpose=False, act_norm=True):
        super(ConvSC, self).__init__()
        transpose = transpose and stride != 1
        self.conv = BasicConv2d(C_in, C_out, kernel_size=3, stride=stride,
                                padding=1, transpose=transpose, act_norm=act_norm)

    def forward(self, x):
        return self.conv(x)
427
+
428
def stride_generator(N, reverse=False):
    """First N entries of the alternating stride pattern [1, 2, 1, 2, ...].

    reverse=True returns the same prefix in reversed order (used by the
    decoder to mirror the encoder). Supports N up to 20.
    """
    pattern = ([1, 2] * 10)[:N]
    return pattern[::-1] if reverse else pattern
stldm/stldm.py ADDED
@@ -0,0 +1,612 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, random
2
+ from torch import nn
3
+ from einops import rearrange
4
+
5
+ from stldm.submodules import *
6
+
7
class Down_Block(nn.Module):
    """Downsampling stage of the latent-diffusion U-Net.

    Pipeline per call: concat(x, cond) -> ResNet block (time-conditioned)
    -> spatial attention -> ResNet block -> temporal attention ->
    downsample (or channel conversion on the last stage).

    Returns (downsampled output, spatial-attention map, temporal-attention
    map); the two attention maps are consumed as skip connections by
    Up_Block.
    """
    def __init__(self, in_ch, hid_ch, out_ch, time_dim, is_last, patch_size=None, num_groups=8, heads=4, dim_head=32):
        super(Down_Block, self).__init__()
        self.block1 = ResnetBlock(dim=in_ch, dim_out=hid_ch, time_emb_dim=time_dim, groups=num_groups)
        # Quadratic attention when no patch size is given, linear (patched) otherwise.
        self.attn_spatial = Residual(PreNorm(hid_ch, Quadratic_SpatialAttention(dim=hid_ch, heads=heads, dim_head=dim_head))) if patch_size is None else Residual(PreNorm(hid_ch, Linear_SpatialAttention(dim=hid_ch, patch_size=patch_size, heads=heads, dim_head=dim_head)))
        self.block2 = ResnetBlock(dim=hid_ch, dim_out=hid_ch, groups=num_groups)
        # self.attn_temporal = Residual(PreNorm(hid_ch, TemporalAttention_Pos(dim=hid_ch, heads=heads, dim_head=dim_head)))
        self.attn_temporal = Residual(PreNorm(hid_ch, TemporalAttention(dim=hid_ch, heads=heads, dim_head=dim_head)))
        self.last = Downsample2D(dim_in=hid_ch, dim_out=out_ch) if not is_last else ChannelConversion(hid_ch, out_ch)

    def forward(self, x, time_emb, cond=None, relative_pos=None):
        # x: (B, T, C, H, W). relative_pos is currently unused — the
        # TemporalAttention_Pos variant that consumed it is commented out.
        assert x.ndim==5
        B, T, C, H, W = x.shape

        # Fold time into batch for the 2D spatial modules.
        x = x.reshape(B*T, C, H, W)
        if cond is None:
            cond = torch.zeros_like(x) # -> Unconditioning

        # Broadcast the per-batch time embedding (B, D) to every frame: (B*T, D).
        time_emb = time_emb.unsqueeze(1) # From (B C) to (B 1 C)
        time_emb = time_emb.repeat(1, T, 1)
        time_emb = time_emb.reshape(B*T, -1)

        # Channel-concat conditioning: block1's in_ch must equal 2x the latent C.
        out = torch.cat((x, cond), dim=1) # BT, 2C, H, W
        out = self.block1(out, time_emb)

        spatial_attn = self.attn_spatial(out)
        out = self.block2(spatial_attn, time_emb)
        *_, c, h, w = out.shape
        # Unfold back to (B, T, ...) so temporal attention sees the time axis.
        out = out.reshape(B,T,c,h,w)

        # temporal_attn = self.attn_temporal(out, relative_pos)
        temporal_attn = self.attn_temporal(out)
        temporal_attn = temporal_attn.reshape(B*T,c,h,w)

        out = self.last(temporal_attn)
        *_, c, h, w = out.shape

        return out.reshape(B, T, c, h, w), spatial_attn, temporal_attn
45
+
46
class MidBlock(nn.Module):
    """Bottleneck of the latent-diffusion U-Net.

    ResNet -> quadratic spatial attention -> ResNet -> temporal attention
    -> ResNet, all time-conditioned. Channel count and resolution are
    preserved (in_ch -> in_ch).
    """
    def __init__(self, in_ch, time_dim, num_groups=8, heads=4, dim_head=32):
        super(MidBlock, self).__init__()
        self.block1 = ResnetBlock(dim=in_ch, dim_out=in_ch, time_emb_dim=time_dim, groups=num_groups)
        self.qattn_spatial = Residual(PreNorm(in_ch, Quadratic_SpatialAttention(dim=in_ch, heads=heads, dim_head=dim_head)))
        self.block2 = ResnetBlock(dim=in_ch, dim_out=in_ch, time_emb_dim=time_dim, groups=num_groups)
        # self.qattn_time = Residual(PreNorm(in_ch, TemporalAttention_Pos(dim=in_ch, heads=heads, dim_head=dim_head)))
        self.qattn_time = Residual(PreNorm(in_ch, TemporalAttention(dim=in_ch, heads=heads, dim_head=dim_head)))
        self.block3 = ResnetBlock(dim=in_ch, dim_out=in_ch, time_emb_dim=time_dim, groups=num_groups)

    def forward(self, x, time_emb, relative_pos=None):
        # x: (B, T, C, H, W). relative_pos is currently unused (see the
        # commented-out TemporalAttention_Pos call below).
        assert x.ndim==5
        B, T, C, H, W = x.shape
        # Fold time into batch for the 2D modules.
        x = x.reshape(B*T, C, H, W)

        # Broadcast the per-batch time embedding (B, D) to every frame: (B*T, D).
        time_emb = time_emb.unsqueeze(1) # From (B C) to (B 1 C)
        time_emb = time_emb.repeat(1, T, 1)
        time_emb = time_emb.reshape(B*T, -1)

        out = self.block1(x, time_emb)
        out = self.qattn_spatial(out)
        out = self.block2(out, time_emb) # a little bit difference here

        # Re-expose the time axis for temporal attention, then fold back.
        out = out.reshape((B, T, C, H, W))
        # out = self.qattn_time(out, relative_pos).reshape(B*T, C, H, W)
        out = self.qattn_time(out).reshape(B*T, C, H, W)
        out = self.block3(out, time_emb)

        *_, c, h, w = out.shape
        return out.reshape(B, T, c, h, w)
76
+
77
class Up_Block(nn.Module):
    """Upsampling stage of the latent-diffusion U-Net.

    Pipeline: upsample (or channel conversion on the outermost stage)
    -> temporal attention -> concat temporal skip -> ResNet block
    -> spatial attention -> concat spatial skip -> ResNet block.

    The two skips are the attention maps returned by the matching Down_Block.
    """
    def __init__(self, in_chs, hid_ch, out_ch, is_last, time_dim, patch_size=None, num_groups=8, heads=4, dim_head=32):
        super(Up_Block, self).__init__()
        # in_chs = (channels coming up the decoder, channels of the skip maps).
        in_ch, skip_ch = in_chs
        self.up = Upsample2D(dim_in=in_ch, dim_out=hid_ch) if not is_last else ChannelConversion(in_ch, hid_ch)
        self.attn_spatial = Residual(PreNorm(hid_ch, Quadratic_SpatialAttention(dim=hid_ch, heads=heads, dim_head=dim_head) if patch_size is None else Linear_SpatialAttention(dim=hid_ch, patch_size=patch_size, heads=heads, dim_head=dim_head)))
        self.block1 = ResnetBlock(dim=hid_ch+skip_ch, dim_out=hid_ch, time_emb_dim=time_dim, groups=num_groups)
        # self.attn_temporal = Residual(PreNorm(hid_ch, TemporalAttention_Pos(dim=hid_ch, heads=heads, dim_head=dim_head)))
        self.attn_temporal = Residual(PreNorm(hid_ch, TemporalAttention(dim=hid_ch, heads=heads, dim_head=dim_head)))
        self.block2 = ResnetBlock(dim=hid_ch+skip_ch, dim_out=out_ch, time_emb_dim=time_dim, groups=num_groups)

    def forward(self, x, time_emb, spatialattn_skip, tempattn_skip, relative_pos=None):
        # x: (B, T, C, H, W); skips are (B*T, skip_ch, h, w) from Down_Block.
        # relative_pos is currently unused (TemporalAttention_Pos commented out).
        assert x.ndim==5
        B, T, C, H, W = x.shape
        x = x.reshape(B*T, C, H, W)

        # Broadcast the per-batch time embedding (B, D) to every frame: (B*T, D).
        time_emb = time_emb.unsqueeze(1) # From (B C) to (B 1 C)
        time_emb = time_emb.repeat(1, T, 1)
        time_emb = time_emb.reshape(B*T, -1)

        out = self.up(x)
        *_, c, h, w = out.shape
        out = out.reshape(-1, T, c, h, w)

        # out = self.attn_temporal(out, relative_pos).reshape(B*T, c, h, w)
        out = self.attn_temporal(out).reshape(B*T, c, h, w)

        # Temporal skip first (feeds block1), then spatial skip (feeds block2).
        out = torch.cat((out, tempattn_skip), dim=1)
        out = self.block1(out, time_emb)

        out = self.attn_spatial(out)

        out = torch.cat((out, spatialattn_skip), dim=1)
        out = self.block2(out, time_emb)
        *_, c, h, w = out.shape
        return out.reshape(B, T, c, h, w)
113
+
114
class LDM(nn.Module):
    """Spatio-temporal U-Net used as the latent-diffusion denoiser.

    Built from len(chs_mult) Down_Blocks, a MidBlock, and matching
    Up_Blocks; a parallel `conditions` stack of Downsample2D layers keeps
    the conditioning tensor at the resolution each Down_Block expects.

    forward(x, time, conds) expects x: (B, T, C, H, W) latents, `time` the
    diffusion timestep batch, `conds` an optional conditioning latent
    (None -> unconditional); returns a tensor of the same shape family.
    """
    def __init__(self, in_ch, chs_mult:tuple, patch_size=None, num_groups=8, heads=4, dim_head=32, base_ch=64):
        super(LDM, self).__init__()
        # Time Embedding MLP: sinusoidal fourier features -> MLP of width 4*base_ch.
        time_dim = 4*base_ch
        fourier_dim = base_ch
        self.time_mlp = Time_MLP(dim=base_ch, time_dim=time_dim, fourier_dim=fourier_dim)

        ups, downs = [], []
        conditions = []

        layer_no = len(chs_mult)
        # Channel schedule, e.g. in_ch=4, base_ch=64, chs_mult=(1,2) -> [4, 64, 128].
        chs = [in_ch, *map(lambda m: base_ch*m, chs_mult)]
        ch_in, ch_out = chs[:-1], chs[1:]
        up_in, up_out = list(reversed(ch_out)), list(reversed(ch_in))

        patches = None if patch_size is None else [patch_size//(2**n) for n in range(layer_no)] # Patch Size should be 2^N
        for n in range(layer_no):
            # Down_Block in_ch is doubled: it concatenates x with cond on channels.
            downs.append(
                Down_Block(in_ch=2*ch_in[n], hid_ch=ch_in[n], out_ch=ch_out[n], time_dim=time_dim, patch_size=None if patch_size is None else patches[n], is_last=(n==layer_no-1), num_groups=num_groups, heads=heads, dim_head=dim_head)
            )
            ups.append(
                Up_Block(in_chs=(up_in[n], ch_in[-n-1]), hid_ch=up_in[n], out_ch=up_out[n], time_dim=time_dim, patch_size=None if patch_size is None else patches[layer_no-n-1], is_last=(n==0), num_groups=num_groups, heads=heads, dim_head=dim_head)
            )
            # NOTE(review): `n != -1` is always true, so a condition
            # downsampler is appended for every level — possibly a leftover
            # from an `n != layer_no-1` guard; confirm intent.
            if n != -1:
                conditions.append(
                    Downsample2D(dim_in=ch_in[n], dim_out=ch_out[n])
                )

        self.downs = nn.ModuleList(downs)
        self.ups = nn.ModuleList(ups)
        self.conditions = nn.ModuleList(conditions)
        self.mid = MidBlock(in_ch=ch_out[-1], time_dim=time_dim, num_groups=num_groups, heads=heads, dim_head=dim_head)
        # self.relative_pos = RelativePositionBias(heads=heads)

    def forward(self, x, time, conds=None):
        # Embed the diffusion timestep once; blocks broadcast it per frame.
        t = self.time_mlp(time)

        # Skip stacks: spatial/temporal attention maps from each Down_Block.
        hid_spatial = []
        hid_temporal = []

        # relative_position = self.relative_pos(x.shape[1], x.device) # Calculate The Relative Position

        for n, down_block in enumerate(self.downs):
            # x, spatial_attn, time_attn = down_block(x, t, conds, relative_position)
            x, spatial_attn, time_attn = down_block(x, t, conds)
            hid_spatial.append(spatial_attn)
            hid_temporal.append(time_attn)
            if conds is not None:
                # Keep the conditioning tensor at the next level's resolution.
                conds = self.conditions[n](conds)

        # out = self.mid(x, t, relative_position)
        out = self.mid(x, t)

        for up_block in self.ups:
            # Pop in LIFO order so each Up_Block gets its mirror's skips.
            # out = up_block(out, t, hid_spatial.pop(), hid_temporal.pop(), relative_position)
            out = up_block(out, t, hid_spatial.pop(), hid_temporal.pop())

        return out
174
+
175
+ # constants
176
+ from collections import namedtuple
177
+ from torch.cuda.amp import autocast
178
+ import torch.nn.functional as F
179
+ from einops import reduce
180
+ from tqdm.auto import tqdm
181
+
182
+ ModelPrediction = namedtuple('ModelPrediction', ['pred_noise', 'pred_x_start'])
183
+
184
def identity(t, *args, **kwargs):
    """Return `t` unchanged; any extra arguments are accepted and ignored."""
    return t
186
+
187
def extract(a, t, x_shape):
    """Gather per-timestep coefficients a[t] and reshape for broadcasting.

    a: 1-D schedule tensor; t: (B,) integer timestep indices; x_shape: the
    shape of the tensor the coefficients will multiply. Returns a tensor of
    shape (B, 1, ..., 1) with len(x_shape) - 1 trailing singleton dims.
    """
    batch = t.shape[0]
    coefs = a.gather(-1, t)
    return coefs.reshape(batch, *([1] * (len(x_shape) - 1)))
191
+
192
def default(val, d):
    """Return `val` when it is not None, else the fallback `d`.

    A callable fallback is invoked lazily, so expensive defaults are only
    computed when needed.
    """
    if val is not None:
        return val
    return d() if callable(d) else d
196
+
197
def exists(x):
    """True iff `x` is not None (0, '', and [] all count as existing)."""
    return x is not None
199
+
200
def guidance_scheduler(sampling_step: int, const: float):
    """Constant classifier-free-guidance schedule.

    Returns a length-`sampling_step` float tensor filled with `const`,
    indexed by timestep during sampling.
    """
    return torch.ones(sampling_step) * const
202
+
203
class GaussianDiffusion(nn.Module):
    """Latent Gaussian diffusion over video frames (DDPM/DDIM style).

    Wraps a video-prediction backbone (`vp_model`, providing a VAE and a
    conditioning path) and a denoising U-Net (`diffusion`). Noise schedules
    and derived coefficients are precomputed as float32 buffers in __init__.
    Closely follows lucidrains' denoising-diffusion-pytorch.
    """
    def __init__(
        self,
        vp_model,
        diffusion,
        timesteps = 1000,
        sampling_timesteps = None,
        objective = 'pred_v',
        beta_schedule = 'sigmoid',
        # NOTE(review): mutable default argument; harmless here because it
        # is only **-unpacked, never mutated, but `None` would be safer.
        schedule_fn_kwargs = dict(),
        ddim_sampling_eta = 0.,
        offset_noise_strength = 0., # https://www.crosslabs.org/blog/diffusion-with-offset-noise
        min_snr_loss_weight = False, # https://arxiv.org/abs/2303.09556
        min_snr_gamma = 5
    ):
        super(GaussianDiffusion, self).__init__()

        self.backbone = vp_model
        self.diff_unet = diffusion

        self.objective = objective
        assert objective in {'pred_noise', 'pred_x0', 'pred_v'}, 'objective must be either pred_noise (predict noise) or pred_x0 (predict image start) or pred_v (predict v [v-parameterization as defined in appendix D of progressive distillation paper, used in imagen-video successfully])'

        # Schedule functions are expected from `stldm.submodules` (star import).
        if beta_schedule == 'linear':
            beta_schedule_fn = linear_beta_schedule
        elif beta_schedule == 'cosine':
            beta_schedule_fn = cosine_beta_schedule
        elif beta_schedule == 'sigmoid':
            beta_schedule_fn = sigmoid_beta_schedule
        else:
            raise ValueError(f'unknown beta schedule {beta_schedule}')

        betas = beta_schedule_fn(timesteps, **schedule_fn_kwargs)

        alphas = 1. - betas
        alphas_cumprod = torch.cumprod(alphas, dim=0)
        alphas_cumprod_prev = F.pad(alphas_cumprod[:-1], (1, 0), value = 1.)

        timesteps, = betas.shape
        self.num_timesteps = int(timesteps)

        # sampling related parameters

        self.sampling_timesteps = default(sampling_timesteps, timesteps) # default num sampling timesteps to number of timesteps at training

        assert self.sampling_timesteps <= timesteps
        # Fewer sampling steps than training steps -> DDIM sampling.
        self.is_ddim_sampling = self.sampling_timesteps < timesteps
        self.ddim_sampling_eta = ddim_sampling_eta

        # helper function to register buffer from float64 to float32

        register_buffer = lambda name, val: self.register_buffer(name, val.to(torch.float32))

        register_buffer('betas', betas)
        register_buffer('alphas_cumprod', alphas_cumprod)
        register_buffer('alphas_cumprod_prev', alphas_cumprod_prev)

        # calculations for diffusion q(x_t | x_{t-1}) and others

        register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod))
        register_buffer('sqrt_one_minus_alphas_cumprod', torch.sqrt(1. - alphas_cumprod))
        register_buffer('log_one_minus_alphas_cumprod', torch.log(1. - alphas_cumprod))
        register_buffer('sqrt_recip_alphas_cumprod', torch.sqrt(1. / alphas_cumprod))
        register_buffer('sqrt_recipm1_alphas_cumprod', torch.sqrt(1. / alphas_cumprod - 1))

        # calculations for posterior q(x_{t-1} | x_t, x_0)

        posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)

        # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)

        register_buffer('posterior_variance', posterior_variance)

        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain

        register_buffer('posterior_log_variance_clipped', torch.log(posterior_variance.clamp(min =1e-20)))
        register_buffer('posterior_mean_coef1', betas * torch.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod))
        register_buffer('posterior_mean_coef2', (1. - alphas_cumprod_prev) * torch.sqrt(alphas) / (1. - alphas_cumprod))

        # offset noise strength - in blogpost, they claimed 0.1 was ideal

        self.offset_noise_strength = offset_noise_strength

        # derive loss weight
        # snr - signal noise ratio

        snr = alphas_cumprod / (1 - alphas_cumprod)

        # https://arxiv.org/abs/2303.09556

        maybe_clipped_snr = snr.clone()
        if min_snr_loss_weight:
            maybe_clipped_snr.clamp_(max = min_snr_gamma)

        if objective == 'pred_noise':
            register_buffer('loss_weight', maybe_clipped_snr / snr)
        elif objective == 'pred_x0':
            register_buffer('loss_weight', maybe_clipped_snr)
        elif objective == 'pred_v':
            register_buffer('loss_weight', maybe_clipped_snr / (snr + 1))

    @property
    def device(self):
        # Device of the schedule buffers (and hence the module).
        return self.betas.device

    # CFG scheduler => installs a pre-computed guidance-strength schedule.
    # NOTE(review): model_predictions reads self.CFG_sch unconditionally, so
    # setup_guidance (even with None) must be called before sampling.
    def setup_guidance(self, scheduler):
        if exists(scheduler):
            self.CFG_sch = scheduler.to(self.device)
        else:
            self.CFG_sch = scheduler

    def predict_start_from_noise(self, x_t, t, noise):
        """x_0 from (x_t, predicted noise)."""
        return (
            extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
            extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
        )

    def predict_noise_from_start(self, x_t, t, x0):
        """Noise from (x_t, x_0) — inverse of predict_start_from_noise."""
        return (
            (extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - x0) / \
            extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
        )

    def predict_v(self, x_start, t, noise):
        """v-parameterization target (progressive distillation, appendix D)."""
        return (
            extract(self.sqrt_alphas_cumprod, t, x_start.shape) * noise -
            extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * x_start
        )

    def predict_start_from_v(self, x_t, t, v):
        """x_0 from (x_t, predicted v)."""
        return (
            extract(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t -
            extract(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * v
        )

    def q_posterior(self, x_start, x_t, t):
        """Mean/variance/log-variance of q(x_{t-1} | x_t, x_0)."""
        posterior_mean = (
            extract(self.posterior_mean_coef1, t, x_t.shape) * x_start +
            extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_variance = extract(self.posterior_variance, t, x_t.shape)
        posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, x_t.shape)
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def model_predictions(self, x, t, cond, clip_x_start = False, rederive_pred_noise = False):
        """Run the denoiser (with optional classifier-free guidance) and
        return ModelPrediction(pred_noise, pred_x_start) for any objective.
        """
        if exists(self.CFG_sch):
            # Classifier-free guidance: mix conditional and unconditional
            # outputs with the per-timestep strength from CFG_sch.
            uncond = self.diff_unet(x, t, conds=None) #conds=torch.zeros_like(cond))
            model_output = self.diff_unet(x, t, conds=cond)
            time = int(t[0]) # assumes all batch elements share one timestep
            model_output = model_output - self.CFG_sch[time] * (uncond - model_output)
        else:
            model_output = self.diff_unet(x, t, conds=cond)
        # NOTE(review): `partial` is expected from the submodules star import.
        maybe_clip = partial(torch.clamp, min = -1., max = 1.) if clip_x_start else identity

        if self.objective == 'pred_noise':
            pred_noise = model_output
            x_start = self.predict_start_from_noise(x, t, pred_noise)
            x_start = maybe_clip(x_start)

            if clip_x_start and rederive_pred_noise:
                pred_noise = self.predict_noise_from_start(x, t, x_start)

        elif self.objective == 'pred_x0':
            x_start = model_output
            x_start = maybe_clip(x_start)
            pred_noise = self.predict_noise_from_start(x, t, x_start)

        elif self.objective == 'pred_v':
            v = model_output
            x_start = self.predict_start_from_v(x, t, v)
            x_start = maybe_clip(x_start)
            pred_noise = self.predict_noise_from_start(x, t, x_start)

        return ModelPrediction(pred_noise, x_start)

    def p_mean_variance(self, x, t, cond=None, clip_denoised = True):
        """Posterior moments of p(x_{t-1} | x_t) given the model's x_0 estimate."""
        preds = self.model_predictions(x, t, cond=cond, clip_x_start=False,)
        x_start = preds.pred_x_start

        if clip_denoised:
            x_start.clamp_(-1., 1.)

        model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start = x_start, x_t = x, t = t)
        return model_mean, posterior_variance, posterior_log_variance, x_start

    @torch.no_grad()
    def p_sample(self, x, t: int, cond=None):
        """One ancestral (DDPM) sampling step at scalar timestep t."""
        b, *_, device = *x.shape, self.device
        batched_times = torch.full((b,), t, device = device, dtype = torch.long)
        model_mean, _, model_log_variance, x_start = self.p_mean_variance(x = x, t = batched_times, cond=cond, clip_denoised = False)
        noise = torch.randn_like(x) if t > 0 else 0. # no noise if t == 0
        pred_img = model_mean + (0.5 * model_log_variance).exp() * noise
        return pred_img, x_start

    @torch.no_grad()
    def p_sample_loop(self, shape, cond=None, return_all_timesteps = False):
        """Full DDPM sampling chain from pure noise down to t = 0."""
        batch, device = shape[0], self.device

        frames_pred = torch.randn(shape, device = device)
        imgs = [frames_pred]

        for t in tqdm(reversed(range(0, self.num_timesteps)), desc = 'sampling loop time step', total = self.num_timesteps, disable=True):
            frames_pred, _ = self.p_sample(frames_pred, t, cond=cond)
            imgs.append(frames_pred)

        ret = frames_pred if not return_all_timesteps else torch.stack(imgs, dim = 1)
        return ret

    @torch.no_grad()
    def ddim_sample(self, shape, cond=None, return_all_timesteps = False):
        """DDIM sampling over `sampling_timesteps` steps (eta controls stochasticity)."""
        batch, total_timesteps, sampling_timesteps, eta, objective = shape[0], self.num_timesteps, self.sampling_timesteps, self.ddim_sampling_eta, self.objective
        device = self.device
        times = torch.linspace(-1, total_timesteps - 1, steps = sampling_timesteps + 1) # [-1, 0, 1, 2, ..., T-1] when sampling_timesteps == total_timesteps
        times = list(reversed(times.int().tolist()))
        time_pairs = list(zip(times[:-1], times[1:])) # [(T-1, T-2), (T-2, T-3), ..., (1, 0), (0, -1)]

        frames_pred = torch.randn(shape, device = device)
        imgs = [frames_pred]

        for time, time_next in tqdm(time_pairs, desc = 'sampling loop time step', disable=True):
            time_cond = torch.full((batch,), time, device = device, dtype = torch.long)
            pred_noise, x_start, *_ = self.model_predictions(
                frames_pred,
                time_cond,
                cond = cond, #cond.copy(),
                clip_x_start = False,
                rederive_pred_noise = True
            )

            # Final step: the x_0 estimate is the sample.
            if time_next < 0:
                frames_pred = x_start
                imgs.append(frames_pred)
                continue

            alpha = self.alphas_cumprod[time]
            alpha_next = self.alphas_cumprod[time_next]

            # Standard DDIM update coefficients (Song et al.).
            sigma = eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
            c = (1 - alpha_next - sigma ** 2).sqrt()

            noise = torch.randn_like(frames_pred)

            frames_pred = x_start * alpha_next.sqrt() + \
                          c * pred_noise + \
                          sigma * noise

            imgs.append(frames_pred)

        ret = frames_pred if not return_all_timesteps else torch.stack(imgs, dim = 1)
        return ret

    @torch.no_grad()
    def sample(self, frames_in, return_all_timesteps = False):
        """Generate future frames conditioned on `frames_in` (B, T, C, H, W).

        Runs the backbone for conditioning, samples latents (DDPM or DDIM),
        then decodes them with the backbone's VAE. Returns
        (decoded frames, backbone output).
        """
        assert frames_in.ndim == 5
        B, T_in, C, H, W = frames_in.shape
        device = self.device

        backbone_output, conds, *_ = self.backbone(frames_in)
        sample_fn = self.p_sample_loop if not self.is_ddim_sampling else self.ddim_sample

        *_, c, h, w = conds.shape
        # Target latent shape mirrors the conditioning latents, batched by B.
        tgt_shape = conds.reshape(B, -1, c, h, w).shape
        ldm_pred = sample_fn(
            tgt_shape,
            cond=conds,
            return_all_timesteps = return_all_timesteps
        )

        ldm_pred = rearrange(ldm_pred, 'b t c h w -> (b t) c h w')
        frames_pred = self.backbone.vae.decode(ldm_pred)
        frames_pred = rearrange(frames_pred, '(b t) c h w -> b t c h w', b=B)
        return frames_pred, backbone_output

    def predict(self, frames_in, compute_loss=False, **kwargs):
        # Convenience alias: compute_loss/kwargs are currently ignored.
        pred, mu = self.sample(frames_in=frames_in)
        return pred, mu

    def compute_loss(self, frames_in, frames_gt, validate=False):
        """Training losses: returns (recon_loss, mu_loss, diff_loss, prior_loss)."""
        # NOTE(review): local `compute_loss` flag shadows the method name and
        # is never used below — confirm whether it was meant to gate anything.
        compute_loss = True and (not validate)
        B, T_in, C, H, W = frames_in.shape
        T_out = frames_gt.shape[1]
        device = frames_in.device

        """
        Diffusion Loss
        """
        backbone_output, conds = self.backbone(frames_in)
        hid_gt, _ = self.backbone.vae.encode(
            rearrange(frames_gt, 'b t c h w -> (b t) c h w')
        )
        hid_gt = rearrange(hid_gt, '(b t) c h w -> b t c h w', b=B)
        t = torch.randint(0, self.num_timesteps, (B,), device=self.device).long()
        # ~15% of batches are trained unconditionally (for CFG at sampling time).
        if random.random() > 0.85: # Unconditional
            conds = None
        diff_loss = self.p_losses(hid_gt.detach(), t, cond=conds)

        """
        Backbone Loss
        """
        mu_loss = self.backbone._losses_(frames_in, frames_gt)

        """
        VAE Loss
        """
        # Reconstruction over the concatenated input+target clip.
        ae_loss, kl_loss = self.backbone.vae._losses_(
            rearrange(torch.cat((frames_in, frames_gt), dim=1), 'b t c h w -> (b t) c h w'),
            rearrange(torch.cat((frames_in, frames_gt), dim=1), 'b t c h w -> (b t) c h w')
        )
        kl_weight = 1E-6
        recon_loss = ae_loss + kl_weight*kl_loss

        """
        Prior Loss at t=T [Noisy]
        """
        # Encourage the fully-noised latent distribution to match N(0, I).
        hid_gt, _ = self.backbone.vae.encode(
            rearrange(frames_gt, 'b t c h w -> (b t) c h w')
        )
        hid_gt = rearrange(hid_gt, '(b t) c h w -> b t c h w', b=B)
        T = torch.ones((B,), device=self.device).long() * (self.num_timesteps - 1)
        mu_noisy = extract(self.sqrt_alphas_cumprod, T, hid_gt.shape) * hid_gt
        sigma_noisy = extract(self.sqrt_one_minus_alphas_cumprod, T, hid_gt.shape)
        log_var_noisy = 2*torch.log(sigma_noisy)
        prior_loss = self.kl_from_standard_normal(mu_noisy, log_var_noisy)

        return recon_loss, mu_loss, diff_loss, prior_loss


    def kl_from_standard_normal(self, mean, log_var):
        """Mean KL divergence of N(mean, exp(log_var)) from N(0, I)."""
        kl = 0.5 * (log_var.exp() + mean.square() - 1.0 - log_var)
        return kl.mean()

    @autocast(enabled = False)
    def q_sample(self, x_start, t, noise = None):
        """Forward-diffuse x_start to timestep t (run in full precision)."""
        noise = default(noise, lambda: torch.randn_like(x_start))

        return (
            extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
            extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
        )

    def p_losses(self, x_start, t, noise=None, offset_noise_strength=None, cond=None):
        """SNR-weighted MSE between the denoiser output and its objective target."""
        b, T, c, h, w = x_start.shape

        noise = default(noise, lambda: torch.randn_like(x_start))

        # offset noise - https://www.crosslabs.org/blog/diffusion-with-offset-noise
        offset_noise_strength = default(offset_noise_strength, self.offset_noise_strength)

        if offset_noise_strength > 0.:
            offset_noise = torch.randn(x_start.shape[:2], device = self.device)
            noise += offset_noise_strength * rearrange(offset_noise, 'b c -> b c 1 1')

        # noise sample
        x = self.q_sample(x_start=x_start, t=t, noise=noise) # Use q_sample here for updating: https://github.com/lucidrains/denoising-diffusion-pytorch/blob/main/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py#L763

        model_out = self.diff_unet(x, t, conds=cond)

        if self.objective == 'pred_noise':
            target = noise
        elif self.objective == 'pred_x0':
            target = x_start
        elif self.objective == 'pred_v':
            v = self.predict_v(x_start, t, noise)
            target = v
        else:
            raise ValueError(f'unknown objective {self.objective}')

        loss = F.mse_loss(model_out, target, reduction = 'none') # (B, T, C, H, W)
        loss = reduce(loss, 'b ... -> b', 'mean')

        # Per-sample weighting by the precomputed SNR-based loss_weight[t].
        loss = loss * extract(self.loss_weight, t, loss.shape)
        return loss.mean()

    @torch.no_grad()
    def forward(self, input_x, include_mu=False, **kwargs):
        """Inference entry point: returns predicted frames (and optionally
        the backbone output when include_mu=True)."""
        pred, mu = self.predict(input_x, compute_loss=False)
        if include_mu:
            return pred, mu
        else:
            return pred
585
+
586
+ from stldm.modules import SimVPV2_Model, VAE
587
def model_setup(model_config, print_info=False, cfg_str=None):
    """Build the full GaussianDiffusion model from a config dict.

    model_config must provide 'vp_param' (backbone), 'stldm_param'
    (latent U-Net) and 'param' (diffusion settings). When cfg_str is
    given, a constant classifier-free-guidance schedule of that strength
    is installed; otherwise guidance is disabled.
    """
    if print_info:
        print('Setup the model with considering temporal attention be (BHW, T, C) ... ...')
        print('Train it from end to end')

    vpm = SimVPV2_Model(**model_config['vp_param'])
    ldm = LDM(**model_config['stldm_param'])
    model = GaussianDiffusion(vp_model=vpm, diffusion=ldm, **model_config['param'])

    if cfg_str is not None:
        scheduler = guidance_scheduler(sampling_step=model_config['param']['timesteps'], const=cfg_str)
    else:
        scheduler = None
    model.setup_guidance(scheduler)

    return model
602
+
603
def ae_setup(model_config):
    """Build only the VAE of the backbone described by model_config['vp_param']."""
    backbone = SimVPV2_Model(**model_config['vp_param'])
    return backbone.vae
608
+
609
def backbone_setup(model_config):
    """Build only the video-prediction backbone from model_config['vp_param']."""
    return SimVPV2_Model(**model_config['vp_param'])
stldm/stldm_hf.py ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, random
2
+ from torch import nn
3
+ from einops import rearrange
4
+
5
+ from stldm.submodules import *
6
+
7
class Down_Block(nn.Module):
    """Encoder stage of the diffusion U-Net: ResNet blocks with spatial and
    temporal self-attention, then a 2x downsample (or a plain channel
    conversion on the last stage).

    The conditioning tensor is concatenated channel-wise with the input, so
    ``in_ch`` must already account for the doubled channels (the ``LDM``
    constructor passes ``2*ch_in``).
    """
    def __init__(self, in_ch, hid_ch, out_ch, time_dim, is_last, patch_size=None, num_groups=8, heads=4, dim_head=32):
        super(Down_Block, self).__init__()
        self.block1 = ResnetBlock(dim=in_ch, dim_out=hid_ch, time_emb_dim=time_dim, groups=num_groups)
        # Full (quadratic) spatial attention when no patch size is given,
        # otherwise a cheaper linear attention over patches.
        self.attn_spatial = Residual(PreNorm(hid_ch, Quadratic_SpatialAttention(dim=hid_ch, heads=heads, dim_head=dim_head))) if patch_size is None else Residual(PreNorm(hid_ch, Linear_SpatialAttention(dim=hid_ch, patch_size=patch_size, heads=heads, dim_head=dim_head)))
        self.block2 = ResnetBlock(dim=hid_ch, dim_out=hid_ch, groups=num_groups)
        # self.attn_temporal = Residual(PreNorm(hid_ch, TemporalAttention_Pos(dim=hid_ch, heads=heads, dim_head=dim_head)))
        self.attn_temporal = Residual(PreNorm(hid_ch, TemporalAttention(dim=hid_ch, heads=heads, dim_head=dim_head)))
        self.last = Downsample2D(dim_in=hid_ch, dim_out=out_ch) if not is_last else ChannelConversion(hid_ch, out_ch)

    def forward(self, x, time_emb, cond=None, relative_pos=None):
        """Process a (B, T, C, H, W) feature volume.

        Returns the downsampled output plus the spatial- and temporal-attention
        activations, which the decoder later consumes as skip connections.
        ``relative_pos`` is accepted for API compatibility but unused here
        (the positional temporal attention variant is commented out).
        """
        assert x.ndim==5
        B, T, C, H, W = x.shape

        # Fold time into the batch dim so the 2D blocks see (B*T, C, H, W).
        x = x.reshape(B*T, C, H, W)
        if cond is None:
            cond = torch.zeros_like(x) # -> Unconditioning

        time_emb = time_emb.unsqueeze(1) # From (B C) to (B 1 C)
        time_emb = time_emb.repeat(1, T, 1)
        time_emb = time_emb.reshape(B*T, -1)

        out = torch.cat((x, cond), dim=1) # BT, 2C, H, W
        out = self.block1(out, time_emb)

        spatial_attn = self.attn_spatial(out)
        out = self.block2(spatial_attn, time_emb)
        *_, c, h, w = out.shape
        out = out.reshape(B,T,c,h,w)

        # Temporal attention operates across the T axis of the 5-D view.
        # temporal_attn = self.attn_temporal(out, relative_pos)
        temporal_attn = self.attn_temporal(out)
        temporal_attn = temporal_attn.reshape(B*T,c,h,w)

        out = self.last(temporal_attn)
        *_, c, h, w = out.shape

        return out.reshape(B, T, c, h, w), spatial_attn, temporal_attn
45
+
46
class MidBlock(nn.Module):
    """Bottleneck of the diffusion U-Net: three ResNet blocks interleaved
    with one spatial and one temporal self-attention, channel count
    unchanged throughout."""
    def __init__(self, in_ch, time_dim, num_groups=8, heads=4, dim_head=32):
        super(MidBlock, self).__init__()
        self.block1 = ResnetBlock(dim=in_ch, dim_out=in_ch, time_emb_dim=time_dim, groups=num_groups)
        self.qattn_spatial = Residual(PreNorm(in_ch, Quadratic_SpatialAttention(dim=in_ch, heads=heads, dim_head=dim_head)))
        self.block2 = ResnetBlock(dim=in_ch, dim_out=in_ch, time_emb_dim=time_dim, groups=num_groups)
        # self.qattn_time = Residual(PreNorm(in_ch, TemporalAttention_Pos(dim=in_ch, heads=heads, dim_head=dim_head)))
        self.qattn_time = Residual(PreNorm(in_ch, TemporalAttention(dim=in_ch, heads=heads, dim_head=dim_head)))
        self.block3 = ResnetBlock(dim=in_ch, dim_out=in_ch, time_emb_dim=time_dim, groups=num_groups)

    def forward(self, x, time_emb, relative_pos=None):
        """Transform a (B, T, C, H, W) volume at the bottleneck resolution.
        ``relative_pos`` is accepted for API compatibility but unused (the
        positional temporal attention call is commented out)."""
        assert x.ndim==5
        B, T, C, H, W = x.shape
        x = x.reshape(B*T, C, H, W)

        time_emb = time_emb.unsqueeze(1) # From (B C) to (B 1 C)
        time_emb = time_emb.repeat(1, T, 1)
        time_emb = time_emb.reshape(B*T, -1)

        out = self.block1(x, time_emb)
        out = self.qattn_spatial(out)
        out = self.block2(out, time_emb) # a little bit difference here

        # Restore the 5-D view (channels unchanged) for temporal attention.
        out = out.reshape((B, T, C, H, W))
        # out = self.qattn_time(out, relative_pos).reshape(B*T, C, H, W)
        out = self.qattn_time(out).reshape(B*T, C, H, W)
        out = self.block3(out, time_emb)

        *_, c, h, w = out.shape
        return out.reshape(B, T, c, h, w)
76
+
77
class Up_Block(nn.Module):
    """Decoder stage of the diffusion U-Net: upsample (or channel
    conversion on the outermost stage), temporal attention fused with the
    encoder's temporal-attention skip, then spatial attention fused with the
    encoder's spatial-attention skip."""
    def __init__(self, in_chs, hid_ch, out_ch, is_last, time_dim, patch_size=None, num_groups=8, heads=4, dim_head=32):
        super(Up_Block, self).__init__()
        # in_chs = (channels coming up the decoder, channels of the skips).
        in_ch, skip_ch = in_chs
        self.up = Upsample2D(dim_in=in_ch, dim_out=hid_ch) if not is_last else ChannelConversion(in_ch, hid_ch)
        self.attn_spatial = Residual(PreNorm(hid_ch, Quadratic_SpatialAttention(dim=hid_ch, heads=heads, dim_head=dim_head) if patch_size is None else Linear_SpatialAttention(dim=hid_ch, patch_size=patch_size, heads=heads, dim_head=dim_head)))
        self.block1 = ResnetBlock(dim=hid_ch+skip_ch, dim_out=hid_ch, time_emb_dim=time_dim, groups=num_groups)
        # self.attn_temporal = Residual(PreNorm(hid_ch, TemporalAttention_Pos(dim=hid_ch, heads=heads, dim_head=dim_head)))
        self.attn_temporal = Residual(PreNorm(hid_ch, TemporalAttention(dim=hid_ch, heads=heads, dim_head=dim_head)))
        self.block2 = ResnetBlock(dim=hid_ch+skip_ch, dim_out=out_ch, time_emb_dim=time_dim, groups=num_groups)

    def forward(self, x, time_emb, spatialattn_skip, tempattn_skip, relative_pos=None):
        """Decode a (B, T, C, H, W) volume, consuming the two skip tensors
        produced by the matching ``Down_Block``. ``relative_pos`` is accepted
        for API compatibility but unused."""
        assert x.ndim==5
        B, T, C, H, W = x.shape
        x = x.reshape(B*T, C, H, W)

        time_emb = time_emb.unsqueeze(1) # From (B C) to (B 1 C)
        time_emb = time_emb.repeat(1, T, 1)
        time_emb = time_emb.reshape(B*T, -1)

        out = self.up(x)
        *_, c, h, w = out.shape
        out = out.reshape(-1, T, c, h, w)

        # out = self.attn_temporal(out, relative_pos).reshape(B*T, c, h, w)
        out = self.attn_temporal(out).reshape(B*T, c, h, w)

        # Fuse with the encoder's temporal-attention activation first ...
        out = torch.cat((out, tempattn_skip), dim=1)
        out = self.block1(out, time_emb)

        out = self.attn_spatial(out)

        # ... then with its spatial-attention activation.
        out = torch.cat((out, spatialattn_skip), dim=1)
        out = self.block2(out, time_emb)
        *_, c, h, w = out.shape
        return out.reshape(B, T, c, h, w)
113
+
114
class LDM(nn.Module):
    """Spatio-temporal U-Net used as the denoiser of the latent diffusion
    model: a stack of ``Down_Block``s, a ``MidBlock`` bottleneck, and
    mirrored ``Up_Block``s with attention skip connections. The conditioning
    tensor is progressively downsampled in lock-step with the encoder."""
    def __init__(self, in_ch, chs_mult:tuple, patch_size=None, num_groups=8, heads=4, dim_head=32, base_ch=64):
        super(LDM, self).__init__()
        # Time Embedding MLP
        time_dim = 4*base_ch
        fourier_dim = base_ch
        self.time_mlp = Time_MLP(dim=base_ch, time_dim=time_dim, fourier_dim=fourier_dim)

        ups, downs = [], []
        conditions = []

        # Channel plan: [in_ch, base*m1, base*m2, ...]; decoder mirrors it.
        layer_no = len(chs_mult)
        chs = [in_ch, *map(lambda m: base_ch*m, chs_mult)]
        ch_in, ch_out = chs[:-1], chs[1:]
        up_in, up_out = list(reversed(ch_out)), list(reversed(ch_in))

        patches = None if patch_size is None else [patch_size//(2**n) for n in range(layer_no)] # Patch Size should be 2^N
        for n in range(layer_no):
            # Down blocks take 2*ch_in because input and conditioning are
            # concatenated channel-wise inside Down_Block.forward.
            downs.append(
                Down_Block(in_ch=2*ch_in[n], hid_ch=ch_in[n], out_ch=ch_out[n], time_dim=time_dim, patch_size=None if patch_size is None else patches[n], is_last=(n==layer_no-1), num_groups=num_groups, heads=heads, dim_head=dim_head)
            )
            ups.append(
                Up_Block(in_chs=(up_in[n], ch_in[-n-1]), hid_ch=up_in[n], out_ch=up_out[n], time_dim=time_dim, patch_size=None if patch_size is None else patches[layer_no-n-1], is_last=(n==0), num_groups=num_groups, heads=heads, dim_head=dim_head)
            )
            # NOTE(review): `n != -1` is always true, so a conditioning
            # downsampler is created for every stage — including the last,
            # whose output is never consumed (the encoder loop ends there).
            if n != -1:
                conditions.append(
                    Downsample2D(dim_in=ch_in[n], dim_out=ch_out[n])
                )

        self.downs = nn.ModuleList(downs)
        self.ups = nn.ModuleList(ups)
        self.conditions = nn.ModuleList(conditions)
        self.mid = MidBlock(in_ch=ch_out[-1], time_dim=time_dim, num_groups=num_groups, heads=heads, dim_head=dim_head)
        # self.relative_pos = RelativePositionBias(heads=heads)

    def forward(self, x, time, conds=None):
        """Denoise latent volume *x* at diffusion step *time*, optionally
        conditioned on *conds* (same spatial layout as *x*; ``None`` means
        unconditional — the down blocks substitute zeros)."""
        t = self.time_mlp(time)

        # Skip stacks, popped in reverse order by the decoder.
        hid_spatial = []
        hid_temporal = []

        # relative_position = self.relative_pos(x.shape[1], x.device) # Calculate The Relative Position

        for n, down_block in enumerate(self.downs):
            # print(x.shape)
            # x, spatial_attn, time_attn = down_block(x, t, conds, relative_position)
            x, spatial_attn, time_attn = down_block(x, t, conds)
            hid_spatial.append(spatial_attn)
            hid_temporal.append(time_attn)
            if conds is not None:
                # Downsample the conditioning to match the next stage.
                conds = self.conditions[n](conds)

        # out = self.mid(x, t, relative_position)
        out = self.mid(x, t)

        for up_block in self.ups:
            # out = up_block(out, t, hid_spatial.pop(), hid_temporal.pop(), relative_position)
            out = up_block(out, t, hid_spatial.pop(), hid_temporal.pop())

        return out
174
+
175
+ # constants
176
+ from collections import namedtuple
177
+ from torch.cuda.amp import autocast
178
+ import torch.nn.functional as F
179
+ from einops import reduce
180
+ from tqdm.auto import tqdm
181
+
182
+ ModelPrediction = namedtuple('ModelPrediction', ['pred_noise', 'pred_x_start'])
183
+
184
def identity(t, *args, **kwargs):
    """No-op transform: return *t* unchanged, ignoring any extra arguments
    (used as the fallback when clipping is disabled)."""
    return t
186
+
187
def extract(a, t, x_shape):
    """Gather per-sample coefficients from schedule vector *a* at timestep
    indices *t*, reshaped to (B, 1, 1, ...) so they broadcast against a
    tensor of shape *x_shape*."""
    batch = t.shape[0]
    coeffs = a.gather(-1, t)
    trailing = (1,) * (len(x_shape) - 1)
    return coeffs.reshape(batch, *trailing)
191
+
192
def default(val, d):
    """Return *val* when it is not ``None``; otherwise fall back to *d*,
    calling it first if it is callable (lazy default factory)."""
    if not exists(val):
        return d() if callable(d) else d
    return val

def exists(x):
    """True when *x* is not ``None`` (falsy values like 0/"" still exist)."""
    return x is not None
199
+
200
def guidance_scheduler(sampling_step: int, const: float):
    """Constant classifier-free-guidance schedule: a float32 vector of
    length ``sampling_step`` filled with ``const``."""
    return torch.full((sampling_step,), const, dtype=torch.float32)
202
+
203
+ from huggingface_hub import PyTorchModelHubMixin
204
+
205
class GaussianDiffusion(
    nn.Module,
    PyTorchModelHubMixin,
    # optionally, you can add metadata which gets pushed to the model card
    repo_url="https://github.com/sqfoo/stldm_official",
    pipeline_tag="Precipitation_Nowcasting",
    license="mit"):
    """Latent spatio-temporal diffusion model for precipitation nowcasting.

    Combines a deterministic SimVPV2 backbone (which provides the latent
    conditioning and owns the VAE used to move between pixel and latent
    space) with a diffusion U-Net (``LDM``) that denoises latent future
    frames. The DDPM/DDIM machinery follows lucidrains'
    denoising-diffusion-pytorch. Inherits ``PyTorchModelHubMixin`` so
    checkpoints can be pushed to / loaded from the Hugging Face Hub.
    """
    def __init__(
        self,
        vp_param: dict,
        stldm_param: dict,
        timesteps = 1000,
        sampling_timesteps = None,
        objective = 'pred_v',
        beta_schedule = 'sigmoid',
        schedule_fn_kwargs = dict(),  # NOTE(review): mutable default — safe only because it is never mutated here
        ddim_sampling_eta = 0.,
        offset_noise_strength = 0., # https://www.crosslabs.org/blog/diffusion-with-offset-noise
        min_snr_loss_weight = False, # https://arxiv.org/abs/2303.09556
        min_snr_gamma = 5
    ):
        super(GaussianDiffusion, self).__init__()

        # NOTE(review): SimVPV2_Model is imported near the bottom of this
        # module (or possibly via the `stldm.submodules` star import); the
        # global lookup happens at call time, so construction still resolves.
        self.backbone = SimVPV2_Model(**vp_param)
        self.diff_unet = LDM(**stldm_param)

        self.objective = objective
        assert objective in {'pred_noise', 'pred_x0', 'pred_v'}, 'objective must be either pred_noise (predict noise) or pred_x0 (predict image start) or pred_v (predict v [v-parameterization as defined in appendix D of progressive distillation paper, used in imagen-video successfully])'

        # Noise-variance (beta) schedule; the schedule functions are expected
        # to come from the `stldm.submodules` star import — TODO confirm.
        if beta_schedule == 'linear':
            beta_schedule_fn = linear_beta_schedule
        elif beta_schedule == 'cosine':
            beta_schedule_fn = cosine_beta_schedule
        elif beta_schedule == 'sigmoid':
            beta_schedule_fn = sigmoid_beta_schedule
        else:
            raise ValueError(f'unknown beta schedule {beta_schedule}')

        betas = beta_schedule_fn(timesteps, **schedule_fn_kwargs)

        alphas = 1. - betas
        alphas_cumprod = torch.cumprod(alphas, dim=0)
        alphas_cumprod_prev = F.pad(alphas_cumprod[:-1], (1, 0), value = 1.)

        timesteps, = betas.shape
        self.num_timesteps = int(timesteps)

        # sampling related parameters

        self.sampling_timesteps = default(sampling_timesteps, timesteps) # default num sampling timesteps to number of timesteps at training

        assert self.sampling_timesteps <= timesteps
        # Fewer sampling steps than training steps -> use DDIM sampling.
        self.is_ddim_sampling = self.sampling_timesteps < timesteps
        self.ddim_sampling_eta = ddim_sampling_eta

        # helper function to register buffer from float64 to float32

        register_buffer = lambda name, val: self.register_buffer(name, val.to(torch.float32))

        register_buffer('betas', betas)
        register_buffer('alphas_cumprod', alphas_cumprod)
        register_buffer('alphas_cumprod_prev', alphas_cumprod_prev)

        # calculations for diffusion q(x_t | x_{t-1}) and others

        register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod))
        register_buffer('sqrt_one_minus_alphas_cumprod', torch.sqrt(1. - alphas_cumprod))
        register_buffer('log_one_minus_alphas_cumprod', torch.log(1. - alphas_cumprod))
        register_buffer('sqrt_recip_alphas_cumprod', torch.sqrt(1. / alphas_cumprod))
        register_buffer('sqrt_recipm1_alphas_cumprod', torch.sqrt(1. / alphas_cumprod - 1))

        # calculations for posterior q(x_{t-1} | x_t, x_0)

        posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)

        # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)

        register_buffer('posterior_variance', posterior_variance)

        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain

        register_buffer('posterior_log_variance_clipped', torch.log(posterior_variance.clamp(min =1e-20)))
        register_buffer('posterior_mean_coef1', betas * torch.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod))
        register_buffer('posterior_mean_coef2', (1. - alphas_cumprod_prev) * torch.sqrt(alphas) / (1. - alphas_cumprod))

        # offset noise strength - in blogpost, they claimed 0.1 was ideal

        self.offset_noise_strength = offset_noise_strength

        # derive loss weight
        # snr - signal noise ratio

        snr = alphas_cumprod / (1 - alphas_cumprod)

        # https://arxiv.org/abs/2303.09556

        maybe_clipped_snr = snr.clone()
        if min_snr_loss_weight:
            maybe_clipped_snr.clamp_(max = min_snr_gamma)

        # Per-timestep loss weights derived from SNR, one form per objective.
        if objective == 'pred_noise':
            register_buffer('loss_weight', maybe_clipped_snr / snr)
        elif objective == 'pred_x0':
            register_buffer('loss_weight', maybe_clipped_snr)
        elif objective == 'pred_v':
            register_buffer('loss_weight', maybe_clipped_snr / (snr + 1))

    @property
    def device(self):
        """Device of the registered diffusion buffers (and thus the model)."""
        return self.betas.device

    # CFG schdeuler => by taking pre-setting scheduler
    def setup_guidance(self, scheduler):
        """Attach a per-timestep classifier-free-guidance strength tensor,
        or disable guidance entirely by passing ``None``."""
        if exists(scheduler):
            self.CFG_sch = scheduler.to(self.device)
        else:
            self.CFG_sch = scheduler

    def predict_start_from_noise(self, x_t, t, noise):
        """Recover x_0 from a noisy latent: x_0 = sqrt(1/acp_t)*x_t - sqrt(1/acp_t - 1)*eps."""
        return (
            extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
            extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
        )

    def predict_noise_from_start(self, x_t, t, x0):
        """Inverse of ``predict_start_from_noise``: recover eps from x_t and x_0."""
        return (
            (extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - x0) / \
            extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
        )

    def predict_v(self, x_start, t, noise):
        """v-parameterization target: v = sqrt(acp_t)*eps - sqrt(1-acp_t)*x_0."""
        return (
            extract(self.sqrt_alphas_cumprod, t, x_start.shape) * noise -
            extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * x_start
        )

    def predict_start_from_v(self, x_t, t, v):
        """Recover x_0 from a v-prediction: x_0 = sqrt(acp_t)*x_t - sqrt(1-acp_t)*v."""
        return (
            extract(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t -
            extract(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * v
        )

    def q_posterior(self, x_start, x_t, t):
        """Mean/variance of the forward-process posterior q(x_{t-1} | x_t, x_0)."""
        posterior_mean = (
            extract(self.posterior_mean_coef1, t, x_t.shape) * x_start +
            extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_variance = extract(self.posterior_variance, t, x_t.shape)
        posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, x_t.shape)
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def model_predictions(self, x, t, cond, clip_x_start = False, rederive_pred_noise = False):
        """Run the U-Net once (twice under CFG) and return both the noise and
        x_0 views of its output as a ``ModelPrediction``.

        The CFG combination below is algebraically (1+w)*cond - w*uncond with
        w = CFG_sch[t]. Assumes all entries of *t* share one timestep (only
        t[0] indexes the schedule) — TODO confirm callers guarantee this.
        """
        # print(t.device)
        if exists(self.CFG_sch):
            uncond = self.diff_unet(x, t, conds=None) #conds=torch.zeros_like(cond))
            model_output = self.diff_unet(x, t, conds=cond)
            time = int(t[0])
            model_output = model_output - self.CFG_sch[time] * (uncond - model_output)
        else:
            model_output = self.diff_unet(x, t, conds=cond)
        # NOTE(review): `partial` is not imported in this file — presumably
        # re-exported by the `stldm.submodules` star import; confirm.
        maybe_clip = partial(torch.clamp, min = -1., max = 1.) if clip_x_start else identity

        if self.objective == 'pred_noise':
            pred_noise = model_output
            x_start = self.predict_start_from_noise(x, t, pred_noise)
            x_start = maybe_clip(x_start)

            if clip_x_start and rederive_pred_noise:
                pred_noise = self.predict_noise_from_start(x, t, x_start)

        elif self.objective == 'pred_x0':
            x_start = model_output
            x_start = maybe_clip(x_start)
            pred_noise = self.predict_noise_from_start(x, t, x_start)

        elif self.objective == 'pred_v':
            v = model_output
            x_start = self.predict_start_from_v(x, t, v)
            x_start = maybe_clip(x_start)
            pred_noise = self.predict_noise_from_start(x, t, x_start)

        return ModelPrediction(pred_noise, x_start)

    def p_mean_variance(self, x, t, cond=None, clip_denoised = True):
        """Posterior mean/variance for one ancestral (DDPM) sampling step."""
        preds = self.model_predictions(x, t, cond=cond, clip_x_start=False,)
        x_start = preds.pred_x_start

        if clip_denoised:
            x_start.clamp_(-1., 1.)

        model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start = x_start, x_t = x, t = t)
        return model_mean, posterior_variance, posterior_log_variance, x_start

    @torch.no_grad()
    def p_sample(self, x, t: int, cond=None):
        """One ancestral DDPM step from timestep *t* to *t-1*."""
        b, *_, device = *x.shape, self.device
        batched_times = torch.full((b,), t, device = device, dtype = torch.long)
        model_mean, _, model_log_variance, x_start = self.p_mean_variance(x = x, t = batched_times, cond=cond, clip_denoised = False)
        noise = torch.randn_like(x) if t > 0 else 0. # no noise if t == 0
        pred_img = model_mean + (0.5 * model_log_variance).exp() * noise
        return pred_img, x_start

    @torch.no_grad()
    def p_sample_loop(self, shape, cond=None, return_all_timesteps = False):
        """Full ancestral DDPM sampling from pure noise down to t=0."""
        batch, device = shape[0], self.device

        frames_pred = torch.randn(shape, device = device)
        imgs = [frames_pred]

        for t in tqdm(reversed(range(0, self.num_timesteps)), desc = 'sampling loop time step', total = self.num_timesteps, disable=True):
            frames_pred, _ = self.p_sample(frames_pred, t, cond=cond)
            imgs.append(frames_pred)

        ret = frames_pred if not return_all_timesteps else torch.stack(imgs, dim = 1)
        return ret

    @torch.no_grad()
    def ddim_sample(self, shape, cond=None, return_all_timesteps = False):
        """Deterministic-ish DDIM sampling over ``self.sampling_timesteps``
        steps (stochasticity controlled by ``ddim_sampling_eta``)."""
        batch, total_timesteps, sampling_timesteps, eta, objective = shape[0], self.num_timesteps, self.sampling_timesteps, self.ddim_sampling_eta, self.objective
        device = self.device
        times = torch.linspace(-1, total_timesteps - 1, steps = sampling_timesteps + 1) # [-1, 0, 1, 2, ..., T-1] when sampling_timesteps == total_timesteps
        times = list(reversed(times.int().tolist()))
        time_pairs = list(zip(times[:-1], times[1:])) # [(T-1, T-2), (T-2, T-3), ..., (1, 0), (0, -1)]

        frames_pred = torch.randn(shape, device = device)
        imgs = [frames_pred]

        for time, time_next in tqdm(time_pairs, desc = 'sampling loop time step', disable=True):
            time_cond = torch.full((batch,), time, device = device, dtype = torch.long)
            pred_noise, x_start, *_ = self.model_predictions(
                frames_pred,
                time_cond,
                cond = cond, #cond.copy(),
                clip_x_start = False,
                rederive_pred_noise = True
            )

            # Final pair reaches the virtual timestep -1: emit x_0 directly.
            if time_next < 0:
                frames_pred = x_start
                imgs.append(frames_pred)
                continue

            alpha = self.alphas_cumprod[time]
            alpha_next = self.alphas_cumprod[time_next]

            sigma = eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
            c = (1 - alpha_next - sigma ** 2).sqrt()

            noise = torch.randn_like(frames_pred)

            frames_pred = x_start * alpha_next.sqrt() + \
                          c * pred_noise + \
                          sigma * noise

            imgs.append(frames_pred)

        ret = frames_pred if not return_all_timesteps else torch.stack(imgs, dim = 1)
        return ret

    @torch.no_grad()
    def sample(self, frames_in, return_all_timesteps = False):
        """Sample future frames given past frames (B, T_in, C, H, W): run the
        backbone for conditioning, denoise in latent space, then decode with
        the VAE. Returns (diffusion prediction, backbone output).

        NOTE(review): with ``return_all_timesteps=True`` the latent gains an
        extra axis that the 'b t c h w' rearrange below cannot absorb —
        confirm that flag is only used for latent inspection.
        """
        assert frames_in.ndim == 5
        B, T_in, C, H, W = frames_in.shape
        device = self.device

        # `*_` tolerates backbones returning extra values beyond (output, conds).
        backbone_output, conds, *_ = self.backbone(frames_in)
        sample_fn = self.p_sample_loop if not self.is_ddim_sampling else self.ddim_sample

        *_, c, h, w = conds.shape
        tgt_shape = conds.reshape(B, -1, c, h, w).shape
        ldm_pred = sample_fn(
            tgt_shape,
            cond=conds,
            return_all_timesteps = return_all_timesteps
        )

        ldm_pred = rearrange(ldm_pred, 'b t c h w -> (b t) c h w')
        frames_pred = self.backbone.vae.decode(ldm_pred)
        frames_pred = rearrange(frames_pred, '(b t) c h w -> b t c h w', b=B)
        return frames_pred, backbone_output

    def predict(self, frames_in, compute_loss=False, **kwargs):
        """Thin wrapper around :meth:`sample`; ``compute_loss`` and extra
        kwargs are accepted for interface compatibility but ignored."""
        pred, mu = self.sample(frames_in=frames_in)
        return pred, mu

    def compute_loss(self, frames_in, frames_gt, validate=False):
        """Training objective: returns (VAE reconstruction loss, backbone
        loss, diffusion loss, prior loss at t=T) as separate terms.

        NOTE(review): the local `compute_loss` flag below is never used, and
        the bare triple-quoted strings are no-op statements serving as
        section headers.
        """
        compute_loss = True and (not validate)
        B, T_in, C, H, W = frames_in.shape
        T_out = frames_gt.shape[1]
        device = frames_in.device

        """
        Diffusion Loss
        """
        backbone_output, conds = self.backbone(frames_in)
        # Encode ground-truth frames into the latent space the U-Net denoises.
        hid_gt, _ = self.backbone.vae.encode(
            rearrange(frames_gt, 'b t c h w -> (b t) c h w')
        )
        hid_gt = rearrange(hid_gt, '(b t) c h w -> b t c h w', b=B)
        t = torch.randint(0, self.num_timesteps, (B,), device=self.device).long()
        # 15% conditioning dropout for classifier-free guidance training.
        if random.random() > 0.85: # Unconditional
            conds = None
        diff_loss = self.p_losses(hid_gt.detach(), t, cond=conds)

        """
        Backbone Loss
        """
        mu_loss = self.backbone._losses_(frames_in, frames_gt)

        """
        VAE Loss
        """
        # Autoencoding loss over the concatenated input+target sequence.
        ae_loss, kl_loss = self.backbone.vae._losses_(
            rearrange(torch.cat((frames_in, frames_gt), dim=1), 'b t c h w -> (b t) c h w'),
            rearrange(torch.cat((frames_in, frames_gt), dim=1), 'b t c h w -> (b t) c h w')
        )
        kl_weight = 1E-6
        recon_loss = ae_loss + kl_weight*kl_loss

        """
        Prior Loss at t=T [Noisy]
        """
        # Penalize divergence of the fully-noised latent from N(0, I).
        hid_gt, _ = self.backbone.vae.encode(
            rearrange(frames_gt, 'b t c h w -> (b t) c h w')
        )
        hid_gt = rearrange(hid_gt, '(b t) c h w -> b t c h w', b=B)
        T = torch.ones((B,), device=self.device).long() * (self.num_timesteps - 1)
        mu_noisy = extract(self.sqrt_alphas_cumprod, T, hid_gt.shape) * hid_gt
        sigma_noisy = extract(self.sqrt_one_minus_alphas_cumprod, T, hid_gt.shape)
        log_var_noisy = 2*torch.log(sigma_noisy)
        prior_loss = self.kl_from_standard_normal(mu_noisy, log_var_noisy)

        return recon_loss, mu_loss, diff_loss, prior_loss


    def kl_from_standard_normal(self, mean, log_var):
        """Mean element-wise KL divergence of N(mean, exp(log_var)) from N(0, 1)."""
        kl = 0.5 * (log_var.exp() + mean.square() - 1.0 - log_var)
        return kl.mean()

    @autocast(enabled = False)
    def q_sample(self, x_start, t, noise = None):
        """Forward diffusion: sample x_t ~ q(x_t | x_0) in full precision."""
        noise = default(noise, lambda: torch.randn_like(x_start))

        return (
            extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
            extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
        )

    def p_losses(self, x_start, t, noise=None, offset_noise_strength=None, cond=None):
        """SNR-weighted MSE between the U-Net output and the target implied
        by ``self.objective`` (noise / x_0 / v), averaged over the batch."""
        b, T, c, h, w = x_start.shape

        noise = default(noise, lambda: torch.randn_like(x_start))

        # offset noise - https://www.crosslabs.org/blog/diffusion-with-offset-noise
        offset_noise_strength = default(offset_noise_strength, self.offset_noise_strength)

        # NOTE(review): this branch was written for 4-D images; with 5-D video
        # latents the (b, T, 1, 1) offset does not broadcast against
        # (b, T, c, h, w) — confirm before enabling a nonzero strength.
        if offset_noise_strength > 0.:
            offset_noise = torch.randn(x_start.shape[:2], device = self.device)
            noise += offset_noise_strength * rearrange(offset_noise, 'b c -> b c 1 1')

        # noise sample
        x = self.q_sample(x_start=x_start, t=t, noise=noise) # Use q_sample here for updating: https://github.com/lucidrains/denoising-diffusion-pytorch/blob/main/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py#L763

        model_out = self.diff_unet(x, t, conds=cond)

        if self.objective == 'pred_noise':
            target = noise
        elif self.objective == 'pred_x0':
            target = x_start
        elif self.objective == 'pred_v':
            v = self.predict_v(x_start, t, noise)
            target = v
        else:
            raise ValueError(f'unknown objective {self.objective}')

        loss = F.mse_loss(model_out, target, reduction = 'none') # (B, T, C, H, W)
        loss = reduce(loss, 'b ... -> b', 'mean')

        loss = loss * extract(self.loss_weight, t, loss.shape)
        return loss.mean()

    @torch.no_grad()
    def forward(self, input_x, include_mu=False, **kwargs):
        """Inference entry point: sample a prediction; optionally also return
        the backbone's deterministic (mean) output."""
        pred, mu = self.predict(input_x, compute_loss=False)
        if include_mu:
            return pred, mu
        else:
            return pred
593
+
594
from stldm.modules import SimVPV2_Model, VAE
def model_setup(model_config, print_info=False, cfg_str=None):
    """Build a ``GaussianDiffusion`` model from *model_config*, optionally
    attaching a constant classifier-free-guidance schedule.

    Bug fix: the previous version instantiated the backbone and U-Net itself
    and passed them as ``vp_model=`` / ``diffusion=``, but this module's
    ``GaussianDiffusion.__init__`` takes ``vp_param`` / ``stldm_param``
    config dicts and constructs both modules internally — the old call
    raised ``TypeError`` on unexpected keyword arguments.

    Args:
        model_config: dict with keys ``'vp_param'`` (backbone config),
            ``'stldm_param'`` (U-Net config) and ``'param'`` (diffusion
            hyper-parameters, including ``'timesteps'``).
        print_info: when true, print a short description of the setup.
        cfg_str: optional constant CFG strength; ``None`` disables guidance.

    Returns:
        The configured ``GaussianDiffusion`` instance.
    """
    if print_info:
        print('Setup the model with considering temporal attention be (BHW, T, C) ... ...')
        print('Train it from end to end')

    model = GaussianDiffusion(
        vp_param=model_config['vp_param'],
        stldm_param=model_config['stldm_param'],
        **model_config['param']
    )

    # Constant CFG schedule over all timesteps when a strength is provided.
    scheduler = guidance_scheduler(sampling_step=model_config['param']['timesteps'], const=cfg_str) if cfg_str is not None else None
    model.setup_guidance(scheduler)

    return model
610
+
611
def ae_setup(model_config):
    """Build only the VAE component of the SimVP backbone described by
    *model_config* (for training/evaluating the autoencoder on its own)."""
    backbone = SimVPV2_Model(**model_config['vp_param'])
    return backbone.vae
616
+
617
def backbone_setup(model_config):
    """Instantiate the SimVPV2 backbone described by *model_config* alone."""
    return SimVPV2_Model(**model_config['vp_param'])
stldm/stldm_spatial.py ADDED
@@ -0,0 +1,593 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, random
2
+ from torch import nn
3
+ from einops import rearrange
4
+
5
+ from stldm.submodules import *
6
+
7
class Down_Block(nn.Module):
    """Spatial-only encoder stage of the diffusion U-Net variant: two ResNet
    blocks with one spatial attention (the second attention slot is an
    identity), followed by a 2x downsample (or channel conversion on the
    last stage). Conditioning is concatenated channel-wise, so ``in_ch``
    must already be doubled by the caller."""
    def __init__(self, in_ch, hid_ch, out_ch, time_dim, is_last, patch_size=None, num_groups=8, heads=4, dim_head=32):
        super(Down_Block, self).__init__()
        self.block1 = ResnetBlock(dim=in_ch, dim_out=hid_ch, time_emb_dim=time_dim, groups=num_groups)
        # Full (quadratic) attention without a patch size, linear otherwise.
        self.attn1 = Residual(PreNorm(hid_ch, Quadratic_SpatialAttention(dim=hid_ch, heads=heads, dim_head=dim_head))) if patch_size is None else Residual(PreNorm(hid_ch, Linear_SpatialAttention(dim=hid_ch, patch_size=patch_size, heads=heads, dim_head=dim_head)))
        self.block2 = ResnetBlock(dim=hid_ch, dim_out=hid_ch, groups=num_groups)
        # Second attention disabled (pass-through) in this ablation variant.
        self.attn2 = nn.Identity()
        # self.attn2 = Residual(PreNorm(hid_ch, Quadratic_SpatialAttention(dim=hid_ch, heads=heads, dim_head=dim_head))) if patch_size is None else Residual(PreNorm(hid_ch, Linear_SpatialAttention(dim=hid_ch, patch_size=patch_size, heads=heads, dim_head=dim_head)))
        self.last = Downsample2D(dim_in=hid_ch, dim_out=out_ch) if not is_last else ChannelConversion(hid_ch, out_ch)

    def forward(self, x, time_emb, cond=None):
        """Process a (B, T, C, H, W) volume; returns the downsampled output
        and two intermediate activations used as decoder skip connections."""
        assert x.ndim==5
        B, T, C, H, W = x.shape

        # Fold time into the batch dim so the 2D blocks see (B*T, C, H, W).
        x = x.reshape(B*T, C, H, W)
        if cond is None:
            cond = torch.zeros_like(x) # -> Unconditioning

        time_emb = time_emb.unsqueeze(1) # From (B C) to (B 1 C)
        time_emb = time_emb.repeat(1, T, 1)
        time_emb = time_emb.reshape(B*T, -1)

        out = torch.cat((x, cond), dim=1) # BT, 2C, H, W
        out = self.block1(out, time_emb)

        skip1 = self.attn1(out)
        out = self.block2(skip1, time_emb)

        skip2 = self.attn2(out)

        out = self.last(skip2)
        *_, c, h, w = out.shape

        return out.reshape(B, T, c, h, w), skip1, skip2
41
+
42
class MidBlock(nn.Module):
    """Bottleneck of the spatial-only U-Net variant: three ResNet blocks
    with one spatial attention (the second attention slot is an identity).
    Channel count is unchanged throughout."""
    def __init__(self, in_ch, time_dim, num_groups=8, heads=4, dim_head=32):
        super(MidBlock, self).__init__()
        self.block1 = ResnetBlock(dim=in_ch, dim_out=in_ch, time_emb_dim=time_dim, groups=num_groups)
        self.attn1 = Residual(PreNorm(in_ch, Quadratic_SpatialAttention(dim=in_ch, heads=heads, dim_head=dim_head)))
        self.block2 = ResnetBlock(dim=in_ch, dim_out=in_ch, time_emb_dim=time_dim, groups=num_groups)
        # Second attention disabled (pass-through) in this ablation variant.
        self.attn2 = nn.Identity()
        # self.attn2 = Residual(PreNorm(in_ch, Quadratic_SpatialAttention(dim=in_ch, heads=heads, dim_head=dim_head)))
        self.block3 = ResnetBlock(dim=in_ch, dim_out=in_ch, time_emb_dim=time_dim, groups=num_groups)

    def forward(self, x, time_emb, relative_pos=None):
        """Transform a (B, T, C, H, W) volume at the bottleneck resolution.
        ``relative_pos`` is accepted for API compatibility but unused."""
        assert x.ndim==5
        B, T, C, H, W = x.shape
        x = x.reshape(B*T, C, H, W)

        time_emb = time_emb.unsqueeze(1) # From (B C) to (B 1 C)
        time_emb = time_emb.repeat(1, T, 1)
        time_emb = time_emb.reshape(B*T, -1)

        out = self.block1(x, time_emb)
        out = self.attn1(out)
        out = self.block2(out, time_emb) # a little bit difference here
        out = self.attn2(out)
        out = self.block3(out, time_emb)

        *_, c, h, w = out.shape
        return out.reshape(B, T, c, h, w)
69
+
70
class Up_Block(nn.Module):
    """Decoder stage of the spatial-only U-Net variant: upsample (or channel
    conversion on the outermost stage), spatial attention, and two ResNet
    blocks fed by the encoder's skip activations."""
    def __init__(self, in_chs, hid_ch, out_ch, is_last, time_dim, patch_size=None, num_groups=8, heads=4, dim_head=32):
        super(Up_Block, self).__init__()
        # in_chs = (channels coming up the decoder, channels of the skips).
        in_ch, skip_ch = in_chs
        self.up = Upsample2D(dim_in=in_ch, dim_out=hid_ch) if not is_last else ChannelConversion(in_ch, hid_ch)
        self.attn1 = Residual(PreNorm(hid_ch, Quadratic_SpatialAttention(dim=hid_ch, heads=heads, dim_head=dim_head) if patch_size is None else Linear_SpatialAttention(dim=hid_ch, patch_size=patch_size, heads=heads, dim_head=dim_head)))
        self.block1 = ResnetBlock(dim=hid_ch+skip_ch, dim_out=hid_ch, time_emb_dim=time_dim, groups=num_groups)
        # Second attention disabled (pass-through) in this ablation variant.
        self.attn2 = nn.Identity()
        # self.attn2 = Residual(PreNorm(hid_ch, Quadratic_SpatialAttention(dim=hid_ch, heads=heads, dim_head=dim_head) if patch_size is None else Linear_SpatialAttention(dim=hid_ch, patch_size=patch_size, heads=heads, dim_head=dim_head)))
        self.block2 = ResnetBlock(dim=hid_ch+skip_ch, dim_out=out_ch, time_emb_dim=time_dim, groups=num_groups)

    def forward(self, x, time_emb, skip1, skip2):
        """Decode a (B, T, C, H, W) volume using the matching encoder skips.

        NOTE(review): ``skip2`` is concatenated before BOTH ResNet blocks and
        ``skip1`` is never used. The stldm_hf.py Up_Block pairs block1/block2
        with different skips — confirm whether the second cat below should
        use ``skip1`` instead. (Both skips share the same channel count, so
        either choice is shape-compatible.)
        """
        assert x.ndim==5
        B, T, C, H, W = x.shape
        x = x.reshape(B*T, C, H, W)

        time_emb = time_emb.unsqueeze(1) # From (B C) to (B 1 C)
        time_emb = time_emb.repeat(1, T, 1)
        time_emb = time_emb.reshape(B*T, -1)

        out = self.up(x)
        *_, c, h, w = out.shape
        out = self.attn1(out)

        out = torch.cat((out, skip2), dim=1)
        out = self.block1(out, time_emb)

        out = self.attn2(out)

        out = torch.cat((out, skip2), dim=1)
        out = self.block2(out, time_emb)
        *_, c, h, w = out.shape
        return out.reshape(B, T, c, h, w)
103
+
104
class LDM(nn.Module):
    """Latent-diffusion UNet operating on (B, T, C, H, W) latents.

    Mirror-symmetric encoder (`Down_Block`s) / decoder (`Up_Block`s) with a
    `MidBlock` bottleneck. An optional conditioning tensor `conds` is injected
    into every encoder stage and downsampled in lockstep via `self.conditions`.
    """
    def __init__(self, in_ch, chs_mult:tuple, patch_size=None, num_groups=8, heads=4, dim_head=32, base_ch=64):
        super(LDM, self).__init__()
        # Time Embedding MLP
        time_dim = 4*base_ch
        fourier_dim = base_ch
        self.time_mlp = Time_MLP(dim=base_ch, time_dim=time_dim, fourier_dim=fourier_dim)

        ups, downs = [], []
        conditions = []

        layer_no = len(chs_mult)
        # Channel plan: [in_ch, base*m1, base*m2, ...]; decoder mirrors the encoder.
        chs = [in_ch, *map(lambda m: base_ch*m, chs_mult)]
        ch_in, ch_out = chs[:-1], chs[1:]
        up_in, up_out = list(reversed(ch_out)), list(reversed(ch_in))

        patches = None if patch_size is None else [patch_size//(2**n) for n in range(layer_no)] # Patch Size should be 2^N
        for n in range(layer_no):
            # Encoder input is 2*ch_in[n]: latent concatenated with the conditioning stream.
            downs.append(
                Down_Block(in_ch=2*ch_in[n], hid_ch=ch_in[n], out_ch=ch_out[n], time_dim=time_dim, patch_size=None if patch_size is None else patches[n], is_last=(n==layer_no-1), num_groups=num_groups, heads=heads, dim_head=dim_head)
            )
            ups.append(
                Up_Block(in_chs=(up_in[n], ch_in[-n-1]), hid_ch=up_in[n], out_ch=up_out[n], time_dim=time_dim, patch_size=None if patch_size is None else patches[layer_no-n-1], is_last=(n==0), num_groups=num_groups, heads=heads, dim_head=dim_head)
            )
            # NOTE(review): `n != -1` is always true, so a conditioning downsampler is
            # built for every stage including the last (whose output is never used in
            # forward). Probably `n != layer_no - 1` was intended — but changing it
            # would alter the module registry / state-dict keys, breaking checkpoint
            # loading, so it is left as-is.
            if n != -1:
                conditions.append(
                    Downsample2D(dim_in=ch_in[n], dim_out=ch_out[n])
                )

        self.downs = nn.ModuleList(downs)
        self.ups = nn.ModuleList(ups)
        self.conditions = nn.ModuleList(conditions)
        self.mid = MidBlock(in_ch=ch_out[-1], time_dim=time_dim, num_groups=num_groups, heads=heads, dim_head=dim_head)

    def forward(self, x, time, conds=None):
        # x: noisy latents (B, T, C, H, W); time: diffusion timestep(s) (B,);
        # conds: optional conditioning, consumed by each Down_Block then downsampled.
        t = self.time_mlp(time)
        hids1, hids2 = [], []

        for n, down_block in enumerate(self.downs):
            x, skip1, skip2 = down_block(x, t, conds)
            hids1.append(skip1)
            hids2.append(skip2)
            if conds is not None:
                conds = self.conditions[n](conds)

        out = self.mid(x, t)

        # Decoder consumes the skips deepest-first (stack pop order).
        for up_block in self.ups:
            out = up_block(out, t, hids1.pop(), hids2.pop())
        return out
154
+
155
+ # constants
156
+ from collections import namedtuple
157
+ from torch.cuda.amp import autocast
158
+ import torch.nn.functional as F
159
+ from einops import reduce
160
+ from tqdm.auto import tqdm
161
+
162
# Pair returned by GaussianDiffusion.model_predictions: predicted noise and x_0.
ModelPrediction = namedtuple('ModelPrediction', ('pred_noise', 'pred_x_start'))
163
+
164
def identity(t, *args, **kwargs):
    """No-op transform: return *t* unchanged, ignoring any extra arguments."""
    return t
166
+
167
def extract(a, t, x_shape):
    """Gather per-batch schedule values a[t] and reshape for broadcasting.

    a: 1-D schedule tensor; t: (b,) long indices; returns shape (b, 1, ..., 1)
    with as many trailing singleton dims as x_shape has beyond the batch dim.
    """
    batch = t.shape[0]
    gathered = a.gather(-1, t)
    return gathered.view(batch, *((1,) * (len(x_shape) - 1)))
171
+
172
def default(val, d):
    """Return *val* unless it is None; otherwise return *d*, calling it if callable."""
    if val is not None:
        return val
    return d() if callable(d) else d
176
+
177
def exists(x):
    """True iff *x* is not None (falsy values like 0 and '' still count as existing)."""
    return x is not None
179
+
180
def guidance_scheduler(sampling_step: int, const: float):
    """Constant classifier-free-guidance weight schedule of length *sampling_step*."""
    schedule = torch.ones(sampling_step)
    return schedule * const
182
+
183
class GaussianDiffusion(nn.Module):
    """DDPM/DDIM latent-diffusion wrapper around a video-prediction backbone.

    `vp_model` supplies conditioning latents (and a VAE) from input frames;
    `diffusion` is the denoising UNet operating on (B, T, C, H, W) latents.
    Precomputes all beta-schedule-derived quantities as float32 buffers.
    Optional constant classifier-free guidance is installed via `setup_guidance`.
    """
    def __init__(
        self,
        vp_model,
        diffusion,
        timesteps = 1000,
        sampling_timesteps = None,
        objective = 'pred_v',
        beta_schedule = 'sigmoid',
        schedule_fn_kwargs = dict(),
        ddim_sampling_eta = 0.,
        offset_noise_strength = 0., # https://www.crosslabs.org/blog/diffusion-with-offset-noise
        min_snr_loss_weight = False, # https://arxiv.org/abs/2303.09556
        min_snr_gamma = 5
    ):
        super(GaussianDiffusion, self).__init__()

        self.backbone = vp_model
        self.diff_unet = diffusion

        self.objective = objective
        assert objective in {'pred_noise', 'pred_x0', 'pred_v'}, 'objective must be either pred_noise (predict noise) or pred_x0 (predict image start) or pred_v (predict v [v-parameterization as defined in appendix D of progressive distillation paper, used in imagen-video successfully])'

        if beta_schedule == 'linear':
            beta_schedule_fn = linear_beta_schedule
        elif beta_schedule == 'cosine':
            beta_schedule_fn = cosine_beta_schedule
        elif beta_schedule == 'sigmoid':
            beta_schedule_fn = sigmoid_beta_schedule
        else:
            raise ValueError(f'unknown beta schedule {beta_schedule}')

        betas = beta_schedule_fn(timesteps, **schedule_fn_kwargs)

        alphas = 1. - betas
        alphas_cumprod = torch.cumprod(alphas, dim=0)
        alphas_cumprod_prev = F.pad(alphas_cumprod[:-1], (1, 0), value = 1.)

        timesteps, = betas.shape
        self.num_timesteps = int(timesteps)

        # sampling related parameters

        self.sampling_timesteps = default(sampling_timesteps, timesteps) # default num sampling timesteps to number of timesteps at training

        assert self.sampling_timesteps <= timesteps
        self.is_ddim_sampling = self.sampling_timesteps < timesteps
        self.ddim_sampling_eta = ddim_sampling_eta

        # helper function to register buffer from float64 to float32

        register_buffer = lambda name, val: self.register_buffer(name, val.to(torch.float32))

        register_buffer('betas', betas)
        register_buffer('alphas_cumprod', alphas_cumprod)
        register_buffer('alphas_cumprod_prev', alphas_cumprod_prev)

        # calculations for diffusion q(x_t | x_{t-1}) and others

        register_buffer('sqrt_alphas_cumprod', torch.sqrt(alphas_cumprod))
        register_buffer('sqrt_one_minus_alphas_cumprod', torch.sqrt(1. - alphas_cumprod))
        register_buffer('log_one_minus_alphas_cumprod', torch.log(1. - alphas_cumprod))
        register_buffer('sqrt_recip_alphas_cumprod', torch.sqrt(1. / alphas_cumprod))
        register_buffer('sqrt_recipm1_alphas_cumprod', torch.sqrt(1. / alphas_cumprod - 1))

        # calculations for posterior q(x_{t-1} | x_t, x_0)

        posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)

        # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)

        register_buffer('posterior_variance', posterior_variance)

        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain

        register_buffer('posterior_log_variance_clipped', torch.log(posterior_variance.clamp(min =1e-20)))
        register_buffer('posterior_mean_coef1', betas * torch.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod))
        register_buffer('posterior_mean_coef2', (1. - alphas_cumprod_prev) * torch.sqrt(alphas) / (1. - alphas_cumprod))

        # offset noise strength - in blogpost, they claimed 0.1 was ideal

        self.offset_noise_strength = offset_noise_strength

        # derive loss weight
        # snr - signal noise ratio

        snr = alphas_cumprod / (1 - alphas_cumprod)

        # https://arxiv.org/abs/2303.09556

        maybe_clipped_snr = snr.clone()
        if min_snr_loss_weight:
            maybe_clipped_snr.clamp_(max = min_snr_gamma)

        if objective == 'pred_noise':
            register_buffer('loss_weight', maybe_clipped_snr / snr)
        elif objective == 'pred_x0':
            register_buffer('loss_weight', maybe_clipped_snr)
        elif objective == 'pred_v':
            register_buffer('loss_weight', maybe_clipped_snr / (snr + 1))

    @property
    def device(self):
        # Device of the registered buffers (and thus the whole module).
        return self.betas.device

    # CFG schdeuler => by taking pre-setting scheduler
    def setup_guidance(self, scheduler):
        """Install (or clear, with None) a per-step classifier-free-guidance schedule.

        NOTE(review): `self.CFG_sch` only exists after this is called;
        `model_predictions` reads it unconditionally, so this must be invoked
        (possibly with None) before any sampling.
        """
        if exists(scheduler):
            self.CFG_sch = scheduler.to(self.device)
        else:
            self.CFG_sch = scheduler

    def predict_start_from_noise(self, x_t, t, noise):
        # x_0 = sqrt(1/acum) * x_t - sqrt(1/acum - 1) * eps
        return (
            extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
            extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
        )

    def predict_noise_from_start(self, x_t, t, x0):
        # eps = (sqrt(1/acum) * x_t - x_0) / sqrt(1/acum - 1)
        return (
            (extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - x0) / \
            extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
        )

    def predict_v(self, x_start, t, noise):
        # v-parameterization (progressive distillation, appendix D).
        return (
            extract(self.sqrt_alphas_cumprod, t, x_start.shape) * noise -
            extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * x_start
        )

    def predict_start_from_v(self, x_t, t, v):
        return (
            extract(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t -
            extract(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * v
        )

    def q_posterior(self, x_start, x_t, t):
        """Mean/variance of the Gaussian posterior q(x_{t-1} | x_t, x_0)."""
        posterior_mean = (
            extract(self.posterior_mean_coef1, t, x_t.shape) * x_start +
            extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_variance = extract(self.posterior_variance, t, x_t.shape)
        posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, x_t.shape)
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def model_predictions(self, x, t, cond, clip_x_start = False, rederive_pred_noise = False):
        """Run the denoiser and convert its output to (pred_noise, pred_x_start).

        With a CFG schedule installed, combines conditional and unconditional
        outputs as cond_out - w * (uncond_out - cond_out); the weight is looked
        up with the (assumed batch-uniform) timestep t[0].
        """
        # print(t.device)
        if exists(self.CFG_sch):
            uncond = self.diff_unet(x, t, conds=None) #conds=torch.zeros_like(cond))
            model_output = self.diff_unet(x, t, conds=cond)
            time = int(t[0])
            model_output = model_output - self.CFG_sch[time] * (uncond - model_output)
        else:
            model_output = self.diff_unet(x, t, conds=cond)
        maybe_clip = partial(torch.clamp, min = -1., max = 1.) if clip_x_start else identity

        if self.objective == 'pred_noise':
            pred_noise = model_output
            x_start = self.predict_start_from_noise(x, t, pred_noise)
            x_start = maybe_clip(x_start)

            if clip_x_start and rederive_pred_noise:
                pred_noise = self.predict_noise_from_start(x, t, x_start)

        elif self.objective == 'pred_x0':
            x_start = model_output
            x_start = maybe_clip(x_start)
            pred_noise = self.predict_noise_from_start(x, t, x_start)

        elif self.objective == 'pred_v':
            v = model_output
            x_start = self.predict_start_from_v(x, t, v)
            x_start = maybe_clip(x_start)
            pred_noise = self.predict_noise_from_start(x, t, x_start)

        return ModelPrediction(pred_noise, x_start)

    def p_mean_variance(self, x, t, cond=None, clip_denoised = True):
        """Posterior mean/variance for the reverse step, from the model's x_0 estimate."""
        preds = self.model_predictions(x, t, cond=cond, clip_x_start=False,)
        x_start = preds.pred_x_start

        if clip_denoised:
            x_start.clamp_(-1., 1.)

        model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start = x_start, x_t = x, t = t)
        return model_mean, posterior_variance, posterior_log_variance, x_start

    @torch.no_grad()
    def p_sample(self, x, t: int, cond=None):
        """One ancestral (DDPM) reverse step from timestep t."""
        b, *_, device = *x.shape, self.device
        batched_times = torch.full((b,), t, device = device, dtype = torch.long)
        model_mean, _, model_log_variance, x_start = self.p_mean_variance(x = x, t = batched_times, cond=cond, clip_denoised = False)
        noise = torch.randn_like(x) if t > 0 else 0. # no noise if t == 0
        pred_img = model_mean + (0.5 * model_log_variance).exp() * noise
        return pred_img, x_start

    @torch.no_grad()
    def p_sample_loop(self, shape, cond=None, return_all_timesteps = False):
        """Full DDPM sampling chain from pure noise down to t=0."""
        batch, device = shape[0], self.device

        frames_pred = torch.randn(shape, device = device)
        imgs = [frames_pred]

        for t in tqdm(reversed(range(0, self.num_timesteps)), desc = 'sampling loop time step', total = self.num_timesteps, disable=True):
            frames_pred, _ = self.p_sample(frames_pred, t, cond=cond)
            imgs.append(frames_pred)

        ret = frames_pred if not return_all_timesteps else torch.stack(imgs, dim = 1)
        return ret

    @torch.no_grad()
    def ddim_sample(self, shape, cond=None, return_all_timesteps = False):
        """Accelerated DDIM sampling over `sampling_timesteps` steps (eta controls stochasticity)."""
        batch, total_timesteps, sampling_timesteps, eta, objective = shape[0], self.num_timesteps, self.sampling_timesteps, self.ddim_sampling_eta, self.objective
        device = self.device
        times = torch.linspace(-1, total_timesteps - 1, steps = sampling_timesteps + 1) # [-1, 0, 1, 2, ..., T-1] when sampling_timesteps == total_timesteps
        times = list(reversed(times.int().tolist()))
        time_pairs = list(zip(times[:-1], times[1:])) # [(T-1, T-2), (T-2, T-3), ..., (1, 0), (0, -1)]

        frames_pred = torch.randn(shape, device = device)
        imgs = [frames_pred]

        for time, time_next in tqdm(time_pairs, desc = 'sampling loop time step', disable=True):
            time_cond = torch.full((batch,), time, device = device, dtype = torch.long)
            pred_noise, x_start, *_ = self.model_predictions(
                frames_pred,
                time_cond,
                cond = cond, #cond.copy(),
                clip_x_start = False,
                rederive_pred_noise = True
            )

            if time_next < 0:
                frames_pred = x_start
                imgs.append(frames_pred)
                continue

            alpha = self.alphas_cumprod[time]
            alpha_next = self.alphas_cumprod[time_next]

            sigma = eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
            c = (1 - alpha_next - sigma ** 2).sqrt()

            noise = torch.randn_like(frames_pred)

            frames_pred = x_start * alpha_next.sqrt() + \
                          c * pred_noise + \
                          sigma * noise

            imgs.append(frames_pred)

        ret = frames_pred if not return_all_timesteps else torch.stack(imgs, dim = 1)
        return ret

    @torch.no_grad()
    def sample(self, frames_in, return_all_timesteps = False):
        """Generate future frames: sample latents conditioned on the backbone, then VAE-decode."""
        assert frames_in.ndim == 5
        B, T_in, C, H, W = frames_in.shape
        device = self.device

        backbone_output, conds, *_ = self.backbone(frames_in) # updated for Updated loss function on 03/07
        sample_fn = self.p_sample_loop if not self.is_ddim_sampling else self.ddim_sample

        *_, c, h, w = conds.shape
        tgt_shape = conds.reshape(B, -1, c, h, w).shape
        ldm_pred = sample_fn(
            tgt_shape,
            cond=conds,
            return_all_timesteps = return_all_timesteps
        )

        ldm_pred = rearrange(ldm_pred, 'b t c h w -> (b t) c h w')
        frames_pred = self.backbone.vae.decode(ldm_pred)
        frames_pred = rearrange(frames_pred, '(b t) c h w -> b t c h w', b=B)
        return frames_pred, backbone_output

    def predict(self, frames_in, compute_loss=False, **kwargs):
        # NOTE(review): `compute_loss` and **kwargs are accepted but ignored here.
        pred, mu = self.sample(frames_in=frames_in)
        return pred, mu

    def compute_loss(self, frames_in, frames_gt, validate=False):
        """Compute the training losses: VAE reconstruction, backbone, diffusion, and prior KL."""
        # NOTE(review): this local is never read afterwards (and shadows the method name).
        compute_loss = True and (not validate)
        B, T_in, C, H, W = frames_in.shape
        T_out = frames_gt.shape[1]
        device = frames_in.device

        """
        Diffusion Loss
        """
        backbone_output, conds = self.backbone(frames_in)
        hid_gt, _ = self.backbone.vae.encode(
            rearrange(frames_gt, 'b t c h w -> (b t) c h w')
        )
        hid_gt = rearrange(hid_gt, '(b t) c h w -> b t c h w', b=B)
        t = torch.randint(0, self.num_timesteps, (B,), device=self.device).long()
        # 15% of batches are trained unconditionally (for classifier-free guidance).
        if random.random() > 0.85: # Unconditional
            conds = None
        diff_loss = self.p_losses(hid_gt.detach(), t, cond=conds)

        """
        Backbone Loss
        """
        mu_loss = self.backbone._losses_(frames_in, frames_gt)

        """
        VAE Loss
        """
        ae_loss, kl_loss = self.backbone.vae._losses_(
            rearrange(torch.cat((frames_in, frames_gt), dim=1), 'b t c h w -> (b t) c h w'),
            rearrange(torch.cat((frames_in, frames_gt), dim=1), 'b t c h w -> (b t) c h w')
        )
        kl_weight = 1E-6
        recon_loss = ae_loss + kl_weight*kl_loss

        """
        Prior Loss at t=T [Noisy]
        """
        # NOTE(review): frames_gt is encoded a second time here — the `hid_gt`
        # computed above could be reused to save one VAE forward pass.
        hid_gt, _ = self.backbone.vae.encode(
            rearrange(frames_gt, 'b t c h w -> (b t) c h w')
        )
        hid_gt = rearrange(hid_gt, '(b t) c h w -> b t c h w', b=B)
        T = torch.ones((B,), device=self.device).long() * (self.num_timesteps - 1)
        mu_noisy = extract(self.sqrt_alphas_cumprod, T, hid_gt.shape) * hid_gt
        sigma_noisy = extract(self.sqrt_one_minus_alphas_cumprod, T, hid_gt.shape)
        log_var_noisy = 2*torch.log(sigma_noisy)
        prior_loss = self.kl_from_standard_normal(mu_noisy, log_var_noisy)

        return recon_loss, mu_loss, diff_loss, prior_loss

    def kl_from_standard_normal(self, mean, log_var):
        """Mean KL divergence of N(mean, exp(log_var)) from N(0, I)."""
        kl = 0.5 * (log_var.exp() + mean.square() - 1.0 - log_var)
        return kl.mean()

    @autocast(enabled = False)
    def q_sample(self, x_start, t, noise = None):
        """Forward diffusion: sample x_t ~ q(x_t | x_0). Runs in full precision."""
        noise = default(noise, lambda: torch.randn_like(x_start))

        return (
            extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
            extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
        )

    def p_losses(self, x_start, t, noise=None, offset_noise_strength=None, cond=None):
        """SNR-weighted MSE between the denoiser output and the objective's target."""
        b, T, c, h, w = x_start.shape

        noise = default(noise, lambda: torch.randn_like(x_start))

        # offset noise - https://www.crosslabs.org/blog/diffusion-with-offset-noise
        offset_noise_strength = default(offset_noise_strength, self.offset_noise_strength)

        if offset_noise_strength > 0.:
            offset_noise = torch.randn(x_start.shape[:2], device = self.device)
            noise += offset_noise_strength * rearrange(offset_noise, 'b c -> b c 1 1')

        # noise sample
        x = self.q_sample(x_start=x_start, t=t, noise=noise) # Use q_sample here for updating: https://github.com/lucidrains/denoising-diffusion-pytorch/blob/main/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py#L763

        model_out = self.diff_unet(x, t, conds=cond)

        if self.objective == 'pred_noise':
            target = noise
        elif self.objective == 'pred_x0':
            target = x_start
        elif self.objective == 'pred_v':
            v = self.predict_v(x_start, t, noise)
            target = v
        else:
            raise ValueError(f'unknown objective {self.objective}')

        loss = F.mse_loss(model_out, target, reduction = 'none') # (B, T, C, H, W)
        loss = reduce(loss, 'b ... -> b', 'mean')

        loss = loss * extract(self.loss_weight, t, loss.shape)
        return loss.mean()

    @torch.no_grad()
    def forward(self, input_x, include_mu=False, **kwargs):
        """Inference entry point: sampled prediction, optionally with the backbone output."""
        pred, mu = self.predict(input_x, compute_loss=False)
        if include_mu:
            return pred, mu
        else:
            return pred
564
+
565
+ from stldm.modules import SimVPV2_Model, VAE
566
def model_setup(model_config, print_info=False, cfg_str=None):
    """Build the full GaussianDiffusion model (SimVP backbone + latent-diffusion UNet).

    When `cfg_str` is given, a constant classifier-free-guidance schedule of that
    strength is installed; otherwise guidance is disabled.
    """
    if print_info:
        for line in (
            'Setup a Spatial diffusion with replacing a Temporal attention with Spatial attention',
            'This is a diffusion with the consideration of (BT, C, H, W)',
            'Train it from end to end',
        ):
            print(line)

    backbone = SimVPV2_Model(**model_config['vp_param'])
    unet = LDM(**model_config['stldm_param'])
    model = GaussianDiffusion(vp_model=backbone, diffusion=unet, **model_config['param'])

    if cfg_str is not None:
        scheduler = guidance_scheduler(sampling_step=model_config['param']['timesteps'], const=cfg_str)
    else:
        scheduler = None
    model.setup_guidance(scheduler)

    return model
583
+
584
def ae_setup(model_config):
    """Instantiate the SimVP backbone and return only its VAE component."""
    return SimVPV2_Model(**model_config['vp_param']).vae
589
+
590
def backbone_setup(model_config):
    """Instantiate and return the SimVP video-prediction backbone."""
    return SimVPV2_Model(**model_config['vp_param'])
stldm/submodules.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, math
2
+ from torch import nn
3
+ from einops import rearrange
4
+
5
+ # building block modules
6
def exists(x):
    """Return True when *x* is not None."""
    return x is not None
8
+
9
class LayerNorm(nn.Module):
    """Channel-wise layer norm for (B, C, H, W) tensors with a learnable per-channel gain."""

    def __init__(self, dim):
        super().__init__()
        self.g = nn.Parameter(torch.ones(1, dim, 1, 1))

    def forward(self, x):
        # Looser epsilon under reduced-precision dtypes.
        eps = 1e-5 if x.dtype == torch.float32 else 1e-3
        mean = torch.mean(x, dim=1, keepdim=True)
        var = torch.var(x, dim=1, unbiased=False, keepdim=True)
        return (x - mean) * torch.rsqrt(var + eps) * self.g
19
+
20
+
21
class PreNorm(nn.Module):
    """Normalize the input with LayerNorm before delegating to the wrapped module."""

    def __init__(self, dim, fn):
        super().__init__()
        self.fn = fn
        self.norm = LayerNorm(dim)

    def forward(self, x, *args, **kwargs):
        normed = self.norm(x)
        return self.fn(normed, *args, **kwargs)
30
+
31
class Residual(nn.Module):
    """Skip connection around the wrapped module: out = fn(x, ...) + x."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, *args, **kwargs):
        out = self.fn(x, *args, **kwargs)
        return out + x
38
+
39
class Block(nn.Module):
    """Conv3x3 -> GroupNorm -> optional FiLM (scale/shift) -> SiLU."""

    def __init__(self, dim, dim_out, groups=8):
        super().__init__()
        self.proj = nn.Conv2d(dim, dim_out, 3, padding=1)
        # Fall back to a single group when `groups` does not divide the channel count.
        self.norm = nn.GroupNorm(groups if dim_out % groups == 0 else 1, dim_out)
        self.act = nn.SiLU()

    def forward(self, x, scale_shift=None):
        h = self.norm(self.proj(x))
        if scale_shift is not None:
            scale, shift = scale_shift
            h = h * (scale + 1) + shift
        return self.act(h)
58
+
59
class ResnetBlock(nn.Module):
    """Two conv Blocks, the first FiLM-modulated by a time embedding, plus a 1x1 residual."""

    def __init__(self, dim, dim_out, *, time_emb_dim=None, groups=8):
        super().__init__()
        if time_emb_dim is not None:
            # Projects the time embedding to per-channel (scale, shift).
            self.mlp = nn.Sequential(nn.SiLU(), nn.Linear(time_emb_dim, dim_out * 2))
        else:
            self.mlp = None

        self.block1 = Block(dim, dim_out, groups=groups)
        self.block2 = Block(dim_out, dim_out, groups=groups)
        self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()

    def forward(self, x, time_emb=None):
        scale_shift = None
        if self.mlp is not None and time_emb is not None:
            emb = self.mlp(time_emb)
            # (b, 2*c) -> two (b, c, 1, 1) tensors for scale and shift.
            scale_shift = rearrange(emb, 'b c -> b c 1 1').chunk(2, dim=1)

        h = self.block1(x, scale_shift=scale_shift)
        h = self.block2(h)
        return h + self.res_conv(x)
84
+
85
+ """
86
+ Input Tensor and Output Tensor should be in the format of (BT, C, H, W) with # dims = 4
87
+ """
88
class Linear_SpatialAttention(nn.Module):
    """Linear (kernelized) self-attention over non-overlapping spatial patches.

    Input/output: (B*T, C, H, W); H and W must be divisible by `patch_size`.
    Attention is computed independently per (head, patch) using the softmax
    linear-attention factorization, so cost is linear in the number of pixels
    per patch.
    """
    def __init__(self, dim, patch_size, heads=4, dim_head=32):
        super(Linear_SpatialAttention, self).__init__()
        self.scale = dim_head ** -0.5
        self.patch_size = patch_size
        self.heads = heads
        hidden_dim = dim_head*heads # No of Channel for (Q, K, V)
        self.to_qkv = nn.Conv2d(dim, hidden_dim*3, kernel_size=1, padding=0, bias=False)
        self.to_out = nn.Sequential(
            nn.Conv2d(hidden_dim, dim, kernel_size=1),
            LayerNorm(dim)
        )

    def forward(self, x):
        assert x.ndim == 4
        BT, C, H, W = x.shape
        nh, nw = H//self.patch_size, W//self.patch_size
        qkv = self.to_qkv(x).chunk(3, dim=1) # qkv tuple in (q, k , v)
        # [B, Head × C, X × P, Y × P] -> [B, Head × X × Y, C, P × P]
        q, k, v = map(lambda t: rearrange(t, 'b (h c) (nh ph) (nw pw) -> b (h nh nw) c (ph pw)', h=self.heads, ph=self.patch_size, pw=self.patch_size, nh=nh, nw=nw), qkv)
        # Linear-attention normalization: q softmaxed over channels, k over positions.
        q = q.softmax(dim=-2)
        k = k.softmax(dim=-1)
        q = q*self.scale
        # NOTE(review): unlike Linear_TemporalAttention, v is not rescaled here;
        # confirm the asymmetry is intentional.

        context = torch.einsum('b h d n, b h e n -> b h d e', k, v)
        out = torch.einsum('b h d e, b h d n -> b h e n', context, q)
        out = rearrange(out, 'b (h nh nw) c (ph pw) -> b (h c) (nh ph) (nw pw)', h=self.heads, ph=self.patch_size, pw=self.patch_size, nh=nh, nw=nw)
        out = self.to_out(out)
        return out
117
+
118
+ """
119
+ Input Tensor and Output Tensor should be in the format of (B, T, C, H, W) with # dims = 5
120
+ """
121
class Linear_TemporalAttention(nn.Module):
    """Linear self-attention across the time axis, per (head, pixel) location.

    Input/output: (B, T, C, H, W). Frames are flattened to (B*T, C, H, W) for the
    1x1 qkv projection, then attention runs over the T dimension for every
    spatial position independently.
    """
    def __init__(self, dim, heads=4, dim_head=32):
        super(Linear_TemporalAttention, self).__init__()
        self.scale = dim_head ** -0.5
        self.heads = heads
        hidden_dim = dim_head*heads # No of Channel for (Q, K, V)
        self.to_qkv = nn.Conv2d(dim, hidden_dim*3, kernel_size=1, padding=0, bias=False)
        self.to_out = nn.Sequential(
            nn.Conv2d(hidden_dim, dim, kernel_size=1),
            LayerNorm(dim)
        )

    def forward(self, x):
        assert x.ndim == 5
        B, T, C, H, W = x.shape
        x = x.reshape(B*T, C, H, W)
        qkv = self.to_qkv(x).chunk(3, dim=1) # qkv tuple in (q, k , v)
        # [B, Head × C, X × P, Y × P] -> [B, Head × X × Y, C, P × P]
        q, k, v = map(lambda t: rearrange(t, '(b t) (h c) x y -> b (h x y) c t', h=self.heads, x=H, y=W, t=T), qkv)
        q = q.softmax(dim=-2)
        k = k.softmax(dim=-1)
        q = q*self.scale
        # In-place rescaling of v by the number of spatial positions.
        # NOTE(review): the spatial variant has no such rescaling — verify intended.
        v /= (H*W)

        context = torch.einsum('b h d n, b h e n -> b h d e', k, v)
        out = torch.einsum('b h d e, b h d n -> b h e n', context, q)
        out = rearrange(out, 'b (h x y) c t -> (b t) (h c) x y', h=self.heads, x=H, y=W, t=T)
        out = self.to_out(out)
        return out.reshape(B, T, C, H, W)
150
+
151
+ # Does not Follow what suggested by the paper as could not ensure the spatial factor of 2
152
def Downsample2D(dim_in, dim_out):
    """Halve the spatial resolution with a stride-2 4x4 convolution."""
    return nn.Conv2d(dim_in, dim_out, kernel_size=4, stride=2, padding=1)
154
+
155
def Upsample2D(dim_in, dim_out):
    """Double the spatial resolution with a stride-2 4x4 transposed convolution."""
    return nn.ConvTranspose2d(dim_in, dim_out, kernel_size=4, stride=2, padding=1)
157
+
158
def ChannelConversion(dim_in, dim_out):
    """Change the channel count with a resolution-preserving 3x3 convolution."""
    return nn.Conv2d(dim_in, dim_out, kernel_size=3, padding=1)
160
+
161
+ """
162
+ Input Tensor and Output Tensor should be in the format of (BT, C, H, W) with # dims = 4
163
+ """
164
class Quadratic_SpatialAttention(nn.Module):
    """Full (quadratic-cost) multi-head self-attention over all H*W positions.

    Input/output: (B*T, C, H, W). Memory scales with (H*W)^2, so this is used on
    the lower-resolution stages of the UNet.
    """
    def __init__(self, dim, heads=4, dim_head=32):
        super(Quadratic_SpatialAttention, self).__init__()
        self.scale = dim_head ** -0.5
        self.heads = heads
        hidden_dim = dim_head*heads # No of Channel for (Q, K, V)
        self.to_qkv = nn.Conv2d(dim, hidden_dim*3, kernel_size=1, padding=0, bias=False)
        self.to_out = nn.Sequential(
            nn.Conv2d(hidden_dim, dim, kernel_size=1)
        )

    def forward(self, x):
        assert x.ndim == 4
        BT, C, H, W = x.shape
        qkv = self.to_qkv(x).chunk(3, dim=1) # qkv tuple in (q, k , v)
        # [B, Head × C, X × P, Y × P] -> [B, Head × X × Y, C, P × P]
        q, k, v = map(lambda t: rearrange(t, 'b (h c) x y -> b h c (x y)', h=self.heads), qkv)
        q = q*self.scale

        # Standard scaled dot-product attention over all spatial positions.
        sim = torch.einsum('b h d i, b h d j -> b h i j', q, k)
        attn = sim.softmax(dim = -1)
        out = torch.einsum('b h i j, b h d j -> b h i d', attn, v)

        out = rearrange(out, 'b h (x y) d -> b (h d) x y', x = H, y = W)

        out = self.to_out(out)
        return out
191
+
192
+ """
193
+ Input Tensor and Output Tensor should be in the format of (B, T, C, H, W) with # dims = 5
194
+ """
195
class Quadratic_TemporalAttention(nn.Module):
    """Full self-attention across the T time steps, treating each frame's whole
    (C, H, W) volume as one token.

    Input/output: (B, T, C, H, W). The T x T similarity matrix is cheap, but each
    token has dimension head_channels*H*W.
    """
    def __init__(self, dim, heads=4, dim_head=32):
        super(Quadratic_TemporalAttention, self).__init__()
        self.scale = dim_head ** -0.5
        self.heads = heads
        hidden_dim = dim_head*heads # No of Channel for (Q, K, V)
        self.to_qkv = nn.Conv2d(dim, hidden_dim*3, kernel_size=1, padding=0, bias=False)
        self.to_out = nn.Sequential(
            nn.Conv2d(hidden_dim, dim, kernel_size=1),
        )

    def forward(self, x):
        assert x.ndim == 5
        B, T, C, H, W = x.shape
        x = x.reshape(B*T, C, H, W)
        qkv = self.to_qkv(x).chunk(3, dim=1) # qkv tuple in (q, k , v)
        # [B, Head × C, X × P, Y × P] -> [B, Head × X × Y, C, P × P]
        q, k, v = map(lambda t: rearrange(t, '(b t) (h c) x y -> b h (c x y) t', h=self.heads, x=H, y=W, t=T), qkv)
        q = q*self.scale

        # T x T attention over time steps.
        sim = torch.einsum('b h d i, b h d j -> b h i j', q, k)
        attn = sim.softmax(dim = -1)
        out = torch.einsum('b h i j, b h d j -> b h i d', attn, v)
        out = rearrange(out, 'b h t (c x y) -> (b t) (h c) x y', h=self.heads, x=H, y=W, t=T)
        out = self.to_out(out)
        return out.reshape(B, T, C, H, W)
221
+
222
+ """
223
+ A series of functions required for Diffusion Model copied from DiffCast code
224
+ """
225
def extract(a, t, x_shape):
    """Index schedule tensor `a` at timesteps `t`, reshaped to broadcast over x_shape."""
    b = t.shape[0]
    vals = a.gather(-1, t)
    return vals.reshape(b, *([1] * (len(x_shape) - 1)))
229
+
230
def linear_beta_schedule(timesteps):
    """Linear beta schedule from the original DDPM paper, rescaled to `timesteps` steps."""
    scale = 1000 / timesteps
    return torch.linspace(scale * 0.0001, scale * 0.02, timesteps, dtype=torch.float64)
238
+
239
def cosine_beta_schedule(timesteps, s = 0.008):
    """Cosine beta schedule (https://openreview.net/forum?id=-NEXDKk8gZ).

    Betas are derived from a squared-cosine alpha-bar curve and clipped to
    [0, 0.999].
    """
    t = torch.linspace(0, timesteps, timesteps + 1, dtype=torch.float64) / timesteps
    alphas_cumprod = torch.cos((t + s) / (1 + s) * math.pi * 0.5) ** 2
    alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
    betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
    return torch.clip(betas, 0, 0.999)
250
+
251
def sigmoid_beta_schedule(timesteps, start = -3, end = 3, tau = 1, clamp_min = 1e-5):
    """Sigmoid beta schedule (https://arxiv.org/abs/2212.11972, Fig. 8).

    Reported to work better than cosine for images larger than 64x64.
    NOTE(review): `clamp_min` is accepted but never used by this implementation.
    """
    t = torch.linspace(0, timesteps, timesteps + 1, dtype=torch.float64) / timesteps
    v_start = torch.tensor(start / tau).sigmoid()
    v_end = torch.tensor(end / tau).sigmoid()
    alphas_cumprod = (-((t * (end - start) + start) / tau).sigmoid() + v_end) / (v_end - v_start)
    alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
    betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
    return torch.clip(betas, 0, 0.999)
265
+
266
+ # sinusoidal positional embeds
267
class SinusoidalPosEmb(nn.Module):
    """Transformer-style sinusoidal embedding of a scalar position/timestep.

    Maps a (B,) tensor to (B, dim): the first half sines, the second half cosines.
    """

    def __init__(self, dim):
        super(SinusoidalPosEmb, self).__init__()
        self.dim = dim

    def forward(self, x):
        half_dim = self.dim // 2
        # Geometric frequency ladder from 1 down to 1/10000.
        step = math.log(10000) / (half_dim - 1)
        freqs = torch.exp(-step * torch.arange(half_dim, device=x.device))
        angles = x[:, None] * freqs[None, :]
        return torch.cat((angles.sin(), angles.cos()), dim=-1)
280
+
281
class Time_MLP(nn.Module):
    """Sinusoidal timestep embedding followed by a two-layer GELU MLP.

    NOTE(review): the `dim` argument is accepted but unused; only `fourier_dim`
    and `time_dim` determine the layer sizes.
    """

    def __init__(self, dim, time_dim, fourier_dim=32):
        super(Time_MLP, self).__init__()
        self.mlp = nn.Sequential(
            SinusoidalPosEmb(fourier_dim),
            nn.Linear(fourier_dim, time_dim),
            nn.GELU(),
            nn.Linear(time_dim, time_dim),
        )

    def forward(self, x):
        return self.mlp(x)
293
+
294
class RelativePositionBias(nn.Module):
    """Learned relative-position attention bias using log-spaced distance buckets
    (T5-style bucketing). forward(n, device) returns a (heads, n, n) bias tensor.
    """
    def __init__(
        self,
        heads = 8,
        num_buckets = 32,
        max_distance = 128
    ):
        super().__init__()
        self.num_buckets = num_buckets
        self.max_distance = max_distance
        self.relative_attention_bias = nn.Embedding(num_buckets, heads)

    @staticmethod
    def _relative_position_bucket(relative_position, num_buckets = 32, max_distance = 128):
        # Half the buckets encode sign (before/after); within each half, small
        # distances get exact buckets and larger ones are log-spaced up to
        # max_distance, after which they saturate into the last bucket.
        ret = 0
        n = -relative_position

        num_buckets //= 2
        ret += (n < 0).long() * num_buckets
        n = torch.abs(n)

        max_exact = num_buckets // 2
        is_small = n < max_exact

        val_if_large = max_exact + (
            torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
        ).long()
        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))

        ret += torch.where(is_small, n, val_if_large)
        return ret

    def forward(self, n, device):
        # Build the n x n signed relative-distance grid, bucket it, and look up
        # one learned bias per head.
        q_pos = torch.arange(n, dtype = torch.long, device = device)
        k_pos = torch.arange(n, dtype = torch.long, device = device)
        rel_pos = rearrange(k_pos, 'j -> 1 j') - rearrange(q_pos, 'i -> i 1')
        rp_bucket = self._relative_position_bucket(rel_pos, num_buckets = self.num_buckets, max_distance = self.max_distance)
        values = self.relative_attention_bias(rp_bucket)
        return rearrange(values, 'i j h -> h i j')
333
+
334
+ """
335
+ Input Tensor and Output Tensor should be in the format of (B, T, C, H, W) with # dims = 5
336
+ """
337
class TemporalAttention_Pos(nn.Module):
    """Multi-head self-attention over the time axis with optional additive
    relative-position bias.

    Input and output tensors are (B, T, C, H, W); attention mixes
    information across T only (each spatial location attends to the same
    location at other timesteps).
    """

    def __init__(self, dim, heads=4, dim_head=32):
        # dim: number of input/output channels (C); heads/dim_head size the attention.
        super(TemporalAttention_Pos, self).__init__()
        self.scale = dim_head ** -0.5  # 1/sqrt(d) attention scaling
        self.heads = heads
        hidden_dim = dim_head*heads # No of Channel for (Q, K, V)
        # 1x1 conv producing Q, K and V stacked along the channel axis.
        self.to_qkv = nn.Conv2d(dim, hidden_dim*3, kernel_size=1, padding=0)
        self.to_out = nn.Sequential(
            nn.Conv2d(hidden_dim, dim, kernel_size=1),
        )

    def forward(self, x, rel_pos=None):
        """x: (B, T, C, H, W); rel_pos: optional additive bias broadcastable to
        the (heads, T, T) similarity matrix (e.g. from RelativePositionBias)."""
        assert x.ndim == 5
        B, T, C, H, W = x.shape
        x = x.reshape(B*T, C, H, W)
        qkv = self.to_qkv(x).chunk(3, dim=1) # qkv tuple in (q, k , v)
        # Fold space into the batch so attention runs independently per pixel:
        # (B*T, heads*c, H, W) -> (B*H*W, heads, c, T)
        q, k, v = map(lambda t: rearrange(t, '(b t) (h c) x y -> (b x y) h c t', h=self.heads, x=H, y=W, t=T), qkv)
        q = q*self.scale

        # Similarity over the time axis; i and j index timesteps.
        sim = torch.einsum('b h d i, b h d j -> b h i j', q, k)
        if rel_pos is not None:
            sim += rel_pos
        attn = sim.softmax(dim = -1)
        out = torch.einsum('b h i j, b h d j -> b h i d', attn, v)
        # Unfold space back out of the batch and restore channel-first layout.
        out = rearrange(out, '(b x y) h t c -> (b t) (h c) x y', h=self.heads, x=H, y=W, t=T)
        out = self.to_out(out)
        return out.reshape(B, T, C, H, W)
365
+
366
+
367
class TemporalAttention(nn.Module):
    """Multi-head self-attention over the time axis using per-pixel linear
    projections (no positional bias).

    Input and output tensors are (B, T, C, H, W).
    """

    def __init__(self, dim, heads=4, dim_head=32):
        # dim: number of input/output channels (C); heads/dim_head size the attention.
        super(TemporalAttention, self).__init__()
        self.scale = dim_head ** -0.5  # 1/sqrt(d) attention scaling
        self.heads = heads
        hidden_dim = dim_head*heads
        self.to_k = nn.Linear(dim, hidden_dim, bias=False)
        self.to_q = nn.Linear(dim, hidden_dim, bias=False)
        self.to_v = nn.Linear(dim, hidden_dim, bias=False)
        self.to_out = nn.Linear(hidden_dim, dim)

    def forward(self, x):
        """x: (B, T, C, H, W) -> (B, T, C, H, W), attending across T only."""
        assert x.ndim == 5
        B, T, C, H, W = x.shape
        # Treat every spatial location as an independent length-T sequence.
        x = rearrange(x, 'b t c h w -> b (h w) t c')

        q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
        # Split the projection into heads: (B, H*W, T, heads*d) -> (B, H*W, heads, T, d)
        q = rearrange(q, '... n (h d) -> ... h n d', h=self.heads) # B (H W) Head T Dim
        k = rearrange(k, '... n (h d) -> ... h n d', h=self.heads)
        v = rearrange(v, '... n (h d) -> ... h n d', h=self.heads)
        q = q*self.scale

        # Similarity over the time axis; i and j index timesteps.
        sim = torch.einsum('... h i d, ... h j d -> ... h i j', q, k)
        attn = sim.softmax(dim=-1)
        out = torch.einsum('... h i j, ... h j d -> ... h i d', attn, v)
        # Merge heads back, project, and restore the (B, T, C, H, W) layout.
        out = rearrange(out, '... h i d -> ... i (h d)', h=self.heads)
        out = self.to_out(out)
        out = rearrange(out, 'b (h w) t c -> b t c h w', h=H, w=W)
        return out
utilspp.py ADDED
@@ -0,0 +1,566 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ import lpips as lp
5
+ import pandas as pd
6
+ import torchmetrics
7
+ import matplotlib.pyplot as plt
8
+ from bisect import bisect_right
9
+ import torchvision.transforms as T
10
+ from torch import nn
11
+
12
+ from matplotlib.colors import ListedColormap, BoundaryNorm
13
+ from matplotlib.lines import Line2D
14
+
15
+ from data import dutils
16
+
17
+ # =======================================================================
18
+ # Scheduler Helper Function
19
+ # =======================================================================
20
+
21
class SequentialLR(torch.optim.lr_scheduler._LRScheduler):
    """Receives the list of schedulers that is expected to be called sequentially during
    optimization process and milestone points that provides exact intervals to reflect
    which scheduler is supposed to be called at a given epoch.

    Args:
        schedulers (list): List of chained schedulers.
        milestones (list): List of integers that reflects milestone points.

    Example:
        >>> # Assuming optimizer uses lr = 1. for all groups
        >>> # lr = 0.1 if epoch == 0
        >>> # lr = 0.1 if epoch == 1
        >>> # lr = 0.9 if epoch == 2
        >>> # lr = 0.81 if epoch == 3
        >>> # lr = 0.729 if epoch == 4
        >>> scheduler1 = ConstantLR(self.opt, factor=0.1, total_iters=2)
        >>> scheduler2 = ExponentialLR(self.opt, gamma=0.9)
        >>> scheduler = SequentialLR(self.opt, schedulers=[scheduler1, scheduler2], milestones=[2])
        >>> for epoch in range(100):
        >>>     train(...)
        >>>     validate(...)
        >>>     scheduler.step()
    """

    def __init__(self, optimizer, schedulers, milestones, last_epoch=-1, verbose=False):
        # All chained schedulers must drive the same optimizer instance.
        for scheduler_idx in range(1, len(schedulers)):
            if (schedulers[scheduler_idx].optimizer != schedulers[0].optimizer):
                raise ValueError(
                    "Sequential Schedulers expects all schedulers to belong to the same optimizer, but "
                    "got schedulers at index {} and {} to be different".format(0, scheduler_idx)
                )
        if (len(milestones) != len(schedulers) - 1):
            raise ValueError(
                "Sequential Schedulers expects number of schedulers provided to be one more "
                "than the number of milestone points, but got number of schedulers {} and the "
                "number of milestones to be equal to {}".format(len(schedulers), len(milestones))
            )
        # NOTE(review): this vendored copy deliberately does not call the
        # parent _LRScheduler.__init__; all state lives in the attributes
        # below -- confirm against the upstream torch.optim implementation.
        self.optimizer = optimizer
        self._schedulers = schedulers
        self._milestones = milestones
        self.last_epoch = last_epoch + 1

    def step(self, ref=None):
        # Advance one epoch and delegate to the scheduler owning that epoch.
        # `ref` is the metric forwarded to ReduceLROnPlateau.step(); it is
        # ignored by every other scheduler type.
        self.last_epoch += 1
        # Index of the scheduler active at the (incremented) epoch.
        idx = bisect_right(self._milestones, self.last_epoch)
        if idx > 0 and self._milestones[idx - 1] == self.last_epoch:
            # Exactly at a milestone: restart the incoming scheduler at epoch 0.
            self._schedulers[idx].step(0)
        else:
            # NOTE(review): original carried a "Check HERE" marker -- the
            # ReduceLROnPlateau special-case below is the spot in question.
            if isinstance(self._schedulers[idx], torch.optim.lr_scheduler.ReduceLROnPlateau):
                self._schedulers[idx].step(ref)
            else:
                self._schedulers[idx].step()

    def state_dict(self):
        """Returns the state of the scheduler as a :class:`dict`.

        It contains an entry for every variable in self.__dict__ which
        is not the optimizer.
        The wrapped scheduler states will also be saved.
        """
        state_dict = {key: value for key, value in self.__dict__.items() if key not in ('optimizer', '_schedulers')}
        state_dict['_schedulers'] = [None] * len(self._schedulers)

        # Serialize each wrapped scheduler in place of the live objects.
        for idx, s in enumerate(self._schedulers):
            state_dict['_schedulers'][idx] = s.state_dict()

        return state_dict

    def load_state_dict(self, state_dict):
        """Loads the schedulers state.

        Args:
            state_dict (dict): scheduler state. Should be an object returned
                from a call to :meth:`state_dict`.
        """
        _schedulers = state_dict.pop('_schedulers')
        self.__dict__.update(state_dict)
        # Restore state_dict keys in order to prevent side effects
        # https://github.com/pytorch/pytorch/issues/32756
        state_dict['_schedulers'] = _schedulers

        for idx, s in enumerate(_schedulers):
            self._schedulers[idx].load_state_dict(s)
106
+
107
def warmup_lambda(warmup_steps, min_lr_ratio=0.1):
    """Return a LambdaLR multiplier ramping linearly from ``min_lr_ratio`` to 1.

    The multiplier grows linearly over the first ``warmup_steps`` steps and
    stays at 1.0 afterwards.
    """
    def ret_lambda(epoch):
        if epoch > warmup_steps:
            return 1.0
        return min_lr_ratio + (1.0 - min_lr_ratio) * epoch / warmup_steps
    return ret_lambda
114
+
115
+ # =======================================================================
116
+ # Utils in utils :)
117
+ # =======================================================================
118
def to_cpu_tensor(*args):
    '''
    Input arbitrary number of arrays/tensors; each is converted to a CPU torch.Tensor.

    Numpy arrays become torch tensors (note: ``torch.Tensor(...)`` always
    yields the default float dtype); torch tensors are moved to the CPU;
    anything else is passed through unchanged.

    Fix: use isinstance() instead of exact ``type(x) is`` checks, so
    subclasses (e.g. ``torch.nn.Parameter``, numpy ndarray subclasses) are
    handled instead of silently passed through on a non-CPU device.

    Returns a single value when called with one argument, otherwise a list
    in the same order as the inputs.
    '''
    out = []
    for tensor in args:
        if isinstance(tensor, np.ndarray):
            tensor = torch.Tensor(tensor)
        if isinstance(tensor, torch.Tensor):
            tensor = tensor.cpu()
        out.append(tensor)
    # single value input: return single value output
    if len(out) == 1:
        return out[0]
    return out
133
+
134
def merge_leading_dims(tensor, n=2):
    """Collapse the first ``n`` dimensions of ``tensor`` into a single one."""
    trailing = tuple(tensor.shape[n:])
    return tensor.reshape((-1,) + trailing)
139
+
140
+ # =======================================================================
141
+ # Model Preparation, saving & loading (copied from utils.py)
142
+ # =======================================================================
143
def build_model_name(model_type, model_config):
    '''
    Build the model name (without extension).

    Each config entry becomes a ``key-value`` token joined by underscores;
    list/tuple values join their items with dashes, and boolean values
    contribute only their key.
    '''
    parts = [model_type]
    for key, value in model_config.items():
        if type(value) in (list, tuple):
            # Booleans inside a sequence contribute an empty piece (dash kept).
            pieces = ['' if type(item) is bool else str(item) for item in value]
            parts.append(key + '-' + '-'.join(pieces))
        elif type(value) is bool:
            parts.append(key)
        else:
            parts.append(key + '-' + str(value))
    return '_'.join(parts)
158
+
159
def build_model_path(base_dir, dataset_type, model_type, timestamp=None):
    """Build ``base_dir/dataset_type/model_type[/timestamp]``.

    ``timestamp`` may be None (omit the component), True (use the current
    time formatted as YYYYmmddHHMMSS) or an explicit string.
    """
    root = os.path.join(base_dir, dataset_type, model_type)
    if timestamp is None:
        return root
    if timestamp == True:  # equality (not identity) kept: also matches 1, as before
        return os.path.join(root, pd.Timestamp.now().strftime('%Y%m%d%H%M%S'))
    return os.path.join(root, timestamp)
165
+
166
+ # =======================================================================
167
+ # Preprocess Function for Loading HKO-7 dataset
168
+ # =======================================================================
169
+
170
def hko7_preprocess(x_seq, x_mask, dt_clip, args):
    """Convert a raw HKO-7 batch into (input, target) sequence tensors.

    Args:
        x_seq: uint8-range frames, shape (seq_length, batch_size, 1, 480, 480)
            -- inferred from the transpose/divide below; confirm at the caller.
        x_mask: unused here; presumably kept for loader-interface compatibility.
        dt_clip: clipping parameter forwarded to the dutils rescaling helpers.
        args: config object supporting ``in`` checks, with optional
            ``resize``, ``seq_len`` and ``scale`` attributes.

    Returns:
        (x, y): the first ``seq_len`` frames and the remaining frames, each
        of shape (batch_size, T, 1, resize, resize) with values in [0, 1].
    """
    resize = args.resize if 'resize' in args else x_seq.shape[-1]
    seq_len = args.seq_len if 'seq_len' in args else 5

    # post-process on HKO-10: move batch first and normalise to [0, 1]
    x_seq = x_seq.transpose((1, 0, 2, 3, 4)) / 255. # => (batch_size, seq_length, 1, 480, 480)
    # NOTE(review): default branch applies nonlinear_to_linear; 'non-linear'
    # applies the inverse mapping -- confirm against dutils' conventions.
    if 'scale' in args and args.scale == 'non-linear':
        x_seq = dutils.linear_to_nonlinear_batched(x_seq, dt_clip)
    else:
        x_seq = dutils.nonlinear_to_linear_batched(x_seq, dt_clip)

    b, t, c, h, w = x_seq.shape
    assert c == 1, f'# channels ({c}) != 1'

    # resize (downsample) the images if necessary
    x_seq = torch.Tensor(x_seq).float().reshape((b*t, c, h, w))
    if resize != h:
        tform = T.Compose([
            T.ToPILImage(),
            T.Resize(resize),
            T.ToTensor(),
        ])
    else:
        # Identity transform keeps the per-frame stack below uniform.
        tform = T.Compose([])

    x_seq = torch.stack([tform(x_frame) for x_frame in x_seq], dim=0)
    x_seq = x_seq.reshape((b, t, c, resize, resize))

    # Split along time into conditioning input and prediction target.
    x, y = x_seq[:, :seq_len], x_seq[:, seq_len:]
    return x, y
200
+
201
+ # =======================================================================
202
+ # Evaluation Metrics-Related
203
+ # =======================================================================
204
+
205
def mae(*args):
    """Mean absolute (L1) error between two tensors, as a numpy scalar.

    Was a lambda bound to a name (PEP 8 E731); a def carries a docstring
    and a proper __name__ for tracebacks. Call signature is unchanged.
    """
    return torch.nn.functional.l1_loss(*args).cpu().detach().numpy()

def mse(*args):
    """Mean squared (L2) error between two tensors, as a numpy scalar."""
    return torch.nn.functional.mse_loss(*args).cpu().detach().numpy()
207
+
208
def ssim(y_pred, y):
    """Mean SSIM over all frames of (B, T, C, H, W) sequences in [0, 1]."""
    target, pred = to_cpu_tensor(y, y_pred)
    batch, steps, chans, height, width = target.shape
    flat = (batch * steps, chans, height, width)
    # Fold time into the batch axis and clamp into the metric's data range
    # (to further ensure none of the input is negative).
    target = torch.clamp(target.reshape(flat), 0, 1)
    pred = torch.clamp(pred.reshape(flat), 0, 1)
    metric = torchmetrics.image.StructuralSimilarityIndexMeasure(data_range=1.0)
    return metric(pred, target)
217
+
218
def psnr(y_pred, y):
    """Average per-frame PSNR over (B, T, C, H, W) sequences in [0, 1]."""
    target, pred = to_cpu_tensor(y, y_pred)
    batch, steps, chans, height, width = target.shape
    flat = (batch * steps, chans, height, width)
    target = target.reshape(flat)
    pred = pred.reshape(flat)
    total = target.shape[0]
    acc_score = 0
    # Average frame-level scores; a fresh metric object per frame, as before.
    for idx in range(total):
        metric = torchmetrics.image.PeakSignalNoiseRatio(data_range=1.0)
        acc_score += metric(pred[idx], target[idx]) / total
    return acc_score
227
+
228
GLOBAL_LPIPS_OBJ = None # module-level cache: the LPIPS network is built once and reused
def lpips64(y_pred, y, net='vgg'):
    """LPIPS perceptual distance at 64x64 resolution, averaged over frames.

    Merges the (B, T) leading dims, bicubically resizes to 64x64, rescales
    [0, 1] inputs to the [-1, 1] range LPIPS expects, and returns the mean
    distance.

    NOTE(review): ``net`` and the device are honoured only on the first
    call; later calls reuse the cached network regardless of either.
    """
    # convert the image range into [-1, 1], assuming the input range to be [0, 1]
    y = merge_leading_dims(y)
    y_pred = merge_leading_dims(y_pred)

    # Bicubic interpolation can overshoot, hence the clamp back into [0, 1].
    y = torch.nn.functional.interpolate(y, (64, 64), mode='bicubic').clamp(0,1)
    y_pred = torch.nn.functional.interpolate(y_pred, (64, 64), mode='bicubic').clamp(0,1)

    y = (2 * y - 1)
    y_pred = (2 * y_pred - 1)
    global GLOBAL_LPIPS_OBJ
    if GLOBAL_LPIPS_OBJ is None:
        GLOBAL_LPIPS_OBJ = lp.LPIPS(net=net).to(y.device)
    return GLOBAL_LPIPS_OBJ(y_pred, y).mean()
243
+
244
def tfpn(y_pred, y, threshold, radius=1):
    '''
    Confusion-matrix counts of a thresholded nowcast.

    Merges the (B, T) leading dims, optionally max-pools with window
    ``radius`` BEFORE thresholding, binarises both maps at ``threshold``,
    and returns (tp, tn, fp, fn) as CPU scalars.

    NOTE(review): pooling precedes thresholding here, while tfpn_pool
    thresholds first -- confirm which ordering is intended when radius > 1.
    '''
    y = merge_leading_dims(y)
    y_pred = merge_leading_dims(y_pred)
    with torch.no_grad():
        if radius > 1:
            # Non-overlapping max pooling (stride defaults to the window size).
            pool = nn.MaxPool2d(radius)
            y = pool(y)
            y_pred = pool(y_pred)
        y = torch.where(y >= threshold, 1, 0)
        y_pred = torch.where(y_pred >= threshold, 1, 0)
        # Inputs are already 0/1 integers here, so the extra `threshold`
        # argument is presumably kept only for API symmetry.
        mat = torchmetrics.functional.confusion_matrix(y_pred, y, task='binary', threshold=threshold)
        (tn, fp), (fn, tp) = to_cpu_tensor(mat)
        return tp, tn, fp, fn
260
+
261
def tfpn_pool(y_pred, y, threshold, radius):
    '''
    Confusion-matrix counts with threshold-then-pool ordering.

    Binarises both maps at ``threshold`` first, then max-pools the 0/1
    maps with window ``radius`` and stride ``radius // 4`` (falling back
    to ``radius``, i.e. non-overlapping windows, when radius < 4).
    Returns (tp, tn, fp, fn) as CPU scalars.
    '''
    y_pred = merge_leading_dims(y_pred)
    y = merge_leading_dims(y)
    pool = nn.MaxPool2d(radius, stride=radius//4 if radius//4 > 0 else radius)
    with torch.no_grad():
        # Binarise first (floats so MaxPool2d accepts them), then pool.
        y = torch.where(y>=threshold, 1, 0).float()
        y_pred = torch.where(y_pred>=threshold, 1, 0).float()
        y = pool(y)
        y_pred = pool(y_pred)
        mat = torchmetrics.functional.confusion_matrix(y_pred, y, task='binary', threshold=threshold)
        (tn, fp), (fn, tp) = to_cpu_tensor(mat)
        return tp, tn, fp, fn
273
+
274
def csi(tp, tn, fp, fn):
    '''Critical Success Index: tp / (tp + fn + fp). Larger is better; 0 when undefined.'''
    denom = tp + fn + fp
    # Guard against an empty confusion matrix (all-dry case).
    return 0. if denom < 1e-7 else tp / denom
279
+
280
def hss(tp, tn, fp, fn):
    '''Heidke Skill Score in (-inf, 1]. Larger is better; 0 when undefined.'''
    expected = (tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)
    if expected == 0:
        return 0.
    return 2 * (tp * tn - fp * fn) / expected
285
+
286
+ # =======================================================================
287
+ # Data Visualization
288
+ # =======================================================================
289
+
290
def torch_visualize(sequences, savedir=None, horizontal=10, vmin=0, vmax=1):
    '''
    Plot one or more image sequences as rows of grayscale frames.

    input: sequences, a list/dict of numpy/torch arrays with shape (B, T, C, H, W)
           C is assumed to be 1 and squeezed
           If batch > 1, only the first sequence will be printed
    savedir: optional output path; a '.png' extension is appended when
             missing. When None/empty, the figure is shown interactively.

    Fix: the old save expression `savedir + '' if savedir.endswith('.png')
    else '.png'` parenthesizes as `(savedir + '') if ... else '.png'`, so a
    path without the extension was silently saved to a file literally named
    '.png', discarding `savedir`.
    '''
    # First pass: compute the vertical height and convert to proper format
    vertical = 0
    display_texts = []
    if isinstance(sequences, dict):
        temp = []
        for k, v in sequences.items():
            vertical += int(np.ceil(v.shape[1] / horizontal))
            temp.append(v)
            display_texts.append(k)
        sequences = temp
    else:
        for i, sequence in enumerate(sequences):
            vertical += int(np.ceil(sequence.shape[1] / horizontal))
            display_texts.append(f'Item {i+1}')
    # NOTE(review): with exactly one sequence, to_cpu_tensor returns the
    # tensor itself (not a 1-list); the loop below then iterates its batch
    # dim -- pre-existing behavior, kept as-is.
    sequences = to_cpu_tensor(*sequences)
    # Plot the sequences
    j = 0
    fig, axes = plt.subplots(vertical, horizontal, figsize=(2*horizontal, 2*vertical), tight_layout=True)
    plt.setp(axes, xticks=[], yticks=[])
    for k, sequence in enumerate(sequences):
        # only take the first batch, now seq[0] is the temporal dim
        sequence = sequence[0].squeeze() # (T, H, W)
        axes[j, 0].set_ylabel(display_texts[k])
        for i, frame in enumerate(sequence):
            # Wrap long sequences onto multiple grid rows.
            j_shift = j + i // horizontal
            i_shift = i % horizontal
            axes[j_shift, i_shift].imshow(frame, vmin=vmin, vmax=vmax, cmap='gray')
        j += int(np.ceil(sequence.shape[0] / horizontal))
    if savedir:
        plt.savefig(savedir if savedir.endswith('.png') else savedir + '.png')
        plt.close()
    else:
        plt.show()
329
+
330
+ """ Visualize function with colorbar and a line seprate input and output """
331
def color_visualize(sequences, savedir='', horizontal=5, skip=1, ypos=0):
    '''
    Plot sequences with the VIL radar palette, a colorbar and a dashed line
    separating the input row from the prediction rows.

    input: sequences, a list/dict of numpy/torch arrays with shape (B, T, C, H, W)
    C is assumed to be 1 and squeezed
    If batch > 1, only the first sequence will be printed
    savedir: path to save the figure to; falsy shows it interactively.
    skip: temporal stride used only for the $t+k$ axis labels.
    ypos: figure-fraction height of the divider line; 0 places it just
        below the first (input) row.
    '''
    plt.style.use(['science', 'no-latex'])  # requires the scienceplots package
    # Colors/levels for pixel values scaled to [0, 255] below.
    VIL_COLORS = [[0, 0, 0],
                  [0.30196078431372547, 0.30196078431372547, 0.30196078431372547],
                  [0.1568627450980392, 0.7450980392156863, 0.1568627450980392],
                  [0.09803921568627451, 0.5882352941176471, 0.09803921568627451],
                  [0.0392156862745098, 0.4117647058823529, 0.0392156862745098],
                  [0.0392156862745098, 0.29411764705882354, 0.0392156862745098],
                  [0.9607843137254902, 0.9607843137254902, 0.0],
                  [0.9294117647058824, 0.6745098039215687, 0.0],
                  [0.9411764705882353, 0.43137254901960786, 0.0],
                  [0.6274509803921569, 0.0, 0.0],
                  [0.9058823529411765, 0.0, 1.0]]

    VIL_LEVELS = [0.0, 16.0, 31.0, 59.0, 74.0, 100.0, 133.0, 160.0, 181.0, 219.0, 255.0]

    # First pass: compute the vertical height and convert to proper format
    vertical = 0
    display_texts = []
    if (type(sequences) is dict):
        temp = []
        for k, v in sequences.items():
            vertical += int(np.ceil(v.shape[1] / horizontal))
            temp.append(v)
            display_texts.append(k)
        sequences = temp
    else:
        for i, sequence in enumerate(sequences):
            vertical += int(np.ceil(sequence.shape[1] / horizontal))
            display_texts.append(f'Item {i+1}')
    sequences = to_cpu_tensor(*sequences)
    # Plot the sequences
    j = 0
    fig, axes = plt.subplots(vertical, horizontal, figsize=(2*horizontal, 2*vertical), tight_layout=True)
    plt.subplots_adjust(hspace=0.0, wspace=0.0) # tight layout
    plt.setp(axes, xticks=[], yticks=[])
    for k, sequence in enumerate(sequences):
        # only take the first batch, now seq[0] is the temporal dim
        sequence = sequence[0].squeeze() # (T, H, W)

        ## =================
        # = labels of time =
        # First row is labelled $t-..0$ (inputs), last row $t+1..$ (outputs).
        if k == 0:
            for i in range(len(sequence)):
                axes[j, i].set_xlabel(f'$t-{(len(sequence)-i)-1}$', fontsize=16)
                axes[j, i].xaxis.set_label_position('top')
        elif k == len(sequences)-1:
            for i in range(len(sequence)):
                axes[j, i].set_xlabel(f'$t+{skip*i+1}$', fontsize=16)
                axes[j, i].xaxis.set_label_position('bottom')
        ## =================
        axes[j, 0].set_ylabel(display_texts[k], fontsize=16)
        for i, frame in enumerate(sequence):
            # Wrap long sequences onto multiple grid rows.
            j_shift = j + i // horizontal
            i_shift = i % horizontal
            # Frames are in [0, 1]; scale to [0, 255] to match VIL_LEVELS.
            im = axes[j_shift, i_shift].imshow(frame*255, cmap=ListedColormap(VIL_COLORS), \
                norm=BoundaryNorm(VIL_LEVELS, ListedColormap(VIL_COLORS).N))
        j += int(np.ceil(sequence.shape[0] / horizontal))

    ## = plot splitting line =
    if ypos == 0:
        # Auto-place the divider just under the first (input) row.
        ypos = 1 - 1 / len(sequences) - 0.017
    fig.lines.append(Line2D((0, 1), (ypos, ypos), transform=fig.transFigure, ls='--', linewidth=2, color='#444'))
    # color bar
    cax = fig.add_axes([1, 0.05, 0.02, 0.5])
    fig.colorbar(im, cax=cax)
    ## =================
    if savedir:
        # NOTE(review): under `if savedir:` the conditional below always
        # evaluates to `savedir` itself; presumably an 'out.png' fallback
        # was intended -- confirm before changing.
        plt.savefig(savedir + '' if len(savedir)>0 else 'out.png')
        plt.close()
    else:
        plt.show()
408
+
409
+ from tempfile import NamedTemporaryFile
410
+
411
+ """ Visualize function with colorbar and a line seprate input and output """
412
def gradio_visualize(sequences, horizontal=5, skip=1, ypos=0):
    '''
    Render sequences with the VIL radar palette to a temporary PNG for Gradio.

    input: sequences, a list/dict of numpy/torch arrays; unlike
    color_visualize this applies ``.squeeze()`` without taking ``[0]``, so
    it presumably expects a size-1 batch dim (squeezed away) -- confirm at
    the caller.
    C is assumed to be 1 and squeezed.

    Returns the path of a NamedTemporaryFile created with delete=False; the
    caller (or the OS temp cleanup) owns its deletion.
    '''
    plt.style.use(['science', 'no-latex'])  # requires the scienceplots package
    # Colors/levels for pixel values scaled to [0, 255] below.
    VIL_COLORS = [[0, 0, 0],
                  [0.30196078431372547, 0.30196078431372547, 0.30196078431372547],
                  [0.1568627450980392, 0.7450980392156863, 0.1568627450980392],
                  [0.09803921568627451, 0.5882352941176471, 0.09803921568627451],
                  [0.0392156862745098, 0.4117647058823529, 0.0392156862745098],
                  [0.0392156862745098, 0.29411764705882354, 0.0392156862745098],
                  [0.9607843137254902, 0.9607843137254902, 0.0],
                  [0.9294117647058824, 0.6745098039215687, 0.0],
                  [0.9411764705882353, 0.43137254901960786, 0.0],
                  [0.6274509803921569, 0.0, 0.0],
                  [0.9058823529411765, 0.0, 1.0]]

    VIL_LEVELS = [0.0, 16.0, 31.0, 59.0, 74.0, 100.0, 133.0, 160.0, 181.0, 219.0, 255.0]

    # First pass: compute the vertical height and convert to proper format
    vertical = 0
    display_texts = []
    if (type(sequences) is dict):
        temp = []
        for k, v in sequences.items():
            vertical += int(np.ceil(v.shape[1] / horizontal))
            temp.append(v)
            display_texts.append(k)
        sequences = temp
    else:
        for i, sequence in enumerate(sequences):
            vertical += int(np.ceil(sequence.shape[1] / horizontal))
            display_texts.append(f'Item {i+1}')
    sequences = to_cpu_tensor(*sequences)
    # Plot the sequences
    j = 0
    fig, axes = plt.subplots(vertical, horizontal, figsize=(2*horizontal, 2*vertical), tight_layout=True)
    plt.subplots_adjust(hspace=0.0, wspace=0.0) # tight layout
    plt.setp(axes, xticks=[], yticks=[])
    for k, sequence in enumerate(sequences):
        # Squeeze singleton dims (expects the batch dim to vanish here).
        sequence = sequence.squeeze() # (T, H, W)

        ## =================
        # = labels of time =
        # First row is labelled $t-..0$ (inputs), last row $t+1..$ (outputs).
        if k == 0:
            for i in range(len(sequence)):
                axes[j, i].set_xlabel(f'$t-{(len(sequence)-i)-1}$', fontsize=16)
                axes[j, i].xaxis.set_label_position('top')
        elif k == len(sequences)-1:
            for i in range(len(sequence)):
                axes[j, i].set_xlabel(f'$t+{skip*i+1}$', fontsize=16)
                axes[j, i].xaxis.set_label_position('bottom')
        ## =================
        axes[j, 0].set_ylabel(display_texts[k], fontsize=16)
        for i, frame in enumerate(sequence):
            # Wrap long sequences onto multiple grid rows.
            j_shift = j + i // horizontal
            i_shift = i % horizontal
            # Frames are in [0, 1]; scale to [0, 255] to match VIL_LEVELS.
            im = axes[j_shift, i_shift].imshow(frame*255, cmap=ListedColormap(VIL_COLORS), \
                norm=BoundaryNorm(VIL_LEVELS, ListedColormap(VIL_COLORS).N))
        j += int(np.ceil(sequence.shape[0] / horizontal))

    ## = plot splitting line =
    if ypos == 0:
        # Auto-place the divider just under the first (input) row.
        ypos = 1 - 1 / len(sequences) - 0.017
    fig.lines.append(Line2D((0, 1), (ypos, ypos), transform=fig.transFigure, ls='--', linewidth=2, color='#444'))
    # color bar
    cax = fig.add_axes([1, 0.05, 0.02, 0.5])
    fig.colorbar(im, cax=cax)

    # Save the figure to a temporary file
    with NamedTemporaryFile(suffix=".png", delete=False) as ff:
        fig.savefig(ff.name)
        file_path = ff.name

    # It's important to close the figure to prevent memory leaks
    plt.close(fig)

    return file_path
493
+
494
+ import matplotlib.animation as animation
495
+
496
def gradio_gif(sequences, T):
    '''
    Render an ensemble of sequences as a side-by-side animated GIF for Gradio.

    input: sequences, a dict mapping names to arrays (iterated via
    .values(); one subplot per entry, labelled "Ensemble i"). Each entry is
    indexed as sequence[t] then squeezed, so frames are presumed (T, 1, H, W)
    or (T, H, W) -- confirm at the caller.
    T: number of animation frames to render.

    Returns the path of a NamedTemporaryFile created with delete=False; the
    caller (or the OS temp cleanup) owns its deletion. Requires the pillow
    writer for GIF output.
    '''
    plt.style.use(['science', 'no-latex'])  # requires the scienceplots package
    # Colors/levels for pixel values scaled to [0, 255] below.
    VIL_COLORS = [[0, 0, 0],
                  [0.30196078431372547, 0.30196078431372547, 0.30196078431372547],
                  [0.1568627450980392, 0.7450980392156863, 0.1568627450980392],
                  [0.09803921568627451, 0.5882352941176471, 0.09803921568627451],
                  [0.0392156862745098, 0.4117647058823529, 0.0392156862745098],
                  [0.0392156862745098, 0.29411764705882354, 0.0392156862745098],
                  [0.9607843137254902, 0.9607843137254902, 0.0],
                  [0.9294117647058824, 0.6745098039215687, 0.0],
                  [0.9411764705882353, 0.43137254901960786, 0.0],
                  [0.6274509803921569, 0.0, 0.0],
                  [0.9058823529411765, 0.0, 1.0]]

    VIL_LEVELS = [0.0, 16.0, 31.0, 59.0, 74.0, 100.0, 133.0, 160.0, 181.0, 219.0, 255.0]

    horizontal = len(sequences)
    fig_size = 3
    fig, axes = plt.subplots(nrows=1, ncols=horizontal, figsize=(fig_size*horizontal, fig_size), tight_layout=True)
    plt.subplots_adjust(hspace=0.0, wspace=0.0) # tight layout
    plt.setp(axes, xticks=[], yticks=[])
    # Draw the first frame of every ensemble member as the initial image.
    for i, sequence in enumerate(sequences.values()):
        axes[i].set_xticks([])
        axes[i].set_yticks([])
        axes[i].set_xlabel(f'Ensemble {i+1}', fontsize=12)
        frame = sequence[0].squeeze()
        im = axes[i].imshow(frame*255, cmap=ListedColormap(VIL_COLORS), \
            norm=BoundaryNorm(VIL_LEVELS, ListedColormap(VIL_COLORS).N), animated=True)

    title = fig.suptitle('', y=0.9, x=0.505, fontsize=16) # Initialize an empty super title

    # Colorbar taken from the last image drawn above.
    fig.colorbar(im)

    def animate(t):
        # Redraw every panel with frame t (a new imshow per frame).
        for i, sequence in enumerate(sequences.values()):
            frame = sequence[t].squeeze()
            im = axes[i].imshow(frame*255, cmap=ListedColormap(VIL_COLORS), \
                norm=BoundaryNorm(VIL_LEVELS, ListedColormap(VIL_COLORS).N), animated=True)
        plt.subplots_adjust(hspace=0.0, wspace=0.0) # tight layout

        title.set_text(f'$t + {t}$') # update the title text

        return fig,

    ani = animation.FuncAnimation(fig, animate, frames=T, interval=750, blit=True, repeat_delay=50,)

    # Save the figure to a temporary file
    with NamedTemporaryFile(suffix=".gif", delete=False) as ff:
        ani.save(ff.name, writer='pillow', fps=5)
        file_path = ff.name

    plt.close()
    return file_path
554
+
555
+ # import matplotlib.pyplot as plt
556
+ # import matplotlib.animation as animation
557
+ # def make_gif(frames, save_path):
558
+ # fig, ax = plt.subplots(figsize=(4,4))
559
+ # im = ax.imshow(frames[0].squeeze(), cmap='gray', vmin=0, vmax=1, animated=True)
560
+ # ax.set_axis_off()
561
+
562
+ # def update(i):
563
+ # im.set_array(frames[i].squeeze())
564
+ # return im,
565
+ # animation_fig =
566
+ # animation_fig.save(f"./{save_path}.gif")