Add files using upload-large-folder tool
Browse files- Rectified_Noise/GVP-Disp/W_No.log +5 -0
- Rectified_Noise/GVP-Disp/W_True_0.15.log +5 -0
- Rectified_Noise/GVP-Disp/W_True_0.5.log +5 -0
- Rectified_Noise/GVP-Disp/download.py +41 -0
- Rectified_Noise/GVP-Disp/environment.yml +16 -0
- Rectified_Noise/GVP-Disp/models.py +647 -0
- Rectified_Noise/GVP-Disp/sample_ddp.py +233 -0
- Rectified_Noise/GVP-Disp/sample_rectified_noise.py +380 -0
- Rectified_Noise/GVP-Disp/train_utils.py +35 -0
- Rectified_Noise/GVP-Disp/w_training1_VP.log +628 -0
- Rectified_Noise/GVP-Disp/权重类型分析.md +133 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000032.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000077.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000133.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000161.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000220.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000331.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000387.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000505.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000517.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000551.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000726.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000817.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000865.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000914.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000940.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/001043.png +0 -0
- Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/001210.png +0 -0
- SiT_back/SiT_clean/W_training.log +110 -0
- SiT_back/SiT_clean/__pycache__/download.cpython-312.pyc +0 -0
- SiT_back/SiT_clean/__pycache__/models.cpython-312.pyc +0 -0
- SiT_back/SiT_clean/__pycache__/train_utils.cpython-312.pyc +0 -0
- SiT_back/SiT_clean/download.py +40 -0
- SiT_back/SiT_clean/models.py +370 -0
- SiT_back/SiT_clean/run.sh +0 -0
- SiT_back/SiT_clean/sample.py +144 -0
- SiT_back/SiT_clean/sample_ddp.py +233 -0
- SiT_back/SiT_clean/train.py +371 -0
- SiT_back/SiT_clean/train_utils.py +32 -0
- SiT_back/SiT_clean/transport/__init__.py +65 -0
- SiT_back/SiT_clean/transport/__pycache__/__init__.cpython-312.pyc +0 -0
- SiT_back/SiT_clean/transport/__pycache__/integrators.cpython-312.pyc +0 -0
- SiT_back/SiT_clean/transport/__pycache__/path.cpython-312.pyc +0 -0
- SiT_back/SiT_clean/transport/__pycache__/transport.cpython-312.pyc +0 -0
- SiT_back/SiT_clean/transport/__pycache__/utils.cpython-312.pyc +0 -0
- SiT_back/SiT_clean/transport/integrators.py +115 -0
- SiT_back/SiT_clean/transport/path.py +192 -0
- SiT_back/SiT_clean/transport/transport.py +440 -0
- SiT_back/SiT_clean/transport/utils.py +29 -0
- SiT_back/SiT_clean/wandb_utils.py +55 -0
Rectified_Noise/GVP-Disp/W_No.log
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 0 |
0%| | 0/47 [00:00<?, ?it/s]
|
| 1 |
2%|▏ | 1/47 [02:10<1:39:52, 130.26s/it]
|
| 2 |
4%|▍ | 2/47 [04:19<1:37:18, 129.75s/it]
|
| 3 |
6%|▋ | 3/47 [06:29<1:35:03, 129.63s/it]
|
| 4 |
9%|▊ | 4/47 [08:38<1:32:51, 129.58s/it]
|
| 5 |
11%|█ | 5/47 [10:48<1:30:41, 129.55s/it]
|
| 6 |
13%|█▎ | 6/47 [12:57<1:28:31, 129.54s/it]
|
| 7 |
15%|█▍ | 7/47 [15:07<1:26:22, 129.56s/it]
|
| 8 |
17%|█▋ | 8/47 [17:14<1:23:44, 128.82s/it]
|
| 9 |
19%|█▉ | 9/47 [19:20<1:21:01, 127.93s/it]
|
| 10 |
21%|██▏ | 10/47 [21:29<1:19:01, 128.15s/it]
|
| 11 |
23%|██▎ | 11/47 [23:38<1:17:05, 128.47s/it]
|
| 12 |
26%|██▌ | 12/47 [25:47<1:15:03, 128.67s/it]
|
| 13 |
28%|██▊ | 13/47 [27:56<1:13:02, 128.91s/it]
|
|
|
|
| 1 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 2 |
+
Starting rank=0, seed=0, world_size=1.
|
| 3 |
+
Saving .png samples at GVP_samples/depth-mu-2-threshold-0.0-0025000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04
|
| 4 |
+
Total number of images that will be sampled: 3008
|
| 5 |
+
|
| 6 |
0%| | 0/47 [00:00<?, ?it/s]
|
| 7 |
2%|▏ | 1/47 [02:10<1:39:52, 130.26s/it]
|
| 8 |
4%|▍ | 2/47 [04:19<1:37:18, 129.75s/it]
|
| 9 |
6%|▋ | 3/47 [06:29<1:35:03, 129.63s/it]
|
| 10 |
9%|▊ | 4/47 [08:38<1:32:51, 129.58s/it]
|
| 11 |
11%|█ | 5/47 [10:48<1:30:41, 129.55s/it]
|
| 12 |
13%|█▎ | 6/47 [12:57<1:28:31, 129.54s/it]
|
| 13 |
15%|█▍ | 7/47 [15:07<1:26:22, 129.56s/it]
|
| 14 |
17%|█▋ | 8/47 [17:14<1:23:44, 128.82s/it]
|
| 15 |
19%|█▉ | 9/47 [19:20<1:21:01, 127.93s/it]
|
| 16 |
21%|██▏ | 10/47 [21:29<1:19:01, 128.15s/it]
|
| 17 |
23%|██▎ | 11/47 [23:38<1:17:05, 128.47s/it]
|
| 18 |
26%|██▌ | 12/47 [25:47<1:15:03, 128.67s/it]
|
| 19 |
28%|██▊ | 13/47 [27:56<1:13:02, 128.91s/it]
|
Rectified_Noise/GVP-Disp/W_True_0.15.log
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 0 |
0%| | 0/47 [00:00<?, ?it/s]
|
| 1 |
2%|▏ | 1/47 [01:34<1:12:45, 94.90s/it]
|
| 2 |
4%|▍ | 2/47 [03:08<1:10:43, 94.31s/it]
|
| 3 |
6%|▋ | 3/47 [04:42<1:09:03, 94.17s/it]
|
| 4 |
9%|▊ | 4/47 [06:16<1:07:23, 94.04s/it]
|
| 5 |
11%|█ | 5/47 [07:50<1:05:47, 93.99s/it]
|
| 6 |
13%|█▎ | 6/47 [09:24<1:04:09, 93.88s/it]
|
| 7 |
15%|█▍ | 7/47 [10:58<1:02:34, 93.85s/it]
|
| 8 |
17%|█▋ | 8/47 [12:31<1:00:57, 93.79s/it]
|
| 9 |
19%|█▉ | 9/47 [14:05<59:23, 93.77s/it]
|
| 10 |
21%|██▏ | 10/47 [15:39<57:48, 93.75s/it]
|
| 11 |
23%|██▎ | 11/47 [17:12<56:15, 93.76s/it]
|
| 12 |
26%|██▌ | 12/47 [18:46<54:41, 93.76s/it]
|
| 13 |
28%|██▊ | 13/47 [20:20<53:08, 93.78s/it]
|
| 14 |
30%|██▉ | 14/47 [21:54<51:36, 93.82s/it]
|
| 15 |
32%|███▏ | 15/47 [23:28<50:01, 93.80s/it]
|
| 16 |
34%|███▍ | 16/47 [25:01<48:25, 93.71s/it]
|
| 17 |
36%|███▌ | 17/47 [26:35<46:53, 93.77s/it]
|
| 18 |
38%|███▊ | 18/47 [28:09<45:19, 93.78s/it]
|
| 19 |
40%|████ | 19/47 [29:43<43:45, 93.77s/it]
|
|
|
|
| 1 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 2 |
+
Starting rank=0, seed=0, world_size=1.
|
| 3 |
+
Saving .png samples at GVP_samples/depth-mu-2-threshold-0.15-0025000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04
|
| 4 |
+
Total number of images that will be sampled: 3008
|
| 5 |
+
|
| 6 |
0%| | 0/47 [00:00<?, ?it/s]
|
| 7 |
2%|▏ | 1/47 [01:34<1:12:45, 94.90s/it]
|
| 8 |
4%|▍ | 2/47 [03:08<1:10:43, 94.31s/it]
|
| 9 |
6%|▋ | 3/47 [04:42<1:09:03, 94.17s/it]
|
| 10 |
9%|▊ | 4/47 [06:16<1:07:23, 94.04s/it]
|
| 11 |
11%|█ | 5/47 [07:50<1:05:47, 93.99s/it]
|
| 12 |
13%|█▎ | 6/47 [09:24<1:04:09, 93.88s/it]
|
| 13 |
15%|█▍ | 7/47 [10:58<1:02:34, 93.85s/it]
|
| 14 |
17%|█▋ | 8/47 [12:31<1:00:57, 93.79s/it]
|
| 15 |
19%|█▉ | 9/47 [14:05<59:23, 93.77s/it]
|
| 16 |
21%|██▏ | 10/47 [15:39<57:48, 93.75s/it]
|
| 17 |
23%|██▎ | 11/47 [17:12<56:15, 93.76s/it]
|
| 18 |
26%|██▌ | 12/47 [18:46<54:41, 93.76s/it]
|
| 19 |
28%|██▊ | 13/47 [20:20<53:08, 93.78s/it]
|
| 20 |
30%|██▉ | 14/47 [21:54<51:36, 93.82s/it]
|
| 21 |
32%|███▏ | 15/47 [23:28<50:01, 93.80s/it]
|
| 22 |
34%|███▍ | 16/47 [25:01<48:25, 93.71s/it]
|
| 23 |
36%|███▌ | 17/47 [26:35<46:53, 93.77s/it]
|
| 24 |
38%|███▊ | 18/47 [28:09<45:19, 93.78s/it]
|
| 25 |
40%|████ | 19/47 [29:43<43:45, 93.77s/it]
|
Rectified_Noise/GVP-Disp/W_True_0.5.log
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 0 |
0%| | 0/47 [00:00<?, ?it/s]
|
| 1 |
2%|▏ | 1/47 [01:34<1:12:36, 94.71s/it]
|
| 2 |
4%|▍ | 2/47 [03:08<1:10:34, 94.11s/it]
|
| 3 |
6%|▋ | 3/47 [04:42<1:08:54, 93.97s/it]
|
| 4 |
9%|▊ | 4/47 [06:15<1:07:14, 93.82s/it]
|
| 5 |
11%|█ | 5/47 [07:49<1:05:39, 93.79s/it]
|
| 6 |
13%|█▎ | 6/47 [09:23<1:04:03, 93.75s/it]
|
| 7 |
15%|█▍ | 7/47 [10:57<1:02:31, 93.79s/it]
|
| 8 |
17%|█▋ | 8/47 [12:30<1:00:56, 93.75s/it]
|
| 9 |
19%|█▉ | 9/47 [14:04<59:21, 93.73s/it]
|
| 10 |
21%|██▏ | 10/47 [15:38<57:47, 93.73s/it]
|
| 11 |
23%|██▎ | 11/47 [17:11<56:14, 93.74s/it]
|
| 12 |
26%|██▌ | 12/47 [18:45<54:40, 93.74s/it]
|
| 13 |
28%|██▊ | 13/47 [20:19<53:06, 93.72s/it]
|
| 14 |
30%|██▉ | 14/47 [21:53<51:34, 93.77s/it]
|
| 15 |
32%|███▏ | 15/47 [23:26<50:00, 93.77s/it]
|
| 16 |
34%|███▍ | 16/47 [25:00<48:24, 93.70s/it]
|
| 17 |
36%|███▌ | 17/47 [26:34<46:53, 93.77s/it]
|
| 18 |
38%|███▊ | 18/47 [28:08<45:19, 93.79s/it]
|
| 19 |
40%|████ | 19/47 [29:42<43:45, 93.78s/it]
|
|
|
|
| 1 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 2 |
+
Starting rank=0, seed=0, world_size=1.
|
| 3 |
+
Saving .png samples at GVP_samples/depth-mu-2-threshold-0.5-0025000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04
|
| 4 |
+
Total number of images that will be sampled: 3008
|
| 5 |
+
|
| 6 |
0%| | 0/47 [00:00<?, ?it/s]
|
| 7 |
2%|▏ | 1/47 [01:34<1:12:36, 94.71s/it]
|
| 8 |
4%|▍ | 2/47 [03:08<1:10:34, 94.11s/it]
|
| 9 |
6%|▋ | 3/47 [04:42<1:08:54, 93.97s/it]
|
| 10 |
9%|▊ | 4/47 [06:15<1:07:14, 93.82s/it]
|
| 11 |
11%|█ | 5/47 [07:49<1:05:39, 93.79s/it]
|
| 12 |
13%|█▎ | 6/47 [09:23<1:04:03, 93.75s/it]
|
| 13 |
15%|█▍ | 7/47 [10:57<1:02:31, 93.79s/it]
|
| 14 |
17%|█▋ | 8/47 [12:30<1:00:56, 93.75s/it]
|
| 15 |
19%|█▉ | 9/47 [14:04<59:21, 93.73s/it]
|
| 16 |
21%|██▏ | 10/47 [15:38<57:47, 93.73s/it]
|
| 17 |
23%|██▎ | 11/47 [17:11<56:14, 93.74s/it]
|
| 18 |
26%|██▌ | 12/47 [18:45<54:40, 93.74s/it]
|
| 19 |
28%|██▊ | 13/47 [20:19<53:06, 93.72s/it]
|
| 20 |
30%|██▉ | 14/47 [21:53<51:34, 93.77s/it]
|
| 21 |
32%|███▏ | 15/47 [23:26<50:00, 93.77s/it]
|
| 22 |
34%|███▍ | 16/47 [25:00<48:24, 93.70s/it]
|
| 23 |
36%|███▌ | 17/47 [26:34<46:53, 93.77s/it]
|
| 24 |
38%|███▊ | 18/47 [28:08<45:19, 93.79s/it]
|
| 25 |
40%|████ | 19/47 [29:42<43:45, 93.78s/it]
|
Rectified_Noise/GVP-Disp/download.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This source code is licensed under the license found in the
|
| 2 |
+
# LICENSE file in the root directory of this source tree.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Functions for downloading pre-trained SiT models
|
| 6 |
+
"""
|
| 7 |
+
from torchvision.datasets.utils import download_url
|
| 8 |
+
import torch
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
pretrained_models = {'SiT-XL-2-256x256.pt'}
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def find_model(model_name):
|
| 16 |
+
"""
|
| 17 |
+
Finds a pre-trained SiT model, downloading it if necessary. Alternatively, loads a model from a local path.
|
| 18 |
+
"""
|
| 19 |
+
if model_name in pretrained_models:
|
| 20 |
+
return download_model(model_name)
|
| 21 |
+
else:
|
| 22 |
+
assert os.path.isfile(model_name), f'Could not find SiT checkpoint at {model_name}'
|
| 23 |
+
checkpoint = torch.load(model_name, map_location=lambda storage, loc: storage, weights_only=False)
|
| 24 |
+
if "ema" in checkpoint: # supports checkpoints from train.py
|
| 25 |
+
checkpoint = checkpoint["ema"]
|
| 26 |
+
return checkpoint
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def download_model(model_name):
|
| 30 |
+
"""
|
| 31 |
+
Downloads a pre-trained SiT model from the web.
|
| 32 |
+
"""
|
| 33 |
+
assert model_name in pretrained_models
|
| 34 |
+
local_path = f'pretrained_models/{model_name}'
|
| 35 |
+
if not os.path.isfile(local_path):
|
| 36 |
+
os.makedirs('pretrained_models', exist_ok=True)
|
| 37 |
+
web_path = f'https://www.dl.dropboxusercontent.com/scl/fi/as9oeomcbub47de5g4be0/SiT-XL-2-256.pt?rlkey=uxzxmpicu46coq3msb17b9ofa&dl=0'
|
| 38 |
+
download_url(web_path, 'pretrained_models', filename=model_name)
|
| 39 |
+
model = torch.load(local_path, map_location=lambda storage, loc: storage, weights_only=False)
|
| 40 |
+
return model
|
| 41 |
+
|
Rectified_Noise/GVP-Disp/environment.yml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: RN
|
| 2 |
+
channels:
|
| 3 |
+
- pytorch
|
| 4 |
+
- nvidia
|
| 5 |
+
dependencies:
|
| 6 |
+
- python >= 3.8
|
| 7 |
+
- pytorch >= 1.13
|
| 8 |
+
- torchvision
|
| 9 |
+
- pytorch-cuda >=11.7
|
| 10 |
+
- pip
|
| 11 |
+
- pip:
|
| 12 |
+
- timm
|
| 13 |
+
- diffusers
|
| 14 |
+
- accelerate
|
| 15 |
+
- torchdiffeq
|
| 16 |
+
- wandb
|
Rectified_Noise/GVP-Disp/models.py
ADDED
|
@@ -0,0 +1,647 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This source code is licensed under the license found in the
|
| 2 |
+
# LICENSE file in the root directory of this source tree.
|
| 3 |
+
# --------------------------------------------------------
|
| 4 |
+
# References:
|
| 5 |
+
# GLIDE: https://github.com/openai/glide-text2im
|
| 6 |
+
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
|
| 7 |
+
# --------------------------------------------------------
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
import numpy as np
|
| 12 |
+
import math
|
| 13 |
+
from timm.models.vision_transformer import PatchEmbed, Attention, Mlp
|
| 14 |
+
|
| 15 |
+
|
def modulate(x, shift, scale):
    """Per-sample affine modulation: x * (1 + scale) + shift, broadcast over the token axis."""
    scaled = x * (scale.unsqueeze(1) + 1)
    return scaled + shift.unsqueeze(1)
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#################################################################################
|
| 21 |
+
# Embedding Layers for Timesteps and Class Labels #
|
| 22 |
+
#################################################################################
|
| 23 |
+
|
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations: a sinusoidal
    frequency encoding followed by a two-layer SiLU MLP.
    """
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element (may be fractional).
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
        ).to(device=t.device)
        args = t[:, None].float() * freqs[None]
        emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            # Pad odd output dimensions with a zero column.
            emb = torch.cat([emb, torch.zeros_like(emb[:, :1])], dim=-1)
        return emb

    def forward(self, t):
        frequencies = self.timestep_embedding(t, self.frequency_embedding_size)
        return self.mlp(frequencies)
| 62 |
+
|
| 63 |
+
|
class LabelEmbedder(nn.Module):
    """
    Embeds class labels into vector representations. Also handles label
    dropout for classifier-free guidance: dropped labels map to an extra
    "null" embedding row at index `num_classes`.
    """
    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        use_cfg_embedding = dropout_prob > 0
        # One extra row for the null (unconditional) token when dropout is enabled.
        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """
        Drops labels to enable classifier-free guidance.
        """
        if force_drop_ids is not None:
            drop_ids = force_drop_ids == 1
        else:
            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
        return torch.where(drop_ids, self.num_classes, labels)

    def forward(self, labels, train, force_drop_ids=None):
        use_dropout = self.dropout_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)
| 92 |
+
|
| 93 |
+
|
| 94 |
+
#################################################################################
|
| 95 |
+
# Core SiT Model #
|
| 96 |
+
#################################################################################
|
| 97 |
+
|
class SiTBlock(nn.Module):
    """
    A SiT transformer block with adaptive layer norm zero (adaLN-Zero)
    conditioning: the conditioning vector yields shift/scale/gate terms
    for both the attention and MLP sub-blocks.
    """
    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=int(hidden_size * mlp_ratio),
            act_layer=approx_gelu,
            drop=0,
        )
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        # Six conditioning terms: shift/scale/gate for attention, then for the MLP.
        mods = self.adaLN_modulation(c).chunk(6, dim=1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mods
        attn_out = self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
        x = x + gate_msa.unsqueeze(1) * attn_out
        mlp_out = self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
        return x + gate_mlp.unsqueeze(1) * mlp_out
| 120 |
+
|
| 121 |
+
|
class FinalLayer(nn.Module):
    """
    The final layer of SiT: adaLN-modulated LayerNorm followed by a linear
    projection from hidden_size to patch_size**2 * out_channels.
    """
    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        modulated = modulate(self.norm_final(x), shift, scale)
        return self.linear(modulated)
| 140 |
+
|
| 141 |
+
|
class SiT(nn.Module):
    """
    Diffusion model with a Transformer backbone, conditioned on timestep and
    class label via adaLN-Zero modulation.
    """
    def __init__(
        self,
        input_size=32,
        patch_size=2,
        in_channels=4,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        num_classes=1000,
        learn_sigma=True,
    ):
        super().__init__()
        # BUG FIX: previously `self.learn_sigma = learn_sigma` was immediately
        # overwritten with True and `out_channels` was doubled unconditionally,
        # so passing learn_sigma=False was silently ignored. Honor the flag;
        # the default (True) preserves the original behavior for all existing
        # callers and checkpoints.
        self.learn_sigma = learn_sigma
        self.in_channels = in_channels
        self.out_channels = in_channels * 2 if learn_sigma else in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads

        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        num_patches = self.x_embedder.num_patches
        # Will use fixed sin-cos embedding:
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)

        self.blocks = nn.ModuleList([
            SiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
        ])
        self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
        self.initialize_weights()

    def initialize_weights(self):
        """Initialize all weights; adaLN and output projections start at zero."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Initialize (and freeze) pos_embed by sin-cos embedding:
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5))
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        nn.init.constant_(self.x_embedder.proj.bias, 0)

        # Initialize label embedding table:
        nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in SiT blocks (blocks start as identity):
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        imgs: (N, C, H, W)
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        h = w = int(x.shape[1] ** 0.5)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        # FIX: width uses w * p (equivalent here since h == w is asserted, but
        # the previous `h * p, h * p` was misleading).
        imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
        return imgs

    def forward(self, x, t, y, return_act=False):
        """
        Forward pass of SiT.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        t: (N,) tensor of diffusion timesteps
        y: (N,) tensor of class labels
        return_act: if True, also return the per-block token activations
        """
        act = []
        x = self.x_embedder(x) + self.pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(t)                   # (N, D)
        y = self.y_embedder(y, self.training)    # (N, D)
        c = t + y                                # (N, D)
        for block in self.blocks:
            x = block(x, c)                      # (N, T, D)
            if return_act:
                act.append(x)
        x = self.final_layer(x, c)               # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)                   # (N, out_channels, H, W)
        if self.learn_sigma:
            # Keep only the predicted mean; drop the learned-sigma channels.
            x, _ = x.chunk(2, dim=1)
        if return_act:
            return x, act
        return x

    def forward_with_cfg(self, x, t, y, cfg_scale):
        """
        Forward pass of SiT, but also batches the unconditional forward pass
        for classifier-free guidance.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        model_out = self.forward(combined, t, y)
        # For exact reproducibility reasons, we apply classifier-free guidance on only
        # three channels by default. The standard approach to cfg applies it to all channels.
        # This can be done by uncommenting the following line and commenting-out the line following that.
        # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
        eps, rest = model_out[:, :3], model_out[:, 3:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        return torch.cat([eps, rest], dim=1)
| 273 |
+
|
| 274 |
+
|
| 275 |
+
#################################################################################
|
| 276 |
+
# Sine/Cosine Positional Embedding Functions #
|
| 277 |
+
#################################################################################
|
| 278 |
+
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
|
| 279 |
+
|
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or
    [extra_tokens+grid_size*grid_size, embed_dim] (zero rows prepended for
    cls/extra tokens when requested)
    """
    coords_h = np.arange(grid_size, dtype=np.float32)
    coords_w = np.arange(grid_size, dtype=np.float32)
    grid = np.stack(np.meshgrid(coords_w, coords_h), axis=0)  # here w goes first
    grid = grid.reshape([2, 1, grid_size, grid_size])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token and extra_tokens > 0:
        padding = np.zeros([extra_tokens, embed_dim])
        pos_embed = np.concatenate([padding, pos_embed], axis=0)
    return pos_embed
| 296 |
+
|
| 297 |
+
|
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Build a 2-D sin-cos embedding: one 1-D embedding per grid axis, each
    using half of embed_dim, concatenated along the feature dimension."""
    assert embed_dim % 2 == 0

    half = embed_dim // 2
    emb_h = get_1d_sincos_pos_embed_from_grid(half, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(half, grid[1])  # (H*W, D/2)

    return np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
| 307 |
+
|
| 308 |
+
|
| 309 |
+
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    Encode scalar positions with a geometric ladder of sine/cosine frequencies.

    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    half = embed_dim // 2
    # Frequencies 10000^(-k/half), identical to the Transformer positional encoding.
    freqs = 1.0 / 10000 ** (np.arange(half, dtype=np.float64) / half)  # (D/2,)
    angles = np.outer(pos.reshape(-1), freqs)  # (M, D/2) outer product
    # Sine channels first, cosine channels second.
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
#################################################################################
|
| 331 |
+
# SiT Configs #
|
| 332 |
+
#################################################################################
|
| 333 |
+
|
| 334 |
+
# Constructor helpers for the standard SiT sizes.  Names follow "SiT-<size>/<patch>":
# XL/L/B/S pick depth/width/heads; the numeric suffix picks the patch size.

def SiT_XL_2(**kwargs):
    return SiT(patch_size=2, depth=28, hidden_size=1152, num_heads=16, **kwargs)

def SiT_XL_4(**kwargs):
    return SiT(patch_size=4, depth=28, hidden_size=1152, num_heads=16, **kwargs)

def SiT_XL_8(**kwargs):
    return SiT(patch_size=8, depth=28, hidden_size=1152, num_heads=16, **kwargs)

def SiT_L_2(**kwargs):
    return SiT(patch_size=2, depth=24, hidden_size=1024, num_heads=16, **kwargs)

def SiT_L_4(**kwargs):
    return SiT(patch_size=4, depth=24, hidden_size=1024, num_heads=16, **kwargs)

def SiT_L_8(**kwargs):
    return SiT(patch_size=8, depth=24, hidden_size=1024, num_heads=16, **kwargs)

def SiT_B_2(**kwargs):
    return SiT(patch_size=2, depth=12, hidden_size=768, num_heads=12, **kwargs)

def SiT_B_4(**kwargs):
    return SiT(patch_size=4, depth=12, hidden_size=768, num_heads=12, **kwargs)

def SiT_B_8(**kwargs):
    return SiT(patch_size=8, depth=12, hidden_size=768, num_heads=12, **kwargs)

def SiT_S_2(**kwargs):
    return SiT(patch_size=2, depth=12, hidden_size=384, num_heads=6, **kwargs)

def SiT_S_4(**kwargs):
    return SiT(patch_size=4, depth=12, hidden_size=384, num_heads=6, **kwargs)

def SiT_S_8(**kwargs):
    return SiT(patch_size=8, depth=12, hidden_size=384, num_heads=6, **kwargs)


# Registry mapping config-name strings (as used on the CLI) to constructors.
SiT_models = {
    'SiT-XL/2': SiT_XL_2, 'SiT-XL/4': SiT_XL_4, 'SiT-XL/8': SiT_XL_8,
    'SiT-L/2':  SiT_L_2,  'SiT-L/4':  SiT_L_4,  'SiT-L/8':  SiT_L_8,
    'SiT-B/2':  SiT_B_2,  'SiT-B/4':  SiT_B_4,  'SiT-B/8':  SiT_B_8,
    'SiT-S/2':  SiT_S_2,  'SiT-S/4':  SiT_S_4,  'SiT-S/8':  SiT_S_8,
}
|
| 377 |
+
|
| 378 |
+
#################################################################################
|
| 379 |
+
# SiTF1, SiTF2, CombinedModel #
|
| 380 |
+
#################################################################################
|
| 381 |
+
|
| 382 |
+
class SiTF1(nn.Module):
    """
    SiTF1: first-stage SiT transformer.

    Standard SiT backbone (patch embedding + timestep/label conditioning +
    adaLN transformer blocks) whose forward pass returns BOTH the final patch
    tokens (consumed by a second-stage model such as SiTF2) and a decoded
    image tensor with the extra channels dropped.
    """
    def __init__(
        self,
        input_size=32,
        patch_size=2,
        in_channels=4,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        num_classes=1000,
        learn_sigma=True,
        final_layer=None,
    ):
        super().__init__()
        self.input_size = input_size
        self.patch_size = patch_size  # was assigned twice in the original; deduplicated
        self.hidden_size = hidden_size
        self.in_channels = in_channels
        # Head emits 2x the input channels; forward() keeps only the first half.
        self.out_channels = in_channels * 2
        self.num_heads = num_heads
        self.learn_sigma = learn_sigma
        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        num_patches = self.x_embedder.num_patches
        # Frozen 2D sin/cos positional embedding (filled in initialize_weights).
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)
        self.blocks = nn.ModuleList([
            SiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
        ])
        # NOTE(review): the `final_layer` parameter is accepted but not used here
        # (unlike SiTF2); kept for signature compatibility.
        self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
        self.initialize_weights()

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        imgs: (N, C, H, W)  (the original docstring said NHWC; the code returns NCHW)
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        h = w = int(x.shape[1] ** 0.5)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        # Use w*p for the width (equivalent for square grids, correct in general).
        imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
        return imgs

    def initialize_weights(self):
        """Xavier-init linears, load the sin/cos pos-embed, zero adaLN modulations."""
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)
        # Overwrite the frozen positional embedding with the sin/cos table.
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5))
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
        # Initialize the patch-embedding projection like a linear layer.
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        nn.init.constant_(self.x_embedder.proj.bias, 0)
        nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        # Zero-init adaLN modulations so every block starts near the identity.
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

    def forward(self, x, t, y):
        """
        x: (N, C, H, W) latent input; t: (N,) timesteps; y: (N,) class labels.
        Returns (patch_tokens, image): final patch tokens (N, T, D) and the
        decoded image (N, in_channels, H, W) with the extra channels dropped.
        """
        x = self.x_embedder(x) + self.pos_embed
        t = self.t_embedder(t)
        y = self.y_embedder(y, self.training)
        c = t + y
        for block in self.blocks:
            x = block(x, c)
        x_now = self.final_layer(x, c)  # (N, T, patch_size ** 2 * out_channels)
        x_now = self.unpatchify(x_now)  # (N, out_channels, H, W)
        x_now, _ = x_now.chunk(2, dim=1)  # keep only the first in_channels channels
        return x, x_now  # patch token (N, T, D)

    def forward_with_cfg(self, x, t, y, cfg_scale):
        """
        Forward pass with classifier-free guidance for SiTF1.
        Applies guidance consistently to both patch tokens and image output (x_now).
        """
        # Take the first half (conditional inputs) and duplicate it so that
        # it can be paired with conditional and unconditional labels in `y`.
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        patch_tokens, x_now = self.forward(combined, t, y)

        # Apply CFG on the image output channels (first 3 channels by default)
        eps, rest = x_now[:, :3, ...], x_now[:, 3:, ...]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        x_now = torch.cat([eps, rest], dim=1)

        # Apply same guidance logic to patch tokens so downstream modules see
        # a consistent guided representation.
        cond_tok, uncond_tok = torch.split(patch_tokens, len(patch_tokens) // 2, dim=0)
        half_tok = uncond_tok + cfg_scale * (cond_tok - uncond_tok)
        patch_tokens = torch.cat([half_tok, half_tok], dim=0)

        return patch_tokens, x_now
|
| 491 |
+
|
| 492 |
+
|
| 493 |
+
class SiTF2(nn.Module):
    """
    SiTF2: second-stage head operating on pre-embedded (possibly concatenated)
    patch tokens.

    Runs a short stack of adaLN SiT blocks, decodes with a FinalLayer, then
    either returns the first half of the channels (learn_sigma=False) or
    interprets the output as (mean, log-variance) and draws a Gaussian sample
    (learn_sigma=True; the mean is included only when learn_mu=True).
    """
    def __init__(
        self,
        input_size=32,
        hidden_size=1152,
        out_channels=8,
        patch_size=2,
        num_heads=16,
        mlp_ratio=4.0,
        depth=4,
        learn_sigma=True,
        final_layer=None,
        num_classes=1000,
        class_dropout_prob=0.1,
        learn_mu=False,
    ):
        super().__init__()
        self.learn_sigma = learn_sigma
        self.learn_mu = learn_mu
        self.out_channels = out_channels
        self.in_channels = 4
        self.patch_size = patch_size
        self.num_heads = num_heads
        self.blocks = nn.ModuleList([
            SiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
        ])
        self.x_embedder = PatchEmbed(input_size, patch_size, self.in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        num_patches = self.x_embedder.num_patches
        self.num_patches = num_patches  # Save original num_patches for unpatchify
        # pos_embed supports 2*num_patches for concatenated input (see CombinedModel).
        self.pos_embed = nn.Parameter(torch.zeros(1, 2 * num_patches, hidden_size), requires_grad=False)
        # Initialize pos_embed with sin-cos embedding, repeated for both halves.
        pos_embed = get_2d_sincos_pos_embed(hidden_size, int(num_patches ** 0.5))
        pos_embed_full = np.concatenate([pos_embed, pos_embed], axis=0)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed_full).float().unsqueeze(0))

        if final_layer is not None:
            self.final_layer = final_layer
        else:
            self.final_layer = FinalLayer(hidden_size, patch_size, out_channels)

    def unpatchify(self, x, patch_size, out_channels):
        """Fold (N, T, p*p*c) tokens back into (N, c, H, W) images.

        When the sequence carries 2*num_patches tokens (concatenated input),
        only the first half is decoded.
        """
        c = out_channels
        p = patch_size
        # Use the original num_patches to derive h and w; the sequence may hold
        # 2*num_patches tokens when fed the concatenated input.
        h = w = int(self.num_patches ** 0.5)
        if x.shape[1] == 2 * self.num_patches:
            # Take only the first half of the sequence.
            x = x[:, :self.num_patches, :]
        assert h * w == x.shape[1], f"Expected {h * w} patches, got {x.shape[1]}"
        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
        return imgs

    def _finalize(self, x):
        """Shared post-processing for forward/forward_noise (was duplicated).

        learn_sigma=True: split channels into (mean, log-variance) and draw a
        Gaussian sample (mean added only when learn_mu=True).
        learn_sigma=False: return the first half of the channels unchanged.
        """
        if self.learn_sigma:
            mean_pred, log_var_pred = x.chunk(2, dim=1)
            std_dev_pred = torch.sqrt(torch.exp(log_var_pred))
            noise = torch.randn_like(mean_pred)
            if self.learn_mu:
                return mean_pred + std_dev_pred * noise
            return std_dev_pred * noise
        x, _ = x.chunk(2, dim=1)
        return x

    def forward(self, x, c, t, return_act=False):
        """
        x: (N, T or 2T, D) embedded tokens; c: (N, D) conditioning vector.
        `t` is accepted for interface compatibility but unused in the body.
        Returns the decoded (and possibly resampled) image; with
        return_act=True also returns the list of per-block activations.
        """
        act = []
        for block in self.blocks:
            x = block(x, c)
            if return_act:
                act.append(x)
        x = self.final_layer(x, c)
        x = self.unpatchify(x, self.patch_size, self.out_channels)
        x = self._finalize(x)
        if return_act:
            return x, act
        return x

    def forward_noise(self, x, c):
        """Same as forward() but without the timestep arg or activation capture."""
        for block in self.blocks:
            x = block(x, c)
        x = self.final_layer(x, c)
        x = self.unpatchify(x, self.patch_size, self.out_channels)
        return self._finalize(x)
|
| 607 |
+
|
| 608 |
+
# Two design axes here: use the ideal vs. the real signal, and combine the
# features by concatenation vs. by addition.
|
| 609 |
+
class CombinedModel(nn.Module):
    """
    CombinedModel: chains SiTF1 and SiTF2.

    SiTF1 produces final patch tokens and a decoded image; that image is mixed
    with the original input, re-patchified, concatenated with the tokens, and
    the doubled-length sequence is fed to SiTF2.
    """
    def __init__(self, sitf1: SiTF1, sitf2: SiTF2):
        super().__init__()
        self.sitf1 = sitf1
        self.sitf2 = sitf2
        # Mirror SiTF1's geometry so the re-patchified image aligns with its tokens.
        input_size=self.sitf1.input_size
        patch_size=self.sitf1.patch_size
        hidden_size=self.sitf1.hidden_size
        # Separate embedder for the mixed image (4 latent channels).
        self.x_embedder = PatchEmbed(input_size, patch_size, 4, hidden_size, bias=True)
        num_patches = self.x_embedder.num_patches
        # pos_embed needs to support 2*num_patches for concatenated input
        self.pos_embed = nn.Parameter(torch.zeros(1, 2 * num_patches, hidden_size), requires_grad=False)
        # Initialize pos_embed with sin-cos embedding
        pos_embed = get_2d_sincos_pos_embed(hidden_size, int(num_patches ** 0.5))
        # Repeat the pos_embed for both halves (or could use different embeddings)
        pos_embed_full = np.concatenate([pos_embed, pos_embed], axis=0)
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed_full).float().unsqueeze(0))

    def forward(self, x, t, y, return_act=False):
        # Stage 1: SiTF1 yields patch tokens (N, T, D) and a decoded image.
        patch_tokens,x_now = self.sitf1(x, t, y)
        # t shape is (N,); broadcast to (N, 1, 1, 1) against the image (N, C, H, W).
        t_broadcast = t.view(-1, 1, 1, 1)  # (N, 1, 1, 1)
        # NOTE(review): the original comment claimed "(1-t)*x_now + t*x", but the
        # code computes (1-t)*x_now + x (x is NOT scaled by t) -- confirm whether
        # the missing `t_broadcast *` on x is intentional before relying on this.
        x_interpolated = (1 - t_broadcast) * x_now + x
        # Convert the mixed image back to patch-token format (pos_embed added below).
        x_now_patches = self.x_embedder(x_interpolated)
        # Concatenate patch_tokens and x_now_patches along the sequence dimension.
        concatenated_input = torch.cat([patch_tokens, x_now_patches], dim=1)  # (N, 2*T, D)
        # Add the (repeated) position embedding for the doubled sequence.
        concatenated_input = concatenated_input + self.pos_embed
        # Conditioning vector reuses SiTF1's timestep/label embedders.
        t_emb = self.sitf1.t_embedder(t)
        y_emb = self.sitf1.y_embedder(y, self.training)
        c = t_emb + y_emb
        return self.sitf2(concatenated_input, c, t, return_act=return_act)
|
Rectified_Noise/GVP-Disp/sample_ddp.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This source code is licensed under the license found in the
|
| 2 |
+
# LICENSE file in the root directory of this source tree.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Samples a large number of images from a pre-trained SiT model using DDP.
|
| 6 |
+
Subsequently saves a .npz file that can be used to compute FID and other
|
| 7 |
+
evaluation metrics via the ADM repo: https://github.com/openai/guided-diffusion/tree/main/evaluations
|
| 8 |
+
|
| 9 |
+
For a simple single-GPU/CPU sampling script, see sample.py.
|
| 10 |
+
"""
|
| 11 |
+
import torch
|
| 12 |
+
import torch.distributed as dist
|
| 13 |
+
from models import SiT_models
|
| 14 |
+
from download import find_model
|
| 15 |
+
from transport import create_transport, Sampler
|
| 16 |
+
from diffusers.models import AutoencoderKL
|
| 17 |
+
from train_utils import parse_ode_args, parse_sde_args, parse_transport_args
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
import os
|
| 20 |
+
from PIL import Image
|
| 21 |
+
import numpy as np
|
| 22 |
+
import math
|
| 23 |
+
import argparse
|
| 24 |
+
import sys
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def create_npz_from_sample_folder(sample_dir, num=50_000):
    """
    Builds a single .npz file from a folder of .png samples.
    """
    images = [
        np.asarray(Image.open(f"{sample_dir}/{idx:06d}.png")).astype(np.uint8)
        for idx in tqdm(range(num), desc="Building .npz file from samples")
    ]
    stacked = np.stack(images)
    # Every sample must be an RGB image of a common resolution.
    assert stacked.shape == (num, stacked.shape[1], stacked.shape[2], 3)
    npz_path = f"{sample_dir}.npz"
    np.savez(npz_path, arr_0=stacked)
    print(f"Saved .npz file to {npz_path} [shape={stacked.shape}].")
    return npz_path
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def main(mode, args):
    """
    Run DDP sampling for a SiT model: each rank writes individual .png samples,
    then rank 0 bundles them into a single .npz for FID evaluation.

    mode: "ODE" or "SDE" -- selects which sampler is built from the transport.
    args: parsed CLI namespace (see the __main__ block of this script).
    """
    torch.backends.cuda.matmul.allow_tf32 = args.tf32  # True: fast but may lead to some small numerical differences
    assert torch.cuda.is_available(), "Sampling with DDP requires at least one GPU. sample.py supports CPU-only usage"
    torch.set_grad_enabled(False)

    # Setup DDP: one process per GPU; each rank gets a distinct seed so samples differ.
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    device = rank % torch.cuda.device_count()
    seed = args.global_seed * dist.get_world_size() + rank
    torch.manual_seed(seed)
    torch.cuda.set_device(device)
    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")

    if args.ckpt is None:
        # Auto-download path only supports the released SiT-XL/2 @ 256x256.
        assert args.model == "SiT-XL/2", "Only SiT-XL/2 models are available for auto-download."
        assert args.image_size in [256, 512]
        assert args.num_classes == 1000
        assert args.image_size == 256, "512x512 models are not yet available for auto-download." # remove this line when 512x512 models are available
        learn_sigma = args.image_size == 256
    else:
        learn_sigma = False

    # Load model:
    latent_size = args.image_size // 8  # SD-VAE downsamples the image by 8x
    model = SiT_models[args.model](
        input_size=latent_size,
        num_classes=args.num_classes,
        learn_sigma=learn_sigma,
    ).to(device)
    # Auto-download a pre-trained model or load a custom SiT checkpoint from train.py:
    ckpt_path = args.ckpt or f"SiT-XL-2-{args.image_size}x{args.image_size}.pt"
    state_dict = find_model(ckpt_path)
    model.load_state_dict(state_dict)
    model.eval()  # important!

    # Build the probability-flow transport and wrap it in a sampler factory.
    transport = create_transport(
        args.path_type,
        args.prediction,
        args.loss_weight,
        args.train_eps,
        args.sample_eps
    )
    sampler = Sampler(transport)
    if mode == "ODE":
        if args.likelihood:
            # Likelihood evaluation requires the exact ODE, so no guidance.
            assert args.cfg_scale == 1, "Likelihood is incompatible with guidance"
            sample_fn = sampler.sample_ode_likelihood(
                sampling_method=args.sampling_method,
                num_steps=args.num_sampling_steps,
                atol=args.atol,
                rtol=args.rtol,
            )
        else:
            sample_fn = sampler.sample_ode(
                sampling_method=args.sampling_method,
                num_steps=args.num_sampling_steps,
                atol=args.atol,
                rtol=args.rtol,
                reverse=args.reverse
            )
    elif mode == "SDE":
        sample_fn = sampler.sample_sde(
            sampling_method=args.sampling_method,
            diffusion_form=args.diffusion_form,
            diffusion_norm=args.diffusion_norm,
            last_step=args.last_step,
            last_step_size=args.last_step_size,
            num_steps=args.num_sampling_steps,
        )
    vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device)
    assert args.cfg_scale >= 1.0, "In almost all cases, cfg_scale be >= 1.0"
    using_cfg = args.cfg_scale > 1.0

    # Create folder to save samples; the name encodes the sampling configuration.
    model_string_name = args.model.replace("/", "-")
    ckpt_string_name = os.path.basename(args.ckpt).replace(".pt", "") if args.ckpt else "pretrained"
    if mode == "ODE":
        folder_name = f"{model_string_name}-{ckpt_string_name}-" \
                      f"cfg-{args.cfg_scale}-{args.per_proc_batch_size}-"\
                      f"{mode}-{args.num_sampling_steps}-{args.sampling_method}"
    elif mode == "SDE":
        folder_name = f"{model_string_name}-{ckpt_string_name}-" \
                      f"cfg-{args.cfg_scale}-{args.per_proc_batch_size}-"\
                      f"{mode}-{args.num_sampling_steps}-{args.sampling_method}-"\
                      f"{args.diffusion_form}-{args.last_step}-{args.last_step_size}"
    sample_folder_dir = f"{args.sample_dir}/{folder_name}"
    if rank == 0:
        os.makedirs(sample_folder_dir, exist_ok=True)
        print(f"Saving .png samples at {sample_folder_dir}")
    dist.barrier()

    # Figure out how many samples we need to generate on each GPU and how many iterations we need to run:
    n = args.per_proc_batch_size
    global_batch_size = n * dist.get_world_size()
    # To make things evenly-divisible, we'll sample a bit more than we need and then discard the extra samples:
    num_samples = len([name for name in os.listdir(sample_folder_dir) if (os.path.isfile(os.path.join(sample_folder_dir, name)) and ".png" in name)])
    total_samples = int(math.ceil(args.num_fid_samples / global_batch_size) * global_batch_size)
    if rank == 0:
        print(f"Total number of images that will be sampled: {total_samples}")
    assert total_samples % dist.get_world_size() == 0, "total_samples must be divisible by world_size"
    samples_needed_this_gpu = int(total_samples // dist.get_world_size())
    assert samples_needed_this_gpu % n == 0, "samples_needed_this_gpu must be divisible by the per-GPU batch size"
    iterations = int(samples_needed_this_gpu // n)
    # NOTE(review): done_iterations is computed but never used below -- there is
    # no resume-from-existing-samples logic despite num_samples being counted.
    done_iterations = int( int(num_samples // dist.get_world_size()) // n)
    pbar = range(iterations)
    pbar = tqdm(pbar) if rank == 0 else pbar
    total = 0

    for i in pbar:
        # Sample inputs: Gaussian latents and random class labels.
        z = torch.randn(n, model.in_channels, latent_size, latent_size, device=device)
        y = torch.randint(0, args.num_classes, (n,), device=device)

        # Setup classifier-free guidance: duplicate latents and append null labels.
        if using_cfg:
            z = torch.cat([z, z], 0)
            y_null = torch.tensor([1000] * n, device=device)  # 1000 = null/unconditional class id
            y = torch.cat([y, y_null], 0)
            model_kwargs = dict(y=y, cfg_scale=args.cfg_scale)
            model_fn = model.forward_with_cfg
        else:
            model_kwargs = dict(y=y)
            model_fn = model.forward

        # sample_fn returns the whole trajectory; [-1] is the final state.
        samples = sample_fn(z, model_fn, **model_kwargs)[-1]
        if using_cfg:
            samples, _ = samples.chunk(2, dim=0)  # Remove null class samples

        # 0.18215 is the SD-VAE latent scaling factor.
        samples = vae.decode(samples / 0.18215).sample
        samples = torch.clamp(127.5 * samples + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()

        # Save samples to disk as individual .png files; the index interleaves
        # ranks so file names are globally unique.
        # NOTE(review): this inner `i` shadows the outer iteration variable
        # (harmless here since the outer `i` is unused, but worth renaming).
        for i, sample in enumerate(samples):
            index = i * dist.get_world_size() + rank + total
            Image.fromarray(sample).save(f"{sample_folder_dir}/{index:06d}.png")
        total += global_batch_size
        dist.barrier()

    # Make sure all processes have finished saving their samples before attempting to convert to .npz
    dist.barrier()
    if rank == 0:
        create_npz_from_sample_folder(sample_folder_dir, args.num_fid_samples)
        print("Done.")
    dist.barrier()
    dist.destroy_process_group()
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
if __name__ == "__main__":
    # The first positional CLI token selects the sampler mode; the remaining
    # flags are mode-specific, so the mode must be read before building the parser.
    parser = argparse.ArgumentParser()

    if len(sys.argv) < 2:
        print("Usage: program.py <mode> [options]")
        sys.exit(1)

    mode = sys.argv[1]

    assert mode[:2] != "--", "Usage: program.py <mode> [options]"
    assert mode in ["ODE", "SDE"], "Invalid mode. Please choose 'ODE' or 'SDE'"

    parser.add_argument("--model", type=str, choices=list(SiT_models.keys()), default="SiT-XL/2")
    parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="ema")
    parser.add_argument("--sample-dir", type=str, default="samples")
    parser.add_argument("--per-proc-batch-size", type=int, default=4)
    parser.add_argument("--num-fid-samples", type=int, default=50_000)
    parser.add_argument("--image-size", type=int, choices=[256, 512], default=256)
    parser.add_argument("--num-classes", type=int, default=1000)
    parser.add_argument("--cfg-scale", type=float, default=1.0)
    parser.add_argument("--num-sampling-steps", type=int, default=250)
    parser.add_argument("--global-seed", type=int, default=0)
    parser.add_argument("--tf32", action=argparse.BooleanOptionalAction, default=True,
                        help="By default, use TF32 matmuls. This massively accelerates sampling on Ampere GPUs.")
    parser.add_argument("--ckpt", type=str, default=None,
                        help="Optional path to a SiT checkpoint (default: auto-download a pre-trained SiT-XL/2 model).")

    # Mode-agnostic transport flags plus mode-specific solver flags.
    parse_transport_args(parser)
    if mode == "ODE":
        parse_ode_args(parser)
        # Further processing for ODE
    elif mode == "SDE":
        parse_sde_args(parser)
        # Further processing for SDE

    # parse_known_args tolerates the positional <mode> token already consumed above.
    args = parser.parse_known_args()[0]
    main(mode, args)
|
Rectified_Noise/GVP-Disp/sample_rectified_noise.py
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.distributed as dist
|
| 3 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 4 |
+
from models import SiT_models
|
| 5 |
+
from download import find_model
|
| 6 |
+
from transport import create_transport, Sampler
|
| 7 |
+
from diffusers.models import AutoencoderKL
|
| 8 |
+
from train_utils import parse_ode_args, parse_sde_args, parse_transport_args
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
import os
|
| 11 |
+
from PIL import Image
|
| 12 |
+
import numpy as np
|
| 13 |
+
import math
|
| 14 |
+
import argparse
|
| 15 |
+
import sys
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def create_npz_from_sample_folder(sample_dir, num=50_000):
    """
    Builds a single .npz file from a folder of .png samples.
    """
    arrays = []
    for index in tqdm(range(num), desc="Building .npz file from samples"):
        image = Image.open(f"{sample_dir}/{index:06d}.png")
        arrays.append(np.asarray(image).astype(np.uint8))
    batch = np.stack(arrays)
    # Every sample must be an RGB image of a common resolution.
    assert batch.shape == (num, batch.shape[1], batch.shape[2], 3)
    npz_path = f"{sample_dir}.npz"
    np.savez(npz_path, arr_0=batch)
    print(f"Saved .npz file to {npz_path} [shape={batch.shape}].")
    return npz_path
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def fix_state_dict_for_ddp(state_dict):
    """
    Normalize checkpoint keys so they match a DistributedDataParallel model.

    Two normalizations are applied:
      1. If *state_dict* is a full training checkpoint (a dict containing
         "model", "ema", or "opt" entries), the inner weights are extracted,
         preferring "ema" over "model".
      2. Every remaining key gets a "module." prefix — DDP wraps the model,
         so its parameter names are all prefixed — unless it already has one.

    Args:
        state_dict: Either a raw model state dict or a full checkpoint dict.

    Returns:
        dict: A new state dict whose keys all start with "module.".
    """
    if isinstance(state_dict, dict) and ("model" in state_dict or "ema" in state_dict or "opt" in state_dict):
        # Full checkpoint: prefer the EMA weights, then the raw model weights.
        if "ema" in state_dict:
            state_dict = state_dict["ema"]
        elif "model" in state_dict:
            state_dict = state_dict["model"]
        # NOTE(review): if only "opt" (or other) entries exist, the checkpoint
        # dict itself falls through and gets prefixed below — preserved from
        # the original behavior, but probably not a loadable state dict.

    # Prefix keys with "module." to match the DDP-wrapped parameter names.
    return {
        (key if key.startswith("module.") else "module." + key): value
        for key, value in state_dict.items()
    }
|
| 61 |
+
|
| 62 |
+
def main(mode, args):
    """
    Run distributed sampling with the SiT base model plus the SiTF2 correction head.

    Spawns one process per GPU (launched via torchrun / NCCL), builds the
    SiTF1/SiTF2/CombinedModel stack, samples latents with the requested
    ODE/SDE solver, decodes them with the SD VAE, writes numbered .png files
    into a run-specific folder, and (on rank 0) packs them into a single
    .npz for FID evaluation.

    Args:
        mode: "ODE" or "SDE" — selects the sampler family.
        args: Parsed CLI namespace (see the __main__ block for the flags).
    """
    torch.backends.cuda.matmul.allow_tf32 = args.tf32  # True: fast but may lead to some small numerical differences
    assert torch.cuda.is_available(), "Sampling with DDP requires at least one GPU. sample.py supports CPU-only usage"
    torch.set_grad_enabled(False)
    learn_mu = args.learn_mu
    sitf2_depth = args.depth  # Save SiTF2 depth before it gets overwritten

    # Setup DDP:
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    device = rank % torch.cuda.device_count()
    # Per-rank seed so every GPU draws different noise / labels.
    seed = args.global_seed * dist.get_world_size() + rank
    torch.manual_seed(seed)
    torch.cuda.set_device(device)
    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")

    if args.ckpt is None:
        # Auto-download path: only the released SiT-XL/2 256x256 weights exist.
        assert args.model == "SiT-XL/2", "Only SiT-XL/2 models are available for auto-download."
        assert args.image_size in [256, 512]
        assert args.num_classes == 1000
        assert args.image_size == 256, "512x512 models are not yet available for auto-download."  # remove this line when 512x512 models are available
        learn_sigma = args.image_size == 256  # NOTE(review): never read afterwards
    else:
        learn_sigma = False  # NOTE(review): never read afterwards

    # Load SiTF1 and SiTF2 models and create CombinedModel
    from models import SiTF1, SiTF2, CombinedModel
    latent_size = args.image_size // 8  # SD VAE downsamples by 8x

    # Derive the transformer configuration from the model-name family
    # (mirrors the SiT_models presets). 'XL' is checked before 'L' because
    # "SiT-XL/2" contains both substrings.
    model_name = args.model
    if 'XL' in model_name:
        hidden_size, depth, num_heads = 1152, 28, 16
    elif 'L' in model_name:
        hidden_size, depth, num_heads = 1024, 24, 16
    elif 'B' in model_name:
        hidden_size, depth, num_heads = 768, 12, 12
    elif 'S' in model_name:
        hidden_size, depth, num_heads = 384, 12, 6
    else:
        # Default fallback
        hidden_size, depth, num_heads = 768, 12, 12

    # Extract patch size from model name like 'SiT-XL/2' -> patch_size = 2
    patch_size = int(model_name.split('/')[-1])

    # Load SiTF1
    sitf1 = SiTF1(
        input_size=latent_size,
        patch_size=patch_size,
        in_channels=4,
        hidden_size=hidden_size,
        depth=depth,
        num_heads=num_heads,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        num_classes=args.num_classes,
        learn_sigma=False
    ).to(device)
    sitf1_state_raw = find_model(args.ckpt)
    # find_model returns the EMA weights when available, or the full checkpoint.
    # Extract the actual state_dict; it is reused below for base_model too.
    if isinstance(sitf1_state_raw, dict) and "model" in sitf1_state_raw:
        sitf1_state = sitf1_state_raw["model"]
    else:
        # sitf1_state_raw is already a state_dict (either ema or direct model state)
        sitf1_state = sitf1_state_raw
    sitf1.load_state_dict(sitf1_state)
    sitf1.eval()

    # Load SiTF2 with the same architecture parameters as SiTF1 for
    # compatibility, except `depth`, which comes from the --depth CLI flag.
    sitf2 = SiTF2(
        input_size=latent_size,
        hidden_size=hidden_size,
        out_channels=8,
        patch_size=patch_size,
        num_heads=num_heads,
        mlp_ratio=4.0,
        depth=sitf2_depth,
        learn_sigma=True,
        num_classes=args.num_classes,
        learn_mu=learn_mu
    ).to(device)
    sitf2 = DDP(sitf2, device_ids=[device])
    sitf2_state = find_model(args.sitf2_ckpt)
    # Fix state dict keys ("module." prefix) to match the DDP-wrapped model.
    sitf2_state_fixed = fix_state_dict_for_ddp(sitf2_state)
    try:
        sitf2.load_state_dict(sitf2_state_fixed)
    except Exception as e:
        print(f"Error loading state dict: {e}")
        # Fall back to a non-strict load so partially-matching checkpoints work.
        sitf2.load_state_dict(sitf2_state_fixed, strict=False)
    sitf2.eval()

    combined_model = CombinedModel(sitf1, sitf2).to(device)
    sitf2.eval()
    combined_model.eval()

    # Use the SiT_models factory to create the base model (learn_sigma=False
    # to match sitf1) and load the exact same state_dict.
    base_model = SiT_models[args.model](
        input_size=latent_size,
        num_classes=args.num_classes,
        learn_sigma=False,
    ).to(device)
    base_model.load_state_dict(sitf1_state)
    base_model.eval()

    # Determine if CFG will be used (needed for combined_sampling_model below).
    assert args.cfg_scale >= 1.0, "In almost all cases, cfg_scale should be >= 1.0"
    using_cfg = args.cfg_scale > 1.0

    # There are repeated calculations in the middle, which will cause FLOPs to
    # double. A simplified version will be released later.
    def combined_sampling_model(x, t, y=None, **kwargs):
        """Sampling callable: base SiT velocity, optionally corrected by SiTF2."""
        with torch.no_grad():
            # Handle CFG the same way as SiT_clean/sample_ddp.py.
            if using_cfg and 'cfg_scale' in kwargs:
                sit_out = base_model.forward_with_cfg(x, t, y, kwargs['cfg_scale'])
            else:
                sit_out = base_model.forward(x, t, y)
            if args.use_sitf2:
                if args.use_sitf2_before_t05:
                    # Add the SiTF2 correction only for samples with t < threshold.
                    mask = (t < args.sitf2_threshold).float()
                    combined_out = combined_model.forward(x, t, y)
                    # Broadcast the per-sample mask over (batch, C, H, W).
                    while len(mask.shape) < len(combined_out.shape):
                        mask = mask.unsqueeze(-1)
                    mask = mask.expand_as(combined_out)
                    combined_out = combined_out * mask
                    return sit_out + combined_out
                else:
                    # NOTE(review): with --use-sitf2 but without
                    # --use-sitf2-before-t05 the correction is never applied;
                    # confirm this is intended.
                    return sit_out
            else:
                # Default behavior: only use base model output.
                return sit_out

    transport = create_transport(
        args.path_type,
        args.prediction,
        args.loss_weight,
        args.train_eps,
        args.sample_eps
    )
    sampler = Sampler(transport)
    if mode == "ODE":
        if args.likelihood:
            assert args.cfg_scale == 1, "Likelihood is incompatible with guidance"
            sample_fn = sampler.sample_ode_likelihood(
                sampling_method=args.sampling_method,
                num_steps=args.num_sampling_steps,
                atol=args.atol,
                rtol=args.rtol,
            )
        else:
            sample_fn = sampler.sample_ode(
                sampling_method=args.sampling_method,
                num_steps=args.num_sampling_steps,
                atol=args.atol,
                rtol=args.rtol,
                reverse=args.reverse
            )
    elif mode == "SDE":
        sample_fn = sampler.sample_sde(
            sampling_method=args.sampling_method,
            diffusion_form=args.diffusion_form,
            diffusion_norm=args.diffusion_norm,
            last_step=args.last_step,
            last_step_size=args.last_step_size,
            num_steps=args.num_sampling_steps,
        )
    vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device)

    # Create folder to save samples:
    model_string_name = args.model.replace("/", "-")  # NOTE(review): unused
    ckpt_string_name = os.path.basename(args.ckpt).replace(".pt", "") if args.ckpt else "pretrained"
    # BUGFIX: this previously tested `args.ckpt`, so the SiTF2 checkpoint name
    # was mislabeled "pretrained" whenever --ckpt was omitted.
    sitf2_ckpt_string_name = os.path.basename(args.sitf2_ckpt).replace(".pt", "") if args.sitf2_ckpt else "pretrained"
    if mode == "ODE":
        folder_name = f"{sitf2_ckpt_string_name}-{ckpt_string_name}-" \
                      f"cfg-{args.cfg_scale}-{args.per_proc_batch_size}-"\
                      f"{mode}-{args.num_sampling_steps}-{args.sampling_method}"
    elif mode == "SDE":
        # Add threshold info to the folder name if use_sitf2_before_t05 is enabled.
        threshold_suffix = f"-threshold-{args.sitf2_threshold}" if args.use_sitf2_before_t05 else ""
        if learn_mu:
            folder_name = f"depth-mu-{sitf2_depth}{threshold_suffix}-{sitf2_ckpt_string_name}-{ckpt_string_name}-" \
                          f"cfg-{args.cfg_scale}-{args.per_proc_batch_size}-"\
                          f"{mode}-{args.num_sampling_steps}-{args.sampling_method}-"\
                          f"{args.diffusion_form}-{args.last_step}-{args.last_step_size}"
        else:
            folder_name = f"depth-sigma-{sitf2_depth}{threshold_suffix}-{sitf2_ckpt_string_name}-{ckpt_string_name}-" \
                          f"cfg-{args.cfg_scale}-{args.per_proc_batch_size}-"\
                          f"{mode}-{args.num_sampling_steps}-{args.sampling_method}-"\
                          f"{args.diffusion_form}-{args.last_step}-{args.last_step_size}"
    sample_folder_dir = f"{args.sample_dir}/{folder_name}"
    if rank == 0:
        os.makedirs(sample_folder_dir, exist_ok=True)
        print(f"Saving .png samples at {sample_folder_dir}")
    dist.barrier()

    # Figure out how many samples each GPU generates and how many iterations to run.
    n = args.per_proc_batch_size
    global_batch_size = n * dist.get_world_size()
    # To make things evenly-divisible, sample a bit more than needed and discard the extras.
    num_samples = len([name for name in os.listdir(sample_folder_dir) if (os.path.isfile(os.path.join(sample_folder_dir, name)) and ".png" in name)])
    total_samples = int(math.ceil(args.num_fid_samples / global_batch_size) * global_batch_size)
    if rank == 0:
        print(f"Total number of images that will be sampled: {total_samples}")
    assert total_samples % dist.get_world_size() == 0, "total_samples must be divisible by world_size"
    samples_needed_this_gpu = int(total_samples // dist.get_world_size())
    assert samples_needed_this_gpu % n == 0, "samples_needed_this_gpu must be divisible by the per-GPU batch size"
    iterations = int(samples_needed_this_gpu // n)
    # NOTE(review): done_iterations is computed but never used — resuming a
    # partially-filled sample folder is not actually implemented.
    done_iterations = int( int(num_samples // dist.get_world_size()) // n)
    pbar = range(iterations)
    pbar = tqdm(pbar) if rank == 0 else pbar
    total = 0

    for i in pbar:
        # Sample inputs:
        z = torch.randn(n, base_model.in_channels, latent_size, latent_size, device=device)
        y = torch.randint(0, args.num_classes, (n,), device=device)
        # Setup classifier-free guidance: duplicate the batch, second half
        # gets the null class (index 1000).
        if using_cfg:
            z = torch.cat([z, z], 0)
            y_null = torch.tensor([1000] * n, device=device)
            y = torch.cat([y, y_null], 0)
            model_kwargs = dict(y=y, cfg_scale=args.cfg_scale)
        else:
            model_kwargs = dict(y=y)
        samples = sample_fn(z, combined_sampling_model, **model_kwargs)[-1]
        if using_cfg:
            samples, _ = samples.chunk(2, dim=0)  # Remove null class samples
        # 0.18215 is the SD latent scaling factor; map decoded [-1, 1] to uint8.
        samples = vae.decode(samples / 0.18215).sample
        samples = torch.clamp(127.5 * samples + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()
        # Save samples to disk as individual .png files. Each rank writes a
        # disjoint, interleaved index range. (Renamed from `i` to `j` — the
        # original shadowed the outer loop variable.)
        for j, sample in enumerate(samples):
            index = j * dist.get_world_size() + rank + total
            Image.fromarray(sample).save(f"{sample_folder_dir}/{index:06d}.png")
        total += global_batch_size
        dist.barrier()

    # Make sure all processes have finished saving their samples before
    # attempting to convert to .npz.
    dist.barrier()
    if rank == 0:
        create_npz_from_sample_folder(sample_folder_dir, args.num_fid_samples)
        print("Done.")
    dist.barrier()
    dist.destroy_process_group()
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
if __name__ == "__main__":
    # The first positional token selects the sampler family; it must come
    # before any --flags because it decides which extra flags are registered.
    arg_parser = argparse.ArgumentParser()

    if len(sys.argv) < 2:
        print("Usage: program.py <mode> [options]")
        sys.exit(1)

    sampling_mode = sys.argv[1]

    assert sampling_mode[:2] != "--", "Usage: program.py <mode> [options]"
    assert sampling_mode in ["ODE", "SDE"], "Invalid mode. Please choose 'ODE' or 'SDE'"

    # Core sampling / model flags.
    arg_parser.add_argument("--model", type=str, choices=list(SiT_models.keys()), default="SiT-XL/2")
    arg_parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="ema")
    arg_parser.add_argument("--sample-dir", type=str, default="samples")
    arg_parser.add_argument("--per-proc-batch-size", type=int, default=64)
    arg_parser.add_argument("--num-fid-samples", type=int, default=50_000)
    arg_parser.add_argument("--image-size", type=int, choices=[256, 512], default=256)
    arg_parser.add_argument("--num-classes", type=int, default=1000)
    arg_parser.add_argument("--cfg-scale", type=float, default=1.0)
    arg_parser.add_argument("--num-sampling-steps", type=int, default=100)
    arg_parser.add_argument("--global-seed", type=int, default=0)
    arg_parser.add_argument("--tf32", action=argparse.BooleanOptionalAction, default=True,
                            help="By default, use TF32 matmuls. This massively accelerates sampling on Ampere GPUs.")
    # Checkpoints and SiTF2-specific flags.
    arg_parser.add_argument("--ckpt", type=str, default=None,
                            help="Optional path to a SiT checkpoint.")
    arg_parser.add_argument("--sitf2-ckpt", type=str, required=True, help="Path to SiTF2 checkpoint")
    arg_parser.add_argument("--learn-mu", action=argparse.BooleanOptionalAction, default=True,
                            help="Whether to learn mu parameter")
    arg_parser.add_argument("--depth", type=int, default=1,
                            help="Depth parameter for SiTF2 model")
    arg_parser.add_argument("--use-sitf2", action=argparse.BooleanOptionalAction, default=True,
                            help="Only use SiTF2 output when t < threshold, otherwise use only SiT")
    arg_parser.add_argument("--use-sitf2-before-t05", action=argparse.BooleanOptionalAction, default=False,
                            help="Only use SiTF2 output when t < threshold, otherwise use only SiT")
    arg_parser.add_argument("--sitf2-threshold", type=float, default=0.5,
                            help="Time threshold for using SiTF2 output (default: 0.5). Only effective when --use-sitf2-before-t05 is True")

    # Shared transport flags plus the solver-family-specific group.
    parse_transport_args(arg_parser)
    if sampling_mode == "ODE":
        parse_ode_args(arg_parser)
        # Further processing for ODE
    elif sampling_mode == "SDE":
        parse_sde_args(arg_parser)
        # Further processing for SDE

    # parse_known_args tolerates extra launcher-injected flags (e.g. torchrun's).
    cli_args = arg_parser.parse_known_args()[0]
    main(sampling_mode, cli_args)
|
Rectified_Noise/GVP-Disp/train_utils.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def none_or_str(value):
    """Argparse type helper: map the literal string 'None' to Python None,
    and pass every other value through unchanged."""
    return None if value == 'None' else value
|
| 5 |
+
|
| 6 |
+
def parse_transport_args(parser):
    """Register the transport-related CLI flags on *parser* in their own group."""
    transport_group = parser.add_argument_group("Transport arguments")
    transport_group.add_argument(
        "--path-type",
        type=str,
        default="GVP",
        choices=["Linear", "GVP", "VP"],
        help=(
            "Type of path for loss calculation. This parameter directly affects the loss form used during training. "
            "Choices: Linear (linear interpolation path), GVP (Geodesic Velocity Path), VP (Velocity Path). "
            "The path_type determines how the transport loss is computed in training_losses()."
        ),
    )
    transport_group.add_argument(
        "--prediction",
        type=str,
        default="velocity",
        choices=["velocity", "score", "noise"],
    )
    # 'None' on the command line is converted to Python None by none_or_str.
    transport_group.add_argument(
        "--loss-weight",
        type=none_or_str,
        default=None,
        choices=[None, "velocity", "likelihood"],
    )
    transport_group.add_argument("--sample-eps", type=float, default=0.0)
    transport_group.add_argument("--train-eps", type=float, default=0.0)
|
| 16 |
+
|
| 17 |
+
def parse_ode_args(parser):
    """Register the ODE-solver CLI flags on *parser* in their own group."""
    ode_group = parser.add_argument_group("ODE arguments")
    ode_group.add_argument(
        "--sampling-method",
        type=str,
        default="dopri5",
        help="blackbox ODE solver methods; for full list check https://github.com/rtqichen/torchdiffeq",
    )
    ode_group.add_argument("--atol", type=float, default=1e-6, help="Absolute tolerance")
    ode_group.add_argument("--rtol", type=float, default=1e-3, help="Relative tolerance")
    # Boolean toggles, both off by default.
    for flag in ("--reverse", "--likelihood"):
        ode_group.add_argument(flag, action="store_true")
|
| 24 |
+
|
| 25 |
+
def parse_sde_args(parser):
    """Register the SDE-solver CLI flags on *parser* in their own group."""
    sde_group = parser.add_argument_group("SDE arguments")
    sde_group.add_argument(
        "--sampling-method",
        type=str,
        default="Euler",
        choices=["Euler", "Heun"],
    )
    sde_group.add_argument(
        "--diffusion-form",
        type=str,
        default="sigma",
        choices=["constant", "SBDM", "sigma", "linear", "decreasing", "increasing-decreasing"],
        help="form of diffusion coefficient in the SDE",
    )
    sde_group.add_argument("--diffusion-norm", type=float, default=1.0)
    # 'None' on the command line is converted to Python None by none_or_str.
    sde_group.add_argument(
        "--last-step",
        type=none_or_str,
        default="Mean",
        choices=[None, "Mean", "Tweedie", "Euler"],
        help="form of last step taken in the SDE",
    )
    sde_group.add_argument(
        "--last-step-size",
        type=float,
        default=0.04,
        help="size of the last step taken",
    )
|
Rectified_Noise/GVP-Disp/w_training1_VP.log
ADDED
|
@@ -0,0 +1,628 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
nohup: ignoring input
|
| 2 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 3 |
+
Starting rank=0, seed=0, world_size=1.
|
| 4 |
+
[[34m2026-02-01 14:09:25[0m] Experiment directory created at results_256_vp/depth-mu-2-000-SiT-XL-2-VP-velocity-None
|
| 5 |
+
[[34m2026-02-01 14:09:57[0m] Combined_model Parameters: 729,629,632
|
| 6 |
+
[[34m2026-02-01 14:09:57[0m] Total trainable parameters: 53,910,176
|
| 7 |
+
[[34m2026-02-01 14:09:59[0m] Dataset contains 1,281,167 images (/gemini/platform/public/zhaozy/hzh/datasets/Imagenet/train/)
|
| 8 |
+
[[34m2026-02-01 14:09:59[0m] Training for 100000 epochs...
|
| 9 |
+
[[34m2026-02-01 14:09:59[0m] Beginning epoch 0...
|
| 10 |
+
[[34m2026-02-01 14:11:18[0m] (step=0000100) Train Loss: 2.7011, Train Steps/Sec: 1.27
|
| 11 |
+
[[34m2026-02-01 14:12:34[0m] (step=0000200) Train Loss: 1.9056, Train Steps/Sec: 1.32
|
| 12 |
+
[[34m2026-02-01 14:13:49[0m] (step=0000300) Train Loss: 1.7930, Train Steps/Sec: 1.32
|
| 13 |
+
[[34m2026-02-01 14:15:05[0m] (step=0000400) Train Loss: 2.0316, Train Steps/Sec: 1.32
|
| 14 |
+
[[34m2026-02-01 14:16:21[0m] (step=0000500) Train Loss: 1.8412, Train Steps/Sec: 1.32
|
| 15 |
+
[[34m2026-02-01 14:17:37[0m] (step=0000600) Train Loss: 1.8505, Train Steps/Sec: 1.32
|
| 16 |
+
[[34m2026-02-01 14:18:53[0m] (step=0000700) Train Loss: 1.8542, Train Steps/Sec: 1.32
|
| 17 |
+
[[34m2026-02-01 14:20:09[0m] (step=0000800) Train Loss: 1.8904, Train Steps/Sec: 1.32
|
| 18 |
+
[[34m2026-02-01 14:21:25[0m] (step=0000900) Train Loss: 1.9280, Train Steps/Sec: 1.32
|
| 19 |
+
[[34m2026-02-01 14:22:41[0m] (step=0001000) Train Loss: 1.8453, Train Steps/Sec: 1.32
|
| 20 |
+
[[34m2026-02-01 14:23:57[0m] (step=0001100) Train Loss: 1.8745, Train Steps/Sec: 1.32
|
| 21 |
+
[[34m2026-02-01 14:25:13[0m] (step=0001200) Train Loss: 1.8410, Train Steps/Sec: 1.32
|
| 22 |
+
[[34m2026-02-01 14:26:29[0m] (step=0001300) Train Loss: 1.8445, Train Steps/Sec: 1.32
|
| 23 |
+
[[34m2026-02-01 14:27:44[0m] (step=0001400) Train Loss: 1.8173, Train Steps/Sec: 1.32
|
| 24 |
+
[[34m2026-02-01 14:29:00[0m] (step=0001500) Train Loss: 3.5917, Train Steps/Sec: 1.32
|
| 25 |
+
[[34m2026-02-01 14:30:16[0m] (step=0001600) Train Loss: 1.8747, Train Steps/Sec: 1.32
|
| 26 |
+
[[34m2026-02-01 14:31:32[0m] (step=0001700) Train Loss: 1.8092, Train Steps/Sec: 1.32
|
| 27 |
+
[[34m2026-02-01 14:32:48[0m] (step=0001800) Train Loss: 1.8720, Train Steps/Sec: 1.32
|
| 28 |
+
[[34m2026-02-01 14:34:04[0m] (step=0001900) Train Loss: 1.8186, Train Steps/Sec: 1.32
|
| 29 |
+
[[34m2026-02-01 14:35:20[0m] (step=0002000) Train Loss: 1.9034, Train Steps/Sec: 1.32
|
| 30 |
+
[[34m2026-02-01 14:36:36[0m] (step=0002100) Train Loss: 1.8993, Train Steps/Sec: 1.32
|
| 31 |
+
[[34m2026-02-01 14:37:52[0m] (step=0002200) Train Loss: 1.8499, Train Steps/Sec: 1.32
|
| 32 |
+
[[34m2026-02-01 14:39:08[0m] (step=0002300) Train Loss: 2.1165, Train Steps/Sec: 1.32
|
| 33 |
+
[[34m2026-02-01 14:40:24[0m] (step=0002400) Train Loss: 1.8346, Train Steps/Sec: 1.32
|
| 34 |
+
[[34m2026-02-01 14:41:40[0m] (step=0002500) Train Loss: 1.7744, Train Steps/Sec: 1.32
|
| 35 |
+
[[34m2026-02-01 14:42:56[0m] (step=0002600) Train Loss: 1.8164, Train Steps/Sec: 1.32
|
| 36 |
+
[[34m2026-02-01 14:44:12[0m] (step=0002700) Train Loss: 1.8115, Train Steps/Sec: 1.32
|
| 37 |
+
[[34m2026-02-01 14:45:28[0m] (step=0002800) Train Loss: 1.8150, Train Steps/Sec: 1.32
|
| 38 |
+
[[34m2026-02-01 14:46:44[0m] (step=0002900) Train Loss: 1.8270, Train Steps/Sec: 1.32
|
| 39 |
+
[[34m2026-02-01 14:48:00[0m] (step=0003000) Train Loss: 1.9181, Train Steps/Sec: 1.32
|
| 40 |
+
[[34m2026-02-01 14:49:16[0m] (step=0003100) Train Loss: 1.9040, Train Steps/Sec: 1.32
|
| 41 |
+
[[34m2026-02-01 14:50:31[0m] (step=0003200) Train Loss: 2.2287, Train Steps/Sec: 1.32
|
| 42 |
+
[[34m2026-02-01 14:51:47[0m] (step=0003300) Train Loss: 2.0059, Train Steps/Sec: 1.32
|
| 43 |
+
[[34m2026-02-01 14:53:03[0m] (step=0003400) Train Loss: 1.8687, Train Steps/Sec: 1.32
|
| 44 |
+
[[34m2026-02-01 14:54:19[0m] (step=0003500) Train Loss: 1.9185, Train Steps/Sec: 1.32
|
| 45 |
+
[[34m2026-02-01 14:55:35[0m] (step=0003600) Train Loss: 1.9162, Train Steps/Sec: 1.32
|
| 46 |
+
[[34m2026-02-01 14:56:51[0m] (step=0003700) Train Loss: 2.0918, Train Steps/Sec: 1.32
|
| 47 |
+
[[34m2026-02-01 14:58:07[0m] (step=0003800) Train Loss: 2.5750, Train Steps/Sec: 1.32
|
| 48 |
+
[[34m2026-02-01 14:59:23[0m] (step=0003900) Train Loss: 1.8959, Train Steps/Sec: 1.32
|
| 49 |
+
[[34m2026-02-01 15:00:39[0m] (step=0004000) Train Loss: 1.8935, Train Steps/Sec: 1.32
|
| 50 |
+
[[34m2026-02-01 15:01:55[0m] (step=0004100) Train Loss: 1.8143, Train Steps/Sec: 1.32
|
| 51 |
+
[[34m2026-02-01 15:03:11[0m] (step=0004200) Train Loss: 2.0503, Train Steps/Sec: 1.32
|
| 52 |
+
[[34m2026-02-01 15:04:27[0m] (step=0004300) Train Loss: 1.8916, Train Steps/Sec: 1.32
|
| 53 |
+
[[34m2026-02-01 15:05:43[0m] (step=0004400) Train Loss: 2.1279, Train Steps/Sec: 1.32
|
| 54 |
+
[[34m2026-02-01 15:06:59[0m] (step=0004500) Train Loss: 1.8331, Train Steps/Sec: 1.32
|
| 55 |
+
[[34m2026-02-01 15:08:15[0m] (step=0004600) Train Loss: 1.8969, Train Steps/Sec: 1.32
|
| 56 |
+
[[34m2026-02-01 15:09:31[0m] (step=0004700) Train Loss: 1.8220, Train Steps/Sec: 1.32
|
| 57 |
+
[[34m2026-02-01 15:10:47[0m] (step=0004800) Train Loss: 1.8862, Train Steps/Sec: 1.32
|
| 58 |
+
[[34m2026-02-01 15:12:03[0m] (step=0004900) Train Loss: 1.9553, Train Steps/Sec: 1.32
|
| 59 |
+
[[34m2026-02-01 15:13:19[0m] (step=0005000) Train Loss: 1.8549, Train Steps/Sec: 1.31
|
| 60 |
+
[[34m2026-02-01 15:14:35[0m] (step=0005100) Train Loss: 1.9343, Train Steps/Sec: 1.32
|
| 61 |
+
[[34m2026-02-01 15:15:51[0m] (step=0005200) Train Loss: 1.9899, Train Steps/Sec: 1.32
|
| 62 |
+
[[34m2026-02-01 15:17:07[0m] (step=0005300) Train Loss: 1.9115, Train Steps/Sec: 1.32
|
| 63 |
+
[[34m2026-02-01 15:18:23[0m] (step=0005400) Train Loss: 2.2117, Train Steps/Sec: 1.32
|
| 64 |
+
[[34m2026-02-01 15:19:39[0m] (step=0005500) Train Loss: 1.9424, Train Steps/Sec: 1.32
|
| 65 |
+
[[34m2026-02-01 15:20:55[0m] (step=0005600) Train Loss: 1.8367, Train Steps/Sec: 1.32
|
| 66 |
+
[[34m2026-02-01 15:22:11[0m] (step=0005700) Train Loss: 1.8696, Train Steps/Sec: 1.32
|
| 67 |
+
[[34m2026-02-01 15:23:27[0m] (step=0005800) Train Loss: 2.2085, Train Steps/Sec: 1.32
|
| 68 |
+
[[34m2026-02-01 15:24:43[0m] (step=0005900) Train Loss: 1.8185, Train Steps/Sec: 1.32
|
| 69 |
+
[[34m2026-02-01 15:25:59[0m] (step=0006000) Train Loss: 1.8452, Train Steps/Sec: 1.32
|
| 70 |
+
[[34m2026-02-01 15:27:15[0m] (step=0006100) Train Loss: 1.8141, Train Steps/Sec: 1.32
|
| 71 |
+
[[34m2026-02-01 15:28:31[0m] (step=0006200) Train Loss: 2.4398, Train Steps/Sec: 1.32
|
| 72 |
+
[[34m2026-02-01 15:29:47[0m] (step=0006300) Train Loss: 1.9160, Train Steps/Sec: 1.32
|
| 73 |
+
[[34m2026-02-01 15:31:03[0m] (step=0006400) Train Loss: 1.9920, Train Steps/Sec: 1.32
|
| 74 |
+
[[34m2026-02-01 15:32:19[0m] (step=0006500) Train Loss: 1.8726, Train Steps/Sec: 1.32
|
| 75 |
+
[[34m2026-02-01 15:33:35[0m] (step=0006600) Train Loss: 1.9302, Train Steps/Sec: 1.32
|
| 76 |
+
[[34m2026-02-01 15:34:51[0m] (step=0006700) Train Loss: 1.8886, Train Steps/Sec: 1.32
|
| 77 |
+
[[34m2026-02-01 15:36:07[0m] (step=0006800) Train Loss: 1.8492, Train Steps/Sec: 1.32
|
| 78 |
+
[[34m2026-02-01 15:37:23[0m] (step=0006900) Train Loss: 2.0008, Train Steps/Sec: 1.32
|
| 79 |
+
[[34m2026-02-01 15:38:39[0m] (step=0007000) Train Loss: 1.9791, Train Steps/Sec: 1.32
|
| 80 |
+
[[34m2026-02-01 15:39:55[0m] (step=0007100) Train Loss: 1.9221, Train Steps/Sec: 1.32
|
| 81 |
+
[[34m2026-02-01 15:41:11[0m] (step=0007200) Train Loss: 1.8893, Train Steps/Sec: 1.32
|
| 82 |
+
[[34m2026-02-01 15:42:27[0m] (step=0007300) Train Loss: 1.8739, Train Steps/Sec: 1.32
|
| 83 |
+
[[34m2026-02-01 15:43:43[0m] (step=0007400) Train Loss: 2.6370, Train Steps/Sec: 1.32
|
| 84 |
+
[[34m2026-02-01 15:44:59[0m] (step=0007500) Train Loss: 2.1859, Train Steps/Sec: 1.32
|
| 85 |
+
[[34m2026-02-01 15:46:15[0m] (step=0007600) Train Loss: 1.8067, Train Steps/Sec: 1.32
|
| 86 |
+
[[34m2026-02-01 15:47:31[0m] (step=0007700) Train Loss: 1.8996, Train Steps/Sec: 1.32
|
| 87 |
+
[[34m2026-02-01 15:48:47[0m] (step=0007800) Train Loss: 1.9468, Train Steps/Sec: 1.32
|
| 88 |
+
[[34m2026-02-01 15:50:03[0m] (step=0007900) Train Loss: 1.8925, Train Steps/Sec: 1.32
|
| 89 |
+
[[34m2026-02-01 15:51:19[0m] (step=0008000) Train Loss: 1.7844, Train Steps/Sec: 1.32
|
| 90 |
+
[[34m2026-02-01 15:52:35[0m] (step=0008100) Train Loss: 1.9823, Train Steps/Sec: 1.32
|
| 91 |
+
[[34m2026-02-01 15:53:51[0m] (step=0008200) Train Loss: 1.9363, Train Steps/Sec: 1.32
|
| 92 |
+
[[34m2026-02-01 15:55:07[0m] (step=0008300) Train Loss: 1.8508, Train Steps/Sec: 1.32
|
| 93 |
+
[[34m2026-02-01 15:56:22[0m] (step=0008400) Train Loss: 1.9048, Train Steps/Sec: 1.32
|
| 94 |
+
[[34m2026-02-01 15:57:38[0m] (step=0008500) Train Loss: 1.8955, Train Steps/Sec: 1.32
|
| 95 |
+
[[34m2026-02-01 15:58:54[0m] (step=0008600) Train Loss: 1.8585, Train Steps/Sec: 1.32
|
| 96 |
+
[[34m2026-02-01 16:00:10[0m] (step=0008700) Train Loss: 1.8621, Train Steps/Sec: 1.32
|
| 97 |
+
[[34m2026-02-01 16:01:26[0m] (step=0008800) Train Loss: 1.8826, Train Steps/Sec: 1.32
|
| 98 |
+
[[34m2026-02-01 16:02:43[0m] (step=0008900) Train Loss: 1.9289, Train Steps/Sec: 1.31
|
| 99 |
+
[[34m2026-02-01 16:03:59[0m] (step=0009000) Train Loss: 1.9667, Train Steps/Sec: 1.32
|
| 100 |
+
[[34m2026-02-01 16:05:15[0m] (step=0009100) Train Loss: 2.1871, Train Steps/Sec: 1.32
|
| 101 |
+
[[34m2026-02-01 16:06:31[0m] (step=0009200) Train Loss: 1.8651, Train Steps/Sec: 1.32
|
| 102 |
+
[[34m2026-02-01 16:07:47[0m] (step=0009300) Train Loss: 1.9620, Train Steps/Sec: 1.32
|
| 103 |
+
[[34m2026-02-01 16:09:03[0m] (step=0009400) Train Loss: 1.8992, Train Steps/Sec: 1.32
|
| 104 |
+
[[34m2026-02-01 16:10:18[0m] (step=0009500) Train Loss: 1.8620, Train Steps/Sec: 1.32
|
| 105 |
+
[[34m2026-02-01 16:11:34[0m] (step=0009600) Train Loss: 1.9782, Train Steps/Sec: 1.32
|
| 106 |
+
[[34m2026-02-01 16:12:50[0m] (step=0009700) Train Loss: 2.3364, Train Steps/Sec: 1.32
|
| 107 |
+
[[34m2026-02-01 16:14:06[0m] (step=0009800) Train Loss: 1.8309, Train Steps/Sec: 1.32
|
| 108 |
+
[[34m2026-02-01 16:15:22[0m] (step=0009900) Train Loss: 2.5777, Train Steps/Sec: 1.32
|
| 109 |
+
[[34m2026-02-01 16:16:38[0m] (step=0010000) Train Loss: 1.9410, Train Steps/Sec: 1.32
|
| 110 |
+
[[34m2026-02-01 16:16:45[0m] Beginning epoch 1...
|
| 111 |
+
[[34m2026-02-01 16:17:56[0m] (step=0010100) Train Loss: 1.8156, Train Steps/Sec: 1.28
|
| 112 |
+
[[34m2026-02-01 16:19:12[0m] (step=0010200) Train Loss: 1.7965, Train Steps/Sec: 1.32
|
| 113 |
+
[[34m2026-02-01 16:20:28[0m] (step=0010300) Train Loss: 1.9732, Train Steps/Sec: 1.32
|
| 114 |
+
[[34m2026-02-01 16:21:44[0m] (step=0010400) Train Loss: 2.6702, Train Steps/Sec: 1.32
|
| 115 |
+
[[34m2026-02-01 16:23:00[0m] (step=0010500) Train Loss: 1.9175, Train Steps/Sec: 1.32
|
| 116 |
+
[[34m2026-02-01 16:24:16[0m] (step=0010600) Train Loss: 1.8493, Train Steps/Sec: 1.32
|
| 117 |
+
[[34m2026-02-01 16:25:32[0m] (step=0010700) Train Loss: 1.8514, Train Steps/Sec: 1.32
|
| 118 |
+
[[34m2026-02-01 16:26:48[0m] (step=0010800) Train Loss: 2.0059, Train Steps/Sec: 1.32
|
| 119 |
+
[[34m2026-02-01 16:28:04[0m] (step=0010900) Train Loss: 1.8519, Train Steps/Sec: 1.32
|
| 120 |
+
[[34m2026-02-01 16:29:20[0m] (step=0011000) Train Loss: 1.8523, Train Steps/Sec: 1.32
|
| 121 |
+
[[34m2026-02-01 16:30:36[0m] (step=0011100) Train Loss: 1.7980, Train Steps/Sec: 1.32
|
| 122 |
+
[[34m2026-02-01 16:31:52[0m] (step=0011200) Train Loss: 1.8429, Train Steps/Sec: 1.32
|
| 123 |
+
[[34m2026-02-01 16:33:08[0m] (step=0011300) Train Loss: 1.9200, Train Steps/Sec: 1.32
|
| 124 |
+
[[34m2026-02-01 16:34:24[0m] (step=0011400) Train Loss: 1.8371, Train Steps/Sec: 1.32
|
| 125 |
+
[[34m2026-02-01 16:35:40[0m] (step=0011500) Train Loss: 2.0173, Train Steps/Sec: 1.32
|
| 126 |
+
[[34m2026-02-01 16:36:56[0m] (step=0011600) Train Loss: 1.8135, Train Steps/Sec: 1.32
|
| 127 |
+
[[34m2026-02-01 16:38:12[0m] (step=0011700) Train Loss: 1.9532, Train Steps/Sec: 1.32
|
| 128 |
+
[[34m2026-02-01 16:39:28[0m] (step=0011800) Train Loss: 2.0043, Train Steps/Sec: 1.32
|
| 129 |
+
[[34m2026-02-01 16:40:44[0m] (step=0011900) Train Loss: 1.8474, Train Steps/Sec: 1.32
|
| 130 |
+
[[34m2026-02-01 16:42:00[0m] (step=0012000) Train Loss: 1.8364, Train Steps/Sec: 1.32
|
| 131 |
+
[[34m2026-02-01 16:43:15[0m] (step=0012100) Train Loss: 2.6696, Train Steps/Sec: 1.32
|
| 132 |
+
[[34m2026-02-01 16:44:31[0m] (step=0012200) Train Loss: 1.8652, Train Steps/Sec: 1.32
|
| 133 |
+
[[34m2026-02-01 16:45:47[0m] (step=0012300) Train Loss: 1.9174, Train Steps/Sec: 1.32
|
| 134 |
+
[[34m2026-02-01 16:47:03[0m] (step=0012400) Train Loss: 1.8479, Train Steps/Sec: 1.31
|
| 135 |
+
[[34m2026-02-01 16:48:19[0m] (step=0012500) Train Loss: 1.8228, Train Steps/Sec: 1.32
|
| 136 |
+
[[34m2026-02-01 16:49:35[0m] (step=0012600) Train Loss: 1.9067, Train Steps/Sec: 1.32
|
| 137 |
+
[[34m2026-02-01 16:50:51[0m] (step=0012700) Train Loss: 1.7572, Train Steps/Sec: 1.32
|
| 138 |
+
[[34m2026-02-01 16:52:07[0m] (step=0012800) Train Loss: 1.8446, Train Steps/Sec: 1.32
|
| 139 |
+
[[34m2026-02-01 16:53:23[0m] (step=0012900) Train Loss: 1.8543, Train Steps/Sec: 1.32
|
| 140 |
+
[[34m2026-02-01 16:54:39[0m] (step=0013000) Train Loss: 1.8222, Train Steps/Sec: 1.32
|
| 141 |
+
[[34m2026-02-01 16:55:55[0m] (step=0013100) Train Loss: 2.0108, Train Steps/Sec: 1.32
|
| 142 |
+
[[34m2026-02-01 16:57:11[0m] (step=0013200) Train Loss: 2.3761, Train Steps/Sec: 1.32
|
| 143 |
+
[[34m2026-02-01 16:58:27[0m] (step=0013300) Train Loss: 1.8902, Train Steps/Sec: 1.32
|
| 144 |
+
[[34m2026-02-01 16:59:43[0m] (step=0013400) Train Loss: 1.8800, Train Steps/Sec: 1.32
|
| 145 |
+
[[34m2026-02-01 17:00:59[0m] (step=0013500) Train Loss: 1.7917, Train Steps/Sec: 1.32
|
| 146 |
+
[[34m2026-02-01 17:02:15[0m] (step=0013600) Train Loss: 1.9730, Train Steps/Sec: 1.32
|
| 147 |
+
[[34m2026-02-01 17:03:31[0m] (step=0013700) Train Loss: 1.8894, Train Steps/Sec: 1.32
|
| 148 |
+
[[34m2026-02-01 17:04:47[0m] (step=0013800) Train Loss: 2.1075, Train Steps/Sec: 1.32
|
| 149 |
+
[[34m2026-02-01 17:06:03[0m] (step=0013900) Train Loss: 1.8469, Train Steps/Sec: 1.32
|
| 150 |
+
[[34m2026-02-01 17:07:19[0m] (step=0014000) Train Loss: 1.8705, Train Steps/Sec: 1.32
|
| 151 |
+
[[34m2026-02-01 17:08:35[0m] (step=0014100) Train Loss: 1.8630, Train Steps/Sec: 1.32
|
| 152 |
+
[[34m2026-02-01 17:09:51[0m] (step=0014200) Train Loss: 1.8509, Train Steps/Sec: 1.32
|
| 153 |
+
[[34m2026-02-01 17:11:07[0m] (step=0014300) Train Loss: 2.2249, Train Steps/Sec: 1.32
|
| 154 |
+
[[34m2026-02-01 17:12:23[0m] (step=0014400) Train Loss: 1.8378, Train Steps/Sec: 1.32
|
| 155 |
+
[[34m2026-02-01 17:13:39[0m] (step=0014500) Train Loss: 1.8106, Train Steps/Sec: 1.32
|
| 156 |
+
[[34m2026-02-01 17:14:55[0m] (step=0014600) Train Loss: 1.8131, Train Steps/Sec: 1.32
|
| 157 |
+
[[34m2026-02-01 17:16:11[0m] (step=0014700) Train Loss: 1.9024, Train Steps/Sec: 1.32
|
| 158 |
+
[[34m2026-02-01 17:17:27[0m] (step=0014800) Train Loss: 2.2030, Train Steps/Sec: 1.32
|
| 159 |
+
[[34m2026-02-01 17:18:42[0m] (step=0014900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 160 |
+
[[34m2026-02-01 17:19:58[0m] (step=0015000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 161 |
+
[[34m2026-02-01 17:21:13[0m] (step=0015100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 162 |
+
[[34m2026-02-01 17:22:28[0m] (step=0015200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 163 |
+
[[34m2026-02-01 17:23:43[0m] (step=0015300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 164 |
+
[[34m2026-02-01 17:24:58[0m] (step=0015400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 165 |
+
[[34m2026-02-01 17:26:13[0m] (step=0015500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 166 |
+
[[34m2026-02-01 17:27:29[0m] (step=0015600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 167 |
+
[[34m2026-02-01 17:28:44[0m] (step=0015700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 168 |
+
[[34m2026-02-01 17:29:59[0m] (step=0015800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 169 |
+
[[34m2026-02-01 17:31:14[0m] (step=0015900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 170 |
+
[[34m2026-02-01 17:32:29[0m] (step=0016000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 171 |
+
[[34m2026-02-01 17:33:44[0m] (step=0016100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 172 |
+
[[34m2026-02-01 17:34:59[0m] (step=0016200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 173 |
+
[[34m2026-02-01 17:36:14[0m] (step=0016300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 174 |
+
[[34m2026-02-01 17:37:29[0m] (step=0016400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 175 |
+
[[34m2026-02-01 17:38:45[0m] (step=0016500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 176 |
+
[[34m2026-02-01 17:40:00[0m] (step=0016600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 177 |
+
[[34m2026-02-01 17:41:15[0m] (step=0016700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 178 |
+
[[34m2026-02-01 17:42:30[0m] (step=0016800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 179 |
+
[[34m2026-02-01 17:43:45[0m] (step=0016900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 180 |
+
[[34m2026-02-01 17:45:00[0m] (step=0017000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 181 |
+
[[34m2026-02-01 17:46:16[0m] (step=0017100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 182 |
+
[[34m2026-02-01 17:47:31[0m] (step=0017200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 183 |
+
[[34m2026-02-01 17:48:46[0m] (step=0017300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 184 |
+
[[34m2026-02-01 17:50:01[0m] (step=0017400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 185 |
+
[[34m2026-02-01 17:51:16[0m] (step=0017500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 186 |
+
[[34m2026-02-01 17:52:31[0m] (step=0017600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 187 |
+
[[34m2026-02-01 17:53:46[0m] (step=0017700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 188 |
+
[[34m2026-02-01 17:55:01[0m] (step=0017800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 189 |
+
[[34m2026-02-01 17:56:16[0m] (step=0017900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 190 |
+
[[34m2026-02-01 17:57:31[0m] (step=0018000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 191 |
+
[[34m2026-02-01 17:58:46[0m] (step=0018100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 192 |
+
[[34m2026-02-01 18:00:02[0m] (step=0018200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 193 |
+
[[34m2026-02-01 18:01:17[0m] (step=0018300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 194 |
+
[[34m2026-02-01 18:02:32[0m] (step=0018400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 195 |
+
[[34m2026-02-01 18:03:47[0m] (step=0018500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 196 |
+
[[34m2026-02-01 18:05:02[0m] (step=0018600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 197 |
+
[[34m2026-02-01 18:06:17[0m] (step=0018700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 198 |
+
[[34m2026-02-01 18:07:32[0m] (step=0018800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 199 |
+
[[34m2026-02-01 18:08:47[0m] (step=0018900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 200 |
+
[[34m2026-02-01 18:10:02[0m] (step=0019000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 201 |
+
[[34m2026-02-01 18:11:17[0m] (step=0019100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 202 |
+
[[34m2026-02-01 18:12:33[0m] (step=0019200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 203 |
+
[[34m2026-02-01 18:13:48[0m] (step=0019300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 204 |
+
[[34m2026-02-01 18:15:03[0m] (step=0019400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 205 |
+
[[34m2026-02-01 18:16:18[0m] (step=0019500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 206 |
+
[[34m2026-02-01 18:17:33[0m] (step=0019600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 207 |
+
[[34m2026-02-01 18:18:48[0m] (step=0019700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 208 |
+
[[34m2026-02-01 18:20:03[0m] (step=0019800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 209 |
+
[[34m2026-02-01 18:21:18[0m] (step=0019900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 210 |
+
[[34m2026-02-01 18:22:34[0m] (step=0020000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 211 |
+
[[34m2026-02-01 18:22:47[0m] Beginning epoch 2...
|
| 212 |
+
[[34m2026-02-01 18:23:51[0m] (step=0020100) Train Loss: nan, Train Steps/Sec: 1.29
|
| 213 |
+
[[34m2026-02-01 18:25:06[0m] (step=0020200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 214 |
+
[[34m2026-02-01 18:26:21[0m] (step=0020300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 215 |
+
[[34m2026-02-01 18:27:36[0m] (step=0020400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 216 |
+
[[34m2026-02-01 18:28:51[0m] (step=0020500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 217 |
+
[[34m2026-02-01 18:30:06[0m] (step=0020600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 218 |
+
[[34m2026-02-01 18:31:21[0m] (step=0020700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 219 |
+
[[34m2026-02-01 18:32:37[0m] (step=0020800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 220 |
+
[[34m2026-02-01 18:33:52[0m] (step=0020900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 221 |
+
[[34m2026-02-01 18:35:07[0m] (step=0021000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 222 |
+
[[34m2026-02-01 18:36:22[0m] (step=0021100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 223 |
+
[[34m2026-02-01 18:37:37[0m] (step=0021200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 224 |
+
[[34m2026-02-01 18:38:52[0m] (step=0021300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 225 |
+
[[34m2026-02-01 18:40:07[0m] (step=0021400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 226 |
+
[[34m2026-02-01 18:41:22[0m] (step=0021500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 227 |
+
[[34m2026-02-01 18:42:37[0m] (step=0021600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 228 |
+
[[34m2026-02-01 18:43:53[0m] (step=0021700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 229 |
+
[[34m2026-02-01 18:45:08[0m] (step=0021800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 230 |
+
[[34m2026-02-01 18:46:23[0m] (step=0021900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 231 |
+
[[34m2026-02-01 18:47:38[0m] (step=0022000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 232 |
+
[[34m2026-02-01 18:48:53[0m] (step=0022100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 233 |
+
[[34m2026-02-01 18:50:08[0m] (step=0022200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 234 |
+
[[34m2026-02-01 18:51:24[0m] (step=0022300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 235 |
+
[[34m2026-02-01 18:52:39[0m] (step=0022400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 236 |
+
[[34m2026-02-01 18:53:54[0m] (step=0022500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 237 |
+
[[34m2026-02-01 18:55:09[0m] (step=0022600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 238 |
+
[[34m2026-02-01 18:56:24[0m] (step=0022700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 239 |
+
[[34m2026-02-01 18:57:39[0m] (step=0022800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 240 |
+
[[34m2026-02-01 18:58:54[0m] (step=0022900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 241 |
+
[[34m2026-02-01 19:00:09[0m] (step=0023000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 242 |
+
[[34m2026-02-01 19:01:24[0m] (step=0023100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 243 |
+
[[34m2026-02-01 19:02:39[0m] (step=0023200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 244 |
+
[[34m2026-02-01 19:03:54[0m] (step=0023300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 245 |
+
[[34m2026-02-01 19:05:09[0m] (step=0023400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 246 |
+
[[34m2026-02-01 19:06:25[0m] (step=0023500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 247 |
+
[[34m2026-02-01 19:07:40[0m] (step=0023600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 248 |
+
[[34m2026-02-01 19:08:55[0m] (step=0023700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 249 |
+
[[34m2026-02-01 19:10:10[0m] (step=0023800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 250 |
+
[[34m2026-02-01 19:11:25[0m] (step=0023900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 251 |
+
[[34m2026-02-01 19:12:40[0m] (step=0024000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 252 |
+
[[34m2026-02-01 19:13:56[0m] (step=0024100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 253 |
+
[[34m2026-02-01 19:15:11[0m] (step=0024200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 254 |
+
[[34m2026-02-01 19:16:26[0m] (step=0024300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 255 |
+
[[34m2026-02-01 19:17:41[0m] (step=0024400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 256 |
+
[[34m2026-02-01 19:18:56[0m] (step=0024500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 257 |
+
[[34m2026-02-01 19:20:11[0m] (step=0024600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 258 |
+
[[34m2026-02-01 19:21:26[0m] (step=0024700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 259 |
+
[[34m2026-02-01 19:22:41[0m] (step=0024800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 260 |
+
[[34m2026-02-01 19:23:56[0m] (step=0024900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 261 |
+
[[34m2026-02-01 19:25:11[0m] (step=0025000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 262 |
+
25000
|
| 263 |
+
[[34m2026-02-01 19:25:12[0m] Saved checkpoint to results_256_vp/depth-mu-2-000-SiT-XL-2-VP-velocity-None/checkpoints/0025000.pt
|
| 264 |
+
[[34m2026-02-01 19:26:27[0m] (step=0025100) Train Loss: nan, Train Steps/Sec: 1.32
|
| 265 |
+
[[34m2026-02-01 19:27:36[0m] Generating EMA samples...
|
| 266 |
+
[[34m2026-02-01 19:27:42[0m] (step=0025200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 267 |
+
[[34m2026-02-01 19:28:57[0m] (step=0025300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 268 |
+
[[34m2026-02-01 19:30:13[0m] (step=0025400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 269 |
+
[[34m2026-02-01 19:31:28[0m] (step=0025500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 270 |
+
[[34m2026-02-01 19:32:43[0m] (step=0025600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 271 |
+
[[34m2026-02-01 19:33:58[0m] (step=0025700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 272 |
+
[[34m2026-02-01 19:35:13[0m] (step=0025800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 273 |
+
[[34m2026-02-01 19:36:28[0m] (step=0025900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 274 |
+
[[34m2026-02-01 19:37:44[0m] (step=0026000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 275 |
+
[[34m2026-02-01 19:38:59[0m] (step=0026100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 276 |
+
[[34m2026-02-01 19:40:14[0m] (step=0026200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 277 |
+
[[34m2026-02-01 19:41:29[0m] (step=0026300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 278 |
+
[[34m2026-02-01 19:42:44[0m] (step=0026400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 279 |
+
[[34m2026-02-01 19:43:59[0m] (step=0026500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 280 |
+
[[34m2026-02-01 19:45:14[0m] (step=0026600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 281 |
+
[[34m2026-02-01 19:46:29[0m] (step=0026700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 282 |
+
[[34m2026-02-01 19:47:44[0m] (step=0026800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 283 |
+
[[34m2026-02-01 19:49:00[0m] (step=0026900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 284 |
+
[[34m2026-02-01 19:50:15[0m] (step=0027000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 285 |
+
[[34m2026-02-01 19:51:30[0m] (step=0027100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 286 |
+
[[34m2026-02-01 19:52:45[0m] (step=0027200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 287 |
+
[[34m2026-02-01 19:54:00[0m] (step=0027300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 288 |
+
[[34m2026-02-01 19:55:15[0m] (step=0027400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 289 |
+
[[34m2026-02-01 19:56:30[0m] (step=0027500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 290 |
+
[[34m2026-02-01 19:57:45[0m] (step=0027600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 291 |
+
[[34m2026-02-01 19:59:00[0m] (step=0027700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 292 |
+
[[34m2026-02-01 20:00:15[0m] (step=0027800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 293 |
+
[[34m2026-02-01 20:01:31[0m] (step=0027900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 294 |
+
[[34m2026-02-01 20:02:46[0m] (step=0028000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 295 |
+
[[34m2026-02-01 20:04:01[0m] (step=0028100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 296 |
+
[[34m2026-02-01 20:05:16[0m] (step=0028200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 297 |
+
[[34m2026-02-01 20:06:31[0m] (step=0028300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 298 |
+
[[34m2026-02-01 20:07:46[0m] (step=0028400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 299 |
+
[[34m2026-02-01 20:09:02[0m] (step=0028500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 300 |
+
[[34m2026-02-01 20:10:17[0m] (step=0028600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 301 |
+
[[34m2026-02-01 20:11:32[0m] (step=0028700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 302 |
+
[[34m2026-02-01 20:12:47[0m] (step=0028800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 303 |
+
[[34m2026-02-01 20:14:02[0m] (step=0028900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 304 |
+
[[34m2026-02-01 20:15:17[0m] (step=0029000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 305 |
+
[[34m2026-02-01 20:16:32[0m] (step=0029100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 306 |
+
[[34m2026-02-01 20:17:47[0m] (step=0029200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 307 |
+
[[34m2026-02-01 20:19:02[0m] (step=0029300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 308 |
+
[[34m2026-02-01 20:20:18[0m] (step=0029400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 309 |
+
[[34m2026-02-01 20:21:33[0m] (step=0029500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 310 |
+
[[34m2026-02-01 20:22:48[0m] (step=0029600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 311 |
+
[[34m2026-02-01 20:24:03[0m] (step=0029700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 312 |
+
[[34m2026-02-01 20:25:18[0m] (step=0029800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 313 |
+
[[34m2026-02-01 20:26:33[0m] (step=0029900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 314 |
+
[[34m2026-02-01 20:27:49[0m] (step=0030000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 315 |
+
[[34m2026-02-01 20:28:09[0m] Beginning epoch 3...
|
| 316 |
+
[[34m2026-02-01 20:29:06[0m] (step=0030100) Train Loss: nan, Train Steps/Sec: 1.29
|
| 317 |
+
[[34m2026-02-01 20:30:21[0m] (step=0030200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 318 |
+
[[34m2026-02-01 20:31:36[0m] (step=0030300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 319 |
+
[[34m2026-02-01 20:32:51[0m] (step=0030400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 320 |
+
[[34m2026-02-01 20:34:06[0m] (step=0030500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 321 |
+
[[34m2026-02-01 20:35:22[0m] (step=0030600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 322 |
+
[[34m2026-02-01 20:36:37[0m] (step=0030700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 323 |
+
[[34m2026-02-01 20:37:52[0m] (step=0030800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 324 |
+
[[34m2026-02-01 20:39:07[0m] (step=0030900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 325 |
+
[[34m2026-02-01 20:40:22[0m] (step=0031000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 326 |
+
[[34m2026-02-01 20:41:37[0m] (step=0031100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 327 |
+
[[34m2026-02-01 20:42:52[0m] (step=0031200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 328 |
+
[[34m2026-02-01 20:44:08[0m] (step=0031300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 329 |
+
[[34m2026-02-01 20:45:23[0m] (step=0031400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 330 |
+
[[34m2026-02-01 20:46:38[0m] (step=0031500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 331 |
+
[[34m2026-02-01 20:47:53[0m] (step=0031600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 332 |
+
[[34m2026-02-01 20:49:08[0m] (step=0031700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 333 |
+
[[34m2026-02-01 20:50:23[0m] (step=0031800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 334 |
+
[[34m2026-02-01 20:51:38[0m] (step=0031900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 335 |
+
[[34m2026-02-01 20:52:54[0m] (step=0032000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 336 |
+
[[34m2026-02-01 20:54:09[0m] (step=0032100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 337 |
+
[[34m2026-02-01 20:55:24[0m] (step=0032200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 338 |
+
[[34m2026-02-01 20:56:39[0m] (step=0032300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 339 |
+
[[34m2026-02-01 20:57:54[0m] (step=0032400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 340 |
+
[[34m2026-02-01 20:59:09[0m] (step=0032500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 341 |
+
[[34m2026-02-01 21:00:24[0m] (step=0032600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 342 |
+
[[34m2026-02-01 21:01:39[0m] (step=0032700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 343 |
+
[[34m2026-02-01 21:02:54[0m] (step=0032800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 344 |
+
[[34m2026-02-01 21:04:09[0m] (step=0032900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 345 |
+
[[34m2026-02-01 21:05:25[0m] (step=0033000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 346 |
+
[[34m2026-02-01 21:06:40[0m] (step=0033100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 347 |
+
[[34m2026-02-01 21:07:55[0m] (step=0033200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 348 |
+
[[34m2026-02-01 21:09:10[0m] (step=0033300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 349 |
+
[[34m2026-02-01 21:10:25[0m] (step=0033400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 350 |
+
[[34m2026-02-01 21:11:40[0m] (step=0033500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 351 |
+
[[34m2026-02-01 21:12:56[0m] (step=0033600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 352 |
+
[[34m2026-02-01 21:14:11[0m] (step=0033700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 353 |
+
[[34m2026-02-01 21:15:26[0m] (step=0033800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 354 |
+
[[34m2026-02-01 21:16:41[0m] (step=0033900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 355 |
+
[[34m2026-02-01 21:17:56[0m] (step=0034000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 356 |
+
[[34m2026-02-01 21:19:11[0m] (step=0034100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 357 |
+
[[34m2026-02-01 21:20:27[0m] (step=0034200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 358 |
+
[[34m2026-02-01 21:21:42[0m] (step=0034300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 359 |
+
[[34m2026-02-01 21:22:57[0m] (step=0034400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 360 |
+
[[34m2026-02-01 21:24:12[0m] (step=0034500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 361 |
+
[[34m2026-02-01 21:25:27[0m] (step=0034600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 362 |
+
[[34m2026-02-01 21:26:42[0m] (step=0034700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 363 |
+
[[34m2026-02-01 21:27:57[0m] (step=0034800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 364 |
+
[[34m2026-02-01 21:29:12[0m] (step=0034900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 365 |
+
[[34m2026-02-01 21:30:28[0m] (step=0035000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 366 |
+
[[34m2026-02-01 21:31:43[0m] (step=0035100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 367 |
+
[[34m2026-02-01 21:32:58[0m] (step=0035200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 368 |
+
[[34m2026-02-01 21:34:13[0m] (step=0035300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 369 |
+
[[34m2026-02-01 21:35:28[0m] (step=0035400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 370 |
+
[[34m2026-02-01 21:36:43[0m] (step=0035500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 371 |
+
[[34m2026-02-01 21:37:58[0m] (step=0035600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 372 |
+
[[34m2026-02-01 21:39:13[0m] (step=0035700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 373 |
+
[[34m2026-02-01 21:40:29[0m] (step=0035800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 374 |
+
[[34m2026-02-01 21:41:44[0m] (step=0035900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 375 |
+
[[34m2026-02-01 21:42:59[0m] (step=0036000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 376 |
+
[[34m2026-02-01 21:44:14[0m] (step=0036100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 377 |
+
[[34m2026-02-01 21:45:29[0m] (step=0036200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 378 |
+
[[34m2026-02-01 21:46:44[0m] (step=0036300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 379 |
+
[[34m2026-02-01 21:48:00[0m] (step=0036400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 380 |
+
[[34m2026-02-01 21:49:15[0m] (step=0036500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 381 |
+
[[34m2026-02-01 21:50:30[0m] (step=0036600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 382 |
+
[[34m2026-02-01 21:51:45[0m] (step=0036700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 383 |
+
[[34m2026-02-01 21:53:00[0m] (step=0036800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 384 |
+
[[34m2026-02-01 21:54:15[0m] (step=0036900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 385 |
+
[[34m2026-02-01 21:55:31[0m] (step=0037000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 386 |
+
[[34m2026-02-01 21:56:46[0m] (step=0037100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 387 |
+
[[34m2026-02-01 21:58:01[0m] (step=0037200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 388 |
+
[[34m2026-02-01 21:59:16[0m] (step=0037300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 389 |
+
[[34m2026-02-01 22:00:31[0m] (step=0037400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 390 |
+
[[34m2026-02-01 22:01:46[0m] (step=0037500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 391 |
+
[[34m2026-02-01 22:03:01[0m] (step=0037600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 392 |
+
[[34m2026-02-01 22:04:17[0m] (step=0037700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 393 |
+
[[34m2026-02-01 22:05:32[0m] (step=0037800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 394 |
+
[[34m2026-02-01 22:06:47[0m] (step=0037900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 395 |
+
[[34m2026-02-01 22:08:02[0m] (step=0038000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 396 |
+
[[34m2026-02-01 22:09:17[0m] (step=0038100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 397 |
+
[[34m2026-02-01 22:10:32[0m] (step=0038200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 398 |
+
[[34m2026-02-01 22:11:47[0m] (step=0038300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 399 |
+
[[34m2026-02-01 22:13:02[0m] (step=0038400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 400 |
+
[[34m2026-02-01 22:14:18[0m] (step=0038500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 401 |
+
[[34m2026-02-01 22:15:33[0m] (step=0038600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 402 |
+
[[34m2026-02-01 22:16:48[0m] (step=0038700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 403 |
+
[[34m2026-02-01 22:18:03[0m] (step=0038800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 404 |
+
[[34m2026-02-01 22:19:18[0m] (step=0038900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 405 |
+
[[34m2026-02-01 22:20:33[0m] (step=0039000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 406 |
+
[[34m2026-02-01 22:21:48[0m] (step=0039100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 407 |
+
[[34m2026-02-01 22:23:04[0m] (step=0039200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 408 |
+
[[34m2026-02-01 22:24:19[0m] (step=0039300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 409 |
+
[[34m2026-02-01 22:25:34[0m] (step=0039400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 410 |
+
[[34m2026-02-01 22:26:49[0m] (step=0039500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 411 |
+
[[34m2026-02-01 22:28:04[0m] (step=0039600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 412 |
+
[[34m2026-02-01 22:29:19[0m] (step=0039700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 413 |
+
[[34m2026-02-01 22:30:34[0m] (step=0039800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 414 |
+
[[34m2026-02-01 22:31:49[0m] (step=0039900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 415 |
+
[[34m2026-02-01 22:33:04[0m] (step=0040000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 416 |
+
[[34m2026-02-01 22:33:32[0m] Beginning epoch 4...
|
| 417 |
+
[[34m2026-02-01 22:34:22[0m] (step=0040100) Train Loss: nan, Train Steps/Sec: 1.29
|
| 418 |
+
[[34m2026-02-01 22:35:37[0m] (step=0040200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 419 |
+
[[34m2026-02-01 22:36:52[0m] (step=0040300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 420 |
+
[[34m2026-02-01 22:38:07[0m] (step=0040400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 421 |
+
[[34m2026-02-01 22:39:22[0m] (step=0040500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 422 |
+
[[34m2026-02-01 22:40:37[0m] (step=0040600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 423 |
+
[[34m2026-02-01 22:41:52[0m] (step=0040700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 424 |
+
[[34m2026-02-01 22:43:08[0m] (step=0040800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 425 |
+
[[34m2026-02-01 22:44:23[0m] (step=0040900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 426 |
+
[[34m2026-02-01 22:45:38[0m] (step=0041000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 427 |
+
[[34m2026-02-01 22:46:53[0m] (step=0041100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 428 |
+
[[34m2026-02-01 22:48:08[0m] (step=0041200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 429 |
+
[[34m2026-02-01 22:49:23[0m] (step=0041300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 430 |
+
[[34m2026-02-01 22:50:39[0m] (step=0041400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 431 |
+
[[34m2026-02-01 22:51:54[0m] (step=0041500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 432 |
+
[[34m2026-02-01 22:53:09[0m] (step=0041600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 433 |
+
[[34m2026-02-01 22:54:24[0m] (step=0041700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 434 |
+
[[34m2026-02-01 22:55:39[0m] (step=0041800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 435 |
+
[[34m2026-02-01 22:56:54[0m] (step=0041900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 436 |
+
[[34m2026-02-01 22:58:09[0m] (step=0042000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 437 |
+
[[34m2026-02-01 22:59:24[0m] (step=0042100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 438 |
+
[[34m2026-02-01 23:00:40[0m] (step=0042200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 439 |
+
[[34m2026-02-01 23:01:55[0m] (step=0042300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 440 |
+
[[34m2026-02-01 23:03:10[0m] (step=0042400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 441 |
+
[[34m2026-02-01 23:04:25[0m] (step=0042500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 442 |
+
[[34m2026-02-01 23:05:40[0m] (step=0042600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 443 |
+
[[34m2026-02-01 23:06:55[0m] (step=0042700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 444 |
+
[[34m2026-02-01 23:08:10[0m] (step=0042800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 445 |
+
[[34m2026-02-01 23:09:26[0m] (step=0042900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 446 |
+
[[34m2026-02-01 23:10:41[0m] (step=0043000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 447 |
+
[[34m2026-02-01 23:11:56[0m] (step=0043100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 448 |
+
[[34m2026-02-01 23:13:11[0m] (step=0043200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 449 |
+
[[34m2026-02-01 23:14:26[0m] (step=0043300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 450 |
+
[[34m2026-02-01 23:15:41[0m] (step=0043400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 451 |
+
[[34m2026-02-01 23:16:56[0m] (step=0043500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 452 |
+
[[34m2026-02-01 23:18:11[0m] (step=0043600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 453 |
+
[[34m2026-02-01 23:19:27[0m] (step=0043700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 454 |
+
[[34m2026-02-01 23:20:42[0m] (step=0043800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 455 |
+
[[34m2026-02-01 23:21:57[0m] (step=0043900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 456 |
+
[[34m2026-02-01 23:23:12[0m] (step=0044000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 457 |
+
[[34m2026-02-01 23:24:27[0m] (step=0044100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 458 |
+
[[34m2026-02-01 23:25:42[0m] (step=0044200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 459 |
+
[[34m2026-02-01 23:26:57[0m] (step=0044300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 460 |
+
[[34m2026-02-01 23:28:12[0m] (step=0044400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 461 |
+
[[34m2026-02-01 23:29:28[0m] (step=0044500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 462 |
+
[[34m2026-02-01 23:30:43[0m] (step=0044600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 463 |
+
[[34m2026-02-01 23:31:58[0m] (step=0044700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 464 |
+
[[34m2026-02-01 23:33:13[0m] (step=0044800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 465 |
+
[[34m2026-02-01 23:34:28[0m] (step=0044900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 466 |
+
[[34m2026-02-01 23:35:43[0m] (step=0045000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 467 |
+
[[34m2026-02-01 23:36:58[0m] (step=0045100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 468 |
+
[[34m2026-02-01 23:38:13[0m] (step=0045200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 469 |
+
[[34m2026-02-01 23:39:28[0m] (step=0045300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 470 |
+
[[34m2026-02-01 23:40:44[0m] (step=0045400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 471 |
+
[[34m2026-02-01 23:41:59[0m] (step=0045500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 472 |
+
[[34m2026-02-01 23:43:14[0m] (step=0045600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 473 |
+
[[34m2026-02-01 23:44:29[0m] (step=0045700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 474 |
+
[[34m2026-02-01 23:45:44[0m] (step=0045800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 475 |
+
[[34m2026-02-01 23:46:59[0m] (step=0045900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 476 |
+
[[34m2026-02-01 23:48:14[0m] (step=0046000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 477 |
+
[[34m2026-02-01 23:49:29[0m] (step=0046100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 478 |
+
[[34m2026-02-01 23:50:45[0m] (step=0046200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 479 |
+
[[34m2026-02-01 23:52:00[0m] (step=0046300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 480 |
+
[[34m2026-02-01 23:53:15[0m] (step=0046400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 481 |
+
[[34m2026-02-01 23:54:30[0m] (step=0046500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 482 |
+
[[34m2026-02-01 23:55:45[0m] (step=0046600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 483 |
+
[[34m2026-02-01 23:57:00[0m] (step=0046700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 484 |
+
[[34m2026-02-01 23:58:15[0m] (step=0046800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 485 |
+
[[34m2026-02-01 23:59:31[0m] (step=0046900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 486 |
+
[[34m2026-02-02 00:00:46[0m] (step=0047000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 487 |
+
[[34m2026-02-02 00:02:01[0m] (step=0047100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 488 |
+
[[34m2026-02-02 00:03:16[0m] (step=0047200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 489 |
+
[[34m2026-02-02 00:04:31[0m] (step=0047300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 490 |
+
[[34m2026-02-02 00:05:46[0m] (step=0047400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 491 |
+
[[34m2026-02-02 00:07:01[0m] (step=0047500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 492 |
+
[[34m2026-02-02 00:08:16[0m] (step=0047600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 493 |
+
[[34m2026-02-02 00:09:32[0m] (step=0047700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 494 |
+
[[34m2026-02-02 00:10:47[0m] (step=0047800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 495 |
+
[[34m2026-02-02 00:12:02[0m] (step=0047900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 496 |
+
[[34m2026-02-02 00:13:17[0m] (step=0048000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 497 |
+
[[34m2026-02-02 00:14:32[0m] (step=0048100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 498 |
+
[[34m2026-02-02 00:15:47[0m] (step=0048200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 499 |
+
[[34m2026-02-02 00:17:02[0m] (step=0048300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 500 |
+
[[34m2026-02-02 00:18:17[0m] (step=0048400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 501 |
+
[[34m2026-02-02 00:19:32[0m] (step=0048500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 502 |
+
[[34m2026-02-02 00:20:48[0m] (step=0048600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 503 |
+
[[34m2026-02-02 00:22:03[0m] (step=0048700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 504 |
+
[[34m2026-02-02 00:23:18[0m] (step=0048800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 505 |
+
[[34m2026-02-02 00:24:33[0m] (step=0048900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 506 |
+
[[34m2026-02-02 00:25:48[0m] (step=0049000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 507 |
+
[[34m2026-02-02 00:27:03[0m] (step=0049100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 508 |
+
[[34m2026-02-02 00:28:18[0m] (step=0049200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 509 |
+
[[34m2026-02-02 00:29:33[0m] (step=0049300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 510 |
+
[[34m2026-02-02 00:30:48[0m] (step=0049400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 511 |
+
[[34m2026-02-02 00:32:04[0m] (step=0049500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 512 |
+
[[34m2026-02-02 00:33:19[0m] (step=0049600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 513 |
+
[[34m2026-02-02 00:34:34[0m] (step=0049700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 514 |
+
[[34m2026-02-02 00:35:49[0m] (step=0049800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 515 |
+
[[34m2026-02-02 00:37:04[0m] (step=0049900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 516 |
+
[[34m2026-02-02 00:38:19[0m] (step=0050000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 517 |
+
50000
|
| 518 |
+
[[34m2026-02-02 00:38:20[0m] Saved checkpoint to results_256_vp/depth-mu-2-000-SiT-XL-2-VP-velocity-None/checkpoints/0050000.pt
|
| 519 |
+
[[34m2026-02-02 00:38:54[0m] Beginning epoch 5...
|
| 520 |
+
[[34m2026-02-02 00:39:37[0m] (step=0050100) Train Loss: nan, Train Steps/Sec: 1.28
|
| 521 |
+
[[34m2026-02-02 00:40:53[0m] (step=0050200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 522 |
+
[[34m2026-02-02 00:42:08[0m] (step=0050300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 523 |
+
[[34m2026-02-02 00:43:11[0m] Generating EMA samples...
|
| 524 |
+
[[34m2026-02-02 00:43:23[0m] (step=0050400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 525 |
+
[[34m2026-02-02 00:44:38[0m] (step=0050500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 526 |
+
[[34m2026-02-02 00:45:53[0m] (step=0050600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 527 |
+
[[34m2026-02-02 00:47:08[0m] (step=0050700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 528 |
+
[[34m2026-02-02 00:48:23[0m] (step=0050800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 529 |
+
[[34m2026-02-02 00:49:38[0m] (step=0050900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 530 |
+
[[34m2026-02-02 00:50:53[0m] (step=0051000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 531 |
+
[[34m2026-02-02 00:52:09[0m] (step=0051100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 532 |
+
[[34m2026-02-02 00:53:24[0m] (step=0051200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 533 |
+
[[34m2026-02-02 00:54:39[0m] (step=0051300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 534 |
+
[[34m2026-02-02 00:55:54[0m] (step=0051400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 535 |
+
[[34m2026-02-02 00:57:09[0m] (step=0051500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 536 |
+
[[34m2026-02-02 00:58:24[0m] (step=0051600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 537 |
+
[[34m2026-02-02 00:59:39[0m] (step=0051700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 538 |
+
[[34m2026-02-02 01:00:54[0m] (step=0051800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 539 |
+
[[34m2026-02-02 01:02:10[0m] (step=0051900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 540 |
+
[[34m2026-02-02 01:03:25[0m] (step=0052000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 541 |
+
[[34m2026-02-02 01:04:40[0m] (step=0052100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 542 |
+
[[34m2026-02-02 01:05:55[0m] (step=0052200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 543 |
+
[[34m2026-02-02 01:07:10[0m] (step=0052300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 544 |
+
[[34m2026-02-02 01:08:25[0m] (step=0052400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 545 |
+
[[34m2026-02-02 01:09:41[0m] (step=0052500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 546 |
+
[[34m2026-02-02 01:10:56[0m] (step=0052600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 547 |
+
[[34m2026-02-02 01:12:11[0m] (step=0052700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 548 |
+
[[34m2026-02-02 01:13:26[0m] (step=0052800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 549 |
+
[[34m2026-02-02 01:14:41[0m] (step=0052900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 550 |
+
[[34m2026-02-02 01:15:57[0m] (step=0053000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 551 |
+
[[34m2026-02-02 01:17:12[0m] (step=0053100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 552 |
+
[[34m2026-02-02 01:18:27[0m] (step=0053200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 553 |
+
[[34m2026-02-02 01:19:42[0m] (step=0053300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 554 |
+
[[34m2026-02-02 01:20:57[0m] (step=0053400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 555 |
+
[[34m2026-02-02 01:22:13[0m] (step=0053500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 556 |
+
[[34m2026-02-02 01:23:28[0m] (step=0053600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 557 |
+
[[34m2026-02-02 01:24:43[0m] (step=0053700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 558 |
+
[[34m2026-02-02 01:25:58[0m] (step=0053800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 559 |
+
[[34m2026-02-02 01:27:13[0m] (step=0053900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 560 |
+
[[34m2026-02-02 01:28:28[0m] (step=0054000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 561 |
+
[[34m2026-02-02 01:29:44[0m] (step=0054100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 562 |
+
[[34m2026-02-02 01:30:59[0m] (step=0054200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 563 |
+
[[34m2026-02-02 01:32:14[0m] (step=0054300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 564 |
+
[[34m2026-02-02 01:33:29[0m] (step=0054400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 565 |
+
[[34m2026-02-02 01:34:44[0m] (step=0054500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 566 |
+
[[34m2026-02-02 01:35:59[0m] (step=0054600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 567 |
+
[[34m2026-02-02 01:37:15[0m] (step=0054700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 568 |
+
[[34m2026-02-02 01:38:30[0m] (step=0054800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 569 |
+
[[34m2026-02-02 01:39:45[0m] (step=0054900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 570 |
+
[[34m2026-02-02 01:41:00[0m] (step=0055000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 571 |
+
[[34m2026-02-02 01:42:15[0m] (step=0055100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 572 |
+
[[34m2026-02-02 01:43:30[0m] (step=0055200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 573 |
+
[[34m2026-02-02 01:44:46[0m] (step=0055300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 574 |
+
[[34m2026-02-02 01:46:01[0m] (step=0055400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 575 |
+
[[34m2026-02-02 01:47:16[0m] (step=0055500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 576 |
+
[[34m2026-02-02 01:48:31[0m] (step=0055600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 577 |
+
[[34m2026-02-02 01:49:46[0m] (step=0055700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 578 |
+
[[34m2026-02-02 01:51:02[0m] (step=0055800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 579 |
+
[[34m2026-02-02 01:52:17[0m] (step=0055900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 580 |
+
[[34m2026-02-02 01:53:32[0m] (step=0056000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 581 |
+
[[34m2026-02-02 01:54:47[0m] (step=0056100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 582 |
+
[[34m2026-02-02 01:56:02[0m] (step=0056200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 583 |
+
[[34m2026-02-02 01:57:17[0m] (step=0056300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 584 |
+
[[34m2026-02-02 01:58:32[0m] (step=0056400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 585 |
+
[[34m2026-02-02 01:59:48[0m] (step=0056500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 586 |
+
[[34m2026-02-02 02:01:03[0m] (step=0056600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 587 |
+
[[34m2026-02-02 02:02:18[0m] (step=0056700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 588 |
+
[[34m2026-02-02 02:03:33[0m] (step=0056800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 589 |
+
[[34m2026-02-02 02:04:48[0m] (step=0056900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 590 |
+
[[34m2026-02-02 02:06:04[0m] (step=0057000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 591 |
+
[[34m2026-02-02 02:07:19[0m] (step=0057100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 592 |
+
[[34m2026-02-02 02:08:34[0m] (step=0057200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 593 |
+
[[34m2026-02-02 02:09:49[0m] (step=0057300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 594 |
+
[[34m2026-02-02 02:11:04[0m] (step=0057400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 595 |
+
[[34m2026-02-02 02:12:19[0m] (step=0057500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 596 |
+
[[34m2026-02-02 02:13:35[0m] (step=0057600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 597 |
+
[[34m2026-02-02 02:14:50[0m] (step=0057700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 598 |
+
[[34m2026-02-02 02:16:05[0m] (step=0057800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 599 |
+
[[34m2026-02-02 02:17:20[0m] (step=0057900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 600 |
+
[[34m2026-02-02 02:18:35[0m] (step=0058000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 601 |
+
[[34m2026-02-02 02:19:50[0m] (step=0058100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 602 |
+
[[34m2026-02-02 02:21:05[0m] (step=0058200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 603 |
+
[[34m2026-02-02 02:22:21[0m] (step=0058300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 604 |
+
[[34m2026-02-02 02:23:36[0m] (step=0058400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 605 |
+
[[34m2026-02-02 02:24:51[0m] (step=0058500) Train Loss: nan, Train Steps/Sec: 1.33
|
| 606 |
+
[[34m2026-02-02 02:26:06[0m] (step=0058600) Train Loss: nan, Train Steps/Sec: 1.33
|
| 607 |
+
[[34m2026-02-02 02:27:21[0m] (step=0058700) Train Loss: nan, Train Steps/Sec: 1.33
|
| 608 |
+
[[34m2026-02-02 02:28:36[0m] (step=0058800) Train Loss: nan, Train Steps/Sec: 1.33
|
| 609 |
+
[[34m2026-02-02 02:29:51[0m] (step=0058900) Train Loss: nan, Train Steps/Sec: 1.33
|
| 610 |
+
[[34m2026-02-02 02:31:06[0m] (step=0059000) Train Loss: nan, Train Steps/Sec: 1.33
|
| 611 |
+
[[34m2026-02-02 02:32:21[0m] (step=0059100) Train Loss: nan, Train Steps/Sec: 1.33
|
| 612 |
+
[[34m2026-02-02 02:33:36[0m] (step=0059200) Train Loss: nan, Train Steps/Sec: 1.33
|
| 613 |
+
[[34m2026-02-02 02:34:51[0m] (step=0059300) Train Loss: nan, Train Steps/Sec: 1.33
|
| 614 |
+
[[34m2026-02-02 02:36:07[0m] (step=0059400) Train Loss: nan, Train Steps/Sec: 1.33
|
| 615 |
+
[[34m2026-02-02 02:38:31[0m] (step=0059500) Train Loss: nan, Train Steps/Sec: 0.69
|
| 616 |
+
[[34m2026-02-02 02:41:15[0m] (step=0059600) Train Loss: nan, Train Steps/Sec: 0.61
|
| 617 |
+
[[34m2026-02-02 02:43:58[0m] (step=0059700) Train Loss: nan, Train Steps/Sec: 0.61
|
| 618 |
+
[[34m2026-02-02 02:46:41[0m] (step=0059800) Train Loss: nan, Train Steps/Sec: 0.61
|
| 619 |
+
[[34m2026-02-02 02:49:24[0m] (step=0059900) Train Loss: nan, Train Steps/Sec: 0.61
|
| 620 |
+
[[34m2026-02-02 02:52:09[0m] (step=0060000) Train Loss: nan, Train Steps/Sec: 0.61
|
| 621 |
+
[[34m2026-02-02 02:53:37[0m] Beginning epoch 6...
|
| 622 |
+
[[34m2026-02-02 02:54:54[0m] (step=0060100) Train Loss: nan, Train Steps/Sec: 0.61
|
| 623 |
+
[[34m2026-02-02 02:57:37[0m] (step=0060200) Train Loss: nan, Train Steps/Sec: 0.61
|
| 624 |
+
[[34m2026-02-02 03:00:21[0m] (step=0060300) Train Loss: nan, Train Steps/Sec: 0.61
|
| 625 |
+
[[34m2026-02-02 03:03:04[0m] (step=0060400) Train Loss: nan, Train Steps/Sec: 0.61
|
| 626 |
+
[[34m2026-02-02 03:05:46[0m] (step=0060500) Train Loss: nan, Train Steps/Sec: 0.62
|
| 627 |
+
[[34m2026-02-02 03:08:31[0m] (step=0060600) Train Loss: nan, Train Steps/Sec: 0.61
|
| 628 |
+
[[34m2026-02-02 03:11:14[0m] (step=0060700) Train Loss: nan, Train Steps/Sec: 0.61
|
Rectified_Noise/GVP-Disp/权重类型分析.md
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 损失函数权重类型分析
|
| 2 |
+
|
| 3 |
+
## 代码位置
|
| 4 |
+
`transport/transport.py` 第150-156行
|
| 5 |
+
|
| 6 |
+
## 三种权重类型
|
| 7 |
+
|
| 8 |
+
### 1. WeightType.NONE
|
| 9 |
+
```python
|
| 10 |
+
weight = 1
|
| 11 |
+
```
|
| 12 |
+
**特点:**
|
| 13 |
+
- 均匀权重,所有时间步的损失贡献相同
|
| 14 |
+
- 最简单的权重策略
|
| 15 |
+
|
| 16 |
+
**影响:**
|
| 17 |
+
- 对训练过程的影响:所有时间步 t 的损失被同等对待
|
| 18 |
+
- 优点:实现简单,训练稳定
|
| 19 |
+
- 缺点:可能忽略不同时间步的重要性差异
|
| 20 |
+
|
| 21 |
+
### 2. WeightType.VELOCITY
|
| 22 |
+
```python
|
| 23 |
+
weight = (drift_var / sigma_t) ** 2
|
| 24 |
+
```
|
| 25 |
+
**特点:**
|
| 26 |
+
- 权重与 `(drift_var / sigma_t)²` 成正比
|
| 27 |
+
- `drift_var` 是扩散系数(diffusion coefficient)
|
| 28 |
+
- `sigma_t` 是噪声系数(noise coefficient)
|
| 29 |
+
|
| 30 |
+
**数学含义:**
|
| 31 |
+
- 对于线性路径(ICPlan):`sigma_t = 1 - t`,`drift_var` 是扩散项
|
| 32 |
+
- 权重 = `(扩散系数 / 噪声系数)²`
|
| 33 |
+
- 当 `sigma_t` 较小时(接近 t=1,噪声少),权重较大
|
| 34 |
+
- 当 `sigma_t` 较大时(接近 t=0,噪声多),权重较小
|
| 35 |
+
|
| 36 |
+
**影响:**
|
| 37 |
+
- **强调后期时间步**:在去噪过程的后期(t接近1,噪声少)给予更高权重
|
| 38 |
+
- **训练重点**:模型在低噪声区域的预测精度更重要
|
| 39 |
+
- **适用场景**:当最终生成质量(低噪声区域)是关键时
|
| 40 |
+
|
| 41 |
+
**时间依赖行为:**
|
| 42 |
+
- t → 0(高噪声):`sigma_t` 大 → 权重小
|
| 43 |
+
- t → 1(低噪声):`sigma_t` 小 → 权重大
|
| 44 |
+
|
| 45 |
+
### 3. WeightType.LIKELIHOOD
|
| 46 |
+
```python
|
| 47 |
+
weight = drift_var / (sigma_t ** 2)
|
| 48 |
+
```
|
| 49 |
+
**特点:**
|
| 50 |
+
- 权重与 `drift_var / sigma_t²` 成正比
|
| 51 |
+
- 相比 VELOCITY 权重,分母是 `sigma_t²` 而不是 `sigma_t`
|
| 52 |
+
|
| 53 |
+
**数学含义:**
|
| 54 |
+
- 权重 = `扩散系数 / 噪声系数²`
|
| 55 |
+
- 当 `sigma_t` 较小时,`sigma_t²` 更小,权重更大
|
| 56 |
+
- 当 `sigma_t` 较大时,`sigma_t²` 更大,权重更小
|
| 57 |
+
|
| 58 |
+
**影响:**
|
| 59 |
+
- **更强烈地强调后期时间步**:相比 VELOCITY,对低噪声区域的权重更大
|
| 60 |
+
- **训练重点**:极大化模型在低噪声区域的预测精度
|
| 61 |
+
- **适用场景**:当需要最大化似然或生成质量时
|
| 62 |
+
|
| 63 |
+
**与 VELOCITY 的对比:**
|
| 64 |
+
- LIKELIHOOD 权重 = VELOCITY 权重 × `(1 / sigma_t)`
|
| 65 |
+
- 在相同 `drift_var` 和 `sigma_t` 下,LIKELIHOOD 权重总是 ≥ VELOCITY 权重
|
| 66 |
+
- LIKELIHOOD 对后期时间步的强调更极端
|
| 67 |
+
|
| 68 |
+
## 权重随时间的典型行为(线性路径示例)
|
| 69 |
+
|
| 70 |
+
假设线性路径(ICPlan):
|
| 71 |
+
- `sigma_t = 1 - t`(从 1 到 0)
|
| 72 |
+
- `drift_var` 通常与 `t` 相关
|
| 73 |
+
|
| 74 |
+
### 时间步 t=0.1(高噪声)
|
| 75 |
+
- `sigma_t ≈ 0.9`
|
| 76 |
+
- **NONE**: weight = 1
|
| 77 |
+
- **VELOCITY**: weight = `(drift_var / 0.9)²` ≈ 中等
|
| 78 |
+
- **LIKELIHOOD**: weight = `drift_var / 0.81` ≈ 较大
|
| 79 |
+
|
| 80 |
+
### 时间步 t=0.5(中等噪声)
|
| 81 |
+
- `sigma_t = 0.5`
|
| 82 |
+
- **NONE**: weight = 1
|
| 83 |
+
- **VELOCITY**: weight = `(drift_var / 0.5)²` = `4 × drift_var²`
|
| 84 |
+
- **LIKELIHOOD**: weight = `drift_var / 0.25` = `4 × drift_var`
|
| 85 |
+
|
| 86 |
+
### 时间步 t=0.9(低噪声)
|
| 87 |
+
- `sigma_t ≈ 0.1`
|
| 88 |
+
- **NONE**: weight = 1
|
| 89 |
+
- **VELOCITY**: weight = `(drift_var / 0.1)²` = `100 × drift_var²`(很大)
|
| 90 |
+
- **LIKELIHOOD**: weight = `drift_var / 0.01` = `100 × drift_var`(非常大)
|
| 91 |
+
|
| 92 |
+
## 实际损失计算
|
| 93 |
+
|
| 94 |
+
### 对于 NOISE 模型类型:
|
| 95 |
+
```python
|
| 96 |
+
loss = mean_flat(weight * ((model_output - x0) ** 2))
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
### 对于 SCORE 模型类型:
|
| 100 |
+
```python
|
| 101 |
+
loss = mean_flat(weight * ((model_output * sigma_t + x0) ** 2))
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
## 选择建议
|
| 105 |
+
|
| 106 |
+
1. **WeightType.NONE**
|
| 107 |
+
- 适合:简单实验、基线对比
|
| 108 |
+
- 优点:训练稳定,实现简单
|
| 109 |
+
- 缺点:可能忽略时间步重要性
|
| 110 |
+
|
| 111 |
+
2. **WeightType.VELOCITY**
|
| 112 |
+
- 适合:关注最终生成质量
|
| 113 |
+
- 优点:强调低噪声区域,生成质量通常更好
|
| 114 |
+
- 缺点:可能在高噪声区域训练不足
|
| 115 |
+
|
| 116 |
+
3. **WeightType.LIKELIHOOD**
|
| 117 |
+
- 适合:需要最大化似然、追求最高生成质量
|
| 118 |
+
- 优点:最强调低噪声区域
|
| 119 |
+
- 缺点:可能在高噪声区域训练严重不足,训练可能不稳定
|
| 120 |
+
|
| 121 |
+
## 总结
|
| 122 |
+
|
| 123 |
+
三种权重类型形成了一个从均匀到极端强调后期的梯度:
|
| 124 |
+
|
| 125 |
+
```
|
| 126 |
+
NONE (均匀) < VELOCITY (强调后期) < LIKELIHOOD (极端强调后期)
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
选择哪种权重取决于:
|
| 130 |
+
- 训练目标(生成质量 vs 训练稳定性)
|
| 131 |
+
- 数据特性
|
| 132 |
+
- 模型类型(NOISE vs SCORE)
|
| 133 |
+
- 路径类型(LINEAR vs GVP vs VP)
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000032.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000077.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000133.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000161.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000220.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000331.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000387.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000505.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000517.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000551.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000726.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000817.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000865.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000914.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/000940.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/001043.png
ADDED
|
Rectified_Noise/VP-Disp/VP_samples/depth-mu-2-threshold-0.0-0175000-base-cfg-1.0-64-SDE-100-Euler-sigma-Mean-0.04/001210.png
ADDED
|
SiT_back/SiT_clean/W_training.log
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
nohup: ignoring input
|
| 2 |
+
W1124 10:39:29.690000 58030 site-packages/torch/distributed/run.py:793]
|
| 3 |
+
W1124 10:39:29.690000 58030 site-packages/torch/distributed/run.py:793] *****************************************
|
| 4 |
+
W1124 10:39:29.690000 58030 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 5 |
+
W1124 10:39:29.690000 58030 site-packages/torch/distributed/run.py:793] *****************************************
|
| 6 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 7 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 8 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 9 |
+
[NOTICE] The application is pending for GPU resource in asynchronous queue. The longest waiting time in queue is 1800 seconds.
|
| 10 |
+
Starting rank=0, seed=0, world_size=4.
|
| 11 |
+
[[34m2025-11-24 10:39:48[0m] Experiment directory created at results/005-SiT-XL-2-Linear-velocity-None
|
| 12 |
+
[[34m2025-11-24 10:39:48[0m] Sample images will be saved to results/005-SiT-XL-2-Linear-velocity-None/pic
|
| 13 |
+
Starting rank=2, seed=2, world_size=4.
|
| 14 |
+
Starting rank=1, seed=1, world_size=4.
|
| 15 |
+
Starting rank=3, seed=3, world_size=4.
|
| 16 |
+
[[34m2025-11-24 10:40:02[0m] SiT Parameters: 675,129,632
|
| 17 |
+
[[34m2025-11-24 10:40:04[0m] Dataset contains 1,281,167 images (/gemini/platform/public/hzh/datasets/Imagenet/train/)
|
| 18 |
+
[[34m2025-11-24 10:40:04[0m] Training for 140000 epochs...
|
| 19 |
+
[[34m2025-11-24 10:40:04[0m] Beginning epoch 0...
|
| 20 |
+
[[34m2025-11-24 10:40:24[0m] Saved checkpoint to results/005-SiT-XL-2-Linear-velocity-None/checkpoints/0000010.pt
|
| 21 |
+
[[34m2025-11-24 10:40:24[0m] Generating EMA samples...
|
| 22 |
+
[[34m2025-11-24 10:40:25[0m] Saved sample images grid to results/005-SiT-XL-2-Linear-velocity-None/pic/step_0000010_samples_grid.png
|
| 23 |
+
[[34m2025-11-24 10:40:25[0m] Generating EMA samples done.
|
| 24 |
+
W1124 10:40:39.173000 58030 site-packages/torch/distributed/elastic/agent/server/api.py:704] Received 2 death signal, shutting down workers
|
| 25 |
+
W1124 10:40:39.173000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58079 closing signal SIGINT
|
| 26 |
+
W1124 10:40:39.174000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58080 closing signal SIGINT
|
| 27 |
+
W1124 10:40:39.174000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58081 closing signal SIGINT
|
| 28 |
+
W1124 10:40:39.174000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58082 closing signal SIGINT
|
| 29 |
+
[rank0]: Traceback (most recent call last):
|
| 30 |
+
[rank0]: File "/gemini/space/gzy_new/Noise_Matching/SiT_clean/train.py", line 371, in <module>
|
| 31 |
+
[rank0]: main(args)
|
| 32 |
+
[rank0]: File "/gemini/space/gzy_new/Noise_Matching/SiT_clean/train.py", line 298, in main
|
| 33 |
+
[rank0]: torch.save(checkpoint, checkpoint_path)
|
| 34 |
+
[rank0]: File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/serialization.py", line 850, in save
|
| 35 |
+
[rank0]: _save(
|
| 36 |
+
[rank0]: File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/serialization.py", line 1114, in _save
|
| 37 |
+
[rank0]: zip_file.write_record(name, storage, num_bytes)
|
| 38 |
+
[rank0]: KeyboardInterrupt
|
| 39 |
+
W1124 10:40:39.380000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58079 closing signal SIGTERM
|
| 40 |
+
W1124 10:40:39.380000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58080 closing signal SIGTERM
|
| 41 |
+
W1124 10:40:39.381000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58081 closing signal SIGTERM
|
| 42 |
+
W1124 10:40:39.381000 58030 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 58082 closing signal SIGTERM
|
| 43 |
+
Traceback (most recent call last):
|
| 44 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 696, in run
|
| 45 |
+
result = self._invoke_run(role)
|
| 46 |
+
^^^^^^^^^^^^^^^^^^^^^^
|
| 47 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 855, in _invoke_run
|
| 48 |
+
time.sleep(monitor_interval)
|
| 49 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
|
| 50 |
+
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
|
| 51 |
+
torch.distributed.elastic.multiprocessing.api.SignalException: Process 58030 got signal: 2
|
| 52 |
+
|
| 53 |
+
During handling of the above exception, another exception occurred:
|
| 54 |
+
|
| 55 |
+
Traceback (most recent call last):
|
| 56 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 705, in run
|
| 57 |
+
self._shutdown(e.sigval)
|
| 58 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 365, in _shutdown
|
| 59 |
+
self._pcontext.close(death_sig)
|
| 60 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 572, in close
|
| 61 |
+
self._close(death_sig=death_sig, timeout=timeout)
|
| 62 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 909, in _close
|
| 63 |
+
handler.proc.wait(time_to_wait)
|
| 64 |
+
File "/opt/conda/envs/SiT/lib/python3.12/subprocess.py", line 1266, in wait
|
| 65 |
+
return self._wait(timeout=timeout)
|
| 66 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 67 |
+
File "/opt/conda/envs/SiT/lib/python3.12/subprocess.py", line 2055, in _wait
|
| 68 |
+
time.sleep(delay)
|
| 69 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
|
| 70 |
+
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
|
| 71 |
+
torch.distributed.elastic.multiprocessing.api.SignalException: Process 58030 got signal: 2
|
| 72 |
+
|
| 73 |
+
During handling of the above exception, another exception occurred:
|
| 74 |
+
|
| 75 |
+
Traceback (most recent call last):
|
| 76 |
+
File "/opt/conda/envs/SiT/bin/torchrun", line 33, in <module>
|
| 77 |
+
sys.exit(load_entry_point('torch==2.5.1', 'console_scripts', 'torchrun')())
|
| 78 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 79 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 80 |
+
return f(*args, **kwargs)
|
| 81 |
+
^^^^^^^^^^^^^^^^^^
|
| 82 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/run.py", line 919, in main
|
| 83 |
+
run(args)
|
| 84 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/run.py", line 910, in run
|
| 85 |
+
elastic_launch(
|
| 86 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
|
| 87 |
+
return launch_agent(self._config, self._entrypoint, list(args))
|
| 88 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 89 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 260, in launch_agent
|
| 90 |
+
result = agent.run()
|
| 91 |
+
^^^^^^^^^^^
|
| 92 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/metrics/api.py", line 137, in wrapper
|
| 93 |
+
result = f(*args, **kwargs)
|
| 94 |
+
^^^^^^^^^^^^^^^^^^
|
| 95 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/api.py", line 710, in run
|
| 96 |
+
self._shutdown()
|
| 97 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 365, in _shutdown
|
| 98 |
+
self._pcontext.close(death_sig)
|
| 99 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 572, in close
|
| 100 |
+
self._close(death_sig=death_sig, timeout=timeout)
|
| 101 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 909, in _close
|
| 102 |
+
handler.proc.wait(time_to_wait)
|
| 103 |
+
File "/opt/conda/envs/SiT/lib/python3.12/subprocess.py", line 1266, in wait
|
| 104 |
+
return self._wait(timeout=timeout)
|
| 105 |
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 106 |
+
File "/opt/conda/envs/SiT/lib/python3.12/subprocess.py", line 2055, in _wait
|
| 107 |
+
time.sleep(delay)
|
| 108 |
+
File "/opt/conda/envs/SiT/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 84, in _terminate_process_handler
|
| 109 |
+
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
|
| 110 |
+
torch.distributed.elastic.multiprocessing.api.SignalException: Process 58030 got signal: 2
|
SiT_back/SiT_clean/__pycache__/download.cpython-312.pyc
ADDED
|
Binary file (1.99 kB). View file
|
|
|
SiT_back/SiT_clean/__pycache__/models.cpython-312.pyc
ADDED
|
Binary file (20.8 kB). View file
|
|
|
SiT_back/SiT_clean/__pycache__/train_utils.cpython-312.pyc
ADDED
|
Binary file (2.84 kB). View file
|
|
|
SiT_back/SiT_clean/download.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This source code is licensed under the license found in the
|
| 2 |
+
# LICENSE file in the root directory of this source tree.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Functions for downloading pre-trained SiT models
|
| 6 |
+
"""
|
| 7 |
+
from torchvision.datasets.utils import download_url
|
| 8 |
+
import torch
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
pretrained_models = {'SiT-XL-2-256x256.pt'}
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def find_model(model_name):
    """
    Resolve a SiT checkpoint: auto-download a known pre-trained model,
    or load a local checkpoint file produced by train.py.
    """
    if model_name in pretrained_models:
        # Known released weights -> fetch (and cache) from the web.
        return download_model(model_name)
    # Otherwise treat the argument as a local checkpoint path.
    assert os.path.isfile(model_name), f'Could not find SiT checkpoint at {model_name}'
    checkpoint = torch.load(model_name, map_location=lambda storage, loc: storage)
    # train.py checkpoints bundle several state dicts; prefer the EMA weights.
    if "ema" in checkpoint:  # supports checkpoints from train.py
        checkpoint = checkpoint["ema"]
    return checkpoint
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def download_model(model_name):
    """Fetch a released SiT checkpoint into ./pretrained_models and load it."""
    assert model_name in pretrained_models
    ckpt_path = f'pretrained_models/{model_name}'
    if not os.path.isfile(ckpt_path):
        # First use: create the cache directory and pull the weights.
        os.makedirs('pretrained_models', exist_ok=True)
        web_path = 'https://www.dl.dropboxusercontent.com/scl/fi/as9oeomcbub47de5g4be0/SiT-XL-2-256.pt?rlkey=uxzxmpicu46coq3msb17b9ofa&dl=0'
        download_url(web_path, 'pretrained_models', filename=model_name)
    # Load onto CPU regardless of where the tensors were saved.
    return torch.load(ckpt_path, map_location=lambda storage, loc: storage)
|
SiT_back/SiT_clean/models.py
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This source code is licensed under the license found in the
|
| 2 |
+
# LICENSE file in the root directory of this source tree.
|
| 3 |
+
# --------------------------------------------------------
|
| 4 |
+
# References:
|
| 5 |
+
# GLIDE: https://github.com/openai/glide-text2im
|
| 6 |
+
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
|
| 7 |
+
# --------------------------------------------------------
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn as nn
|
| 11 |
+
import numpy as np
|
| 12 |
+
import math
|
| 13 |
+
from timm.models.vision_transformer import PatchEmbed, Attention, Mlp
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def modulate(x, shift, scale):
    """Apply adaLN shift/scale conditioning to a (N, T, D) tensor.

    shift and scale are per-sample (N, D) vectors broadcast over tokens.
    """
    scale = scale.unsqueeze(1)
    shift = shift.unsqueeze(1)
    return x * (1 + scale) + shift
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#################################################################################
|
| 21 |
+
# Embedding Layers for Timesteps and Class Labels #
|
| 22 |
+
#################################################################################
|
| 23 |
+
|
| 24 |
+
class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.

    Pipeline: sinusoidal features -> 2-layer SiLU MLP -> (N, hidden_size).
    """
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices (possibly fractional), one per batch element.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, dim) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
        ).to(device=t.device)
        angles = t[:, None].float() * freqs[None]
        features = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
        if dim % 2:
            # Odd dim: pad one zero column so the output has exactly `dim` features.
            features = torch.cat([features, torch.zeros_like(features[:, :1])], dim=-1)
        return features

    def forward(self, t):
        sinusoids = self.timestep_embedding(t, self.frequency_embedding_size)
        return self.mlp(sinusoids)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class LabelEmbedder(nn.Module):
    """
    Embeds class labels into vector representations. Also handles label dropout
    for classifier-free guidance: dropped labels map to an extra "null" index.
    """
    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        # Reserve one extra row (index == num_classes) for the CFG null label
        # whenever dropout is enabled.
        use_cfg_embedding = dropout_prob > 0
        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """
        Drops labels to enable classifier-free guidance.
        """
        if force_drop_ids is None:
            # Random per-sample dropout with probability dropout_prob.
            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
        else:
            # Deterministic dropout mask supplied by the caller (1 == drop).
            drop_ids = force_drop_ids == 1
        # Dropped positions are replaced by the null-label index.
        return torch.where(drop_ids, self.num_classes, labels)

    def forward(self, labels, train, force_drop_ids=None):
        should_drop = (train and self.dropout_prob > 0) or (force_drop_ids is not None)
        if should_drop:
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
#################################################################################
|
| 95 |
+
# Core SiT Model #
|
| 96 |
+
#################################################################################
|
| 97 |
+
|
| 98 |
+
class SiTBlock(nn.Module):
    """
    A SiT block with adaptive layer norm zero (adaLN-Zero) conditioning.

    The conditioning vector c produces six per-sample modulation vectors:
    shift/scale/gate for the attention branch and for the MLP branch.
    """
    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        approx_gelu = lambda: nn.GELU(approximate="tanh")
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=int(hidden_size * mlp_ratio),
            act_layer=approx_gelu,
            drop=0,
        )
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        mods = self.adaLN_modulation(c).chunk(6, dim=1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = mods
        # Gated residual attention branch.
        x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
        # Gated residual MLP branch.
        x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
        return x
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
class FinalLayer(nn.Module):
    """
    The final layer of SiT: adaLN modulation followed by a linear projection
    to patch_size**2 * out_channels values per token.
    """
    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        modulated = modulate(self.norm_final(x), shift, scale)
        return self.linear(modulated)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
class SiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    Patchifies a latent image, conditions every block on timestep + class
    embeddings via adaLN-Zero, and unpatchifies the final projection back
    to image shape.
    """
    def __init__(
        self,
        input_size=32,
        patch_size=2,
        in_channels=4,
        hidden_size=1152,
        depth=28,
        num_heads=16,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        num_classes=1000,
        learn_sigma=True,
    ):
        super().__init__()
        self.learn_sigma = learn_sigma
        # NOTE(review): the line below unconditionally overrides the constructor
        # argument, so learn_sigma is effectively always True and the output head
        # is always sized for 2 * in_channels. Callers passing learn_sigma=False
        # (e.g. sample.py's custom-checkpoint path) get it silently ignored —
        # confirm this is intentional before removing, since trained checkpoints
        # depend on the 2*C final-layer shape.
        self.learn_sigma = True
        self.in_channels = in_channels
        # Head predicts both the main output and a second set of channels
        # (sigma), which forward() discards when learn_sigma is set.
        self.out_channels = in_channels * 2
        self.patch_size = patch_size
        self.num_heads = num_heads

        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
        num_patches = self.x_embedder.num_patches
        # Will use fixed sin-cos embedding (frozen, never trained):
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)

        self.blocks = nn.ModuleList([
            SiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
        ])
        self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
        self.initialize_weights()

    def initialize_weights(self):
        """Apply the SiT/DiT initialization scheme (xavier linears + adaLN-zero)."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Initialize (and freeze) pos_embed by sin-cos embedding:
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5))
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        nn.init.constant_(self.x_embedder.proj.bias, 0)

        # Initialize label embedding table:
        nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in SiT blocks
        # (each residual branch starts as an identity mapping):
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers (the model initially predicts zeros):
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    def unpatchify(self, x):
        """
        x: (N, T, patch_size**2 * C)
        imgs: (N, C, H, W)
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        # Square token grid assumed: T must be a perfect square.
        h = w = int(x.shape[1] ** 0.5)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        # h == w here (asserted above), so using h for both spatial dims is safe.
        imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
        return imgs

    def forward(self, x, t, y):
        """
        Forward pass of SiT.
        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
        t: (N,) tensor of diffusion timesteps
        y: (N,) tensor of class labels
        """
        x = self.x_embedder(x) + self.pos_embed  # (N, T, D), where T = H * W / patch_size ** 2
        t = self.t_embedder(t)                   # (N, D)
        y = self.y_embedder(y, self.training)    # (N, D)
        c = t + y                                # (N, D)
        for block in self.blocks:
            x = block(x, c)                      # (N, T, D)
        x = self.final_layer(x, c)                # (N, T, patch_size ** 2 * out_channels)
        x = self.unpatchify(x)                   # (N, out_channels, H, W)
        if self.learn_sigma:
            # Drop the sigma half; only the first in_channels are returned.
            x, _ = x.chunk(2, dim=1)
        return x

    def forward_with_cfg(self, x, t, y, cfg_scale):
        """
        Forward pass of SiT, but also batches the unconditional forward pass
        for classifier-free guidance.

        Expects the batch to be [conditional half; duplicated half] with the
        matching null labels in the second half of y.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        model_out = self.forward(combined, t, y)
        # For exact reproducibility reasons, we apply classifier-free guidance on only
        # three channels by default. The standard approach to cfg applies it to all channels.
        # This can be done by uncommenting the following line and commenting-out the line following that.
        # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
        eps, rest = model_out[:, :3], model_out[:, 3:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        return torch.cat([eps, rest], dim=1)
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
#################################################################################
|
| 270 |
+
# Sine/Cosine Positional Embedding Functions #
|
| 271 |
+
#################################################################################
|
| 272 |
+
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
|
| 273 |
+
|
| 274 |
+
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [extra_tokens+grid_size*grid_size, embed_dim]
    (with or without leading cls/extra token rows)
    """
    coords = np.arange(grid_size, dtype=np.float32)
    # meshgrid with w first, matching the reference MAE implementation.
    grid = np.stack(np.meshgrid(coords, coords), axis=0)
    grid = grid.reshape([2, 1, grid_size, grid_size])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token and extra_tokens > 0:
        # Prepend zero vectors for cls/register tokens.
        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
    return pos_embed
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Combine two 1-D sin-cos embeddings (rows, cols) into a 2-D embedding."""
    assert embed_dim % 2 == 0

    # Half the channels encode the first grid axis, half the second.
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    return np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    # Geometric frequency ladder, same schedule as the transformer sinusoid.
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.
    omega = 1. / 10000**omega  # (D/2,)

    angles = np.einsum('m,d->md', pos.reshape(-1), omega)  # (M, D/2), outer product

    # Sine features first, then cosine features.
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
#################################################################################
|
| 325 |
+
# SiT Configs #
|
| 326 |
+
#################################################################################
|
| 327 |
+
|
| 328 |
+
# Named SiT configurations.  Naming follows "SiT-{size}/{patch}": size fixes
# depth/hidden_size/num_heads, patch is the ViT patch size.  Remaining
# keyword arguments (input_size, num_classes, learn_sigma, ...) pass through
# to the SiT constructor.

def SiT_XL_2(**kwargs):
    return SiT(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)

def SiT_XL_4(**kwargs):
    return SiT(depth=28, hidden_size=1152, patch_size=4, num_heads=16, **kwargs)

def SiT_XL_8(**kwargs):
    return SiT(depth=28, hidden_size=1152, patch_size=8, num_heads=16, **kwargs)

def SiT_L_2(**kwargs):
    return SiT(depth=24, hidden_size=1024, patch_size=2, num_heads=16, **kwargs)

def SiT_L_4(**kwargs):
    return SiT(depth=24, hidden_size=1024, patch_size=4, num_heads=16, **kwargs)

def SiT_L_8(**kwargs):
    return SiT(depth=24, hidden_size=1024, patch_size=8, num_heads=16, **kwargs)

def SiT_B_2(**kwargs):
    return SiT(depth=12, hidden_size=768, patch_size=2, num_heads=12, **kwargs)

def SiT_B_4(**kwargs):
    return SiT(depth=12, hidden_size=768, patch_size=4, num_heads=12, **kwargs)

def SiT_B_8(**kwargs):
    return SiT(depth=12, hidden_size=768, patch_size=8, num_heads=12, **kwargs)

def SiT_S_2(**kwargs):
    return SiT(depth=12, hidden_size=384, patch_size=2, num_heads=6, **kwargs)

def SiT_S_4(**kwargs):
    return SiT(depth=12, hidden_size=384, patch_size=4, num_heads=6, **kwargs)

def SiT_S_8(**kwargs):
    return SiT(depth=12, hidden_size=384, patch_size=8, num_heads=6, **kwargs)


# Registry used by the CLI (--model flag) to look up a config by name.
SiT_models = {
    'SiT-XL/2': SiT_XL_2,  'SiT-XL/4': SiT_XL_4,  'SiT-XL/8': SiT_XL_8,
    'SiT-L/2':  SiT_L_2,   'SiT-L/4':  SiT_L_4,   'SiT-L/8':  SiT_L_8,
    'SiT-B/2':  SiT_B_2,   'SiT-B/4':  SiT_B_4,   'SiT-B/8':  SiT_B_8,
    'SiT-S/2':  SiT_S_2,   'SiT-S/4':  SiT_S_4,   'SiT-S/8':  SiT_S_8,
}
|
SiT_back/SiT_clean/run.sh
ADDED
|
File without changes
|
SiT_back/SiT_clean/sample.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This source code is licensed under the license found in the
|
| 2 |
+
# LICENSE file in the root directory of this source tree.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Sample new images from a pre-trained SiT.
|
| 6 |
+
"""
|
| 7 |
+
import torch
|
| 8 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
| 9 |
+
torch.backends.cudnn.allow_tf32 = True
|
| 10 |
+
from torchvision.utils import save_image
|
| 11 |
+
from diffusers.models import AutoencoderKL
|
| 12 |
+
from download import find_model
|
| 13 |
+
from models import SiT_models
|
| 14 |
+
from train_utils import parse_ode_args, parse_sde_args, parse_transport_args
|
| 15 |
+
from transport import create_transport, Sampler
|
| 16 |
+
import argparse
|
| 17 |
+
import sys
|
| 18 |
+
from time import time
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def main(mode, args):
    """Sample a fixed batch of class-conditional images with a pre-trained SiT.

    mode is "ODE" or "SDE" and selects the transport sampler; args carries the
    model/sampler hyperparameters parsed in __main__.  Writes "sample.png".
    """
    # Setup PyTorch:
    torch.manual_seed(args.seed)
    torch.set_grad_enabled(False)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if args.ckpt is None:
        # Auto-download path only supports the released SiT-XL/2 @ 256 model.
        assert args.model == "SiT-XL/2", "Only SiT-XL/2 models are available for auto-download."
        assert args.image_size in [256, 512]
        assert args.num_classes == 1000
        assert args.image_size == 256, "512x512 models are not yet available for auto-download." # remove this line when 512x512 models are available
        learn_sigma = args.image_size == 256
    else:
        # NOTE(review): custom checkpoints are assumed to have been trained
        # without learned sigma — confirm against the training configuration.
        learn_sigma = False

    # Load model:
    latent_size = args.image_size // 8  # SD VAE downsamples by 8x
    model = SiT_models[args.model](
        input_size=latent_size,
        num_classes=args.num_classes,
        learn_sigma=learn_sigma,
    ).to(device)
    # Auto-download a pre-trained model or load a custom SiT checkpoint from train.py:
    ckpt_path = args.ckpt or f"SiT-XL-2-{args.image_size}x{args.image_size}.pt"
    state_dict = find_model(ckpt_path)
    model.load_state_dict(state_dict)
    model.eval()  # important!
    transport = create_transport(
        args.path_type,
        args.prediction,
        args.loss_weight,
        args.train_eps,
        args.sample_eps
    )
    sampler = Sampler(transport)
    # Build the integrator for the chosen mode.
    if mode == "ODE":
        if args.likelihood:
            # Likelihood evaluation requires the unguided probability-flow ODE.
            assert args.cfg_scale == 1, "Likelihood is incompatible with guidance"
            sample_fn = sampler.sample_ode_likelihood(
                sampling_method=args.sampling_method,
                num_steps=args.num_sampling_steps,
                atol=args.atol,
                rtol=args.rtol,
            )
        else:
            sample_fn = sampler.sample_ode(
                sampling_method=args.sampling_method,
                num_steps=args.num_sampling_steps,
                atol=args.atol,
                rtol=args.rtol,
                reverse=args.reverse
            )

    elif mode == "SDE":
        sample_fn = sampler.sample_sde(
            sampling_method=args.sampling_method,
            diffusion_form=args.diffusion_form,
            diffusion_norm=args.diffusion_norm,
            last_step=args.last_step,
            last_step_size=args.last_step_size,
            num_steps=args.num_sampling_steps,
        )


    # VAE decoder to map latents back to pixel space.
    vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device)

    # Labels to condition the model with (feel free to change):
    class_labels = [207, 360, 387, 974, 88, 979, 417, 279]

    # Create sampling noise:
    n = len(class_labels)
    z = torch.randn(n, 4, latent_size, latent_size, device=device)
    y = torch.tensor(class_labels, device=device)

    # Setup classifier-free guidance: duplicate the noise and append
    # null labels (index 1000) so cond/uncond run in one batch.
    z = torch.cat([z, z], 0)
    y_null = torch.tensor([1000] * n, device=device)
    y = torch.cat([y, y_null], 0)
    model_kwargs = dict(y=y, cfg_scale=args.cfg_scale)

    # Sample images:
    start_time = time()
    samples = sample_fn(z, model.forward_with_cfg, **model_kwargs)[-1]
    samples, _ = samples.chunk(2, dim=0)  # Remove null class samples
    # 0.18215 is the SD VAE latent scaling factor.
    samples = vae.decode(samples / 0.18215).sample
    print(f"Sampling took {time() - start_time:.2f} seconds.")

    # Save and display images:
    save_image(samples, "sample.png", nrow=4, normalize=True, value_range=(-1, 1))
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # The first positional token selects the sampler family and must come
    # before any -- options, because it decides which flags get registered.
    if len(sys.argv) < 2:
        print("Usage: program.py <mode> [options]")
        sys.exit(1)

    mode = sys.argv[1]

    assert mode[:2] != "--", "Usage: program.py <mode> [options]"
    assert mode in ["ODE", "SDE"], "Invalid mode. Please choose 'ODE' or 'SDE'"

    parser.add_argument("--model", type=str, choices=list(SiT_models.keys()), default="SiT-XL/2")
    parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="mse")
    parser.add_argument("--image-size", type=int, choices=[256, 512], default=256)
    parser.add_argument("--num-classes", type=int, default=1000)
    parser.add_argument("--cfg-scale", type=float, default=4.0)
    parser.add_argument("--num-sampling-steps", type=int, default=250)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--ckpt", type=str, default=None,
                        help="Optional path to a SiT checkpoint (default: auto-download a pre-trained SiT-XL/2 model).")


    # Mode-specific flags are registered dynamically.
    parse_transport_args(parser)
    if mode == "ODE":
        parse_ode_args(parser)
        # Further processing for ODE
    elif mode == "SDE":
        parse_sde_args(parser)
        # Further processing for SDE

    # parse_known_args ignores the positional mode token already consumed above.
    args = parser.parse_known_args()[0]
    main(mode, args)
|
SiT_back/SiT_clean/sample_ddp.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This source code is licensed under the license found in the
|
| 2 |
+
# LICENSE file in the root directory of this source tree.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Samples a large number of images from a pre-trained SiT model using DDP.
|
| 6 |
+
Subsequently saves a .npz file that can be used to compute FID and other
|
| 7 |
+
evaluation metrics via the ADM repo: https://github.com/openai/guided-diffusion/tree/main/evaluations
|
| 8 |
+
|
| 9 |
+
For a simple single-GPU/CPU sampling script, see sample.py.
|
| 10 |
+
"""
|
| 11 |
+
import torch
|
| 12 |
+
import torch.distributed as dist
|
| 13 |
+
from models import SiT_models
|
| 14 |
+
from download import find_model
|
| 15 |
+
from transport import create_transport, Sampler
|
| 16 |
+
from diffusers.models import AutoencoderKL
|
| 17 |
+
from train_utils import parse_ode_args, parse_sde_args, parse_transport_args
|
| 18 |
+
from tqdm import tqdm
|
| 19 |
+
import os
|
| 20 |
+
from PIL import Image
|
| 21 |
+
import numpy as np
|
| 22 |
+
import math
|
| 23 |
+
import argparse
|
| 24 |
+
import sys
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def create_npz_from_sample_folder(sample_dir, num=50_000):
    """
    Builds a single .npz file from a folder of .png samples.

    Expects files named 000000.png ... {num-1:06d}.png inside *sample_dir*
    and writes the stacked uint8 array to "<sample_dir>.npz" under the key
    "arr_0" (the layout read by the ADM FID evaluation suite).
    """
    frames = [
        np.asarray(Image.open(f"{sample_dir}/{idx:06d}.png")).astype(np.uint8)
        for idx in tqdm(range(num), desc="Building .npz file from samples")
    ]
    stacked = np.stack(frames)
    # Sanity check: N images, each H x W with 3 RGB channels.
    assert stacked.shape == (num, stacked.shape[1], stacked.shape[2], 3)
    npz_path = f"{sample_dir}.npz"
    np.savez(npz_path, arr_0=stacked)
    print(f"Saved .npz file to {npz_path} [shape={stacked.shape}].")
    return npz_path
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def main(mode, args):
    """
    Run DDP sampling for FID: every rank generates its share of images,
    writes them as individual PNGs, and rank 0 packs them into a .npz.

    Fixes vs. previous revision:
      * the PNG-saving loop reused the outer iteration variable `i`,
        shadowing the progress counter — renamed to `j`;
      * corrected the cfg_scale assertion message.
    """
    torch.backends.cuda.matmul.allow_tf32 = args.tf32  # True: fast but may lead to some small numerical differences
    assert torch.cuda.is_available(), "Sampling with DDP requires at least one GPU. sample.py supports CPU-only usage"
    torch.set_grad_enabled(False)

    # Setup DDP: one process per GPU, per-rank deterministic seed.
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    device = rank % torch.cuda.device_count()
    seed = args.global_seed * dist.get_world_size() + rank
    torch.manual_seed(seed)
    torch.cuda.set_device(device)
    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")

    if args.ckpt is None:
        # Auto-download path only supports the published SiT-XL/2 256x256 model.
        assert args.model == "SiT-XL/2", "Only SiT-XL/2 models are available for auto-download."
        assert args.image_size in [256, 512]
        assert args.num_classes == 1000
        assert args.image_size == 256, "512x512 models are not yet available for auto-download."  # remove this line when 512x512 models are available
        learn_sigma = args.image_size == 256
    else:
        learn_sigma = False

    # Load model (latents are 8x downsampled by the SD VAE):
    latent_size = args.image_size // 8
    model = SiT_models[args.model](
        input_size=latent_size,
        num_classes=args.num_classes,
        learn_sigma=learn_sigma,
    ).to(device)
    # Auto-download a pre-trained model or load a custom SiT checkpoint from train.py:
    ckpt_path = args.ckpt or f"SiT-XL-2-{args.image_size}x{args.image_size}.pt"
    state_dict = find_model(ckpt_path)
    model.load_state_dict(state_dict)
    model.eval()  # important!

    transport = create_transport(
        args.path_type,
        args.prediction,
        args.loss_weight,
        args.train_eps,
        args.sample_eps
    )
    sampler = Sampler(transport)
    # Build the sampling function for the requested solver family.
    if mode == "ODE":
        if args.likelihood:
            assert args.cfg_scale == 1, "Likelihood is incompatible with guidance"
            sample_fn = sampler.sample_ode_likelihood(
                sampling_method=args.sampling_method,
                num_steps=args.num_sampling_steps,
                atol=args.atol,
                rtol=args.rtol,
            )
        else:
            sample_fn = sampler.sample_ode(
                sampling_method=args.sampling_method,
                num_steps=args.num_sampling_steps,
                atol=args.atol,
                rtol=args.rtol,
                reverse=args.reverse
            )
    elif mode == "SDE":
        sample_fn = sampler.sample_sde(
            sampling_method=args.sampling_method,
            diffusion_form=args.diffusion_form,
            diffusion_norm=args.diffusion_norm,
            last_step=args.last_step,
            last_step_size=args.last_step_size,
            num_steps=args.num_sampling_steps,
        )
    vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device)
    assert args.cfg_scale >= 1.0, "In almost all cases, cfg_scale should be >= 1.0"
    using_cfg = args.cfg_scale > 1.0

    # Create folder to save samples (name encodes the sampling configuration):
    model_string_name = args.model.replace("/", "-")
    ckpt_string_name = os.path.basename(args.ckpt).replace(".pt", "") if args.ckpt else "pretrained"
    if mode == "ODE":
        folder_name = f"{model_string_name}-{ckpt_string_name}-" \
                      f"cfg-{args.cfg_scale}-{args.per_proc_batch_size}-"\
                      f"{mode}-{args.num_sampling_steps}-{args.sampling_method}"
    elif mode == "SDE":
        folder_name = f"{model_string_name}-{ckpt_string_name}-" \
                      f"cfg-{args.cfg_scale}-{args.per_proc_batch_size}-"\
                      f"{mode}-{args.num_sampling_steps}-{args.sampling_method}-"\
                      f"{args.diffusion_form}-{args.last_step}-{args.last_step_size}"
    sample_folder_dir = f"{args.sample_dir}/{folder_name}"
    if rank == 0:
        os.makedirs(sample_folder_dir, exist_ok=True)
        print(f"Saving .png samples at {sample_folder_dir}")
    dist.barrier()

    # Figure out how many samples we need to generate on each GPU and how many iterations we need to run:
    n = args.per_proc_batch_size
    global_batch_size = n * dist.get_world_size()
    # To make things evenly-divisible, we'll sample a bit more than we need and then discard the extra samples:
    num_samples = len([name for name in os.listdir(sample_folder_dir) if (os.path.isfile(os.path.join(sample_folder_dir, name)) and ".png" in name)])
    total_samples = int(math.ceil(args.num_fid_samples / global_batch_size) * global_batch_size)
    if rank == 0:
        print(f"Total number of images that will be sampled: {total_samples}")
    assert total_samples % dist.get_world_size() == 0, "total_samples must be divisible by world_size"
    samples_needed_this_gpu = int(total_samples // dist.get_world_size())
    assert samples_needed_this_gpu % n == 0, "samples_needed_this_gpu must be divisible by the per-GPU batch size"
    iterations = int(samples_needed_this_gpu // n)
    # NOTE(review): done_iterations is computed from the PNGs already present
    # but is never used to skip completed work — confirm whether resumable
    # sampling was intended here.
    done_iterations = int( int(num_samples // dist.get_world_size()) // n)
    pbar = range(iterations)
    pbar = tqdm(pbar) if rank == 0 else pbar
    total = 0

    for i in pbar:
        # Sample inputs:
        z = torch.randn(n, model.in_channels, latent_size, latent_size, device=device)
        y = torch.randint(0, args.num_classes, (n,), device=device)

        # Setup classifier-free guidance: duplicate the batch with the null
        # class (id 1000) so conditional/unconditional passes share one call.
        if using_cfg:
            z = torch.cat([z, z], 0)
            y_null = torch.tensor([1000] * n, device=device)
            y = torch.cat([y, y_null], 0)
            model_kwargs = dict(y=y, cfg_scale=args.cfg_scale)
            model_fn = model.forward_with_cfg
        else:
            model_kwargs = dict(y=y)
            model_fn = model.forward

        samples = sample_fn(z, model_fn, **model_kwargs)[-1]
        if using_cfg:
            samples, _ = samples.chunk(2, dim=0)  # Remove null class samples

        samples = vae.decode(samples / 0.18215).sample
        # Map latent decodes from roughly [-1, 1] to uint8 HWC images.
        samples = torch.clamp(127.5 * samples + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()

        # Save samples to disk as individual .png files.
        # Use `j` (not `i`) so the outer iteration counter is not shadowed.
        for j, sample in enumerate(samples):
            index = j * dist.get_world_size() + rank + total
            Image.fromarray(sample).save(f"{sample_folder_dir}/{index:06d}.png")
        total += global_batch_size
        dist.barrier()

    # Make sure all processes have finished saving their samples before attempting to convert to .npz
    dist.barrier()
    if rank == 0:
        create_npz_from_sample_folder(sample_folder_dir, args.num_fid_samples)
        print("Done.")
    dist.barrier()
    dist.destroy_process_group()
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
if __name__ == "__main__":
    # The first positional CLI token selects the sampler family ("ODE" or
    # "SDE"); it is consumed by hand so the mode-specific flags can be
    # registered before the remaining argv is parsed.
    parser = argparse.ArgumentParser()

    if len(sys.argv) < 2:
        print("Usage: program.py <mode> [options]")
        sys.exit(1)

    mode = sys.argv[1]

    # Explicit checks instead of `assert`: asserts are stripped under
    # `python -O`, which would silently accept a malformed mode string.
    if mode[:2] == "--":
        print("Usage: program.py <mode> [options]")
        sys.exit(1)
    if mode not in ("ODE", "SDE"):
        print("Invalid mode. Please choose 'ODE' or 'SDE'")
        sys.exit(1)

    parser.add_argument("--model", type=str, choices=list(SiT_models.keys()), default="SiT-XL/2")
    parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="ema")
    parser.add_argument("--sample-dir", type=str, default="samples")
    parser.add_argument("--per-proc-batch-size", type=int, default=4)
    parser.add_argument("--num-fid-samples", type=int, default=50_000)
    parser.add_argument("--image-size", type=int, choices=[256, 512], default=256)
    parser.add_argument("--num-classes", type=int, default=1000)
    parser.add_argument("--cfg-scale", type=float, default=1.0)
    parser.add_argument("--num-sampling-steps", type=int, default=250)
    parser.add_argument("--global-seed", type=int, default=0)
    parser.add_argument("--tf32", action=argparse.BooleanOptionalAction, default=True,
                        help="By default, use TF32 matmuls. This massively accelerates sampling on Ampere GPUs.")
    parser.add_argument("--ckpt", type=str, default=None,
                        help="Optional path to a SiT checkpoint (default: auto-download a pre-trained SiT-XL/2 model).")

    # Register solver flags only for the selected mode.
    parse_transport_args(parser)
    if mode == "ODE":
        parse_ode_args(parser)
        # Further processing for ODE
    elif mode == "SDE":
        parse_sde_args(parser)
        # Further processing for SDE

    # parse_known_args tolerates the positional <mode> token consumed above.
    args = parser.parse_known_args()[0]
    main(mode, args)
|
SiT_back/SiT_clean/train.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# This source code is licensed under the license found in the
|
| 2 |
+
# LICENSE file in the root directory of this source tree.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
A minimal training script for SiT using PyTorch DDP.
|
| 6 |
+
"""
|
| 7 |
+
import torch
|
| 8 |
+
# the first flag below was False when we tested this script but True makes A100 training a lot faster:
|
| 9 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
| 10 |
+
torch.backends.cudnn.allow_tf32 = True
|
| 11 |
+
import torch.distributed as dist
|
| 12 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
| 13 |
+
from torch.utils.data import DataLoader
|
| 14 |
+
from torch.utils.data.distributed import DistributedSampler
|
| 15 |
+
from torchvision.datasets import ImageFolder
|
| 16 |
+
from torchvision import transforms
|
| 17 |
+
import numpy as np
|
| 18 |
+
from collections import OrderedDict
|
| 19 |
+
from PIL import Image
|
| 20 |
+
from copy import deepcopy
|
| 21 |
+
from glob import glob
|
| 22 |
+
from time import time
|
| 23 |
+
import argparse
|
| 24 |
+
import logging
|
| 25 |
+
import os
|
| 26 |
+
|
| 27 |
+
from models import SiT_models
|
| 28 |
+
from download import find_model
|
| 29 |
+
from transport import create_transport, Sampler
|
| 30 |
+
from diffusers.models import AutoencoderKL
|
| 31 |
+
from train_utils import parse_transport_args
|
| 32 |
+
|
| 33 |
+
#################################################################################
|
| 34 |
+
# Training Helper Functions #
|
| 35 |
+
#################################################################################
|
| 36 |
+
|
| 37 |
+
@torch.no_grad()
def update_ema(ema_model, model, decay=0.9999):
    """
    Step the EMA model towards the current model.

    Each EMA parameter p_ema becomes decay * p_ema + (1 - decay) * p_model,
    applied in place under torch.no_grad().
    """
    targets = dict(ema_model.named_parameters())
    # TODO: Consider applying only to params that require_grad to avoid small numerical changes of pos_embed
    for name, source in model.named_parameters():
        targets[name].mul_(decay).add_(source.data, alpha=1 - decay)
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def requires_grad(model, flag=True):
    """
    Set requires_grad flag for all parameters in a model.
    """
    for parameter in model.parameters():
        parameter.requires_grad_(flag)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def cleanup():
    """
    End DDP training.

    Tears down the default process group created by dist.init_process_group
    in main(); should run on every rank before the process exits.
    """
    dist.destroy_process_group()
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def create_logger(logging_dir):
    """
    Create a logger that writes to a log file and stdout.

    Only rank 0 gets a real logger (console + {logging_dir}/log.txt); every
    other rank receives a logger with a NullHandler so its records are
    dropped. Must be called after dist.init_process_group (uses
    dist.get_rank()); non-zero ranks may pass logging_dir=None.
    """
    if dist.get_rank() == 0:  # real logger
        # NOTE: basicConfig mutates the root logger for the whole process.
        logging.basicConfig(
            level=logging.INFO,
            # \033[34m ... \033[0m renders the timestamp in ANSI blue.
            format='[\033[34m%(asctime)s\033[0m] %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S',
            handlers=[logging.StreamHandler(), logging.FileHandler(f"{logging_dir}/log.txt")]
        )
        logger = logging.getLogger(__name__)
    else:  # dummy logger (does nothing)
        logger = logging.getLogger(__name__)
        logger.addHandler(logging.NullHandler())
    return logger
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def center_crop_arr(pil_image, image_size):
    """
    Center cropping implementation from ADM.
    https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126

    Repeatedly halves the image with a box filter while it is at least twice
    the target size, bicubic-resizes so the short side equals image_size,
    then crops the central image_size x image_size region.
    """
    # Coarse stage: halve with BOX resampling while still comfortably large.
    while min(*pil_image.size) >= 2 * image_size:
        halved = tuple(dim // 2 for dim in pil_image.size)
        pil_image = pil_image.resize(halved, resample=Image.BOX)

    # Fine stage: land the shorter side exactly on image_size.
    ratio = image_size / min(*pil_image.size)
    target = tuple(round(dim * ratio) for dim in pil_image.size)
    pil_image = pil_image.resize(target, resample=Image.BICUBIC)

    # Crop the centered square out of the (H, W, C) pixel array.
    arr = np.array(pil_image)
    top = (arr.shape[0] - image_size) // 2
    left = (arr.shape[1] - image_size) // 2
    return Image.fromarray(arr[top: top + image_size, left: left + image_size])
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
#################################################################################
|
| 105 |
+
# Training Loop #
|
| 106 |
+
#################################################################################
|
| 107 |
+
|
| 108 |
+
def main(args):
    """
    Trains a new SiT model with PyTorch DDP (one process per GPU).

    Rank 0 owns the experiment directory, logging, checkpointing, and the
    periodic EMA sample-grid saving; all ranks share the training loop.

    Fixes vs. previous revision:
      * the --ckpt resume block ran before `opt` existed, so
        `opt.load_state_dict(...)` raised NameError; it also replaced `args`
        wholesale with the checkpointed args after the experiment directories
        had already been built from the current args. Resume now happens
        after model/EMA/optimizer construction and keeps the CLI args.
      * `logging.info` (root logger) replaced by `logger.info` after sampling.
    """
    assert torch.cuda.is_available(), "Training currently requires at least one GPU."

    # Setup DDP:
    dist.init_process_group("nccl")
    assert args.global_batch_size % dist.get_world_size() == 0, f"Batch size must be divisible by world size."
    rank = dist.get_rank()
    device = rank % torch.cuda.device_count()
    seed = args.global_seed * dist.get_world_size() + rank
    torch.manual_seed(seed)
    torch.cuda.set_device(device)
    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")
    local_batch_size = int(args.global_batch_size // dist.get_world_size())

    # Setup an experiment folder:
    if rank == 0:
        os.makedirs(args.results_dir, exist_ok=True)  # Make results folder (holds all experiment subfolders)
        experiment_index = len(glob(f"{args.results_dir}/*"))
        model_string_name = args.model.replace("/", "-")  # e.g., SiT-XL/2 --> SiT-XL-2 (for naming folders)
        experiment_name = f"{experiment_index:03d}-{model_string_name}-" \
                          f"{args.path_type}-{args.prediction}-{args.loss_weight}"
        experiment_dir = f"{args.results_dir}/{experiment_name}"  # Create an experiment folder
        checkpoint_dir = f"{experiment_dir}/checkpoints"  # Stores saved model checkpoints
        os.makedirs(checkpoint_dir, exist_ok=True)

        # Create pic directory for saving sample images
        pic_dir = f"{experiment_dir}/pic"
        os.makedirs(pic_dir, exist_ok=True)

        logger = create_logger(experiment_dir)
        logger.info(f"Experiment directory created at {experiment_dir}")
        logger.info(f"Sample images will be saved to {pic_dir}")
    else:
        logger = create_logger(None)

    # Create model:
    assert args.image_size % 8 == 0, "Image size must be divisible by 8 (for the VAE encoder)."
    latent_size = args.image_size // 8
    model = SiT_models[args.model](
        input_size=latent_size,
        num_classes=args.num_classes
    )

    # Note that parameter initialization is done within the SiT constructor
    ema = deepcopy(model).to(device)  # Create an EMA of the model for use after training
    requires_grad(ema, False)

    model = DDP(model.to(device), device_ids=[device])
    transport = create_transport(
        args.path_type,
        args.prediction,
        args.loss_weight,
        args.train_eps,
        args.sample_eps
    )  # default: velocity;
    transport_sampler = Sampler(transport)
    vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device)
    logger.info(f"SiT Parameters: {sum(p.numel() for p in model.parameters()):,}")

    # Setup optimizer (we used default Adam betas=(0.9, 0.999) and a constant learning rate of 1e-4 in our paper):
    opt = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0)

    # Resume from a checkpoint written by the training loop below, if given.
    # Loaded AFTER model/EMA/optimizer construction so every state target
    # exists, and without overwriting the freshly parsed CLI args.
    resuming = args.ckpt is not None
    if resuming:
        state_dict = find_model(args.ckpt)
        model.module.load_state_dict(state_dict["model"])
        ema.load_state_dict(state_dict["ema"])
        opt.load_state_dict(state_dict["opt"])
        logger.info(f"Resumed model/EMA/optimizer state from {args.ckpt}")

    # Setup data:
    transform = transforms.Compose([
        transforms.Lambda(lambda pil_image: center_crop_arr(pil_image, args.image_size)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
    ])
    dataset = ImageFolder(args.data_path, transform=transform)
    sampler = DistributedSampler(
        dataset,
        num_replicas=dist.get_world_size(),
        rank=rank,
        shuffle=True,
        seed=args.global_seed
    )
    loader = DataLoader(
        dataset,
        batch_size=local_batch_size,
        shuffle=False,
        sampler=sampler,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True
    )
    logger.info(f"Dataset contains {len(dataset):,} images ({args.data_path})")

    # Prepare models for training:
    if not resuming:
        update_ema(ema, model.module, decay=0)  # Ensure EMA is initialized with synced weights
    model.train()  # important! This enables embedding dropout for classifier-free guidance
    ema.eval()  # EMA model should always be in eval mode

    # Variables for monitoring/logging purposes:
    train_steps = 0
    log_steps = 0
    running_loss = 0
    start_time = time()

    # Labels to condition the model with (feel free to change):
    ys = torch.randint(1000, size=(local_batch_size,), device=device)
    use_cfg = args.cfg_scale > 1.0
    # Create sampling noise:
    n = ys.size(0)
    zs = torch.randn(n, 4, latent_size, latent_size, device=device)

    # Create fixed sampling noise and conditions for consistent sampling visualization
    fixed_ys = torch.randint(1000, size=(16,), device=device)  # Fixed labels for sampling
    fixed_zs = torch.randn(16, 4, latent_size, latent_size, device=device)  # Fixed noise for sampling

    # Setup classifier-free guidance (null class id is 1000):
    if use_cfg:
        zs = torch.cat([zs, zs], 0)
        y_null = torch.tensor([1000] * n, device=device)
        ys = torch.cat([ys, y_null], 0)
        sample_model_kwargs = dict(y=ys, cfg_scale=args.cfg_scale)
        model_fn = ema.forward_with_cfg
    else:
        sample_model_kwargs = dict(y=ys)
        model_fn = ema.forward

    # Setup fixed classifier-free guidance for sampling:
    if args.cfg_scale > 1.0:
        fixed_zs = torch.cat([fixed_zs, fixed_zs], 0)
        fixed_y_null = torch.tensor([1000] * 16, device=device)
        fixed_ys = torch.cat([fixed_ys, fixed_y_null], 0)
        fixed_sample_model_kwargs = dict(y=fixed_ys, cfg_scale=args.cfg_scale)
    else:
        fixed_sample_model_kwargs = dict(y=fixed_ys)

    logger.info(f"Training for {args.epochs} epochs...")
    for epoch in range(args.epochs):
        sampler.set_epoch(epoch)
        logger.info(f"Beginning epoch {epoch}...")
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            with torch.no_grad():
                # Map input images to latent space + normalize latents:
                x = vae.encode(x).latent_dist.sample().mul_(0.18215)
            model_kwargs = dict(y=y)
            loss_dict = transport.training_losses(model, x, model_kwargs)
            loss = loss_dict["loss"].mean()
            opt.zero_grad()
            loss.backward()
            opt.step()
            update_ema(ema, model.module)

            # Log loss values:
            running_loss += loss.item()
            log_steps += 1
            train_steps += 1
            if train_steps % args.log_every == 0:
                # Measure training speed:
                torch.cuda.synchronize()
                end_time = time()
                steps_per_sec = log_steps / (end_time - start_time)
                # Reduce loss history over all processes:
                avg_loss = torch.tensor(running_loss / log_steps, device=device)
                dist.all_reduce(avg_loss, op=dist.ReduceOp.SUM)
                avg_loss = avg_loss.item() / dist.get_world_size()
                logger.info(f"(step={train_steps:07d}) Train Loss: {avg_loss:.4f}, Train Steps/Sec: {steps_per_sec:.2f}")
                # Reset monitoring variables:
                running_loss = 0
                log_steps = 0
                start_time = time()

            # Save SiT checkpoint:
            if train_steps % args.ckpt_every == 0 and train_steps > 0:
                if rank == 0:
                    checkpoint = {
                        "model": model.module.state_dict(),
                        "ema": ema.state_dict(),
                        "opt": opt.state_dict(),
                        "args": args
                    }
                    checkpoint_path = f"{checkpoint_dir}/{train_steps:07d}.pt"
                    torch.save(checkpoint, checkpoint_path)
                    logger.info(f"Saved checkpoint to {checkpoint_path}")
                dist.barrier()

            # Save sample images:
            if train_steps % args.sample_every == 0 and train_steps > 0:
                logger.info("Generating EMA samples...")
                sample_fn = transport_sampler.sample_ode()  # default to ode sampling
                samples = sample_fn(fixed_zs, model_fn, **fixed_sample_model_kwargs)[-1]
                dist.barrier()

                if args.cfg_scale > 1.0:  # remove null samples
                    samples, _ = samples.chunk(2, dim=0)
                samples = vae.decode(samples / 0.18215).sample

                # Save sample images to pic directory instead of wandb
                if rank == 0:
                    # Normalize images from [-1, 1] to [0, 1]
                    samples = (samples.clamp(-1, 1) + 1) / 2
                    # Arrange the first 16 samples in a 4x4 grid image.
                    grid_size = args.image_size
                    grid_image = Image.new('RGB', (4 * grid_size, 4 * grid_size))

                    # Place each sample in the grid
                    for i in range(min(16, samples.shape[0])):
                        # Convert CHW float tensor to HWC uint8 PIL image
                        img = samples[i].permute(1, 2, 0).cpu().detach().numpy()
                        img = (img * 255).astype(np.uint8)
                        pil_img = Image.fromarray(img)

                        # Calculate position in the grid
                        row = i // 4
                        col = i % 4
                        grid_image.paste(pil_img, (col * grid_size, row * grid_size))

                    # Save the grid image
                    img_path = f"{pic_dir}/step_{train_steps:07d}_samples_grid.png"
                    grid_image.save(img_path)
                    logger.info(f"Saved sample images grid to {img_path}")

                logger.info("Generating EMA samples done.")

    model.eval()  # important! This disables randomized embedding dropout
    # do any sampling/FID calculation/etc. with ema (or model) in eval mode ...

    logger.info("Done!")
    cleanup()
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
if __name__ == "__main__":
    # Default args here will train SiT-XL/2 with the hyperparameters we used in our paper (except training iters).
    parser = argparse.ArgumentParser()
    # NOTE(review): the default data path is a machine-specific cluster path —
    # confirm before sharing/releasing.
    parser.add_argument("--data-path", type=str, default="/gemini/platform/public/hzh/datasets/Imagenet/train/")
    parser.add_argument("--results-dir", type=str, default="results")
    parser.add_argument("--model", type=str, choices=list(SiT_models.keys()), default="SiT-XL/2")
    parser.add_argument("--image-size", type=int, choices=[256, 512], default=256)
    parser.add_argument("--num-classes", type=int, default=1000)
    # NOTE(review): named "epochs" but 140000 looks like a step count — confirm
    # the intended unit with the training loop.
    parser.add_argument("--epochs", type=int, default=140000)
    parser.add_argument("--global-batch-size", type=int, default=256)
    parser.add_argument("--global-seed", type=int, default=0)
    parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="ema")  # Choice doesn't affect training
    parser.add_argument("--num-workers", type=int, default=4)
    parser.add_argument("--log-every", type=int, default=100)
    # NOTE(review): checkpointing/sampling every 10 steps is very frequent —
    # these look like debugging values; confirm before long runs.
    parser.add_argument("--ckpt-every", type=int, default=10)
    parser.add_argument("--sample-every", type=int, default=10)
    parser.add_argument("--cfg-scale", type=float, default=4.0)
    parser.add_argument("--ckpt", type=str, default=None,
                        help="Optional path to a custom SiT checkpoint")

    # Shared transport flags (path type, prediction target, loss weighting).
    parse_transport_args(parser)
    args = parser.parse_args()
    main(args)
|
SiT_back/SiT_clean/train_utils.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def none_or_str(value):
    """argparse ``type=`` helper: map the literal string 'None' to Python None.

    Every other value (including the empty string) passes through unchanged.
    """
    return None if value == 'None' else value
|
| 5 |
+
|
| 6 |
+
def parse_transport_args(parser):
    """Register the interpolant/transport CLI flags on *parser*.

    Flags are grouped under "Transport arguments"; defaults select the
    Linear path with velocity prediction and no loss weighting.
    """
    group = parser.add_argument_group("Transport arguments")
    add = group.add_argument
    add("--path-type", type=str, default="Linear", choices=["Linear", "GVP", "VP"])
    add("--prediction", type=str, default="velocity", choices=["velocity", "score", "noise"])
    add("--loss-weight", type=none_or_str, default=None, choices=[None, "velocity", "likelihood"])
    add("--sample-eps", type=float)
    add("--train-eps", type=float)
|
| 13 |
+
|
| 14 |
+
def parse_ode_args(parser):
    """Register the ODE-solver CLI flags on *parser* under an "ODE arguments" group."""
    ode_group = parser.add_argument_group("ODE arguments")
    ode_group.add_argument(
        "--sampling-method",
        type=str,
        default="dopri5",
        help="blackbox ODE solver methods; for full list check https://github.com/rtqichen/torchdiffeq",
    )
    ode_group.add_argument("--atol", type=float, default=1e-6, help="Absolute tolerance")
    ode_group.add_argument("--rtol", type=float, default=1e-3, help="Relative tolerance")
    ode_group.add_argument("--reverse", action="store_true")
    ode_group.add_argument("--likelihood", action="store_true")
|
| 21 |
+
|
| 22 |
+
def parse_sde_args(parser):
    """Register the SDE-sampler CLI flags on *parser* under an "SDE arguments" group."""
    sde_group = parser.add_argument_group("SDE arguments")
    sde_group.add_argument("--sampling-method", type=str, default="Euler", choices=["Euler", "Heun"])
    sde_group.add_argument(
        "--diffusion-form",
        type=str,
        default="sigma",
        choices=["constant", "SBDM", "sigma", "linear", "decreasing", "increasing-decreasing"],
        help="form of diffusion coefficient in the SDE",
    )
    sde_group.add_argument("--diffusion-norm", type=float, default=1.0)
    sde_group.add_argument(
        "--last-step",
        type=none_or_str,
        default="Mean",
        choices=[None, "Mean", "Tweedie", "Euler"],
        help="form of last step taken in the SDE",
    )
    sde_group.add_argument(
        "--last-step-size",
        type=float,
        default=0.04,
        help="size of the last step taken",
    )
|
SiT_back/SiT_clean/transport/__init__.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .transport import Transport, ModelType, WeightType, PathType, Sampler
|
| 2 |
+
|
| 3 |
+
def create_transport(
    path_type='Linear',
    prediction="velocity",
    loss_weight=None,
    train_eps=None,
    sample_eps=None,
):
    """function for creating Transport object
    **Note**: model prediction defaults to velocity
    Args:
    - path_type: type of path to use; default to linear
    - prediction: model prediction target ("velocity", "score", or "noise")
    - loss_weight: optional loss weighting (None, "velocity", or "likelihood")
    - train_eps: small epsilon for avoiding instability during training
    - sample_eps: small epsilon for avoiding instability during sampling
    """

    if prediction == "noise":
        model_type = ModelType.NOISE
    elif prediction == "score":
        model_type = ModelType.SCORE
    else:
        model_type = ModelType.VELOCITY

    if loss_weight == "velocity":
        loss_type = WeightType.VELOCITY
    elif loss_weight == "likelihood":
        loss_type = WeightType.LIKELIHOOD
    else:
        loss_type = WeightType.NONE

    path_choice = {
        "Linear": PathType.LINEAR,
        "GVP": PathType.GVP,
        "VP": PathType.VP,
    }

    path_type = path_choice[path_type]

    if path_type in [PathType.VP]:
        # VP path is singular at the interval endpoints; clip with epsilons.
        train_eps = 1e-5 if train_eps is None else train_eps
        # BUG FIX: the default for sample_eps previously tested
        # `train_eps is None`, so passing an explicit train_eps while
        # omitting sample_eps left sample_eps == None.
        sample_eps = 1e-3 if sample_eps is None else sample_eps
    elif path_type in [PathType.GVP, PathType.LINEAR] and model_type != ModelType.VELOCITY:
        # Score/noise parameterizations need clipping even on GVP/Linear paths.
        train_eps = 1e-3 if train_eps is None else train_eps
        sample_eps = 1e-3 if sample_eps is None else sample_eps  # BUG FIX: same wrong-variable check as above
    else:  # velocity & [GVP, LINEAR] is stable everywhere
        train_eps = 0
        sample_eps = 0

    # create flow state
    state = Transport(
        model_type=model_type,
        path_type=path_type,
        loss_type=loss_type,
        train_eps=train_eps,
        sample_eps=sample_eps,
    )

    return state
|
SiT_back/SiT_clean/transport/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (2.19 kB). View file
|
|
|
SiT_back/SiT_clean/transport/__pycache__/integrators.cpython-312.pyc
ADDED
|
Binary file (6.21 kB). View file
|
|
|
SiT_back/SiT_clean/transport/__pycache__/path.cpython-312.pyc
ADDED
|
Binary file (11.3 kB). View file
|
|
|
SiT_back/SiT_clean/transport/__pycache__/transport.cpython-312.pyc
ADDED
|
Binary file (20.4 kB). View file
|
|
|
SiT_back/SiT_clean/transport/__pycache__/utils.cpython-312.pyc
ADDED
|
Binary file (1.87 kB). View file
|
|
|
SiT_back/SiT_clean/transport/integrators.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch as th
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from torchdiffeq import odeint
|
| 5 |
+
from functools import partial
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
|
| 8 |
+
class sde:
    """SDE solver class.

    Integrates dx = drift(x, t) dt + sqrt(2 * diffusion(x, t)) dW on a uniform
    grid over [t0, t1] with Euler-Maruyama or a stochastic Heun step.
    """

    def __init__(
        self,
        drift,
        diffusion,
        *,
        t0,
        t1,
        num_steps,
        sampler_type,
    ):
        assert t0 < t1, "SDE sampler has to be in forward time"

        self.num_timesteps = num_steps
        self.t = th.linspace(t0, t1, num_steps)
        self.dt = self.t[1] - self.t[0]
        self.drift = drift
        self.diffusion = diffusion
        self.sampler_type = sampler_type

    def __Euler_Maruyama_step(self, x, mean_x, t, model, **model_kwargs):
        """One Euler-Maruyama step; returns (noisy iterate, noise-free mean)."""
        w_cur = th.randn(x.size()).to(x)
        t = th.ones(x.size(0)).to(x) * t
        dw = w_cur * th.sqrt(self.dt)
        drift = self.drift(x, t, model, **model_kwargs)
        diffusion = self.diffusion(x, t)
        mean_x = x + drift * self.dt
        x = mean_x + th.sqrt(2 * diffusion) * dw
        return x, mean_x

    def __Heun_step(self, x, _, t, model, **model_kwargs):
        """One stochastic Heun step: inject noise first, then a trapezoidal drift update."""
        w_cur = th.randn(x.size()).to(x)
        dw = w_cur * th.sqrt(self.dt)
        t_cur = th.ones(x.size(0)).to(x) * t
        diffusion = self.diffusion(x, t_cur)
        xhat = x + th.sqrt(2 * diffusion) * dw
        K1 = self.drift(xhat, t_cur, model, **model_kwargs)
        xp = xhat + self.dt * K1
        K2 = self.drift(xp, t_cur + self.dt, model, **model_kwargs)
        return xhat + 0.5 * self.dt * (K1 + K2), xhat  # at last time point we do not perform the heun step

    def __forward_fn(self):
        """TODO: generalize here by adding all private functions ending with steps to it"""
        sampler_dict = {
            "Euler": self.__Euler_Maruyama_step,
            "Heun": self.__Heun_step,
        }

        # BUG FIX: was a bare `except:` (which would also mask unrelated errors)
        # with the misspelled message "Smapler type not implemented.".
        try:
            sampler = sampler_dict[self.sampler_type]
        except KeyError:
            raise NotImplementedError(f"Sampler type {self.sampler_type} not implemented.")

        return sampler

    def sample(self, init, model, **model_kwargs):
        """forward loop of sde; returns the list of iterates (initial point excluded)."""
        x = init
        mean_x = init
        samples = []
        sampler = self.__forward_fn()
        for ti in self.t[:-1]:
            with th.no_grad():
                x, mean_x = sampler(x, mean_x, ti, model, **model_kwargs)
                samples.append(x)

        return samples
|
| 76 |
+
|
| 77 |
+
class ode:
    """ODE solver class: thin wrapper around torchdiffeq's black-box ``odeint``."""

    def __init__(
        self,
        drift,
        *,
        t0,
        t1,
        sampler_type,
        num_steps,
        atol,
        rtol,
    ):
        self.drift = drift
        self.t = th.linspace(t0, t1, num_steps)
        self.atol = atol
        self.rtol = rtol
        self.sampler_type = sampler_type

    def sample(self, x, model, **model_kwargs):
        """Integrate the drift over self.t starting from x; returns the solver trajectory."""
        is_tuple = isinstance(x, tuple)
        device = x[0].device if is_tuple else x.device

        def rhs(t, state):
            # Broadcast the scalar solver time to a per-sample time vector.
            batch = state[0].size(0) if is_tuple else state.size(0)
            t_vec = th.ones(batch).to(device) * t
            return self.drift(state, t_vec, model, **model_kwargs)

        n_states = len(x) if is_tuple else 1
        trajectory = odeint(
            rhs,
            x,
            self.t.to(device),
            method=self.sampler_type,
            atol=[self.atol] * n_states,
            rtol=[self.rtol] * n_states,
        )
        return trajectory
|
SiT_back/SiT_clean/transport/path.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch as th
|
| 2 |
+
import numpy as np
|
| 3 |
+
from functools import partial
|
| 4 |
+
|
| 5 |
+
def expand_t_like_x(t, x):
    """Reshape a [batch] time vector so it broadcasts against x.

    Args:
        t: [batch_dim,], time vector
        x: [batch_dim, ...], data point
    """
    trailing = len(x.size()) - 1
    return t.view(t.size(0), *([1] * trailing))
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
#################### Coupling Plans ####################
|
| 17 |
+
|
| 18 |
+
class ICPlan:
    """Linear Coupling Plan: x_t = alpha_t * x1 + sigma_t * x0 with alpha_t = t, sigma_t = 1 - t."""

    def __init__(self, sigma=0.0):
        self.sigma = sigma

    def compute_alpha_t(self, t):
        """Compute the data coefficient along the path; returns (alpha_t, d_alpha_t)."""
        return t, 1

    def compute_sigma_t(self, t):
        """Compute the noise coefficient along the path; returns (sigma_t, d_sigma_t)."""
        return 1 - t, -1

    def compute_d_alpha_alpha_ratio_t(self, t):
        """Compute the ratio between d_alpha and alpha (diverges as t -> 0)."""
        return 1 / t

    def compute_drift(self, x, t):
        """We always output sde according to score parametrization; returns (drift, diffusion)."""
        t = expand_t_like_x(t, x)
        alpha_ratio = self.compute_d_alpha_alpha_ratio_t(t)
        sigma_t, d_sigma_t = self.compute_sigma_t(t)
        drift = alpha_ratio * x
        diffusion = alpha_ratio * (sigma_t ** 2) - sigma_t * d_sigma_t

        return -drift, diffusion

    def compute_diffusion(self, x, t, form="constant", norm=1.0):
        """Compute the diffusion term of the SDE
        Args:
          x: [batch_dim, ...], data point
          t: [batch_dim,], time vector
          form: str, form of the diffusion term
          norm: float, norm of the diffusion term
        """
        t = expand_t_like_x(t, x)
        # Lazy dispatch: the original dict computed EVERY entry eagerly, so e.g.
        # the SBDM drift (with its 1/t term) ran even when a different form was
        # requested.  Each form is now a thunk evaluated only on selection.
        choices = {
            "constant": lambda: norm,
            "SBDM": lambda: norm * self.compute_drift(x, t)[1],
            "sigma": lambda: norm * self.compute_sigma_t(t)[0],
            "linear": lambda: norm * (1 - t),
            "decreasing": lambda: 0.25 * (norm * th.cos(np.pi * t) + 1) ** 2,
            # BUG FIX: key was misspelled "inccreasing-decreasing", so the CLI
            # choice "increasing-decreasing" always raised NotImplementedError.
            "increasing-decreasing": lambda: norm * th.sin(np.pi * t) ** 2,
        }

        try:
            diffusion = choices[form]()
        except KeyError:
            raise NotImplementedError(f"Diffusion form {form} not implemented")

        return diffusion

    def get_score_from_velocity(self, velocity, x, t):
        """Wrapper function: transform velocity prediction model to score
        Args:
          velocity: [batch_dim, ...] shaped tensor; velocity model output
          x: [batch_dim, ...] shaped tensor; x_t data point
          t: [batch_dim,] time tensor
        """
        t = expand_t_like_x(t, x)
        alpha_t, d_alpha_t = self.compute_alpha_t(t)
        sigma_t, d_sigma_t = self.compute_sigma_t(t)
        mean = x
        reverse_alpha_ratio = alpha_t / d_alpha_t
        var = sigma_t**2 - reverse_alpha_ratio * d_sigma_t * sigma_t
        score = (reverse_alpha_ratio * velocity - mean) / var
        return score

    def get_noise_from_velocity(self, velocity, x, t):
        """Wrapper function: transform velocity prediction model to denoiser
        Args:
          velocity: [batch_dim, ...] shaped tensor; velocity model output
          x: [batch_dim, ...] shaped tensor; x_t data point
          t: [batch_dim,] time tensor
        """
        t = expand_t_like_x(t, x)
        alpha_t, d_alpha_t = self.compute_alpha_t(t)
        sigma_t, d_sigma_t = self.compute_sigma_t(t)
        mean = x
        reverse_alpha_ratio = alpha_t / d_alpha_t
        var = reverse_alpha_ratio * d_sigma_t - sigma_t
        noise = (reverse_alpha_ratio * velocity - mean) / var
        return noise

    def get_velocity_from_score(self, score, x, t):
        """Wrapper function: transform score prediction model to velocity
        Args:
          score: [batch_dim, ...] shaped tensor; score model output
          x: [batch_dim, ...] shaped tensor; x_t data point
          t: [batch_dim,] time tensor
        """
        t = expand_t_like_x(t, x)
        drift, var = self.compute_drift(x, t)
        velocity = var * score - drift
        return velocity

    def compute_mu_t(self, t, x0, x1):
        """Compute the mean of time-dependent density p_t"""
        t = expand_t_like_x(t, x1)
        alpha_t, _ = self.compute_alpha_t(t)
        sigma_t, _ = self.compute_sigma_t(t)
        return alpha_t * x1 + sigma_t * x0

    def compute_xt(self, t, x0, x1):
        """Sample xt from time-dependent density p_t; rng is required"""
        xt = self.compute_mu_t(t, x0, x1)
        return xt

    def compute_ut(self, t, x0, x1, xt):
        """Compute the vector field corresponding to p_t"""
        t = expand_t_like_x(t, x1)
        _, d_alpha_t = self.compute_alpha_t(t)
        _, d_sigma_t = self.compute_sigma_t(t)
        return d_alpha_t * x1 + d_sigma_t * x0

    def plan(self, t, x0, x1):
        """Return (t, x_t, u_t): the interpolated point and its target vector field."""
        xt = self.compute_xt(t, x0, x1)
        ut = self.compute_ut(t, x0, x1, xt)
        return t, xt, ut
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
class VPCPlan(ICPlan):
    """class for VP path flow matching"""

    def __init__(self, sigma_min=0.1, sigma_max=20.0):
        # Endpoints of the linear beta(t) noise schedule (DDPM-style VP SDE).
        self.sigma_min = sigma_min
        self.sigma_max = sigma_max
        # log alpha_t of the VP path; parameterized in (1 - t), i.e. t=1 is the
        # data end of the path and t=0 the noise end.
        self.log_mean_coeff = lambda t: -0.25 * ((1 - t) ** 2) * (self.sigma_max - self.sigma_min) - 0.5 * (1 - t) * self.sigma_min
        # d/dt of log_mean_coeff above.
        self.d_log_mean_coeff = lambda t: 0.5 * (1 - t) * (self.sigma_max - self.sigma_min) + 0.5 * self.sigma_min


    def compute_alpha_t(self, t):
        """Compute coefficient of x1"""
        alpha_t = self.log_mean_coeff(t)
        alpha_t = th.exp(alpha_t)
        # Chain rule: d/dt exp(f(t)) = exp(f(t)) * f'(t).
        d_alpha_t = alpha_t * self.d_log_mean_coeff(t)
        return alpha_t, d_alpha_t

    def compute_sigma_t(self, t):
        """Compute coefficient of x0"""
        p_sigma_t = 2 * self.log_mean_coeff(t)
        # sigma_t = sqrt(1 - alpha_t^2): variance-preserving coupling.
        sigma_t = th.sqrt(1 - th.exp(p_sigma_t))
        # Derivative of the line above; note it diverges as sigma_t -> 0 (t -> 1).
        d_sigma_t = th.exp(p_sigma_t) * (2 * self.d_log_mean_coeff(t)) / (-2 * sigma_t)
        return sigma_t, d_sigma_t

    def compute_d_alpha_alpha_ratio_t(self, t):
        """Special purposed function for computing numerical stabled d_alpha_t / alpha_t"""
        # alpha_t = exp(log_mean_coeff), so d_alpha/alpha is just the log-derivative.
        return self.d_log_mean_coeff(t)

    def compute_drift(self, x, t):
        """Compute the drift term of the SDE"""
        t = expand_t_like_x(t, x)
        # beta(t) evaluated at reversed time (1 - t), matching the coeff lambdas.
        beta_t = self.sigma_min + (1 - t) * (self.sigma_max - self.sigma_min)
        return -0.5 * beta_t * x, beta_t / 2
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
class GVPCPlan(ICPlan):
    """Generalized VP coupling: alpha_t = sin(pi*t/2), sigma_t = cos(pi*t/2)."""

    def __init__(self, sigma=0.0):
        super().__init__(sigma)

    def compute_alpha_t(self, t):
        """Compute coefficient of x1 and its time derivative."""
        phase = t * np.pi / 2
        return th.sin(phase), np.pi / 2 * th.cos(phase)

    def compute_sigma_t(self, t):
        """Compute coefficient of x0 and its time derivative."""
        phase = t * np.pi / 2
        return th.cos(phase), -np.pi / 2 * th.sin(phase)

    def compute_d_alpha_alpha_ratio_t(self, t):
        """Numerically stable d_alpha_t / alpha_t = (pi/2) * cot(pi*t/2)."""
        return np.pi / (2 * th.tan(t * np.pi / 2))
|
SiT_back/SiT_clean/transport/transport.py
ADDED
|
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch as th
|
| 2 |
+
import numpy as np
|
| 3 |
+
import logging
|
| 4 |
+
|
| 5 |
+
import enum
|
| 6 |
+
|
| 7 |
+
from . import path
|
| 8 |
+
from .utils import EasyDict, log_state, mean_flat
|
| 9 |
+
from .integrators import ode, sde
|
| 10 |
+
|
| 11 |
+
class ModelType(enum.Enum):
    """
    Which type of output the model predicts.
    """

    NOISE = enum.auto()  # the model predicts epsilon
    SCORE = enum.auto()  # the model predicts \nabla \log p(x)
    VELOCITY = enum.auto()  # the model predicts v(x)
|
| 19 |
+
|
| 20 |
+
class PathType(enum.Enum):
    """
    Which type of path to use.
    """

    LINEAR = enum.auto()  # linear interpolant (ICPlan)
    GVP = enum.auto()  # generalized VP, trigonometric coefficients (GVPCPlan)
    VP = enum.auto()  # variance-preserving diffusion path (VPCPlan)
|
| 28 |
+
|
| 29 |
+
class WeightType(enum.Enum):
    """
    Which type of weighting to use.
    """

    NONE = enum.auto()  # unweighted loss
    VELOCITY = enum.auto()  # weight by (drift_var / sigma_t)^2
    LIKELIHOOD = enum.auto()  # weight by drift_var / sigma_t^2
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class Transport:
    """Stochastic-interpolant transport.

    Couples noise x0 with data x1 along a chosen path (Linear / GVP / VP) and
    exposes training losses plus probability-flow drift and score functions
    for a model predicting noise, score, or velocity.
    """

    def __init__(
        self,
        *,
        model_type,
        path_type,
        loss_type,
        train_eps,
        sample_eps,
    ):
        # Map each PathType enum member to its coupling-plan implementation.
        path_options = {
            PathType.LINEAR: path.ICPlan,
            PathType.GVP: path.GVPCPlan,
            PathType.VP: path.VPCPlan,
        }

        self.loss_type = loss_type
        self.model_type = model_type
        self.path_sampler = path_options[path_type]()
        self.train_eps = train_eps
        self.sample_eps = sample_eps

    def prior_logp(self, z):
        '''
        Standard multivariate normal prior
        Assume z is batched
        '''
        shape = th.tensor(z.size())
        N = th.prod(shape[1:])
        _fn = lambda x: -N / 2. * np.log(2 * np.pi) - th.sum(x ** 2) / 2.
        return th.vmap(_fn)(z)

    def check_interval(
        self,
        train_eps,
        sample_eps,
        *,
        diffusion_form="SBDM",
        sde=False,
        reverse=False,
        eval=False,  # NOTE: shadows the builtin; kept for caller compatibility
        last_step_size=0.0,
    ):
        """Return the integration interval (t0, t1), clipped away from the
        endpoints where the chosen path/parameterization is numerically unstable.
        """
        t0 = 0
        t1 = 1
        eps = train_eps if not eval else sample_eps
        if (type(self.path_sampler) in [path.VPCPlan]):
            # VP is singular at t=1 (sigma_t -> 0); back off by eps, or by the
            # explicit last-step size when an SDE last step will be taken.
            t1 = 1 - eps if (not sde or last_step_size == 0) else 1 - last_step_size

        elif (type(self.path_sampler) in [path.ICPlan, path.GVPCPlan]) \
            and (self.model_type != ModelType.VELOCITY or sde):  # avoid numerical issue by taking a first semi-implicit step

            t0 = eps if (diffusion_form == "SBDM" and sde) or self.model_type != ModelType.VELOCITY else 0
            t1 = 1 - eps if (not sde or last_step_size == 0) else 1 - last_step_size

        if reverse:
            t0, t1 = 1 - t0, 1 - t1

        return t0, t1

    def sample(self, x1):
        """Sampling x0 & t based on shape of x1 (if needed)
        Args:
          x1 - data point; [batch, *dim]
        """

        x0 = th.randn_like(x1)
        t0, t1 = self.check_interval(self.train_eps, self.sample_eps)
        t = th.rand((x1.shape[0],)) * (t1 - t0) + t0
        t = t.to(x1)
        return t, x0, x1

    def training_losses(
        self,
        model,
        x1,
        model_kwargs=None
    ):
        """Loss for training the score model
        Args:
        - model: backbone model; could be score, noise, or velocity
        - x1: datapoint
        - model_kwargs: additional arguments for the model
        """
        if model_kwargs is None:  # FIX: identity check (was `== None`)
            model_kwargs = {}

        t, x0, x1 = self.sample(x1)
        t, xt, ut = self.path_sampler.plan(t, x0, x1)
        model_output = model(xt, t, **model_kwargs)
        # Model must preserve the spatial dims and channel count of xt.
        B, *_, C = xt.shape
        assert model_output.size() == (B, *xt.size()[1:-1], C)

        terms = {}
        terms['pred'] = model_output
        if self.model_type == ModelType.VELOCITY:
            # Flow-matching regression onto the target vector field u_t.
            terms['loss'] = mean_flat(((model_output - ut) ** 2))
        else:
            _, drift_var = self.path_sampler.compute_drift(xt, t)
            sigma_t, _ = self.path_sampler.compute_sigma_t(path.expand_t_like_x(t, xt))
            if self.loss_type in [WeightType.VELOCITY]:
                weight = (drift_var / sigma_t) ** 2
            elif self.loss_type in [WeightType.LIKELIHOOD]:
                weight = drift_var / (sigma_t ** 2)
            elif self.loss_type in [WeightType.NONE]:
                weight = 1
            else:
                raise NotImplementedError()

            if self.model_type == ModelType.NOISE:
                terms['loss'] = mean_flat(weight * ((model_output - x0) ** 2))
            else:
                # Score model: sigma_t * score + x0 vanishes at the optimum.
                terms['loss'] = mean_flat(weight * ((model_output * sigma_t + x0) ** 2))

        return terms

    def get_drift(
        self
    ):
        """member function for obtaining the drift of the probability flow ODE"""
        def score_ode(x, t, model, **model_kwargs):
            drift_mean, drift_var = self.path_sampler.compute_drift(x, t)
            model_output = model(x, t, **model_kwargs)
            return (-drift_mean + drift_var * model_output)  # by change of variable

        def noise_ode(x, t, model, **model_kwargs):
            drift_mean, drift_var = self.path_sampler.compute_drift(x, t)
            sigma_t, _ = self.path_sampler.compute_sigma_t(path.expand_t_like_x(t, x))
            model_output = model(x, t, **model_kwargs)
            # score = -eps / sigma_t
            score = model_output / -sigma_t
            return (-drift_mean + drift_var * score)

        def velocity_ode(x, t, model, **model_kwargs):
            model_output = model(x, t, **model_kwargs)
            return model_output

        if self.model_type == ModelType.NOISE:
            drift_fn = noise_ode
        elif self.model_type == ModelType.SCORE:
            drift_fn = score_ode
        else:
            drift_fn = velocity_ode

        def body_fn(x, t, model, **model_kwargs):
            model_output = drift_fn(x, t, model, **model_kwargs)
            assert model_output.shape == x.shape, "Output shape from ODE solver must match input shape"
            return model_output

        return body_fn

    def get_score(
        self,
    ):
        """member function for obtaining score of
            x_t = alpha_t * x + sigma_t * eps"""
        if self.model_type == ModelType.NOISE:
            score_fn = lambda x, t, model, **kwargs: model(x, t, **kwargs) / -self.path_sampler.compute_sigma_t(path.expand_t_like_x(t, x))[0]
        elif self.model_type == ModelType.SCORE:
            # FIX: parameter was misspelled **kwagrs (worked only by accident).
            score_fn = lambda x, t, model, **kwargs: model(x, t, **kwargs)
        elif self.model_type == ModelType.VELOCITY:
            score_fn = lambda x, t, model, **kwargs: self.path_sampler.get_score_from_velocity(model(x, t, **kwargs), x, t)
        else:
            raise NotImplementedError()

        return score_fn
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
class Sampler:
|
| 214 |
+
"""Sampler class for the transport model"""
|
| 215 |
+
    def __init__(
        self,
        transport,
    ):
        """Constructor for a general sampler; supporting different sampling methods
        Args:
        - transport: a Transport object specifying model prediction & interpolant type
        """

        self.transport = transport
        # Probability-flow drift and score functions derived from the transport's
        # model parameterization (noise / score / velocity).
        self.drift = self.transport.get_drift()
        self.score = self.transport.get_score()
|
| 227 |
+
|
| 228 |
+
    def __get_sde_diffusion_and_drift(
        self,
        *,
        diffusion_form="SBDM",
        diffusion_norm=1.0,
    ):
        """Build (drift, diffusion) callables for the reverse-time sampling SDE."""

        # Diffusion coefficient w(t), delegated to the path sampler.
        def diffusion_fn(x, t):
            diffusion = self.transport.path_sampler.compute_diffusion(x, t, form=diffusion_form, norm=diffusion_norm)
            return diffusion

        # SDE drift = probability-flow drift + w(t) * score.
        sde_drift = \
            lambda x, t, model, **kwargs: \
                self.drift(x, t, model, **kwargs) + diffusion_fn(x, t) * self.score(x, t, model, **kwargs)

        sde_diffusion = diffusion_fn

        return sde_drift, sde_diffusion
|
| 246 |
+
|
| 247 |
+
    def __get_last_step(
        self,
        sde_drift,
        *,
        last_step,
        last_step_size,
    ):
        """Get the last step function of the SDE solver"""

        if last_step is None:
            # No correction: return the final SDE iterate unchanged.
            last_step_fn = \
                lambda x, t, model, **model_kwargs: \
                    x
        elif last_step == "Mean":
            # Deterministic mean step: follow the SDE drift without injecting noise.
            last_step_fn = \
                lambda x, t, model, **model_kwargs: \
                    x + sde_drift(x, t, model, **model_kwargs) * last_step_size
        elif last_step == "Tweedie":
            alpha = self.transport.path_sampler.compute_alpha_t # simple aliasing; the original name was too long
            sigma = self.transport.path_sampler.compute_sigma_t
            # Tweedie denoising estimate: x/alpha_t + (sigma_t^2 / alpha_t) * score.
            # NOTE(review): alpha(t)[0][0] / sigma(t)[0][0] index into the
            # (value, derivative) tuples — presumably assumes a scalar t; confirm.
            last_step_fn = \
                lambda x, t, model, **model_kwargs: \
                    x / alpha(t)[0][0] + (sigma(t)[0][0] ** 2) / alpha(t)[0][0] * self.score(x, t, model, **model_kwargs)
        elif last_step == "Euler":
            # Deterministic Euler step along the probability-flow ODE drift.
            last_step_fn = \
                lambda x, t, model, **model_kwargs: \
                    x + self.drift(x, t, model, **model_kwargs) * last_step_size
        else:
            raise NotImplementedError()

        return last_step_fn
|
| 278 |
+
|
| 279 |
+
def sample_sde(
|
| 280 |
+
self,
|
| 281 |
+
*,
|
| 282 |
+
sampling_method="Euler",
|
| 283 |
+
diffusion_form="SBDM",
|
| 284 |
+
diffusion_norm=1.0,
|
| 285 |
+
last_step="Mean",
|
| 286 |
+
last_step_size=0.04,
|
| 287 |
+
num_steps=250,
|
| 288 |
+
):
|
| 289 |
+
"""returns a sampling function with given SDE settings
|
| 290 |
+
Args:
|
| 291 |
+
- sampling_method: type of sampler used in solving the SDE; default to be Euler-Maruyama
|
| 292 |
+
- diffusion_form: function form of diffusion coefficient; default to be matching SBDM
|
| 293 |
+
- diffusion_norm: function magnitude of diffusion coefficient; default to 1
|
| 294 |
+
- last_step: type of the last step; default to identity
|
| 295 |
+
- last_step_size: size of the last step; default to match the stride of 250 steps over [0,1]
|
| 296 |
+
- num_steps: total integration step of SDE
|
| 297 |
+
"""
|
| 298 |
+
|
| 299 |
+
if last_step is None:
|
| 300 |
+
last_step_size = 0.0
|
| 301 |
+
|
| 302 |
+
sde_drift, sde_diffusion = self.__get_sde_diffusion_and_drift(
|
| 303 |
+
diffusion_form=diffusion_form,
|
| 304 |
+
diffusion_norm=diffusion_norm,
|
| 305 |
+
)
|
| 306 |
+
|
| 307 |
+
t0, t1 = self.transport.check_interval(
|
| 308 |
+
self.transport.train_eps,
|
| 309 |
+
self.transport.sample_eps,
|
| 310 |
+
diffusion_form=diffusion_form,
|
| 311 |
+
sde=True,
|
| 312 |
+
eval=True,
|
| 313 |
+
reverse=False,
|
| 314 |
+
last_step_size=last_step_size,
|
| 315 |
+
)
|
| 316 |
+
|
| 317 |
+
_sde = sde(
|
| 318 |
+
sde_drift,
|
| 319 |
+
sde_diffusion,
|
| 320 |
+
t0=t0,
|
| 321 |
+
t1=t1,
|
| 322 |
+
num_steps=num_steps,
|
| 323 |
+
sampler_type=sampling_method
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
last_step_fn = self.__get_last_step(sde_drift, last_step=last_step, last_step_size=last_step_size)
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def _sample(init, model, **model_kwargs):
|
| 330 |
+
xs = _sde.sample(init, model, **model_kwargs)
|
| 331 |
+
ts = th.ones(init.size(0), device=init.device) * t1
|
| 332 |
+
x = last_step_fn(xs[-1], ts, model, **model_kwargs)
|
| 333 |
+
xs.append(x)
|
| 334 |
+
|
| 335 |
+
assert len(xs) == num_steps, "Samples does not match the number of steps"
|
| 336 |
+
|
| 337 |
+
return xs
|
| 338 |
+
|
| 339 |
+
return _sample
|
| 340 |
+
|
| 341 |
+
def sample_ode(
|
| 342 |
+
self,
|
| 343 |
+
*,
|
| 344 |
+
sampling_method="dopri5",
|
| 345 |
+
num_steps=50,
|
| 346 |
+
atol=1e-6,
|
| 347 |
+
rtol=1e-3,
|
| 348 |
+
reverse=False,
|
| 349 |
+
):
|
| 350 |
+
"""returns a sampling function with given ODE settings
|
| 351 |
+
Args:
|
| 352 |
+
- sampling_method: type of sampler used in solving the ODE; default to be Dopri5
|
| 353 |
+
- num_steps:
|
| 354 |
+
- fixed solver (Euler, Heun): the actual number of integration steps performed
|
| 355 |
+
- adaptive solver (Dopri5): the number of datapoints saved during integration; produced by interpolation
|
| 356 |
+
- atol: absolute error tolerance for the solver
|
| 357 |
+
- rtol: relative error tolerance for the solver
|
| 358 |
+
- reverse: whether solving the ODE in reverse (data to noise); default to False
|
| 359 |
+
"""
|
| 360 |
+
drift = self.drift
|
| 361 |
+
|
| 362 |
+
t0, t1 = self.transport.check_interval(
|
| 363 |
+
self.transport.train_eps,
|
| 364 |
+
self.transport.sample_eps,
|
| 365 |
+
sde=False,
|
| 366 |
+
eval=True,
|
| 367 |
+
reverse=reverse,
|
| 368 |
+
last_step_size=0.0,
|
| 369 |
+
)
|
| 370 |
+
|
| 371 |
+
_ode = ode(
|
| 372 |
+
drift=drift,
|
| 373 |
+
t0=t0,
|
| 374 |
+
t1=t1,
|
| 375 |
+
sampler_type=sampling_method,
|
| 376 |
+
num_steps=num_steps,
|
| 377 |
+
atol=atol,
|
| 378 |
+
rtol=rtol,
|
| 379 |
+
)
|
| 380 |
+
|
| 381 |
+
return _ode.sample
|
| 382 |
+
|
| 383 |
+
    def sample_ode_likelihood(
        self,
        *,
        sampling_method="dopri5",
        num_steps=50,
        atol=1e-6,
        rtol=1e-3,
    ):

        """returns a sampling function for calculating likelihood with given ODE settings
        Args:
        - sampling_method: type of sampler used in solving the ODE; default to be Dopri5
        - num_steps:
            - fixed solver (Euler, Heun): the actual number of integration steps performed
            - adaptive solver (Dopri5): the number of datapoints saved during integration; produced by interpolation
        - atol: absolute error tolerance for the solver
        - rtol: relative error tolerance for the solver
        """
        def _likelihood_drift(x, t, model, **model_kwargs):
            # Augmented drift: evolves the state together with the accumulated
            # log-density change, estimating the divergence of the drift with the
            # Hutchinson trace estimator.
            x, _ = x
            # Rademacher probe: entries in {0,1} mapped to {-1,+1}.
            eps = th.randint(2, x.size(), dtype=th.float, device=x.device) * 2 - 1
            # Flip time so the integration runs in the reverse direction.
            t = th.ones_like(t) * (1 - t)
            with th.enable_grad():
                x.requires_grad = True
                # Vector-Jacobian product eps^T (d drift / dx) via autograd.
                grad = th.autograd.grad(th.sum(self.drift(x, t, model, **model_kwargs) * eps), x)[0]
                # Hutchinson estimate of the Jacobian trace: eps^T J eps,
                # summed over all non-batch dimensions.
                logp_grad = th.sum(grad * eps, dim=tuple(range(1, len(x.size()))))
                drift = self.drift(x, t, model, **model_kwargs)
            return (-drift, logp_grad)

        t0, t1 = self.transport.check_interval(
            self.transport.train_eps,
            self.transport.sample_eps,
            sde=False,
            eval=True,
            reverse=False,
            last_step_size=0.0,
        )

        _ode = ode(
            drift=_likelihood_drift,
            t0=t0,
            t1=t1,
            sampler_type=sampling_method,
            num_steps=num_steps,
            atol=atol,
            rtol=rtol,
        )

        def _sample_fn(x, model, **model_kwargs):
            # Start with zero accumulated log-density change per batch element.
            init_logp = th.zeros(x.size(0)).to(x)
            input = (x, init_logp)
            drift, delta_logp = _ode.sample(input, model, **model_kwargs)
            # Keep only the final state/divergence of each trajectory.
            drift, delta_logp = drift[-1], delta_logp[-1]
            prior_logp = self.transport.prior_logp(drift)
            # Change of variables: log p(x) = log prior - accumulated divergence.
            logp = prior_logp - delta_logp
            return logp, drift

        return _sample_fn
SiT_back/SiT_clean/transport/utils.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch as th
|
| 2 |
+
|
| 3 |
+
class EasyDict:
    """A minimal namespace wrapper: dict entries become attributes,
    while still supporting `obj[key]` lookup."""

    def __init__(self, sub_dict):
        # Copy every entry onto the instance as an attribute.
        self.__dict__.update(sub_dict)

    def __getitem__(self, key):
        return getattr(self, key)
|
| 12 |
+
def mean_flat(x):
    """
    Take the mean over all non-batch dimensions.
    """
    non_batch_dims = tuple(range(1, x.dim()))
    return x.mean(dim=non_batch_dims)
+
|
| 18 |
+
def log_state(state):
    """Render a state mapping as newline-joined 'key: value' lines, sorted by key.

    Values whose str() looks like a default object repr are shown as just
    their class name in brackets, to keep the log readable."""
    lines = []
    for key in sorted(state):
        value = state[key]
        rendered = str(value)
        # Default reprs look like '<pkg.Cls object at 0x...>'.
        if "<object" in rendered or "object at" in rendered:
            lines.append(f"{key}: [{value.__class__.__name__}]")
        else:
            lines.append(f"{key}: {value}")

    return '\n'.join(lines)
|
SiT_back/SiT_clean/wandb_utils.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import wandb
|
| 2 |
+
import torch
|
| 3 |
+
from torchvision.utils import make_grid
|
| 4 |
+
import torch.distributed as dist
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import os
|
| 7 |
+
import argparse
|
| 8 |
+
import hashlib
|
| 9 |
+
import math
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def is_main_process():
    """Return True only on the rank-0 process of the distributed group.

    NOTE(review): assumes torch.distributed is already initialized — confirm
    callers only use this after init_process_group."""
    rank = dist.get_rank()
    return rank == 0
| 14 |
+
|
| 15 |
+
def namespace_to_dict(namespace):
    """Recursively convert an argparse.Namespace (including nested ones)
    into a plain dict."""
    result = {}
    for key, value in vars(namespace).items():
        if isinstance(value, argparse.Namespace):
            value = namespace_to_dict(value)
        result[key] = value
    return result
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def generate_run_id(exp_name):
    """Derive a stable (at most 8-digit) numeric run id from the experiment name.

    https://stackoverflow.com/questions/16008670/how-to-hash-a-string-into-8-digits
    """
    digest = hashlib.sha256(exp_name.encode('utf-8')).hexdigest()
    return str(int(digest, 16) % 10 ** 8)
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def initialize(args, entity, exp_name, project_name):
    """Log in to Weights & Biases and start (or resume) a run for this experiment.

    Requires the WANDB_KEY environment variable; the run id is derived
    deterministically from exp_name so reruns resume the same run."""
    wandb.login(key=os.environ["WANDB_KEY"])
    run_settings = dict(
        entity=entity,
        project=project_name,
        name=exp_name,
        config=namespace_to_dict(args),
        id=generate_run_id(exp_name),
        resume="allow",
    )
    wandb.init(**run_settings)
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def log(stats, step=None):
    """Forward a stats mapping to wandb, but only from the rank-0 process."""
    if not is_main_process():
        return
    wandb.log(dict(stats), step=step)
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def log_image(sample, step=None):
    """Log a batch of sample images to wandb as a single image grid (rank-0 only)."""
    if is_main_process():
        grid = array2grid(sample)
        wandb.log({f"samples": wandb.Image(grid), "train_step": step})
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def array2grid(x):
    """Tile a batch of images (values in [-1, 1]) into one HxWxC uint8 numpy array."""
    # Roughly square layout: sqrt(batch) images per row.
    per_row = round(math.sqrt(x.size(0)))
    grid = make_grid(x, nrow=per_row, normalize=True, value_range=(-1, 1))
    # Scale [0, 1] -> [0, 255] with rounding, then move channels last for numpy.
    grid = grid.mul(255).add_(0.5).clamp_(0, 255)
    return grid.permute(1, 2, 0).to('cpu', torch.uint8).numpy()