WayneW committed on Jan 24

Commit

705a8fd

verified ·

1 Parent(s): f72be28

Upload folder using huggingface_hub

Browse files

Files changed (25) hide show

config/data_config.yaml +10 -0
config/eval_config.yaml +12 -0
config/train_config_stage1.yaml +22 -0
config/train_config_stage2.yaml +25 -0
config/train_config_stage3.yaml +25 -0
datasets.py +307 -0
diffusion/__init__.py +53 -0
diffusion/diffusion_utils.py +83 -0
diffusion/gaussian_diffusion.py +870 -0
diffusion/gaussian_diffusion_dual.py +975 -0
diffusion/respace.py +125 -0
diffusion/respace_dual.py +135 -0
diffusion/timestep_sampler.py +145 -0
distributed.py +277 -0
eval_audio.py +210 -0
eval_metrics.py +1033 -0
inference_avwm.py +498 -0
mel_scale.py +221 -0
merge_experts.py +128 -0
misc.py +232 -0
models.py +482 -0
soundstream.py +178 -0
train_avwm_stage1.py +463 -0
train_avwm_stage2.py +514 -0
train_avwm_stage3.py +532 -0

config/data_config.yaml ADDED Viewed

	@@ -0,0 +1,10 @@

+action_stats:
+  min: [-2.5, -4] # [min_dx, min_dy]
+  max: [5, 4] # [max_dx, max_dy]
+distance_diff_stats:
+  min: [-20] # [min]
+  max: [20] # [max]
+avw_4k:
+  metric_waypoint_spacing: 0.15

config/eval_config.yaml ADDED Viewed

	@@ -0,0 +1,12 @@

+eval_distance:
+  eval_min_dist_cat: -16
+  eval_max_dist_cat: 16
+eval_len_traj_pred: 16
+eval_context_size: 4
+traj_stride: 8
+eval_datasets:
+  avw_4k:
+    data_folder: /path/to/dataset/avw_4k
+    test: /path/to/data_splits/avw_4k/test
+    goals_per_obs: 4

config/train_config_stage1.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+batch_size: 16
+context_size: 4
+datasets:
+  avw_4k:
+    data_folder: /path/to/dataset/avw_4k
+    goals_per_obs: 4
+    test: /path/to/data_splits/avw_4k/val
+    train: /path/to/data_splits/avw_4k/train
+distance:
+  max_dist_cat: 16
+  min_dist_cat: -16
+from_checkpoint: /path/to/pretrained/cdit_b_100000.pth.tar
+grad_clip_val: 10.0
+image_size: 224
+len_traj_pred: 16
+lr: 16.0e-05
+model: AVCDiT-B/2
+normalize: true
+num_workers: 1
+results_dir: logs
+run_name: training_stage1
+train: true

config/train_config_stage2.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+batch_size: 24
+context_size: 4
+datasets:
+  avw_4k:
+    data_folder: /path/to/dataset/avw_4k
+    goals_per_obs: 4
+    test: /path/to/data_splits/avw_4k/val
+    train: /path/to/data_splits/avw_4k/train
+distance:
+  max_dist_cat: 16
+  min_dist_cat: -16
+from_checkpoint: logs/training_stage1/checkpoints/latest.pth.tar
+sample_rate: 16000
+input_sr: 48000
+tokenizer_a_path: /path/to/pretrained/soundstream.pt
+grad_clip_val: 10.0
+image_size: 224
+len_traj_pred: 16
+lr: 8.0e-4
+model: AVCDiT-B/2
+normalize: true
+num_workers: 12
+results_dir: logs
+run_name: training_stage2
+train: true

config/train_config_stage3.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+batch_size: 4
+context_size: 4
+datasets:
+  avw_4k:
+    data_folder: /path/to/dataset/avw_4k
+    goals_per_obs: 4
+    test: /path/to/data_splits/avw_4k/val
+    train: /path/to/data_splits/avw_4k/train
+distance:
+  max_dist_cat: 16
+  min_dist_cat: -16
+from_checkpoint: /path/to/pretrained/experts_merged.pth
+sample_rate: 16000
+input_sr: 48000
+tokenizer_a_path: /path/to/pretrained/soundstream.pt
+grad_clip_val: 10.0
+image_size: 224
+len_traj_pred: 16
+lr: 16.0e-05
+model: AVCDiT-B/2
+normalize: true
+num_workers: 12
+results_dir: logs
+run_name: training_stage3
+train: true

datasets.py ADDED Viewed

	@@ -0,0 +1,307 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# NoMaD, GNM, ViNT: https://github.com/robodhruv/visualnav-transformer
+# --------------------------------------------------------
+import numpy as np
+import torch
+import os
+from PIL import Image
+from typing import Tuple
+import yaml
+import pickle
+import tqdm
+from torch.utils.data import Dataset
+from misc import angle_difference, get_data_path, get_delta_np, normalize_data, to_local_coords
+import torchaudio
+class BaseDataset(Dataset):
+    def __init__(
+        self,
+        data_folder: str,
+        data_split_folder: str,
+        dataset_name: str,
+        image_size: Tuple[int, int],
+        min_dist_cat: int,
+        max_dist_cat: int,
+        len_traj_pred: int,
+        traj_stride: int,
+        context_size: int,
+        transform: object,
+        traj_names: str,
+        normalize: bool = True,
+        predefined_index: list = None,
+        goals_per_obs: int = 1,
+    ):
+        self.data_folder = data_folder
+        self.data_split_folder = data_split_folder
+        self.dataset_name = dataset_name
+        self.goals_per_obs = goals_per_obs
+        traj_names_file = os.path.join(data_split_folder, traj_names)
+        with open(traj_names_file, "r") as f:
+            file_lines = f.read()
+            self.traj_names = file_lines.split("\n")
+        if "" in self.traj_names:
+            self.traj_names.remove("")
+        self.image_size = image_size
+        self.distance_categories = list(range(min_dist_cat, max_dist_cat + 1))
+        self.min_dist_cat = self.distance_categories[0]
+        self.max_dist_cat = self.distance_categories[-1]
+        self.len_traj_pred = len_traj_pred
+        self.traj_stride = traj_stride
+        self.context_size = context_size
+        self.normalize = normalize
+        # load data/data_config.yaml
+        with open("config/data_config.yaml", "r") as f:
+            all_data_config = yaml.safe_load(f)
+        dataset_names = list(all_data_config.keys())
+        dataset_names.sort()
+        # use this index to retrieve the dataset name from the data_config.yaml
+        self.data_config = all_data_config[self.dataset_name]
+        self.transform = transform
+        self._load_index(predefined_index)
+        self.ACTION_STATS = {}
+        for key in all_data_config['action_stats']:
+            self.ACTION_STATS[key] = np.expand_dims(all_data_config['action_stats'][key], axis=0)
+        self.DISTANCE_DIFF_STATS = {} # [NEW]
+        for key in all_data_config['distance_diff_stats']: # [NEW]
+            self.DISTANCE_DIFF_STATS[key] = np.expand_dims(all_data_config['distance_diff_stats'][key], axis=0) # [NEW]
+    def _load_index(self, predefined_index) -> None:
+        """
+        Generates a list of tuples of (obs_traj_name, goal_traj_name, obs_time, goal_time) for each observation in the dataset
+        """
+        if predefined_index:
+            print(f"****** Using a predefined evaluation index... {predefined_index}******")
+            with open(predefined_index, "rb") as f:
+                self.index_to_data = pickle.load(f)
+                return
+        else:
+            print("****** Evaluating from NON PREDEFINED index... ******")
+            index_to_data_path = os.path.join(
+                self.data_split_folder,
+                f"dataset_dist_{self.min_dist_cat}_to_{self.max_dist_cat}_n{self.context_size}_len_traj_pred_{self.len_traj_pred}.pkl",
+            )
+            self.index_to_data, self.goals_index = self._build_index()
+            with open(index_to_data_path, "wb") as f:
+                pickle.dump((self.index_to_data, self.goals_index), f)
+    def _build_index(self, use_tqdm: bool = False):
+        """
+        Build an index consisting of tuples (trajectory name, time, max goal distance)
+        """
+        samples_index = []
+        goals_index = []
+        for traj_name in tqdm.tqdm(self.traj_names, disable=not use_tqdm, dynamic_ncols=True):
+            traj_data = self._get_trajectory(traj_name)
+            traj_len = len(traj_data["position"])
+            for goal_time in range(0, traj_len):
+                goals_index.append((traj_name, goal_time))
+            begin_time = self.context_size - 1
+            end_time = traj_len - self.len_traj_pred
+            for curr_time in range(begin_time, end_time, self.traj_stride):
+                max_goal_distance = min(self.max_dist_cat, traj_len - curr_time - 1)
+                min_goal_distance = max(self.min_dist_cat, -curr_time)
+                samples_index.append((traj_name, curr_time, min_goal_distance, max_goal_distance))
+        return samples_index, goals_index
+    def _get_trajectory(self, trajectory_name):
+        with open(os.path.join(self.data_folder, trajectory_name, "traj_data.pkl"), "rb") as f:
+            traj_data = pickle.load(f)
+        for k,v in traj_data.items():
+            traj_data[k] = v.astype('float')
+        return traj_data
+    def __len__(self) -> int:
+        return len(self.index_to_data)
+    def _compute_actions(self, traj_data, curr_time, goal_time):
+        start_index = curr_time
+        end_index = curr_time + self.len_traj_pred + 1
+        yaw = traj_data["yaw"][start_index:end_index]
+        positions = traj_data["position"][start_index:end_index]
+        goal_pos = traj_data["position"][goal_time]
+        goal_yaw = traj_data["yaw"][goal_time]
+        dist_window = traj_data["distance_to_target"][start_index:end_index]  # shape (len_traj_pred+1,) # [NEW]
+        goal_dist = traj_data["distance_to_target"][goal_time]    # shape (N,) or scalar # [NEW]
+        if len(yaw.shape) == 2:
+            yaw = yaw.squeeze(1)
+        if yaw.shape != (self.len_traj_pred + 1,):
+            raise ValueError("is used?")
+        waypoints_pos = to_local_coords(positions, positions[0], yaw[0])
+        waypoints_yaw = angle_difference(yaw[0], yaw)
+        actions = np.concatenate([waypoints_pos, waypoints_yaw.reshape(-1, 1)], axis=-1)
+        actions = actions[1:]
+        goal_pos = to_local_coords(goal_pos, positions[0], yaw[0])
+        goal_yaw = angle_difference(yaw[0], goal_yaw)
+        diffs_seq = (dist_window[0] - dist_window).reshape(-1, 1)[1:] # [NEW]
+        goal_diff = (dist_window[0] - goal_dist).reshape(-1, 1) # [NEW]
+        if self.normalize:
+            actions[:, :2] /= self.data_config["metric_waypoint_spacing"]
+            goal_pos[:, :2] /= self.data_config["metric_waypoint_spacing"]
+            diffs_seq /= self.data_config["metric_waypoint_spacing"] # [NEW]
+            goal_diff /= self.data_config["metric_waypoint_spacing"] # [NEW]
+        goal_pos = np.concatenate([goal_pos, goal_yaw.reshape(-1, 1)], axis=-1)
+        return actions, goal_pos, diffs_seq, goal_diff
+class TrainingDataset(BaseDataset):
+    def __init__(
+        self,
+        data_folder: str,
+        data_split_folder: str,
+        dataset_name: str,
+        image_size: Tuple[int, int],
+        min_dist_cat: int,
+        max_dist_cat: int,
+        len_traj_pred: int,
+        traj_stride: int,
+        context_size: int,
+        transform: object,
+        traj_names: str = 'traj_names.txt',
+        normalize: bool = True,
+        predefined_index: list = None,
+        goals_per_obs: int = 1,
+        # sample_rate: int = 16000,
+        # target_len: int = 7840
+        sample_rate: int = 16000,
+        input_sr: int = 48000,
+        evaluate: bool = False
+    ):
+        super().__init__(data_folder, data_split_folder, dataset_name, image_size, min_dist_cat, max_dist_cat,
+            len_traj_pred, traj_stride, context_size, transform, traj_names, normalize, predefined_index, goals_per_obs)
+        self.resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=input_sr, lowpass_filter_width=64)
+        self.evaluate = evaluate
+    def __getitem__(self, i: int) -> Tuple[torch.Tensor]:
+        try:
+            f_curr, curr_time, min_goal_dist, max_goal_dist = self.index_to_data[i]
+            goal_offset = np.random.randint(min_goal_dist, max_goal_dist + 1, size=(self.goals_per_obs))
+            goal_time = (curr_time + goal_offset).astype('int')
+            rel_time = (goal_offset).astype('float')/(128.) # TODO: refactor, currently a fixed const
+            context_times = list(range(curr_time - self.context_size + 1, curr_time + 1))
+            context = [(f_curr, t) for t in context_times] + [(f_curr, t) for t in goal_time]
+            obs_image = torch.stack([self.transform(Image.open(get_data_path(self.data_folder, f, t))) for f, t in context])
+            obs_audio = torch.stack([torchaudio.load(get_data_path(self.data_folder, f, t, data_type="audio"))[0] for f, t in context])
+            if self.evaluate:
+                orig_obs_audio = obs_audio
+            obs_audio = self.resampler(obs_audio)
+            # Load other trajectory data
+            curr_traj_data = self._get_trajectory(f_curr)
+            # Compute actions
+            _, goal_pos, _, goal_diff = self._compute_actions(curr_traj_data, curr_time, goal_time)
+            goal_pos[:, :2] = normalize_data(goal_pos[:, :2], self.ACTION_STATS)
+            goal_diff = normalize_data(goal_diff, self.DISTANCE_DIFF_STATS)
+            if self.evaluate:
+                return (
+                    torch.as_tensor(obs_image, dtype=torch.float32),
+                    torch.as_tensor(obs_audio, dtype=torch.float32),
+                    torch.as_tensor(goal_pos, dtype=torch.float32),
+                    torch.as_tensor(goal_diff, dtype=torch.float32),
+                    torch.as_tensor(rel_time, dtype=torch.float32),
+                    torch.as_tensor(orig_obs_audio, dtype=torch.float32),
+                )
+            else:
+                return (
+                    torch.as_tensor(obs_image, dtype=torch.float32),
+                    torch.as_tensor(obs_audio, dtype=torch.float32),
+                    torch.as_tensor(goal_pos, dtype=torch.float32),
+                    torch.as_tensor(goal_diff, dtype=torch.float32),
+                    torch.as_tensor(rel_time, dtype=torch.float32),
+                )
+        except Exception as e:
+            print(f"Exception in {self.dataset_name}", e)
+            raise Exception(e)
+class EvalDataset(BaseDataset):
+    def __init__(
+        self,
+        data_folder: str,
+        data_split_folder: str,
+        dataset_name: str,
+        image_size: Tuple[int, int],
+        min_dist_cat: int,
+        max_dist_cat: int,
+        len_traj_pred: int,
+        traj_stride: int,
+        context_size: int,
+        transform: object,
+        traj_names: str,
+        normalize: bool = True,
+        predefined_index: list = None,
+        goals_per_obs: int = 1,
+        sample_rate: int = 16000,
+        input_sr: int = 48000
+    ):
+        super().__init__(data_folder, data_split_folder, dataset_name, image_size, min_dist_cat, max_dist_cat,
+            len_traj_pred, traj_stride, context_size, transform, traj_names, normalize, predefined_index, goals_per_obs)
+        self.resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=input_sr, lowpass_filter_width=64)
+    def __getitem__(self, i: int) -> Tuple[torch.Tensor]:
+        try:
+            f_curr, curr_time, _, _ = self.index_to_data[i]
+            context_times = list(range(curr_time - self.context_size + 1, curr_time + 1))
+            pred_times = list(range(curr_time + 1, curr_time + self.len_traj_pred + 1))
+            context = [(f_curr, t) for t in context_times]
+            pred = [(f_curr, t) for t in pred_times]
+            obs_image = torch.stack([self.transform(Image.open(get_data_path(self.data_folder, f, t))) for f, t in context])
+            pred_image = torch.stack([self.transform(Image.open(get_data_path(self.data_folder, f, t))) for f, t in pred])
+            orig_obs_audio = torch.stack([torchaudio.load(get_data_path(self.data_folder, f, t, data_type="audio"))[0] for f, t in context])
+            orig_pred_audio = torch.stack([torchaudio.load(get_data_path(self.data_folder, f, t, data_type="audio"))[0] for f, t in pred])
+            obs_audio = self.resampler(orig_obs_audio)
+            pred_audio = self.resampler(orig_pred_audio)
+            curr_traj_data = self._get_trajectory(f_curr)
+            # Compute actions
+            actions, _, diffs_seq, _ = self._compute_actions(curr_traj_data, curr_time, np.array([curr_time+1])) # last argument is dummy goal
+            actions[:, :2] = normalize_data(actions[:, :2], self.ACTION_STATS)
+            diffs_seq = normalize_data(diffs_seq, self.DISTANCE_DIFF_STATS)
+            delta = get_delta_np(actions)
+            diffs_seq = get_delta_np(diffs_seq)
+            return (
+                torch.tensor([i], dtype=torch.float32), # for logging purposes
+                torch.as_tensor(obs_image, dtype=torch.float32),
+                torch.as_tensor(pred_image, dtype=torch.float32),
+                torch.as_tensor(obs_audio, dtype=torch.float32),
+                torch.as_tensor(pred_audio, dtype=torch.float32),
+                torch.as_tensor(diffs_seq, dtype=torch.float32),
+                torch.as_tensor(delta, dtype=torch.float32),
+                torch.as_tensor(orig_obs_audio, dtype=torch.float32),
+                torch.as_tensor(orig_pred_audio, dtype=torch.float32),
+            )
+        except Exception as e:
+            print(f"Exception in {self.dataset_name}", e)
+            raise Exception(e)

diffusion/__init__.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from . import gaussian_diffusion as gd_orig
+from . import gaussian_diffusion_dual as gd_dual
+# from .respace import SpacedDiffusion, space_timesteps
+def create_diffusion(
+    timestep_respacing,
+    noise_schedule="linear",
+    use_kl=False,
+    sigma_small=False,
+    predict_xstart=False,
+    learn_sigma=True,
+    rescale_learned_sigmas=False,
+    diffusion_steps=1000,
+    dual=False
+):
+    if dual:
+        print("Using DUAL diffusion")
+        from .respace_dual import SpacedDiffusion, space_timesteps
+        gd_module = gd_dual
+    else:
+        print("Using SINGLE diffusion")
+        from .respace import SpacedDiffusion, space_timesteps
+        gd_module = gd_orig
+    betas = gd_module.get_named_beta_schedule(noise_schedule, diffusion_steps)
+    # betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
+    if use_kl:
+        loss_type = gd_module.LossType.RESCALED_KL
+    elif rescale_learned_sigmas:
+        loss_type = gd_module.LossType.RESCALED_MSE
+    else:
+        loss_type = gd_module.LossType.MSE
+    if timestep_respacing is None or timestep_respacing == "":
+        timestep_respacing = [diffusion_steps]
+    return SpacedDiffusion(
+        use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
+        betas=betas,
+        model_mean_type=(
+            gd_module.ModelMeanType.EPSILON if not predict_xstart else gd_module.ModelMeanType.START_X
+        ),
+        model_var_type=(
+            (
+                gd_module.ModelVarType.FIXED_LARGE
+                if not sigma_small
+                else gd_module.ModelVarType.FIXED_SMALL
+            )
+            if not learn_sigma
+            else gd_module.ModelVarType.LEARNED_RANGE
+        ),
+        loss_type=loss_type
+        # rescale_timesteps=rescale_timesteps,
+    )

diffusion/diffusion_utils.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import torch as th
+import numpy as np
+def normal_kl(mean1, logvar1, mean2, logvar2):
+    """
+    Compute the KL divergence between two gaussians.
+    Shapes are automatically broadcasted, so batches can be compared to
+    scalars, among other use cases.
+    """
+    tensor = None
+    for obj in (mean1, logvar1, mean2, logvar2):
+        if isinstance(obj, th.Tensor):
+            tensor = obj
+            break
+    assert tensor is not None, "at least one argument must be a Tensor"
+    # Force variances to be Tensors. Broadcasting helps convert scalars to
+    # Tensors, but it does not work for th.exp().
+    logvar1, logvar2 = [
+        x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
+        for x in (logvar1, logvar2)
+    ]
+    return 0.5 * (
+        -1.0
+        + logvar2
+        - logvar1
+        + th.exp(logvar1 - logvar2)
+        + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
+    )
+def approx_standard_normal_cdf(x):
+    """
+    A fast approximation of the cumulative distribution function of the
+    standard normal.
+    """
+    return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
+def continuous_gaussian_log_likelihood(x, *, means, log_scales):
+    """
+    Compute the log-likelihood of a continuous Gaussian distribution.
+    :param x: the targets
+    :param means: the Gaussian mean Tensor.
+    :param log_scales: the Gaussian log stddev Tensor.
+    :return: a tensor like x of log probabilities (in nats).
+    """
+    centered_x = x - means
+    inv_stdv = th.exp(-log_scales)
+    normalized_x = centered_x * inv_stdv
+    log_probs = th.distributions.Normal(th.zeros_like(x), th.ones_like(x)).log_prob(normalized_x)
+    return log_probs
+def discretized_gaussian_log_likelihood(x, *, means, log_scales):
+    """
+    Compute the log-likelihood of a Gaussian distribution discretizing to a
+    given image.
+    :param x: the target images. It is assumed that this was uint8 values,
+              rescaled to the range [-1, 1].
+    :param means: the Gaussian mean Tensor.
+    :param log_scales: the Gaussian log stddev Tensor.
+    :return: a tensor like x of log probabilities (in nats).
+    """
+    assert x.shape == means.shape == log_scales.shape
+    centered_x = x - means
+    inv_stdv = th.exp(-log_scales)
+    plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
+    cdf_plus = approx_standard_normal_cdf(plus_in)
+    min_in = inv_stdv * (centered_x - 1.0 / 255.0)
+    cdf_min = approx_standard_normal_cdf(min_in)
+    log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
+    log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
+    cdf_delta = cdf_plus - cdf_min
+    log_probs = th.where(
+        x < -0.999,
+        log_cdf_plus,
+        th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
+    )
+    assert log_probs.shape == x.shape
+    return log_probs

diffusion/gaussian_diffusion.py ADDED Viewed

	@@ -0,0 +1,870 @@

+import math
+import numpy as np
+import torch as th
+import enum
+from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
+import torch.nn.functional as F
+import torch
+def mean_flat(tensor):
+    """
+    Take the mean over all non-batch dimensions.
+    """
+    return tensor.mean(dim=list(range(1, len(tensor.shape))))
+class ModelMeanType(enum.Enum):
+    """
+    Which type of output the model predicts.
+    Member values come from enum.auto(), so they follow the declaration order below.
+    """
+    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
+    START_X = enum.auto()  # the model predicts x_0
+    EPSILON = enum.auto()  # the model predicts epsilon
+class ModelVarType(enum.Enum):
+    """
+    What is used as the model's output variance.
+    The LEARNED_RANGE option has been added to allow the model to predict
+    values between FIXED_SMALL and FIXED_LARGE, making its job easier.
+    Member values come from enum.auto(), so they follow the declaration order below.
+    """
+    LEARNED = enum.auto()  # variance values are read from the model output
+    FIXED_SMALL = enum.auto()  # fixed variance; lower end of the LEARNED_RANGE interval
+    FIXED_LARGE = enum.auto()  # fixed variance; upper end of the LEARNED_RANGE interval
+    LEARNED_RANGE = enum.auto()  # model output interpolates between the two fixed variances
+class LossType(enum.Enum):
+    MSE = enum.auto()  # use raw MSE loss (and KL when learning variances)
+    RESCALED_MSE = (
+        enum.auto()
+    )  # use raw MSE loss (with RESCALED_KL when learning variances)
+    KL = enum.auto()  # use the variational lower-bound
+    RESCALED_KL = enum.auto()  # like KL, but rescale to estimate the full VLB
+    def is_vb(self):
+        return self == LossType.KL or self == LossType.RESCALED_KL
+def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
+    betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
+    warmup_time = int(num_diffusion_timesteps * warmup_frac)
+    betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
+    return betas
+def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
+    """
+    This is the deprecated API for creating beta schedules.
+    See get_named_beta_schedule() for the new library of schedules.
+    """
+    if beta_schedule == "quad":
+        betas = (
+            np.linspace(
+                beta_start ** 0.5,
+                beta_end ** 0.5,
+                num_diffusion_timesteps,
+                dtype=np.float64,
+            )
+            ** 2
+        )
+    elif beta_schedule == "linear":
+        betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
+    elif beta_schedule == "warmup10":
+        betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
+    elif beta_schedule == "warmup50":
+        betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
+    elif beta_schedule == "const":
+        betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
+    elif beta_schedule == "jsd":  # 1/T, 1/(T-1), 1/(T-2), ..., 1
+        betas = 1.0 / np.linspace(
+            num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64
+        )
+    else:
+        raise NotImplementedError(beta_schedule)
+    assert betas.shape == (num_diffusion_timesteps,)
+    return betas
+def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
+    """
+    Get a pre-defined beta schedule for the given name.
+    The beta schedule library consists of beta schedules which remain similar
+    in the limit of num_diffusion_timesteps.
+    Beta schedules may be added, but should not be removed or changed once
+    they are committed to maintain backwards compatibility.
+    """
+    if schedule_name == "linear":
+        # Linear schedule from Ho et al, extended to work for any number of
+        # diffusion steps.
+        scale = 1000 / num_diffusion_timesteps
+        return get_beta_schedule(
+            "linear",
+            beta_start=scale * 0.0001,
+            beta_end=scale * 0.02,
+            num_diffusion_timesteps=num_diffusion_timesteps,
+        )
+    elif schedule_name == "squaredcos_cap_v2":
+        return betas_for_alpha_bar(
+            num_diffusion_timesteps,
+            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
+        )
+    else:
+        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
+def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function,
+    which defines the cumulative product of (1-beta) over time from t = [0,1].
+    :param num_diffusion_timesteps: the number of betas to produce.
+    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
+                      produces the cumulative product of (1-beta) up to that
+                      part of the diffusion process.
+    :param max_beta: the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+    """
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return np.array(betas)
+class GaussianDiffusion:
+    """
+    Utilities for training and sampling diffusion models.
+    Original ported from this codebase:
+    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
+    :param betas: a 1-D numpy array of betas for each diffusion timestep,
+                  starting at T and going to 1.
+    """
+    def __init__(
+        self,
+        *,
+        betas,
+        model_mean_type,
+        model_var_type,
+        loss_type
+    ):
+        self.model_mean_type = model_mean_type
+        self.model_var_type = model_var_type
+        self.loss_type = loss_type
+        # Use float64 for accuracy.
+        betas = np.array(betas, dtype=np.float64)
+        self.betas = betas
+        assert len(betas.shape) == 1, "betas must be 1-D"
+        assert (betas > 0).all() and (betas <= 1).all()
+        self.num_timesteps = int(betas.shape[0])
+        alphas = 1.0 - betas
+        self.alphas_cumprod = np.cumprod(alphas, axis=0)
+        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
+        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
+        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
+        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
+        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
+        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
+        # calculations for posterior q(x_{t-1} | x_t, x_0)
+        self.posterior_variance = (
+            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        )
+        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
+        self.posterior_log_variance_clipped = np.log(
+            np.append(self.posterior_variance[1], self.posterior_variance[1:])
+        ) if len(self.posterior_variance) > 1 else np.array([])
+        self.posterior_mean_coef1 = (
+            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        )
+        self.posterior_mean_coef2 = (
+            (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
+        )
+    def q_mean_variance(self, x_start, t):
+        """
+        Get the distribution q(x_t | x_0).
+        :param x_start: the [N x C x ...] tensor of noiseless inputs.
+        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
+        """
+        mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
+        log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
+        return mean, variance, log_variance
+    def q_sample(self, x_start, t, noise=None):
+        """
+        Diffuse the data for a given number of diffusion steps.
+        In other words, sample from q(x_t | x_0).
+        :param x_start: the initial data batch.
+        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+        :param noise: if specified, the split-out normal noise.
+        :return: A noisy version of x_start.
+        """
+        if noise is None:
+            noise = th.randn_like(x_start)
+        assert noise.shape == x_start.shape
+        return (
+            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
+        )
+    def q_posterior_mean_variance(self, x_start, x_t, t):
+        """
+        Compute the mean and variance of the diffusion posterior:
+            q(x_{t-1} | x_t, x_0)
+        """
+        assert x_start.shape == x_t.shape
+        posterior_mean = (
+            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
+            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
+        )
+        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
+        posterior_log_variance_clipped = _extract_into_tensor(
+            self.posterior_log_variance_clipped, t, x_t.shape
+        )
+        assert (
+            posterior_mean.shape[0]
+            == posterior_variance.shape[0]
+            == posterior_log_variance_clipped.shape[0]
+            == x_start.shape[0]
+        )
+        return posterior_mean, posterior_variance, posterior_log_variance_clipped
+    def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
+        """
+        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
+        the initial x, x_0.
+        :param model: the model, which takes a signal and a batch of timesteps
+                      as input. It may return either a tensor or a
+                      (tensor, extra) tuple.
+        :param x: the [N x C x ...] tensor at time t.
+        :param t: a 1-D Tensor of timesteps.
+        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample. Applies before
+            clip_denoised.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict with the following keys:
+                 - 'mean': the model mean output.
+                 - 'variance': the model variance output.
+                 - 'log_variance': the log of 'variance'.
+                 - 'pred_xstart': the prediction for x_0.
+                 - 'extra': the second element of a tuple-valued model
+                   output, or None when the model returns a single value.
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+        B, C = x.shape[:2]
+        assert t.shape == (B,)
+        model_output = model(x, t, **model_kwargs)
+        # A tuple-valued output is split into the prediction and an opaque
+        # 'extra' payload that is passed through to the caller untouched.
+        if isinstance(model_output, tuple):
+            model_output, extra = model_output
+        else:
+            extra = None
+        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
+            # The model emits 2*C channels: the first C are the mean-related
+            # prediction, the last C parameterise the variance.
+            assert model_output.shape == (B, C * 2, *x.shape[2:])
+            model_output, model_var_values = th.split(model_output, C, dim=1)
+            min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
+            max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
+            # The model_var_values is [-1, 1] for [min_var, max_var].
+            # NOTE(review): LEARNED and LEARNED_RANGE share this interpolation
+            # here; there is no separate branch for a directly learned variance.
+            frac = (model_var_values + 1) / 2
+            model_log_variance = frac * max_log + (1 - frac) * min_log
+            model_variance = th.exp(model_log_variance)
+        else:
+            # Fixed variance: pick the per-timestep numpy schedule by type,
+            # then gather the entries for t and broadcast to x's shape.
+            model_variance, model_log_variance = {
+                # for fixedlarge, we set the initial (log-)variance like so
+                # to get a better decoder log likelihood.
+                ModelVarType.FIXED_LARGE: (
+                    np.append(self.posterior_variance[1], self.betas[1:]),
+                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
+                ),
+                ModelVarType.FIXED_SMALL: (
+                    self.posterior_variance,
+                    self.posterior_log_variance_clipped,
+                ),
+            }[self.model_var_type]
+            model_variance = _extract_into_tensor(model_variance, t, x.shape)
+            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
+        def process_xstart(x):
+            # Post-process an x_0 estimate: optional user hook first, then
+            # optional clamping to [-1, 1].
+            if denoised_fn is not None:
+                x = denoised_fn(x)
+            if clip_denoised:
+                return x.clamp(-1, 1)
+            return x
+        # NOTE(review): every mean type other than START_X is treated as an
+        # epsilon prediction here; PREVIOUS_X has no dedicated branch.
+        if self.model_mean_type == ModelMeanType.START_X:
+            pred_xstart = process_xstart(model_output)
+        else:
+            pred_xstart = process_xstart(
+                self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
+            )
+        # The model mean is the posterior mean evaluated at the predicted x_0.
+        model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
+        assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
+        return {
+            "mean": model_mean,
+            "variance": model_variance,
+            "log_variance": model_log_variance,
+            "pred_xstart": pred_xstart,
+            "extra": extra,
+        }
+    def _predict_xstart_from_eps(self, x_t, t, eps):
+        assert x_t.shape == eps.shape
+        return (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
+            - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
+        )
+    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
+        return (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
+        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
+    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
+        """
+        Compute the mean for the previous step, given a function cond_fn that
+        computes the gradient of a conditional log probability with respect to
+        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
+        condition on y.
+        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
+        """
+        gradient = cond_fn(x, t, **model_kwargs)
+        new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
+        return new_mean
+    def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
+        """
+        Compute what the p_mean_variance output would have been, should the
+        model's score function be conditioned by cond_fn.
+        See condition_mean() for details on cond_fn.
+        Unlike condition_mean(), this instead uses the conditioning strategy
+        from Song et al (2020).
+        """
+        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
+        eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
+        eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)
+        out = p_mean_var.copy()
+        out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
+        out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
+        return out
+    def p_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+    ):
+        """
+        Sample x_{t-1} from the model at the given timestep.
+        :param model: the model to sample from.
+        :param x: the current tensor at x_{t-1}.
+        :param t: the value of t, starting at 0 for the first diffusion step.
+        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample.
+        :param cond_fn: if not None, this is a gradient function that acts
+                        similarly to the model.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict containing the following keys:
+                 - 'sample': a random sample from the model.
+                 - 'pred_xstart': a prediction of x_0.
+        """
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        noise = th.randn_like(x)
+        nonzero_mask = (
+            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+        )  # no noise when t == 0
+        if cond_fn is not None:
+            out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
+        sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
+        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
+    def p_sample_loop(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+    ):
+        """
+        Generate samples from the model.
+        :param model: the model module.
+        :param shape: the shape of the samples, (N, C, H, W).
+        :param noise: if specified, the noise from the encoder to sample.
+                      Should be of the same shape as `shape`.
+        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample.
+        :param cond_fn: if not None, this is a gradient function that acts
+                        similarly to the model.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param device: if specified, the device to create the samples on.
+                       If not specified, use a model parameter's device.
+        :param progress: if True, show a tqdm progress bar.
+        :return: a non-differentiable batch of samples.
+        """
+        final = None
+        for sample in self.p_sample_loop_progressive(
+            model,
+            shape,
+            noise=noise,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            cond_fn=cond_fn,
+            model_kwargs=model_kwargs,
+            device=device,
+            progress=progress,
+        ):
+            final = sample
+        return final["sample"]
+    def p_sample_loop_progressive(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+    ):
+        """
+        Generate samples from the model and yield intermediate samples from
+        each timestep of diffusion.
+        Arguments are the same as p_sample_loop().
+        Returns a generator over dicts, where each dict is the return value of
+        p_sample().
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise
+        else:
+            img = th.randn(*shape, device=device)
+        indices = list(range(self.num_timesteps))[::-1]
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+            indices = tqdm(indices)
+        for i in indices:
+            t = th.tensor([i] * shape[0], device=device)
+            with th.no_grad():
+                out = self.p_sample(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    cond_fn=cond_fn,
+                    model_kwargs=model_kwargs,
+                )
+                yield out
+                img = out["sample"]
+    def ddim_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        eta=0.0,
+    ):
+        """
+        Sample x_{t-1} from the model using DDIM.
+        Same usage as p_sample().
+        """
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        if cond_fn is not None:
+            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
+        # Usually our model outputs epsilon, but we re-derive it
+        # in case we used x_start or x_prev prediction.
+        eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
+        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
+        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
+        sigma = (
+            eta
+            * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
+            * th.sqrt(1 - alpha_bar / alpha_bar_prev)
+        )
+        # Equation 12.
+        noise = th.randn_like(x)
+        mean_pred = (
+            out["pred_xstart"] * th.sqrt(alpha_bar_prev)
+            + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
+        )
+        nonzero_mask = (
+            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+        )  # no noise when t == 0
+        sample = mean_pred + nonzero_mask * sigma * noise
+        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
+    def ddim_reverse_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        eta=0.0,
+    ):
+        """
+        Sample x_{t+1} from the model using DDIM reverse ODE.
+        """
+        assert eta == 0.0, "Reverse ODE only for deterministic path"
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        if cond_fn is not None:
+            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
+        # Usually our model outputs epsilon, but we re-derive it
+        # in case we used x_start or x_prev prediction.
+        eps = (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
+            - out["pred_xstart"]
+        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
+        alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
+        # Equation 12. reversed
+        mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps
+        return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
+    def ddim_sample_loop(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        eta=0.0,
+    ):
+        """
+        Generate samples from the model using DDIM.
+        Same usage as p_sample_loop().
+        """
+        final = None
+        for sample in self.ddim_sample_loop_progressive(
+            model,
+            shape,
+            noise=noise,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            cond_fn=cond_fn,
+            model_kwargs=model_kwargs,
+            device=device,
+            progress=progress,
+            eta=eta,
+        ):
+            final = sample
+        return final["sample"]
+    def ddim_sample_loop_progressive(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        eta=0.0,
+    ):
+        """
+        Use DDIM to sample from the model and yield intermediate samples from
+        each timestep of DDIM.
+        Same usage as p_sample_loop_progressive().
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise
+        else:
+            img = th.randn(*shape, device=device)
+        indices = list(range(self.num_timesteps))[::-1]
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+            indices = tqdm(indices)
+        for i in indices:
+            t = th.tensor([i] * shape[0], device=device)
+            with th.no_grad():
+                out = self.ddim_sample(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    cond_fn=cond_fn,
+                    model_kwargs=model_kwargs,
+                    eta=eta,
+                )
+                yield out
+                img = out["sample"]
+    def _vb_terms_bpd(
+            self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
+    ):
+        """
+        Get a term for the variational lower-bound.
+        The resulting units are bits (rather than nats, as one might expect).
+        This allows for comparison to other papers.
+        :return: a dict with the following keys:
+                 - 'output': a shape [N] tensor of NLLs or KLs.
+                 - 'pred_xstart': the x_0 predictions.
+        """
+        true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
+            x_start=x_start, x_t=x_t, t=t
+        )
+        out = self.p_mean_variance(
+            model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
+        )
+        kl = normal_kl(
+            true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
+        )
+        kl = mean_flat(kl) / np.log(2.0)
+        decoder_nll = -discretized_gaussian_log_likelihood(
+            x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
+        )
+        assert decoder_nll.shape == x_start.shape
+        decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
+        # At the first timestep return the decoder NLL,
+        # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
+        output = th.where((t == 0), decoder_nll, kl)
+        return {"output": output, "pred_xstart": out["pred_xstart"]}
+    def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
+        """
+        Compute training losses for a single timestep.
+        :param model: the model to evaluate loss on.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :param t: a batch of timestep indices.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param noise: if specified, the specific Gaussian noise to try to remove.
+        :return: a dict with the key "loss" containing a tensor of shape [N].
+                 Some mean or variance settings may also have other keys
+                 ("mse", and "vb" when the variance is learned).
+        :raises NotImplementedError: if self.loss_type is not one of the
+                 four LossType members handled below.
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+        if noise is None:
+            noise = th.randn_like(x_start)
+        # Noise the clean input to timestep t with the given (or sampled) noise.
+        x_t = self.q_sample(x_start, t, noise=noise)
+        terms = {}
+        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
+            # Pure variational-bound objective.
+            terms["loss"] = self._vb_terms_bpd(
+                model=model,
+                x_start=x_start,
+                x_t=x_t,
+                t=t,
+                clip_denoised=False,
+                model_kwargs=model_kwargs,
+            )["output"]
+            if self.loss_type == LossType.RESCALED_KL:
+                terms["loss"] *= self.num_timesteps
+        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
+            # NOTE(review): unlike p_mean_variance, a tuple-valued model output
+            # is not unpacked here; confirm that models trained with the MSE
+            # losses return a plain tensor.
+            model_output = model(x_t, t, **model_kwargs)
+            if self.model_var_type in [
+                ModelVarType.LEARNED,
+                ModelVarType.LEARNED_RANGE,
+            ]:
+                B, C = x_t.shape[:2]
+                assert model_output.shape == (B, C * 2, *x_t.shape[2:])
+                model_output, model_var_values = th.split(model_output, C, dim=1)
+                # Learn the variance using the variational bound, but don't let
+                # it affect our mean prediction.
+                frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
+                # The lambda ignores its arguments and replays the already
+                # computed output, so the model is not evaluated a second time.
+                terms["vb"] = self._vb_terms_bpd(
+                    model=lambda *args, r=frozen_out: r,
+                    x_start=x_start,
+                    x_t=x_t,
+                    t=t,
+                    clip_denoised=False,
+                )["output"]
+                if self.loss_type == LossType.RESCALED_MSE:
+                    # Divide by 1000 for equivalence with initial implementation.
+                    # Without a factor of 1/1000, the VB term hurts the MSE term.
+                    terms["vb"] *= self.num_timesteps / 1000.0
+            # Regression target selected by what the model is set to predict.
+            target = {
+                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
+                    x_start=x_start, x_t=x_t, t=t
+                )[0],
+                ModelMeanType.START_X: x_start,
+                ModelMeanType.EPSILON: noise,
+            }[self.model_mean_type]
+            assert model_output.shape == target.shape == x_start.shape
+            terms["mse"] = mean_flat((target - model_output) ** 2)
+            if "vb" in terms:
+                terms["loss"] = terms["mse"] + terms["vb"]
+            else:
+                terms["loss"] = terms["mse"]
+        else:
+            raise NotImplementedError(self.loss_type)
+        return terms
+    def _prior_bpd(self, x_start):
+        """
+        Get the prior KL term for the variational lower-bound, measured in
+        bits-per-dim.
+        This term can't be optimized, as it only depends on the encoder.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :return: a batch of [N] KL values (in bits), one per batch element.
+        """
+        batch_size = x_start.shape[0]
+        t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
+        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
+        kl_prior = normal_kl(
+            mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
+        )
+        return mean_flat(kl_prior) / np.log(2.0)
+    def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
+        """
+        Compute the entire variational lower-bound, measured in bits-per-dim,
+        as well as other related quantities.
+        :param model: the model to evaluate loss on.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :param clip_denoised: if True, clip denoised samples.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict containing the following keys:
+                 - total_bpd: the total variational lower-bound, per batch element.
+                 - prior_bpd: the prior term in the lower-bound.
+                 - vb: an [N x T] tensor of terms in the lower-bound.
+                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
+                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.
+        """
+        device = x_start.device
+        batch_size = x_start.shape[0]
+        vb = []
+        xstart_mse = []
+        mse = []
+        for t in list(range(self.num_timesteps))[::-1]:
+            t_batch = th.tensor([t] * batch_size, device=device)
+            noise = th.randn_like(x_start)
+            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
+            # Calculate VLB term at the current timestep
+            with th.no_grad():
+                out = self._vb_terms_bpd(
+                    model,
+                    x_start=x_start,
+                    x_t=x_t,
+                    t=t_batch,
+                    clip_denoised=clip_denoised,
+                    model_kwargs=model_kwargs,
+                )
+            vb.append(out["output"])
+            xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
+            eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
+            mse.append(mean_flat((eps - noise) ** 2))
+        vb = th.stack(vb, dim=1)
+        xstart_mse = th.stack(xstart_mse, dim=1)
+        mse = th.stack(mse, dim=1)
+        prior_bpd = self._prior_bpd(x_start)
+        total_bpd = vb.sum(dim=1) + prior_bpd
+        return {
+            "total_bpd": total_bpd,
+            "prior_bpd": prior_bpd,
+            "vb": vb,
+            "xstart_mse": xstart_mse,
+            "mse": mse,
+        }
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+    """
+    Extract values from a 1-D numpy array for a batch of indices.
+    :param arr: the 1-D numpy array.
+    :param timesteps: a tensor of indices into the array to extract.
+    :param broadcast_shape: a larger shape of K dimensions with the batch
+                            dimension equal to the length of timesteps.
+    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
+    """
+    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
+    while len(res.shape) < len(broadcast_shape):
+        res = res[..., None]
+    return res + th.zeros(broadcast_shape, device=timesteps.device)

diffusion/gaussian_diffusion_dual.py ADDED Viewed

	@@ -0,0 +1,975 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# Modified from OpenAI's diffusion repos
+#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+import math
+import numpy as np
+import torch as th
+import enum
+from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
+def mean_flat(tensor):
+    """
+    Take the mean over all non-batch dimensions.
+    """
+    return tensor.mean(dim=list(range(1, len(tensor.shape))))
+class ModelMeanType(enum.Enum):
+    """
+    Which type of output the model predicts.
+    Member values are assigned by enum.auto() in declaration order.
+    """
+    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
+    START_X = enum.auto()  # the model predicts x_0
+    EPSILON = enum.auto()  # the model predicts epsilon
+class ModelVarType(enum.Enum):
+    """
+    What is used as the model's output variance.
+    The LEARNED_RANGE option has been added to allow the model to predict
+    values between FIXED_SMALL and FIXED_LARGE, making its job easier.
+    Member values are assigned by enum.auto() in declaration order.
+    """
+    LEARNED = enum.auto()  # the variance is predicted by the model
+    FIXED_SMALL = enum.auto()  # fixed schedule, the smaller of the two fixed options
+    FIXED_LARGE = enum.auto()  # fixed schedule, the larger of the two fixed options
+    LEARNED_RANGE = enum.auto()  # predicted between FIXED_SMALL and FIXED_LARGE
+class LossType(enum.Enum):
+    MSE = enum.auto()  # use raw MSE loss (and KL when learning variances)
+    RESCALED_MSE = (
+        enum.auto()
+    )  # use raw MSE loss (with RESCALED_KL when learning variances)
+    KL = enum.auto()  # use the variational lower-bound
+    RESCALED_KL = enum.auto()  # like KL, but rescale to estimate the full VLB
+    def is_vb(self):
+        return self == LossType.KL or self == LossType.RESCALED_KL
+def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
+    betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
+    warmup_time = int(num_diffusion_timesteps * warmup_frac)
+    betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
+    return betas
+def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
+    """
+    This is the deprecated API for creating beta schedules.
+    See get_named_beta_schedule() for the new library of schedules.
+    """
+    if beta_schedule == "quad":
+        betas = (
+            np.linspace(
+                beta_start ** 0.5,
+                beta_end ** 0.5,
+                num_diffusion_timesteps,
+                dtype=np.float64,
+            )
+            ** 2
+        )
+    elif beta_schedule == "linear":
+        betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
+    elif beta_schedule == "warmup10":
+        betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
+    elif beta_schedule == "warmup50":
+        betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
+    elif beta_schedule == "const":
+        betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
+    elif beta_schedule == "jsd":  # 1/T, 1/(T-1), 1/(T-2), ..., 1
+        betas = 1.0 / np.linspace(
+            num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64
+        )
+    else:
+        raise NotImplementedError(beta_schedule)
+    assert betas.shape == (num_diffusion_timesteps,)
+    return betas
+def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
+    """
+    Get a pre-defined beta schedule for the given name.
+    The beta schedule library consists of beta schedules which remain similar
+    in the limit of num_diffusion_timesteps.
+    Beta schedules may be added, but should not be removed or changed once
+    they are committed to maintain backwards compatibility.
+    """
+    if schedule_name == "linear":
+        # Linear schedule from Ho et al, extended to work for any number of
+        # diffusion steps.
+        scale = 1000 / num_diffusion_timesteps
+        return get_beta_schedule(
+            "linear",
+            beta_start=scale * 0.0001,
+            beta_end=scale * 0.02,
+            num_diffusion_timesteps=num_diffusion_timesteps,
+        )
+    elif schedule_name == "squaredcos_cap_v2":
+        return betas_for_alpha_bar(
+            num_diffusion_timesteps,
+            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
+        )
+    else:
+        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
+def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function,
+    which defines the cumulative product of (1-beta) over time from t = [0,1].
+    :param num_diffusion_timesteps: the number of betas to produce.
+    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
+                      produces the cumulative product of (1-beta) up to that
+                      part of the diffusion process.
+    :param max_beta: the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+    """
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return np.array(betas)
+class GaussianDiffusion:
+    """
+    Utilities for training and sampling diffusion models.
+    Original ported from this codebase:
+    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
+    :param betas: a 1-D numpy array of betas for each diffusion timestep,
+                  starting at T and going to 1.
+    """
+    def __init__(
+        self,
+        *,
+        betas,
+        model_mean_type,
+        model_var_type,
+        loss_type
+    ):
+        self.model_mean_type = model_mean_type
+        self.model_var_type = model_var_type
+        self.loss_type = loss_type
+        # Use float64 for accuracy.
+        betas = np.array(betas, dtype=np.float64)
+        self.betas = betas
+        assert len(betas.shape) == 1, "betas must be 1-D"
+        assert (betas > 0).all() and (betas <= 1).all()
+        self.num_timesteps = int(betas.shape[0])
+        alphas = 1.0 - betas
+        self.alphas_cumprod = np.cumprod(alphas, axis=0)
+        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
+        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
+        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
+        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
+        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
+        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
+        # calculations for posterior q(x_{t-1} | x_t, x_0)
+        self.posterior_variance = (
+            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        )
+        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
+        self.posterior_log_variance_clipped = np.log(
+            np.append(self.posterior_variance[1], self.posterior_variance[1:])
+        ) if len(self.posterior_variance) > 1 else np.array([])
+        self.posterior_mean_coef1 = (
+            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        )
+        self.posterior_mean_coef2 = (
+            (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
+        )
+    def q_mean_variance(self, x_start, t):
+        """
+        Get the distribution q(x_t | x_0).
+        :param x_start: the [N x C x ...] tensor of noiseless inputs.
+        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
+        """
+        mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
+        log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
+        return mean, variance, log_variance
+    def q_sample(self, x_start, t, noise=None):
+        """
+        Diffuse the data for a given number of diffusion steps.
+        In other words, sample from q(x_t | x_0).
+        :param x_start: the initial data batch.
+        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+        :param noise: if specified, the split-out normal noise.
+        :return: A noisy version of x_start.
+        """
+        if noise is None:
+            noise = th.randn_like(x_start)
+        assert noise.shape == x_start.shape
+        return (
+            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
+        )
+    def q_posterior_mean_variance(self, x_start, x_t, t):
+        """
+        Compute the mean and variance of the diffusion posterior:
+            q(x_{t-1} | x_t, x_0)
+        """
+        assert x_start.shape == x_t.shape
+        posterior_mean = (
+            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
+            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
+        )
+        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
+        posterior_log_variance_clipped = _extract_into_tensor(
+            self.posterior_log_variance_clipped, t, x_t.shape
+        )
+        assert (
+            posterior_mean.shape[0]
+            == posterior_variance.shape[0]
+            == posterior_log_variance_clipped.shape[0]
+            == x_start.shape[0]
+        )
+        return posterior_mean, posterior_variance, posterior_log_variance_clipped
+    def q_posterior_mean_variance_dual(self, x_start, x_t, t):
+        """
+        Compute the posterior mean and variance for each modality:
+            q(x_{t-1} | x_t, x_0)
+        Inputs:
+            x_start: tuple (x_v_start, x_a_start)
+            x_t: tuple (x_v_t, x_a_t)
+            t: Tensor of shape [B]
+        Outputs:
+            posterior_mean: (mean_v, mean_a)
+            posterior_variance: (var_v, var_a)
+            posterior_log_variance_clipped: (logvar_v, logvar_a)
+        """
+        x_v_start, x_a_start = x_start
+        x_v_t, x_a_t = x_t
+        def single_modality_q(x_start_i, x_t_i):
+            assert x_start_i.shape == x_t_i.shape
+            posterior_mean = (
+                _extract_into_tensor(self.posterior_mean_coef1, t, x_t_i.shape) * x_start_i
+                + _extract_into_tensor(self.posterior_mean_coef2, t, x_t_i.shape) * x_t_i
+            )
+            posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t_i.shape)
+            posterior_log_variance_clipped = _extract_into_tensor(
+                self.posterior_log_variance_clipped, t, x_t_i.shape
+            )
+            return posterior_mean, posterior_variance, posterior_log_variance_clipped
+        mean_v, var_v, logvar_v = single_modality_q(x_v_start, x_v_t)
+        mean_a, var_a, logvar_a = single_modality_q(x_a_start, x_a_t)
+        return (mean_v, mean_a), (var_v, var_a), (logvar_v, logvar_a)
+    def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
+        """
+        Dual-modality version.
+        x: (x_v_t, x_a_t)
+        model: takes (x_v_t, x_a_t, t, **model_kwargs)
+        returns: out_v, out_a: dicts with 'mean', 'variance', 'log_variance', 'pred_xstart'
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+        x_v, x_a = x
+        B, C_v = x_v.shape[:2]
+        B, C_a = x_a.shape[:2]
+        assert t.shape == (B,)
+        # Call model once to get both outputs
+        model_output_v, model_output_a = model(x_v, x_a, t, **model_kwargs)
+        # Helper function for one modality
+        def process_modality(x_t, model_output, C):
+            if isinstance(model_output, tuple):
+                model_output, _ = model_output  # drop extra output if any
+            if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
+                assert model_output.shape == (B, C * 2, *x_t.shape[2:])
+                model_output, model_var_values = th.split(model_output, C, dim=1)
+                min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape)
+                max_log = _extract_into_tensor(np.log(self.betas), t, x_t.shape)
+                frac = (model_var_values + 1) / 2
+                model_log_variance = frac * max_log + (1 - frac) * min_log
+                model_variance = th.exp(model_log_variance)
+            else:
+                model_variance_, model_log_variance_ = {
+                    ModelVarType.FIXED_LARGE: (
+                        np.append(self.posterior_variance[1], self.betas[1:]),
+                        np.log(np.append(self.posterior_variance[1], self.betas[1:])),
+                    ),
+                    ModelVarType.FIXED_SMALL: (
+                        self.posterior_variance,
+                        self.posterior_log_variance_clipped,
+                    ),
+                }[self.model_var_type]
+                model_variance = _extract_into_tensor(model_variance_, t, x_t.shape)
+                model_log_variance = _extract_into_tensor(model_log_variance_, t, x_t.shape)
+            def process_xstart(x):
+                if denoised_fn is not None:
+                    x = denoised_fn(x)
+                if clip_denoised:
+                    x = x.clamp(-1, 1)
+                return x
+            if self.model_mean_type == ModelMeanType.START_X:
+                pred_xstart = process_xstart(model_output)
+            else:
+                pred_xstart = process_xstart(
+                    self._predict_xstart_from_eps(x_t=x_t, t=t, eps=model_output)
+                )
+            model_mean, _, _ = self.q_posterior_mean_variance(
+                x_start=pred_xstart, x_t=x_t, t=t
+            )
+            return {
+                "mean": model_mean,
+                "variance": model_variance,
+                "log_variance": model_log_variance,
+                "pred_xstart": pred_xstart,
+            }
+        out_v = process_modality(x_v, model_output_v, C_v)
+        out_a = process_modality(x_a, model_output_a, C_a)
+        return out_v, out_a
+    def _predict_xstart_from_eps(self, x_t, t, eps):
+        assert x_t.shape == eps.shape
+        return (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
+            - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
+        )
+    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
+        return (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
+        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
+    def condition_mean(
+        self,
+        cond_fn,                  # callable(x_v, x_a, t, **model_kwargs) -> (grad_v, grad_a)
+        p_mean_var_v,             # dict for video: contains 'mean', 'variance'
+        p_mean_var_a,             # dict for audio
+        x_v, x_a,                 # x_t for video/audio
+        t,
+        model_kwargs=None,
+    ):
+        """
+        Compute conditional mean separately for each modality:
+            new_mean = mean + variance * ∇ log p(y|x_t)
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+        # cond_fn must return (grad_v, grad_a)
+        grad_v, grad_a = cond_fn(x_v, x_a, t, **model_kwargs)
+        new_mean_v = p_mean_var_v["mean"].float() + p_mean_var_v["variance"] * grad_v.float()
+        new_mean_a = p_mean_var_a["mean"].float() + p_mean_var_a["variance"] * grad_a.float()
+        return new_mean_v, new_mean_a
+    def p_sample(
+        self,
+        model,
+        x_v,
+        x_a,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+    ):
+        """
+        Sample x_{t-1} from the model at the given timestep.
+        :param model: the model to sample from.
+        :param x: the current tensor at x_{t-1}.
+        :param t: the value of t, starting at 0 for the first diffusion step.
+        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample.
+        :param cond_fn: if not None, this is a gradient function that acts
+                        similarly to the model.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict containing the following keys:
+                 - 'sample': a random sample from the model.
+                 - 'pred_xstart': a prediction of x_0.
+        """
+        # out = self.p_mean_variance(
+        #     model,
+        #     x,
+        #     t,
+        #     clip_denoised=clip_denoised,
+        #     denoised_fn=denoised_fn,
+        #     model_kwargs=model_kwargs,
+        # )
+        out_v, out_a = self.p_mean_variance(
+            model=model,
+            x=(x_v, x_a),
+            t=t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        noise_v = th.randn_like(x_v)
+        noise_a = th.randn_like(x_a)
+        nonzero_mask_v = (
+            (t != 0).float().view(-1, *([1] * (len(x_v.shape) - 1)))
+        )  # no noise when t == 0
+        nonzero_mask_a = (
+            (t != 0).float().view(-1, *([1] * (len(x_a.shape) - 1)))
+        )
+        if cond_fn is not None:
+            out_v["mean"], out_a["mean"] = condition_mean(cond_fn, out_v, out_a, x_v, x_a, t, model_kwargs=model_kwargs)
+        sample_v = out_v["mean"] + nonzero_mask_v * th.exp(0.5 * out_v["log_variance"]) * noise_v
+        sample_a = out_a["mean"] + nonzero_mask_a * th.exp(0.5 * out_a["log_variance"]) * noise_a
+        return {"sample_v": sample_v, "sample_a": sample_a, "pred_xstart_v": out_v["pred_xstart"], "pred_xstart_a": out_a["pred_xstart"]}
+    def p_sample_loop(
+        self,
+        model,
+        shape_v,
+        shape_a,
+        noise_v=None,
+        noise_a=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+    ):
+        """
+        Generate samples from the model.
+        :param model: the model module.
+        :param shape: the shape of the samples, (N, C, H, W).
+        :param noise: if specified, the noise from the encoder to sample.
+                      Should be of the same shape as `shape`.
+        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
+        :param denoised_fn: if not None, a function which applies to the
+            x_start prediction before it is used to sample.
+        :param cond_fn: if not None, this is a gradient function that acts
+                        similarly to the model.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param device: if specified, the device to create the samples on.
+                       If not specified, use a model parameter's device.
+        :param progress: if True, show a tqdm progress bar.
+        :return: a non-differentiable batch of samples.
+        """
+        final = None
+        for sample in self.p_sample_loop_progressive(
+            model,
+            shape_v,
+            shape_a,
+            noise_v=noise_v,
+            noise_a=noise_a,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            cond_fn=cond_fn,
+            model_kwargs=model_kwargs,
+            device=device,
+            progress=progress,
+        ):
+            final = sample
+        return final["sample_v"], final["sample_a"]
+    def p_sample_loop_progressive(
+        self,
+        model,
+        shape_v,
+        shape_a,
+        noise_v=None,
+        noise_a=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+    ):
+        """
+        Generate samples from the model and yield intermediate samples from
+        each timestep of diffusion.
+        Arguments are the same as p_sample_loop().
+        Returns a generator over dicts, where each dict is the return value of
+        p_sample().
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape_v, (tuple, list))
+        assert isinstance(shape_a, (tuple, list))
+        if noise_v is not None:
+            img = noise_v
+        else:
+            img = th.randn(*shape_v, device=device)
+        if noise_a is not None:
+            audio = noise_a
+        else:
+            audio = th.randn(*shape_a, device=device)
+        indices = list(range(self.num_timesteps))[::-1]
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+            indices = tqdm(indices)
+        for i in indices:
+            t = th.tensor([i] * shape_v[0], device=device)
+            with th.no_grad():
+            #{"sample_v": sample_v, "sample_a": sample_a, "pred_xstart_v": out_v["pred_xstart"], "pred_xstart_a": out_a["pred_xstart"]}
+                out = self.p_sample(
+                    model,
+                    img,
+                    audio,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    cond_fn=cond_fn,
+                    model_kwargs=model_kwargs,
+                )
+                yield out
+                img = out["sample_v"]
+                audio = out["sample_a"]
+    def ddim_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        eta=0.0,
+    ):
+        """
+        Sample x_{t-1} from the model using DDIM.
+        :param model: the model to sample from.
+        :param x: the current tensor at timestep t.
+        :param t: a tensor of timestep indices, one per batch element.
+        :param clip_denoised: forwarded to p_mean_variance().
+        :param denoised_fn: forwarded to p_mean_variance().
+        :param cond_fn: if not None, passed to self.condition_score().
+        :param model_kwargs: forwarded to p_mean_variance().
+        :param eta: scale of the stochastic term; 0.0 makes the step
+                    deterministic.
+        :return: a dict with keys 'sample' and 'pred_xstart'.
+        NOTE(review): this method still follows a single-tensor interface.
+        p_mean_variance() in this class unpacks `x` into (x_v, x_a) and returns
+        a pair of dicts, so `out["pred_xstart"]` below indexes a tuple with a
+        string. No condition_score() method is visible in this class either.
+        Treat this path as not adapted to the dual-modality model until
+        confirmed otherwise.
+        """
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        if cond_fn is not None:
+            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
+        # Usually our model outputs epsilon, but we re-derive it
+        # in case we used x_start or x_prev prediction.
+        eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
+        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
+        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
+        # sigma scales the injected noise and is exactly 0 when eta == 0.
+        sigma = (
+            eta
+            * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
+            * th.sqrt(1 - alpha_bar / alpha_bar_prev)
+        )
+        # Equation 12.
+        noise = th.randn_like(x)
+        mean_pred = (
+            out["pred_xstart"] * th.sqrt(alpha_bar_prev)
+            + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
+        )
+        nonzero_mask = (
+            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
+        )  # no noise when t == 0
+        sample = mean_pred + nonzero_mask * sigma * noise
+        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
+    def ddim_reverse_sample(
+        self,
+        model,
+        x,
+        t,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        eta=0.0,
+    ):
+        """
+        Sample x_{t+1} from the model using DDIM reverse ODE.
+        :param model: the model to evaluate.
+        :param x: the current tensor at timestep t.
+        :param t: a tensor of timestep indices, one per batch element.
+        :param clip_denoised: forwarded to p_mean_variance().
+        :param denoised_fn: forwarded to p_mean_variance().
+        :param cond_fn: if not None, passed to self.condition_score().
+        :param model_kwargs: forwarded to p_mean_variance().
+        :param eta: must be 0.0; any other value fails the assertion below.
+        :return: a dict with keys 'sample' and 'pred_xstart'.
+        NOTE(review): same single-tensor interface as ddim_sample().
+        p_mean_variance() in this class expects `x` to be a pair and returns a
+        pair of dicts, so `out["pred_xstart"]` indexes a tuple with a string,
+        and no condition_score() method is visible in this class. Confirm
+        before relying on this path.
+        """
+        assert eta == 0.0, "Reverse ODE only for deterministic path"
+        out = self.p_mean_variance(
+            model,
+            x,
+            t,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            model_kwargs=model_kwargs,
+        )
+        if cond_fn is not None:
+            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
+        # Usually our model outputs epsilon, but we re-derive it
+        # in case we used x_start or x_prev prediction.
+        eps = (
+            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
+            - out["pred_xstart"]
+        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
+        # alphas_cumprod_next is the cumulative product shifted by one step,
+        # with 0.0 appended for the last timestep.
+        alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
+        # Equation 12. reversed
+        mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps
+        return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
+    def ddim_sample_loop(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        eta=0.0,
+    ):
+        """
+        Generate samples from the model using DDIM.
+        Same usage as p_sample_loop().
+        """
+        final = None
+        for sample in self.ddim_sample_loop_progressive(
+            model,
+            shape,
+            noise=noise,
+            clip_denoised=clip_denoised,
+            denoised_fn=denoised_fn,
+            cond_fn=cond_fn,
+            model_kwargs=model_kwargs,
+            device=device,
+            progress=progress,
+            eta=eta,
+        ):
+            final = sample
+        return final["sample"]
+    def ddim_sample_loop_progressive(
+        self,
+        model,
+        shape,
+        noise=None,
+        clip_denoised=True,
+        denoised_fn=None,
+        cond_fn=None,
+        model_kwargs=None,
+        device=None,
+        progress=False,
+        eta=0.0,
+    ):
+        """
+        Use DDIM to sample from the model and yield intermediate samples from
+        each timestep of DDIM.
+        Same usage as p_sample_loop_progressive().
+        """
+        if device is None:
+            device = next(model.parameters()).device
+        assert isinstance(shape, (tuple, list))
+        if noise is not None:
+            img = noise
+        else:
+            img = th.randn(*shape, device=device)
+        indices = list(range(self.num_timesteps))[::-1]
+        if progress:
+            # Lazy import so that we don't depend on tqdm.
+            from tqdm.auto import tqdm
+            indices = tqdm(indices)
+        for i in indices:
+            t = th.tensor([i] * shape[0], device=device)
+            with th.no_grad():
+                out = self.ddim_sample(
+                    model,
+                    img,
+                    t,
+                    clip_denoised=clip_denoised,
+                    denoised_fn=denoised_fn,
+                    cond_fn=cond_fn,
+                    model_kwargs=model_kwargs,
+                    eta=eta,
+                )
+                yield out
+                img = out["sample"]
+    def _vb_terms_bpd(
+        self, model, x_v_start, x_a_start, x_v_t, x_a_t, t, clip_denoised=True, model_kwargs=None
+    ):
+        """
+        Dual-modality VB loss.
+        """
+        # --- True posterior
+        (true_mean_v, true_mean_a), _, (logvar_v, logvar_a) = self.q_posterior_mean_variance_dual(
+            x_start=(x_v_start, x_a_start),
+            x_t=(x_v_t, x_a_t),
+            t=t,
+        )
+        # --- Model prediction
+        out_v, out_a = self.p_mean_variance(
+            model=model,
+            x=(x_v_t, x_a_t),
+            t=t,
+            clip_denoised=clip_denoised,
+            model_kwargs=model_kwargs,
+        )
+        # --- KL loss
+        kl_v = normal_kl(true_mean_v, logvar_v, out_v["mean"], out_v["log_variance"])
+        kl_a = normal_kl(true_mean_a, logvar_a, out_a["mean"], out_a["log_variance"])
+        kl_v = mean_flat(kl_v) / np.log(2.0)
+        kl_a = mean_flat(kl_a) / np.log(2.0)
+        # --- NLL loss (only at t=0)
+        decoder_nll_v = -discretized_gaussian_log_likelihood(
+            x_v_start, means=out_v["mean"], log_scales=0.5 * out_v["log_variance"]
+        )
+        decoder_nll_v = mean_flat(decoder_nll_v) / np.log(2.0)
+        decoder_nll_a = -discretized_gaussian_log_likelihood(
+            x_a_start, means=out_a["mean"], log_scales=0.5 * out_a["log_variance"]
+        )
+        decoder_nll_a = mean_flat(decoder_nll_a) / np.log(2.0)
+        # --- Final VB loss
+        output_v = th.where((t == 0), decoder_nll_v, kl_v)
+        output_a = th.where((t == 0), decoder_nll_a, kl_a)
+        return {
+            "output_v": output_v,
+            "output_a": output_a,
+            "pred_xstart": (out_v["pred_xstart"], out_a["pred_xstart"]),
+        }
+    def training_losses(self, model, x_v_start, x_a_start, t, model_kwargs=None, noise_v=None, noise_a=None):
+        """
+        Compute training losses for a single timestep.
+        :param model: the model to evaluate loss on.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :param t: a batch of timestep indices.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :param noise: if specified, the specific Gaussian noise to try to remove.
+        :return: a dict with the key "loss" containing a tensor of shape [N].
+                 Some mean or variance settings may also have other keys.
+        """
+        if model_kwargs is None:
+            model_kwargs = {}
+        if noise_v is None:
+            noise_v = th.randn_like(x_v_start)
+        x_v_t = self.q_sample(x_v_start, t, noise=noise_v)
+        if noise_a is None:
+            noise_a = th.randn_like(x_a_start)
+        x_a_t = self.q_sample(x_a_start, t, noise=noise_a)
+        terms = {}
+        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
+            vb_terms = self._vb_terms_bpd(
+                model=model,
+                x_v_start=x_v_start,
+                x_a_start=x_a_start,
+                x_v_t=x_v_t,
+                x_a_t=x_a_t,
+                t=t,
+                clip_denoised=False,
+                model_kwargs=model_kwargs,
+            )
+            terms["vb_v"] = vb_terms["output_v"]
+            terms["vb_a"] = vb_terms["output_a"]
+            terms["loss"] = vb_terms["output_v"] + vb_terms["output_a"]
+            if self.loss_type == LossType.RESCALED_KL:
+                terms["loss"] *= self.num_timesteps
+        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
+            model_output_v, model_output_a = model(x_v_t, x_a_t, t, **model_kwargs)
+            if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
+                B, C_v = x_v_t.shape[:2]
+                B, C_a = x_a_t.shape[:2]
+                model_output_v, model_var_v = th.split(model_output_v, C_v, dim=1)
+                model_output_a, model_var_a = th.split(model_output_a, C_a, dim=1)
+                frozen_out_v = th.cat([model_output_v.detach(), model_var_v], dim=1)
+                frozen_out_a = th.cat([model_output_a.detach(), model_var_a], dim=1)
+                frozen_model = lambda *args, **kwargs: (frozen_out_v, frozen_out_a)
+                vb_output = self._vb_terms_bpd(
+                    model=frozen_model,
+                    x_v_start=x_v_start,
+                    x_a_start=x_a_start,
+                    x_v_t=x_v_t,
+                    x_a_t=x_a_t,
+                    t=t,
+                    clip_denoised=False,
+                )
+                terms["vb_v"] = vb_output["output_v"]
+                terms["vb_a"] = vb_output["output_a"]
+            # === MSE Loss ===
+            def process_mse(modality, x_start, x_t, model_output, noise):
+                target = {
+                    ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance_dual(
+                        x_start=(x_v_start, x_a_start),
+                        x_t=(x_v_t, x_a_t),
+                        t=t,
+                    )[0][0 if modality == "v" else 1],
+                    ModelMeanType.START_X: x_start,
+                    ModelMeanType.EPSILON: noise,
+                }[self.model_mean_type]
+                assert model_output.shape == target.shape == x_start.shape
+                terms[f"mse_{modality}"] = mean_flat((target - model_output) ** 2)
+            process_mse("v", x_v_start, x_v_t, model_output_v, noise_v)
+            process_mse("a", x_a_start, x_a_t, model_output_a, noise_a)
+            if "vb_v" in terms and "vb_a" in terms:
+                terms["vb"] = terms["vb_v"] + terms["vb_a"]
+                if self.loss_type == LossType.RESCALED_MSE:
+                    terms["vb"] *= self.num_timesteps / 1000.0
+            terms["loss"] = terms["mse_v"] + terms["mse_a"]
+            if "vb" in terms:
+                terms["loss"] += terms["vb"]
+        return terms
+    def _prior_bpd(self, x_start):
+        """
+        Get the prior KL term for the variational lower-bound, measured in
+        bits-per-dim.
+        This term can't be optimized, as it only depends on the encoder.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :return: a batch of [N] KL values (in bits), one per batch element.
+        """
+        batch_size = x_start.shape[0]
+        t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
+        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
+        kl_prior = normal_kl(
+            mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
+        )
+        return mean_flat(kl_prior) / np.log(2.0)
+    def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
+        """
+        Compute the entire variational lower-bound, measured in bits-per-dim,
+        as well as other related quantities.
+        :param model: the model to evaluate loss on.
+        :param x_start: the [N x C x ...] tensor of inputs.
+        :param clip_denoised: if True, clip denoised samples.
+        :param model_kwargs: if not None, a dict of extra keyword arguments to
+            pass to the model. This can be used for conditioning.
+        :return: a dict containing the following keys:
+                 - total_bpd: the total variational lower-bound, per batch element.
+                 - prior_bpd: the prior term in the lower-bound.
+                 - vb: an [N x T] tensor of terms in the lower-bound.
+                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
+                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.
+        NOTE(review): this method still uses a single-tensor interface. It
+        calls _vb_terms_bpd() with `x_start=` / `x_t=` keywords, while that
+        method in this class takes x_v_start, x_a_start, x_v_t, x_a_t and
+        returns 'output_v' / 'output_a' rather than 'output'. Treat this path
+        as not adapted to the dual-modality model until confirmed otherwise.
+        """
+        device = x_start.device
+        batch_size = x_start.shape[0]
+        vb = []
+        xstart_mse = []
+        mse = []
+        # Iterate from the last timestep down to 0.
+        for t in list(range(self.num_timesteps))[::-1]:
+            t_batch = th.tensor([t] * batch_size, device=device)
+            noise = th.randn_like(x_start)
+            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
+            # Calculate VLB term at the current timestep
+            with th.no_grad():
+                out = self._vb_terms_bpd(
+                    model,
+                    x_start=x_start,
+                    x_t=x_t,
+                    t=t_batch,
+                    clip_denoised=clip_denoised,
+                    model_kwargs=model_kwargs,
+                )
+            vb.append(out["output"])
+            xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
+            eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
+            mse.append(mean_flat((eps - noise) ** 2))
+        # Stack the per-timestep lists along dim 1.
+        vb = th.stack(vb, dim=1)
+        xstart_mse = th.stack(xstart_mse, dim=1)
+        mse = th.stack(mse, dim=1)
+        prior_bpd = self._prior_bpd(x_start)
+        total_bpd = vb.sum(dim=1) + prior_bpd
+        return {
+            "total_bpd": total_bpd,
+            "prior_bpd": prior_bpd,
+            "vb": vb,
+            "xstart_mse": xstart_mse,
+            "mse": mse,
+        }
+def _extract_into_tensor(arr, timesteps, broadcast_shape):
+    """
+    Extract values from a 1-D numpy array for a batch of indices.
+    :param arr: the 1-D numpy array.
+    :param timesteps: a tensor of indices into the array to extract.
+    :param broadcast_shape: a larger shape of K dimensions with the batch
+                            dimension equal to the length of timesteps.
+    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
+    """
+    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
+    while len(res.shape) < len(broadcast_shape):
+        res = res[..., None]
+    return res + th.zeros(broadcast_shape, device=timesteps.device)

diffusion/respace.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import numpy as np
+import torch as th
+from .gaussian_diffusion import GaussianDiffusion
def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.
    For example, if there's 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.
    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.
    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    :raises ValueError: if no integer stride yields exactly N steps for
                        "ddimN", or if a section is asked for more steps than
                        it contains.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim"):])
            # Take the smallest integer stride that gives the requested count.
            for stride in range(1, num_timesteps):
                candidate = range(0, num_timesteps, stride)
                if len(candidate) == desired_count:
                    return set(candidate)
            # Report the requested step count, not the length of the process.
            raise ValueError(
                f"cannot create exactly {desired_count} steps with an integer stride"
            )
        section_counts = [int(x) for x in section_counts.split(",")]
    # Split the process into len(section_counts) contiguous sections; the
    # first `extra` sections are one step longer to absorb the remainder.
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(
                f"cannot divide section of {size} steps into {section_count}"
            )
        # Fractional stride so the first and last step of the section are
        # both included when more than one step is requested.
        if section_count <= 1:
            frac_stride = 1
        else:
            frac_stride = (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)
class SpacedDiffusion(GaussianDiffusion):
    """
    A diffusion process which can skip steps in a base diffusion process.
    :param use_timesteps: a collection (sequence or set) of timesteps from the
                          original diffusion process to retain.
    :param dual: a flag stored on the instance as ``self.dual``; this class
                 does not otherwise read it.
    :param kwargs: the kwargs to create the base diffusion process.
    """

    def __init__(self, use_timesteps, dual, **kwargs):
        self.use_timesteps = set(use_timesteps)
        self.timestep_map = []
        self.original_num_steps = len(kwargs["betas"])
        self.dual = dual
        # Build the full-length process only to read its cumulative alphas.
        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
        last_alpha_cumprod = 1.0
        new_betas = []
        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
            if i in self.use_timesteps:
                # Choose the beta that reproduces the base cumulative alpha at
                # every retained step.
                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
                last_alpha_cumprod = alpha_cumprod
                self.timestep_map.append(i)
        kwargs["betas"] = np.array(new_betas)
        super().__init__(**kwargs)

    def p_mean_variance(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)

    def training_losses(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        return super().training_losses(self._wrap_model(model), *args, **kwargs)

    def condition_mean(self, cond_fn, *args, **kwargs):
        return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)

    def condition_score(self, cond_fn, *args, **kwargs):
        return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)

    def _wrap_model(self, model):
        """Wrap ``model`` so it receives original-process timestep indices."""
        if isinstance(model, _WrappedModel):
            return model
        # Pass only the three arguments the wrapper requires; additionally
        # forwarding self.dual made this call raise TypeError.
        return _WrappedModel(model, self.timestep_map, self.original_num_steps)

    def _scale_timesteps(self, t):
        # Scaling is done by the wrapped model.
        return t
class _WrappedModel:
    """
    Callable adapter that translates respaced timestep indices back to the
    indices of the original diffusion process before invoking the model.
    :param model: the callable to wrap; invoked as ``model(x, ts, **kwargs)``.
    :param timestep_map: a list mapping each respaced index to the original
                         timestep index.
    :param original_num_steps: the number of steps in the original process.
    :param dual: optional flag accepted so that callers passing a fourth
                 positional argument do not fail; it is stored on the instance
                 and not consulted by ``__call__``.
    """

    def __init__(self, model, timestep_map, original_num_steps, dual=False):
        self.model = model
        self.timestep_map = timestep_map
        self.original_num_steps = original_num_steps
        self.dual = dual

    def __call__(self, x, ts, **kwargs):
        # Index the map with `ts` to recover original-process timesteps; the
        # values are forwarded unscaled.
        map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
        new_ts = map_tensor[ts]
        return self.model(x, new_ts, **kwargs)

diffusion/respace_dual.py ADDED Viewed

	@@ -0,0 +1,135 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# Modified from OpenAI's diffusion repos
+#     GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
+#     ADM:   https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
+#     IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+import numpy as np
+import torch as th
+from .gaussian_diffusion_dual import GaussianDiffusion
def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.
    For example, if there's 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.
    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.
    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    :raises ValueError: if no integer stride yields exactly N steps for
                        "ddimN", or if a section is asked for more steps than
                        it contains.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim"):])
            # Take the smallest integer stride that gives the requested count.
            for stride in range(1, num_timesteps):
                candidate = range(0, num_timesteps, stride)
                if len(candidate) == desired_count:
                    return set(candidate)
            # Report the requested step count, not the length of the process.
            raise ValueError(
                f"cannot create exactly {desired_count} steps with an integer stride"
            )
        section_counts = [int(x) for x in section_counts.split(",")]
    # Split the process into len(section_counts) contiguous sections; the
    # first `extra` sections are one step longer to absorb the remainder.
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(
                f"cannot divide section of {size} steps into {section_count}"
            )
        # Fractional stride so the first and last step of the section are
        # both included when more than one step is requested.
        if section_count <= 1:
            frac_stride = 1
        else:
            frac_stride = (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)
class SpacedDiffusion(GaussianDiffusion):
    """
    A diffusion process that keeps only a subset of a base process's steps.
    :param use_timesteps: a collection (sequence or set) of timesteps from the
                          original diffusion process to retain.
    :param kwargs: the kwargs to create the base diffusion process; must
                   contain "betas".
    """

    def __init__(self, use_timesteps, **kwargs):
        self.use_timesteps = set(use_timesteps)
        self.timestep_map = []
        self.original_num_steps = len(kwargs["betas"])

        # Build the full-length process only to read its cumulative alphas.
        full_process = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
        respaced_betas = []
        prev_cumprod = 1.0
        for step, cumprod in enumerate(full_process.alphas_cumprod):
            if step not in self.use_timesteps:
                continue
            # Beta that reproduces the base cumulative alpha at this step.
            respaced_betas.append(1 - cumprod / prev_cumprod)
            prev_cumprod = cumprod
            self.timestep_map.append(step)
        kwargs["betas"] = np.array(respaced_betas)
        super().__init__(**kwargs)

    def p_mean_variance(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        wrapped = self._wrap_model(model)
        return super().p_mean_variance(wrapped, *args, **kwargs)

    def training_losses(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        wrapped = self._wrap_model(model)
        return super().training_losses(wrapped, *args, **kwargs)

    def condition_mean(self, cond_fn, *args, **kwargs):
        wrapped = self._wrap_model(cond_fn)
        return super().condition_mean(wrapped, *args, **kwargs)

    def condition_score(self, cond_fn, *args, **kwargs):
        wrapped = self._wrap_model(cond_fn)
        return super().condition_score(wrapped, *args, **kwargs)

    def _wrap_model(self, model):
        """Return ``model`` wrapped for timestep remapping, wrapping at most once."""
        if not isinstance(model, _WrappedModel):
            model = _WrappedModel(
                model, self.timestep_map, self.original_num_steps
            )
        return model

    def _scale_timesteps(self, t):
        # Scaling is done by the wrapped model.
        return t
class _WrappedModel:
    """
    Callable adapter for a two-input model that translates respaced timestep
    indices back to the indices of the original diffusion process.
    :param model: the callable to wrap; invoked as
                  ``model(x_v, x_a, ts, **kwargs)``.
    :param timestep_map: a list mapping each respaced index to the original
                         timestep index.
    :param original_num_steps: the number of steps in the original process.
    """

    def __init__(self, model, timestep_map, original_num_steps):
        self.original_num_steps = original_num_steps
        self.timestep_map = timestep_map
        self.model = model

    def __call__(self, x_v, x_a, ts, **kwargs):
        # Index the map with `ts` to recover original-process timesteps; the
        # values are forwarded unscaled.
        lookup = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
        return self.model(x_v, x_a, lookup[ts], **kwargs)

diffusion/timestep_sampler.py ADDED Viewed

	@@ -0,0 +1,145 @@

+from abc import ABC, abstractmethod
+import numpy as np
+import torch as th
+import torch.distributed as dist
def create_named_schedule_sampler(name, diffusion):
    """
    Build one of the pre-defined ScheduleSampler implementations by name.
    :param name: the sampler name, "uniform" or "loss-second-moment".
    :param diffusion: the diffusion object to sample for.
    :return: the sampler instance constructed for ``diffusion``.
    :raises NotImplementedError: if ``name`` is not a known sampler.
    """
    if name == "uniform":
        return UniformSampler(diffusion)
    if name == "loss-second-moment":
        return LossSecondMomentResampler(diffusion)
    raise NotImplementedError(f"unknown schedule sampler: {name}")
class ScheduleSampler(ABC):
    """
    A distribution over timesteps in the diffusion process, intended to reduce
    variance of the objective.
    The default ``sample`` draws timesteps in proportion to ``weights()`` and
    returns inverse-probability weights, so the mean of the objective is
    unchanged. Subclasses may override ``sample`` to reweight differently.
    """

    @abstractmethod
    def weights(self):
        """
        Get a numpy array of weights, one per diffusion step.
        The weights needn't be normalized, but must be positive.
        """

    def sample(self, batch_size, device):
        """
        Importance-sample timesteps for a batch.
        :param batch_size: the number of timesteps to draw.
        :param device: the torch device the returned tensors are moved to.
        :return: a tuple (timesteps, weights):
                 - timesteps: a long tensor of timestep indices.
                 - weights: a float tensor of weights to scale the resulting
                   losses.
        """
        raw = self.weights()
        probs = raw / np.sum(raw)
        num_steps = len(probs)
        drawn = np.random.choice(num_steps, size=(batch_size,), p=probs)
        timesteps = th.from_numpy(drawn).long().to(device)
        # Inverse-probability weight relative to uniform sampling.
        importance = 1 / (num_steps * probs[drawn])
        return timesteps, th.from_numpy(importance).float().to(device)
class UniformSampler(ScheduleSampler):
    """Sampler that gives every diffusion step the same weight."""

    def __init__(self, diffusion):
        self.diffusion = diffusion
        num_steps = diffusion.num_timesteps
        # One weight of 1.0 per step of the diffusion process.
        self._weights = np.ones([num_steps])

    def weights(self):
        """Return the stored all-ones weight array."""
        return self._weights
class LossAwareSampler(ScheduleSampler):
    """Base class for samplers whose weights are updated from observed losses."""

    def update_with_local_losses(self, local_ts, local_losses):
        """
        Update the reweighting using losses from a model.
        Call this method from each rank with a batch of timesteps and the
        corresponding losses for each of those timesteps.
        This method will perform synchronization to make sure all of the ranks
        maintain the exact same reweighting.
        :param local_ts: an integer Tensor of timesteps.
        :param local_losses: a 1D Tensor of losses.
        """
        device = local_ts.device
        batch_sizes = [
            th.tensor([0], dtype=th.int32, device=device)
            for _ in range(dist.get_world_size())
        ]
        dist.all_gather(
            batch_sizes,
            th.tensor([len(local_ts)], dtype=th.int32, device=device),
        )
        # Pad all_gather batches to be the maximum batch size.
        batch_sizes = [x.item() for x in batch_sizes]
        max_bs = max(batch_sizes)
        timestep_batches = [th.zeros(max_bs).to(local_ts) for _ in batch_sizes]
        loss_batches = [th.zeros(max_bs).to(local_losses) for _ in batch_sizes]
        # The receive buffers are max_bs long, so the tensors sent by this rank
        # must be padded to the same length; the padding is sliced off below
        # using the gathered per-rank batch sizes.
        pad = max_bs - len(local_ts)
        send_ts, send_losses = local_ts, local_losses
        if pad:
            send_ts = th.cat([local_ts, local_ts.new_zeros(pad)])
            send_losses = th.cat([local_losses, local_losses.new_zeros(pad)])
        dist.all_gather(timestep_batches, send_ts)
        dist.all_gather(loss_batches, send_losses)
        timesteps = [
            x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
        ]
        losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
        self.update_with_all_losses(timesteps, losses)

    @abstractmethod
    def update_with_all_losses(self, ts, losses):
        """
        Update the reweighting using losses from a model.
        Sub-classes should override this method to update the reweighting
        using losses from the model.
        This method directly updates the reweighting without synchronizing
        between workers. It is called by update_with_local_losses from all
        ranks with identical arguments. Thus, it should have deterministic
        behavior to maintain state across workers.
        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """
class LossSecondMomentResampler(LossAwareSampler):
    """
    Sampler that weights each timestep by the root mean square of its most
    recent losses, mixed with a small uniform component.
    :param diffusion: the diffusion object; ``num_timesteps`` sizes the history.
    :param history_per_term: how many recent losses are kept per timestep.
    :param uniform_prob: the share of probability mass spread uniformly over
                         all timesteps.
    """

    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
        self.diffusion = diffusion
        self.history_per_term = history_per_term
        self.uniform_prob = uniform_prob
        self._loss_history = np.zeros(
            [diffusion.num_timesteps, history_per_term], dtype=np.float64
        )
        # `np.int` was removed in NumPy 1.24; the builtin `int` is the dtype it
        # aliased, so the array type is unchanged.
        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=int)

    def weights(self):
        """Return per-timestep weights; all ones until every history is full."""
        if not self._warmed_up():
            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
        weights /= np.sum(weights)
        # Blend with a uniform distribution so every timestep stays reachable.
        weights *= 1 - self.uniform_prob
        weights += self.uniform_prob / len(weights)
        return weights

    def update_with_all_losses(self, ts, losses):
        """Record one loss per timestep, keeping the newest history_per_term."""
        for t, loss in zip(ts, losses):
            if self._loss_counts[t] == self.history_per_term:
                # Shift out the oldest loss term.
                self._loss_history[t, :-1] = self._loss_history[t, 1:]
                self._loss_history[t, -1] = loss
            else:
                self._loss_history[t, self._loss_counts[t]] = loss
                self._loss_counts[t] += 1

    def _warmed_up(self):
        """Return True once every timestep has a full loss history."""
        return (self._loss_counts == self.history_per_term).all()

distributed.py ADDED Viewed

	@@ -0,0 +1,277 @@

+import torch
+import torch.distributed as dist
+from torcheval.metrics import FrechetInceptionDistance
+from collections import defaultdict, deque
+import os
+import datetime
+import builtins
+from logging import getLogger
+import pickle
+import time
+logger = getLogger()
def is_dist_avail_and_initialized():
    """Return True only if torch.distributed is available and initialized."""
    return dist.is_available() and dist.is_initialized()
def get_world_size():
    """Return the process-group world size, or 1 when not running distributed."""
    return dist.get_world_size() if is_dist_avail_and_initialized() else 1
def get_rank():
    """Return this process's rank, or 0 when not running distributed."""
    return dist.get_rank() if is_dist_avail_and_initialized() else 0
def is_main_process():
    """Return True when this process has rank 0."""
    rank = get_rank()
    return rank == 0
def setup_for_distributed(is_master):
    """
    Replace ``builtins.print`` with a version that is silent on non-master
    processes and prefixes each message with the current time.

    Output is still produced when the caller passes ``force=True`` or when the
    world size exceeds 8.
    """
    original_print = builtins.print

    def print(*args, **kwargs):
        # `force` is consumed here so it never reaches the real print.
        forced = kwargs.pop('force', False) or (get_world_size() > 8)
        if not (is_master or forced):
            return
        stamp = datetime.datetime.now().time()
        original_print(f'[{stamp}] ', end='')  # print with time stamp
        original_print(*args, **kwargs)

    builtins.print = print
def init_distributed(port=37124, rank_and_world_size=(None, None)):
    """
    Initialize the NCCL process group from torchrun or SLURM environment
    variables, falling back to a single-process setup.

    Rank, world size and GPU index are read from ``RANK`` / ``WORLD_SIZE`` /
    ``LOCAL_RANK`` when ``RANK`` and ``WORLD_SIZE`` are set, otherwise from
    ``SLURM_NTASKS`` / ``SLURM_PROCID`` when ``SLURM_PROCID`` is set, otherwise
    they default to rank 0, world size 1, GPU 0.

    Args:
        port: value used for ``MASTER_PORT`` when the variable is not already
            set in the environment.
        rank_and_world_size: initial ``(rank, world_size)`` values; they are
            overwritten by whichever environment branch succeeds.

    Returns:
        A tuple ``(world_size, rank, gpu, True)``.

    Raises:
        RuntimeError: if the rank, world size or GPU index could not be
            determined from the environment.
    """
    rank, world_size = rank_and_world_size
    # Stays None when an environment branch fails part-way, so the failure can
    # be reported clearly instead of as an unbound local variable.
    gpu = None
    dist_url = 'env://'
    os.environ['MASTER_PORT'] = os.environ.get('MASTER_PORT', str(port))
    print("Using port", os.environ['MASTER_PORT'])
    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        try:
            rank = int(os.environ["RANK"])
            world_size = int(os.environ["WORLD_SIZE"])
            gpu = int(os.environ["LOCAL_RANK"])
        except Exception:
            logger.info('torchrun env vars not sets')
    elif "SLURM_PROCID" in os.environ:
        try:
            world_size = int(os.environ['SLURM_NTASKS'])
            rank = int(os.environ['SLURM_PROCID'])
            gpu = rank % torch.cuda.device_count()
            if 'HOSTNAME' in os.environ:
                os.environ['MASTER_ADDR'] = os.environ['HOSTNAME']
            else:
                os.environ['MASTER_ADDR'] = '127.0.0.1'
        except Exception:
            logger.info('SLURM vars not set')
    else:
        rank = 0
        world_size = 1
        gpu = 0
        os.environ['MASTER_ADDR'] = '127.0.0.1'
    if rank is None or world_size is None or gpu is None:
        raise RuntimeError(
            'could not determine rank/world_size/gpu from the environment '
            '(rank={}, world_size={}, gpu={})'.format(rank, world_size, gpu)
        )
    torch.cuda.set_device(gpu)
    torch.distributed.init_process_group(
        backend='nccl',
        world_size=world_size,
        rank=rank,
        init_method=dist_url
    )
    return world_size, rank, gpu, True
class SmoothedValue(object):
    """Keep a bounded window of recent values plus a running total, exposing
    window statistics (median, avg, max, latest) and the global average.
    """

    def __init__(self, window_size=20, fmt=None):
        self.fmt = "{median:.4f} ({global_avg:.4f})" if fmt is None else fmt
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0

    def update(self, value, n=1):
        """Record ``value`` once in the window and ``n`` times in the totals."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Sum ``count`` and ``total`` across processes.
        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        packed = torch.tensor(
            [self.count, self.total], dtype=torch.float64, device='cuda'
        )
        dist.barrier()
        dist.all_reduce(packed)
        count, total = packed.tolist()
        self.count = int(count)
        self.total = total

    @property
    def median(self):
        window = torch.tensor(list(self.deque))
        return window.median().item()

    @property
    def avg(self):
        window = torch.tensor(list(self.deque), dtype=torch.float32)
        return window.mean().item()

    @property
    def global_avg(self):
        return self.total / self.count

    @property
    def max(self):
        return max(self.deque)

    @property
    def value(self):
        return self.deque[-1]

    def __str__(self):
        stats = dict(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value,
        )
        return self.fmt.format(**stats)
class MetricLogger(object):
    """Collect named SmoothedValue meters and print periodic progress lines.

    Meters are created on first use and are reachable as attributes, e.g.
    ``logger.loss`` returns the meter registered under "loss".
    """

    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        """Feed each keyword value into the meter of the same name.

        ``None`` values are skipped and tensors are converted with ``.item()``.
        """
        for k, v in kwargs.items():
            if v is None:
                continue
            if isinstance(v, torch.Tensor):
                v = v.item()
            assert isinstance(v, (float, int))
            self.meters[k].update(v)

    def __getattr__(self, attr):
        # Only called when normal attribute lookup fails. Read `meters` via
        # __dict__: using `self.meters` here would re-enter __getattr__ and
        # recurse without end when `meters` has not been assigned yet (bare,
        # copied or unpickled instances).
        meters = self.__dict__.get("meters")
        if meters is not None and attr in meters:
            return meters[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        loss_str = []
        for name, meter in self.meters.items():
            loss_str.append(
                "{}: {}".format(name, str(meter))
            )
        return self.delimiter.join(loss_str)

    def synchronize_between_processes(self):
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        """Yield the items of ``iterable`` while printing progress lines.

        A line is printed every ``print_freq`` items and on the last item; a
        total-time summary is printed at the end and the elapsed time is
        recorded in the "total_time" meter.

        Args:
            iterable: a sized iterable (``len()`` is required).
            print_freq: number of items between progress lines.
            header: optional text placed at the start of every line.
        """
        if not header:
            header = ''
        total = len(iterable)
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        space_fmt = ':' + str(len(str(total))) + 'd'
        has_cuda = torch.cuda.is_available()
        log_msg = [
            header,
            '[{0' + space_fmt + '}/{1}]',
            'eta: {eta}',
            '{meters}',
            'time: {time}',
            'data: {data}'
        ]
        if has_cuda:
            log_msg.append('max mem: {memory:.0f}')
        log_msg = self.delimiter.join(log_msg)
        MB = 1024.0 * 1024.0
        for i, obj in enumerate(iterable):
            # Time spent waiting for the item versus the whole iteration.
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == total - 1:
                eta_seconds = iter_time.global_avg * (total - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                fields = dict(
                    eta=eta_string,
                    meters=str(self),
                    time=str(iter_time),
                    data=str(data_time),
                )
                if has_cuda:
                    fields['memory'] = torch.cuda.max_memory_allocated() / MB
                print(log_msg.format(i, total, **fields))
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        # max(..., 1) keeps the summary from dividing by zero when the
        # iterable is empty.
        print('{} Total time: {} ({:.4f} s / it)'.format(
            header, total_time_str, total_time / max(total, 1)))
        self.update(total_time=total_time)
def sync_fid_loss_fns(fid_loss_fn, device="cuda"):
    """
    Merge the per-process FID metric states into one set of metrics.

    When torch.distributed is not available or not initialized, the input is
    returned unchanged.

    Args:
        fid_loss_fn (dict): Local FID metrics on this process, keyed by
            1, 2, 4, 8 and 16.
        device (str): Device the freshly created merged metrics are moved to.
    Returns:
        final_fid_loss_fn (dict): FID metrics holding the merged state of all
            processes, with the same keys.
    """
    if not is_dist_avail_and_initialized():
        return fid_loss_fn

    secs = (1, 2, 4, 8, 16)
    payload = pickle.dumps(fid_loss_fn)
    gathered = [None] * dist.get_world_size()
    dist.barrier()
    dist.all_gather_object(gathered, payload)

    # Start from empty metrics and fold every rank's state into them.
    merged = {
        sec: FrechetInceptionDistance(feature_dim=2048).to(device)
        for sec in secs
    }
    for blob in gathered:
        per_rank = pickle.loads(blob)
        for sec in secs:
            merged[sec].merge_state([per_rank[sec]])
    return merged

eval_audio.py ADDED Viewed

	@@ -0,0 +1,210 @@

+# eval_audio.py
+from typing import Optional
+import os
+import re
+import argparse
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torchaudio
+import librosa
+import matplotlib.pyplot as plt
+_EPS = 1e-12
def build_mel_transform(
    sample_rate,
    n_fft=1024,
    win_length=1024,
    hop_length=256,
    n_mels=80,
    power=1.0,
    f_min=0.0,
    f_max=None,
    mel_scale="htk",
    norm=None,
    device=None,
):
    """Create a torchaudio MelSpectrogram transform with centered frames.

    All arguments except ``device`` are forwarded to
    ``torchaudio.transforms.MelSpectrogram``; when ``device`` is given the
    transform is moved to it before being returned.
    """
    mel_kwargs = dict(
        sample_rate=sample_rate,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        f_min=f_min,
        f_max=f_max,
        n_mels=n_mels,
        power=power,
        center=True,
        norm=norm,
        mel_scale=mel_scale,
    )
    transform = torchaudio.transforms.MelSpectrogram(**mel_kwargs)
    if device is None:
        return transform
    return transform.to(device)
def _ensure_stereo_torch(x):
    """Return ``x`` with exactly two leading channels where possible.

    A 1-D tensor gains a channel axis; a single channel is duplicated; more
    than two channels are truncated to the first two. A two-channel tensor is
    returned as is.
    """
    if x.dim() == 1:
        x = x.unsqueeze(0)
    channels = x.size(0)
    if channels == 1:
        return x.repeat(2, 1)
    if channels > 2:
        return x[:2]
    return x
@torch.no_grad()
def mel_cosine_stereo(
    ref, hat, sample_rate,
    n_fft=1024,
    win_length=1024,
    hop_length=256,
    n_mels=80,
    power=1.0,
    mel_tf=None,
):
    """Return the mean per-channel cosine similarity of two mel spectrograms.

    Both inputs are first coerced to two channels. If ``mel_tf`` is not
    supplied, a transform is built from the spectrogram arguments; either way
    it runs on the device of ``ref``. Each channel's spectrogram is flattened
    before the similarity is taken, and the channel mean is returned as a
    Python float.
    """
    ref = _ensure_stereo_torch(ref)
    hat = _ensure_stereo_torch(hat)
    device = ref.device
    if mel_tf is not None:
        mel_tf = mel_tf.to(device)
    else:
        mel_tf = build_mel_transform(
            sample_rate=sample_rate,
            n_fft=n_fft, win_length=win_length, hop_length=hop_length,
            n_mels=n_mels, power=power, device=device
        )
    mel_ref = mel_tf(ref)
    mel_hat = mel_tf(hat)
    # One flat feature vector per channel.
    flat_ref = mel_ref.reshape(mel_ref.size(0), -1)
    flat_hat = mel_hat.reshape(mel_hat.size(0), -1)
    per_channel = F.cosine_similarity(flat_ref, flat_hat, dim=-1)
    return float(per_channel.mean().item())
@torch.no_grad()
def drms_avg_db_stereo(ref, hat, win_length=1024, hop_length=256):
    """Return the mean difference (hat minus ref) of framewise RMS level in dB.

    Both inputs are coerced to two channels, framed with ``win_length`` and
    ``hop_length`` (zero-padded up to one window if shorter), and compared over
    the number of frames they have in common. The result is averaged over
    frames and then channels and returned as a Python float.
    """
    def _frame_rms_db(signal):
        num_samples = signal.size(1)
        if num_samples < win_length:
            signal = F.pad(signal, (0, win_length - num_samples))
        windows = signal.unfold(dimension=-1, size=win_length, step=hop_length)
        rms = torch.sqrt(windows.pow(2).mean(dim=-1) + _EPS)
        return 20.0 * torch.log10(rms + _EPS)

    ref_db = _frame_rms_db(_ensure_stereo_torch(ref))
    hat_db = _frame_rms_db(_ensure_stereo_torch(hat))
    # Compare only the frames present in both signals.
    n_frames = min(ref_db.size(-1), hat_db.size(-1))
    delta_db = hat_db[:, :n_frames] - ref_db[:, :n_frames]
    return float(delta_db.mean(dim=-1).mean().item())
def load_stereo_wav_np(path):
    """Load an audio file at its native sample rate as a two-channel array.

    A 1-D result or a single-channel 2-D result is duplicated into two
    channels; inputs with more than two channels keep only the first two.

    Returns:
        A tuple ``(y, sr)`` of the two-channel samples and the sample rate.
    """
    y, sr = librosa.load(path, sr=None, mono=False)
    if y.ndim == 1:
        y = np.stack([y, y], axis=0)
    elif y.shape[0] == 1:
        # A (1, T) array would otherwise pass through y[:2] still mono.
        y = np.repeat(y, 2, axis=0)
    elif y.shape[0] != 2:
        y = y[:2]
    return y, sr
def compute_spectrogram_np(audio_stereo,
                           n_fft=512,
                           hop_length=160,
                           win_length=400,
                           pool=4):
    """Return a pooled log-magnitude spectrogram with channels on the last axis.

    For each channel the STFT magnitude is cropped to multiples of ``pool``,
    averaged over ``pool`` x ``pool`` blocks, and passed through ``log1p``.
    If only one channel is present it is used for both outputs.

    Raises:
        ValueError: if the STFT is smaller than ``pool`` along either axis.
    """
    def _pooled_log_magnitude(channel):
        magnitude = np.abs(
            librosa.stft(channel, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
        )
        n_freq, n_time = magnitude.shape
        freq_bins = n_freq // pool
        time_bins = n_time // pool
        if freq_bins == 0 or time_bins == 0:
            raise ValueError(f"audio too short for pooling (stft shape {magnitude.shape})")
        # Drop the remainder so both axes divide evenly, then block-average.
        cropped = magnitude[:freq_bins * pool, :time_bins * pool]
        pooled = cropped.reshape(freq_bins, pool, time_bins, pool).mean(axis=(1, 3))
        return np.log1p(pooled)

    left = _pooled_log_magnitude(audio_stereo[0])
    if audio_stereo.shape[0] >= 2:
        right = _pooled_log_magnitude(audio_stereo[1])
    else:
        right = left.copy()
    return np.stack([left, right], axis=-1)
def render_ref_hat_panel(title, spec_ref, spec_hat, out_path, cmap="magma"):
    """Save a 2x2 figure comparing reference and predicted spectrograms.

    Rows are the left and right channels (last axis index 0 and 1), columns
    are "ref" and "hat". Each row shares one colour range taken from both of
    its images. The output directory is created if needed.

    Returns:
        False (after printing "[SKIP]") if any channel image is empty,
        otherwise True once the figure has been written to ``out_path``.
    """
    left_ref, right_ref = spec_ref[:, :, 0], spec_ref[:, :, 1]
    left_hat, right_hat = spec_hat[:, :, 0], spec_hat[:, :, 1]
    if any(img.size == 0 for img in (left_ref, left_hat, right_ref, right_hat)):
        print("[SKIP]")
        return False

    # Shared colour limits per channel so ref and hat are directly comparable.
    left_range = (min(left_ref.min(), left_hat.min()),
                  max(left_ref.max(), left_hat.max()))
    right_range = (min(right_ref.min(), right_hat.min()),
                   max(right_ref.max(), right_hat.max()))

    fig, axes = plt.subplots(2, 2, figsize=(8, 6), constrained_layout=True)
    cells = (
        (axes[0, 0], left_ref, left_range),
        (axes[0, 1], left_hat, left_range),
        (axes[1, 0], right_ref, right_range),
        (axes[1, 1], right_hat, right_range),
    )
    for ax, img, (lo, hi) in cells:
        ax.imshow(img, origin="lower", aspect="auto", cmap=cmap, vmin=lo, vmax=hi)

    axes[0, 0].set_title("ref")
    axes[0, 1].set_title("hat")
    axes[0, 0].set_ylabel("Left")
    axes[1, 0].set_ylabel("Right")
    for ax in axes.ravel():
        ax.set_xticks([])
        ax.set_yticks([])
    fig.suptitle(title)

    out_dir = os.path.dirname(out_path) or "."
    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(out_path, dpi=180)
    plt.close(fig)
    return True
def save_ref_hat_spectrogram_panel(
    ref, hat, out_path,
    n_fft=512,
    hop_length=160,
    win_length=400,
    pool=4,
    title="ref vs hat (binaural spectrogram)",
    cmap="magma",
):
    """Compute spectrograms for ``ref`` and ``hat`` and save the comparison.

    Inputs may be torch tensors or numpy arrays. Each is converted to a
    two-channel numpy array (mono duplicated, extra channels dropped) before
    the spectrograms are computed and rendered to ``out_path``.

    Returns:
        The result of ``render_ref_hat_panel``.
    """
    def _as_stereo_array(x):
        if isinstance(x, torch.Tensor):
            x = x.detach().to(torch.float32).cpu().numpy()
        if x.ndim == 1:
            return np.stack([x, x], axis=0)
        num_channels = x.shape[0]
        if num_channels == 1:
            return np.repeat(x, 2, axis=0)
        if num_channels > 2:
            return x[:2]
        return x

    stft_kwargs = dict(
        n_fft=n_fft, hop_length=hop_length, win_length=win_length, pool=pool
    )
    ref_np = _as_stereo_array(ref)
    hat_np = _as_stereo_array(hat)
    spec_ref = compute_spectrogram_np(ref_np, **stft_kwargs)
    spec_hat = compute_spectrogram_np(hat_np, **stft_kwargs)
    return render_ref_hat_panel(title, spec_ref, spec_hat, out_path, cmap=cmap)

eval_metrics.py ADDED Viewed

	@@ -0,0 +1,1033 @@

+# Copyright (c) Meta Platforms, Inc.
+# All rights reserved.
+import os
+import json
+import argparse
+from pathlib import Path
+from tqdm import tqdm
+import torch
+import torch.distributed as dist_torch
+import torch.nn.functional as F
+import numpy as np
+from PIL import Image
+import lpips
+from dreamsim import dreamsim
+from torchvision import transforms
+from torcheval.metrics import FrechetInceptionDistance
+import soundfile as sf
+import resampy
+import distributed as dist
+import librosa
+from skimage.metrics import structural_similarity as sk_ssim
+from mel_scale import MelScale
+# -----------------------------
+# Safe, lazy import for FAD (avoid argparse conflicts from dependencies)
+# -----------------------------
def safe_import_fad():
    """
    Import and return ``frechet_audio_distance.FrechetAudioDistance`` while
    ``sys.argv`` is temporarily reduced to the program name, so that code run
    at import time cannot see this script's CLI flags. ``sys.argv`` is
    restored afterwards, including when the import fails.
    """
    import importlib
    import sys

    saved_argv = sys.argv[:]
    try:
        sys.argv = [saved_argv[0]]  # hide our CLI flags from misbehaving imports
        module = importlib.import_module("frechet_audio_distance")
        return module.FrechetAudioDistance
    finally:
        sys.argv = saved_argv
+# -----------------------------
+# Distributed init
+# -----------------------------
+def setup_distributed():
+    if "RANK" in os.environ and "WORLD_SIZE" in os.environ and "LOCAL_RANK" in os.environ:
+        rank        = int(os.environ["RANK"])
+        world_size  = int(os.environ["WORLD_SIZE"])
+        local_rank  = int(os.environ["LOCAL_RANK"])
+    else:
+        return 0, 1, 0
+    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
+    os.environ.setdefault("MASTER_PORT", "29500")
+    assert torch.cuda.is_available(), "CUDA Unavailable"
+    assert torch.cuda.device_count() > local_rank, "local_rank out of the number of GPUs"
+    torch.cuda.set_device(local_rank)
+    dist_torch.init_process_group(
+        backend="nccl",
+        init_method="env://",
+        rank=rank,
+        world_size=world_size,
+    )
+    dist_torch.barrier()
+    if rank == 0:
+        print(f"[init] world_size={world_size} | rank->gpu OK")
+    return rank, world_size, local_rank
+# -----------------------------
+# Vision metrics factory
+# -----------------------------
+def get_loss_fn(loss_fn_type, secs, device):
+    if loss_fn_type == 'lpips':
+        general_lpips_loss_fn = lpips.LPIPS(net='alex').to(device).eval()
+        def loss_fn(img0_paths, img1_paths):
+            img0_list, img1_list = [], []
+            for p0, p1 in zip(img0_paths, img1_paths):
+                img0 = lpips.im2tensor(lpips.load_image(p0)).to(device)  # [-1,1]
+                img1 = lpips.im2tensor(lpips.load_image(p1)).to(device)
+                img0_list.append(img0)
+                img1_list.append(img1)
+            all_img0 = torch.cat(img0_list, dim=0)
+            all_img1 = torch.cat(img1_list, dim=0)
+            with torch.no_grad():
+                dist_val = general_lpips_loss_fn.forward(all_img0, all_img1)
+                return dist_val.mean()
+    elif loss_fn_type == 'dreamsim':
+        dreamsim_loss_fn, preprocess = dreamsim(pretrained=True, device=device)
+        dreamsim_loss_fn.eval()
+        def loss_fn(img0_paths, img1_paths):
+            img0_list, img1_list = [], []
+            for p0, p1 in zip(img0_paths, img1_paths):
+                img0 = preprocess(Image.open(p0)).to(device)
+                img1 = preprocess(Image.open(p1)).to(device)
+                img0_list.append(img0)
+                img1_list.append(img1)
+            all_img0 = torch.cat(img0_list, dim=0)
+            all_img1 = torch.cat(img1_list, dim=0)
+            with torch.no_grad():
+                dist_val = dreamsim_loss_fn(all_img0, all_img1)
+                return dist_val.mean()
+    elif loss_fn_type == 'fid':
+        fid_metrics = {}
+        for sec in secs:
+            fid_metrics[sec] = FrechetInceptionDistance(feature_dim=2048).to(device)
+        return fid_metrics
+    else:
+        raise NotImplementedError
+    return loss_fn
+# ===== Helpers for LSD/SSIM (reproducing AudioMetrics behavior) =====
+# Small positive constant added to denominators and log arguments by the spectral
+# helpers below, so that they never divide by zero or take log10(0).
+_EPS = 1e-12
+def _ensure_stereo_np(y: np.ndarray):
+    if y.ndim == 1:
+        y = np.stack([y, y], axis=0)
+    elif y.ndim == 2:
+        if y.shape[0] == 1:
+            y = np.concatenate([y, y], axis=0)
+        elif y.shape[0] > 2:
+            y = y[:2, :]
+    else:
+        raise ValueError("Unsupported audio array shape")
+    return y
+def _wav_to_spectrogram(wav: np.ndarray, rate: int):
+    if rate == 44100:
+        hop_length = 441
+        n_fft = 2048
+    elif rate == 16000:
+        hop_length = 160
+        n_fft = 743
+    else:
+        raise ValueError("Bad Samplerate (expected 16000 or 44100)")
+    f = np.abs(librosa.stft(wav, hop_length=hop_length, n_fft=n_fft))  # [F, T]
+    f = np.transpose(f, (1, 0))  # [T, F]
+    f_torch = torch.tensor(f[None, None, ...], dtype=torch.float32)  # [1,1,T,F]
+    return f_torch
+def _lsd_from_specs(est: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+    ratio = (target ** 2) / ((est + _EPS) ** 2) + _EPS
+    lsd = torch.log10(ratio) ** 2
+    lsd = torch.mean(torch.mean(lsd, dim=3) ** 0.5, dim=2)
+    return lsd.mean()
+def _mel_lsd_ssim_single(
+    e_wav: np.ndarray,
+    g_wav: np.ndarray,
+    mel_tf: MelScale,
+    n_fft: int = 743,
+    hop_length: int = 160,
+) -> tuple[float, float]:
+    est_mag = np.abs(librosa.stft(e_wav, n_fft=n_fft, hop_length=hop_length))
+    ref_mag = np.abs(librosa.stft(g_wav, n_fft=n_fft, hop_length=hop_length))
+    est_mag_t = torch.from_numpy(est_mag).float()
+    ref_mag_t = torch.from_numpy(ref_mag).float()
+    est_mel = mel_tf(est_mag_t)
+    ref_mel = mel_tf(ref_mag_t)
+    ex_m = est_mel.transpose(0, 1).unsqueeze(0).unsqueeze(0)
+    gt_m = ref_mel.transpose(0, 1).unsqueeze(0).unsqueeze(0)
+    mel_lsd  = float(_lsd_from_specs(ex_m, gt_m))
+    mel_ssim = float(_ssim_from_specs(ex_m, gt_m))
+    return mel_lsd, mel_ssim
+def _to_log_specs(x: torch.Tensor) -> torch.Tensor:
+    return torch.log10(x + _EPS)
+def _pow_p_norm(x: torch.Tensor) -> torch.Tensor:
+    return torch.mean(x.pow(2), dim=(2, 3))
+def _energy_unify(est: torch.Tensor, target: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    p_est = _pow_p_norm(est)
+    p_tgt = _pow_p_norm(target)
+    scale = torch.sqrt((p_tgt + _EPS) / (p_est + _EPS))
+    scale = scale[..., None, None]
+    est_scaled = est * scale
+    return est_scaled, target
+def _sispec_from_specs(est: torch.Tensor, target: torch.Tensor, log_domain: bool) -> torch.Tensor:
+    if log_domain:
+        est = _to_log_specs(est)
+        target = _to_log_specs(target)
+    est_u, tgt_u = _energy_unify(est, target)
+    noise = est_u - tgt_u
+    snr = ( _pow_p_norm(tgt_u) / (_pow_p_norm(noise) + _EPS) ) + _EPS
+    sp_loss = 10.0 * torch.log10(snr)
+    return sp_loss.mean()
+# ===== Image PSNR (RGB on [0,1]) =====
+def _psnr_from_tensors(gt: torch.Tensor, pred: torch.Tensor, data_range: float = 1.0, eps: float = 1e-10) -> torch.Tensor:
+    mse = torch.mean((gt - pred) ** 2, dim=(1, 2, 3))
+    dr = torch.as_tensor(data_range, device=gt.device, dtype=gt.dtype)
+    psnr = 10.0 * torch.log10((dr * dr) / (mse + eps))
+    return psnr
+def _ssim_from_specs(est: torch.Tensor, target: torch.Tensor) -> float:
+    if est.is_cuda:
+        est_np = est.detach().cpu().numpy()
+        tgt_np = target.detach().cpu().numpy()
+    else:
+        est_np = est.numpy()
+        tgt_np = target.numpy()
+    N, C, _, _ = est_np.shape
+    acc, cnt = 0.0, 0
+    for n in range(N):
+        for c in range(C):
+            ref = tgt_np[n, c, ...]
+            out = est_np[n, c, ...]
+            rng = float(out.max() - out.min())
+            rng = 1.0 if rng == 0.0 else rng
+            s = sk_ssim(out, ref, win_size=7, data_range=rng)
+            acc += float(s); cnt += 1
+    return acc / max(cnt, 1)
+# ==========================================================
+#            Streaming, DDP-friendly Audio FAD
+#   (embeddings identical to official FrechetAudioDistance)
+# ==========================================================
+class _RunningGaussianStats:
+    def __init__(self, feat_dim: int, device: torch.device):
+        self.D = feat_dim
+        self.device = device
+        self.reset()
+    def reset(self):
+        D = self.D
+        self.count = torch.zeros(1, device=self.device, dtype=torch.float64)
+        self.sum_feat = torch.zeros(D, device=self.device, dtype=torch.float64)
+        self.sum_outer = torch.zeros(D, D, device=self.device, dtype=torch.float64)
+    @torch.no_grad()
+    def update(self, feats: torch.Tensor):  # [N, D]
+        if feats is None or feats.numel() == 0:
+            return
+        f = feats.to(dtype=torch.float64)
+        self.count += torch.tensor([f.shape[0]], device=self.device, dtype=torch.float64)
+        self.sum_feat += f.sum(dim=0)
+        self.sum_outer += f.t().mm(f)
+    @torch.no_grad()
+    def sync(self):
+        if dist_torch.is_initialized():
+            for t in (self.count, self.sum_feat, self.sum_outer):
+                dist_torch.all_reduce(t, op=dist_torch.ReduceOp.SUM)
+    @torch.no_grad()
+    def mean_cov(self, eps: float = 1e-6):
+        n = int(self.count.item())
+        if n == 0:
+            return None, None
+        mean = self.sum_feat / self.count
+        cov = self.sum_outer / self.count - torch.ger(mean, mean)
+        cov = cov + torch.eye(self.D, device=self.device, dtype=torch.float64) * eps
+        return mean, cov
+@torch.no_grad()
+def _frechet_distance_torch(mean1, cov1, mean2, cov2) -> float:
+    diff = mean1 - mean2
+    diff2 = diff.dot(diff)
+    evals1, evecs1 = torch.linalg.eigh(cov1)
+    sqrt1 = evecs1 @ torch.diag(evals1.clamp(min=0).sqrt()) @ evecs1.t()
+    prod = sqrt1 @ cov2 @ sqrt1
+    evals_prod = torch.linalg.eigvalsh(prod).clamp(min=0).sqrt()
+    trace = torch.trace(cov1 + cov2) - 2.0 * evals_prod.sum()
+    return float((diff2 + trace).item())
+class StreamingFAD:
+    """
+    Mono (downmix) FID-style streaming FAD:
+        - update_from_wavs(paths, is_real=True/False)
+        - compute()  # does DDP all_reduce internally
+    """
+    def __init__(self, fad_backend, pad_seconds: float = 0.96, batch_size: int = 16):
+        self.fad = fad_backend
+        self.device = self.fad.device
+        self.bs = batch_size
+        self.pad_len = int(round(self.fad.sample_rate * float(pad_seconds)))
+        self.feat_dim = self._infer_feat_dim()
+        self.real_stats = _RunningGaussianStats(self.feat_dim, self.device)
+        self.fake_stats = _RunningGaussianStats(self.feat_dim, self.device)
+    def _infer_feat_dim(self) -> int:
+        sr = self.fad.sample_rate
+        x = np.zeros((self.pad_len,), dtype=np.float32)
+        emb = self.fad.get_embeddings([x], sr=sr)
+        return int(emb.shape[-1]) if isinstance(emb, np.ndarray) else int(emb.shape[-1])
+    @torch.no_grad()
+    def _load_and_resample(self, path: str):
+        try:
+            audio, sr = sf.read(path, dtype="float32", always_2d=False)
+        except Exception as e:
+            print(f"[StreamingFAD] read error: {path}: {e}")
+            return None
+        if audio is None or (isinstance(audio, np.ndarray) and audio.size == 0):
+            return None
+        if isinstance(audio, np.ndarray) and audio.ndim == 2:
+            audio = audio.mean(axis=1)
+        if sr != self.fad.sample_rate:
+            try:
+                audio = resampy.resample(audio, sr, self.fad.sample_rate)
+            except Exception as e:
+                print(f"[StreamingFAD] resample error: {path}: {e}")
+                return None
+        if audio.shape[0] < self.pad_len:
+            pad = np.zeros((self.pad_len - audio.shape[0],), dtype=np.float32)
+            audio = np.concatenate([audio, pad], axis=0)
+        return audio.astype(np.float32, copy=False)
+    @torch.no_grad()
+    def update_from_wavs(self, wav_paths, is_real: bool):
+        if not wav_paths:
+            return
+        xs = []
+        for p in wav_paths:
+            a = self._load_and_resample(p)
+            if a is not None:
+                xs.append(a)
+        if not xs:
+            return
+        feats_chunks = []
+        for i in range(0, len(xs), self.bs):
+            chunk = xs[i:i+self.bs]
+            emb_np = self.fad.get_embeddings(chunk, sr=self.fad.sample_rate)
+            if isinstance(emb_np, np.ndarray):
+                if emb_np.size == 0:
+                    continue
+                feats_chunks.append(torch.from_numpy(emb_np).to(self.device))
+            else:
+                if emb_np.numel() == 0:
+                    continue
+                feats_chunks.append(emb_np.to(self.device))
+        if len(feats_chunks) == 0:
+            return
+        feats = torch.cat(feats_chunks, dim=0)
+        (self.real_stats if is_real else self.fake_stats).update(feats)
+    @torch.no_grad()
+    def compute(self) -> float:
+        self.real_stats.sync()
+        self.fake_stats.sync()
+        m1, c1 = self.real_stats.mean_cov()
+        m2, c2 = self.fake_stats.mean_cov()
+        if (m1 is None) or (m2 is None):
+            raise RuntimeError("StreamingFAD: empty stats")
+        return _frechet_distance_torch(m1, c1, m2, c2)
+class StereoStreamingFAD:
+    def __init__(self, fad_backend, pad_seconds: float = 0.96, batch_size: int = 16):
+        self.fad = fad_backend
+        self.device = self.fad.device
+        self.bs = batch_size
+        self.pad_len = int(round(self.fad.sample_rate * float(pad_seconds)))
+        self.feat_dim = self._infer_feat_dim()
+        self.L_real = _RunningGaussianStats(self.feat_dim, self.device)
+        self.L_fake = _RunningGaussianStats(self.feat_dim, self.device)
+        self.R_real = _RunningGaussianStats(self.feat_dim, self.device)
+        self.R_fake = _RunningGaussianStats(self.feat_dim, self.device)
+    def _infer_feat_dim(self) -> int:
+        sr = self.fad.sample_rate
+        x = np.zeros((self.pad_len,), dtype=np.float32)
+        emb = self.fad.get_embeddings([x], sr=sr)
+        return int(emb.shape[-1]) if isinstance(emb, np.ndarray) else int(emb.shape[-1])
+    @torch.no_grad()
+    def _load_lr_and_resample_pad(self, path: str):
+        try:
+            audio, sr = sf.read(path, dtype="float32", always_2d=True)  # [T, C]
+        except Exception as e:
+            print(f"[StereoFAD] read error: {path}: {e}")
+            return None, None
+        if audio is None or audio.size == 0:
+            return None, None
+        C = audio.shape[1]
+        if C == 1:
+            L = audio[:, 0]; R = audio[:, 0]
+        else:
+            L = audio[:, 0]; R = audio[:, 1] if C >= 2 else audio[:, 0]
+        if sr != self.fad.sample_rate:
+            try:
+                L = resampy.resample(L, sr, self.fad.sample_rate)
+                R = resampy.resample(R, sr, self.fad.sample_rate)
+            except Exception as e:
+                print(f"[StereoFAD] resample error: {path}: {e}")
+                return None, None
+        def _pad_to_len(x: np.ndarray, n: int):
+            if x.shape[0] >= n:
+                return x.astype(np.float32, copy=False)
+            pad = np.zeros((n - x.shape[0],), dtype=np.float32)
+            return np.concatenate([x, pad], axis=0)
+        L = _pad_to_len(L, self.pad_len)
+        R = _pad_to_len(R, self.pad_len)
+        return L, R
+    @torch.no_grad()
+    def update_from_wavs(self, wav_paths, is_real: bool):
+        if not wav_paths:
+            return
+        L_list, R_list = [], []
+        for p in wav_paths:
+            L, R = self._load_lr_and_resample_pad(p)
+            if L is not None and R is not None:
+                L_list.append(L); R_list.append(R)
+        if not L_list:
+            return
+        def _embed_and_update(xs, stats_obj: _RunningGaussianStats):
+            feats_chunks = []
+            for i in range(0, len(xs), self.bs):
+                chunk = xs[i:i+self.bs]
+                emb_np = self.fad.get_embeddings(chunk, sr=self.fad.sample_rate)
+                if isinstance(emb_np, np.ndarray):
+                    if emb_np.size == 0:
+                        continue
+                    feats_chunks.append(torch.from_numpy(emb_np).to(self.device))
+                else:
+                    if emb_np.numel() == 0:
+                        continue
+                    feats_chunks.append(emb_np.to(self.device))
+            if len(feats_chunks) == 0:
+                return
+            feats = torch.cat(feats_chunks, dim=0)
+            stats_obj.update(feats)
+        if is_real:
+            _embed_and_update(L_list, self.L_real)
+            _embed_and_update(R_list, self.R_real)
+        else:
+            _embed_and_update(L_list, self.L_fake)
+            _embed_and_update(R_list, self.R_fake)
+    @torch.no_grad()
+    def compute(self):
+        for t in (self.L_real, self.L_fake, self.R_real, self.R_fake):
+            t.sync()
+        mL_r, cL_r = self.L_real.mean_cov()
+        mL_f, cL_f = self.L_fake.mean_cov()
+        mR_r, cR_r = self.R_real.mean_cov()
+        mR_f, cR_f = self.R_fake.mean_cov()
+        if (mL_r is None) or (mL_f is None) or (mR_r is None) or (mR_f is None):
+            raise RuntimeError("StereoStreamingFAD: empty stats")
+        fad_left  = _frechet_distance_torch(mL_r, cL_r, mL_f, cL_f)
+        fad_right = _frechet_distance_torch(mR_r, cR_r, mR_f, cR_f)
+        fad_mean  = 0.5 * (fad_left + fad_right)
+        return float(fad_left), float(fad_right), float(fad_mean)
+# -----------------------------
+# Stereo-friendly Audio Metrics (LSD/SSIM/MelCos/DRMS)
+# -----------------------------
+def _load_librosa_stereo(path: str, sr: int) -> np.ndarray:
+    y, _ = librosa.load(path, sr=sr, mono=False)
+    y = _ensure_stereo_np(y)  # (2, T)
+    return y
+def _mel_cosine_single_channel(wav: np.ndarray, ref: np.ndarray, sr: int, mel_tf: MelScale) -> float:
+    hop_length = 160; n_fft = 743
+    est_mag = np.abs(librosa.stft(wav, hop_length=hop_length, n_fft=n_fft))  # [F, T]
+    ref_mag = np.abs(librosa.stft(ref, hop_length=hop_length, n_fft=n_fft))
+    est_mag_t = torch.tensor(est_mag, dtype=torch.float32)  # [F,T]
+    ref_mag_t = torch.tensor(ref_mag, dtype=torch.float32)  # [F,T]
+    est_mel = mel_tf(est_mag_t)  # [80, T]
+    ref_mel = mel_tf(ref_mag_t)  # [80, T]
+    sim = F.cosine_similarity(est_mel.flatten(), ref_mel.flatten(), dim=0)
+    return float(sim.item())
+# -----------------------------
+# Evaluate
+# -----------------------------
+def evaluate(args, dataset_name, eval_type, metric_logger, loss_fns,
+             gt_dir, exp_dir, secs, device, rank, world_size, modals):
+    lpips_loss_fn, dreamsim_loss_fn, fid_loss_fn = loss_fns
+    if eval_type == 'rollout':
+        eval_name = 'rollout'
+        image_idxs = secs.copy()
+    elif eval_type == 'time':
+        eval_name = eval_type
+        image_idxs = secs.copy()
+    else:
+        raise ValueError(f"Unknown eval_type {eval_type}")
+    if 'v' in modals:
+        for s in secs:
+            metric_logger.meters[f'{dataset_name}_{eval_name}_fid_{int(s)}'].update(0.0, n=0)
+    # Episodes split by rank
+    all_eps = sorted([e for e in os.listdir(gt_dir) if os.path.isdir(os.path.join(gt_dir, e))])
+    eps = all_eps[rank::world_size]
+    if len(eps) == 0:
+        return
+    to_tensor = transforms.ToTensor()
+    fad_streams = {}
+    stereo_mode = False
+    if 'a' in modals:
+        try:
+            FADLib = safe_import_fad()
+        except Exception as e:
+            if rank == 0:
+                print(f"[WARN] Fail to import frechet_audio_distance：{e}")
+            FADLib = None
+        if FADLib is not None:
+            base_fad = FADLib(
+                model_name=args.fad_model,
+                sample_rate=args.fad_sr,
+                verbose=False
+            )
+            if args.fad_model == 'vggish' and not args.mono:
+                stereo_mode = True
+                for sec in secs:
+                    fad_streams[sec] = StereoStreamingFAD(base_fad, pad_seconds=args.fad_pad_sec, batch_size=16)
+            else:
+                for sec in secs:
+                    fad_streams[sec] = StreamingFAD(base_fad, pad_seconds=args.fad_pad_sec, batch_size=16)
+    mel_tf = MelScale(n_mels=80, sample_rate=16000, n_stft=372)
+    for batch_start in tqdm(range(0, len(eps), args.batch_size),
+                            total=(len(eps) + args.batch_size - 1) // args.batch_size,
+                            disable=(rank != 0)):
+        batch_eps = eps[batch_start:batch_start + args.batch_size]
+        # per-sec containers (vision)
+        gt_img_batch, exp_img_batch = {}, {}
+        gt_img_paths_batch, exp_img_paths_batch = {}, {}
+        denorm_pairs_by_sec = {}
+        secs_py = [int(s) for s in secs]
+        denorm_pairs_by_sec = {s: [] for s in secs_py}
+        for sec in secs:
+            gt_img_batch[sec], exp_img_batch[sec] = [], []
+            gt_img_paths_batch[sec], exp_img_paths_batch[sec] = [], []
+        # per-sec containers (audio paths)
+        gt_wav_paths_batch, exp_wav_paths_batch = {}, {}
+        for sec in secs:
+            gt_wav_paths_batch[sec], exp_wav_paths_batch[sec] = [], []
+        for ep in batch_eps:
+            gt_ep_dir = os.path.join(gt_dir, ep)
+            exp_ep_dir = os.path.join(exp_dir, ep)
+            if (not os.path.isdir(gt_ep_dir)) or (not os.path.isdir(exp_ep_dir)):
+                continue
+            gt_dist_p  = os.path.join(gt_ep_dir,  "distance.json")
+            exp_dist_p = os.path.join(exp_ep_dir, "distance.json")
+            try:
+                if os.path.isfile(gt_dist_p) and os.path.isfile(exp_dist_p):
+                    with open(gt_dist_p,  "r") as f: gt_list  = json.load(f)
+                    with open(exp_dist_p, "r") as f: exp_list = json.load(f)
+                    gt_map  = {int(it["sec"]): float(it["denorm_gt"])   for it in gt_list  if "sec" in it and "denorm_gt"  in it}
+                    exp_map = {int(it["sec"]): float(it["denorm_pred"]) for it in exp_list if "sec" in it and "denorm_pred" in it}
+                    for s in secs_py:
+                        if s in gt_map and s in exp_map:
+                            denorm_pairs_by_sec[s].append((gt_map[s], exp_map[s]))
+            except Exception:
+                pass
+            for sec, image_idx in zip(secs, image_idxs):
+                # ---- vision
+                if 'v' in modals:
+                    gt_sec_img_path = os.path.join(gt_ep_dir, f'{int(image_idx)}.png')
+                    exp_sec_img_path = os.path.join(exp_ep_dir, f'{int(image_idx)}.png')
+                    if os.path.isfile(gt_sec_img_path) and os.path.isfile(exp_sec_img_path):
+                        try:
+                            gt_img = to_tensor(Image.open(gt_sec_img_path).convert("RGB")).unsqueeze(0).to(device)
+                            exp_img = to_tensor(Image.open(exp_sec_img_path).convert("RGB")).unsqueeze(0).to(device)
+                            if torch.isfinite(gt_img).all() and torch.isfinite(exp_img).all():
+                                gt_img_batch[sec].append(gt_img)
+                                exp_img_batch[sec].append(exp_img)
+                                gt_img_paths_batch[sec].append(gt_sec_img_path)
+                                exp_img_paths_batch[sec].append(exp_sec_img_path)
+                        except Exception:
+                            pass
+                # ---- audio
+                if 'a' in modals:
+                    gt_sec_wav_path = os.path.join(gt_ep_dir, f'{int(image_idx)}.wav')
+                    exp_sec_wav_path = os.path.join(exp_ep_dir, f'{int(image_idx)}.wav')
+                    if os.path.isfile(gt_sec_wav_path) and os.path.isfile(exp_sec_wav_path):
+                        gt_wav_paths_batch[sec].append(gt_sec_wav_path)
+                        exp_wav_paths_batch[sec].append(exp_sec_wav_path)
+        # ---- vision metric update per batch
+        if 'v' in modals:
+            for sec in secs:
+                if (len(gt_img_batch[sec]) == 0) or (len(exp_img_batch[sec]) == 0):
+                    continue
+                lpips_dists = lpips_loss_fn(gt_img_paths_batch[sec], exp_img_paths_batch[sec])
+                dreamsim_dists = dreamsim_loss_fn(gt_img_paths_batch[sec], exp_img_paths_batch[sec])
+                metric_logger.meters[f'{dataset_name}_{eval_name}_lpips_{sec}'].update(lpips_dists, n=1)
+                metric_logger.meters[f'{dataset_name}_{eval_name}_dreamsim_{sec}'].update(dreamsim_dists, n=1)
+                sec_gt_batch = torch.cat(gt_img_batch[sec], dim=0)
+                sec_exp_batch = torch.cat(exp_img_batch[sec], dim=0)
+                if torch.isfinite(sec_gt_batch).all() and torch.isfinite(sec_exp_batch).all():
+                    fid_loss_fn[sec].update(images=sec_gt_batch, is_real=True)
+                    fid_loss_fn[sec].update(images=sec_exp_batch, is_real=False)
+                    psnr_vals = _psnr_from_tensors(sec_gt_batch, sec_exp_batch, data_range=1.0)  # (N,)
+                    metric_logger.meters[f'{dataset_name}_{eval_name}_psnr_{sec}'].update(psnr_vals.mean(), n=1)
+        # ---- audio metrics per batch
+        if 'a' in modals:
+            # FAD (streaming)
+            if len(fad_streams) > 0:
+                for sec in secs:
+                    if len(gt_wav_paths_batch[sec]) == 0 and len(exp_wav_paths_batch[sec]) == 0:
+                        continue
+                    fad_streams[sec].update_from_wavs(gt_wav_paths_batch[sec], is_real=True)
+                    fad_streams[sec].update_from_wavs(exp_wav_paths_batch[sec], is_real=False)
+            # LSD / SSIM / MelCos / dRMS-db
+            _AUDIO_SR = 16000
+            for sec in secs:
+                gt_list = gt_wav_paths_batch[sec]
+                exp_list = exp_wav_paths_batch[sec]
+                if len(gt_list) == 0 or len(exp_list) == 0:
+                    continue
+                pair_cnt = min(len(gt_list), len(exp_list))
+                if pair_cnt == 0:
+                    continue
+                lsd_L, lsd_R, ssim_L, ssim_R = [], [], [], []
+                mel_L, mel_R = [], []
+                mel_lsd_L, mel_lsd_R = [], []
+                mel_ssim_L, mel_ssim_R = [], []
+                sispec_nl_L, sispec_nl_R = [], []
+                sispec_log_L, sispec_log_R = [], []
+                mel_sispec_nl_L, mel_sispec_n_R = [], []
+                mel_sispec_log_L, mel_sispec_log_R = [], []
+                for i in range(pair_cnt):
+                    gpath = gt_list[i]
+                    epath = exp_list[i]
+                    try:
+                        g_st = _load_librosa_stereo(gpath, _AUDIO_SR)  # (2,T)
+                        e_st = _load_librosa_stereo(epath, _AUDIO_SR)  # (2,T)
+                        if args.mono:
+                            g_mono = g_st.mean(axis=0)
+                            e_mono = e_st.mean(axis=0)
+                            # LSD/SSIM
+                            gt_sp = _wav_to_spectrogram(g_mono, rate=_AUDIO_SR)
+                            ex_sp = _wav_to_spectrogram(e_mono, rate=_AUDIO_SR)
+                            lsd_val = _lsd_from_specs(ex_sp.clone(), gt_sp.clone())
+                            ssim_val = _ssim_from_specs(ex_sp.clone(), gt_sp.clone())
+                            # MelCos
+                            mel_val = _mel_cosine_single_channel(e_mono, g_mono, _AUDIO_SR, mel_tf)
+                            # mel_lsd & mel_ssim
+                            mel_lsd_val, mel_ssim_val = _mel_lsd_ssim_single(e_mono, g_mono, mel_tf)
+                            # sispec
+                            sispec_nl  = _sispec_from_specs(ex_sp.clone(), gt_sp.clone(), log_domain=False)
+                            sispec_log = _sispec_from_specs(ex_sp.clone(), gt_sp.clone(), log_domain=True)
+                            # Mel sispec
+                            mel_sispec_nl  = _sispec_from_specs(ex_m.clone(), gt_m.clone(), log_domain=False)
+                            mel_sispec_log = _sispec_from_specs(ex_m.clone(), gt_m.clone(), log_domain=True)
+                            metric_logger.meters[f'{dataset_name}_{eval_name}_lsd_{sec}'].update(lsd_val, n=1)
+                            metric_logger.meters[f'{dataset_name}_{eval_name}_ssim_{sec}'].update(
+                                torch.tensor(ssim_val), n=1
+                            )
+                            metric_logger.meters[f'{dataset_name}_{eval_name}_melcos_{sec}'].update(
+                                torch.tensor(mel_val), n=1
+                            )
+                            metric_logger.meters[f'{dataset_name}_{eval_name}_final_mel_lsd_{sec}'].update(
+                                torch.tensor(float(mel_lsd_val)), n=1
+                            )
+                            metric_logger.meters[f'{dataset_name}_{eval_name}_final_mel_ssim_{sec}'].update(
+                                torch.tensor(float(mel_ssim_val)), n=1
+                            )
+                            metric_logger.meters[f'{dataset_name}_{eval_name}_non_log_sispec_{sec}'].update(
+                                torch.tensor(float(sispec_nl)), n=1
+                            )
+                            metric_logger.meters[f'{dataset_name}_{eval_name}_sispec_{sec}'].update(
+                                torch.tensor(float(sispec_log)), n=1
+                            )
+                            metric_logger.meters[f'{dataset_name}_{eval_name}_final_non_log_mel_sispec_{sec}'].update(
+                                torch.tensor(float(mel_sispec_nl)), n=1
+                            )
+                            metric_logger.meters[f'{dataset_name}_{eval_name}_final_mel_sispec_{sec}'].update(
+                                torch.tensor(float(mel_sispec_log)), n=1
+                            )
+                        else:
+                            for ch, (acc_lsd, acc_ssim, acc_mel,
+                                    acc_mel_lsd, acc_mel_ssim,
+                                    acc_sispec_nl, acc_sispec_log,
+                                    acc_mel_sispec_nl, acc_mel_sispec_log) in enumerate([
+                                (lsd_L, ssim_L, mel_L, mel_lsd_L, mel_ssim_L, sispec_nl_L, sispec_log_L, mel_sispec_nl_L, mel_sispec_log_L),
+                                (lsd_R, ssim_R, mel_R, mel_lsd_R, mel_ssim_R, sispec_nl_R, sispec_log_R, mel_sispec_n_R, mel_sispec_log_R),
+                            ]):
+                                g = g_st[ch]; e = e_st[ch]
+                                # LSD/SSIM
+                                gt_sp = _wav_to_spectrogram(g, rate=_AUDIO_SR)
+                                ex_sp = _wav_to_spectrogram(e, rate=_AUDIO_SR)
+                                acc_lsd.append(float(_lsd_from_specs(ex_sp.clone(), gt_sp.clone())))
+                                acc_ssim.append(float(_ssim_from_specs(ex_sp.clone(), gt_sp.clone())))
+                                # MelCos
+                                acc_mel.append(_mel_cosine_single_channel(e, g, _AUDIO_SR, mel_tf))
+                                # mel_lsd & mel_ssim
+                                mel_lsd_val, mel_ssim_val = _mel_lsd_ssim_single(e, g, mel_tf)
+                                acc_mel_lsd.append(mel_lsd_val)
+                                acc_mel_ssim.append(mel_ssim_val)
+                                # sispec
+                                acc_sispec_nl.append( float(_sispec_from_specs(ex_sp.clone(), gt_sp.clone(), log_domain=False)) )
+                                acc_sispec_log.append( float(_sispec_from_specs(ex_sp.clone(), gt_sp.clone(), log_domain=True)) )
+                                # Mel
+                                est_mag = np.abs(librosa.stft(e, n_fft=743, hop_length=160))
+                                ref_mag = np.abs(librosa.stft(g, n_fft=743, hop_length=160))
+                                est_mel = mel_tf(torch.from_numpy(est_mag).float())  # [M,T]
+                                ref_mel = mel_tf(torch.from_numpy(ref_mag).float())  # [M,T]
+                                ex_m = est_mel.T.unsqueeze(0).unsqueeze(0)  # [1,1,T,M]
+                                gt_m = ref_mel.T.unsqueeze(0).unsqueeze(0)  # [1,1,T,M]
+                                # sispec(Mel, non_log / log)
+                                acc_mel_sispec_nl.append( float(_sispec_from_specs(ex_m.clone(), gt_m.clone(), log_domain=False)) )
+                                acc_mel_sispec_log.append( float(_sispec_from_specs(ex_m.clone(), gt_m.clone(), log_domain=True)) )
+                    except Exception:
+                        pass
+                if not args.mono:
+                    def _maybe_mean(x):
+                        return float(np.mean(x)) if len(x) > 0 else None
+                    v = _maybe_mean(lsd_L);  w = _maybe_mean(lsd_R)
+                    if v is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_lsdL_{sec}'].update(torch.tensor(v), n=1)
+                    if w is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_lsdR_{sec}'].update(torch.tensor(w), n=1)
+                    if v is not None and w is not None:
+                        metric_logger.meters[f'{dataset_name}_{eval_name}_lsd_{sec}'].update(torch.tensor(0.5*(v+w)), n=1)
+                    v = _maybe_mean(ssim_L); w = _maybe_mean(ssim_R)
+                    if v is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_ssimL_{sec}'].update(torch.tensor(v), n=1)
+                    if w is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_ssimR_{sec}'].update(torch.tensor(w), n=1)
+                    if v is not None and w is not None:
+                        metric_logger.meters[f'{dataset_name}_{eval_name}_ssim_{sec}'].update(torch.tensor(0.5*(v+w)), n=1)
+                    v = _maybe_mean(mel_L);  w = _maybe_mean(mel_R)
+                    if v is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_melcosL_{sec}'].update(torch.tensor(v), n=1)
+                    if w is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_melcosR_{sec}'].update(torch.tensor(w), n=1)
+                    if v is not None and w is not None:
+                        metric_logger.meters[f'{dataset_name}_{eval_name}_melcos_{sec}'].update(torch.tensor(0.5*(v+w)), n=1)
+                    v = _maybe_mean(mel_lsd_L);  w = _maybe_mean(mel_lsd_R)
+                    if v is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_final_mel_lsdL_{sec}'].update(torch.tensor(v), n=1)
+                    if w is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_final_mel_lsdR_{sec}'].update(torch.tensor(w), n=1)
+                    if v is not None and w is not None:
+                        metric_logger.meters[f'{dataset_name}_{eval_name}_final_mel_lsd_{sec}'].update(torch.tensor(0.5*(v+w)), n=1)
+                    v = _maybe_mean(mel_ssim_L); w = _maybe_mean(mel_ssim_R)
+                    if v is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_final_mel_ssimL_{sec}'].update(torch.tensor(v), n=1)
+                    if w is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_final_mel_ssimR_{sec}'].update(torch.tensor(w), n=1)
+                    if v is not None and w is not None:
+                        metric_logger.meters[f'{dataset_name}_{eval_name}_final_mel_ssim_{sec}'].update(torch.tensor(0.5*(v+w)), n=1)
+                    v = _maybe_mean(sispec_nl_L); w = _maybe_mean(sispec_nl_R)
+                    if v is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_non_log_sispecL_{sec}'].update(torch.tensor(v), n=1)
+                    if w is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_non_log_sispecR_{sec}'].update(torch.tensor(w), n=1)
+                    if v is not None and w is not None:
+                        metric_logger.meters[f'{dataset_name}_{eval_name}_non_log_sispec_{sec}'].update(torch.tensor(0.5*(v+w)), n=1)
+                    v = _maybe_mean(sispec_log_L); w = _maybe_mean(sispec_log_R)
+                    if v is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_sispecL_{sec}'].update(torch.tensor(v), n=1)
+                    if w is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_sispecR_{sec}'].update(torch.tensor(w), n=1)
+                    if v is not None and w is not None:
+                        metric_logger.meters[f'{dataset_name}_{eval_name}_sispec_{sec}'].update(torch.tensor(0.5*(v+w)), n=1)
+                    v = _maybe_mean(mel_sispec_nl_L); w = _maybe_mean(mel_sispec_n_R)
+                    if v is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_final_non_log_mel_sispecL_{sec}'].update(torch.tensor(v), n=1)
+                    if w is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_final_non_log_mel_sispecR_{sec}'].update(torch.tensor(w), n=1)
+                    if v is not None and w is not None:
+                        metric_logger.meters[f'{dataset_name}_{eval_name}_final_non_log_mel_sispec_{sec}'].update(torch.tensor(0.5*(v+w)), n=1)
+                    v = _maybe_mean(mel_sispec_log_L); w = _maybe_mean(mel_sispec_log_R)
+                    if v is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_final_mel_sispecL_{sec}'].update(torch.tensor(v), n=1)
+                    if w is not None: metric_logger.meters[f'{dataset_name}_{eval_name}_final_mel_sispecR_{sec}'].update(torch.tensor(w), n=1)
+                    if v is not None and w is not None:
+                        metric_logger.meters[f'{dataset_name}_{eval_name}_final_mel_sispec_{sec}'].update(torch.tensor(0.5*(v+w)), n=1)
+        for s in secs_py:
+            pairs = denorm_pairs_by_sec[s]
+            if not pairs:
+                continue
+            arr = np.asarray(pairs, dtype=np.float32)
+            mask = np.isfinite(arr).all(axis=1)
+            if not np.any(mask):
+                continue
+            se_mean = float(np.mean((arr[mask, 1] - arr[mask, 0]) ** 2))
+            metric_logger.meters[f'{dataset_name}_{eval_name}_denorm_mse_{s}'].update(
+                torch.tensor(se_mean), n=1
+            )
+    if 'v' in modals:
+        feature_dim = 2048
+        sec_list = [int(s) for s in secs]
+        tmp_dir = Path(os.path.join(args.exp_dir, ".fid_tmp"))
+        if dist_torch.is_initialized():
+            if dist_torch.get_rank() == 0:
+                tmp_dir.mkdir(parents=True, exist_ok=True)
+            dist_torch.barrier()
+        else:
+            tmp_dir.mkdir(parents=True, exist_ok=True)
+        if dist_torch.is_initialized():
+            my_rank = dist_torch.get_rank()
+            world_size = dist_torch.get_world_size()
+        else:
+            my_rank = 0
+            world_size = 1
+        for s in sec_list:
+            fid_m = fid_loss_fn[s]
+            state = {
+                "real_sum":        fid_m.real_sum.detach().to("cpu", torch.float64),
+                "real_cov_sum":    fid_m.real_cov_sum.detach().to("cpu", torch.float64),
+                "fake_sum":        fid_m.fake_sum.detach().to("cpu", torch.float64),
+                "fake_cov_sum":    fid_m.fake_cov_sum.detach().to("cpu", torch.float64),
+                "num_real_images": torch.tensor(int(fid_m.num_real_images.item()), dtype=torch.int64),
+                "num_fake_images": torch.tensor(int(fid_m.num_fake_images.item()), dtype=torch.int64),
+            }
+            out_path = tmp_dir / f"fid_sec{s}_rank{my_rank}.pt"
+            torch.save(state, out_path)
+        if dist_torch.is_initialized():
+            dist_torch.barrier()
+        if (not dist_torch.is_initialized()) or my_rank == 0:
+            for s in sec_list:
+                agg = {
+                    "real_sum": torch.zeros(feature_dim, dtype=torch.float64),
+                    "real_cov_sum": torch.zeros((feature_dim, feature_dim), dtype=torch.float64),
+                    "fake_sum": torch.zeros(feature_dim, dtype=torch.float64),
+                    "fake_cov_sum": torch.zeros((feature_dim, feature_dim), dtype=torch.float64),
+                    "num_real_images": torch.tensor(0, dtype=torch.int64),
+                    "num_fake_images": torch.tensor(0, dtype=torch.int64),
+                }
+                for r in range(world_size):
+                    p = tmp_dir / f"fid_sec{s}_rank{r}.pt"
+                    if not p.exists():
+                        continue
+                    st = torch.load(p, map_location="cpu")
+                    agg["real_sum"]        += st["real_sum"]
+                    agg["real_cov_sum"]    += st["real_cov_sum"]
+                    agg["fake_sum"]        += st["fake_sum"]
+                    agg["fake_cov_sum"]    += st["fake_cov_sum"]
+                    agg["num_real_images"] += st["num_real_images"]
+                    agg["num_fake_images"] += st["num_fake_images"]
+                fid_m = fid_loss_fn[s]
+                fid_m.real_sum        = agg["real_sum"].to(fid_m.device, fid_m.real_sum.dtype)
+                fid_m.real_cov_sum    = agg["real_cov_sum"].to(fid_m.device, fid_m.real_cov_sum.dtype)
+                fid_m.fake_sum        = agg["fake_sum"].to(fid_m.device, fid_m.fake_sum.dtype)
+                fid_m.fake_cov_sum    = agg["fake_cov_sum"].to(fid_m.device, fid_m.fake_cov_sum.dtype)
+                fid_m.num_real_images = torch.tensor(
+                    int(agg["num_real_images"].item()), device=fid_m.device, dtype=fid_m.num_real_images.dtype
+                )
+                fid_m.num_fake_images = torch.tensor(
+                    int(agg["num_fake_images"].item()), device=fid_m.device, dtype=fid_m.num_fake_images.dtype
+                )
+                try:
+                    val = float(fid_m.compute().item())
+                    metric_logger.meters[f'{dataset_name}_{eval_name}_fid_{s}'].update(val, n=1)
+                except Exception as e:
+                    print(f"[WARN] FID compute failed at sec={s}: {e}")
+            for s in sec_list:
+                for r in range(world_size):
+                    p = tmp_dir / f"fid_sec{s}_rank{r}.pt"
+                    try:
+                        if p.exists():
+                            p.unlink()
+                    except Exception:
+                        pass
+            try:
+                tmp_dir.rmdir()
+            except Exception:
+                pass
+        if dist_torch.is_initialized():
+            dist_torch.barrier()
+    if 'a' in modals and len(fad_streams) > 0:
+        for sec in secs:
+            try:
+                if stereo_mode:
+                    fad_L, fad_R, fad_avg = fad_streams[sec].compute()
+                    metric_logger.meters[f'{dataset_name}_{eval_name}_fadL_{sec}'].update(fad_L, n=1)
+                    metric_logger.meters[f'{dataset_name}_{eval_name}_fadR_{sec}'].update(fad_R, n=1)
+                    metric_logger.meters[f'{dataset_name}_{eval_name}_fad_{sec}'].update(fad_avg, n=1)
+                else:
+                    fad_val = float(fad_streams[sec].compute())
+                    metric_logger.meters[f'{dataset_name}_{eval_name}_fad_{sec}'].update(fad_val, n=1)
+            except Exception as e:
+                if rank == 0:
+                    print(f"[WARN] FAD compute failed at sec={sec}: {e}")
+                continue
+# -----------------------------
+# Save
+# -----------------------------
+def save_metric_to_disk(metric_logger, log_p, rank):
+    if dist_torch.is_initialized():
+        metric_logger.synchronize_between_processes()
+    if rank == 0:
+        log_stats = {k: float(meter.global_avg) for k, meter in metric_logger.meters.items()}
+        os.makedirs(os.path.dirname(log_p), exist_ok=True)
+        with open(log_p, 'w') as json_file:
+            json.dump(log_stats, json_file, indent=4)
+        print(f"[OK] Metrics saved to: {log_p}")
+# -----------------------------
+# Main
+# -----------------------------
+def main(args):
+    rank, world_size, local_rank = setup_distributed()
+    device = f"cuda:{local_rank}" if world_size > 1 else ("cuda" if torch.cuda.is_available() else "cpu")
+    torch.backends.cudnn.benchmark = True
+    dataset_name = args.dataset
+    secs = np.array([i for i in range(1, 17)], dtype=int)
+    # vision metrics (will only be used if 'v' in modals)
+    lpips_loss_fn = get_loss_fn('lpips', secs, device)
+    dreamsim_loss_fn = get_loss_fn('dreamsim', secs, device)
+    fid_metrics_vision = get_loss_fn('fid', secs, device)
+    try:
+        metric_logger = dist.MetricLogger(delimiter="  ")
+        if rank == 0:
+            print(f"Evaluating {args.eval_name} {dataset_name} | modals = {args.modals}")
+        time_loss_fns = (lpips_loss_fn, dreamsim_loss_fn, fid_metrics_vision)
+        with torch.no_grad():
+            evaluate(
+                args=args,
+                dataset_name=dataset_name,
+                eval_type=args.eval_name,
+                metric_logger=metric_logger,
+                loss_fns=time_loss_fns,
+                gt_dir=args.gt_dir,
+                exp_dir=args.exp_dir,
+                secs=secs,
+                device=device,
+                rank=rank,
+                world_size=world_size,
+                modals=args.modals
+            )
+        output_fn = os.path.join(args.exp_dir, f'{dataset_name}_{args.eval_name}.json')
+        save_metric_to_disk(metric_logger, output_fn, rank)
+    except Exception as e:
+        if rank == 0:
+            print(e)
+    finally:
+        if dist_torch.is_initialized():
+            dist_torch.barrier()
+            dist_torch.destroy_process_group()
+# -----------------------------
+# CLI
+# -----------------------------
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(allow_abbrev=False)
+    parser.add_argument("--batch_size", type=int, default=64, help="batch size")
+    parser.add_argument("--gt_dir", type=str, required=True, help="gt directory")
+    parser.add_argument("--exp_dir", type=str, required=True, help="experiment directory (also save json here)")
+    parser.add_argument("--eval_name", type=str, default='time', choices=['time', 'rollout'], help="eval type")
+    parser.add_argument("--dataset", type=str, required=True, help="dataset name (for metric keys & json name)")
+    parser.add_argument("--modals", type=str, default="av", choices=["a", "v", "av"],
+                        help="a=audio only (wav), v= image only (png), av=both")
+    # FAD options
+    parser.add_argument("--fad_model", type=str, default="vggish",
+                        choices=["vggish", "pann", "clap", "encodec"],
+                        help="embedding model for FAD")
+    parser.add_argument("--fad_sr", type=int, default=16000,
+                        help="sampling rate for FAD")
+    # Stereo VGGish FAD options
+    parser.add_argument("--mono", action="store_true",
+                        help="default as stereo, add --mono to mono")
+    parser.add_argument("--fad_pad_sec", type=float, default=1.0,
+                        help="pad the input of VGGish to x seconds")
+    args = parser.parse_args()
+    main(args)

inference_avwm.py ADDED Viewed

	@@ -0,0 +1,498 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+from distributed import init_distributed
+import torch
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+import yaml
+import argparse
+import os
+import numpy as np
+from diffusion import create_diffusion
+from diffusers.models import AutoencoderKL
+import misc
+import distributed as dist
+from models import AVCDiT_models
+from datasets import EvalDataset
+from PIL import Image
+from soundstream import SoundStream
+import torchaudio
+from skimage.measure import block_reduce
+import matplotlib.pyplot as plt
+import librosa
+import time
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+from collections import defaultdict
+import json
+def save_image(output_file, img, unnormalize_img):
+    img = img.detach().cpu()
+    if unnormalize_img:
+        img = misc.unnormalize(img)
+    img = img * 255
+    img = img.byte()
+    image = Image.fromarray(img.permute(1, 2, 0).numpy(), mode='RGB')
+    image.save(output_file)
+def save_audio(output_file, audio_tensor, sample_rate):
+    audio_tensor = audio_tensor.detach().cpu()
+    if audio_tensor.ndim == 1:
+        audio_tensor = audio_tensor.unsqueeze(0)
+    torchaudio.save(output_file, audio_tensor.to(torch.float32), sample_rate)
+def get_dataset_eval(config, dataset_name, eval_type, predefined_index=True):
+    data_config = config["eval_datasets"][dataset_name]
+    if predefined_index:
+        predefined_index = f"data_splits/{dataset_name}/test/{eval_type}.pkl"
+    else:
+        predefined_index=None
+    dataset = EvalDataset(
+                data_folder=data_config["data_folder"],
+                data_split_folder=data_config["test"],
+                dataset_name=dataset_name,
+                image_size=config["image_size"],
+                min_dist_cat=config["eval_distance"]["eval_min_dist_cat"],
+                max_dist_cat=config["eval_distance"]["eval_max_dist_cat"],
+                len_traj_pred=config["eval_len_traj_pred"],
+                traj_stride=config["traj_stride"],
+                context_size=config["eval_context_size"],
+                normalize=config["normalize"],
+                transform=misc.transform,
+                goals_per_obs=4,
+                predefined_index=predefined_index,
+                traj_names='traj_names.txt'
+            )
+    return dataset
+@torch.no_grad()
+def model_forward_wrapper_v(all_models, curr_obs, curr_delta, num_timesteps, latent_size, device, num_cond, num_goals=1, rel_t=None, progress=False):
+    model, diffusion, vae = all_models
+    x = curr_obs.to(device)
+    y = curr_delta.to(device)
+    with torch.amp.autocast('cuda', enabled=True, dtype=torch.bfloat16):
+        B, T = x.shape[:2]
+        if rel_t is None:
+            rel_t = (torch.ones(B)* (1. / 128.)).to(device)
+            rel_t *= num_timesteps
+        x = x.flatten(0,1)
+        x = vae.encode(x).latent_dist.sample().mul_(0.18215).unflatten(0, (B, T))
+        x_cond = x[:, :num_cond].unsqueeze(1).expand(B, num_goals, num_cond, x.shape[2], x.shape[3], x.shape[4]).flatten(0, 1)
+        z = torch.randn(B*num_goals, 4, latent_size, latent_size, device=device)
+        y = y.flatten(0, 1)
+        model_kwargs = dict(y=y, x_cond=x_cond, rel_t=rel_t)
+        samples = diffusion.p_sample_loop(
+                model.forward, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=progress, device=device
+        )
+        samples = vae.decode(samples / 0.18215).sample
+        return torch.clip(samples, -1., 1.)
+@torch.no_grad()
+def model_forward_wrapper_a(all_models, curr_obs, curr_delta, num_timesteps, latent_size, device, num_cond, num_goals=1, rel_t=None, progress=False):
+    model, diffusion, sstream = all_models
+    x = curr_obs.to(device)
+    y = curr_delta.to(device)
+    with torch.amp.autocast('cuda', enabled=True, dtype=torch.bfloat16):
+        B, T = x.shape[:2]
+        if rel_t is None:
+            rel_t = (torch.ones(B)* (1. / 128.)).to(device)
+            rel_t *= num_timesteps
+        x = x.flatten(0,1)
+        x = sstream.encoder(x).unflatten(0, (B, T))
+        x_cond = x[:, :num_cond].unsqueeze(1).expand(B, num_goals, num_cond, x.shape[2], x.shape[3]).flatten(0, 1)
+        z = torch.randn(B*num_goals, 16, 181, device=device)
+        y = y.flatten(0, 1)
+        model_kwargs = dict(y=y, x_cond=x_cond, rel_t=rel_t)
+        samples = diffusion.p_sample_loop(
+                model.forward, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=progress, device=device
+        )
+        # REWARD TOKEN
+        patch_tok  = samples[..., -1:]                        # [N, 64, 1]
+        diff_pred  = patch_tok.mean(dim=1, keepdim=True)      # [N, 1]
+        samples = samples[..., :-1]
+        # AUDIO TOKENS
+        quantized, _, _ = sstream.quantizer(samples.permute(0, 2, 1))  # [1, T', D]
+        samples = sstream.decoder(quantized.permute(0, 2, 1))
+        return samples, diff_pred
+@torch.no_grad()
+def model_forward_wrapper_av(all_models, curr_obs, curr_delta, num_timesteps, latent_size, device, num_cond, num_goals=1, rel_t=None, progress=False):
+    model, diffusion, vae, sstream = all_models
+    x_v, x_a = curr_obs
+    x_v = x_v.to(device)
+    x_a = x_a.to(device)
+    y = curr_delta.to(device)
+    with torch.amp.autocast('cuda', enabled=True, dtype=torch.bfloat16):
+        B, T_v = x_v.shape[:2]
+        B, T_a = x_a.shape[:2]
+        if rel_t is None:
+            rel_t = (torch.ones(B)* (1. / 128.)).to(device)
+            rel_t *= num_timesteps
+        x_v = x_v.flatten(0,1)
+        x_a = x_a.flatten(0,1)
+        x_v = vae.encode(x_v).latent_dist.sample().mul_(0.18215).unflatten(0, (B, T_v))
+        x_a = sstream.encoder(x_a).unflatten(0, (B, T_a))
+        x_v_cond = x_v[:, :num_cond].unsqueeze(1).expand(B, num_goals, num_cond, x_v.shape[2], x_v.shape[3], x_v.shape[4]).flatten(0, 1)
+        x_a_cond = x_a[:, :num_cond].unsqueeze(1).expand(B, num_goals, num_cond, x_a.shape[2], x_a.shape[3]).flatten(0, 1)
+        z_v = torch.randn(B*num_goals, 4, latent_size, latent_size, device=device)
+        z_a = torch.randn(B*num_goals, 16, 181, device=device) #TODO
+        y = y.flatten(0, 1)
+        model_kwargs = dict(y=y, x_v_cond=x_v_cond, x_a_cond=x_a_cond, rel_t=rel_t)
+        samples_v, samples_a = diffusion.p_sample_loop(
+                model.forward, z_v.shape, z_a.shape, z_v, z_a, clip_denoised=False, model_kwargs=model_kwargs, progress=progress, device=device
+        )
+        patch_tok  = samples_a[..., -1:]                        # [N, 16, 1]
+        diff_pred  = patch_tok.mean(dim=1, keepdim=True)      # [N, 1]
+        samples_a = samples_a[..., :-1]
+        samples_v = vae.decode(samples_v / 0.18215).sample
+        quantized, _, _ = sstream.quantizer(samples_a.permute(0, 2, 1))  # [1, T', D]
+        samples_a = sstream.decoder(quantized.permute(0, 2, 1))
+        return torch.clip(samples_v, -1., 1.), samples_a, diff_pred
+def generate_rollout(args, output_dir, rollout_frames, idxs, all_models, obs_av, gt_av, diffs_seq, delta, num_cond, device):
+    (obs_image, obs_audio, orig_obs_audio)=obs_av
+    (gt_image, gt_audio, orig_gt_audio)=gt_av
+    gt_image = gt_image[:,:rollout_frames]
+    gt_audio = gt_audio[:,:rollout_frames]
+    curr_v = obs_image.to(device)
+    curr_a = obs_audio.to(device)
+    down_resampler = torchaudio.transforms.Resample(orig_freq=48000, new_freq=16000, lowpass_filter_width=64).to(device, dtype=torch.bfloat16)
+    episode_records = defaultdict(list)
+    value_key = "denorm_gt" if args.gt else "denorm_pred"
+    for i in range(gt_image.shape[1]):
+        curr_delta = delta[:, i:i+1].to(device)
+        x_gt_pixels = gt_image[:, i].to(device)
+        x_gt_audios_orig = orig_gt_audio[:, i].to(device)
+        if args.gt:
+            visualize_preds(output_dir, idxs, i+1, x_gt_pixels, x_gt_audios_orig, 16000)
+            denorm_gt_vals = denorm_from_tensor(diffs_seq[:, i:i+1, :])  # [B]
+            idxs_1d = idxs.detach().view(-1).cpu().numpy()
+            for b, sample_idx in enumerate(idxs_1d):
+                episode_records[int(sample_idx)].append({"sec": int(i+1), "value": float(denorm_gt_vals[b])})
+        else:
+            diff_gt = diffs_seq[:, i:i+1, :].unsqueeze(1).to(device)
+            x_pred_pixels, x_pred_audios, diff_pred = model_forward_wrapper_av(all_models, (curr_v, curr_a), curr_delta, num_timesteps=1, latent_size=args.latent_size, device=device, num_cond=num_cond, num_goals=1)
+            x_pred_audios_orig = down_resampler(x_pred_audios)
+            curr_v = torch.cat((curr_v, x_pred_pixels.unsqueeze(1)), dim=1) # append current prediction
+            curr_v = curr_v[:, 1:] # remove first observation
+            curr_a = torch.cat((curr_a, x_pred_audios.unsqueeze(1)), dim=1) # append current prediction
+            curr_a = curr_a[:, 1:] # remove first observation
+            denorm_pred_vals = denorm_from_tensor(diff_pred)  # [B]
+            denorm_gt_vals   = denorm_from_tensor(diff_gt)    # [B]
+            visualize_preds(output_dir, idxs, i+1, x_pred_pixels, x_pred_audios_orig, 16000)
+            visualize_compare(output_dir, idxs, i+1,
+                              x_pred_pixels, x_pred_audios_orig,
+                              x_gt_pixels,   x_gt_audios_orig,
+                              denorm_pred_vals=denorm_pred_vals,
+                              denorm_gt_vals=denorm_gt_vals)
+            idxs_1d = idxs.detach().view(-1).cpu().numpy()
+            for b, sample_idx in enumerate(idxs_1d):
+                episode_records[int(sample_idx)].append({"sec": int(i+1), "value": float(denorm_pred_vals[b])})
+    for sample_idx, rows in episode_records.items():
+        rows = sorted(rows, key=lambda r: r["sec"])
+        sample_folder = os.path.join(output_dir, f"id_{sample_idx}")
+        os.makedirs(sample_folder, exist_ok=True)
+        out_json = os.path.join(sample_folder, "distance.json")
+        compact = [{ "sec": r["sec"], value_key: r["value"] } for r in rows]
+        with open(out_json, "w") as f:
+            json.dump(compact, f, indent=2)
+def generate_time(args, output_dir, idxs, all_models, obs_av, gt_av, diffs_seq, delta, secs, num_cond, device):
+    (obs_image, obs_audio, _)=obs_av
+    (gt_image, _, orig_gt_audio)=gt_av
+    down_resampler = torchaudio.transforms.Resample(orig_freq=48000, new_freq=16000, lowpass_filter_width=64).to(device, dtype=torch.bfloat16)
+    episode_records = defaultdict(list)  # {sample_idx: [{"sec": int, "value": float}, ...]}
+    value_key = "denorm_gt" if args.gt else "denorm_pred"
+    for sec in secs:
+        curr_delta = delta[:, :sec].sum(dim=1, keepdim=True)
+        x_gt_pixels = gt_image[:, sec-1].to(device)
+        x_gt_audios_orig = orig_gt_audio[:, sec-1].to(device)
+        if args.gt:
+            denorm_gt_vals = denorm_from_tensor(diffs_seq[:, :sec, :].sum(dim=1, keepdim=True))  # [B]
+            visualize_preds(output_dir, idxs, sec, x_gt_pixels, x_gt_audios_orig, 16000)
+            idxs_1d = idxs.detach().view(-1).cpu().numpy()
+            for b, sample_idx in enumerate(idxs_1d):
+                episode_records[int(sample_idx)].append({"sec": int(sec), "value": float(denorm_gt_vals[b])})
+        else:
+            diff_gt = diffs_seq[:, :sec, :].sum(dim=1, keepdim=True).to(device)
+            print(obs_image.shape, obs_audio.shape, curr_delta.shape, obs_image.dtype, obs_audio.dtype, curr_delta.dtype)
+            x_pred_pixels, x_pred_audios, diff_pred = model_forward_wrapper_av(all_models, (obs_image, obs_audio) , curr_delta, sec, args.latent_size, num_cond=num_cond, num_goals=1, device=device)
+            x_pred_audios_orig = down_resampler(x_pred_audios)
+            denorm_pred_vals = denorm_from_tensor(diff_pred)       # [B]
+            denorm_gt_vals   = denorm_from_tensor(diff_gt)         # [B]
+            visualize_preds(output_dir, idxs, sec, x_pred_pixels, x_pred_audios_orig, 16000)
+            visualize_compare(output_dir, idxs, sec,
+                              x_pred_pixels, x_pred_audios_orig,
+                              x_gt_pixels,   x_gt_audios_orig,
+                              denorm_pred_vals=denorm_pred_vals,
+                              denorm_gt_vals=denorm_gt_vals)
+            idxs_1d = idxs.detach().view(-1).cpu().numpy()
+            for b, sample_idx in enumerate(idxs_1d):
+                episode_records[int(sample_idx)].append({"sec": int(sec), "value": float(denorm_pred_vals[b])})
+    for sample_idx, rows in episode_records.items():
+        rows = sorted(rows, key=lambda r: r["sec"])
+        sample_folder = os.path.join(output_dir, f"id_{sample_idx}")
+        os.makedirs(sample_folder, exist_ok=True)
+        out_json = os.path.join(sample_folder, "distance.json")
+        compact = [{ "sec": r["sec"], value_key: r["value"] } for r in rows]
+        with open(out_json, "w") as f:
+            json.dump(compact, f, indent=2)
+def visualize_preds(output_dir, idxs, sec, x_pred_pixels, x_pred_audios, sample_rate):
+    idxs_1d = idxs.detach().view(-1)
+    for batch_idx, sample_idx in enumerate(idxs_1d):
+        sample_idx = int(sample_idx.item())
+        sample_folder = os.path.join(output_dir, f'id_{sample_idx}')
+        os.makedirs(sample_folder, exist_ok=True)
+        image_file = os.path.join(sample_folder, f'{sec}.png')
+        save_image(image_file, x_pred_pixels[batch_idx], True)
+        audio_file = os.path.join(sample_folder, f'{sec}.wav')
+        save_audio(audio_file, x_pred_audios[batch_idx], sample_rate)
+def _compute_binaural_spectrogram_np(audio_2ch: np.ndarray):
+    def _stft_abs(signal):
+        n_fft = 512
+        hop_length = 160
+        win_length = 400
+        stft = np.abs(librosa.stft(signal, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
+        stft = block_reduce(stft, block_size=(4, 4), func=np.mean)
+        return stft
+    L = np.log1p(_stft_abs(audio_2ch[0]))
+    R = np.log1p(_stft_abs(audio_2ch[1]))
+    spec = np.stack([L, R], axis=-1)  # (F,T,2)
+    return spec
+def denorm_from_tensor(t: torch.Tensor, min_v=-20.0, max_v=20.0, scale=0.15) -> torch.Tensor:
+    x = t.detach().float().view(t.shape[0], -1)[:, 0]
+    n01 = (x + 1.0) / 2.0
+    raw = n01 * (max_v - min_v) + min_v
+    return raw * scale
+def visualize_compare(output_dir, idxs, sec,
+                    x_pred_pixels, x_pred_audios_orig,
+                    x_gt_pixels,   x_gt_audios_orig,
+                    denorm_pred_vals,
+                    denorm_gt_vals):
+    idxs_np = idxs.detach().view(-1).cpu().numpy()
+    B = x_pred_pixels.shape[0]
+    assert x_gt_pixels.shape[0] == B and x_pred_audios_orig.shape[0] == B and x_gt_audios_orig.shape[0] == B
+    for b in range(B):
+        sample_idx = int(idxs_np[b])
+        sample_folder = os.path.join(output_dir, f'id_{sample_idx}')
+        os.makedirs(sample_folder, exist_ok=True)
+        out_path = os.path.join(sample_folder, f'compare_{sec}.png')
+        def _tensor_to_display_img(x: torch.Tensor):
+            x = x.detach().cpu()
+            x = misc.unnormalize(x)
+            x = (x * 255.0).round().clamp(0, 255)
+            x = x.to(torch.uint8).permute(1, 2, 0)
+            return x.numpy()
+        pred_img = _tensor_to_display_img(x_pred_pixels[b])
+        gt_img   = _tensor_to_display_img(x_gt_pixels[b])
+        pred_aud = x_pred_audios_orig[b].detach().cpu().float().numpy()
+        gt_aud   = x_gt_audios_orig[b].detach().cpu().float().numpy()
+        pred_spec = _compute_binaural_spectrogram_np(pred_aud)
+        gt_spec   = _compute_binaural_spectrogram_np(gt_aud)
+        vmin_L = min(pred_spec[:, :, 0].min(), gt_spec[:, :, 0].min())
+        vmax_L = max(pred_spec[:, :, 0].max(), gt_spec[:, :, 0].max())
+        vmin_R = min(pred_spec[:, :, 1].min(), gt_spec[:, :, 1].min())
+        vmax_R = max(pred_spec[:, :, 1].max(), gt_spec[:, :, 1].max())
+        dn_pred = float(denorm_pred_vals[b]) if denorm_pred_vals is not None else 0
+        dn_gt   = float(denorm_gt_vals[b])   if denorm_gt_vals   is not None else 0
+        fig, axes = plt.subplots(2, 4, figsize=(14, 6), constrained_layout=True)
+        axes[0, 0].imshow(pred_img); axes[0, 0].set_title('pred image'); axes[0, 0].axis('off')
+        axes[0, 1].imshow(gt_img);   axes[0, 1].set_title('gt image');   axes[0, 1].axis('off')
+        axes[1, 0].axis('off')
+        axes[1, 1].axis('off')
+        im_pred_L = axes[0, 2].imshow(pred_spec[:, :, 0], origin='lower', aspect='auto', vmin=vmin_L, vmax=vmax_L)
+        axes[0, 2].set_title('pred spec (Left)'); axes[0, 2].set_xticks([]); axes[0, 2].set_yticks([])
+        im_gt_L = axes[0, 3].imshow(gt_spec[:, :, 0], origin='lower', aspect='auto', vmin=vmin_L, vmax=vmax_L)
+        axes[0, 3].set_title('gt spec (Left)'); axes[0, 3].set_xticks([]); axes[0, 3].set_yticks([])
+        im_pred_R = axes[1, 2].imshow(pred_spec[:, :, 1], origin='lower', aspect='auto', vmin=vmin_R, vmax=vmax_R)
+        axes[1, 2].set_title('pred spec (Right)'); axes[1, 2].set_xticks([]); axes[1, 2].set_yticks([])
+        im_gt_R = axes[1, 3].imshow(gt_spec[:, :, 1], origin='lower', aspect='auto', vmin=vmin_R, vmax=vmax_R)
+        axes[1, 3].set_title('gt spec (Right)'); axes[1, 3].set_xticks([]); axes[1, 3].set_yticks([])
+        fig.suptitle(
+            f'id={sample_idx}, sec={sec} | denorm(reward_pred)={dn_pred:.4f}, denorm(reward_gt)={dn_gt:.4f}',
+            fontsize=11
+        )
+        plt.savefig(out_path, dpi=180)
+        plt.close(fig)
+@torch.no_grad()
+def main(args):
+    _, _, device, _ = init_distributed()
+    print(args)
+    device = torch.device(device)
+    num_tasks = dist.get_world_size()
+    global_rank = dist.get_rank()
+    exp_eval = args.exp
+    # model & config setup
+    if args.gt:
+        args.save_output_dir = os.path.join(args.output_dir, 'gt')
+    else:
+        exp_name = os.path.basename(exp_eval).split('.')[0]
+        args.save_output_dir = os.path.join(args.output_dir, exp_name)
+    if  args.ckp != '0100000':
+        args.save_output_dir = args.save_output_dir + "_%s"%(args.ckp)
+    os.makedirs(args.save_output_dir, exist_ok=True)
+    with open("config/eval_config.yaml", "r") as f:
+        default_config = yaml.safe_load(f)
+    config = default_config
+    with open(exp_eval, "r") as f:
+        user_config = yaml.safe_load(f)
+    config.update(user_config)
+    eval_len_traj_pred=config["eval_len_traj_pred"]
+    if args.rollout_frames==-1:
+        args.rollout_frames=eval_len_traj_pred
+    assert args.rollout_frames<=eval_len_traj_pred
+    latent_size = config['image_size'] // 8
+    args.latent_size = config['image_size'] // 8
+    num_cond = config['context_size']
+    print("loading")
+    model_lst = (None, None, None, None)
+    if not args.gt:
+        model = AVCDiT_models[config['model']](context_size=num_cond, input_size=latent_size, in_channels=4, mode="av")
+        ckp = torch.load(f'{config["results_dir"]}/{config["run_name"]}/checkpoints/{args.ckp}.pth.tar', map_location='cpu', weights_only=False)
+        print(model.load_state_dict(ckp["ema"], strict=True))
+        model.eval()
+        model.to(device)
+        model = torch.compile(model)
+        diffusion = create_diffusion(str(250), dual=True)
+        vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-ema").to(device)
+        sstream = SoundStream(C=32, D=16, n_q=8, codebook_size=1024).to(device)
+        sstream_path=config["tokenizer_a_path"]
+        sstream_checkpoint = torch.load(sstream_path, map_location=device)
+        sstream.load_state_dict(sstream_checkpoint["model_state"])
+        sstream.eval()
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device], find_unused_parameters=False)
+        model_lst = (model, diffusion, vae, sstream)
+    # Loading Datasets
+    dataset_names = args.datasets.split(',')
+    datasets = {}
+    for dataset_name in dataset_names:
+        dataset_val = get_dataset_eval(config, dataset_name, args.eval_type, predefined_index=False)
+        if len(dataset_val) % num_tasks != 0:
+            print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
+                    'This will slightly alter validation results as extra duplicate entries are added to achieve '
+                    'equal num of samples per-process.')
+        sampler_val = torch.utils.data.DistributedSampler(
+            dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=False)
+        curr_data_loader = torch.utils.data.DataLoader(
+                            dataset_val, sampler=sampler_val,
+                            batch_size=args.batch_size,
+                            num_workers=args.num_workers,
+                            pin_memory=True,
+                            drop_last=False
+                        )
+        datasets[dataset_name] = curr_data_loader
+    print_freq = 1
+    header = 'Evaluation: '
+    metric_logger = dist.MetricLogger(delimiter="  ")
+    for dataset_name in dataset_names:
+        dataset_save_output_dir = os.path.join(args.save_output_dir, dataset_name)
+        os.makedirs(dataset_save_output_dir, exist_ok=True)
+        curr_data_loader = datasets[dataset_name]
+        for data_iter_step, (idxs, obs_image, gt_image, obs_audio, gt_audio, diffs_seq, delta, orig_obs_audio, orig_gt_audio) in enumerate(metric_logger.log_every(curr_data_loader, print_freq, header)):
+            with torch.amp.autocast('cuda', enabled=True, dtype=torch.bfloat16):
+                obs_image = obs_image[:, -num_cond:].to(device)
+                gt_image = gt_image.to(device)
+                obs_audio = obs_audio[:, -num_cond:].to(device)
+                gt_audio = gt_audio.to(device)
+                orig_obs_audio = orig_obs_audio[:, -num_cond:].to(device)
+                orig_gt_audio = orig_gt_audio.to(device)
+                diffs_seq = diffs_seq.to(device)
+                obs_av=(obs_image, obs_audio, orig_obs_audio)
+                gt_av=(gt_image, gt_audio, orig_gt_audio)
+                if args.eval_type == 'rollout':
+                    curr_rollout_output_dir = os.path.join(dataset_save_output_dir, f'rollout_{args.rollout_frames}frames')
+                    os.makedirs(curr_rollout_output_dir, exist_ok=True)
+                    generate_rollout(args, curr_rollout_output_dir, args.rollout_frames, idxs, model_lst, obs_av, gt_av, diffs_seq, delta, num_cond, device)
+                elif args.eval_type == 'time':
+                    if args.time_secs != '':
+                        secs = np.array([int(sec) for sec in args.time_secs.split(',')])
+                    else:
+                        secs = np.array([int(sec) for sec in range(1,args.rollout_frames+1)])
+                    curr_time_output_dir = os.path.join(dataset_save_output_dir, 'time')
+                    os.makedirs(curr_time_output_dir, exist_ok=True)
+                    generate_time(args, curr_time_output_dir, idxs, model_lst, obs_av, gt_av, diffs_seq, delta, secs, num_cond, device)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--output_dir", type=str, default=None, help="output directory")
+    parser.add_argument("--exp", type=str, default=None, help="experiment name")
+    parser.add_argument("--ckp", type=str, default='0100000')
+    parser.add_argument("--num_sec_eval", type=int, default=5)
+    parser.add_argument("--input_fps", type=int, default=4)
+    parser.add_argument("--datasets", type=str, default=None, help="dataset name")
+    parser.add_argument("--num_workers", type=int, default=8, help="num workers")
+    parser.add_argument("--batch_size", type=int, default=16, help="batch size")
+    parser.add_argument("--eval_type", type=str, default=None, help="type of evaluation has to be either 'time' or 'rollout'")
+    # Rollout Evaluation Args
+    parser.add_argument("--time_secs", type=str, default='', help="") #'1,2,3,4'
+    parser.add_argument("--rollout_frames", type=int, default=-1, help="")
+    parser.add_argument("--gt", type=int, default=0, help="set to 1 to produce ground truth evaluation set")
+    args = parser.parse_args()
+    main(args)

mel_scale.py ADDED Viewed

	@@ -0,0 +1,221 @@

+import torch
+from torch import Tensor
+from typing import Optional
+import math
+import warnings
+class MelScale(torch.nn.Module):
+    r"""Turn a normal STFT into a mel frequency STFT, using a conversion
+    matrix.  This uses triangular filter banks.
+    User can control which device the filter bank (`fb`) is (e.g. fb.to(spec_f.device)).
+    Args:
+        n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
+        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
+        f_min (float, optional): Minimum frequency. (Default: ``0.``)
+        f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
+        n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
+        norm (str or None, optional): If 'slaney', divide the triangular mel weights by the width of the mel band
+            (area normalization). (Default: ``None``)
+        mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
+    See also:
+        :py:func:`torchaudio.functional.melscale_fbanks` - The function used to
+        generate the filter banks.
+    """
+    __constants__ = ['n_mels', 'sample_rate', 'f_min', 'f_max']
+    def __init__(self,
+                 n_mels: int = 128,
+                 sample_rate: int = 16000,
+                 f_min: float = 0.,
+                 f_max: Optional[float] = None,
+                 n_stft: int = 201,
+                 norm: Optional[str] = None,
+                 mel_scale: str = "htk") -> None:
+        super(MelScale, self).__init__()
+        self.n_mels = n_mels
+        self.sample_rate = sample_rate
+        self.f_max = f_max if f_max is not None else float(sample_rate // 2)
+        self.f_min = f_min
+        self.norm = norm
+        self.mel_scale = mel_scale
+        assert f_min <= self.f_max, 'Require f_min: {} < f_max: {}'.format(f_min, self.f_max)
+        fb = melscale_fbanks(
+            n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate, self.norm,
+            self.mel_scale)
+        self.register_buffer('fb', fb)
+    def forward(self, specgram: Tensor) -> Tensor:
+        r"""
+        Args:
+            specgram (Tensor): A spectrogram STFT of dimension (..., freq, time).
+        Returns:
+            Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time).
+        """
+        # (..., time, freq) dot (freq, n_mels) -> (..., n_mels, time)
+        mel_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(-1, -2)
+        return mel_specgram
+def _hz_to_mel(freq: float, mel_scale: str = "htk") -> float:
+    r"""Convert Hz to Mels.
+    Args:
+        freqs (float): Frequencies in Hz
+        mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
+    Returns:
+        mels (float): Frequency in Mels
+    """
+    if mel_scale not in ['slaney', 'htk']:
+        raise ValueError('mel_scale should be one of "htk" or "slaney".')
+    if mel_scale == "htk":
+        return 2595.0 * math.log10(1.0 + (freq / 700.0))
+    # Fill in the linear part
+    f_min = 0.0
+    f_sp = 200.0 / 3
+    mels = (freq - f_min) / f_sp
+    # Fill in the log-scale part
+    min_log_hz = 1000.0
+    min_log_mel = (min_log_hz - f_min) / f_sp
+    logstep = math.log(6.4) / 27.0
+    if freq >= min_log_hz:
+        mels = min_log_mel + math.log(freq / min_log_hz) / logstep
+    return mels
+def _mel_to_hz(mels: Tensor, mel_scale: str = "htk") -> Tensor:
+    """Convert mel bin numbers to frequencies.
+    Args:
+        mels (Tensor): Mel frequencies
+        mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
+    Returns:
+        freqs (Tensor): Mels converted in Hz
+    """
+    if mel_scale not in ['slaney', 'htk']:
+        raise ValueError('mel_scale should be one of "htk" or "slaney".')
+    if mel_scale == "htk":
+        return 700.0 * (10.0**(mels / 2595.0) - 1.0)
+    # Fill in the linear scale
+    f_min = 0.0
+    f_sp = 200.0 / 3
+    freqs = f_min + f_sp * mels
+    # And now the nonlinear scale
+    min_log_hz = 1000.0
+    min_log_mel = (min_log_hz - f_min) / f_sp
+    logstep = math.log(6.4) / 27.0
+    log_t = (mels >= min_log_mel)
+    freqs[log_t] = min_log_hz * torch.exp(logstep * (mels[log_t] - min_log_mel))
+    return freqs
+def _create_triangular_filterbank(
+        all_freqs: Tensor,
+        f_pts: Tensor,
+) -> Tensor:
+    """Create a triangular filter bank.
+    Args:
+        all_freqs (Tensor): STFT freq points of size (`n_freqs`).
+        f_pts (Tensor): Filter mid points of size (`n_filter`).
+    Returns:
+        fb (Tensor): The filter bank of size (`n_freqs`, `n_filter`).
+    """
+    # Adopted from Librosa
+    # calculate the difference between each filter mid point and each stft freq point in hertz
+    f_diff = f_pts[1:] - f_pts[:-1]  # (n_filter + 1)
+    slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1)  # (n_freqs, n_filter + 2)
+    # create overlapping triangles
+    zero = torch.zeros(1)
+    down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1]  # (n_freqs, n_filter)
+    up_slopes = slopes[:, 2:] / f_diff[1:]  # (n_freqs, n_filter)
+    fb = torch.max(zero, torch.min(down_slopes, up_slopes))
+    return fb
+def melscale_fbanks(
+        n_freqs: int,
+        f_min: float,
+        f_max: float,
+        n_mels: int,
+        sample_rate: int,
+        norm: Optional[str] = None,
+        mel_scale: str = "htk",
+) -> Tensor:
+    r"""Create a frequency bin conversion matrix.
+    Note:
+        For the sake of the numerical compatibility with librosa, not all the coefficients
+        in the resulting filter bank has magnitude of 1.
+        .. image:: https://download.pytorch.org/torchaudio/doc-assets/mel_fbanks.png
+           :alt: Visualization of generated filter bank
+    Args:
+        n_freqs (int): Number of frequencies to highlight/apply
+        f_min (float): Minimum frequency (Hz)
+        f_max (float): Maximum frequency (Hz)
+        n_mels (int): Number of mel filterbanks
+        sample_rate (int): Sample rate of the audio waveform
+        norm (str or None, optional): If 'slaney', divide the triangular mel weights by the width of the mel band
+            (area normalization). (Default: ``None``)
+        mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)
+    Returns:
+        Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``)
+        meaning number of frequencies to highlight/apply to x the number of filterbanks.
+        Each column is a filterbank so that assuming there is a matrix A of
+        size (..., ``n_freqs``), the applied result would be
+        ``A * melscale_fbanks(A.size(-1), ...)``.
+    """
+    if norm is not None and norm != "slaney":
+        raise ValueError("norm must be one of None or 'slaney'")
+    # freq bins
+    all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)
+    # calculate mel freq bins
+    m_min = _hz_to_mel(f_min, mel_scale=mel_scale)
+    m_max = _hz_to_mel(f_max, mel_scale=mel_scale)
+    m_pts = torch.linspace(m_min, m_max, n_mels + 2)
+    f_pts = _mel_to_hz(m_pts, mel_scale=mel_scale)
+    # create filterbank
+    fb = _create_triangular_filterbank(all_freqs, f_pts)
+    if norm is not None and norm == "slaney":
+        # Slaney-style mel is scaled to be approx constant energy per channel
+        enorm = 2.0 / (f_pts[2:n_mels + 2] - f_pts[:n_mels])
+        fb *= enorm.unsqueeze(0)
+    if (fb.max(dim=0).values == 0.).any():
+        warnings.warn(
+            "At least one mel filterbank has all zero values. "
+            f"The value for `n_mels` ({n_mels}) may be set too high. "
+            f"Or, the value for `n_freqs` ({n_freqs}) may be set too low."
+        )
+    return fb

merge_experts.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import torch
+import yaml
+import argparse
+from models import AVCDiT_models
+def add_exact_keys(mapping, keys):
+    for k in keys:
+        mapping[k] = k
+def add_mlp_block_keys(mapping, mlp_name, num_blocks):
+    for i in range(num_blocks):
+        for fc in ["fc1", "fc2"]:
+            for param in ["weight", "bias"]:
+                k = f"blocks.{i}.{mlp_name}.{fc}.{param}"
+                mapping[k] = k
+def load_from_two_checkpoints(model, ckpt1_path, ckpt2_path, map1=None, map2=None, device='cuda'):
+    ckpt1 = torch.load(ckpt1_path, map_location=device, weights_only=False)
+    ckpt2 = torch.load(ckpt2_path, map_location=device, weights_only=False)
+    state1 = {k.replace('_orig_mod.', ''): v for k, v in ckpt1["ema"].items()}
+    state2 = {k.replace('_orig_mod.', ''): v for k, v in ckpt2["ema"].items()}
+    model_state = model.state_dict()
+    new_state = {}
+    source_info = {}  # key: model param name, value: ckpt source name
+    if map1:
+        for k_model, k_ckpt in map1.items():
+            if (
+                k_ckpt in state1
+                and k_model in model_state
+                and state1[k_ckpt].shape == model_state[k_model].shape
+            ):
+                new_state[k_model] = state1[k_ckpt]
+                source_info[k_model] = "ckpt1"
+    if map2:
+        for k_model, k_ckpt in map2.items():
+            if (
+                k_ckpt in state2
+                and k_model in model_state
+                and state2[k_ckpt].shape == model_state[k_model].shape
+            ):
+                new_state[k_model] = state2[k_ckpt]
+                source_info[k_model] = "ckpt2"
+    for k_model, tensor in model_state.items():
+        if k_model not in new_state:
+            if k_model in state1 and state1[k_model].shape == tensor.shape:
+                new_state[k_model] = state1[k_model]
+                source_info[k_model] = "fallback_ckpt1"
+    model.load_state_dict(new_state, strict=False)
+    print(f"Loaded {len(new_state)} / {len(model_state)} parameters")
+    return new_state
+def main(args):
+    with open(args.config, "r") as f:
+        config = yaml.safe_load(f)
+    model_name = config.get("model", "AVCDiT-B/2")
+    print(f"Using model: {model_name}")
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    model = AVCDiT_models[model_name](
+        context_size=4,
+        input_size=28,
+        in_channels=4,
+        mode="av"
+    ).to(device)
+    depth = len(model.blocks)
+    map1 = {}
+    add_exact_keys(map1, [
+        "pos_embed_v",
+        "x_embedder_v.proj.weight",
+        "x_embedder_v.proj.bias",
+        "final_layer.linear.weight",
+        "final_layer.linear.bias",
+        "final_layer.adaLN_modulation.1.weight",
+        "final_layer.adaLN_modulation.1.bias",
+    ])
+    add_mlp_block_keys(map1, "mlp_v", depth)
+    map2 = {}
+    add_exact_keys(map2, [
+        "pos_embed_a_cond",
+        "pos_embed_a_pred",
+        "x_embedder_a.weight",
+        "x_embedder_a.bias",
+        "final_layer_a.linear.weight",
+        "final_layer_a.linear.bias",
+        "final_layer_a.adaLN_modulation.1.weight",
+        "final_layer_a.adaLN_modulation.1.bias",
+    ])
+    add_mlp_block_keys(map2, "mlp_a", depth)
+    merged_state_dict = load_from_two_checkpoints(
+        model,
+        ckpt1_path=args.v_expert,
+        ckpt2_path=args.a_expert,
+        map1=map1,
+        map2=map2,
+        device=device
+    )
+    torch.save({"ema": merged_state_dict}, args.output)
+    print(f"Merged model saved to {args.output}")
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, required=True)
+    parser.add_argument("--v_expert", type=str, required=True)
+    parser.add_argument("--a_expert", type=str, required=True)
+    parser.add_argument("--output", type=str, default="experts_merged.pth")
+    args = parser.parse_args()
+    main(args)

misc.py ADDED Viewed

	@@ -0,0 +1,232 @@

+import yaml
+import matplotlib.pyplot as plt
+import torch
+import numpy as np
+import os
+from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
+from PIL import Image
+from torchvision import transforms
+import torchvision.transforms.functional as TF
+# Width / height ratio used as the default target of CenterCropAR.
+IMAGE_ASPECT_RATIO = (4 / 3)  # all images are centered cropped to a 4:3 aspect ratio in training
+# Loaded once at import time; the relative path is resolved against the current working directory.
+# plot_images_and_actions reads data_config[<dataset_name>]['metric_waypoint_spacing'] from it.
+with open("config/data_config.yaml", "r") as f:
+    data_config = yaml.safe_load(f)
+def get_action_torch(diffusion_output, action_stats):
+    ndeltas = diffusion_output
+    ndeltas = ndeltas.reshape(ndeltas.shape[0], -1, 2)
+    ndeltas = unnormalize_data(ndeltas, action_stats)
+    actions = torch.cumsum(ndeltas, dim=1)
+    return actions.to(ndeltas)
+def log_viz_single(dataset_name, obs_image, goal_image, preds, deltas, loss, min_idx, actions, action_stats, plan_iter=0, output_dir='plot.png'):
+    '''
+    Visualize a single instance
+    actions is gt actions
+    '''
+    viz_obs_image = unnormalize(obs_image.detach().cpu())[-1] # take last img
+    viz_goal_image = unnormalize(goal_image.detach().cpu())
+    deltas = deltas.detach().cpu()
+    loss = loss.detach().cpu()
+    actions = actions.detach().cpu()
+    pred_actions = get_action_torch(deltas[:, :, :2], action_stats)
+    plot_array = plot_images_and_actions(dataset_name, viz_obs_image, viz_goal_image, pred_actions, actions, min_idx, loss=loss)
+    plt.imshow(plot_array)
+    plt.axis('off')  # Hide axes for a cleaner image
+    # Save the plot array as a PNG file locally
+    plt.savefig(output_dir, format='png', dpi=300, bbox_inches='tight')
+def plot_images_and_actions(dataset_name, curr_viz_obs_image, curr_viz_goal_image, curr_viz_pred_actions, curr_viz_actions, min_idx, loss):
+    curr_viz_obs_image = curr_viz_obs_image.permute(1, 2, 0).cpu().numpy()
+    curr_viz_goal_image = curr_viz_goal_image.permute(1, 2, 0).cpu().numpy()
+    # scale back to metric space for plotting
+    curr_viz_pred_actions = curr_viz_pred_actions * data_config[dataset_name]['metric_waypoint_spacing']
+    curr_viz_actions = curr_viz_actions * data_config[dataset_name]['metric_waypoint_spacing']
+    # Create the figure with three subplots
+    fig, axs = plt.subplots(1, 3, figsize=(9, 3))
+    # Plot condition image
+    axs[0].imshow(curr_viz_obs_image)
+    axs[0].set_title("Condition Image", fontsize=13)
+    axs[0].axis("off")
+    # Plot goal image
+    axs[1].imshow(curr_viz_goal_image)
+    axs[1].set_title("Goal Image", fontsize=13)
+    axs[1].axis("off")
+    colors = ['red', 'orange', 'cyan']
+    for i in range(1, curr_viz_pred_actions.shape[0]):
+        color = colors[(i - 1) % len(colors)]
+        label = f"Sample {i} Min Loss" if i == min_idx.item() else f"{i}"
+        if i != min_idx.item():
+            axs[2].plot(-curr_viz_pred_actions[i, :, 1], curr_viz_pred_actions[i, :, 0],
+                        color=color, marker="o", markersize=5, label=label)
+            axs[2].text(-curr_viz_pred_actions[i, -1, 1],
+                curr_viz_pred_actions[i, -1, 0],
+                round(loss[i].item(), 3),
+                color='black',
+                fontsize=10,
+                ha='left', va='bottom')  # Adjust position to avoid overlap
+    # Highlight the minimum loss sample
+    axs[2].plot(-curr_viz_pred_actions[min_idx.item(), :, 1], curr_viz_pred_actions[min_idx.item(), :, 0],
+                color='green', marker="o", markersize=5, label=f"{min_idx.item()}")
+    axs[2].text(-curr_viz_pred_actions[min_idx.item(), -1, 1],
+        curr_viz_pred_actions[min_idx.item(), -1, 0],
+        round(loss[min_idx.item()].item(), 3),
+        color='black',
+        fontsize=10,
+        ha='left', va='bottom')  # Adjust position to avoid overlap
+    # Plot ground truth actions
+    axs[2].plot(-curr_viz_actions[:, 1], curr_viz_actions[:, 0], color='blue', marker="o", label="GT")
+    # Set titles and labels with larger font size
+    axs[2].set_title("   ", fontsize=13)
+    axs[2].set_xlabel("X (m)", fontsize=11)
+    axs[2].set_ylabel("Y (m)", fontsize=11)
+    # Set equal aspect ratio and adjust axis limits
+    axs[2].set_aspect('equal', adjustable='box')
+    x_min, x_max = axs[2].get_xlim()
+    y_min, y_max = axs[2].get_ylim()
+    axis_range = max(x_max - x_min, y_max - y_min) / 2
+    x_mid = (x_max + x_min) / 2
+    y_mid = (y_max + y_min) / 2
+    axs[2].set_xlim(x_mid - axis_range, x_mid + axis_range)
+    axs[2].set_ylim(y_mid - axis_range, y_mid + axis_range)
+    axs[2].legend(loc='lower left', fontsize=10, frameon=True, bbox_to_anchor=(0, 0))
+    plt.tight_layout()
+    canvas = FigureCanvas(fig)
+    canvas.draw()
+    plot_array = np.frombuffer(canvas.tostring_rgb(), dtype='uint8')
+    plot_array = plot_array.reshape(canvas.get_width_height()[::-1] + (3,))
+    plt.close(fig)
+    return plot_array
+def normalize_data(data, stats):
+    # nomalize to [0,1]
+    ndata = (data - stats['min']) / (stats['max'] - stats['min'])
+    # normalize to [-1, 1]
+    ndata = ndata * 2 - 1
+    return ndata
+def unnormalize_data(ndata, stats):
+    ndata = (ndata + 1) / 2
+    data = ndata * (stats['max'].to(ndata) - stats['min'].to(ndata)) + stats['min'].to(ndata)
+    return data
+def get_data_path(data_folder: str, f: str, time: int, data_type: str = "image"):
+    data_ext = {
+        "image": ".jpg",
+        "audio": ".wav"
+        # add more data types here
+    }
+    return os.path.join(data_folder, f, f"{str(time)}{data_ext[data_type]}")
+def yaw_rotmat(yaw: float) -> np.ndarray:
+    return np.array(
+        [
+            [np.cos(yaw), -np.sin(yaw), 0.0],
+            [np.sin(yaw), np.cos(yaw), 0.0],
+            [0.0, 0.0, 1.0],
+        ],
+    )
+def angle_difference(theta1, theta2):
+    delta_theta = theta2 - theta1
+    delta_theta = delta_theta - 2 * np.pi * np.floor((delta_theta + np.pi) / (2 * np.pi))
+    return delta_theta
+def get_delta_np(actions):
+    # append zeros to first action (unbatched)
+    ex_actions = np.concatenate((np.zeros((1, actions.shape[1])), actions), axis=0)
+    delta = ex_actions[1:] - ex_actions[:-1]
+    return delta
+def to_local_coords(
+    positions: np.ndarray, curr_pos: np.ndarray, curr_yaw: float
+) -> np.ndarray:
+    """
+    Convert positions to local coordinates
+    Args:
+        positions (np.ndarray): positions to convert
+        curr_pos (np.ndarray): current position
+        curr_yaw (float): current yaw
+    Returns:
+        np.ndarray: positions in local coordinates
+    """
+    rotmat = yaw_rotmat(curr_yaw)
+    if positions.shape[-1] == 2:
+        rotmat = rotmat[:2, :2]
+    elif positions.shape[-1] == 3:
+        pass
+    else:
+        raise ValueError
+    return (positions - curr_pos).dot(rotmat)
+def calculate_delta_yaw(unnorm_actions):
+    x = unnorm_actions[..., 0]
+    y = unnorm_actions[..., 1]
+    yaw = torch.atan2(y, x).unsqueeze(-1)
+    delta_yaw = torch.cat((torch.zeros(yaw.shape[0], 1, yaw.shape[2]).to(yaw.device), yaw), dim=1)
+    delta_yaw = delta_yaw[:, 1:, :] - delta_yaw[:, :-1, :]
+    return delta_yaw
+def save_planning_pred(dataset_save_output_dir, B, idxs, obs_image, goal_image, preds, deltas, loss, gt_actions, plan_iter=0):
+    for batch_idx, idx in enumerate(idxs.flatten()):
+        sample_idx = int(idx)
+        sample_folder = os.path.join(dataset_save_output_dir, f'id_{sample_idx}')
+        os.makedirs(sample_folder, exist_ok=True)
+        preds_save = {
+            'obs_image': obs_image[batch_idx],
+            'goal_image': goal_image[batch_idx],
+            'preds': preds[batch_idx],
+            'deltas': deltas[batch_idx],
+            'loss': loss[batch_idx],
+            'gt_actions': gt_actions[batch_idx],
+        }
+        preds_file = os.path.join(sample_folder, f"preds_{plan_iter}.pth")
+        torch.save(preds_save, preds_file)
+class CenterCropAR:
+    def __init__(self, ar: float = IMAGE_ASPECT_RATIO):
+        self.ar = ar
+    def __call__(self, img: Image.Image):
+        w, h = img.size
+        if w > h:
+            img = TF.center_crop(img, (h, int(h * self.ar)))
+        else:
+            img = TF.center_crop(img, (int(w / self.ar), w))
+        return img
+transform = transforms.Compose([
+    CenterCropAR(),
+    transforms.Resize((224, 224)),
+    transforms.ToTensor(),
+    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
+])
+unnormalize = transforms.Normalize(
+    mean=[-0.5 / 0.5, -0.5 / 0.5, -0.5 / 0.5],
+    std=[1 / 0.5, 1 / 0.5, 1 / 0.5]
+)

models.py ADDED Viewed

	@@ -0,0 +1,482 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# GLIDE: https://github.com/openai/glide-text2im
+# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
+# --------------------------------------------------------
+import torch
+import torch.nn as nn
+import numpy as np
+import math
+from timm.models.vision_transformer import PatchEmbed, Attention, Mlp
+def modulate(x, shift, scale):
+    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+#################################################################################
+#                  Embedding Layers for Timesteps and Actions                   #
+#################################################################################
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, hidden_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+    @staticmethod
+    def timestep_embedding(t, dim, max_period=10000):
+        """
+        Create sinusoidal timestep embeddings.
+        :param t: a 1-D Tensor of N indices, one per batch element.
+                          These may be fractional.
+        :param dim: the dimension of the output.
+        :param max_period: controls the minimum frequency of the embeddings.
+        :return: an (N, D) Tensor of positional embeddings.
+        """
+        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+        half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+        ).to(device=t.device)
+        args = t.float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+        return embedding
+    def forward(self, t):
+        t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+        t_emb = self.mlp(t_freq)
+        return t_emb
+class ActionEmbedder(nn.Module):
+    """
+    Embeds action xy into vector representations.
+    """
+    def __init__(self, hidden_size, frequency_embedding_size=256):
+        super().__init__()
+        hsize = hidden_size//3
+        self.x_emb = TimestepEmbedder(hsize, frequency_embedding_size)
+        self.y_emb = TimestepEmbedder(hsize, frequency_embedding_size)
+        self.angle_emb = TimestepEmbedder(hidden_size -2*hsize, frequency_embedding_size)
+    def forward(self, xya):
+        return torch.cat([self.x_emb(xya[...,0:1]), self.y_emb(xya[...,1:2]), self.angle_emb(xya[...,2:3])], dim=-1)
+#################################################################################
+#                                 Core AVCDiT Model                                #
+#################################################################################
+class AVCDiTBlock(nn.Module):
+    """
+    A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning and two modalities.
+    """
+    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, mode="av", **block_kwargs):
+        super().__init__()
+        self.mode = mode
+        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
+        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.norm_cond = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.cttn = nn.MultiheadAttention(hidden_size, num_heads=num_heads, add_bias_kv=True, bias=True, batch_first=True, **block_kwargs)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 11 * hidden_size, bias=True)
+        )
+        self.norm3 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        approx_gelu = lambda: nn.GELU(approximate="tanh")
+        if self.mode == "av" or self.mode == "v":
+            self.mlp_v = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
+        if self.mode == "av" or self.mode == "a":
+            self.mlp_a = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
+    # def forward(self, x_v, x_a, c, x_v_cond, x_a_cond, mode="av"):
+    def forward(self, *args):
+        if self.mode == "av":
+            x_v, x_a, c, x_v_cond, x_a_cond = args
+            shift_msa, scale_msa, gate_msa, shift_ca_xcond, scale_ca_xcond, shift_ca_x, scale_ca_x, gate_ca_x, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(11, dim=1)
+            _, v_token_num, _ = x_v.shape
+            x = torch.cat([x_v, x_a], dim=1)
+            x_cond = torch.cat([x_v_cond, x_a_cond], dim=1)
+            x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
+            x_cond_norm = modulate(self.norm_cond(x_cond), shift_ca_xcond, scale_ca_xcond)
+            x = x + gate_ca_x.unsqueeze(1) * self.cttn(query=modulate(self.norm2(x), shift_ca_x, scale_ca_x), key=x_cond_norm, value=x_cond_norm, need_weights=False)[0]
+            x_v = x[:,:v_token_num,:]
+            x_a = x[:,v_token_num:,:]
+            x_v = x_v + gate_mlp.unsqueeze(1) * self.mlp_v(modulate(self.norm3(x_v), shift_mlp, scale_mlp))
+            x_a = x_a + gate_mlp.unsqueeze(1) * self.mlp_a(modulate(self.norm3(x_a), shift_mlp, scale_mlp))
+            return x_v, x_a
+        elif self.mode == "v":
+            x, c, x_cond = args
+            shift_msa, scale_msa, gate_msa, shift_ca_xcond, scale_ca_xcond, shift_ca_x, scale_ca_x, gate_ca_x, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(11, dim=1)
+            x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
+            x_cond_norm = modulate(self.norm_cond(x_cond), shift_ca_xcond, scale_ca_xcond)
+            x = x + gate_ca_x.unsqueeze(1) * self.cttn(query=modulate(self.norm2(x), shift_ca_x, scale_ca_x), key=x_cond_norm, value=x_cond_norm, need_weights=False)[0]
+            x = x + gate_mlp.unsqueeze(1) * self.mlp_v(modulate(self.norm3(x), shift_mlp, scale_mlp))
+            return x
+        elif self.mode == "a":
+            x, c, x_cond = args
+            shift_msa, scale_msa, gate_msa, shift_ca_xcond, scale_ca_xcond, shift_ca_x, scale_ca_x, gate_ca_x, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(11, dim=1)
+            x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
+            x_cond_norm = modulate(self.norm_cond(x_cond), shift_ca_xcond, scale_ca_xcond)
+            x = x + gate_ca_x.unsqueeze(1) * self.cttn(query=modulate(self.norm2(x), shift_ca_x, scale_ca_x), key=x_cond_norm, value=x_cond_norm, need_weights=False)[0]
+            x = x + gate_mlp.unsqueeze(1) * self.mlp_a(modulate(self.norm3(x), shift_mlp, scale_mlp))
+            return x
+class FinalLayer(nn.Module):
+    """
+    The final layer of DiT.
+    """
+    def __init__(self, hidden_size, patch_size, out_channels):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
+        )
+    def forward(self, x, c):
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
+        x = modulate(self.norm_final(x), shift, scale)
+        x = self.linear(x)
+        return x
+class FinalLayer_audio(nn.Module):
+    def __init__(self, hidden_size, out_channels):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, out_channels, bias=True)  # no patch²
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
+        )
+    def forward(self, x, c):
+        # x: (B, N, hidden_size), c: (B, hidden_size)
+        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)  # shape (B, hidden_size)
+        x = modulate(self.norm_final(x), shift, scale)  # apply AdaLN
+        x = self.linear(x)  # → (B, N, out_channels)
+        return x
+class AVCDiT(nn.Module):
+    """
+    Diffusion model with a Transformer backbone.
+    """
+    def __init__(
+        self,
+        input_size=32,
+        context_size=2,
+        patch_size=2,
+        in_channels=4,
+        hidden_size=1152,
+        depth=28,
+        num_heads=16,
+        mlp_ratio=4.0,
+        learn_sigma=True,
+        num_patches_a=180,
+        mode="av",
+    ):
+        super().__init__()
+        self.mode = mode
+        assert (self.mode=="av" or self.mode=="v" or self.mode=="a")
+        self.context_size = context_size
+        self.learn_sigma = learn_sigma
+        self.in_channels = in_channels
+        self.out_channels = in_channels * 2 if learn_sigma else in_channels
+        self.patch_size = patch_size
+        self.num_heads = num_heads
+        if self.mode == "av" or self.mode == "v":
+            self.x_embedder_v = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
+            num_patches_v = self.x_embedder_v.num_patches
+            self.pos_embed_v = nn.Parameter(torch.zeros(self.context_size + 1, num_patches_v, hidden_size), requires_grad=True) # for context and for predicted frame
+            self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
+        if self.mode == "av" or self.mode == "a":
+            self.x_embedder_a = nn.Conv1d(
+                in_channels=16,
+                out_channels=hidden_size, # [B]
+                kernel_size=1,
+                stride=1,
+                bias=True
+            ) #TODO
+            self.pos_embed_a_cond = nn.Parameter(torch.zeros(self.context_size, num_patches_a, hidden_size), requires_grad=True)
+            self.pos_embed_a_pred = nn.Parameter(torch.zeros(1, num_patches_a+1, hidden_size), requires_grad=True)
+            self.final_layer_a = FinalLayer_audio(hidden_size=hidden_size, out_channels=32) # [B]
+        self.t_embedder = TimestepEmbedder(hidden_size)
+        self.y_embedder = ActionEmbedder(hidden_size)
+        # self.blocks = nn.ModuleList([AVCDiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)])
+        self.blocks = nn.ModuleList([
+            AVCDiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, mode=self.mode)
+            for _ in range(depth)
+        ])
+        self.time_embedder = TimestepEmbedder(hidden_size)
+        self.initialize_weights()
+    def initialize_weights(self):
+        # Initialize transformer layers:
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # Initialize (and freeze) pos_embed by sin-cos embedding:
+        if self.mode == "av" or self.mode == "v":
+            nn.init.normal_(self.pos_embed_v, std=0.02)
+        if self.mode == "av" or self.mode == "a":
+            nn.init.normal_(self.pos_embed_a_pred, std=0.02)
+            nn.init.normal_(self.pos_embed_a_cond, std=0.02)
+        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
+        if self.mode == "av" or self.mode == "v":
+            w = self.x_embedder_v.proj.weight.data
+            nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+            nn.init.constant_(self.x_embedder_v.proj.bias, 0)
+        # Initialize x_embedder_a (Conv1d) like linear
+        if self.mode == "av" or self.mode == "a":
+            w = self.x_embedder_a.weight.data
+            nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+            nn.init.constant_(self.x_embedder_a.bias, 0)
+        # Initialize action embedding:
+        nn.init.normal_(self.y_embedder.x_emb.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.y_embedder.x_emb.mlp[2].weight, std=0.02)
+        nn.init.normal_(self.y_embedder.y_emb.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.y_embedder.y_emb.mlp[2].weight, std=0.02)
+        nn.init.normal_(self.y_embedder.angle_emb.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.y_embedder.angle_emb.mlp[2].weight, std=0.02)
+        # Initialize timestep embedding MLP:
+        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+        nn.init.normal_(self.time_embedder.mlp[0].weight, std=0.02)
+        nn.init.normal_(self.time_embedder.mlp[2].weight, std=0.02)
+        # Zero-out adaLN modulation layers in DiT blocks:
+        for block in self.blocks:
+            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
+        # Zero-out output layers:
+        if self.mode == "av" or self.mode == "v":
+            nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
+            nn.init.constant_(self.final_layer.linear.weight, 0)
+            nn.init.constant_(self.final_layer.linear.bias, 0)
+        if self.mode == "av" or self.mode == "a":
+            nn.init.constant_(self.final_layer_a.adaLN_modulation[-1].weight, 0)
+            nn.init.constant_(self.final_layer_a.adaLN_modulation[-1].bias, 0)
+            nn.init.constant_(self.final_layer_a.linear.weight, 0)
+            nn.init.constant_(self.final_layer_a.linear.bias, 0)
+    def unpatchify(self, x):
+        """
+        x: (N, T, patch_size**2 * C)
+        imgs: (N, H, W, C)
+        """
+        c = self.out_channels
+        p = self.x_embedder_v.patch_size[0]
+        h = w = int(x.shape[1] ** 0.5)
+        assert h * w == x.shape[1]
+        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
+        x = torch.einsum('nhwpqc->nchpwq', x)
+        imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
+        return imgs
+    # def forward(self, x_v, x_a, t, y, x_v_cond, x_a_cond, rel_t):
+    # def forward(self, *args):
+    def forward(self, *args, **kwargs):
+        """
+        Forward pass of DiT.
+        x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
+        t: (N,) tensor of diffusion timesteps
+        y: (N,) tensor of class labels
+        """
+        if self.mode == "av":
+            if len(args) >= 7:
+                x_v, x_a, t, y, x_v_cond, x_a_cond, rel_t = args[:7]
+            else:
+                assert len(args) == 3, f"mode='v' expects 2 or 5 positional args, got {len(args)}"
+                x_v, x_a, t = args
+                y = kwargs["y"]
+                x_v_cond = kwargs["x_v_cond"]
+                x_a_cond = kwargs["x_a_cond"]
+                rel_t = kwargs["rel_t"]
+            x_v = self.x_embedder_v(x_v) + self.pos_embed_v[self.context_size:]
+            x_v_cond = self.x_embedder_v(x_v_cond.flatten(0, 1)).unflatten(0, (x_v_cond.shape[0], x_v_cond.shape[1])) + self.pos_embed_v[:self.context_size]  # (N, T, D), where T = H * W / patch_size ** 2.flatten(1, 2)
+            x_v_cond = x_v_cond.flatten(1, 2)
+            x_a = self.x_embedder_a(x_a)  # → (B, embed_dim, L')
+            x_a = x_a.transpose(1, 2)  # → (B, L', embed_dim)
+            x_a = x_a + self.pos_embed_a_pred
+            x_a_cond = self.x_embedder_a(x_a_cond.flatten(0, 1)).transpose(1, 2).unflatten(0, (x_a_cond.shape[0], x_a_cond.shape[1])) + self.pos_embed_a_cond
+            x_a_cond = x_a_cond.flatten(1, 2)
+            t = self.t_embedder(t[..., None])
+            y = self.y_embedder(y)
+            time_emb = self.time_embedder(rel_t[..., None])
+            c = t + time_emb + y # if training on unlabeled data, dont add y.
+            for block in self.blocks:
+                x_v, x_a = block(x_v, x_a, c, x_v_cond, x_a_cond)
+            x_v = self.final_layer(x_v, c)
+            x_v = self.unpatchify(x_v)
+            x_a = self.final_layer_a(x_a, c)
+            x_a = x_a.transpose(1, 2)
+            return x_v, x_a
+        elif self.mode == "v":
+            if len(args) >= 5:
+                x, t, y, x_cond, rel_t = args[:5]
+            else:
+                assert len(args) == 2, f"mode='v' expects 2 or 5 positional args, got {len(args)}"
+                x, t = args
+                y = kwargs["y"]
+                x_cond = kwargs["x_cond"]
+                rel_t = kwargs["rel_t"]
+            x = self.x_embedder_v(x) + self.pos_embed_v[self.context_size:]
+            x_cond = self.x_embedder_v(x_cond.flatten(0, 1)).unflatten(0, (x_cond.shape[0], x_cond.shape[1])) + self.pos_embed_v[:self.context_size]  # (N, T, D), where T = H * W / patch_size ** 2.flatten(1, 2)
+            x_cond = x_cond.flatten(1, 2)
+            t = self.t_embedder(t[..., None])
+            y = self.y_embedder(y)
+            time_emb = self.time_embedder(rel_t[..., None])
+            c = t + time_emb + y # if training on unlabeled data, dont add y.
+            for block in self.blocks:
+                x = block(x, c, x_cond)
+            x = self.final_layer(x, c)
+            x = self.unpatchify(x)
+            return x
+        elif self.mode == "a":
+            if len(args) >= 5:
+                x, t, y, x_cond, rel_t = args[:5]
+            else:
+                assert len(args) == 2, f"mode='v' expects 2 or 5 positional args, got {len(args)}"
+                x, t = args
+                y = kwargs["y"]
+                x_cond = kwargs["x_cond"]
+                rel_t = kwargs["rel_t"]
+            x = self.x_embedder_a(x)  # → (B, embed_dim, L')
+            x = x.transpose(1, 2)  # → (B, L', embed_dim)
+            x = x + self.pos_embed_a_pred  # [REWARD]
+            x_cond = self.x_embedder_a(x_cond.flatten(0, 1)).transpose(1, 2).unflatten(0, (x_cond.shape[0], x_cond.shape[1])) + self.pos_embed_a_cond  # [REWARD]
+            x_cond = x_cond.flatten(1, 2)
+            t = self.t_embedder(t[..., None])
+            y = self.y_embedder(y)
+            time_emb = self.time_embedder(rel_t[..., None])
+            c = t + time_emb + y # if training on unlabeled data, dont add y.
+            for block in self.blocks:
+                x = block(x, c, x_cond)
+            x = self.final_layer_a(x, c)
+            x = x.transpose(1, 2)
+            return x
+#################################################################################
+#                   Sine/Cosine Positional Embedding Functions                  #
+#################################################################################
+# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    grid_h = np.arange(grid_size, dtype=np.float32)
+    grid_w = np.arange(grid_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+    grid = grid.reshape([2, 1, grid_size, grid_size])
+    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
+    if cls_token and extra_tokens > 0:
+        pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
+    assert embed_dim % 2 == 0
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)
+    emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
+    return emb
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float64)
+    omega /= embed_dim / 2.
+    omega = 1. / 10000**omega  # (D/2,)
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product
+    emb_sin = np.sin(out) # (M, D/2)
+    emb_cos = np.cos(out) # (M, D/2)
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+#################################################################################
+#                                   AVCDiT Configs                                  #
+#################################################################################
+def AVCDiT_XL_2(**kwargs):
+    return AVCDiT(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)
+def AVCDiT_L_2(**kwargs):
+    return AVCDiT(depth=24, hidden_size=1024, patch_size=2, num_heads=16, **kwargs)
+def AVCDiT_B_2(**kwargs):
+    return AVCDiT(depth=12, hidden_size=768, patch_size=2, num_heads=12, **kwargs)
+def AVCDiT_S_2(**kwargs):
+    return AVCDiT(depth=12, hidden_size=384, patch_size=2, num_heads=6, **kwargs)
+AVCDiT_models = {
+    'AVCDiT-XL/2': AVCDiT_XL_2,
+    'AVCDiT-L/2':  AVCDiT_L_2,
+    'AVCDiT-B/2':  AVCDiT_B_2,
+    'AVCDiT-S/2':  AVCDiT_S_2
+}

soundstream.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils import weight_norm
+from vector_quantize_pytorch import ResidualVQ
+class CausalConv1d(nn.Conv1d):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.causal_padding = self.dilation[0] * (self.kernel_size[0] - 1)
+    def forward(self, x):
+        return self._conv_forward(F.pad(x, [self.causal_padding, 0]), self.weight, self.bias)
+class CausalConvTranspose1d(nn.ConvTranspose1d):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.causal_padding = self.dilation[0] * (self.kernel_size[0] - 1) + self.output_padding[0] + 1 - self.stride[0]
+    def forward(self, x, output_size=None):
+        if self.padding_mode != 'zeros':
+            raise ValueError('Only `zeros` padding mode is supported for ConvTranspose1d')
+        assert isinstance(self.padding, tuple)
+        output_padding = self._output_padding(
+            x, output_size, self.stride, self.padding, self.kernel_size, self.dilation)
+        return F.conv_transpose1d(
+            x, self.weight, self.bias, self.stride, self.padding,
+            output_padding, self.groups, self.dilation)[...,:-self.causal_padding]
+class ResidualUnit(nn.Module):
+    def __init__(self, in_channels, out_channels, dilation):
+        super().__init__()
+        self.dilation = dilation
+        self.layers = nn.Sequential(
+            CausalConv1d(in_channels=in_channels, out_channels=out_channels,
+                      kernel_size=7, dilation=dilation),
+            nn.ELU(),
+            nn.Conv1d(in_channels=in_channels, out_channels=out_channels,
+                      kernel_size=1)
+        )
+    def forward(self, x):
+        return x + self.layers(x)
+class EncoderBlock(nn.Module):
+    def __init__(self, out_channels, stride):
+        super().__init__()
+        self.layers = nn.Sequential(
+            ResidualUnit(in_channels=out_channels//2,
+                         out_channels=out_channels//2, dilation=1),
+            nn.ELU(),
+            ResidualUnit(in_channels=out_channels//2,
+                         out_channels=out_channels//2, dilation=3),
+            nn.ELU(),
+            ResidualUnit(in_channels=out_channels//2,
+                         out_channels=out_channels//2, dilation=9),
+            nn.ELU(),
+            CausalConv1d(in_channels=out_channels//2, out_channels=out_channels,
+                      kernel_size=2*stride, stride=stride)
+        )
+    def forward(self, x):
+        return self.layers(x)
+class DecoderBlock(nn.Module):
+    def __init__(self, out_channels, stride):
+        super().__init__()
+        self.layers = nn.Sequential(
+            CausalConvTranspose1d(in_channels=2*out_channels,
+                               out_channels=out_channels,
+                               kernel_size=2*stride, stride=stride),
+            nn.ELU(),
+            ResidualUnit(in_channels=out_channels, out_channels=out_channels,
+                         dilation=1),
+            nn.ELU(),
+            ResidualUnit(in_channels=out_channels, out_channels=out_channels,
+                         dilation=3),
+            nn.ELU(),
+            ResidualUnit(in_channels=out_channels, out_channels=out_channels,
+                         dilation=9),
+        )
+    def forward(self, x):
+        return self.layers(x)
+class Encoder(nn.Module):
+    def __init__(self, C, D):
+        super().__init__()
+        self.layers = nn.Sequential(
+            CausalConv1d(in_channels=2, out_channels=C, kernel_size=7),
+            nn.ELU(),
+            EncoderBlock(out_channels=2*C, stride=2),
+            nn.ELU(),
+            EncoderBlock(out_channels=4*C, stride=4),
+            nn.ELU(),
+            EncoderBlock(out_channels=8*C, stride=5),
+            nn.ELU(),
+            # EncoderBlock(out_channels=16*C, stride=8),
+            # nn.ELU(),
+            # CausalConv1d(in_channels=16*C, out_channels=D, kernel_size=3)
+            CausalConv1d(in_channels=8*C, out_channels=D, kernel_size=3)
+        )
+    def forward(self, x):
+        return self.layers(x)
+class Decoder(nn.Module):
+    def __init__(self, C, D):
+        super().__init__()
+        self.layers = nn.Sequential(
+            CausalConv1d(in_channels=D, out_channels=8*C, kernel_size=7),
+            # CausalConv1d(in_channels=D, out_channels=16*C, kernel_size=7),
+            # nn.ELU(),
+            # DecoderBlock(out_channels=8*C, stride=8),
+            nn.ELU(),
+            DecoderBlock(out_channels=4*C, stride=5),
+            nn.ELU(),
+            DecoderBlock(out_channels=2*C, stride=4),
+            nn.ELU(),
+            DecoderBlock(out_channels=C, stride=2),
+            nn.ELU(),
+            CausalConv1d(in_channels=C, out_channels=2, kernel_size=7)
+        )
+    def forward(self, x):
+        return self.layers(x)
+class SoundStream(nn.Module):
+    def __init__(self, C, D, n_q, codebook_size):
+        super().__init__()
+        self.encoder = Encoder(C=C, D=D)
+        self.quantizer = ResidualVQ(
+            num_quantizers=n_q, dim=D, codebook_size=codebook_size,
+            kmeans_init=True, kmeans_iters=100, threshold_ema_dead_code=2
+        )
+        self.decoder = Decoder(C=C, D=D)
+    @staticmethod
+    def pad_to_multiple(x, multiple):
+        """
+        x: [B, C, T]
+        multiple: int, e.g., 320
+        return: padded_x, original_length
+        """
+        B, C, T = x.shape
+        target_len = ((T + multiple - 1) // multiple) * multiple
+        pad_len = target_len - T
+        padded_x = F.pad(x, (0, pad_len), mode='reflect')
+        return padded_x, T
+    @staticmethod
+    def crop_to_length(x, original_length):
+        return x[..., :original_length]
+    def forward(self, x):
+        e = self.encoder(x)        # [B, D, T']
+        e = e.permute(0, 2, 1)     # → [B, T', D]
+        quantized, _, _ = self.quantizer(e)
+        quantized = quantized.permute(0, 2, 1)  # → [B, D, T']
+        o = self.decoder(quantized)  # → [B, 2, T_padded]
+        return o

train_avwm_stage1.py ADDED Viewed

	@@ -0,0 +1,463 @@

+from inference_avwm import model_forward_wrapper_v
+import torch
+# the first flag below was False when we tested this script but True makes A100 training a lot faster:
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+import matplotlib
+matplotlib.use('Agg')
+from collections import OrderedDict
+from copy import deepcopy
+from time import time
+import argparse
+import logging
+import os
+import matplotlib.pyplot as plt
+import yaml
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.data import DataLoader, ConcatDataset
+from torch.utils.data.distributed import DistributedSampler
+from diffusers.models import AutoencoderKL
+from distributed import init_distributed
+from models import AVCDiT_models
+from diffusion import create_diffusion
+from datasets import TrainingDataset
+from misc import transform
+#################################################################################
+#                             Training Helper Functions                         #
+#################################################################################
+def load_checkpoint_if_available(model, ema, opt, scaler, config, device, logger, args):
+    start_epoch = 0
+    train_steps = 0
+    latest_path = os.path.join(config['results_dir'], config['run_name'], "checkpoints", "latest.pth.tar")
+    if os.path.isfile(latest_path) or config.get('from_checkpoint', 0):
+        latest_path = latest_path if os.path.isfile(latest_path) else config.get('from_checkpoint', 0)
+        print("Loading model from ", latest_path)
+        checkpoint = torch.load(latest_path, map_location=f"cuda:{device}", weights_only=False)
+        ema_ckp = {k.replace('_orig_mod.', ''): v for k, v in checkpoint["ema"].items()}
+        remapped = {}
+        for k, v in ema_ckp.items():
+            new_k = k
+            # 1) pos_embed -> pos_embed_v
+            if k.startswith("pos_embed"):
+                new_k = k.replace("pos_embed", "pos_embed_v", 1)
+            # 2) x_embedder. -> x_embedder_v.
+            if new_k.startswith("x_embedder."):
+                new_k = new_k.replace("x_embedder.", "x_embedder_v.", 1)
+            # 3) blocks.*.mlp.*:  .mlp. -> .mlp_v.
+            if new_k.startswith("blocks.") and ".mlp." in new_k:
+                new_k = new_k.replace(".mlp.", ".mlp_v.", 1)
+            remapped[new_k] = v
+        ema_ckp = remapped
+        model.load_state_dict(ema_ckp, strict=True)
+        print("Model weights loaded.")
+        ema.load_state_dict(ema_ckp, strict=True)
+        print("EMA weights loaded.")
+        if args.restart_from_checkpoint:
+            logger.info("Restarting training: epoch and step counters set to 0.")
+        else:
+            if "opt" in checkpoint:
+                opt_ckp = {k.replace('_orig_mod.', ''): v for k, v in checkpoint["opt"].items()}
+                opt.load_state_dict(opt_ckp)
+                print("Optimizer state loaded.")
+            if "scaler" in checkpoint and scaler is not None:
+                scaler.load_state_dict(checkpoint["scaler"])
+                print("GradScaler state loaded.")
+            if "epoch" in checkpoint:
+                start_epoch = checkpoint["epoch"] + 1
+            if "train_steps" in checkpoint:
+                train_steps = checkpoint["train_steps"]
+            logger.info(f"Resuming from epoch {start_epoch}, step {train_steps}")
+    return start_epoch, train_steps
+@torch.no_grad()
+def update_ema(ema_model, model, decay=0.9999):
+    """
+    Step the EMA model towards the current model.
+    """
+    ema_params = OrderedDict(ema_model.named_parameters())
+    model_params = OrderedDict(model.named_parameters())
+    for name, param in model_params.items():
+        name = name.replace('_orig_mod.', '')
+        ema_params[name].mul_(decay).add_(param.data, alpha=1 - decay)
+def requires_grad(model, flag=True):
+    """
+    Set requires_grad flag for all parameters in a model.
+    """
+    for p in model.parameters():
+        p.requires_grad = flag
+def cleanup():
+    """
+    End DDP training by destroying the torch.distributed process group.
+    """
+    dist.destroy_process_group()
+def create_logger(logging_dir):
+    """
+    Create a logger that writes to a log file and stdout.
+    """
+    if dist.get_rank() == 0:  # real logger
+        logging.basicConfig(
+            level=logging.INFO,
+            format='[\033[34m%(asctime)s\033[0m] %(message)s',
+            datefmt='%Y-%m-%d %H:%M:%S',
+            handlers=[logging.StreamHandler(), logging.FileHandler(f"{logging_dir}/log.txt")]
+        )
+        logger = logging.getLogger(__name__)
+    else:  # dummy logger (does nothing)
+        logger = logging.getLogger(__name__)
+        logger.addHandler(logging.NullHandler())
+    return logger
+#################################################################################
+#                                  Training Loop                                #
+#################################################################################
+def main(args):
+    """
+    Trains a new AVCDiT model.
+    """
+    assert torch.cuda.is_available(), "Training currently requires at least one GPU."
+    # Setup DDP:
+    _, rank, device, _ = init_distributed()
+    # rank = dist.get_rank()
+    seed = args.global_seed * dist.get_world_size() + rank
+    torch.manual_seed(seed)
+    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")
+    with open("config/eval_config.yaml", "r") as f:
+        default_config = yaml.safe_load(f)
+    config = default_config
+    with open(args.config, "r") as f:
+        user_config = yaml.safe_load(f)
+    config.update(user_config)
+    # Setup an experiment folder:
+    os.makedirs(config['results_dir'], exist_ok=True)  # Make results folder (holds all experiment subfolders)
+    experiment_dir = f"{config['results_dir']}/{config['run_name']}"  # Create an experiment folder
+    checkpoint_dir = f"{experiment_dir}/checkpoints"  # Stores saved model checkpoints
+    if rank == 0:
+        os.makedirs(checkpoint_dir, exist_ok=True)
+        logger = create_logger(experiment_dir)
+        logger.info(f"Experiment directory created at {experiment_dir}")
+    else:
+        logger = create_logger(None)
+    # Create model:
+    tokenizer = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-ema").to(device)
+    latent_size = config['image_size'] // 8
+    assert config['image_size'] % 8 == 0, "Image size must be divisible by 8 (for the VAE encoder)."
+    num_cond = config['context_size']
+    model = AVCDiT_models[config['model']](context_size=num_cond, input_size=latent_size, in_channels=4, mode="v").to(device)
+    ema = deepcopy(model).to(device)  # Create an EMA of the model for use after training
+    requires_grad(ema, False)
+    # Setup optimizer (we used default Adam betas=(0.9, 0.999) and a constant learning rate of 1e-4 in our paper):
+    lr = float(config.get('lr', 1e-4))
+    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0)
+    bfloat_enable = bool(hasattr(args, 'bfloat16') and args.bfloat16)
+    if bfloat_enable:
+        scaler = torch.amp.GradScaler()
+    # load existing checkpoint
+    # latest_path = os.path.join(checkpoint_dir, "latest.pth.tar")
+    # === Load checkpoint or start from a pretrained one ===
+    start_epoch, train_steps = load_checkpoint_if_available(
+        model, ema, opt, scaler if bfloat_enable else None, config, device, logger, args
+    )
+    # ~40% speedup but might leads to worse performance depending on pytorch version
+    if args.torch_compile:
+        model = torch.compile(model)
+    model = DDP(model, device_ids=[device])
+    diffusion = create_diffusion(timestep_respacing="")  # default: 1000 steps, linear noise schedule
+    # ,predict_xstart=True
+    logger.info(f"AVCDiT Parameters: {sum(p.numel() for p in model.parameters()):,}")
+    train_dataset = []
+    test_dataset = []
+    for dataset_name in config["datasets"]:
+        data_config = config["datasets"][dataset_name]
+        for data_split_type in ["train", "test"]:
+            if data_split_type in data_config:
+                    goals_per_obs = int(data_config["goals_per_obs"])
+                    if data_split_type == 'test':
+                        goals_per_obs = 4 # standardize testing
+                    if "distance" in data_config:
+                        min_dist_cat=data_config["distance"]["min_dist_cat"]
+                        max_dist_cat=data_config["distance"]["max_dist_cat"]
+                    else:
+                        min_dist_cat=config["distance"]["min_dist_cat"]
+                        max_dist_cat=config["distance"]["max_dist_cat"]
+                    if "len_traj_pred" in data_config:
+                        len_traj_pred=data_config["len_traj_pred"]
+                    else:
+                        len_traj_pred=config["len_traj_pred"]
+                    dataset = TrainingDataset(
+                        data_folder=data_config["data_folder"],
+                        data_split_folder=data_config[data_split_type],
+                        dataset_name=dataset_name,
+                        image_size=config["image_size"],
+                        min_dist_cat=min_dist_cat,
+                        max_dist_cat=max_dist_cat,
+                        len_traj_pred=len_traj_pred,
+                        context_size=config["context_size"],
+                        normalize=config["normalize"],
+                        goals_per_obs=goals_per_obs,
+                        transform=transform,
+                        predefined_index=None,
+                        traj_stride=1,
+                        evaluate=(data_split_type=="test")
+                    )
+                    if data_split_type == "train":
+                        train_dataset.append(dataset)
+                    else:
+                        test_dataset.append(dataset)
+                    print(f"Dataset: {dataset_name} ({data_split_type}), size: {len(dataset)}")
+    # combine all the datasets from different robots
+    print(f"Combining {len(train_dataset)} datasets.")
+    train_dataset = ConcatDataset(train_dataset)
+    test_dataset = ConcatDataset(test_dataset)
+    sampler = DistributedSampler(
+        train_dataset,
+        num_replicas=dist.get_world_size(),
+        rank=rank,
+        shuffle=True,
+        seed=args.global_seed
+    )
+    loader = DataLoader(
+        train_dataset,
+        batch_size=config['batch_size'],
+        shuffle=False,
+        sampler=sampler,
+        num_workers=config['num_workers'],
+        pin_memory=True,
+        drop_last=True,
+        persistent_workers=True
+    )
+    logger.info(f"Dataset contains {len(train_dataset):,} images")
+    # Prepare models for training:
+    model.train()  # important! This enables embedding dropout for classifier-free guidance
+    ema.eval()  # EMA model should always be in eval mode
+    # Variables for monitoring/logging purposes:
+    log_steps = 0
+    running_loss = 0
+    start_time = time()
+    logger.info(f"Training for {args.epochs} epochs...")
+    for epoch in range(start_epoch, args.epochs):
+        sampler.set_epoch(epoch)
+        steps_per_epoch = len(loader)
+        if rank == 0:
+            logger.info(f"Epoch {epoch} contains {steps_per_epoch} steps.")
+        logger.info(f"Beginning epoch {epoch}...")
+        for x, _, y, _, rel_t in loader:
+            x = x.to(device, non_blocking=True)
+            y = y.to(device, non_blocking=True)
+            rel_t = rel_t.to(device, non_blocking=True)
+            with torch.amp.autocast('cuda', enabled=bfloat_enable, dtype=torch.bfloat16):
+                with torch.no_grad():
+                    # Map input images to latent space + normalize latents:
+                    B, T = x.shape[:2]
+                    x = x.flatten(0,1)
+                    x = tokenizer.encode(x).latent_dist.sample().mul_(0.18215)
+                    x = x.unflatten(0, (B, T))
+                num_goals = T - num_cond
+                x_start = x[:, num_cond:].flatten(0, 1)
+                x_cond = x[:, :num_cond].unsqueeze(1).expand(B, num_goals, num_cond, x.shape[2], x.shape[3], x.shape[4]).flatten(0, 1)
+                y = y.flatten(0, 1)
+                rel_t = rel_t.flatten(0, 1)
+                t = torch.randint(0, diffusion.num_timesteps, (x_start.shape[0],), device=device)
+                model_kwargs = dict(y=y, x_cond=x_cond, rel_t=rel_t)
+                loss_dict = diffusion.training_losses(model, x_start, t, model_kwargs)
+                loss = loss_dict["loss"].mean()
+            if not bfloat_enable:
+                opt.zero_grad()
+                loss.backward()
+                opt.step()
+            else:
+                scaler.scale(loss).backward()
+                if config.get('grad_clip_val', 0) > 0:
+                    scaler.unscale_(opt)
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config['grad_clip_val'])
+                scaler.step(opt)
+                scaler.update()
+            update_ema(ema, model.module)
+            # Log loss values:
+            running_loss += loss.detach().item()
+            log_steps += 1
+            train_steps += 1
+            if train_steps % args.log_every == 0:
+                # Measure training speed:
+                torch.cuda.synchronize()
+                end_time = time()
+                steps_per_sec = log_steps / (end_time - start_time)
+                samples_per_sec = dist.get_world_size()*x_cond.shape[0]*steps_per_sec
+                # Reduce loss history over all processes:
+                avg_loss = torch.tensor(running_loss / log_steps, device=device)
+                dist.all_reduce(avg_loss, op=dist.ReduceOp.SUM)
+                avg_loss = avg_loss.item() / dist.get_world_size()
+                total_steps = len(loader) * args.epochs
+                progress_pct = train_steps / total_steps * 100
+                remaining_steps = total_steps - train_steps
+                eta_seconds = remaining_steps / steps_per_sec if steps_per_sec > 0 else 0
+                eta_hours = eta_seconds / 3600
+                logger.info(f"(step={train_steps:07d}) Train Loss: {avg_loss:.4f}, Train Steps/Sec: {steps_per_sec:.2f}, Samples/Sec: {samples_per_sec:.2f}")
+                logger.info(f"Progress: {progress_pct:.2f}% | ETA: {eta_hours:.1f}h")
+                # Reset monitoring variables:
+                running_loss = 0
+                log_steps = 0
+                start_time = time()
+            # Save DiT checkpoint:
+            if train_steps % args.ckpt_every == 0 and train_steps > 0:
+                if rank == 0:
+                    checkpoint = {
+                        "model": model.module.state_dict(),
+                        "ema": ema.state_dict(),
+                        "opt": opt.state_dict(),
+                        "args": args,
+                        "epoch": epoch,
+                        "train_steps": train_steps
+                    }
+                    if bfloat_enable:
+                        checkpoint.update({"scaler": scaler.state_dict()})
+                    checkpoint_path = f"{checkpoint_dir}/latest.pth.tar"
+                    torch.save(checkpoint, checkpoint_path)
+                    if train_steps % (10*args.ckpt_every) == 0 and train_steps > 0:
+                        checkpoint_path = f"{checkpoint_dir}/{train_steps:07d}.pth.tar"
+                        torch.save(checkpoint, checkpoint_path)
+                    logger.info(f"Saved checkpoint to {checkpoint_path}")
+            if train_steps % args.eval_every == 0 and train_steps > 0:
+                eval_start_time = time()
+                # validation / test set evaluation
+                save_dir = os.path.join(experiment_dir, str(train_steps))
+                sim_score_val = evaluate(ema, tokenizer, diffusion, test_dataset, rank, config["batch_size"], config["num_workers"], latent_size, device, save_dir, args.global_seed, bfloat_enable, num_cond)
+                dist.barrier()
+                eval_end_time = time()
+                eval_time = eval_end_time - eval_start_time
+                # logger.info(f"(step={train_steps:07d}) Val Perceptual Loss: {sim_score_val:.4f}, Train Perceptual Loss: {sim_score_train:.4f}, Eval Time: {eval_time:.2f}")
+                logger.info(f"(step={train_steps:07d}) Val Perceptual Loss: {sim_score_val:.4f}, Eval Time: {eval_time:.2f}")
+    model.eval()
+    logger.info("Done!")
+    cleanup()
+@torch.no_grad
+def evaluate(model, vae, diffusion, test_dataloaders, rank, batch_size, num_workers, latent_size, device, save_dir, seed, bfloat_enable, num_cond):
+    sampler = DistributedSampler(
+        test_dataloaders,
+        num_replicas=dist.get_world_size(),
+        rank=rank,
+        shuffle=True,
+        seed=seed
+    )
+    loader = DataLoader(
+        test_dataloaders,
+        batch_size=batch_size,
+        shuffle=False,
+        sampler=sampler,
+        num_workers=num_workers,
+        pin_memory=True,
+        drop_last=True
+    )
+    from dreamsim import dreamsim
+    eval_model, _ = dreamsim(pretrained=True)
+    score = torch.tensor(0.).to(device)
+    n_samples = torch.tensor(0).to(device)
+    # Run for 1 step
+    for x, _, y, _, rel_t, _ in loader:
+        x = x.to(device)
+        y = y.to(device)
+        rel_t = rel_t.to(device).flatten(0, 1)
+        with torch.amp.autocast('cuda', enabled=True, dtype=torch.bfloat16):
+            B, T = x.shape[:2]
+            num_goals = T - num_cond
+            samples = model_forward_wrapper_v((model, diffusion, vae), x, y, num_timesteps=None, latent_size=latent_size, device=device, num_cond=num_cond, num_goals=num_goals, rel_t=rel_t)
+            x_start_pixels = x[:, num_cond:].flatten(0, 1)
+            x_cond_pixels = x[:, :num_cond].unsqueeze(1).expand(B, num_goals, num_cond, x.shape[2], x.shape[3], x.shape[4]).flatten(0, 1)
+            samples = samples * 0.5 + 0.5
+            x_start_pixels = x_start_pixels * 0.5 + 0.5
+            x_cond_pixels = x_cond_pixels * 0.5 + 0.5
+            res = eval_model(x_start_pixels, samples)
+            score += res.sum()
+            n_samples += len(res)
+        break
+    if rank == 0:
+        os.makedirs(save_dir, exist_ok=True)
+        for i in range(min(samples.shape[0], 10)):
+            _, ax = plt.subplots(1,3,dpi=256)
+            ax[0].imshow((x_cond_pixels[i, -1].permute(1,2,0).cpu().numpy()*255).astype('uint8'))
+            ax[1].imshow((x_start_pixels[i].permute(1,2,0).cpu().numpy()*255).astype('uint8'))
+            ax[2].imshow((samples[i].permute(1,2,0).cpu().float().numpy()*255).astype('uint8'))
+            plt.savefig(f'{save_dir}/{i}.png')
+            plt.close()
+    dist.all_reduce(score)
+    dist.all_reduce(n_samples)
+    sim_score = score/n_samples
+    return sim_score
+def get_args_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, required=True)
+    parser.add_argument("--epochs", type=int, default=300)
+    # parser.add_argument("--global-batch-size", type=int, default=256)
+    parser.add_argument("--global-seed", type=int, default=0)
+    parser.add_argument("--log-every", type=int, default=100)
+    parser.add_argument("--ckpt-every", type=int, default=2000)
+    parser.add_argument("--eval-every", type=int, default=5000)
+    parser.add_argument("--bfloat16", type=int, default=1)
+    parser.add_argument("--torch-compile", type=int, default=1)
+    parser.add_argument("--restart-from-checkpoint", type=int, default=0,
+                    help="If 1, only load model weights and reset epoch/step to zero (cold start)")
+    return parser
+if __name__ == "__main__":
+    # Script entry point: parse the command-line options and start training.
+    args = get_args_parser().parse_args()
+    main(args)

train_avwm_stage2.py ADDED Viewed

	@@ -0,0 +1,514 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# NoMaD, GNM, ViNT: https://github.com/robodhruv/visualnav-transformer
+# --------------------------------------------------------
+from inference_avwm import model_forward_wrapper_a
+import torch
+# the first flag below was False when we tested this script but True makes A100 training a lot faster:
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+import matplotlib
+matplotlib.use('Agg')
+from collections import OrderedDict
+from copy import deepcopy
+from time import time
+import argparse
+import logging
+import os
+import matplotlib.pyplot as plt
+import yaml
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.data import DataLoader, ConcatDataset
+from torch.utils.data.distributed import DistributedSampler
+from diffusers.models import AutoencoderKL
+from distributed import init_distributed
+from models import AVCDiT_models
+from diffusion import create_diffusion
+from datasets import TrainingDataset
+from misc import transform
+from soundstream import SoundStream
+# from audiovae import BinauralSeqTokenCodec
+import torchaudio
+from eval_audio import build_mel_transform, mel_cosine_stereo, drms_avg_db_stereo, save_ref_hat_spectrogram_panel
+def load_checkpoint_if_available(model, ema, opt, scaler, config, device, logger, args):
+    start_epoch = 0
+    train_steps = 0
+    latest_path = os.path.join(config['results_dir'], config['run_name'], "checkpoints", "latest.pth.tar")
+    if os.path.isfile(latest_path) or config.get('from_checkpoint', 0):
+        latest_path = latest_path if os.path.isfile(latest_path) else config.get('from_checkpoint', 0)
+        print("Loading model from ", latest_path)
+        checkpoint = torch.load(latest_path, map_location=f"cuda:{device}", weights_only=False)
+        ema_ckp = {k.replace('_orig_mod.', ''): v for k, v in checkpoint["ema"].items()}
+        remapped = {}
+        for k, v in ema_ckp.items():
+            new_k = k
+            if new_k.startswith("blocks.") and ".mlp_v." in new_k:
+                new_k = new_k.replace(".mlp_v.", ".mlp_a.", 1)
+            remapped[new_k] = v
+        ema_ckp = remapped
+        model_state = model.state_dict()
+        load_info = model.load_state_dict(ema_ckp, strict=False)
+        print("Model weights loaded.")
+        ema.load_state_dict(ema_ckp, strict=False)
+        print("EMA weights loaded.")
+        if args.restart_from_checkpoint:
+            logger.info("Restarting training: epoch and step counters set to 0.")
+        else:
+            try:
+                if "opt" in checkpoint:
+                    opt_ckp = {k.replace('_orig_mod.', ''): v for k, v in checkpoint["opt"].items()}
+                    opt.load_state_dict(opt_ckp)
+                    print("Optimizer state loaded.")
+                if "scaler" in checkpoint and scaler is not None:
+                    scaler.load_state_dict(checkpoint["scaler"])
+                    print("GradScaler state loaded.")
+            except ValueError as e:
+                print(f"[WARN] Skip loading opt and scaler")
+            if "epoch" in checkpoint:
+                start_epoch = checkpoint["epoch"] + 1
+            if "train_steps" in checkpoint:
+                train_steps = checkpoint["train_steps"]
+            logger.info(f"Resuming from epoch {start_epoch}, step {train_steps}")
+    return start_epoch, train_steps
+@torch.no_grad()
+def update_ema(ema_model, model, decay=0.9999):
+    """
+    Step the EMA model towards the current model.
+    """
+    ema_params = OrderedDict(ema_model.named_parameters())
+    model_params = OrderedDict(model.named_parameters())
+    for name, param in model_params.items():
+        name = name.replace('_orig_mod.', '')
+        ema_params[name].mul_(decay).add_(param.data, alpha=1 - decay)
+def requires_grad(model, flag=True):
+    """
+    Set requires_grad flag for all parameters in a model.
+    """
+    for p in model.parameters():
+        p.requires_grad = flag
+def cleanup():
+    """
+    End DDP training.
+    """
+    dist.destroy_process_group()
+def create_logger(logging_dir):
+    """
+    Create a logger that writes to a log file and stdout.
+    """
+    if dist.get_rank() == 0:  # real logger
+        logging.basicConfig(
+            level=logging.INFO,
+            format='[\033[34m%(asctime)s\033[0m] %(message)s',
+            datefmt='%Y-%m-%d %H:%M:%S',
+            handlers=[logging.StreamHandler(), logging.FileHandler(f"{logging_dir}/log.txt")]
+        )
+        logger = logging.getLogger(__name__)
+    else:  # dummy logger (does nothing)
+        logger = logging.getLogger(__name__)
+        logger.addHandler(logging.NullHandler())
+    return logger
+#################################################################################
+#                                  Training Loop                                #
+#################################################################################
+def main(args):
+    """
+    Trains a new AVCDiT model.
+    """
+    assert torch.cuda.is_available(), "Training currently requires at least one GPU."
+    # Setup DDP:
+    _, rank, device, _ = init_distributed()
+    # rank = dist.get_rank()
+    seed = args.global_seed * dist.get_world_size() + rank
+    torch.manual_seed(seed)
+    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")
+    with open("config/eval_config.yaml", "r") as f:
+        default_config = yaml.safe_load(f)
+    config = default_config
+    with open(args.config, "r") as f:
+        user_config = yaml.safe_load(f)
+    config.update(user_config)
+    # Setup an experiment folder:
+    os.makedirs(config['results_dir'], exist_ok=True)  # Make results folder (holds all experiment subfolders)
+    experiment_dir = f"{config['results_dir']}/{config['run_name']}"  # Create an experiment folder
+    checkpoint_dir = f"{experiment_dir}/checkpoints"  # Stores saved model checkpoints
+    if rank == 0:
+        os.makedirs(checkpoint_dir, exist_ok=True)
+        logger = create_logger(experiment_dir)
+        logger.info(f"Experiment directory created at {experiment_dir}")
+    else:
+        logger = create_logger(None)
+    # Create model:
+    tokenizer = SoundStream(C=32, D=16, n_q=8, codebook_size=1024).to(device)
+    tokenizer_path=config["tokenizer_a_path"]
+    checkpoint = torch.load(tokenizer_path, map_location=f"cuda:{device}")
+    tokenizer.load_state_dict(checkpoint["model_state"])
+    tokenizer.eval()
+    latent_size = config['image_size'] // 8
+    assert config['image_size'] % 8 == 0, "Image size must be divisible by 8 (for the VAE encoder)."
+    num_cond = config['context_size']
+    model = AVCDiT_models[config['model']](context_size=num_cond, input_size=latent_size, in_channels=4, mode="a").to(device)
+    ema = deepcopy(model).to(device)
+    requires_grad(ema, False)
+    lr = float(config.get('lr', 1e-4))
+    for param in model.parameters():
+        param.requires_grad = False
+    for param in model.x_embedder_a.parameters():
+        param.requires_grad = True
+    model.pos_embed_a_cond.requires_grad = True
+    model.pos_embed_a_pred.requires_grad = True
+    for param in model.final_layer_a.parameters():
+        param.requires_grad = True
+    for i, block in enumerate(model.blocks):
+        for name, param in block.named_parameters():
+            if name.startswith("mlp."):
+                param.requires_grad = True
+    opt = torch.optim.AdamW(
+        filter(lambda p: p.requires_grad, model.parameters()),
+        lr=lr, weight_decay=0
+    )
+    bfloat_enable = bool(hasattr(args, 'bfloat16') and args.bfloat16)
+    if bfloat_enable:
+        scaler = torch.amp.GradScaler()
+    start_epoch, train_steps = load_checkpoint_if_available(
+        model, ema, opt, scaler if bfloat_enable else None, config, device, logger, args
+    )
+    print("Trainable Parameters: ")
+    for name, param in model.named_parameters():
+        if param.requires_grad:
+            print(f" - {name}: {tuple(param.shape)}")
+    # =======================================================================================#
+    # ~40% speedup but might leads to worse performance depending on pytorch version
+    if args.torch_compile:
+        model = torch.compile(model)
+    model = DDP(model, device_ids=[device])
+    diffusion = create_diffusion(timestep_respacing="")  # default: 1000 steps, linear noise schedule
+    # ,predict_xstart=True
+    logger.info(f"AVCDiT Parameters: {sum(p.numel() for p in model.parameters()):,}")
+    train_dataset = []
+    test_dataset = []
+    for dataset_name in config["datasets"]:
+        data_config = config["datasets"][dataset_name]
+        for data_split_type in ["train", "test"]:
+            if data_split_type in data_config:
+                    goals_per_obs = int(data_config["goals_per_obs"])
+                    if data_split_type == 'test':
+                        goals_per_obs = 4 # standardize testing
+                    if "distance" in data_config:
+                        min_dist_cat=data_config["distance"]["min_dist_cat"]
+                        max_dist_cat=data_config["distance"]["max_dist_cat"]
+                    else:
+                        min_dist_cat=config["distance"]["min_dist_cat"]
+                        max_dist_cat=config["distance"]["max_dist_cat"]
+                    if "len_traj_pred" in data_config:
+                        len_traj_pred=data_config["len_traj_pred"]
+                    else:
+                        len_traj_pred=config["len_traj_pred"]
+                    dataset = TrainingDataset(
+                        data_folder=data_config["data_folder"],
+                        data_split_folder=data_config[data_split_type],
+                        dataset_name=dataset_name,
+                        image_size=config["image_size"],
+                        min_dist_cat=min_dist_cat,
+                        max_dist_cat=max_dist_cat,
+                        len_traj_pred=len_traj_pred,
+                        context_size=config["context_size"],
+                        normalize=config["normalize"],
+                        goals_per_obs=goals_per_obs,
+                        transform=transform,
+                        predefined_index=None,
+                        traj_stride=1,
+                        sample_rate=config["sample_rate"],
+                        input_sr=config["input_sr"],
+                        evaluate=(data_split_type=="test")
+                    )
+                    if data_split_type == "train":
+                        train_dataset.append(dataset)
+                    else:
+                        test_dataset.append(dataset)
+                    print(f"Dataset: {dataset_name} ({data_split_type}), size: {len(dataset)}")
+    # combine all the datasets from different robots
+    print(f"Combining {len(train_dataset)} datasets.")
+    train_dataset = ConcatDataset(train_dataset)
+    test_dataset = ConcatDataset(test_dataset)
+    sampler = DistributedSampler(
+        train_dataset,
+        num_replicas=dist.get_world_size(),
+        rank=rank,
+        shuffle=True,
+        seed=args.global_seed
+    )
+    loader = DataLoader(
+        train_dataset,
+        batch_size=config['batch_size'],
+        shuffle=False,
+        sampler=sampler,
+        num_workers=config['num_workers'],
+        pin_memory=True,
+        drop_last=True,
+        persistent_workers=True
+    )
+    logger.info(f"Dataset contains {len(train_dataset):,} images")
+    # Prepare models for training:
+    model.train()  # important! This enables embedding dropout for classifier-free guidance
+    ema.eval()  # EMA model should always be in eval mode
+    # Variables for monitoring/logging purposes:
+    log_steps = 0
+    running_loss = 0
+    start_time = time()
+    logger.info(f"Training for {args.epochs} epochs...")
+    for epoch in range(start_epoch, args.epochs):
+        sampler.set_epoch(epoch)
+        steps_per_epoch = len(loader)
+        if rank == 0:
+            logger.info(f"Epoch {epoch} contains {steps_per_epoch} steps.")
+        logger.info(f"Beginning epoch {epoch}...")
+        for _, x, y, diff, rel_t in loader:
+            x = x.to(device, non_blocking=True)
+            y = y.to(device, non_blocking=True)
+            diff = diff.to(device, non_blocking=True) # [REWARD]
+            rel_t = rel_t.to(device, non_blocking=True)
+            with torch.amp.autocast('cuda', enabled=bfloat_enable, dtype=torch.bfloat16):
+                with torch.no_grad():
+                    # Map input images to latent space + normalize latents:
+                    B, T = x.shape[:2]
+                    x = x.flatten(0,1)
+                    x = tokenizer.encoder(x)
+                    x = x.unflatten(0, (B, T))
+                num_goals = T - num_cond
+                x_start = x[:, num_cond:].flatten(0, 1)
+                x_cond = x[:, :num_cond].unsqueeze(1).expand(B, num_goals, num_cond, x.shape[2], x.shape[3]).flatten(0, 1)
+                y = y.flatten(0, 1)
+                rel_t = rel_t.flatten(0, 1)
+                diff = diff.flatten(0, 1)
+                diff_tok = diff.unsqueeze(1).expand(-1, 16, -1)
+                x_start = torch.cat([x_start, diff_tok], dim=2)
+                t = torch.randint(0, diffusion.num_timesteps, (x_start.shape[0],), device=device)
+                model_kwargs = dict(y=y, x_cond=x_cond, rel_t=rel_t)
+                loss_dict = diffusion.training_losses(model, x_start, t, model_kwargs)
+                loss = loss_dict["loss"].mean()
+            if not bfloat_enable:
+                opt.zero_grad()
+                loss.backward()
+                opt.step()
+            else:
+                scaler.scale(loss).backward()
+                if config.get('grad_clip_val', 0) > 0:
+                    scaler.unscale_(opt)
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config['grad_clip_val'])
+                scaler.step(opt)
+                scaler.update()
+            update_ema(ema, model.module)
+            # Log loss values:
+            running_loss += loss.detach().item()
+            log_steps += 1
+            train_steps += 1
+            if train_steps % args.log_every == 0:
+                # Measure training speed:
+                torch.cuda.synchronize()
+                end_time = time()
+                steps_per_sec = log_steps / (end_time - start_time)
+                samples_per_sec = dist.get_world_size()*x_cond.shape[0]*steps_per_sec
+                # Reduce loss history over all processes:
+                avg_loss = torch.tensor(running_loss / log_steps, device=device)
+                dist.all_reduce(avg_loss, op=dist.ReduceOp.SUM)
+                avg_loss = avg_loss.item() / dist.get_world_size()
+                total_steps = len(loader) * args.epochs
+                progress_pct = train_steps / total_steps * 100
+                remaining_steps = total_steps - train_steps
+                eta_seconds = remaining_steps / steps_per_sec if steps_per_sec > 0 else 0
+                eta_hours = eta_seconds / 3600
+                logger.info(f"(step={train_steps:07d}) Train Loss: {avg_loss:.4f}, Train Steps/Sec: {steps_per_sec:.2f}, Samples/Sec: {samples_per_sec:.2f}")
+                logger.info(f"Progress: {progress_pct:.2f}% | ETA: {eta_hours:.1f}h")
+                running_loss = 0
+                log_steps = 0
+                start_time = time()
+            # Save DiT checkpoint:
+            if train_steps % args.ckpt_every == 0 and train_steps > 0:
+                if rank == 0:
+                    checkpoint = {
+                        "model": model.module.state_dict(),
+                        "ema": ema.state_dict(),
+                        "opt": opt.state_dict(),
+                        "args": args,
+                        "epoch": epoch,
+                        "train_steps": train_steps
+                    }
+                    if bfloat_enable:
+                        checkpoint.update({"scaler": scaler.state_dict()})
+                    checkpoint_path = f"{checkpoint_dir}/latest.pth.tar"
+                    torch.save(checkpoint, checkpoint_path)
+                    if train_steps % (10*args.ckpt_every) == 0 and train_steps > 0:
+                        checkpoint_path = f"{checkpoint_dir}/{train_steps:07d}.pth.tar"
+                        torch.save(checkpoint, checkpoint_path)
+                    logger.info(f"Saved checkpoint to {checkpoint_path}")
+            if train_steps % args.eval_every == 0 and train_steps > 0:
+                eval_start_time = time()
+                save_dir = os.path.join(experiment_dir, str(train_steps))
+                save_dir_train = os.path.join(experiment_dir, f"{train_steps}_train")
+                evaluate(ema, tokenizer, diffusion, test_dataset, rank, config["batch_size"], config["num_workers"], latent_size, device, save_dir_train, args.global_seed, bfloat_enable, num_cond, config["sample_rate"], config["input_sr"], logger)
+                dist.barrier()
+                eval_end_time = time()
+                eval_time = eval_end_time - eval_start_time
+    model.eval()  # important! This disables randomized embedding dropout
+    # do any sampling/FID calculation/etc. with ema (or model) in eval mode ...
+    logger.info("Done!")
+    cleanup()
+def denormalize_dis(ndata: float, min_v=-20.0, max_v=20.0, scale=0.15):
+    n01 = (float(ndata) + 1.0) / 2.0
+    raw = n01 * (max_v - min_v) + min_v
+    return raw * scale
+@torch.no_grad()
+def evaluate(model, vae, diffusion, test_dataloaders, rank, batch_size, num_workers, latent_size, device, save_dir, seed, bfloat_enable, num_cond, sample_rate, input_sr, logger):
+    sampler = DistributedSampler(
+        test_dataloaders,
+        num_replicas=dist.get_world_size(),
+        rank=rank,
+        shuffle=True,
+        seed=seed
+    )
+    loader = DataLoader(
+        test_dataloaders,
+        batch_size=batch_size,
+        shuffle=False,
+        sampler=sampler,
+        num_workers=num_workers,
+        pin_memory=True,
+        drop_last=True
+    )
+    down_resampler = torchaudio.transforms.Resample(orig_freq=input_sr, new_freq=sample_rate, lowpass_filter_width=64).to(device, dtype=torch.bfloat16) # [RESAMPLE]
+    mel_tf = build_mel_transform(
+        sample_rate=sample_rate,
+        n_fft=1024, win_length=1024, hop_length=256,
+        n_mels=80, power=1.0,
+        device=device,
+    )
+    # Run for 1 step
+    for _, x, y, diff, rel_t, x_orig in loader:
+        x = x.to(device)
+        y = y.to(device)
+        diff = diff.to(device).flatten(0, 1) # [REWARD]
+        rel_t = rel_t.to(device).flatten(0, 1)
+        x_orig = x_orig.to(device)
+        with torch.amp.autocast('cuda', enabled=True, dtype=torch.bfloat16):
+            B, T = x.shape[:2]
+            num_goals = T - num_cond
+            samples, diff_pred = model_forward_wrapper_a((model, diffusion, vae), x, y, num_timesteps=None, latent_size=latent_size, device=device, num_cond=num_cond, num_goals=num_goals, rel_t=rel_t)
+            decoded = down_resampler(samples)
+            x_start_pixels = x_orig[:, num_cond:].flatten(0, 1)
+            x_cond_pixels = x_orig[:, :num_cond].unsqueeze(1).expand(B, num_goals, num_cond, x_orig.shape[2], x_orig.shape[3]).flatten(0, 1)
+        break
+    if rank == 0:
+        os.makedirs(save_dir, exist_ok=True)
+        num_save = min(samples.shape[0], 10)
+        if diff is not None: # [REWARD]
+            mae = torch.mean(torch.abs(diff_pred - diff))
+            logger.info(f"Distance Diff MAE = {mae.item():.6f}")
+        mel_cosine_ls=[]
+        for i in range(num_save):
+            mel_cos = mel_cosine_stereo(x_start_pixels[i], decoded[i], sample_rate=sample_rate, mel_tf=mel_tf)
+            mel_cosine_ls.append(mel_cos)
+            ok = save_ref_hat_spectrogram_panel(
+                x_start_pixels[i], decoded[i],
+                out_path=f"{save_dir}/{i}_spectrograms.png",
+                n_fft=512, hop_length=160, win_length=400, pool=4,
+                title="gt vs pred"
+            )
+            torchaudio.save(f"{save_dir}/{i}_gen.wav", decoded[i].cpu().to(torch.float32), sample_rate=sample_rate)
+            torchaudio.save(f"{save_dir}/{i}_gt.wav", x_start_pixels[i].cpu().to(torch.float32), sample_rate=sample_rate)
+            torchaudio.save(f"{save_dir}/{i}_cond.wav", x_cond_pixels[i, -1].cpu().to(torch.float32), sample_rate=sample_rate)
+        logger.info("the first 10  mel cosine: " + ", ".join(f"{v:.6f}" for v in mel_cosine_ls))
+def get_args_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, required=True)
+    parser.add_argument("--epochs", type=int, default=300)
+    parser.add_argument("--global-seed", type=int, default=0)
+    parser.add_argument("--log-every", type=int, default=100)
+    parser.add_argument("--ckpt-every", type=int, default=2000)
+    parser.add_argument("--eval-every", type=int, default=5000)
+    parser.add_argument("--bfloat16", type=int, default=1)
+    parser.add_argument("--torch-compile", type=int, default=1)
+    parser.add_argument("--restart-from-checkpoint", type=int, default=0,
+                    help="If 1, only load model weights and reset epoch/step to zero (cold start)")
+    return parser
+# Script entry point: parse the command-line options and start training.
+if __name__ == "__main__":
+    args = get_args_parser().parse_args()
+    main(args)

train_avwm_stage3.py ADDED Viewed

	@@ -0,0 +1,532 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+# --------------------------------------------------------
+# References:
+# NoMaD, GNM, ViNT: https://github.com/robodhruv/visualnav-transformer
+# --------------------------------------------------------
+from inference_avwm import model_forward_wrapper_av
+import torch
+# the first flag below was False when we tested this script but True makes A100 training a lot faster:
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+import matplotlib
+matplotlib.use('Agg')
+from collections import OrderedDict
+from copy import deepcopy
+from time import time
+import argparse
+import logging
+import os
+import matplotlib.pyplot as plt
+import yaml
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.data import DataLoader, ConcatDataset
+from torch.utils.data.distributed import DistributedSampler
+from diffusers.models import AutoencoderKL
+from distributed import init_distributed
+from models import AVCDiT_models
+from diffusion import create_diffusion
+from datasets import TrainingDataset
+from misc import transform
+from soundstream import SoundStream
+import torchaudio
+from eval_audio import build_mel_transform, mel_cosine_stereo, drms_avg_db_stereo, save_ref_hat_spectrogram_panel
+#################################################################################
+#                             Training Helper Functions                         #
+#################################################################################
+def load_checkpoint_if_available(model, ema, opt, scaler, config, device, logger, args):
+    start_epoch = 0
+    train_steps = 0
+    latest_path = os.path.join(config['results_dir'], config['run_name'], "checkpoints", "latest.pth.tar")
+    if os.path.isfile(latest_path) or config.get('from_checkpoint', 0):
+        latest_path = latest_path if os.path.isfile(latest_path) else config.get('from_checkpoint', 0)
+        print("Loading model from ", latest_path)
+        checkpoint = torch.load(latest_path, map_location=f"cuda:{device}", weights_only=False)
+        ema_ckp = {k.replace('_orig_mod.', ''): v for k, v in checkpoint["ema"].items()}
+        model.load_state_dict(ema_ckp, strict=False)
+        print("Model weights loaded.")
+        ema.load_state_dict(ema_ckp, strict=False)
+        print("EMA weights loaded.")
+        if args.restart_from_checkpoint:
+            logger.info("Restarting training: epoch and step counters set to 0.")
+        else:
+            if "opt" in checkpoint:
+                opt_ckp = {k.replace('_orig_mod.', ''): v for k, v in checkpoint["opt"].items()}
+                opt.load_state_dict(opt_ckp)
+                print("Optimizer state loaded.")
+            if "scaler" in checkpoint and scaler is not None:
+                scaler.load_state_dict(checkpoint["scaler"])
+                print("GradScaler state loaded.")
+            if "epoch" in checkpoint:
+                start_epoch = checkpoint["epoch"] + 1
+            if "train_steps" in checkpoint:
+                train_steps = checkpoint["train_steps"]
+            logger.info(f"Resuming from epoch {start_epoch}, step {train_steps}")
+    return start_epoch, train_steps
+@torch.no_grad()
+def update_ema(ema_model, model, decay=0.9999):
+    """
+    Step the EMA model towards the current model.
+    """
+    ema_params = OrderedDict(ema_model.named_parameters())
+    model_params = OrderedDict(model.named_parameters())
+    for name, param in model_params.items():
+        name = name.replace('_orig_mod.', '')
+        ema_params[name].mul_(decay).add_(param.data, alpha=1 - decay)
+def requires_grad(model, flag=True):
+    """
+    Set requires_grad flag for all parameters in a model.
+    """
+    for p in model.parameters():
+        p.requires_grad = flag
+def cleanup():
+    """
+    End DDP training by destroying the torch.distributed process group.
+    """
+    dist.destroy_process_group()
+def create_logger(logging_dir):
+    """
+    Create a logger that writes to a log file and stdout.
+    """
+    if dist.get_rank() == 0:  # real logger
+        logging.basicConfig(
+            level=logging.INFO,
+            format='[\033[34m%(asctime)s\033[0m] %(message)s',
+            datefmt='%Y-%m-%d %H:%M:%S',
+            handlers=[logging.StreamHandler(), logging.FileHandler(f"{logging_dir}/log.txt")]
+        )
+        logger = logging.getLogger(__name__)
+    else:  # dummy logger (does nothing)
+        logger = logging.getLogger(__name__)
+        logger.addHandler(logging.NullHandler())
+    return logger
+#################################################################################
+#                                  Training Loop                                #
+#################################################################################
+def main(args):
+    """
+    Trains a new AVCDiT model.
+    """
+    assert torch.cuda.is_available(), "Training currently requires at least one GPU."
+    # Setup DDP:
+    _, rank, device, _ = init_distributed()
+    # rank = dist.get_rank()
+    seed = args.global_seed * dist.get_world_size() + rank
+    torch.manual_seed(seed)
+    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")
+    with open("config/eval_config.yaml", "r") as f:
+        default_config = yaml.safe_load(f)
+    config = default_config
+    with open(args.config, "r") as f:
+        user_config = yaml.safe_load(f)
+    config.update(user_config)
+    # Setup an experiment folder:
+    os.makedirs(config['results_dir'], exist_ok=True)  # Make results folder (holds all experiment subfolders)
+    experiment_dir = f"{config['results_dir']}/{config['run_name']}"  # Create an experiment folder
+    checkpoint_dir = f"{experiment_dir}/checkpoints"  # Stores saved model checkpoints
+    if rank == 0:
+        os.makedirs(checkpoint_dir, exist_ok=True)
+        logger = create_logger(experiment_dir)
+        logger.info(f"Experiment directory created at {experiment_dir}")
+    else:
+        logger = create_logger(None)
+    # Create model:
+    tokenizer_v = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-ema").to(device)
+    tokenizer_a = SoundStream(C=32, D=16, n_q=8, codebook_size=1024).to(device)
+    tokenizer_a_path=config["tokenizer_a_path"]
+    tokenizer_a_checkpoint = torch.load(tokenizer_a_path, map_location=f"cuda:{device}")
+    tokenizer_a.load_state_dict(tokenizer_a_checkpoint["model_state"])
+    tokenizer_a.eval()
+    latent_size = config['image_size'] // 8
+    assert config['image_size'] % 8 == 0, "Image size must be divisible by 8 (for the VAE encoder)."
+    num_cond = config['context_size']
+    model = AVCDiT_models[config['model']](context_size=num_cond, input_size=latent_size, in_channels=4).to(device)
+    ema = deepcopy(model).to(device)  # Create an EMA of the model for use after training
+    requires_grad(ema, False)
+    # Setup optimizer (we used default Adam betas=(0.9, 0.999) and a constant learning rate of 1e-4 in our paper):
+    lr = float(config.get('lr', 1e-4))
+    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0)
+    bfloat_enable = bool(hasattr(args, 'bfloat16') and args.bfloat16)
+    if bfloat_enable:
+        scaler = torch.amp.GradScaler()
+    start_epoch, train_steps = load_checkpoint_if_available(
+        model, ema, opt, scaler if bfloat_enable else None, config, device, logger, args
+    )
+    # ~40% speedup but might leads to worse performance depending on pytorch version
+    if args.torch_compile:
+        model = torch.compile(model)
+    model = DDP(model, device_ids=[device])
+    diffusion = create_diffusion(timestep_respacing="", dual=True)  # default: 1000 steps, linear noise schedule
+    # ,predict_xstart=True
+    logger.info(f"AVCDiT Parameters: {sum(p.numel() for p in model.parameters()):,}")
+    train_dataset = []
+    test_dataset = []
+    for dataset_name in config["datasets"]:
+        data_config = config["datasets"][dataset_name]
+        for data_split_type in ["train", "test"]:
+            if data_split_type in data_config:
+                    goals_per_obs = int(data_config["goals_per_obs"])
+                    if data_split_type == 'test':
+                        goals_per_obs = 4 # standardize testing
+                    if "distance" in data_config:
+                        min_dist_cat=data_config["distance"]["min_dist_cat"]
+                        max_dist_cat=data_config["distance"]["max_dist_cat"]
+                    else:
+                        min_dist_cat=config["distance"]["min_dist_cat"]
+                        max_dist_cat=config["distance"]["max_dist_cat"]
+                    if "len_traj_pred" in data_config:
+                        len_traj_pred=data_config["len_traj_pred"]
+                    else:
+                        len_traj_pred=config["len_traj_pred"]
+                    dataset = TrainingDataset(
+                        data_folder=data_config["data_folder"],
+                        data_split_folder=data_config[data_split_type],
+                        dataset_name=dataset_name,
+                        image_size=config["image_size"],
+                        min_dist_cat=min_dist_cat,
+                        max_dist_cat=max_dist_cat,
+                        len_traj_pred=len_traj_pred,
+                        context_size=config["context_size"],
+                        normalize=config["normalize"],
+                        goals_per_obs=goals_per_obs,
+                        transform=transform,
+                        predefined_index=None,
+                        traj_stride=1,
+                        sample_rate=config["sample_rate"],
+                        # target_len=7840 #TODO
+                        input_sr=config["input_sr"],
+                        evaluate=(data_split_type=="test")
+                    )
+                    if data_split_type == "train":
+                        train_dataset.append(dataset)
+                    else:
+                        test_dataset.append(dataset)
+                    print(f"Dataset: {dataset_name} ({data_split_type}), size: {len(dataset)}")
+    # combine all the datasets from different robots
+    print(f"Combining {len(train_dataset)} datasets.")
+    train_dataset = ConcatDataset(train_dataset)
+    test_dataset = ConcatDataset(test_dataset)
+    sampler = DistributedSampler(
+        train_dataset,
+        num_replicas=dist.get_world_size(),
+        rank=rank,
+        shuffle=True,
+        seed=args.global_seed
+    )
+    loader = DataLoader(
+        train_dataset,
+        batch_size=config['batch_size'],
+        shuffle=False,
+        sampler=sampler,
+        num_workers=config['num_workers'],
+        pin_memory=True,
+        drop_last=True,
+        persistent_workers=True
+    )
+    logger.info(f"Dataset contains {len(train_dataset):,} images")
+    # Prepare models for training:
+    model.train()  # important! This enables embedding dropout for classifier-free guidance
+    ema.eval()  # EMA model should always be in eval mode
+    # Variables for monitoring/logging purposes:
+    log_steps = 0
+    running_loss = 0
+    start_time = time()
+    logger.info(f"Training for {args.epochs} epochs...")
+    for epoch in range(start_epoch, args.epochs):
+        sampler.set_epoch(epoch)
+        steps_per_epoch = len(loader)
+        if rank == 0:
+            logger.info(f"Epoch {epoch} contains {steps_per_epoch} steps.")
+        logger.info(f"Beginning epoch {epoch}...")
+        for x_v, x_a, y, diff, rel_t in loader:
+            x_v = x_v.to(device, non_blocking=True)
+            x_a = x_a.to(device, non_blocking=True)
+            y = y.to(device, non_blocking=True)
+            diff = diff.to(device, non_blocking=True)
+            rel_t = rel_t.to(device, non_blocking=True)
+            with torch.amp.autocast('cuda', enabled=bfloat_enable, dtype=torch.bfloat16):
+                with torch.no_grad():
+                    # Map input images to latent space + normalize latents:
+                    B, T = x_v.shape[:2]
+                    #=== vision observation encoding
+                    x_v = x_v.flatten(0,1)
+                    x_v = tokenizer_v.encode(x_v).latent_dist.sample().mul_(0.18215)
+                    x_v = x_v.unflatten(0, (B, T))
+                    #=== audio observation encoding
+                    x_a = x_a.flatten(0,1)
+                    x_a = tokenizer_a.encoder(x_a)
+                    x_a = x_a.unflatten(0, (B, T))
+                num_goals = T - num_cond
+                #=== split into target and condition
+                x_v_start = x_v[:, num_cond:].flatten(0, 1)
+                x_v_cond = x_v[:, :num_cond].unsqueeze(1).expand(B, num_goals, num_cond, x_v.shape[2], x_v.shape[3], x_v.shape[4]).flatten(0, 1)
+                x_a_start = x_a[:, num_cond:].flatten(0, 1)
+                x_a_cond = x_a[:, :num_cond].unsqueeze(1).expand(B, num_goals, num_cond, x_a.shape[2], x_a.shape[3]).flatten(0, 1)
+                #===
+                y = y.flatten(0, 1)
+                rel_t = rel_t.flatten(0, 1)
+                diff = diff.flatten(0, 1)                       # [N, 1]
+                diff_tok = diff.unsqueeze(1).expand(-1, 16, -1)  # [N, 64, 1]
+                x_a_start = torch.cat([x_a_start, diff_tok], dim=2)  # [N, 64, 181]
+                t = torch.randint(0, diffusion.num_timesteps, (x_v_start.shape[0],), device=device)
+                model_kwargs = dict(y=y, x_v_cond=x_v_cond, x_a_cond=x_a_cond, rel_t=rel_t)
+                loss_dict = diffusion.training_losses(model, x_v_start, x_a_start, t, model_kwargs)
+                loss = loss_dict["loss"].mean()
+            if not bfloat_enable:
+                opt.zero_grad()
+                loss.backward()
+                opt.step()
+            else:
+                scaler.scale(loss).backward()
+                if config.get('grad_clip_val', 0) > 0:
+                    scaler.unscale_(opt)
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=config['grad_clip_val'])
+                scaler.step(opt)
+                scaler.update()
+            update_ema(ema, model.module)
+            # Log loss values:
+            running_loss += loss.detach().item()
+            log_steps += 1
+            train_steps += 1
+            if train_steps % args.log_every == 0:
+                # Measure training speed:
+                torch.cuda.synchronize()
+                end_time = time()
+                steps_per_sec = log_steps / (end_time - start_time)
+                samples_per_sec = dist.get_world_size()*x_v_cond.shape[0]*steps_per_sec
+                # Reduce loss history over all processes:
+                avg_loss = torch.tensor(running_loss / log_steps, device=device)
+                dist.all_reduce(avg_loss, op=dist.ReduceOp.SUM)
+                avg_loss = avg_loss.item() / dist.get_world_size()
+                total_steps = len(loader) * args.epochs
+                progress_pct = train_steps / total_steps * 100
+                remaining_steps = total_steps - train_steps
+                eta_seconds = remaining_steps / steps_per_sec if steps_per_sec > 0 else 0
+                eta_hours = eta_seconds / 3600
+                logger.info(f"(step={train_steps:07d}) Train Loss: {avg_loss:.4f}, Train Steps/Sec: {steps_per_sec:.2f}, Samples/Sec: {samples_per_sec:.2f}")
+                logger.info(f"Progress: {progress_pct:.2f}% | ETA: {eta_hours:.1f}h")
+                # Reset monitoring variables:
+                running_loss = 0
+                log_steps = 0
+                start_time = time()
+            # Save DiT checkpoint:
+            if train_steps % args.ckpt_every == 0 and train_steps > 0:
+                if rank == 0:
+                    checkpoint = {
+                        "model": model.module.state_dict(),
+                        "ema": ema.state_dict(),
+                        "opt": opt.state_dict(),
+                        "args": args,
+                        "epoch": epoch,
+                        "train_steps": train_steps
+                    }
+                    if bfloat_enable:
+                        checkpoint.update({"scaler": scaler.state_dict()})
+                    checkpoint_path = f"{checkpoint_dir}/latest.pth.tar"
+                    torch.save(checkpoint, checkpoint_path)
+                    if train_steps % (10*args.ckpt_every) == 0 and train_steps > 0:
+                        checkpoint_path = f"{checkpoint_dir}/{train_steps:07d}.pth.tar"
+                        torch.save(checkpoint, checkpoint_path)
+                    logger.info(f"Saved checkpoint to {checkpoint_path}")
+            if train_steps % args.eval_every == 0 and train_steps > 0:
+                eval_start_time = time()
+                # validation / test set evaluation
+                save_dir = os.path.join(experiment_dir, str(train_steps))
+                sim_score_val = evaluate(ema, tokenizer_v, tokenizer_a, diffusion, test_dataset, rank, config["batch_size"], config["num_workers"], latent_size, device, save_dir, args.global_seed, bfloat_enable, num_cond, config["sample_rate"], config["input_sr"], logger)
+                dist.barrier()
+                eval_end_time = time()
+                eval_time = eval_end_time - eval_start_time
+                # logger.info(f"(step={train_steps:07d}) Val Perceptual Loss: {sim_score_val:.4f}, Train Perceptual Loss: {sim_score_train:.4f}, Eval Time: {eval_time:.2f}")
+                logger.info(f"(step={train_steps:07d}) Val Perceptual Loss: {sim_score_val:.4f}, Eval Time: {eval_time:.2f}")
+    model.eval()  # important! This disables randomized embedding dropout
+    # do any sampling/FID calculation/etc. with ema (or model) in eval mode ...
+    logger.info("Done!")
+    cleanup()
+def denormalize_dis(ndata: float, min_v=-20.0, max_v=20.0, scale=0.15):
+    n01 = (float(ndata) + 1.0) / 2.0
+    raw = n01 * (max_v - min_v) + min_v
+    return raw * scale
+@torch.no_grad
+def evaluate(model, vae, sstream, diffusion, test_dataloaders, rank, batch_size, num_workers, latent_size, device, save_dir, seed, bfloat_enable, num_cond, sample_rate, input_sr, logger):
+    sampler = DistributedSampler(
+        test_dataloaders,
+        num_replicas=dist.get_world_size(),
+        rank=rank,
+        shuffle=True,
+        seed=seed
+    )
+    loader = DataLoader(
+        test_dataloaders,
+        batch_size=batch_size,
+        shuffle=False,
+        sampler=sampler,
+        num_workers=num_workers,
+        pin_memory=True,
+        drop_last=True
+    )
+    from dreamsim import dreamsim
+    eval_model, _ = dreamsim(pretrained=True)
+    score = torch.tensor(0.).to(device)
+    n_samples = torch.tensor(0).to(device)
+    down_resampler = torchaudio.transforms.Resample(orig_freq=input_sr, new_freq=sample_rate, lowpass_filter_width=64).to(device, dtype=torch.bfloat16)
+    mel_tf = build_mel_transform(
+        sample_rate=sample_rate,
+        n_fft=1024, win_length=1024, hop_length=256,
+        n_mels=80, power=1.0,
+        device=device,   # or ref.device
+    )
+    # Run for 1 step
+    for x_v, x_a, y, diff, rel_t, x_a_orig in loader:
+        x_v = x_v.to(device)
+        x_a = x_a.to(device)
+        x_a_orig = x_a_orig.to(device)
+        y = y.to(device)
+        diff = diff.to(device).flatten(0, 1)
+        rel_t = rel_t.to(device).flatten(0, 1)
+        with torch.amp.autocast('cuda', enabled=True, dtype=torch.bfloat16):
+            B, T = x_v.shape[:2]
+            num_goals = T - num_cond
+            samples_v, samples_a, diff_pred = model_forward_wrapper_av((model, diffusion, vae, sstream), (x_v, x_a), y, num_timesteps=None, latent_size=latent_size, device=device, num_cond=num_cond, num_goals=num_goals, rel_t=rel_t)
+            samples_a = down_resampler(samples_a) #
+            x_start_pixels = x_v[:, num_cond:].flatten(0, 1)
+            x_cond_pixels = x_v[:, :num_cond].unsqueeze(1).expand(B, num_goals, num_cond, x_v.shape[2], x_v.shape[3], x_v.shape[4]).flatten(0, 1)
+            samples_v = samples_v * 0.5 + 0.5
+            x_start_pixels = x_start_pixels * 0.5 + 0.5
+            x_cond_pixels = x_cond_pixels * 0.5 + 0.5
+            res = eval_model(x_start_pixels, samples_v)
+            score += res.sum()
+            n_samples += len(res)
+            # x_start_audio = x_a[:, num_cond:].flatten(0, 1)
+            # x_cond_audio = x_a[:, :num_cond].unsqueeze(1).expand(B, num_goals, num_cond, x_a.shape[2], x_a.shape[3]).flatten(0, 1)
+            x_start_audio = x_a_orig[:, num_cond:].flatten(0, 1)
+            x_cond_audio = x_a_orig[:, :num_cond].unsqueeze(1).expand(B, num_goals, num_cond, x_a_orig.shape[2], x_a_orig.shape[3]).flatten(0, 1)
+        break
+    if rank == 0:
+        os.makedirs(save_dir, exist_ok=True)
+        if diff is not None:
+            mae = torch.mean(torch.abs(diff_pred - diff))
+            logger.info(f"Distance Diff MAE = {mae.item():.6f}")
+        mel_cosine_ls=[]
+        for i in range(min(samples_v.shape[0], 10)):
+            _, ax = plt.subplots(1,3,dpi=256)
+            ax[0].imshow((x_cond_pixels[i, -1].permute(1,2,0).cpu().numpy()*255).astype('uint8'))
+            ax[1].imshow((x_start_pixels[i].permute(1,2,0).cpu().numpy()*255).astype('uint8'))
+            ax[2].imshow((samples_v[i].permute(1,2,0).cpu().float().numpy()*255).astype('uint8'))
+            plt.savefig(f'{save_dir}/{i}.png')
+            plt.close()
+            mel_cos = mel_cosine_stereo(x_start_audio[i], samples_a[i], sample_rate=sample_rate, mel_tf=mel_tf)
+            mel_cosine_ls.append(mel_cos)
+            ok = save_ref_hat_spectrogram_panel(
+                x_start_audio[i], samples_a[i],
+                out_path=f"{save_dir}/{i}_spectrograms.png",
+                n_fft=512, hop_length=160, win_length=400, pool=4,
+                title="gt vs pred"
+            )
+            # sr = int(16000 * 7840 / 2400) #TODO
+            torchaudio.save(f"{save_dir}/{i}_gen.wav", samples_a[i].cpu().to(torch.float32), sample_rate=sample_rate)
+            torchaudio.save(f"{save_dir}/{i}_gt.wav", x_start_audio[i].cpu().to(torch.float32), sample_rate=sample_rate)
+            torchaudio.save(f"{save_dir}/{i}_cond.wav", x_cond_audio[i, -1].cpu().to(torch.float32), sample_rate=sample_rate)
+        logger.info("the first 10  mel cosine: " + ", ".join(f"{v:.6f}" for v in mel_cosine_ls))
+    dist.all_reduce(score)
+    dist.all_reduce(n_samples)
+    sim_score = score/n_samples
+    return sim_score
+def get_args_parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, required=True)
+    parser.add_argument("--epochs", type=int, default=300)
+    parser.add_argument("--global-seed", type=int, default=0)
+    parser.add_argument("--log-every", type=int, default=100)
+    parser.add_argument("--ckpt-every", type=int, default=2000)
+    parser.add_argument("--eval-every", type=int, default=5000)
+    parser.add_argument("--bfloat16", type=int, default=1)
+    parser.add_argument("--torch-compile", type=int, default=1)
+    parser.add_argument("--restart-from-checkpoint", type=int, default=0,
+                    help="If 1, only load model weights and reset epoch/step to zero (cold start)")
+    return parser
+# Script entry point: parse the command-line options and start training.
+if __name__ == "__main__":
+    args = get_args_parser().parse_args()
+    main(args)