# File size: 24,292 Bytes
#
# 705a8fd

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from distributed import init_distributed
import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

import yaml
import argparse
import os
import numpy as np

from diffusion import create_diffusion
from diffusers.models import AutoencoderKL

import misc
import distributed as dist
from models import AVCDiT_models
from datasets import EvalDataset
from PIL import Image
from soundstream import SoundStream
import torchaudio
from skimage.measure import block_reduce

import matplotlib.pyplot as plt
import librosa
import time
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from collections import defaultdict
import json

def save_image(output_file, img, unnormalize_img):
    """Write one channel-first image tensor to ``output_file`` as 8-bit RGB.

    Args:
        output_file: Destination path passed to ``PIL.Image.Image.save``.
        img: Image tensor indexed (channel, height, width); it is permuted to
            (height, width, channel) before being handed to PIL. Values are
            expected to land in [0, 1] once un-normalised.
        unnormalize_img: When truthy, ``misc.unnormalize`` is applied first.
    """
    img = img.detach().cpu()
    if unnormalize_img:
        img = misc.unnormalize(img)

    # Round and clamp before the uint8 cast. A bare float -> byte cast
    # truncates and lets slightly out-of-range values (e.g. 1.002 * 255)
    # wrap around, which shows up as speckle in saved frames. This mirrors
    # the conversion used for the comparison figures in this file.
    img = (img * 255.0).round().clamp(0, 255).to(torch.uint8)
    image = Image.fromarray(img.permute(1, 2, 0).numpy(), mode='RGB')

    image.save(output_file)
    
def save_audio(output_file, audio_tensor, sample_rate):
    """Write a waveform tensor to ``output_file`` through ``torchaudio.save``.

    Args:
        output_file: Destination path.
        audio_tensor: Waveform tensor. A 1-D tensor is given a leading axis of
            size one before saving; other ranks are passed through unchanged.
        sample_rate: Sample rate forwarded to ``torchaudio.save``.
    """
    waveform = audio_tensor.detach().cpu()
    # Promote a bare 1-D signal to 2-D so the saver always sees a leading axis.
    waveform = waveform.unsqueeze(0) if waveform.ndim == 1 else waveform
    # Cast to float32 regardless of the incoming dtype.
    torchaudio.save(output_file, waveform.to(torch.float32), sample_rate)
    
def get_dataset_eval(config, dataset_name, eval_type, predefined_index=True):
    """Build the ``EvalDataset`` for one named evaluation dataset.

    Args:
        config: Evaluation configuration mapping. The keys read here are
            ``eval_datasets[dataset_name]`` (``data_folder``, ``test``),
            ``image_size``, ``eval_distance`` (``eval_min_dist_cat``,
            ``eval_max_dist_cat``), ``eval_len_traj_pred``, ``traj_stride``,
            ``eval_context_size`` and ``normalize``.
        dataset_name: Key into ``config["eval_datasets"]``; also used in the
            predefined index path.
        eval_type: Evaluation mode name; only used to pick the index file.
        predefined_index: When truthy, the dataset is pointed at
            ``data_splits/<dataset_name>/test/<eval_type>.pkl``; otherwise no
            predefined index is passed.

    Returns:
        The constructed ``EvalDataset``.
    """
    data_config = config["eval_datasets"][dataset_name]
    index_path = (
        f"data_splits/{dataset_name}/test/{eval_type}.pkl"
        if predefined_index
        else None
    )

    return EvalDataset(
        data_folder=data_config["data_folder"],
        data_split_folder=data_config["test"],
        dataset_name=dataset_name,
        image_size=config["image_size"],
        min_dist_cat=config["eval_distance"]["eval_min_dist_cat"],
        max_dist_cat=config["eval_distance"]["eval_max_dist_cat"],
        len_traj_pred=config["eval_len_traj_pred"],
        traj_stride=config["traj_stride"],
        context_size=config["eval_context_size"],
        normalize=config["normalize"],
        transform=misc.transform,
        goals_per_obs=4,  # fixed here, not read from config
        predefined_index=index_path,
        traj_names='traj_names.txt',
    )


@torch.no_grad()
def model_forward_wrapper_v(all_models, curr_obs, curr_delta, num_timesteps, latent_size, device, num_cond, num_goals=1, rel_t=None, progress=False):
    """Sample frames from the video-only diffusion model and decode them to pixels.

    Args:
        all_models: ``(model, diffusion, vae)`` triple.
        curr_obs: Context frames whose first two axes are (batch, time); they
            are flattened together before VAE encoding.
        curr_delta: Conditioning actions; the first two axes are flattened and
            passed to the model as ``y``.
        num_timesteps: Multiplier applied to the default ``rel_t`` (1/128 per
            batch element). Ignored when ``rel_t`` is given.
        latent_size: Spatial side length of the sampled latent noise.
        device: Device on which everything runs.
        num_cond: Number of leading context frames used as conditioning.
        num_goals: How many samples to draw per batch element.
        rel_t: Optional precomputed relative-time tensor.
        progress: Forwarded to ``diffusion.p_sample_loop``.

    Returns:
        Decoded samples clipped to [-1, 1].
    """
    model, diffusion, vae = all_models
    frames = curr_obs.to(device)
    actions = curr_delta.to(device)

    with torch.amp.autocast('cuda', enabled=True, dtype=torch.bfloat16):
        batch, n_frames = frames.shape[:2]

        if rel_t is None:
            rel_t = torch.full((batch,), 1. / 128.).to(device)
            rel_t *= num_timesteps

        # Encode all frames in one VAE pass, then restore (batch, time).
        posterior = vae.encode(frames.flatten(0, 1)).latent_dist
        latents = posterior.sample().mul_(0.18215).unflatten(0, (batch, n_frames))

        # Repeat the conditioning latents once per requested goal.
        context = latents[:, :num_cond].unsqueeze(1)
        context = context.expand(batch, num_goals, num_cond, *latents.shape[2:]).flatten(0, 1)

        noise = torch.randn(batch * num_goals, 4, latent_size, latent_size, device=device)
        model_kwargs = {"y": actions.flatten(0, 1), "x_cond": context, "rel_t": rel_t}
        denoised = diffusion.p_sample_loop(
            model.forward,
            noise.shape,
            noise,
            clip_denoised=False,
            model_kwargs=model_kwargs,
            progress=progress,
            device=device,
        )
        # Undo the latent scaling applied after encoding before decoding.
        decoded = vae.decode(denoised / 0.18215).sample

        return torch.clip(decoded, -1., 1.)


@torch.no_grad()
def model_forward_wrapper_a(all_models, curr_obs, curr_delta, num_timesteps, latent_size, device, num_cond, num_goals=1, rel_t=None, progress=False):
    """Sample audio latents from the audio-only diffusion model and decode them.

    Args:
        all_models: ``(model, diffusion, sstream)`` triple; ``sstream`` must
            expose ``encoder``, ``quantizer`` and ``decoder``.
        curr_obs: Context audio whose first two axes are (batch, time).
        curr_delta: Conditioning actions; the first two axes are flattened and
            passed to the model as ``y``.
        num_timesteps: Multiplier applied to the default ``rel_t`` (1/128 per
            batch element). Ignored when ``rel_t`` is given.
        latent_size: Not used by this wrapper; the noise shape is fixed at
            (16, 181) per sample.
        device: Device on which everything runs.
        num_cond: Number of leading context clips used as conditioning.
        num_goals: How many samples to draw per batch element.
        rel_t: Optional precomputed relative-time tensor.
        progress: Forwarded to ``diffusion.p_sample_loop``.

    Returns:
        ``(audio, diff_pred)``: the decoder output, and the channel-mean of
        the last latent column with both reduced axes kept.
    """
    model, diffusion, sstream = all_models
    clips = curr_obs.to(device)
    actions = curr_delta.to(device)
    with torch.amp.autocast('cuda', enabled=True, dtype=torch.bfloat16):
        batch, n_clips = clips.shape[:2]
        if rel_t is None:
            rel_t = torch.full((batch,), 1. / 128.).to(device)
            rel_t *= num_timesteps

        # Encode every clip at once, then restore (batch, time).
        latents = sstream.encoder(clips.flatten(0, 1)).unflatten(0, (batch, n_clips))
        context = latents[:, :num_cond].unsqueeze(1)
        context = context.expand(batch, num_goals, num_cond, latents.shape[2], latents.shape[3]).flatten(0, 1)

        noise = torch.randn(batch * num_goals, 16, 181, device=device)
        model_kwargs = {"y": actions.flatten(0, 1), "x_cond": context, "rel_t": rel_t}
        denoised = diffusion.p_sample_loop(
            model.forward,
            noise.shape,
            noise,
            clip_denoised=False,
            model_kwargs=model_kwargs,
            progress=progress,
            device=device,
        )

        # REWARD TOKEN: the final column along the last axis is split off and
        # averaged over dim 1. Assuming the sampler returns a tensor shaped
        # like the noise, that is [N, 16, 1] -> [N, 1, 1].
        reward_column = denoised[..., -1:]
        diff_pred = reward_column.mean(dim=1, keepdim=True)

        # AUDIO TOKENS: the remaining columns are quantised and decoded. The
        # quantiser and decoder use opposite axis orders, hence the permutes.
        audio_latents = denoised[..., :-1]
        quantized, _, _ = sstream.quantizer(audio_latents.permute(0, 2, 1))
        audio = sstream.decoder(quantized.permute(0, 2, 1))
        return audio, diff_pred
    

@torch.no_grad()
def model_forward_wrapper_av(all_models, curr_obs, curr_delta, num_timesteps, latent_size, device, num_cond, num_goals=1, rel_t=None, progress=False):
    """Jointly sample video and audio from the dual diffusion model and decode both.

    Args:
        all_models: ``(model, diffusion, vae, sstream)`` quadruple.
        curr_obs: ``(video, audio)`` pair of context tensors, each with
            leading (batch, time) axes.
        curr_delta: Conditioning actions; the first two axes are flattened and
            passed to the model as ``y``.
        num_timesteps: Multiplier applied to the default ``rel_t`` (1/128 per
            batch element). Ignored when ``rel_t`` is given.
        latent_size: Spatial side length of the video latent noise.
        device: Device on which everything runs.
        num_cond: Number of leading context steps used as conditioning.
        num_goals: How many samples to draw per batch element.
        rel_t: Optional precomputed relative-time tensor.
        progress: Forwarded to ``diffusion.p_sample_loop``.

    Returns:
        ``(video, audio, diff_pred)``: decoded frames clipped to [-1, 1], the
        audio decoder output, and the channel-mean of the last audio-latent
        column with both reduced axes kept.
    """
    model, diffusion, vae, sstream = all_models
    video_ctx, audio_ctx = curr_obs
    video_ctx = video_ctx.to(device)
    audio_ctx = audio_ctx.to(device)
    actions = curr_delta.to(device)
    with torch.amp.autocast('cuda', enabled=True, dtype=torch.bfloat16):
        n_video = video_ctx.shape[:2][1]
        # The batch size is taken from the audio tensor.
        batch, n_audio = audio_ctx.shape[:2]

        if rel_t is None:
            rel_t = torch.full((batch,), 1. / 128.).to(device)
            rel_t *= num_timesteps

        video_flat = video_ctx.flatten(0, 1)
        audio_flat = audio_ctx.flatten(0, 1)
        video_lat = vae.encode(video_flat).latent_dist.sample().mul_(0.18215).unflatten(0, (batch, n_video))
        audio_lat = sstream.encoder(audio_flat).unflatten(0, (batch, n_audio))

        # Repeat each modality's conditioning latents once per goal.
        video_cond = video_lat[:, :num_cond].unsqueeze(1)
        video_cond = video_cond.expand(
            batch, num_goals, num_cond, video_lat.shape[2], video_lat.shape[3], video_lat.shape[4]
        ).flatten(0, 1)
        audio_cond = audio_lat[:, :num_cond].unsqueeze(1)
        audio_cond = audio_cond.expand(
            batch, num_goals, num_cond, audio_lat.shape[2], audio_lat.shape[3]
        ).flatten(0, 1)

        noise_v = torch.randn(batch * num_goals, 4, latent_size, latent_size, device=device)
        noise_a = torch.randn(batch * num_goals, 16, 181, device=device)  # TODO: audio latent shape is hard-coded
        model_kwargs = {
            "y": actions.flatten(0, 1),
            "x_v_cond": video_cond,
            "x_a_cond": audio_cond,
            "rel_t": rel_t,
        }
        out_v, out_a = diffusion.p_sample_loop(
            model.forward,
            noise_v.shape,
            noise_a.shape,
            noise_v,
            noise_a,
            clip_denoised=False,
            model_kwargs=model_kwargs,
            progress=progress,
            device=device,
        )

        # The last audio column is split off and averaged over dim 1.
        # Assuming the sampler returns a tensor shaped like the noise, that
        # is [N, 16, 1] -> [N, 1, 1].
        reward_column = out_a[..., -1:]
        diff_pred = reward_column.mean(dim=1, keepdim=True)
        out_a = out_a[..., :-1]

        frames = vae.decode(out_v / 0.18215).sample
        # The quantiser and decoder use opposite axis orders, hence the permutes.
        quantized, _, _ = sstream.quantizer(out_a.permute(0, 2, 1))
        audio = sstream.decoder(quantized.permute(0, 2, 1))
        return torch.clip(frames, -1., 1.), audio, diff_pred


def generate_rollout(args, output_dir, rollout_frames, idxs, all_models, obs_av, gt_av, diffs_seq, delta, num_cond, device):
    """Roll the model forward one step at a time and save every step's outputs.

    At step ``i`` (saved as second ``i + 1``) the model is conditioned on the
    current context window and ``delta[:, i:i+1]``. The prediction is appended
    to the window and the oldest entry dropped, so later steps are conditioned
    on earlier predictions. With ``args.gt`` set, no model is run and the
    ground-truth frame, audio and value are written instead.

    Args:
        args: Parsed CLI namespace; ``gt`` and ``latent_size`` are read.
        output_dir: Root directory; one ``id_<sample_idx>`` folder per sample.
        rollout_frames: Number of leading ground-truth steps to roll over.
        idxs: Tensor of dataset sample indices for this batch.
        all_models: ``(model, diffusion, vae, sstream)`` for the AV wrapper.
        obs_av: ``(obs_image, obs_audio, orig_obs_audio)`` context tensors.
        gt_av: ``(gt_image, gt_audio, orig_gt_audio)`` target tensors.
        diffs_seq: Per-step normalised target values, indexed (batch, step, ...).
        delta: Per-step actions, indexed (batch, step, ...).
        num_cond: Number of conditioning steps passed to the wrapper.
        device: Device to run on.

    Side effects:
        Writes ``<sec>.png``, ``<sec>.wav``, ``compare_<sec>.png`` (prediction
        mode only) and ``distance.json`` into each sample folder.
    """
    obs_image, obs_audio, orig_obs_audio = obs_av
    gt_image, gt_audio, orig_gt_audio = gt_av

    gt_image = gt_image[:, :rollout_frames]
    gt_audio = gt_audio[:, :rollout_frames]  # truncated alongside the frames; not read below
    window_v = obs_image.to(device)
    window_a = obs_audio.to(device)
    down_resampler = torchaudio.transforms.Resample(orig_freq=48000, new_freq=16000, lowpass_filter_width=64).to(device, dtype=torch.bfloat16)
    per_sample = defaultdict(list)
    if args.gt:
        value_key = "denorm_gt"
    else:
        value_key = "denorm_pred"

    for step in range(gt_image.shape[1]):
        sec = step + 1
        step_delta = delta[:, step:sec].to(device)

        gt_frame = gt_image[:, step].to(device)
        gt_wave = orig_gt_audio[:, step].to(device)
        if args.gt:
            visualize_preds(output_dir, idxs, sec, gt_frame, gt_wave, 16000)
            step_values = denorm_from_tensor(diffs_seq[:, step:sec, :])  # [B]
        else:
            diff_gt = diffs_seq[:, step:sec, :].unsqueeze(1).to(device)
            pred_frame, pred_wave, diff_pred = model_forward_wrapper_av(
                all_models, (window_v, window_a), step_delta,
                num_timesteps=1, latent_size=args.latent_size, device=device,
                num_cond=num_cond, num_goals=1,
            )
            pred_wave_16k = down_resampler(pred_wave)
            # Slide both context windows: push the prediction, drop the oldest.
            window_v = torch.cat((window_v, pred_frame.unsqueeze(1)), dim=1)[:, 1:]
            window_a = torch.cat((window_a, pred_wave.unsqueeze(1)), dim=1)[:, 1:]
            step_values = denorm_from_tensor(diff_pred)   # [B]
            gt_values = denorm_from_tensor(diff_gt)       # [B]
            visualize_preds(output_dir, idxs, sec, pred_frame, pred_wave_16k, 16000)
            visualize_compare(output_dir, idxs, sec,
                              pred_frame, pred_wave_16k,
                              gt_frame, gt_wave,
                              denorm_pred_vals=step_values,
                              denorm_gt_vals=gt_values)

        # Record this step's value (ground truth or prediction) per sample.
        for b, sample_idx in enumerate(idxs.detach().view(-1).cpu().numpy()):
            per_sample[int(sample_idx)].append({"sec": int(sec), "value": float(step_values[b])})

    for sample_idx, rows in per_sample.items():
        sample_folder = os.path.join(output_dir, f"id_{sample_idx}")
        os.makedirs(sample_folder, exist_ok=True)
        out_json = os.path.join(sample_folder, "distance.json")
        ordered = sorted(rows, key=lambda r: r["sec"])
        compact = [{"sec": row["sec"], value_key: row["value"]} for row in ordered]
        with open(out_json, "w") as f:
            json.dump(compact, f, indent=2)


def generate_time(args, output_dir, idxs, all_models, obs_av, gt_av, diffs_seq, delta, secs, num_cond, device):
    """Predict directly at each requested horizon and save the outputs.

    For every ``sec`` in ``secs`` the actions ``delta[:, :sec]`` are summed
    into one conditioning step and the model is queried once from the original
    observation window with ``num_timesteps=sec``. With ``args.gt`` set, no
    model is run and the ground-truth frame, audio and summed value are
    written instead.

    Args:
        args: Parsed CLI namespace; ``gt`` and ``latent_size`` are read.
        output_dir: Root directory; one ``id_<sample_idx>`` folder per sample.
        idxs: Tensor of dataset sample indices for this batch.
        all_models: ``(model, diffusion, vae, sstream)`` for the AV wrapper.
        obs_av: ``(obs_image, obs_audio, _)`` context tensors.
        gt_av: ``(gt_image, _, orig_gt_audio)`` target tensors.
        diffs_seq: Per-step normalised target values, indexed (batch, step, ...).
        delta: Per-step actions, indexed (batch, step, ...).
        secs: Iterable of 1-based horizons to evaluate.
        num_cond: Number of conditioning steps passed to the wrapper.
        device: Device to run on.

    Raises:
        ValueError: If any horizon is below 1 or beyond the number of
            ground-truth steps available.

    Side effects:
        Writes ``<sec>.png``, ``<sec>.wav``, ``compare_<sec>.png`` (prediction
        mode only) and ``distance.json`` into each sample folder.
    """
    # Validate horizons before doing any work. sec <= 0 would otherwise index
    # gt[:, sec-1] from the end and sum an empty action slice, silently
    # producing a wrong comparison.
    secs = [int(sec) for sec in secs]
    non_positive = [sec for sec in secs if sec < 1]
    if non_positive:
        raise ValueError(f"secs must all be >= 1, got {non_positive}")

    (obs_image, obs_audio, _) = obs_av
    (gt_image, _, orig_gt_audio) = gt_av

    # Fail up front rather than with an IndexError after earlier horizons
    # have already been written to disk.
    max_sec = min(gt_image.shape[1], orig_gt_audio.shape[1])
    beyond = [sec for sec in secs if sec > max_sec]
    if beyond:
        raise ValueError(f"secs {beyond} exceed the {max_sec} available ground-truth steps")

    down_resampler = torchaudio.transforms.Resample(orig_freq=48000, new_freq=16000, lowpass_filter_width=64).to(device, dtype=torch.bfloat16)
    episode_records = defaultdict(list)  # {sample_idx: [{"sec": int, "value": float}, ...]}
    value_key = "denorm_gt" if args.gt else "denorm_pred"
    idxs_1d = idxs.detach().view(-1).cpu().numpy()  # same for every horizon

    for sec in secs:
        # Collapse the first `sec` actions into one conditioning step.
        curr_delta = delta[:, :sec].sum(dim=1, keepdim=True)
        x_gt_pixels = gt_image[:, sec-1].to(device)
        x_gt_audios_orig = orig_gt_audio[:, sec-1].to(device)
        if args.gt:
            values = denorm_from_tensor(diffs_seq[:, :sec, :].sum(dim=1, keepdim=True))  # [B]
            visualize_preds(output_dir, idxs, sec, x_gt_pixels, x_gt_audios_orig, 16000)
        else:
            diff_gt = diffs_seq[:, :sec, :].sum(dim=1, keepdim=True).to(device)

            print(obs_image.shape, obs_audio.shape, curr_delta.shape, obs_image.dtype, obs_audio.dtype, curr_delta.dtype)
            x_pred_pixels, x_pred_audios, diff_pred = model_forward_wrapper_av(all_models, (obs_image, obs_audio), curr_delta, sec, args.latent_size, num_cond=num_cond, num_goals=1, device=device)
            x_pred_audios_orig = down_resampler(x_pred_audios)
            values = denorm_from_tensor(diff_pred)           # [B]
            denorm_gt_vals = denorm_from_tensor(diff_gt)     # [B]

            visualize_preds(output_dir, idxs, sec, x_pred_pixels, x_pred_audios_orig, 16000)
            visualize_compare(output_dir, idxs, sec,
                              x_pred_pixels, x_pred_audios_orig,
                              x_gt_pixels,   x_gt_audios_orig,
                              denorm_pred_vals=values,
                              denorm_gt_vals=denorm_gt_vals)

        # Record this horizon's value (ground truth or prediction) per sample.
        for b, sample_idx in enumerate(idxs_1d):
            episode_records[int(sample_idx)].append({"sec": int(sec), "value": float(values[b])})

    for sample_idx, rows in episode_records.items():
        rows = sorted(rows, key=lambda r: r["sec"])
        sample_folder = os.path.join(output_dir, f"id_{sample_idx}")
        os.makedirs(sample_folder, exist_ok=True)
        out_json = os.path.join(sample_folder, "distance.json")
        compact = [{"sec": r["sec"], value_key: r["value"]} for r in rows]
        with open(out_json, "w") as f:
            json.dump(compact, f, indent=2)


def visualize_preds(output_dir, idxs, sec, x_pred_pixels, x_pred_audios, sample_rate):
    """Save one image and one audio clip per batch element.

    Files go to ``<output_dir>/id_<sample_idx>/<sec>.png`` and ``<sec>.wav``,
    where ``sample_idx`` comes from the flattened ``idxs`` tensor.

    Args:
        output_dir: Root directory for the per-sample folders.
        idxs: Tensor of dataset sample indices, one per batch element.
        sec: Label used as the file stem.
        x_pred_pixels: Batch of images; un-normalised before saving.
        x_pred_audios: Batch of waveforms.
        sample_rate: Sample rate written into the audio files.
    """
    sample_ids = idxs.detach().view(-1).tolist()
    for batch_idx, raw_id in enumerate(sample_ids):
        folder = os.path.join(output_dir, f'id_{int(raw_id)}')
        os.makedirs(folder, exist_ok=True)
        save_image(os.path.join(folder, f'{sec}.png'), x_pred_pixels[batch_idx], True)
        save_audio(os.path.join(folder, f'{sec}.wav'), x_pred_audios[batch_idx], sample_rate)

def _compute_binaural_spectrogram_np(audio_2ch: np.ndarray):
    """Return a log-magnitude spectrogram for the first two channels of ``audio_2ch``.

    Each of ``audio_2ch[0]`` and ``audio_2ch[1]`` goes through an STFT
    (n_fft=512, hop=160, window=400), its magnitude is mean-pooled over 4x4
    blocks, and ``log1p`` is applied. The two results are stacked on a new
    last axis, giving an array indexed (F, T, 2).
    """
    def _log_magnitude(channel):
        magnitude = np.abs(librosa.stft(channel, n_fft=512, hop_length=160, win_length=400))
        pooled = block_reduce(magnitude, block_size=(4, 4), func=np.mean)
        return np.log1p(pooled)

    # Index 0 is treated as left, index 1 as right.
    per_channel = [_log_magnitude(audio_2ch[ch]) for ch in (0, 1)]
    return np.stack(per_channel, axis=-1)  # (F,T,2)

def denorm_from_tensor(t: torch.Tensor, min_v=-20.0, max_v=20.0, scale=0.15) -> torch.Tensor:
    """Map the first value of each batch row from [-1, 1] back to scaled units.

    Each row of ``t`` is flattened and only its first element is used. That
    element is mapped linearly from [-1, 1] to [``min_v``, ``max_v``] and then
    multiplied by ``scale``.

    Args:
        t: Tensor whose leading axis is the batch; any trailing shape.
        min_v: Value that -1 maps to before scaling.
        max_v: Value that +1 maps to before scaling.
        scale: Final multiplier.

    Returns:
        1-D float32 tensor with one value per batch row.
    """
    # reshape (not view) so non-contiguous inputs such as permuted tensors
    # are accepted instead of raising.
    first = t.detach().float().reshape(t.shape[0], -1)[:, 0]
    unit = (first + 1.0) / 2.0
    raw = unit * (max_v - min_v) + min_v
    return raw * scale

def visualize_compare(output_dir, idxs, sec,
                    x_pred_pixels, x_pred_audios_orig,
                    x_gt_pixels,   x_gt_audios_orig,
                    denorm_pred_vals,
                    denorm_gt_vals):
    """Save a side-by-side prediction vs ground-truth figure per batch element.

    Each figure is a 2x4 grid written to
    ``<output_dir>/id_<sample_idx>/compare_<sec>.png``. The top row holds the
    predicted image, ground-truth image and the two left-channel
    spectrograms; the bottom row holds the two right-channel spectrograms,
    with its first two cells blank. Predicted and ground-truth spectrograms
    of the same channel share one colour range. The title shows the
    denormalised predicted and ground-truth values, or 0 when the
    corresponding argument is ``None``.

    Args:
        output_dir: Root directory for the per-sample folders.
        idxs: Tensor of dataset sample indices, one per batch element.
        sec: Label used in the file name and title.
        x_pred_pixels: Batch of predicted images (channel-first).
        x_pred_audios_orig: Batch of predicted two-channel waveforms.
        x_gt_pixels: Batch of ground-truth images (channel-first).
        x_gt_audios_orig: Batch of ground-truth two-channel waveforms.
        denorm_pred_vals: Per-sample predicted values, or ``None``.
        denorm_gt_vals: Per-sample ground-truth values, or ``None``.
    """
    def _as_uint8_hwc(frame: torch.Tensor):
        # Channel-first tensor -> un-normalised, rounded, clamped HWC uint8 array.
        frame = misc.unnormalize(frame.detach().cpu())
        frame = (frame * 255.0).round().clamp(0, 255)
        return frame.to(torch.uint8).permute(1, 2, 0).numpy()

    def _show_image(ax, picture, title):
        ax.imshow(picture)
        ax.set_title(title)
        ax.axis('off')

    def _show_spec(ax, plane, title, lo, hi):
        ax.imshow(plane, origin='lower', aspect='auto', vmin=lo, vmax=hi)
        ax.set_title(title)
        ax.set_xticks([])
        ax.set_yticks([])

    sample_ids = idxs.detach().view(-1).cpu().numpy()

    batch = x_pred_pixels.shape[0]
    assert x_gt_pixels.shape[0] == batch and x_pred_audios_orig.shape[0] == batch and x_gt_audios_orig.shape[0] == batch

    # (grid row, channel index, predicted title, ground-truth title)
    spec_rows = (
        (0, 0, 'pred spec (Left)', 'gt spec (Left)'),
        (1, 1, 'pred spec (Right)', 'gt spec (Right)'),
    )

    for b in range(batch):
        sample_idx = int(sample_ids[b])
        sample_folder = os.path.join(output_dir, f'id_{sample_idx}')
        os.makedirs(sample_folder, exist_ok=True)
        out_path = os.path.join(sample_folder, f'compare_{sec}.png')

        pred_img = _as_uint8_hwc(x_pred_pixels[b])
        gt_img = _as_uint8_hwc(x_gt_pixels[b])

        pred_aud = x_pred_audios_orig[b].detach().cpu().float().numpy()
        gt_aud = x_gt_audios_orig[b].detach().cpu().float().numpy()
        pred_spec = _compute_binaural_spectrogram_np(pred_aud)
        gt_spec = _compute_binaural_spectrogram_np(gt_aud)

        # One shared colour range per channel so the two panels are comparable.
        limits = {}
        for ch in (0, 1):
            lo = min(pred_spec[:, :, ch].min(), gt_spec[:, :, ch].min())
            hi = max(pred_spec[:, :, ch].max(), gt_spec[:, :, ch].max())
            limits[ch] = (lo, hi)

        dn_pred = 0 if denorm_pred_vals is None else float(denorm_pred_vals[b])
        dn_gt = 0 if denorm_gt_vals is None else float(denorm_gt_vals[b])

        fig, axes = plt.subplots(2, 4, figsize=(14, 6), constrained_layout=True)

        _show_image(axes[0, 0], pred_img, 'pred image')
        _show_image(axes[0, 1], gt_img, 'gt image')

        # Nothing is drawn under the images.
        axes[1, 0].axis('off')
        axes[1, 1].axis('off')

        for row, ch, pred_title, gt_title in spec_rows:
            lo, hi = limits[ch]
            _show_spec(axes[row, 2], pred_spec[:, :, ch], pred_title, lo, hi)
            _show_spec(axes[row, 3], gt_spec[:, :, ch], gt_title, lo, hi)

        fig.suptitle(
            f'id={sample_idx}, sec={sec} | denorm(reward_pred)={dn_pred:.4f}, denorm(reward_gt)={dn_gt:.4f}',
            fontsize=11
        )
        fig.savefig(out_path, dpi=180)
        plt.close(fig)


@torch.no_grad()
def main(args):
    """Run the audio-visual evaluation selected by ``args.eval_type``.

    Steps: initialise the distributed context, merge
    ``config/eval_config.yaml`` with the experiment YAML at ``args.exp``,
    load the diffusion model, VAE and SoundStream tokenizer (skipped when
    ``args.gt`` is set), build one distributed data loader per dataset in
    ``args.datasets``, then call ``generate_rollout`` or ``generate_time``
    on every batch.

    Outputs go under ``<output_dir>/<exp name or 'gt'>[_<ckp>]/<dataset>/``,
    in ``rollout_<N>frames`` or ``time``.

    Args:
        args: Parsed CLI namespace. ``save_output_dir``, ``latent_size`` and
            (when left at -1) ``rollout_frames`` are written back onto it.

    Raises:
        ValueError: If ``args.eval_type`` is neither ``'rollout'`` nor ``'time'``.
    """
    # Reject an unknown mode before any expensive setup. Previously such a
    # run loaded the model and iterated every dataset without writing anything.
    if args.eval_type not in ('rollout', 'time'):
        raise ValueError(
            f"eval_type has to be either 'time' or 'rollout', got {args.eval_type!r}"
        )

    _, _, device, _ = init_distributed()
    print(args)
    device = torch.device(device)
    num_tasks = dist.get_world_size()
    global_rank = dist.get_rank()
    exp_eval = args.exp

    # model & config setup
    if args.gt:
        args.save_output_dir = os.path.join(args.output_dir, 'gt')
    else:
        exp_name = os.path.basename(exp_eval).split('.')[0]
        args.save_output_dir = os.path.join(args.output_dir, exp_name)

    # Only non-default checkpoints get a suffix on the output directory.
    if args.ckp != '0100000':
        args.save_output_dir = args.save_output_dir + "_%s"%(args.ckp)

    os.makedirs(args.save_output_dir, exist_ok=True)

    with open("config/eval_config.yaml", "r") as f:
        default_config = yaml.safe_load(f)
    config = default_config

    # Shallow merge: top-level keys of the experiment config override defaults.
    with open(exp_eval, "r") as f:
        user_config = yaml.safe_load(f)
    config.update(user_config)

    eval_len_traj_pred = config["eval_len_traj_pred"]
    if args.rollout_frames == -1:
        args.rollout_frames = eval_len_traj_pred
    assert args.rollout_frames <= eval_len_traj_pred
    latent_size = config['image_size'] // 8
    args.latent_size = latent_size

    # The requested horizons depend only on the CLI arguments, so parse them
    # once here instead of once per batch.
    secs = None
    if args.eval_type == 'time':
        if args.time_secs != '':
            secs = np.array([int(sec) for sec in args.time_secs.split(',')])
        else:
            secs = np.array([int(sec) for sec in range(1, args.rollout_frames+1)])

    num_cond = config['context_size']
    print("loading")
    model_lst = (None, None, None, None)
    if not args.gt:
        model = AVCDiT_models[config['model']](context_size=num_cond, input_size=latent_size, in_channels=4, mode="av")
        # NOTE: weights_only=False unpickles arbitrary objects; only load
        # checkpoints from a trusted source.
        ckp = torch.load(f'{config["results_dir"]}/{config["run_name"]}/checkpoints/{args.ckp}.pth.tar', map_location='cpu', weights_only=False)
        print(model.load_state_dict(ckp["ema"], strict=True))
        model.eval()
        model.to(device)
        model = torch.compile(model)
        diffusion = create_diffusion(str(250), dual=True)
        vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema").to(device)

        sstream = SoundStream(C=32, D=16, n_q=8, codebook_size=1024).to(device)
        sstream_path = config["tokenizer_a_path"]
        sstream_checkpoint = torch.load(sstream_path, map_location=device)
        sstream.load_state_dict(sstream_checkpoint["model_state"])
        sstream.eval()

        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device], find_unused_parameters=False)
        model_lst = (model, diffusion, vae, sstream)

    # Loading Datasets
    dataset_names = args.datasets.split(',')
    datasets = {}

    for dataset_name in dataset_names:
        dataset_val = get_dataset_eval(config, dataset_name, args.eval_type, predefined_index=False)

        if len(dataset_val) % num_tasks != 0:
            print('Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                    'This will slightly alter validation results as extra duplicate entries are added to achieve '
                    'equal num of samples per-process.')
        sampler_val = torch.utils.data.DistributedSampler(
            dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=False)

        curr_data_loader = torch.utils.data.DataLoader(
                            dataset_val, sampler=sampler_val,
                            batch_size=args.batch_size,
                            num_workers=args.num_workers,
                            pin_memory=True,
                            drop_last=False
                        )
        datasets[dataset_name] = curr_data_loader

    print_freq = 1
    header = 'Evaluation: '
    metric_logger = dist.MetricLogger(delimiter="  ")

    for dataset_name in dataset_names:
        dataset_save_output_dir = os.path.join(args.save_output_dir, dataset_name)
        os.makedirs(dataset_save_output_dir, exist_ok=True)
        curr_data_loader = datasets[dataset_name]

        # The mode-specific output directory is the same for every batch.
        if args.eval_type == 'rollout':
            eval_output_dir = os.path.join(dataset_save_output_dir, f'rollout_{args.rollout_frames}frames')
        else:
            eval_output_dir = os.path.join(dataset_save_output_dir, 'time')
        os.makedirs(eval_output_dir, exist_ok=True)

        for data_iter_step, (idxs, obs_image, gt_image, obs_audio, gt_audio, diffs_seq, delta, orig_obs_audio, orig_gt_audio) in enumerate(metric_logger.log_every(curr_data_loader, print_freq, header)):
            with torch.amp.autocast('cuda', enabled=True, dtype=torch.bfloat16):
                # Keep only the last num_cond observation steps as context.
                obs_image = obs_image[:, -num_cond:].to(device)
                gt_image = gt_image.to(device)
                obs_audio = obs_audio[:, -num_cond:].to(device)
                gt_audio = gt_audio.to(device)
                orig_obs_audio = orig_obs_audio[:, -num_cond:].to(device)
                orig_gt_audio = orig_gt_audio.to(device)

                diffs_seq = diffs_seq.to(device)
                obs_av = (obs_image, obs_audio, orig_obs_audio)
                gt_av = (gt_image, gt_audio, orig_gt_audio)
                if args.eval_type == 'rollout':
                    generate_rollout(args, eval_output_dir, args.rollout_frames, idxs, model_lst, obs_av, gt_av, diffs_seq, delta, num_cond, device)
                else:
                    generate_time(args, eval_output_dir, idxs, model_lst, obs_av, gt_av, diffs_seq, delta, secs, num_cond, device)
    

if __name__ == "__main__":
    # Command-line entry point: parse the evaluation options and run main().
    parser = argparse.ArgumentParser()

    # main() uses --output_dir, --exp, --datasets and --eval_type
    # unconditionally. They are required so a missing flag yields a clear
    # usage error instead of a TypeError/AttributeError on None later on.
    parser.add_argument("--output_dir", type=str, default=None, required=True, help="output directory")
    parser.add_argument("--exp", type=str, default=None, required=True, help="experiment name")
    parser.add_argument("--ckp", type=str, default='0100000')
    parser.add_argument("--num_sec_eval", type=int, default=5)
    parser.add_argument("--input_fps", type=int, default=4)
    parser.add_argument("--datasets", type=str, default=None, required=True, help="dataset name")
    parser.add_argument("--num_workers", type=int, default=8, help="num workers")
    parser.add_argument("--batch_size", type=int, default=16, help="batch size")
    # Only these two modes are handled by main(); any other value used to
    # run to completion without producing output.
    parser.add_argument("--eval_type", type=str, default=None, required=True, choices=("time", "rollout"), help="type of evaluation has to be either 'time' or 'rollout'")
    # Rollout Evaluation Args
    parser.add_argument("--time_secs", type=str, default='', help="") #'1,2,3,4'
    parser.add_argument("--rollout_frames", type=int, default=-1, help="")
    parser.add_argument("--gt", type=int, default=0, help="set to 1 to produce ground truth evaluation set")
    args = parser.parse_args()

    main(args)