// Mirror of alias_free_activation/cuda/anti_alias_activation_cuda.cu
// from nvidia/bigvgan_v2_44khz_128band_512x @ 95a9d1dc
// (encoders/nvidia/bigvgan_v2_44khz_128band_512x/alias_free_activation/cuda/anti_alias_activation_cuda.cu)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* coding=utf-8
 * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
| 16 |
+
|
| 17 |
+
#include <assert.h>
#include <stdint.h>

#include <cfloat>
#include <limits>

#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h> // C10_CUDA_KERNEL_LAUNCH_CHECK
#include <c10/macros/Macros.h>
#include <torch/extension.h>

#include "type_shim.h"
|
| 30 |
+
|
| 31 |
+
namespace
{
    // Hard-coded hyperparameters
    // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and
    constexpr int ELEMENTS_PER_LDG_STG = 1; //(WARP_ITERATIONS < 4) ? 1 : 4;
    constexpr int THREADS_PER_BLOCK = 128;              // block size used by every launch below
    constexpr int BUFFER_SIZE = 32;                     // output samples produced per thread
    constexpr int FILTER_SIZE = 12;                     // taps in both the up- and downsampling FIR filters
    constexpr int HALF_FILTER_SIZE = 6;
    constexpr int UPSAMPLE_REPLICATION_PAD = 5;         // 5 on each side, matching torch impl
    constexpr int DOWNSAMPLE_REPLICATION_PAD_LEFT = 5;  // matching torch impl
    constexpr int DOWNSAMPLE_REPLICATION_PAD_RIGHT = 6; // matching torch impl

    // The padding arithmetic below assumes this relationship; fail at compile time if it drifts.
    static_assert(2 * HALF_FILTER_SIZE == FILTER_SIZE, "HALF_FILTER_SIZE must be FILTER_SIZE / 2");

    /* Fused anti-aliased snake activation forward pass:
     *   replication pad -> 2x zero-insertion upsample (FILTER_SIZE-tap FIR)
     *   -> snake activation x + sin^2(alpha * x) / beta
     *   -> 2x strided downsample (FILTER_SIZE-tap FIR).
     *
     * Expected launch layout (see dispatch_anti_alias_activation_forward):
     *   blockDim = (THREADS_PER_BLOCK, 1, 1)
     *   gridDim  = (ceil(seq_len / (THREADS_PER_BLOCK * BUFFER_SIZE)), channels, batch_size)
     * Each thread produces BUFFER_SIZE consecutive output samples of one channel.
     *
     * src/dst are dense [batch, channels, seq_len] buffers; up_ftr/down_ftr hold
     * FILTER_SIZE taps each; alpha/beta hold one log-scale value per channel
     * (exp is applied here — assumes the caller passes log-domain parameters,
     * matching the torch reference implementation).
     */
    template <typename input_t, typename output_t, typename acc_t>
    __global__ void anti_alias_activation_forward(
        output_t *dst,
        const input_t *src,
        const input_t *up_ftr,
        const input_t *down_ftr,
        const input_t *alpha,
        const input_t *beta,
        int batch_size,
        int channels,
        int seq_len)
    {
        // Up and downsample filters staged into registers.
        input_t up_filter[FILTER_SIZE];
        input_t down_filter[FILTER_SIZE];

        // Input staging buffer with extra slots reserved for replication padding.
        input_t elements[2 * FILTER_SIZE + 2 * BUFFER_SIZE + 2 * UPSAMPLE_REPLICATION_PAD] = {0};
        // 2x-upsampled intermediate, with room for the downsampling conv's replication padding.
        input_t intermediates[2 * FILTER_SIZE + 2 * BUFFER_SIZE + DOWNSAMPLE_REPLICATION_PAD_LEFT + DOWNSAMPLE_REPLICATION_PAD_RIGHT] = {0};

        // Downsampled result, staged before the bounds-checked write to dst.
        output_t output[BUFFER_SIZE];

        // blockDim/threadIdx = (THREADS_PER_BLOCK, 1, 1)
        // gridDim/blockIdx = (seq_blocks, channels, batches)
        int block_offset = (blockIdx.x * THREADS_PER_BLOCK * BUFFER_SIZE + seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
        int local_offset = threadIdx.x * BUFFER_SIZE;
        int seq_offset = blockIdx.x * THREADS_PER_BLOCK * BUFFER_SIZE + local_offset;

        // Intermediate indices cover double the sequence positions (2x upsampling).
        int intermediate_local_offset = threadIdx.x * BUFFER_SIZE * 2;
        int intermediate_seq_offset = blockIdx.x * THREADS_PER_BLOCK * BUFFER_SIZE * 2 + intermediate_local_offset;

        // Grab this channel's boundary samples for replication padding before moving the pointer.
        const input_t *right_most_pntr = src + (seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
        input_t seq_left_most_value = right_most_pntr[0];
        input_t seq_right_most_value = right_most_pntr[seq_len - 1];

        // Move src and dst pointers to this thread's chunk.
        src += block_offset + local_offset;
        dst += block_offset + local_offset;

        // Alpha and beta values for snake activations. Applies exp by default
        // (one log-scale parameter per channel, indexed by blockIdx.y).
        alpha = alpha + blockIdx.y;
        input_t alpha_val = expf(alpha[0]);
        beta = beta + blockIdx.y;
        input_t beta_val = expf(beta[0]);

#pragma unroll
        for (int it = 0; it < FILTER_SIZE; it += 1)
        {
            up_filter[it] = up_ftr[it];
            down_filter[it] = down_ftr[it];
        }

        // Load input interleaved with zeros (stride-2 zero insertion), applying
        // replication padding at both boundaries to match the torch impl.
        // The factor 2 compensates for the energy lost to the inserted zeros.
#pragma unroll
        for (int it = -HALF_FILTER_SIZE; it < BUFFER_SIZE + HALF_FILTER_SIZE; it += 1)
        {
            int element_index = seq_offset + it; // index for element
            if ((element_index < 0) && (element_index >= -UPSAMPLE_REPLICATION_PAD))
            {
                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_left_most_value;
            }
            if ((element_index >= seq_len) && (element_index < seq_len + UPSAMPLE_REPLICATION_PAD))
            {
                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_right_most_value;
            }
            if ((element_index >= 0) && (element_index < seq_len))
            {
                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * src[it];
            }
        }

        // Upsampling strided convolution into intermediates. The first
        // DOWNSAMPLE_REPLICATION_PAD_LEFT slots are reserved for the replication
        // padding of the downsampling conv later.
#pragma unroll
        for (int it = 0; it < (2 * BUFFER_SIZE + 2 * FILTER_SIZE); it += 1)
        {
            input_t acc = 0.0f; // float literal — the original 0.0 (double) forced a silent promotion
            int element_index = intermediate_seq_offset + it; // index for intermediate
#pragma unroll
            for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
            {
                if ((element_index + f_idx) >= 0)
                {
                    acc += up_filter[f_idx] * elements[it + f_idx];
                }
            }
            intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] = acc;
        }

        // Snake activation: x <- x + sin^2(alpha * x) / (beta + eps), in float.
        // eps guards against division by zero; float constant avoids the original
        // double-precision promotion of the whole inner expression.
        constexpr float no_div_by_zero = 1e-9f;
#pragma unroll
        for (int it = 0; it < 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it += 1)
        {
            // Hoisted: the original evaluated sinf(...) twice per sample.
            float sin_val = sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val);
            intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] += (1.0f / (beta_val + no_div_by_zero)) * sin_val * sin_val;
        }

        // Left replication padding before the downsampling conv.
#pragma unroll
        for (int it = 0; it < DOWNSAMPLE_REPLICATION_PAD_LEFT; it += 1)
        {
            intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT];
        }
        // Right replication padding before the downsampling conv.
#pragma unroll
        for (int it = DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it < DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE + DOWNSAMPLE_REPLICATION_PAD_RIGHT; it += 1)
        {
            intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE - 1];
        }

        // Downsampling strided convolution (stride = 2) from intermediates.
#pragma unroll
        for (int it = 0; it < BUFFER_SIZE; it += 1)
        {
            input_t acc = 0.0f;
#pragma unroll
            for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
            {
                // Constant offset DOWNSAMPLE_REPLICATION_PAD_RIGHT matches the torch implementation.
                acc += down_filter[f_idx] * intermediates[it * 2 + f_idx + DOWNSAMPLE_REPLICATION_PAD_RIGHT];
            }
            output[it] = acc;
        }

        // Bounds-checked write of the staged results to global memory.
#pragma unroll
        for (int it = 0; it < BUFFER_SIZE; it += ELEMENTS_PER_LDG_STG)
        {
            int element_index = seq_offset + it;
            if (element_index < seq_len)
            {
                dst[it] = output[it];
            }
        }
    }

    /* Host-side launcher: computes the launch geometry and enqueues the kernel
     * on the current PyTorch CUDA stream. No-op when seq_len == 0. */
    template <typename input_t, typename output_t, typename acc_t>
    void dispatch_anti_alias_activation_forward(
        output_t *dst,
        const input_t *src,
        const input_t *up_ftr,
        const input_t *down_ftr,
        const input_t *alpha,
        const input_t *beta,
        int batch_size,
        int channels,
        int seq_len)
    {
        if (seq_len == 0)
        {
            return;
        }
        else
        {
            // THREADS_PER_BLOCK threads per block to maximize gpu utilization;
            // derived (rather than the previously hard-coded 4096) so the kernel's
            // per-thread coverage and the grid size cannot drift apart.
            constexpr int seq_len_per_block = THREADS_PER_BLOCK * BUFFER_SIZE; // = 4096
            int blocks_per_seq_len = (seq_len + seq_len_per_block - 1) / seq_len_per_block;
            dim3 blocks(blocks_per_seq_len, channels, batch_size);
            dim3 threads(THREADS_PER_BLOCK, 1, 1);

            anti_alias_activation_forward<input_t, output_t, acc_t>
                <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, up_ftr, down_ftr, alpha, beta, batch_size, channels, seq_len);
            // Kernel launches fail silently; surface configuration errors immediately.
            C10_CUDA_KERNEL_LAUNCH_CHECK();
        }
    }
}
|
| 211 |
+
|
| 212 |
+
/* Forward-pass entry point exposed to the extension loader.
 *
 * input:                 [batches, channels, seq_len] contiguous CUDA tensor
 *                        (float / half / bfloat16).
 * up_filter/down_filter: FIR taps, FILTER_SIZE elements each, same dtype as input.
 * alpha/beta:            per-channel log-scale snake parameters, same dtype as input.
 *
 * Returns a freshly allocated tensor of the same shape/dtype holding the
 * anti-aliased snake activation of input. Forward-only: the result does not
 * track gradients.
 */
extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta)
{
    // The kernel indexes raw pointers assuming a dense [B, C, T] CUDA layout;
    // fail loudly instead of reading out of bounds / producing silently wrong results.
    TORCH_CHECK(input.dim() == 3, "fwd_cuda: input must be 3-D [batches, channels, seq_len], got ", input.dim(), "-D");
    TORCH_CHECK(input.is_cuda(), "fwd_cuda: input must be a CUDA tensor");
    TORCH_CHECK(input.is_contiguous(), "fwd_cuda: input must be contiguous");

    // Input is a 3d tensor with dimensions [batches, channels, seq_len]
    const int batches = input.size(0);
    const int channels = input.size(1);
    const int seq_len = input.size(2);

    // The kernel reads exactly FILTER_SIZE taps per filter and one alpha/beta per channel.
    TORCH_CHECK(up_filter.numel() == FILTER_SIZE && down_filter.numel() == FILTER_SIZE,
                "fwd_cuda: up/down filters must each have ", FILTER_SIZE, " taps");
    TORCH_CHECK(alpha.numel() == channels && beta.numel() == channels,
                "fwd_cuda: alpha/beta must hold one value per channel (", channels, ")");

    // Output buffer; no autograd tracking for this forward-only op.
    auto act_options = input.options().requires_grad(false);

    torch::Tensor anti_alias_activation_results =
        torch::empty({batches, channels, seq_len}, act_options);

    void *input_ptr = static_cast<void *>(input.data_ptr());
    void *up_filter_ptr = static_cast<void *>(up_filter.data_ptr());
    void *down_filter_ptr = static_cast<void *>(down_filter.data_ptr());
    void *alpha_ptr = static_cast<void *>(alpha.data_ptr());
    void *beta_ptr = static_cast<void *>(beta.data_ptr());
    void *anti_alias_activation_results_ptr = static_cast<void *>(anti_alias_activation_results.data_ptr());

    // Dispatch on input dtype (float / half / bfloat16); acc_t is float in all cases.
    DISPATCH_FLOAT_HALF_AND_BFLOAT(
        input.scalar_type(),
        "dispatch anti alias activation_forward",
        dispatch_anti_alias_activation_forward<scalar_t, scalar_t, float>(
            reinterpret_cast<scalar_t *>(anti_alias_activation_results_ptr),
            reinterpret_cast<const scalar_t *>(input_ptr),
            reinterpret_cast<const scalar_t *>(up_filter_ptr),
            reinterpret_cast<const scalar_t *>(down_filter_ptr),
            reinterpret_cast<const scalar_t *>(alpha_ptr),
            reinterpret_cast<const scalar_t *>(beta_ptr),
            batches,
            channels,
            seq_len););
    return anti_alias_activation_results;
}
|