| | #include <math.h> |
| | #include <stdio.h> |
| | #include <stdlib.h> |
| |
|
| | #include "cuda_utils.h" |
| | #include "interpolate_gpu.h" |
| |
|
| |
|
// Brute-force three-nearest-neighbour search, batched layout.
//
// For each query point the kernel scans all m reference points of the same
// batch element and keeps the three smallest squared Euclidean distances
// together with the positions (0..m-1) of the corresponding reference points.
//
// Memory layout (as implied by the indexing below, all contiguous):
//   unknown: (b, n, 3) query coordinates
//   known:   (b, m, 3) reference coordinates
//   dist2:   (b, n, 3) output, squared distances in ascending order
//   idx:     (b, n, 3) output, reference indices local to the batch element
//
// Launch layout: blockIdx.y selects the batch element, the x dimension covers
// the n query points with one thread per point. No shared memory is used.
//
// If m < 3 the unfilled slots keep distance +inf and index 0.
__global__ void three_nn_kernel_fast(int b, int n, int m, const float *__restrict__ unknown,
    const float *__restrict__ known, float *__restrict__ dist2, int *__restrict__ idx) {
    int bs_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (bs_idx >= b || pt_idx >= n) return;

    // Offsets are computed in 64 bits: b * n * 3 (or b * m * 3) can exceed
    // INT_MAX long before any single dimension does.
    const long long query_offset = ((long long)bs_idx * n + pt_idx) * 3;
    unknown += query_offset;
    dist2 += query_offset;
    idx += query_offset;
    known += (long long)bs_idx * m * 3;

    const float ux = unknown[0];
    const float uy = unknown[1];
    const float uz = unknown[2];

    // Single-precision accumulators: distances are computed in float, so
    // double storage only added fp64 conversions/compares to the hot loop.
    float best1 = INFINITY, best2 = INFINITY, best3 = INFINITY;
    int besti1 = 0, besti2 = 0, besti3 = 0;

    const float *ref = known;
    for (int k = 0; k < m; ++k, ref += 3) {
        const float x = ref[0];
        const float y = ref[1];
        const float z = ref[2];
        const float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);

        // Insertion into the sorted triple (best1 <= best2 <= best3).
        // Strict '<' means the earliest index wins on ties.
        if (d < best1) {
            best3 = best2; besti3 = besti2;
            best2 = best1; besti2 = besti1;
            best1 = d; besti1 = k;
        }
        else if (d < best2) {
            best3 = best2; besti3 = besti2;
            best2 = d; besti2 = k;
        }
        else if (d < best3) {
            best3 = d; besti3 = k;
        }
    }

    dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
    idx[0] = besti1; idx[1] = besti2; idx[2] = besti3;
}
| |
|
| |
|
// Host-side launcher for three_nn_kernel_fast.
//
// Builds a 2D grid (x: blocks of THREADS_PER_BLOCK query points, y: batch
// element) and launches the kernel on the default stream. All pointer
// arguments are forwarded unchanged and are dereferenced by the kernel, so
// they must be device-accessible.
//
// DIVUP and THREADS_PER_BLOCK are not defined in this file (presumably they
// come from cuda_utils.h; DIVUP is assumed to be a ceiling division).
//
// Only launch errors are detected here (cudaGetLastError); there is no
// synchronisation, so faults raised while the kernel runs surface later.
// On a launch error the process is terminated with exit(-1).
void three_nn_kernel_launcher_fast(int b, int n, int m, const float *unknown,
    const float *known, float *dist2, int *idx) {
    // A grid with a zero-sized dimension is rejected by the runtime, which
    // would turn an empty batch / empty query set into a process exit below.
    // There is nothing to compute in that case, so return early instead.
    if (b == 0 || n == 0) return;

    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);
    dim3 threads(THREADS_PER_BLOCK);

    three_nn_kernel_fast<<<blocks, threads>>>(b, n, m, unknown, known, dist2, idx);

    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
| |
|
| |
|
// Weighted three-point feature interpolation, batched layout (forward pass).
//
// For every (batch, channel, output point) triple the kernel writes
//   out = w0 * points[i0] + w1 * points[i1] + w2 * points[i2]
// where (i0, i1, i2) and (w0, w1, w2) are the output point's entries in idx
// and weight.
//
// Memory layout (as implied by the indexing below, all contiguous):
//   points: (b, c, m) source features
//   idx:    (b, n, 3) indices into the m axis of points
//   weight: (b, n, 3) interpolation weights
//   out:    (b, c, n) interpolated features
//
// Launch layout: blockIdx.z = batch element, blockIdx.y = channel, the x
// dimension covers the n output points with one thread per point.
// Index values are not range-checked; they must lie in [0, m).
__global__ void three_interpolate_kernel_fast(int b, int c, int m, int n, const float *__restrict__ points,
    const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ out) {
    int bs_idx = blockIdx.z;
    int c_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

    // 64-bit offsets: b * c * m and b * c * n can exceed INT_MAX for large
    // feature tensors even though each dimension fits in an int.
    const long long neighbor_offset = ((long long)bs_idx * n + pt_idx) * 3;
    const long long channel_row = (long long)bs_idx * c + c_idx;

    weight += neighbor_offset;
    idx += neighbor_offset;
    points += channel_row * m;
    out += channel_row * n;

    out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]];
}
| |
|
// Host-side launcher for three_interpolate_kernel_fast.
//
// Builds a 3D grid (x: blocks of THREADS_PER_BLOCK output points, y: channel,
// z: batch element) and launches the kernel on the default stream. All
// pointer arguments are forwarded unchanged and are dereferenced by the
// kernel, so they must be device-accessible.
//
// DIVUP and THREADS_PER_BLOCK are not defined in this file (presumably they
// come from cuda_utils.h; DIVUP is assumed to be a ceiling division).
//
// Only launch errors are detected here (cudaGetLastError); there is no
// synchronisation. On a launch error the process is terminated with exit(-1).
void three_interpolate_kernel_launcher_fast(int b, int c, int m, int n,
    const float *points, const int *idx, const float *weight, float *out) {
    // A zero-sized grid dimension makes the launch fail, which would turn an
    // empty output tensor into a process exit below. Nothing has to be
    // written in that case, so return early instead.
    if (b == 0 || c == 0 || n == 0) return;

    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);
    dim3 threads(THREADS_PER_BLOCK);
    three_interpolate_kernel_fast<<<blocks, threads>>>(b, c, m, n, points, idx, weight, out);

    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
| |
|
| |
|
// Backward pass of the batched three-point interpolation.
//
// Each thread takes one element of grad_out and scatters it to the three
// source features it was interpolated from:
//   grad_points[i_k] += grad_out * w_k   for k = 0, 1, 2
//
// Memory layout (as implied by the indexing below, all contiguous):
//   grad_out:    (b, c, n) gradient w.r.t. the interpolated features
//   idx:         (b, n, 3) indices into the m axis of grad_points
//   weight:      (b, n, 3) interpolation weights
//   grad_points: (b, c, m) accumulated gradient w.r.t. the source features
//
// Launch layout: blockIdx.z = batch element, blockIdx.y = channel, the x
// dimension covers the n output points with one thread per point.
//
// The kernel only adds into grad_points and never clears it, so the caller
// has to supply the initial contents (presumably zeros). atomicAdd is used
// because several output points may reference the same source point; the
// floating-point summation order is therefore not fixed.
__global__ void three_interpolate_grad_kernel_fast(int b, int c, int n, int m, const float *__restrict__ grad_out,
    const int *__restrict__ idx, const float *__restrict__ weight, float *__restrict__ grad_points) {
    int bs_idx = blockIdx.z;
    int c_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

    // 64-bit offsets: b * c * n and b * c * m can exceed INT_MAX for large
    // feature tensors even though each dimension fits in an int.
    const long long neighbor_offset = ((long long)bs_idx * n + pt_idx) * 3;
    const long long channel_row = (long long)bs_idx * c + c_idx;

    grad_out += channel_row * n + pt_idx;
    grad_points += channel_row * m;
    weight += neighbor_offset;
    idx += neighbor_offset;

    atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
    atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
    atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
}
| |
|
// Host-side launcher for three_interpolate_grad_kernel_fast.
//
// Builds a 3D grid (x: blocks of THREADS_PER_BLOCK output points, y: channel,
// z: batch element) and launches the kernel on the default stream. All
// pointer arguments are forwarded unchanged and are dereferenced by the
// kernel, so they must be device-accessible. grad_points is only accumulated
// into by the kernel; this function does not clear it.
//
// DIVUP and THREADS_PER_BLOCK are not defined in this file (presumably they
// come from cuda_utils.h; DIVUP is assumed to be a ceiling division).
//
// Only launch errors are detected here (cudaGetLastError); there is no
// synchronisation. On a launch error the process is terminated with exit(-1).
void three_interpolate_grad_kernel_launcher_fast(int b, int c, int n, int m, const float *grad_out,
    const int *idx, const float *weight, float *grad_points) {
    // A zero-sized grid dimension makes the launch fail, which would turn an
    // empty gradient tensor into a process exit below. With no grad_out
    // elements there is nothing to scatter, so return early instead.
    if (b == 0 || c == 0 || n == 0) return;

    dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);
    dim3 threads(THREADS_PER_BLOCK);
    three_interpolate_grad_kernel_fast<<<blocks, threads>>>(b, c, n, m, grad_out, idx, weight, grad_points);

    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
| |
|
| |
|
// Brute-force three-nearest-neighbour search, stacked layout.
//
// Points of all batch elements are concatenated along the first axis; the
// per-batch point counts say where each batch element starts and ends.
//
// Memory layout (as implied by the indexing below, all contiguous):
//   unknown:           (N, 3) query coordinates, grouped by batch element
//   unknown_batch_cnt: (batch_size) number of query points per batch element
//   known:             (sum of known_batch_cnt, 3) reference coordinates
//   known_batch_cnt:   (batch_size) number of reference points per batch element
//   dist2:             (N, 3) output, squared distances in ascending order
//   idx:               (N, 3) output, indices into the whole known array
//                      (the batch start offset is already added)
//
// M is accepted for interface compatibility but not read by the kernel.
//
// Launch layout: 1D grid, one thread per query point.
//
// If a batch element has fewer than three reference points, the unfilled
// slots keep distance +inf and an index equal to that batch element's start
// offset.
__global__ void three_nn_kernel_stack(int batch_size, int N, int M, const float *unknown,
    const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
    float *dist2, int *idx) {
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pt_idx >= N) return;

    // Walk the running sum of query counts to find the batch element that
    // owns this query point. If pt_idx lies beyond the total, the last batch
    // element is used.
    int bs_idx = 0, pt_cnt = unknown_batch_cnt[0];
    for (int k = 1; k < batch_size; k++){
        if (pt_idx < pt_cnt) break;
        pt_cnt += unknown_batch_cnt[k];
        bs_idx = k;
    }

    const int cur_num_known_points = known_batch_cnt[bs_idx];

    // Position of this batch element's first reference point in `known`.
    int known_batch_start_idx = 0;
    for (int k = 0; k < bs_idx; k++) known_batch_start_idx += known_batch_cnt[k];

    // Element offsets are widened to 64 bits before the multiply by 3.
    const long long query_offset = (long long)pt_idx * 3;
    known += (long long)known_batch_start_idx * 3;
    unknown += query_offset;
    dist2 += query_offset;
    idx += query_offset;

    const float ux = unknown[0];
    const float uy = unknown[1];
    const float uz = unknown[2];

    // Single-precision accumulators: distances are computed in float, so
    // double storage only added fp64 conversions/compares to the hot loop.
    float best1 = INFINITY, best2 = INFINITY, best3 = INFINITY;
    int besti1 = 0, besti2 = 0, besti3 = 0;

    const float *ref = known;
    for (int k = 0; k < cur_num_known_points; ++k, ref += 3) {
        const float x = ref[0];
        const float y = ref[1];
        const float z = ref[2];
        const float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);

        // Insertion into the sorted triple (best1 <= best2 <= best3).
        // Strict '<' means the earliest index wins on ties.
        if (d < best1) {
            best3 = best2; besti3 = besti2;
            best2 = best1; besti2 = besti1;
            best1 = d; besti1 = k;
        }
        else if (d < best2) {
            best3 = best2; besti3 = besti2;
            best2 = d; besti2 = k;
        }
        else if (d < best3) {
            best3 = d; besti3 = k;
        }
    }

    dist2[0] = best1; dist2[1] = best2; dist2[2] = best3;
    // Convert batch-local positions to positions in the stacked known array.
    idx[0] = besti1 + known_batch_start_idx;
    idx[1] = besti2 + known_batch_start_idx;
    idx[2] = besti3 + known_batch_start_idx;
}
| |
|
| |
|
// Host-side launcher for three_nn_kernel_stack.
//
// Builds a 1D grid with one thread per query point (blocks of
// THREADS_PER_BLOCK threads) and launches the kernel on the default stream.
// All pointer arguments are forwarded unchanged and are dereferenced by the
// kernel, so they must be device-accessible.
//
// DIVUP and THREADS_PER_BLOCK are not defined in this file (presumably they
// come from cuda_utils.h; DIVUP is assumed to be a ceiling division).
//
// Only launch errors are detected here (cudaGetLastError); there is no
// synchronisation. On a launch error the process is terminated with exit(-1).
void three_nn_kernel_launcher_stack(int batch_size, int N, int M, const float *unknown,
    const int *unknown_batch_cnt, const float *known, const int *known_batch_cnt,
    float *dist2, int *idx) {
    // With no query points the grid would have zero blocks, the launch would
    // fail, and the process would exit below. Nothing has to be computed in
    // that case, so return early instead.
    if (N == 0) return;

    dim3 blocks(DIVUP(N, THREADS_PER_BLOCK));
    dim3 threads(THREADS_PER_BLOCK);

    three_nn_kernel_stack<<<blocks, threads>>>(
        batch_size, N, M, unknown, unknown_batch_cnt,
        known, known_batch_cnt, dist2, idx
    );

    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
| |
|
| |
|
| |
|
// Weighted three-point feature interpolation, stacked layout (forward pass).
//
// For every (output point, channel) pair the kernel writes
//   out = w0 * features[i0] + w1 * features[i1] + w2 * features[i2]
// where (i0, i1, i2) and (w0, w1, w2) are the output point's entries in idx
// and weight, and features[i] denotes row i of the feature matrix at the
// current channel.
//
// Memory layout (as implied by the indexing below, all contiguous):
//   features: (number of source points, channels) source features
//   idx:      (N, 3) row indices into features
//   weight:   (N, 3) interpolation weights
//   out:      (N, channels) interpolated features
//
// Launch layout: blockIdx.y = channel, the x dimension covers the N output
// points with one thread per point.
// Index values are not range-checked; they must be valid rows of features.
__global__ void three_interpolate_kernel_stack(int N, int channels, const float *features,
    const int *idx, const float *weight, float *out) {
    int c_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pt_idx >= N || c_idx >= channels) return;

    weight += (long long)pt_idx * 3;
    idx += (long long)pt_idx * 3;
    // 64-bit offsets: row * channels can exceed INT_MAX for large stacked
    // feature matrices even though row and channels each fit in an int.
    out += (long long)pt_idx * channels + c_idx;

    const long long stride = channels;
    out[0] = weight[0] * features[idx[0] * stride + c_idx] +
             weight[1] * features[idx[1] * stride + c_idx] +
             weight[2] * features[idx[2] * stride + c_idx];
}
| |
|
| |
|
| |
|
// Host-side launcher for three_interpolate_kernel_stack.
//
// Builds a 2D grid (x: blocks of THREADS_PER_BLOCK output points, y: channel)
// and launches the kernel on the default stream. All pointer arguments are
// forwarded unchanged and are dereferenced by the kernel, so they must be
// device-accessible.
//
// DIVUP and THREADS_PER_BLOCK are not defined in this file (presumably they
// come from cuda_utils.h; DIVUP is assumed to be a ceiling division).
//
// Only launch errors are detected here (cudaGetLastError); there is no
// synchronisation. On a launch error the process is terminated with exit(-1).
void three_interpolate_kernel_launcher_stack(int N, int channels,
    const float *features, const int *idx, const float *weight, float *out) {
    // A zero-sized grid dimension makes the launch fail, which would turn an
    // empty output tensor into a process exit below. Nothing has to be
    // written in that case, so return early instead.
    if (N == 0 || channels == 0) return;

    dim3 blocks(DIVUP(N, THREADS_PER_BLOCK), channels);
    dim3 threads(THREADS_PER_BLOCK);
    three_interpolate_kernel_stack<<<blocks, threads>>>(N, channels, features, idx, weight, out);

    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
| |
|
| |
|
// Backward pass of the stacked three-point interpolation.
//
// Each thread takes one element of grad_out and scatters it to the three
// source feature rows it was interpolated from:
//   grad_features[i_k, c] += grad_out[pt, c] * w_k   for k = 0, 1, 2
//
// Memory layout (as implied by the indexing below, all contiguous):
//   grad_out:      (N, channels) gradient w.r.t. the interpolated features
//   idx:           (N, 3) row indices into grad_features
//   weight:        (N, 3) interpolation weights
//   grad_features: (number of source points, channels) accumulated gradient
//
// Launch layout: blockIdx.y = channel, the x dimension covers the N output
// points with one thread per point.
//
// The kernel only adds into grad_features and never clears it, so the caller
// has to supply the initial contents (presumably zeros). atomicAdd is used
// because several output points may reference the same source row; the
// floating-point summation order is therefore not fixed.
__global__ void three_interpolate_grad_kernel_stack(int N, int channels, const float *grad_out,
    const int *idx, const float *weight, float *grad_features) {
    int c_idx = blockIdx.y;
    int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (pt_idx >= N || c_idx >= channels) return;

    // 64-bit offsets: row * channels can exceed INT_MAX for large stacked
    // feature matrices even though row and channels each fit in an int.
    grad_out += (long long)pt_idx * channels + c_idx;
    weight += (long long)pt_idx * 3;
    idx += (long long)pt_idx * 3;

    const long long stride = channels;
    atomicAdd(grad_features + idx[0] * stride + c_idx, grad_out[0] * weight[0]);
    atomicAdd(grad_features + idx[1] * stride + c_idx, grad_out[0] * weight[1]);
    atomicAdd(grad_features + idx[2] * stride + c_idx, grad_out[0] * weight[2]);
}
| |
|
| |
|
// Host-side launcher for three_interpolate_grad_kernel_stack.
//
// Builds a 2D grid (x: blocks of THREADS_PER_BLOCK output points, y: channel)
// and launches the kernel on the default stream. All pointer arguments are
// forwarded unchanged and are dereferenced by the kernel, so they must be
// device-accessible. grad_features is only accumulated into by the kernel;
// this function does not clear it.
//
// DIVUP and THREADS_PER_BLOCK are not defined in this file (presumably they
// come from cuda_utils.h; DIVUP is assumed to be a ceiling division).
//
// Only launch errors are detected here (cudaGetLastError); there is no
// synchronisation. On a launch error the process is terminated with exit(-1).
void three_interpolate_grad_kernel_launcher_stack(int N, int channels, const float *grad_out,
    const int *idx, const float *weight, float *grad_features) {
    // A zero-sized grid dimension makes the launch fail, which would turn an
    // empty gradient tensor into a process exit below. With no grad_out
    // elements there is nothing to scatter, so return early instead.
    if (N == 0 || channels == 0) return;

    dim3 blocks(DIVUP(N, THREADS_PER_BLOCK), channels);
    dim3 threads(THREADS_PER_BLOCK);
    three_interpolate_grad_kernel_stack<<<blocks, threads>>>(
        N, channels, grad_out, idx, weight, grad_features
    );

    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}