| | #include <math.h> |
| | #include <stdio.h> |
| | #include <stdlib.h> |
| |
|
| |
|
| | #include "ball_query_gpu.h" |
| | #include "cuda_utils.h" |
| |
|
| |
|
| | __global__ void ball_query_kernel_fast(int b, int n, int m, float radius, int nsample, |
| | const float *__restrict__ new_xyz, const float *__restrict__ xyz, int *__restrict__ idx) { |
| | |
| | |
| | |
| | |
| | int bs_idx = blockIdx.y; |
| | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; |
| | if (bs_idx >= b || pt_idx >= m) return; |
| |
|
| | new_xyz += bs_idx * m * 3 + pt_idx * 3; |
| | xyz += bs_idx * n * 3; |
| | idx += bs_idx * m * nsample + pt_idx * nsample; |
| |
|
| | float radius2 = radius * radius; |
| | float new_x = new_xyz[0]; |
| | float new_y = new_xyz[1]; |
| | float new_z = new_xyz[2]; |
| |
|
| | int cnt = 0; |
| | for (int k = 0; k < n; ++k) { |
| | float x = xyz[k * 3 + 0]; |
| | float y = xyz[k * 3 + 1]; |
| | float z = xyz[k * 3 + 2]; |
| | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); |
| | if (d2 < radius2){ |
| | if (cnt == 0){ |
| | for (int l = 0; l < nsample; ++l) { |
| | idx[l] = k; |
| | } |
| | } |
| | idx[cnt] = k; |
| | ++cnt; |
| | if (cnt >= nsample) break; |
| | } |
| | } |
| | } |
| |
|
| |
|
| | void ball_query_kernel_launcher_fast(int b, int n, int m, float radius, int nsample, \ |
| | const float *new_xyz, const float *xyz, int *idx) { |
| | |
| | |
| | |
| | |
| |
|
| | cudaError_t err; |
| |
|
| | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); |
| | dim3 threads(THREADS_PER_BLOCK); |
| |
|
| | ball_query_kernel_fast<<<blocks, threads>>>(b, n, m, radius, nsample, new_xyz, xyz, idx); |
| | |
| | err = cudaGetLastError(); |
| | if (cudaSuccess != err) { |
| | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
| | exit(-1); |
| | } |
| | } |
| |
|
| |
|
| | __global__ void ball_center_query_kernel_fast(int b, int n, int m, float radius, \ |
| | const float *__restrict__ point, const float *__restrict__ key_point, int *__restrict__ idx) { |
| | |
| | |
| | |
| | |
| | int bs_idx = blockIdx.y; |
| | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; |
| | if (bs_idx >= b || pt_idx >= n) return; |
| |
|
| | point += bs_idx * n * 3 + pt_idx * 3; |
| | key_point += bs_idx * m * 3; |
| | idx += bs_idx * n + pt_idx; |
| |
|
| | float radius2 = radius * radius; |
| | float point_x = point[0]; |
| | float point_y = point[1]; |
| | float point_z = point[2]; |
| |
|
| | float bestd = 1e8; |
| | for (int k = 0; k < m; ++k) { |
| | float x = key_point[k * 3 + 0]; |
| | float y = key_point[k * 3 + 1]; |
| | float z = key_point[k * 3 + 2]; |
| | if (((x + 1) * (x + 1) + (y + 1) * (y + 1) + (z + 1) * (z + 1)) < 1e-4) break; |
| | float d2 = (point_x - x) * (point_x - x) + (point_y - y) * (point_y - y) + (point_z - z) * (point_z - z); |
| | if (d2 < radius2 && d2 < bestd){ |
| | idx[0] = k; |
| | bestd = d2; |
| | } |
| | } |
| | } |
| |
|
| |
|
| | void ball_center_query_kernel_launcher_fast(int b, int n, int m, float radius, \ |
| | const float *point, const float *key_point, int *idx) { |
| | |
| | |
| | |
| | |
| |
|
| | cudaError_t err; |
| |
|
| | dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); |
| | dim3 threads(THREADS_PER_BLOCK); |
| |
|
| | ball_center_query_kernel_fast<<<blocks, threads>>>(b, n, m, radius, point, key_point, idx); |
| | |
| | err = cudaGetLastError(); |
| | if (cudaSuccess != err) { |
| | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
| | exit(-1); |
| | } |
| | } |
| |
|
| |
|
| |
|
| |
|
| |
|
| | __global__ void knn_query_kernel_fast(int b, int n, int m, int nsample, const float *__restrict__ new_xyz, |
| | const float *__restrict__ xyz, float *__restrict__ dist2, int *__restrict__ idx) { |
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| | int bs_idx = blockIdx.y; |
| | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; |
| | if (bs_idx >= b || pt_idx >= m) return; |
| |
|
| | new_xyz += bs_idx * m * 3 + pt_idx * 3; |
| | xyz += bs_idx * n * 3; |
| | dist2 += bs_idx * m * nsample + pt_idx * nsample; |
| | idx += bs_idx * m * nsample + pt_idx * nsample; |
| |
|
| | float nx = new_xyz[0]; |
| | float ny = new_xyz[1]; |
| | float nz = new_xyz[2]; |
| |
|
| | for (int i = 0; i < n; ++i) { |
| | float x = xyz[i * 3 + 0]; |
| | float y = xyz[i * 3 + 1]; |
| | float z = xyz[i * 3 + 2]; |
| | float d2 = (nx - x) * (nx - x) + (ny - y) * (ny - y) + (nz - z) * (nz - z); |
| | if (d2 < dist2[nsample - 1]) { |
| | dist2[nsample - 1] = d2; |
| | idx[nsample - 1] = i; |
| | for (int j = nsample - 2; j >= 0; j--) { |
| | if (d2 < dist2[j]){ |
| | dist2[j + 1] = dist2[j]; |
| | dist2[j] = d2; |
| | idx[j + 1] = idx[j]; |
| | idx[j] = i; |
| | } |
| | } |
| | } |
| | } |
| | } |
| |
|
| |
|
| | void knn_query_kernel_launcher_fast(int b, int n, int m, int nsample, \ |
| | const float *new_xyz, const float *xyz, float *dist2, int *idx) { |
| | cudaError_t err; |
| |
|
| | dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); |
| | dim3 threads(THREADS_PER_BLOCK); |
| |
|
| | knn_query_kernel_fast<<<blocks, threads>>>(b, n, m, nsample, new_xyz, xyz, dist2, idx); |
| | |
| | err = cudaGetLastError(); |
| | if (cudaSuccess != err) { |
| | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
| | exit(-1); |
| | } |
| | } |
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | __global__ void ball_query_kernel_stack(int B, int M, float radius, int nsample, \ |
| | const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx) { |
| | |
| | |
| | |
| | |
| | |
| | |
| | int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; |
| | if (pt_idx >= M) return; |
| |
|
| | int bs_idx = 0, pt_cnt = new_xyz_batch_cnt[0]; |
| | for (int k = 1; k < B; k++){ |
| | if (pt_idx < pt_cnt) break; |
| | pt_cnt += new_xyz_batch_cnt[k]; |
| | bs_idx = k; |
| | } |
| |
|
| | int xyz_batch_start_idx = 0; |
| | for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k]; |
| | |
| |
|
| | new_xyz += pt_idx * 3; |
| | xyz += xyz_batch_start_idx * 3; |
| | idx += pt_idx * nsample; |
| |
|
| | float radius2 = radius * radius; |
| | float new_x = new_xyz[0]; |
| | float new_y = new_xyz[1]; |
| | float new_z = new_xyz[2]; |
| | int n = xyz_batch_cnt[bs_idx]; |
| |
|
| | int cnt = 0; |
| | for (int k = 0; k < n; ++k) { |
| | float x = xyz[k * 3 + 0]; |
| | float y = xyz[k * 3 + 1]; |
| | float z = xyz[k * 3 + 2]; |
| | float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); |
| | if (d2 < radius2){ |
| | if (cnt == 0){ |
| | for (int l = 0; l < nsample; ++l) { |
| | idx[l] = k; |
| | } |
| | } |
| | idx[cnt] = k; |
| | ++cnt; |
| | if (cnt >= nsample) break; |
| | } |
| | } |
| | if (cnt == 0) idx[0] = -1; |
| | } |
| |
|
| |
|
| | void ball_query_kernel_launcher_stack(int B, int M, float radius, int nsample, |
| | const float *new_xyz, const int *new_xyz_batch_cnt, const float *xyz, const int *xyz_batch_cnt, int *idx){ |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | cudaError_t err; |
| |
|
| | dim3 blocks(DIVUP(M, THREADS_PER_BLOCK)); |
| | dim3 threads(THREADS_PER_BLOCK); |
| |
|
| | ball_query_kernel_stack<<<blocks, threads>>>(B, M, radius, nsample, new_xyz, new_xyz_batch_cnt, xyz, xyz_batch_cnt, idx); |
| | |
| | err = cudaGetLastError(); |
| | if (cudaSuccess != err) { |
| | fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); |
| | exit(-1); |
| | } |
| | } |
| |
|