| #define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION |
| #define CL_USE_DEPRECATED_OPENCL_1_2_APIS |
|
|
| |
| #pragma GCC diagnostic ignored "-Woverlength-strings" |
| #ifdef __clang__ |
| #pragma GCC diagnostic ignored "-Wgnu-anonymous-struct" |
| #endif |
|
|
| #include "ggml-opencl.h" |
| #include "ggml-backend.h" |
| #include "ggml-impl.h" |
| #include "ggml-backend-impl.h" |
| #include "ggml.h" |
|
|
| #include <CL/cl.h> |
|
|
| #include <string.h> |
|
|
| #include <cstddef> |
| #include <cstdint> |
| #include <atomic> |
| #include <fstream> |
| #include <limits> |
| #include <vector> |
| #include <string> |
| #include <cmath> |
| #include <memory> |
| #include <charconv> |
|
|
| #undef MIN |
| #undef MAX |
| #define MIN(a, b) ((a) < (b) ? (a) : (b)) |
| #define MAX(a, b) ((a) > (b) ? (a) : (b)) |
|
|
| #define UNUSED(x) (void)(x) |
|
|
| #define CL_CHECK(err) \ |
| do { \ |
| cl_int err_ = (err); \ |
| if (err_ != CL_SUCCESS) { \ |
| GGML_LOG_ERROR("ggml_opencl: %s error %d at %s:%d\n", \ |
| #err, err_, __FILE__, __LINE__); \ |
| GGML_ASSERT(0); \ |
| } \ |
| } while (0) |
|
|
| |
| |
| |
|
|
| bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor); |
|
|
| enum GPU_FAMILY { |
| ADRENO, |
| INTEL, |
| UNKNOWN, |
| }; |
|
|
| enum ADRENO_GPU_GEN { |
| ADRENO_UNKNOWN, |
| A7X, |
| A8X, |
| X1E, |
| }; |
|
|
| struct ggml_cl_version { |
| cl_uint major = 0; |
| cl_uint minor = 0; |
| }; |
|
|
| |
| static ggml_cl_version parse_cl_version(std::string_view str) { |
| size_t major_str_begin = 0; |
| size_t major_str_end = str.find(".", major_str_begin); |
| if (major_str_end == std::string::npos) { |
| return {}; |
| } |
|
|
| size_t minor_str_begin = major_str_end + 1; |
| size_t minor_str_end = str.find(" ", minor_str_begin); |
| if (minor_str_end == std::string::npos) { |
| return {}; |
| } |
|
|
| cl_uint version_major; |
| if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) { |
| return {}; |
| } |
|
|
| cl_uint version_minor; |
| if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) { |
| return {}; |
| } |
| return { version_major, version_minor }; |
| } |
|
|
| |
| static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) { |
| size_t param_size; |
| CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, ¶m_size)); |
| std::unique_ptr<char[]> param_storage(new char[param_size]); |
| CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr)); |
|
|
| auto param_value = std::string_view(param_storage.get(), param_size); |
| const std::string version_prefix = "OpenCL "; |
| if (param_value.find(version_prefix) != 0) { |
| return {}; |
| } |
| param_value.remove_prefix(version_prefix.length()); |
| return parse_cl_version(param_value); |
| } |
|
|
| |
| static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) { |
| size_t param_size; |
|
|
| #if CL_TARGET_OPENCL_VERSION >= 300 |
| if (platform_version.major >= 3) { |
| CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, ¶m_size)); |
| if (!param_size) { |
| return {}; |
| } |
|
|
| std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]); |
| CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr)); |
| unsigned versions_count = param_size / sizeof(cl_name_version); |
|
|
| cl_version version_max = 0; |
| for (unsigned i = 0; i < versions_count; i++) { |
| version_max = std::max<cl_version>(versions[i].version, version_max); |
| } |
|
|
| return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) }; |
| } |
| #else |
| GGML_UNUSED(platform_version); |
| #endif |
|
|
| CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, ¶m_size)); |
| if (!param_size) { |
| return {}; |
| } |
|
|
| std::unique_ptr<char[]> param_storage(new char[param_size]); |
| CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr)); |
| auto param_value = std::string_view(param_storage.get(), param_size); |
|
|
| const std::string version_prefix = "OpenCL C "; |
| if (param_value.find(version_prefix) != 0) { |
| return {}; |
| } |
| param_value.remove_prefix(version_prefix.length()); |
|
|
| return parse_cl_version(param_value); |
| } |
|
|
| static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) { |
| if (strstr(device_name, "730") || |
| strstr(device_name, "740") || |
| strstr(device_name, "750")) { |
| return ADRENO_GPU_GEN::A7X; |
| } |
|
|
| if (strstr(device_name, "830")) { |
| return ADRENO_GPU_GEN::A8X; |
| } |
|
|
| if (strstr(device_name, "X1")) { |
| return ADRENO_GPU_GEN::X1E; |
| } |
|
|
| return ADRENO_GPU_GEN::ADRENO_UNKNOWN; |
| } |
|
|
| static int get_adreno_cl_compiler_version(const char *driver_version) { |
| std::string driver_ver_str(driver_version); |
| size_t compiler_ver_pos = driver_ver_str.find("E031"); |
| size_t compiler_ver_len = 13; |
| size_t compiler_ver_offset = 5; |
|
|
| if (compiler_ver_pos == std::string::npos) { |
| compiler_ver_pos = driver_ver_str.find("DX"); |
| if (compiler_ver_pos == std::string::npos) { |
| return -1; |
| } |
| compiler_ver_len = 11; |
| compiler_ver_offset = 3; |
| } |
|
|
| std::string compiler_ver_str = driver_ver_str.substr(compiler_ver_pos, compiler_ver_len); |
| std::string major_ver_str = compiler_ver_str.substr(compiler_ver_offset, 2); |
| return std::atoi(major_ver_str.c_str()); |
| } |
|
|
| |
| struct ggml_backend_opencl_device_context { |
| cl_platform_id platform; |
| std::string platform_name; |
|
|
| cl_device_id device; |
| std::string device_name; |
| }; |
|
|
| |
| struct ggml_backend_opencl_context { |
| cl_device_id device; |
| std::string device_name; |
|
|
| std::string driver_version; |
|
|
| GPU_FAMILY gpu_family; |
| ADRENO_GPU_GEN adreno_gen; |
|
|
| cl_int alignment; |
| size_t max_alloc_size; |
| bool fp16_support; |
|
|
| int adreno_wave_size; |
|
|
| cl_context context; |
| cl_command_queue queue; |
|
|
| cl_program program; |
| cl_program program_1; |
| cl_program program_2; |
|
|
| cl_kernel kernel_add, kernel_add_row; |
| cl_kernel kernel_mul, kernel_mul_row; |
| cl_kernel kernel_scale; |
| cl_kernel kernel_silu, kernel_silu_4; |
| cl_kernel kernel_gelu, kernel_gelu_4; |
| cl_kernel kernel_relu; |
| cl_kernel kernel_clamp; |
| cl_kernel kernel_norm; |
| cl_kernel kernel_rms_norm; |
| cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8; |
| cl_kernel kernel_soft_max, kernel_soft_max_4; |
| cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16; |
| cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0; |
| cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16; |
| cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32; |
| cl_kernel kernel_mul_mat_f32_f32; |
| cl_kernel kernel_mul_mat_f16_f16; |
| cl_kernel kernel_mul_mat_f16_f32_1row; |
| cl_kernel kernel_mul_mat_f16_f32; |
| cl_kernel kernel_mul_mat_f16_f32_l4; |
| cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v; |
| cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0, kernel_mul_mat_q4_0_f32_flat; |
| cl_kernel kernel_mul_mat_q4_0_f32_8x_flat; |
| cl_kernel kernel_convert_block_q4_0_noshuffle, kernel_mul_mat_q4_0_f32_flat_v0, |
| kernel_mul_mat_q4_0_f32_flat_img_v0; |
| cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat; |
| cl_kernel kernel_mul_mv_q6_K_f32; |
|
|
| #ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
| |
| cl_program program_transpose_32; |
| cl_program program_transpose_32_16; |
| cl_program program_transpose_16; |
| cl_kernel kernel_transpose_32; |
| cl_kernel kernel_transpose_32_16; |
| cl_kernel kernel_transpose_16; |
|
|
| cl_mem A_s_d_max; |
| cl_mem A_q_d_max; |
| cl_mem B_d_max; |
|
|
| |
| cl_program program_CL_gemm; |
| cl_program program_CL_gemv_general; |
| cl_program program_CL_gemv_4096_1_11008; |
| cl_program program_CL_gemv_4096_1_4096; |
| cl_program program_CL_gemv_11008_1_4096; |
| cl_program program_CL_gemv_32000_1_4096; |
| cl_kernel CL_mul_mat_Ab_Bi_8x4; |
| cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general; |
| cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008; |
| cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096; |
| cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096; |
| cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096; |
| #endif |
| }; |
|
|
| static ggml_backend_device g_ggml_backend_opencl_device; |
| static ggml_backend_opencl_device_context g_ggml_ctx_dev_main { |
| nullptr, |
| "", |
| nullptr, |
| "", |
| }; |
|
|
| static int ggml_backend_opencl_n_devices = 0; |
|
|
| |
| #ifdef GGML_OPENCL_PROFILING |
| struct ProfilingInfo { |
| std::string op_name; |
| std::string kernel_name; |
| |
| cl_ulong duration_ns; |
| |
| size_t global_size[3]; |
| size_t local_size[3]; |
| |
| size_t output_size[4]; |
| }; |
|
|
| std::vector<ProfilingInfo> g_profiling_info; |
| #endif |
|
|
| inline std::string read_file(const std::string &path) { |
| std::ifstream ifs(path); |
| if (!ifs) { |
| return ""; |
| } |
| std::string text; |
| ifs.seekg(0, std::ios::end); |
| text.resize(ifs.tellg()); |
| ifs.seekg(0, std::ios::beg); |
| ifs.read(&text[0], text.size()); |
| return text; |
| } |
|
|
| static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) { |
| cl_program p; |
| char *program_log; |
| size_t program_size; |
| size_t log_size; |
| int err; |
|
|
| program_size = strlen(program_buffer); |
|
|
| p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); |
| if(err < 0) { |
| GGML_LOG_ERROR("OpenCL error creating program"); |
| exit(1); |
| } |
|
|
| err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL); |
| if(err < 0) { |
| clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); |
| program_log = (char*) malloc(log_size + 1); |
| program_log[log_size] = '\0'; |
| clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); |
| GGML_LOG_ERROR("ggml_opencl: kernel compile error:\n\n%s\n", program_log); |
| free(program_log); |
| exit(1); |
| } |
|
|
| return p; |
| } |
|
|
| static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { |
| static bool initialized = false; |
| static ggml_backend_opencl_context *backend_ctx = nullptr; |
|
|
| if (initialized) { |
| return backend_ctx; |
| } |
|
|
| ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context; |
| GGML_ASSERT(dev_ctx); |
| GGML_ASSERT(dev_ctx->platform == nullptr); |
| GGML_ASSERT(dev_ctx->device == nullptr); |
| GGML_ASSERT(backend_ctx == nullptr); |
|
|
| initialized = true; |
| backend_ctx = new ggml_backend_opencl_context(); |
| backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; |
|
|
| cl_int err; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n"); |
| #endif |
|
|
| struct cl_device; |
| struct cl_platform { |
| cl_platform_id id; |
| unsigned number; |
| char name[128]; |
| char vendor[128]; |
| struct cl_device * devices; |
| unsigned n_devices; |
| struct cl_device * default_device; |
| }; |
|
|
| struct cl_device { |
| struct cl_platform * platform; |
| cl_device_id id; |
| unsigned number; |
| cl_device_type type; |
| char name[128]; |
| }; |
|
|
| enum { NPLAT = 16, NDEV = 16 }; |
|
|
| struct cl_platform platforms[NPLAT]; |
| unsigned n_platforms = 0; |
| struct cl_device devices[NDEV]; |
| unsigned n_devices = 0; |
| struct cl_device * default_device = NULL; |
|
|
| cl_platform_id platform_ids[NPLAT]; |
| if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) { |
| GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n"); |
| return backend_ctx; |
| } |
|
|
| for (unsigned i = 0; i < n_platforms; i++) { |
| struct cl_platform * p = &platforms[i]; |
| p->number = i; |
| p->id = platform_ids[i]; |
| CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL)); |
| CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL)); |
|
|
| cl_device_id device_ids[NDEV]; |
| cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices); |
| if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) { |
| p->n_devices = 0; |
| } else { |
| CL_CHECK(clGetDeviceIDsError); |
| } |
| p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL; |
| p->default_device = NULL; |
|
|
| for (unsigned j = 0; j < p->n_devices; j++) { |
| struct cl_device * d = &devices[n_devices]; |
| d->number = n_devices++; |
| d->id = device_ids[j]; |
| d->platform = p; |
| CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL)); |
| CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL)); |
|
|
| if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) { |
| p->default_device = d; |
| } |
| } |
|
|
| if (default_device == NULL && p->default_device != NULL) { |
| default_device = p->default_device; |
| } |
| } |
|
|
| if (n_devices == 0) { |
| GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n"); |
| return backend_ctx; |
| } |
|
|
| char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); |
| char * user_device_string = getenv("GGML_OPENCL_DEVICE"); |
| int user_platform_number = -1; |
| int user_device_number = -1; |
|
|
| unsigned n; |
| if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) { |
| user_platform_number = (int)n; |
| } |
| if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) { |
| user_device_number = (int)n; |
| } |
| if (user_platform_number != -1 && user_device_number != -1) { |
| cl_platform* platform = &platforms[user_platform_number]; |
| if ((unsigned)user_device_number >= platform->n_devices) { |
| GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number); |
| exit(1); |
| } |
| default_device = &platform->devices[user_device_number]; |
| } else { |
|
|
| struct cl_device * selected_devices = devices; |
| unsigned n_selected_devices = n_devices; |
|
|
| if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) { |
| for (unsigned i = 0; i < n_platforms; i++) { |
| struct cl_platform * p = &platforms[i]; |
| if (strstr(p->name, user_platform_string) != NULL || |
| strstr(p->vendor, user_platform_string) != NULL) { |
| user_platform_number = (int)i; |
| break; |
| } |
| } |
| if (user_platform_number == -1) { |
| GGML_LOG_ERROR("ggml_opencl: no platform matching '%s' was found.\n", user_platform_string); |
| exit(1); |
| } |
| } |
| if (user_platform_number != -1) { |
| struct cl_platform * p = &platforms[user_platform_number]; |
| selected_devices = p->devices; |
| n_selected_devices = p->n_devices; |
| default_device = p->default_device; |
| if (n_selected_devices == 0) { |
| GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name); |
| exit(1); |
| } |
| } |
|
|
| if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) { |
| for (unsigned i = 0; i < n_selected_devices; i++) { |
| struct cl_device * d = &selected_devices[i]; |
| if (strstr(d->name, user_device_string) != NULL) { |
| user_device_number = d->number; |
| break; |
| } |
| } |
| if (user_device_number == -1) { |
| GGML_LOG_ERROR("ggml_opencl: no device matching '%s' was found.\n", user_device_string); |
| exit(1); |
| } |
| } |
| if (user_device_number != -1) { |
| selected_devices = &devices[user_device_number]; |
| n_selected_devices = 1; |
| default_device = &selected_devices[0]; |
| } |
|
|
| GGML_ASSERT(n_selected_devices > 0); |
|
|
| if (default_device == NULL) { |
| default_device = &selected_devices[0]; |
| } |
| } |
|
|
| GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name); |
| GGML_LOG_INFO("ggml_opencl: selecting device: '%s'\n", default_device->name); |
| if (default_device->type != CL_DEVICE_TYPE_GPU) { |
| GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name); |
| } |
|
|
| dev_ctx->platform = default_device->platform->id; |
| dev_ctx->device = default_device->id; |
| backend_ctx->device = default_device->id; |
|
|
| if (strstr(default_device->name, "Adreno")) { |
| backend_ctx->gpu_family = GPU_FAMILY::ADRENO; |
| backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name); |
|
|
| |
| backend_ctx->adreno_wave_size = 64; |
| } else if (strstr(default_device->name, "Intel")) { |
| backend_ctx->gpu_family = GPU_FAMILY::INTEL; |
| } else { |
| GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name); |
| backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; |
| return backend_ctx; |
| } |
|
|
| #ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
| if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) { |
| GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; " |
| "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n"); |
| return backend_ctx; |
| } |
| #endif |
|
|
| |
| dev_ctx->platform_name = default_device->platform->name; |
| dev_ctx->device_name = default_device->name; |
| backend_ctx->device_name = default_device->name; |
|
|
| |
| cl_device_id device = backend_ctx->device; |
|
|
| ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id); |
|
|
| |
| ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device); |
| if (opencl_c_version.major < 2) { |
| GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n"); |
| return backend_ctx; |
| } |
|
|
| |
| size_t driver_version_str_size; |
| clGetDeviceInfo(device, CL_DRIVER_VERSION, 0, NULL, &driver_version_str_size); |
| char *driver_version = (char *)alloca(driver_version_str_size + 1); |
| clGetDeviceInfo(device, CL_DRIVER_VERSION, driver_version_str_size, driver_version, NULL); |
| driver_version[driver_version_str_size] = '\0'; |
| GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version); |
| backend_ctx->driver_version = driver_version; |
|
|
| int adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version); |
| bool has_vector_subgroup_broadcast = |
| adreno_cl_compiler_version >= 47 || adreno_cl_compiler_version == 17; |
| GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n", |
| has_vector_subgroup_broadcast ? "true" : "false"); |
|
|
| size_t ext_str_size; |
| clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size); |
| char *ext_buffer = (char *)alloca(ext_str_size + 1); |
| clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); |
| ext_buffer[ext_str_size] = '\0'; |
| |
| backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL; |
| GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false"); |
|
|
| |
| if (!backend_ctx->fp16_support) { |
| GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n"); |
| return backend_ctx; |
| } |
|
|
| |
| |
| if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL && |
| strstr(ext_buffer, "cl_intel_subgroups") == NULL) { |
| GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) " |
| "(note that subgroups is an optional feature in OpenCL 3.0)\n"); |
| return backend_ctx; |
| } |
|
|
| cl_uint base_align_in_bits; |
| CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL)); |
| GGML_ASSERT(base_align_in_bits % 8u == 0); |
| backend_ctx->alignment = base_align_in_bits / 8u; |
| GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment); |
|
|
| clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL); |
| GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024); |
|
|
| |
| cl_device_svm_capabilities svm_caps; |
| CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0)); |
| GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n", |
| svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false"); |
| GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n", |
| svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false"); |
| GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n", |
| svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false"); |
| GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n", |
| svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false"); |
|
|
| |
| #ifdef GGML_OPENCL_SOA_Q |
| GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n"); |
| #endif |
|
|
| #ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
| GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n"); |
| #endif |
|
|
| cl_context_properties properties[] = { |
| (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0 |
| }; |
|
|
| CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err)); |
|
|
| |
| cl_context context = backend_ctx->context; |
|
|
| |
| |
| |
| |
| cl_command_queue_properties command_queue_props = 0; |
| #ifdef GGML_OPENCL_PROFILING |
| command_queue_props |= CL_QUEUE_PROFILING_ENABLE; |
| #endif |
| CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err)); |
|
|
| #ifdef GGML_OPENCL_EMBED_KERNELS |
| const std::string kernel_src { |
| #include "ggml-opencl.cl.h" |
| }; |
| #else |
| const std::string kernel_src = read_file("ggml-opencl.cl"); |
| #endif |
|
|
| auto opencl_c_std = |
| std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor); |
|
|
| std::string compile_opts = std::string("-cl-std=") + opencl_c_std + |
| " -cl-mad-enable -cl-unsafe-math-optimizations" |
| " -cl-finite-math-only -cl-fast-relaxed-math"; |
| backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts); |
|
|
| |
| CL_CHECK((backend_ctx->kernel_get_rows_f32 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f32", &err), err)); |
| CL_CHECK((backend_ctx->kernel_get_rows_f16 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f16", &err), err)); |
| CL_CHECK((backend_ctx->kernel_get_rows_q4_0 = clCreateKernel(backend_ctx->program, "kernel_get_rows_q4_0", &err), err)); |
| CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program, "kernel_add", &err), err)); |
| CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program, "kernel_add_row", &err), err)); |
| CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program, "kernel_mul", &err), err)); |
| CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program, "kernel_mul_row", &err), err)); |
| CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program, "kernel_scale", &err), err)); |
| CL_CHECK((backend_ctx->kernel_silu = clCreateKernel(backend_ctx->program, "kernel_silu", &err), err)); |
| CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program, "kernel_silu_4", &err), err)); |
| CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program, "kernel_gelu", &err), err)); |
| CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_4", &err), err)); |
| CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program, "kernel_relu", &err), err)); |
| CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program, "kernel_clamp", &err), err)); |
| CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program, "kernel_norm", &err), err)); |
| CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program, "kernel_rms_norm", &err), err)); |
| CL_CHECK((backend_ctx->kernel_diag_mask_inf = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf", &err), err)); |
| CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf_8", &err), err)); |
| CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program, "kernel_soft_max", &err), err)); |
| CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4", &err), err)); |
| CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_f16", &err), err)); |
| CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4_f16", &err), err)); |
| CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f32", &err), err)); |
| CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err)); |
| CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err)); |
| CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f16", &err), err)); |
| CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f16", &err), err)); |
| CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f32", &err), err)); |
| CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f16", &err), err)); |
| CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f32", &err), err)); |
|
|
| |
| CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f32_f32", &err), err)); |
| CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f16", &err), err)); |
| CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_1row", &err), err)); |
| CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32", &err), err)); |
| CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_l4", &err), err)); |
| CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32", &err), err)); |
| CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_v", &err), err)); |
|
|
| CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_flat", &err), err)); |
| CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_convert_block_q4_0", &err), err)); |
| CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_restore_block_q4_0", &err), err)); |
| CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err)); |
|
|
| |
| #ifdef GGML_OPENCL_EMBED_KERNELS |
| const std::string kernel_src_1 { |
| #include "ggml-opencl_mm.cl.h" |
| }; |
| #else |
| const std::string kernel_src_1 = read_file("ggml-opencl_mm.cl"); |
| #endif |
| backend_ctx->program_1 = build_program_from_source(context, device, kernel_src_1.c_str(), compile_opts); |
|
|
| CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err)); |
| CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err)); |
| CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mv_q6_K_f32", &err), err)); |
| CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_v0", &err), err)); |
| CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_img_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_img_v0", &err), err)); |
|
|
| |
| #ifdef GGML_OPENCL_EMBED_KERNELS |
| const std::string kernel_src_2 { |
| #include "ggml-opencl_cvt.cl.h" |
| }; |
| #else |
| const std::string kernel_src_2 = read_file("ggml-opencl_cvt.cl"); |
| #endif |
| backend_ctx->program_2 = build_program_from_source(context, device, kernel_src_2.c_str(), compile_opts); |
|
|
| CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_2, "kernel_convert_block_q4_0_noshuffle", &err), err)); |
|
|
| |
| #ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
| #ifdef GGML_OPENCL_EMBED_KERNELS |
| const std::string transpose_32_src { |
| #include "ggml-opencl_transpose_32.cl.h" |
| }; |
| #else |
| const std::string transpose_32_src = read_file("ggml-opencl_transpose_32.cl"); |
| #endif |
| backend_ctx->program_transpose_32 = build_program_from_source(context, device, transpose_32_src.c_str(), compile_opts); |
| CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose_32, "kernel_transpose_32", &err), err)); |
|
|
| #ifdef GGML_OPENCL_EMBED_KERNELS |
| const std::string transpose_32_16_src { |
| #include "ggml-opencl_transpose_32_16.cl.h" |
| }; |
| #else |
| const std::string transpose_32_16_src = read_file("ggml-opencl_transpose_32_16.cl"); |
| #endif |
| backend_ctx->program_transpose_32_16 = build_program_from_source(context, device, transpose_32_16_src.c_str(), compile_opts); |
| CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose_32_16, "kernel_transpose_32_16", &err), err)); |
|
|
| #ifdef GGML_OPENCL_EMBED_KERNELS |
| const std::string transpose_16_src { |
| #include "ggml-opencl_transpose_16.cl.h" |
| }; |
| #else |
| const std::string transpose_16_src = read_file("ggml-opencl_transpose_16.cl"); |
| #endif |
| backend_ctx->program_transpose_16 = build_program_from_source(context, device, transpose_16_src.c_str(), compile_opts); |
| CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err)); |
|
|
| |
| std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + |
| " -cl-mad-enable " |
| " -DSIMDGROUP_WIDTH=" + |
| std::to_string(backend_ctx->adreno_wave_size); |
| if (has_vector_subgroup_broadcast) { |
| CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; |
| } |
| #ifdef GGML_OPENCL_EMBED_KERNELS |
| const std::string kernel_src_CL_gemv_general { |
| #include "ggml-opencl_gemv_noshuffle_general.cl.h" |
| }; |
| #else |
| const std::string kernel_src_CL_gemv_general = read_file("ggml-opencl_gemv_noshuffle_general.cl"); |
| #endif |
|
|
| backend_ctx->program_CL_gemv_general = build_program_from_source( |
| context, device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts); |
| CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err)); |
|
|
| |
| CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + |
| " -cl-mad-enable " |
| " -DLINE_STRIDE_A=2048 " |
| " -DBLOCK_STRIDE_A=16384 " |
| " -DSIMDGROUP_WIDTH=" + |
| std::to_string(backend_ctx->adreno_wave_size); |
| if (has_vector_subgroup_broadcast) { |
| CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; |
| } |
| #ifdef GGML_OPENCL_EMBED_KERNELS |
| const std::string kernel_src_CL_gemv { |
| #include "ggml-opencl_gemv_noshuffle.cl.h" |
| }; |
| #else |
| const std::string kernel_src_CL_gemv = read_file("ggml-opencl_gemv_noshuffle.cl"); |
| #endif |
|
|
| backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source( |
| context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts); |
| CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err)); |
|
|
| |
| CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + |
| " -cl-mad-enable " |
| " -DLINE_STRIDE_A=2048 " |
| " -DBLOCK_STRIDE_A=16384 " |
| " -DSIMDGROUP_WIDTH=" + |
| std::to_string(backend_ctx->adreno_wave_size); |
| if (has_vector_subgroup_broadcast) { |
| CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; |
| } |
|
|
| backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source( |
| context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts); |
| CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err)); |
|
|
| |
| CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + |
| " -cl-mad-enable " |
| " -DLINE_STRIDE_A=5504 " |
| " -DBLOCK_STRIDE_A=44032 " |
| " -DSIMDGROUP_WIDTH=" + |
| std::to_string(backend_ctx->adreno_wave_size); |
| if (has_vector_subgroup_broadcast) { |
| CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; |
| } |
|
|
| backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source( |
| context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts); |
| CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err)); |
|
|
| |
| CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + |
| " -cl-mad-enable " |
| " -DLINE_STRIDE_A=16000 " |
| " -DBLOCK_STRIDE_A=128000 " |
| " -DSIMDGROUP_WIDTH=" + |
| std::to_string(backend_ctx->adreno_wave_size); |
| if (has_vector_subgroup_broadcast) { |
| CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; |
| } |
|
|
| backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts); |
| CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err)); |
|
|
| |
| #ifdef GGML_OPENCL_EMBED_KERNELS |
| const std::string kernel_src_CL_gemm { |
| #include "ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h" |
| }; |
| #else |
| const std::string kernel_src_CL_gemm = read_file("ggml-opencl_mul_mat_Ab_Bi_8x4.cl"); |
| #endif |
| backend_ctx->program_CL_gemm = build_program_from_source(context, device, kernel_src_CL_gemm.c_str(), compile_opts); |
| CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err)); |
|
|
| |
| size_t max_A_q_d_bytes = 311164928; |
| size_t max_A_s_d_bytes = 38895616; |
| size_t max_B_d_bytes = 45088768; |
|
|
| CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err)); |
| CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err)); |
| CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err)); |
| #endif |
|
|
| |
| ggml_backend_opencl_n_devices = 1; |
|
|
| return backend_ctx; |
| } |
|
|
| static void ggml_cl2_free(void) { |
| #ifdef GGML_OPENCL_PROFILING |
| FILE * fperf = fopen("cl_profiling.csv", "w"); |
| if (!fperf) { |
| GGML_LOG_ERROR("Failed to open cl_profiling.csv\n"); |
| return; |
| } |
|
|
| float total_kernel_time = 0; |
| fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n"); |
| for (const ProfilingInfo & info : g_profiling_info) { |
| total_kernel_time += info.duration_ns/1.e6f; |
| fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", |
| info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f, |
| info.global_size[0], info.global_size[1], info.global_size[2], |
| info.local_size[0], info.local_size[2], info.local_size[2], |
| info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); |
| } |
| fclose(fperf); |
|
|
| GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); |
| #endif |
| } |
|
|
| |
| |
| |
| struct ggml_tensor_extra_cl { |
| |
| cl_mem data_device; |
| |
| |
| |
| |
| cl_ulong offset; |
| |
| |
| size_t actual_size; |
|
|
| void reset() { |
| data_device = nullptr; |
| offset = 0; |
| actual_size = 0; |
| } |
| }; |
|
|
| |
| |
| |
| |
| struct ggml_tensor_extra_cl_q4_0 { |
| |
| cl_mem q = nullptr; |
| |
| cl_mem q_img = nullptr; |
| |
| cl_mem d = nullptr; |
| |
| cl_mem d_img = nullptr; |
| |
| size_t size_q = 0; |
| |
| size_t size_d = 0; |
|
|
| ~ggml_tensor_extra_cl_q4_0() { |
| reset(); |
| } |
|
|
| void reset() { |
| |
| |
| |
| if (q != nullptr) { |
| CL_CHECK(clReleaseMemObject(q)); |
| q = nullptr; |
| } |
| if (d != nullptr) { |
| CL_CHECK(clReleaseMemObject(d)); |
| d = nullptr; |
| } |
| |
| |
| |
| |
| q_img = nullptr; |
| d_img = nullptr; |
| size_q = 0; |
| size_d = 0; |
| } |
| }; |
|
|
| |
| |
| |
|
|
| |
| |
| |
| static const char * ggml_backend_opencl_name(ggml_backend_t backend) { |
| return "OpenCL"; |
|
|
| UNUSED(backend); |
| } |
|
|
| static void ggml_backend_opencl_free(ggml_backend_t backend) { |
| ggml_cl2_free(); |
|
|
| GGML_UNUSED(backend); |
| } |
|
|
| static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { |
| GGML_UNUSED(backend); |
| GGML_UNUSED(tensor); |
| GGML_UNUSED(data); |
| GGML_UNUSED(offset); |
| GGML_UNUSED(size); |
| } |
|
|
| static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { |
| GGML_UNUSED(backend); |
| GGML_UNUSED(tensor); |
| GGML_UNUSED(data); |
| GGML_UNUSED(offset); |
| GGML_UNUSED(size); |
| } |
|
|
| static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { |
| GGML_UNUSED(backend); |
| GGML_UNUSED(src); |
| GGML_UNUSED(dst); |
| return false; |
| } |
|
|
| static void ggml_backend_opencl_synchronize(ggml_backend_t backend) { |
| GGML_UNUSED(backend); |
| } |
|
|
| static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { |
| for (int i = 0; i < cgraph->n_nodes; i++) { |
| ggml_tensor * node = cgraph->nodes[i]; |
|
|
| if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { |
| continue; |
| } |
|
|
| bool ok = ggml_cl_compute_forward(backend, node); |
| if (!ok) { |
| GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); |
| } |
| GGML_ASSERT(ok); |
| } |
|
|
| return GGML_STATUS_SUCCESS; |
| } |
|
|
| static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { |
| GGML_UNUSED(dev); |
|
|
| switch (op->op) { |
| case GGML_OP_NONE: |
| return true; |
| case GGML_OP_GET_ROWS: |
| switch (op->src[0]->type) { |
| case GGML_TYPE_F32: |
| case GGML_TYPE_F16: |
| return true; |
| case GGML_TYPE_Q4_0: |
| #ifdef GGML_OPENCL_SOA_Q |
| |
| return false; |
| #else |
| return true; |
| #endif |
| default: |
| return false; |
| } |
| case GGML_OP_CPY: |
| case GGML_OP_DUP: |
| case GGML_OP_CONT: |
| switch (op->src[0]->type) { |
| case GGML_TYPE_F32: |
| switch (op->type) { |
| case GGML_TYPE_F16: |
| case GGML_TYPE_F32: |
| return true; |
| default: |
| return false; |
| } |
| case GGML_TYPE_F16: |
| switch (op->type) { |
| case GGML_TYPE_F16: |
| case GGML_TYPE_F32: |
| return true; |
| default: |
| return false; |
| } |
| default: |
| return false; |
| } |
| case GGML_OP_ADD: |
| case GGML_OP_SCALE: |
| case GGML_OP_MUL: |
| return op->src[0]->type == GGML_TYPE_F32; |
| case GGML_OP_UNARY: |
| switch (ggml_get_unary_op(op)) { |
| case GGML_UNARY_OP_GELU: |
| case GGML_UNARY_OP_SILU: |
| case GGML_UNARY_OP_RELU: |
| return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; |
| default: |
| return false; |
| } |
| case GGML_OP_CLAMP: |
| return op->src[0]->type == GGML_TYPE_F32; |
| case GGML_OP_SOFT_MAX: |
| case GGML_OP_NORM: |
| case GGML_OP_RMS_NORM: |
| return true; |
| case GGML_OP_MUL_MAT: |
| if (op->src[0]->type == GGML_TYPE_F16) { |
| return true; |
| } else if (op->src[0]->type == GGML_TYPE_F32) { |
| return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); |
| } else if (op->src[0]->type == GGML_TYPE_Q4_0 || |
| op->src[0]->type == GGML_TYPE_Q6_K) { |
| return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); |
| } |
| return false; |
| case GGML_OP_RESHAPE: |
| case GGML_OP_VIEW: |
| case GGML_OP_PERMUTE: |
| case GGML_OP_TRANSPOSE: |
| return true; |
| case GGML_OP_DIAG_MASK_INF: |
| return op->ne[3] == 1; |
| case GGML_OP_ROPE: { |
| const int mode = ((const int32_t *) op->op_params)[2]; |
| if (mode & GGML_ROPE_TYPE_MROPE) { |
| return false; |
| } |
| if (mode & GGML_ROPE_TYPE_VISION) { |
| return false; |
| } |
| return true; |
| } |
| default: |
| return false; |
| } |
| } |
|
|
| |
| static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type); |
|
|
| static ggml_guid_t ggml_backend_opencl_guid() { |
| static ggml_guid guid = { 0xde, 0xe0, 0x70, 0xa2, 0x73, 0x4e, 0x4d, 0xbc, 0xb0, 0xc7, 0x4f, 0xd4, 0x6d, 0x4e, 0x90, 0xfe }; |
| return &guid; |
| } |
|
|
| static ggml_backend_i ggml_backend_opencl_i = { |
| ggml_backend_opencl_name, |
| ggml_backend_opencl_free, |
| NULL, |
| NULL, |
| NULL, |
| NULL, |
| NULL, |
| NULL, |
| NULL, |
| NULL, |
| ggml_backend_opencl_graph_compute, |
| NULL, |
| NULL, |
| }; |
|
|
| ggml_backend_t ggml_backend_opencl_init(void) { |
| ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0); |
| ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev); |
|
|
| ggml_backend_t backend = new ggml_backend { |
| ggml_backend_opencl_guid(), |
| ggml_backend_opencl_i, |
| dev, |
| backend_ctx |
| }; |
|
|
| return backend; |
| } |
|
|
| bool ggml_backend_is_opencl(ggml_backend_t backend) { |
| return backend && backend->iface.get_name == ggml_backend_opencl_name; |
| } |
|
|
| |
| |
| |
| struct ggml_backend_opencl_buffer_context { |
| |
| |
| |
| |
| |
| ggml_backend_opencl_buffer_context(cl_mem buf) |
| : name("OpenCL") { |
| buffer.push_back(buf); |
| } |
|
|
| ~ggml_backend_opencl_buffer_context() { |
| for (cl_mem buf : buffer) { |
| CL_CHECK(clReleaseMemObject(buf)); |
| } |
| for (cl_mem im : img) { |
| CL_CHECK(clReleaseMemObject(im)); |
| } |
|
|
| |
| for (ggml_tensor_extra_cl * e : temp_tensor_extras) { |
| delete e; |
| } |
| for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) { |
| delete e; |
| } |
| for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0) { |
| delete e; |
| } |
| for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) { |
| delete e; |
| } |
| } |
|
|
| ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() { |
| ggml_tensor_extra_cl * extra; |
| if (temp_tensor_extras.empty()) { |
| extra = new ggml_tensor_extra_cl(); |
| } else { |
| extra = temp_tensor_extras.back(); |
| temp_tensor_extras.pop_back(); |
| } |
|
|
| temp_tensor_extras_in_use.push_back(extra); |
|
|
| extra->reset(); |
| return extra; |
| } |
|
|
| ggml_tensor_extra_cl_q4_0 * ggml_opencl_alloc_temp_tensor_extra_q4_0() { |
| ggml_tensor_extra_cl_q4_0 * extra; |
| if (temp_tensor_extras_q4_0.empty()) { |
| extra = new ggml_tensor_extra_cl_q4_0(); |
| } else { |
| extra = temp_tensor_extras_q4_0.back(); |
| temp_tensor_extras_q4_0.pop_back(); |
| } |
|
|
| temp_tensor_extras_q4_0_in_use.push_back(extra); |
|
|
| extra->reset(); |
| return extra; |
| } |
|
|
| void reset() { |
| for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) { |
| temp_tensor_extras.push_back(e); |
| } |
| temp_tensor_extras_in_use.clear(); |
|
|
| for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) { |
| temp_tensor_extras_q4_0.push_back(e); |
| } |
| temp_tensor_extras_q4_0_in_use.clear(); |
| } |
|
|
| |
| |
| |
| |
| |
| std::vector<ggml_tensor_extra_cl *> temp_tensor_extras; |
| std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use; |
| std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0; |
| std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use; |
|
|
| |
| |
| |
| |
| |
| |
| |
| std::vector<cl_mem> buffer; |
| |
| |
| |
| |
| std::vector<cl_mem> img; |
| std::string name; |
| }; |
|
|
| static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { |
| ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; |
| delete ctx; |
| } |
|
|
| static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { |
| ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device); |
| return (void *) (uintptr_t) backend_ctx->alignment; |
| } |
|
|
| static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { |
| ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; |
|
|
| ggml_cl2_init(buffer->buft->device); |
|
|
| if (tensor->view_src != nullptr) { |
| GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); |
|
|
| ggml_tensor_extra_cl * view_extra = (ggml_tensor_extra_cl *) tensor->view_src->extra; |
| GGML_ASSERT(view_extra && "view_extra is nullptr?"); |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| tensor->extra = view_extra; |
| } else { |
| { |
| size_t offset = (char *) tensor->data - (char *) ggml_backend_opencl_buffer_get_base(buffer); |
|
|
| ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra(); |
| extra->offset = offset; |
| extra->data_device = ctx->buffer[0]; |
| extra->actual_size = ggml_nbytes(tensor); |
|
|
| tensor->extra = extra; |
| } |
| } |
| return GGML_STATUS_SUCCESS; |
| } |
|
|
| |
| |
| inline bool use_adreno_kernels(const ggml_tensor *tensor) { |
| return tensor->ne[0] >= 512 && tensor->ne[1] >= 512 && |
| tensor->ne[2] == 1 && tensor->ne[3] == 1; |
| } |
|
|
| static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { |
| ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device); |
|
|
| cl_context context = backend_ctx->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| #ifdef GGML_OPENCL_SOA_Q |
| |
| |
| |
| |
| |
| if (tensor->type == GGML_TYPE_Q4_0) { |
| |
| |
| ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra; |
| GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized"); |
|
|
| |
| ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; |
| ggml_tensor_extra_cl_q4_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_0(); |
|
|
| size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t); |
| size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2; |
| GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size"); |
|
|
| cl_int err; |
| cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, |
| ggml_nbytes(tensor), NULL, &err); |
| CL_CHECK(err); |
| CL_CHECK(clEnqueueWriteBuffer( |
| queue, data_device, CL_TRUE, 0, |
| ggml_nbytes(tensor), data, 0, NULL, NULL)); |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| cl_buffer_region region; |
|
|
| |
| |
| |
| region.origin = extra_orig->offset + tensor->view_offs + offset; |
| region.size = size_d; |
| extra->d = clCreateSubBuffer( |
| extra_orig->data_device, CL_MEM_READ_WRITE, |
| CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); |
| CL_CHECK(err); |
|
|
| |
| region.origin = extra_orig->offset + tensor->view_offs + offset + size_d; |
| region.size = size_q; |
| extra->q = clCreateSubBuffer( |
| extra_orig->data_device, CL_MEM_READ_WRITE, |
| CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); |
| CL_CHECK(err); |
|
|
| |
| #ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
| cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0; |
|
|
| |
| if (use_adreno_kernels(tensor)) { |
| kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle; |
| } |
| #else |
| cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0; |
| #endif |
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); |
|
|
| size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; |
| size_t local_work_size[] = {64, 1, 1}; |
|
|
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
| CL_CHECK(clWaitForEvents(1, &evt)); |
| CL_CHECK(clReleaseMemObject(data_device)); |
|
|
| tensor->extra = extra; |
|
|
| |
| #ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
| |
| |
| if (use_adreno_kernels(tensor)) { |
| |
| |
| |
| int M = tensor->ne[1]; |
| int K = tensor->ne[0]; |
|
|
| |
| GGML_ASSERT(K % 32 == 0); |
| |
| GGML_ASSERT(M % 4 == 0); |
|
|
| |
| |
| |
|
|
| size_t q_size_bytes = K * M / 8 * sizeof(float); |
| cl_buffer_region region; |
| region.origin = 0; |
| region.size = q_size_bytes; |
| cl_mem qT_d = clCreateSubBuffer( |
| backend_ctx->A_q_d_max, |
| 0, |
| CL_BUFFER_CREATE_TYPE_REGION, |
| ®ion, |
| &err); |
| |
| CL_CHECK(err); |
|
|
| |
| size_t d_size_bytes = M * (K / 32) * 2; |
| region.origin = 0; |
| region.size = d_size_bytes; |
| cl_mem dT_d = clCreateSubBuffer( |
| backend_ctx->A_s_d_max, |
| 0, |
| CL_BUFFER_CREATE_TYPE_REGION, |
| ®ion, |
| &err); |
| |
| CL_CHECK(err); |
|
|
| |
|
|
|
|
| |
| |
| cl_mem q_d_image1D; |
| cl_mem d_d_image1D; |
| cl_mem qT_d_image1D; |
| cl_mem dT_d_image1D; |
|
|
| cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT }; |
| cl_image_desc img_desc_1d; |
|
|
| memset(&img_desc_1d, 0, sizeof(img_desc_1d)); |
| img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; |
| img_desc_1d.image_width = M * K / 4 / 4; |
| img_desc_1d.buffer = extra->q; |
| q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); |
| CL_CHECK(err); |
|
|
| img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT }; |
| memset(&img_desc_1d, 0, sizeof(img_desc_1d)); |
| img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; |
| img_desc_1d.image_width = M * K / 4 / 4; |
| img_desc_1d.buffer = qT_d; |
| qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); |
| CL_CHECK(err); |
|
|
| img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT }; |
| memset(&img_desc_1d, 0, sizeof(img_desc_1d)); |
| img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; |
| img_desc_1d.image_width = M * K / 32 / 4; |
| img_desc_1d.buffer = extra->d; |
| d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); |
| CL_CHECK(err); |
|
|
| img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT }; |
| memset(&img_desc_1d, 0, sizeof(img_desc_1d)); |
| img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; |
| img_desc_1d.image_width = M * K / 32 / 4; |
| img_desc_1d.buffer = dT_d; |
| dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); |
| CL_CHECK(err); |
| |
|
|
| |
| |
| |
| int height_q = M / 4; |
| int width_q = K / 4 / 4; |
| kernel = backend_ctx->kernel_transpose_16; |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q)); |
|
|
| size_t local_size_q[3] = {4, 16, 1}; |
| size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1}; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt)); |
| CL_CHECK(clWaitForEvents(1, &evt)); |
|
|
| |
| int height_s = M / 4; |
| int width_s = K / 32 / 4; |
|
|
| kernel = backend_ctx->kernel_transpose_16; |
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s)); |
|
|
| size_t local_size_s[3] = {4, 16, 1}; |
| size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1}; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt)); |
| CL_CHECK(clWaitForEvents(1, &evt)); |
| |
|
|
| |
| |
| |
| CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt)); |
| CL_CHECK(clWaitForEvents(1, &evt)); |
|
|
| |
| CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt)); |
| CL_CHECK(clWaitForEvents(1, &evt)); |
| |
|
|
| |
| |
| CL_CHECK(clReleaseMemObject(qT_d)); |
| CL_CHECK(clReleaseMemObject(dT_d)); |
|
|
| |
| CL_CHECK(clReleaseMemObject(q_d_image1D)); |
| CL_CHECK(clReleaseMemObject(d_d_image1D)); |
| CL_CHECK(clReleaseMemObject(qT_d_image1D)); |
| CL_CHECK(clReleaseMemObject(dT_d_image1D)); |
| |
| |
| |
| } |
| #endif |
|
|
| return; |
| } |
| #endif |
|
|
| ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra; |
| GGML_ASSERT(extra); |
|
|
| CL_CHECK(clEnqueueWriteBuffer( |
| queue, extra->data_device, CL_TRUE, extra->offset + offset, |
| size, data, 0, NULL, NULL)); |
|
|
| GGML_UNUSED(buffer); |
| } |
|
|
| static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { |
| GGML_ASSERT(tensor->extra); |
|
|
| ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device); |
|
|
| cl_context context = backend_ctx->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| |
| CL_CHECK(clFinish(queue)); |
|
|
| #ifdef GGML_OPENCL_SOA_Q |
| |
| |
| |
| |
| |
| |
| if (tensor->type == GGML_TYPE_Q4_0) { |
| ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra; |
|
|
| cl_int err; |
| cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, |
| ggml_nbytes(tensor), NULL, &err); |
| CL_CHECK(err); |
|
|
| cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0; |
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device)); |
|
|
| size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; |
| size_t local_work_size[] = {1, 1, 1}; |
|
|
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, |
| global_work_size, local_work_size, 0, NULL, &evt)); |
| CL_CHECK(clWaitForEvents(1, &evt)); |
| CL_CHECK(clEnqueueReadBuffer( |
| queue, data_device, CL_TRUE, offset, |
| size, data, 0, NULL, NULL)); |
| CL_CHECK(clReleaseMemObject(data_device)); |
| return; |
| } |
| #endif |
|
|
| ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra; |
|
|
| CL_CHECK(clEnqueueReadBuffer( |
| queue, extra->data_device, CL_TRUE, extra->offset + tensor->view_offs + offset, |
| size, data, 0, NULL, NULL)); |
|
|
| GGML_UNUSED(buffer); |
| } |
|
|
| static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { |
| ggml_backend_dev_t dev = buffer->buft->device; |
| ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev); |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; |
| for (cl_mem buf : ctx->buffer) { |
| CL_CHECK(clEnqueueFillBuffer(queue, buf, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL)); |
| } |
| CL_CHECK(clFinish(queue)); |
| } |
|
|
| static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) { |
| ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; |
| ctx->reset(); |
| } |
|
|
| static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = { |
| ggml_backend_opencl_buffer_free_buffer, |
| ggml_backend_opencl_buffer_get_base, |
| ggml_backend_opencl_buffer_init_tensor, |
| NULL, |
| ggml_backend_opencl_buffer_set_tensor, |
| ggml_backend_opencl_buffer_get_tensor, |
| NULL, |
| ggml_backend_opencl_buffer_clear, |
| ggml_backend_opencl_buffer_reset, |
| }; |
|
|
| |
| |
| |
|
|
| static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type) { |
| return "OpenCL"; |
|
|
| GGML_UNUSED(buffer_type); |
| } |
|
|
| static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) { |
| ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer_type->device); |
|
|
| |
| size = std::max(size, (size_t)1); |
|
|
| cl_int err; |
| cl_mem mem = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, size, NULL, &err); |
| if (err != CL_SUCCESS) { |
| GGML_LOG_INFO("%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0); |
| return nullptr; |
| } |
|
|
| ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context(mem); |
|
|
| return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size); |
| } |
|
|
| static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) { |
| |
| static cl_uint alignment = -1; |
| if (alignment == (cl_uint)-1) { |
| ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device); |
| alignment = backend_ctx->alignment; |
| } |
| return alignment; |
| } |
|
|
| static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) { |
| static size_t max_size = -1; |
| if (max_size == (size_t)-1) { |
| ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device); |
| max_size = backend_ctx->max_alloc_size; |
| } |
| return max_size; |
| } |
|
|
| static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { |
| return ggml_backend_is_opencl(backend); |
|
|
| UNUSED(buft); |
| } |
|
|
| static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = { |
| ggml_backend_opencl_buffer_type_get_name, |
| ggml_backend_opencl_buffer_type_alloc_buffer, |
| ggml_backend_opencl_buffer_type_get_alignment, |
| ggml_backend_opencl_buffer_type_get_max_size, |
| NULL, |
| NULL, |
| }; |
|
|
| ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() { |
| static ggml_backend_buffer_type buffer_type = { |
| ggml_backend_opencl_buffer_type_interface, |
| &g_ggml_backend_opencl_device, |
| nullptr, |
| }; |
|
|
| return &buffer_type; |
| } |
|
|
| |
| |
| |
|
|
| static const char * ggml_backend_opencl_device_get_name(ggml_backend_dev_t dev) { |
| return "GPUOpenCL"; |
|
|
| GGML_UNUSED(dev); |
| } |
|
|
| static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_t dev) { |
| ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *) dev->context; |
| return dev_ctx->device_name.c_str(); |
| } |
|
|
| static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { |
| *free = 1; |
| *total = 1; |
|
|
| GGML_UNUSED(dev); |
| } |
|
|
| static enum ggml_backend_dev_type ggml_backend_opencl_device_get_type(ggml_backend_dev_t dev) { |
| return GGML_BACKEND_DEVICE_TYPE_GPU; |
|
|
| GGML_UNUSED(dev); |
| } |
|
|
| static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { |
| props->name = ggml_backend_opencl_device_get_name(dev); |
| props->description = ggml_backend_opencl_device_get_description(dev); |
| props->type = ggml_backend_opencl_device_get_type(dev); |
| ggml_backend_opencl_device_get_memory(dev, &props->memory_free, &props->memory_total); |
| props->caps = ggml_backend_dev_caps { |
| false, |
| false, |
| false, |
| false, |
| }; |
| } |
|
|
| static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) { |
| ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev); |
|
|
| ggml_backend_t backend = new ggml_backend { |
| ggml_backend_opencl_guid(), |
| ggml_backend_opencl_i, |
| dev, |
| backend_ctx, |
| }; |
|
|
| return backend; |
|
|
| GGML_UNUSED(params); |
| } |
|
|
| static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) { |
| return ggml_backend_opencl_buffer_type(); |
|
|
| GGML_UNUSED(dev); |
| } |
|
|
| static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { |
| GGML_UNUSED(dev); |
| GGML_UNUSED(ptr); |
| GGML_UNUSED(size); |
| GGML_UNUSED(max_tensor_size); |
| return nullptr; |
| } |
|
|
| static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { |
| return ggml_opencl_supports_op(dev, op); |
| } |
|
|
| static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { |
| return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name; |
|
|
| GGML_UNUSED(dev); |
| } |
|
|
| static struct ggml_backend_device_i ggml_backend_opencl_device_i = { |
| ggml_backend_opencl_device_get_name, |
| ggml_backend_opencl_device_get_description, |
| ggml_backend_opencl_device_get_memory, |
| ggml_backend_opencl_device_get_type, |
| ggml_backend_opencl_device_get_props, |
| ggml_backend_opencl_device_init, |
| ggml_backend_opencl_device_get_buffer_type, |
| NULL, |
| ggml_backend_opencl_device_buffer_from_ptr, |
| ggml_backend_opencl_device_supports_op, |
| ggml_backend_opencl_device_supports_buft, |
| NULL, |
| NULL, |
| NULL, |
| NULL, |
| }; |
|
|
| |
|
|
| static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) { |
| return "OpenCL"; |
|
|
| GGML_UNUSED(reg); |
| } |
|
|
| static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) { |
| return ggml_backend_opencl_n_devices; |
|
|
| GGML_UNUSED(reg); |
| } |
|
|
| static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) { |
| GGML_ASSERT(index == 0); |
|
|
| return &g_ggml_backend_opencl_device; |
|
|
| GGML_UNUSED(reg); |
| GGML_UNUSED(index); |
| } |
|
|
| static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = { |
| ggml_backend_opencl_reg_get_name, |
| ggml_backend_opencl_reg_device_count, |
| ggml_backend_opencl_reg_device_get, |
| NULL, |
| }; |
|
|
| ggml_backend_reg_t ggml_backend_opencl_reg(void) { |
| |
| static ggml_backend_reg reg; |
| static bool initialized = false; |
|
|
| if (!initialized) { |
| reg = ggml_backend_reg { |
| GGML_BACKEND_API_VERSION, |
| ggml_backend_opencl_reg_i, |
| NULL, |
| }; |
|
|
| g_ggml_backend_opencl_device = ggml_backend_device { |
| ggml_backend_opencl_device_i, |
| ®, |
| &g_ggml_ctx_dev_main, |
| }; |
|
|
| ggml_cl2_init(&g_ggml_backend_opencl_device); |
|
|
| initialized = true; |
| } |
|
|
| return ® |
| } |
|
|
| GGML_BACKEND_DL_IMPL(ggml_backend_opencl_reg) |
|
|
| |
| |
| |
| #if 0 |
| #define QK4_0 32 |
| typedef struct { |
| ggml_fp16_t d; |
| uint8_t qs[QK4_0 / 2]; |
| } block_q4_0; |
| static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, |
| "wrong q4_0 block size/padding"); |
|
|
| #include <math.h> |
| #ifdef __cplusplus |
| #include "half.hpp" |
| #endif |
|
|
| static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tensor) { |
| void * buf = malloc(ggml_nbytes(tensor)); |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
| #ifdef GGML_OPENCL_SOA_Q |
| void * buf_q; |
| void * buf_d; |
| #endif |
|
|
| |
| CL_CHECK(clFinish(queue)); |
|
|
| #ifdef GGML_OPENCL_SOA_Q |
| if (tensor->type == GGML_TYPE_Q4_0) { |
| ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *) tensor->extra; |
| GGML_ASSERT(extra); |
|
|
| size_t size_q = ggml_nelements(tensor)/QK4_0 * QK4_0/2; |
| size_t size_d = ggml_nelements(tensor)/QK4_0 * sizeof(ggml_fp16_t); |
| GGML_ASSERT(size_q + size_d == ggml_nbytes(tensor)); |
| buf_q = malloc(size_q); |
| buf_d = malloc(size_d); |
|
|
| CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL)); |
| CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_d, buf_d, 0, NULL, NULL)); |
| CL_CHECK(clFinish(queue)); |
| } else { |
| |
| ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra; |
| GGML_ASSERT(extra); |
|
|
| CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE, |
| extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL)); |
| CL_CHECK(clFinish(queue)); |
| } |
| #else |
| |
| ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra; |
| GGML_ASSERT(extra); |
|
|
| CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE, |
| extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL)); |
| CL_CHECK(clFinish(queue)); |
| #endif |
|
|
| |
| char fname[512]; |
| sprintf(fname, "./tensor-dumps/%s.txt", tensor->name); |
| FILE * f = fopen(fname, "w"); |
| if (!f) { |
| printf("Failed to open %s\n", fname); |
| return; |
| } |
|
|
| if (tensor->type == GGML_TYPE_F32) { |
| float * data = (float *) buf; |
| for (int i = 0; i < ggml_nelements(tensor); ++i) { |
| if (isnan(data[i])) { |
| printf("NaN found: %s\n", tensor->name); |
| break; |
| } |
| fprintf(f, "%f\n", data[i]); |
| } |
| } else if (tensor->type == GGML_TYPE_I32) { |
| int * data = (int *) buf; |
| for (int i = 0; i < ggml_nelements(tensor); ++i) { |
| if (isnan(data[i])) { |
| printf("NaN found: %s\n", tensor->name); |
| break; |
| } |
| fprintf(f, "%d\n", data[i]); |
| } |
| } else if (tensor->type == GGML_TYPE_F16) { |
| #ifdef __cplusplus |
| half_float::half * data = (half_float::half *) buf; |
| for (int i = 0; i < ggml_nelements(tensor); ++i) { |
| if (std::isnan(data[i])) { |
| printf("NaN found: %s\n", tensor->name); |
| break; |
| } |
| fprintf(f, "%f\n", float(data[i])); |
| } |
| #endif |
| } else if (tensor->type == GGML_TYPE_Q4_0) { |
| #ifdef GGML_OPENCL_SOA_Q |
| ggml_fp16_t * data_d = (ggml_fp16_t *)buf_d; |
| unsigned char * data_q = (unsigned char *)buf_q; |
|
|
| for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) { |
| fprintf(f, "%04x, ", data_d[i]); |
| for (int k = 0; k < QK4_0/2; ++k) { |
| fprintf(f, "%02x, ", data_q[k]); |
| } |
| fprintf(f, "\n"); |
| data_q += QK4_0/2; |
| } |
| free(buf_d); |
| free(buf_q); |
| #else |
| block_q4_0 * data = (block_q4_0 *) buf; |
| for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) { |
| fprintf(f, "%04x, ", data[i].d); |
| for (int k = 0; k < QK4_0/2; ++k) { |
| fprintf(f, "%02x, ", data[i].qs[k]); |
| } |
| fprintf(f, "\n"); |
| } |
| #endif |
| } |
| free(buf); |
| fflush(f); |
| fclose(f); |
| } |
| #else |
| #define dump_tensor(tensor) |
| #endif |
|
|
| |
| |
| |
| #ifdef GGML_OPENCL_PROFILING |
| void populateProfilingInfo( |
| ProfilingInfo& info, cl_event evt, cl_kernel kernel, |
| size_t global_size[3], size_t local_size[3], |
| const ggml_tensor * tensor) { |
| cl_ulong start; |
| cl_ulong end; |
| CL_CHECK(clWaitForEvents(1, &evt)); |
| CL_CHECK(clGetEventProfilingInfo( |
| evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL)); |
| CL_CHECK(clGetEventProfilingInfo( |
| evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); |
|
|
| char kernel_name[512]; |
| CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, |
| sizeof(kernel_name), kernel_name, NULL)); |
|
|
| info.duration_ns = end - start; |
| info.op_name = tensor->name; |
| info.kernel_name = kernel_name; |
| info.local_size[0] = local_size[0]; |
| info.local_size[1] = local_size[1]; |
| info.local_size[2] = local_size[2]; |
| info.global_size[0] = global_size[0]; |
| info.global_size[1] = global_size[1]; |
| info.global_size[2] = global_size[2]; |
| info.output_size[0] = tensor->ne[0]; |
| info.output_size[1] = tensor->ne[1]; |
| info.output_size[2] = tensor->ne[2]; |
| info.output_size[3] = tensor->ne[3]; |
| } |
| #endif |
|
|
| |
| |
| |
|
|
| static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { |
| const int64_t ne10 = src1->ne[0]; |
|
|
| const int64_t ne0 = dst->ne[0]; |
| const int64_t ne1 = dst->ne[1]; |
|
|
| |
| return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && |
| src1->type == GGML_TYPE_F32 && |
| dst->type == GGML_TYPE_F32 && |
| (ne0 >= 32 && ne1 >= 32 && ne10 >= 32); |
| } |
|
|
| static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| UNUSED(backend); |
| UNUSED(src0); |
| UNUSED(src1); |
| UNUSED(dst); |
| } |
|
|
| static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(src1); |
| GGML_ASSERT(src1->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
|
|
| const int ne00 = src0 ? src0->ne[0] : 0; |
| const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
| const cl_ulong nb02 = src0 ? src0->nb[2] : 0; |
| const int ne10 = src1 ? src1->ne[0] : 0; |
| const cl_ulong nb10 = src1 ? src1->nb[0] : 0; |
| const int ne11 = src1 ? src1->ne[1] : 0; |
| const cl_ulong nb11 = src1 ? src1->nb[1] : 0; |
| const cl_ulong nb1 = dst ? dst->nb[1] : 0; |
| const cl_ulong nb2 = dst ? dst->nb[2] : 0; |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offset1 = extra1->offset + src1->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| cl_kernel kernel; |
|
|
| switch (src0->type) { |
| case GGML_TYPE_F32: |
| kernel = backend_ctx->kernel_get_rows_f32; |
| break; |
| case GGML_TYPE_F16: |
| kernel = backend_ctx->kernel_get_rows_f16; |
| break; |
| case GGML_TYPE_Q4_0: |
| kernel = backend_ctx->kernel_get_rows_q4_0; |
| break; |
| default: |
| GGML_ASSERT(false && "not implemented"); |
| } |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb10)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11)); |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb1)); |
| CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb2)); |
|
|
| size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1}; |
| size_t local_work_size[] = {1, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
|
|
| static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(src1); |
| GGML_ASSERT(src1->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
|
|
| const int ne00 = src0 ? src0->ne[0] : 0; |
| const int ne01 = src0 ? src0->ne[1] : 0; |
| const int ne02 = src0 ? src0->ne[2] : 0; |
| const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
| const cl_ulong nb00 = src0 ? src0->nb[0] : 0; |
| const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
| const cl_ulong nb02 = src0 ? src0->nb[2] : 0; |
| const cl_ulong nb03 = src0 ? src0->nb[3] : 0; |
|
|
| const int ne10 = src1 ? src1->ne[0] : 0; |
| const int ne11 = src1 ? src1->ne[1] : 0; |
| const int ne12 = src1 ? src1->ne[2] : 0; |
| const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); |
|
|
| const cl_ulong nb10 = src1 ? src1->nb[0] : 0; |
| const cl_ulong nb11 = src1 ? src1->nb[1] : 0; |
| const cl_ulong nb12 = src1 ? src1->nb[2] : 0; |
| const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); |
|
|
| const int ne0 = dst ? dst->ne[0] : 0; |
| const int ne1 = dst ? dst->ne[1] : 0; |
| const int ne2 = dst ? dst->ne[2] : 0; |
| const int ne3 = dst ? dst->ne[3] : 0; |
|
|
| const cl_ulong nb0 = dst ? dst->nb[0] : 0; |
| const cl_ulong nb1 = dst ? dst->nb[1] : 0; |
| const cl_ulong nb2 = dst ? dst->nb[2] : 0; |
| const cl_ulong nb3 = dst ? dst->nb[3] : 0; |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offset1 = extra1->offset + src1->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| bool bcast_row = false; |
| cl_kernel kernel; |
|
|
| if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { |
| GGML_ASSERT(ggml_is_contiguous(src0)); |
|
|
| |
| GGML_ASSERT(ne11 == 1); |
|
|
| bcast_row = true; |
| int ne = ne00 / 4; |
| kernel = backend_ctx->kernel_add_row; |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); |
| } else { |
| kernel = backend_ctx->kernel_add; |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01)); |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); |
| CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03)); |
| CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); |
| CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11)); |
| CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12)); |
| CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13)); |
| CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10)); |
| CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11)); |
| CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12)); |
| CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13)); |
| CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0)); |
| CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1)); |
| CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2)); |
| CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3)); |
| CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0)); |
| CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1)); |
| CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2)); |
| CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3)); |
| } |
|
|
| if (bcast_row) { |
| int n = ggml_nelements(dst)/4; |
| size_t global_work_size[] = {(size_t)n, 1, 1}; |
| size_t local_work_size[] = {64, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } else { |
| unsigned int nth = MIN(64, ne0); |
| size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; |
| size_t local_work_size[] = {nth, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
| } |
|
|
| static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(src1); |
| GGML_ASSERT(src1->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
|
|
| const int ne00 = src0 ? src0->ne[0] : 0; |
| const int ne01 = src0 ? src0->ne[1] : 0; |
| const int ne02 = src0 ? src0->ne[2] : 0; |
| const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
| const cl_ulong nb00 = src0 ? src0->nb[0] : 0; |
| const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
| const cl_ulong nb02 = src0 ? src0->nb[2] : 0; |
| const cl_ulong nb03 = src0 ? src0->nb[3] : 0; |
|
|
| const int ne10 = src1 ? src1->ne[0] : 0; |
| const int ne11 = src1 ? src1->ne[1] : 0; |
| const int ne12 = src1 ? src1->ne[2] : 0; |
| const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); |
|
|
| const cl_ulong nb10 = src1 ? src1->nb[0] : 0; |
| const cl_ulong nb11 = src1 ? src1->nb[1] : 0; |
| const cl_ulong nb12 = src1 ? src1->nb[2] : 0; |
| const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); |
|
|
| const int ne0 = dst ? dst->ne[0] : 0; |
| const int ne1 = dst ? dst->ne[1] : 0; |
| const int ne2 = dst ? dst->ne[2] : 0; |
| const int ne3 = dst ? dst->ne[3] : 0; |
|
|
| const cl_ulong nb0 = dst ? dst->nb[0] : 0; |
| const cl_ulong nb1 = dst ? dst->nb[1] : 0; |
| const cl_ulong nb2 = dst ? dst->nb[2] : 0; |
| const cl_ulong nb3 = dst ? dst->nb[3] : 0; |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offset1 = extra1->offset + src1->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| bool bcast_row = false; |
| cl_kernel kernel; |
|
|
| if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { |
| GGML_ASSERT(ggml_is_contiguous(src0)); |
|
|
| |
| GGML_ASSERT(ne11 == 1); |
|
|
| bcast_row = true; |
| int ne = ne00 / 4; |
| kernel = backend_ctx->kernel_mul_row; |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); |
| } else { |
| kernel = backend_ctx->kernel_mul; |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01)); |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); |
| CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03)); |
| CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); |
| CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11)); |
| CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12)); |
| CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13)); |
| CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10)); |
| CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11)); |
| CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12)); |
| CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13)); |
| CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0)); |
| CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1)); |
| CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2)); |
| CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3)); |
| CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0)); |
| CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1)); |
| CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2)); |
| CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3)); |
| } |
|
|
| if (bcast_row) { |
| int n = ggml_nelements(dst)/4; |
| size_t global_work_size[] = {(size_t)n, 1, 1}; |
| size_t local_work_size[] = {64, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } else { |
| unsigned int nth = MIN(64, ne0); |
| size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; |
| size_t local_work_size[] = {nth, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
| } |
|
|
| static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
|
|
| UNUSED(src1); |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| cl_kernel kernel; |
|
|
| int n = ggml_nelements(dst); |
|
|
| if (n % 4 == 0) { |
| kernel = backend_ctx->kernel_gelu_4; |
| n /= 4; |
| } else { |
| kernel = backend_ctx->kernel_gelu; |
| } |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
|
|
| size_t global_work_size[] = {(size_t)n, 1, 1}; |
| size_t local_work_size[] = {64, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL); |
| #endif |
| } |
|
|
| static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
|
|
| UNUSED(src1); |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| cl_kernel kernel; |
|
|
| int n = ggml_nelements(dst); |
|
|
| if (n % 4 == 0) { |
| kernel = backend_ctx->kernel_silu_4; |
| n /= 4; |
| } else { |
| kernel = backend_ctx->kernel_silu; |
| } |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
|
|
| size_t global_work_size[] = {(size_t)n, 1, 1}; |
| size_t local_work_size[] = {64, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
|
|
| static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
|
|
| UNUSED(src1); |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| cl_kernel kernel = backend_ctx->kernel_relu; |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
|
|
| const int64_t n = ggml_nelements(dst); |
|
|
| size_t global_work_size[] = {(size_t)n, 1, 1}; |
| size_t local_work_size[] = {64, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
|
|
| static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
|
|
| UNUSED(src1); |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| float min; |
| float max; |
| memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float)); |
| memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float)); |
|
|
| cl_kernel kernel = backend_ctx->kernel_clamp; |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &min)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &max)); |
|
|
| const int64_t n = ggml_nelements(dst); |
|
|
| size_t global_work_size[] = {(size_t)n, 1, 1}; |
| size_t local_work_size[] = {64, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
|
|
| static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
|
|
| UNUSED(src1); |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| float eps; |
| memcpy(&eps, dst->op_params, sizeof(float)); |
|
|
| const int ne00 = src0 ? src0->ne[0] : 0; |
| const int ne01 = src0 ? src0->ne[1] : 0; |
| const int ne02 = src0 ? src0->ne[2] : 0; |
| const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
| const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
| const cl_ulong nb02 = src0 ? src0->nb[2] : 0; |
| const cl_ulong nb03 = src0 ? src0->nb[3] : 0; |
|
|
| const int nth = MIN(64, ne00); |
|
|
| cl_kernel kernel = backend_ctx->kernel_norm; |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps)); |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL)); |
|
|
| size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; |
| size_t local_work_size[] = {(size_t)nth, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
|
|
| static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
|
|
| UNUSED(src1); |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_backend_opencl_device_context * dev_ctx = |
| (ggml_backend_opencl_device_context *)backend->device->context; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| float eps; |
| memcpy(&eps, dst->op_params, sizeof(float)); |
|
|
| const int ne00 = src0 ? src0->ne[0] : 0; |
| const int ne01 = src0 ? src0->ne[1] : 0; |
| const int ne02 = src0 ? src0->ne[2] : 0; |
| const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
| const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
| const cl_ulong nb02 = src0 ? src0->nb[2] : 0; |
| const cl_ulong nb03 = src0 ? src0->nb[3] : 0; |
|
|
| GGML_ASSERT(ne00 % 4 == 0); |
|
|
| const int nth = MIN(64, ne00); |
|
|
| size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; |
| size_t local_work_size[] = {(size_t)nth, 1, 1}; |
|
|
| cl_kernel kernel = backend_ctx->kernel_rms_norm; |
|
|
| |
| |
| |
| |
| size_t sgs; |
| CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device, |
| CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, |
| sizeof(local_work_size), local_work_size, |
| sizeof(size_t), &sgs, NULL)); |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps)); |
| |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL)); |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
|
|
| static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(src1); |
| GGML_ASSERT(src1->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
|
|
| const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; |
| const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offset1 = extra1->offset + src1->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| #ifdef GGML_OPENCL_SOA_Q |
| ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra; |
| #endif |
|
|
| const int ne00 = src0 ? src0->ne[0] : 0; |
| const int ne01 = src0 ? src0->ne[1] : 0; |
| const int ne02 = src0 ? src0->ne[2] : 0; |
| const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
| const cl_ulong nb00 = src0 ? src0->nb[0] : 0; |
| const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
| const cl_ulong nb02 = src0 ? src0->nb[2] : 0; |
| const cl_ulong nb03 = src0 ? src0->nb[3] : 0; |
|
|
| const int ne10 = src1 ? src1->ne[0] : 0; |
| const int ne11 = src1 ? src1->ne[1] : 0; |
| const int ne12 = src1 ? src1->ne[2] : 0; |
| const int ne13 = src1 ? src1->ne[3] : 0; |
|
|
| const cl_ulong nb10 = src1 ? src1->nb[0] : 0; |
| const cl_ulong nb11 = src1 ? src1->nb[1] : 0; |
| const cl_ulong nb12 = src1 ? src1->nb[2] : 0; |
| const cl_ulong nb13 = src1 ? src1->nb[3] : 0; |
|
|
| const int ne0 = dst ? dst->ne[0] : 0; |
| const int ne1 = dst ? dst->ne[1] : 0; |
|
|
| int r2 = ne12/ne02; |
| int r3 = ne13/ne03; |
|
|
| GGML_ASSERT(ne00 == ne10); |
|
|
| int nth0 = 32; |
| int nth1 = 1; |
| int nrows = 1; |
| |
| int ndst = 4; |
|
|
| cl_kernel kernel; |
|
|
| #ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
| cl_context context = backend_ctx->context; |
|
|
| if (ne01 && ne1 && use_adreno_kernels(src0)) { |
|
|
| |
| |
| cl_int status; |
| cl_image_format img_fmt_1d; |
| cl_image_desc img_desc_1d; |
| cl_buffer_region region; |
| cl_mem A_image1d = nullptr; |
| cl_mem B_image1d = nullptr; |
| cl_mem B_sub_buffer = nullptr; |
| cl_mem C_d = nullptr; |
| |
| cl_mem B_d = nullptr; |
| cl_mem B_d_input_image = nullptr; |
| |
|
|
| |
| |
| int M = ne01; |
| int N = ne1; |
| int K = ne00; |
| int padding; |
| |
|
|
| |
| if(src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) { |
| |
|
|
| |
| |
| if (N == 1) { |
| img_fmt_1d = { CL_R, CL_UNSIGNED_INT32}; |
| } else { |
| img_fmt_1d = { CL_R, CL_FLOAT}; |
| } |
| memset(&img_desc_1d, 0, sizeof(img_desc_1d)); |
| img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; |
| img_desc_1d.image_width = M * K / 2 / 4; |
| img_desc_1d.buffer = extra0_q4_0->q; |
| A_image1d = clCreateImage( |
| context, |
| CL_MEM_READ_ONLY, |
| &img_fmt_1d, |
| &img_desc_1d, |
| NULL, |
| &status); |
| CL_CHECK(status); |
| |
|
|
|
|
| |
| |
| region.origin = (extra1->offset); |
| region.size = K * N * sizeof(float); |
| B_sub_buffer = clCreateSubBuffer( |
| extra1->data_device, |
| 0, |
| CL_BUFFER_CREATE_TYPE_REGION, |
| ®ion, |
| &status); |
| CL_CHECK(status); |
| |
|
|
| |
| if (N != 1) { |
| |
| int extra_elements = N % 8; |
|
|
| |
| padding = 0; |
| if (extra_elements > 0){ |
| padding = 8 - extra_elements; |
| } |
|
|
| |
| region.origin = 0; |
| |
| region.size = K * (N + padding) * sizeof(float)/2; |
| B_d = clCreateSubBuffer( |
| backend_ctx->B_d_max, |
| 0, |
| CL_BUFFER_CREATE_TYPE_REGION, |
| ®ion, |
| &status); |
| CL_CHECK(status); |
|
|
| cl_image_format image_format_B_d_input = { CL_RGBA, CL_FLOAT }; |
| cl_image_desc image_desc_B_d_input = { |
| CL_MEM_OBJECT_IMAGE1D_BUFFER, |
| static_cast<size_t>(K * N / 4), |
| 0, 0, 0, 0, 0, 0, 0, { B_sub_buffer } |
| }; |
| B_d_input_image = clCreateImage( |
| context, |
| 0, |
| &image_format_B_d_input, |
| &image_desc_B_d_input, |
| NULL, |
| &status); |
| CL_CHECK(status); |
|
|
| cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; |
| cl_image_desc image_desc_B_d_output = { |
| CL_MEM_OBJECT_IMAGE1D_BUFFER, |
| static_cast<size_t>(K * (N + padding)/4), |
| 0, 0, 0, 0, 0, 0, 0, { B_d } |
| }; |
| B_image1d = clCreateImage( |
| context, |
| 0, |
| &image_format_B_d_output, |
| &image_desc_B_d_output, |
| NULL, |
| &status); |
| CL_CHECK(status); |
|
|
| int height_B = N/4; |
| if (height_B == 0) { |
| height_B = 1; |
| } |
| int width_B = K/4; |
| int padded_height_B = (N + padding)/4; |
|
|
| kernel = backend_ctx->kernel_transpose_32_16; |
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_d_input_image)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B)); |
|
|
| size_t local_size_t[2] = { 1, 16 }; |
| |
| if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) { |
| local_size_t[0]=4; |
| local_size_t[1]=8; |
| } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) { |
| local_size_t[0]=2; |
| local_size_t[1]=8; |
| } else if(ne0 == 4096 && ne1 == 128 && ne10 == 11008) { |
| local_size_t[0]=1; |
| local_size_t[1]=8; |
| } else if(ne0 == 32000 && ne1 == 128 && ne10 == 4096) { |
| local_size_t[0]=2; |
| local_size_t[1]=8; |
| } |
|
|
| size_t global_size_t[2] = { |
| static_cast<size_t>(width_B), |
| static_cast<size_t>(padded_height_B) |
| }; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL)); |
| #endif |
| } else { |
| |
| |
| |
| img_fmt_1d = {CL_RGBA, CL_FLOAT}; |
|
|
| memset(&img_desc_1d, 0, sizeof(img_desc_1d)); |
| img_desc_1d.image_width = K * N / 4; |
| img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; |
| img_desc_1d.buffer = B_sub_buffer; |
| B_image1d = clCreateImage( |
| context, |
| CL_MEM_READ_ONLY, |
| &img_fmt_1d, |
| &img_desc_1d, |
| NULL, |
| &status); |
| CL_CHECK(status); |
| |
| } |
|
|
| |
| |
| if (N == 1) { |
| kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general; |
| if (M == 4096 && K == 4096) { |
| kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096; |
| } else if (M == 4096 && K == 11008) { |
| kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008; |
| } else if (M == 11008 && K == 4096) { |
| kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096; |
| } else if (M == 32000 && K == 4096) { |
| kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096; |
| } |
| } else { |
| kernel = backend_ctx->CL_mul_mat_Ab_Bi_8x4; |
| } |
| |
|
|
| |
| |
| cl_uint k_arg = 0; |
|
|
| if (N == 1) { |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q4_0->d)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2)); |
| CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3)); |
| } else { |
| region.origin = extrad->offset; |
| region.size = M * N * sizeof(float); |
| C_d = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &status); |
| CL_CHECK(status); |
|
|
| int padded_N = ne1 + padding; |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &C_d)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &padded_N)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne1)); |
| } |
| |
|
|
| |
| |
| size_t global_work_size[3] = { |
| 64, static_cast<size_t>((M+63)/64), static_cast<size_t>((N+31)/32)}; |
| size_t local_work_size[3] = {64, 2, 4}; |
|
|
| global_work_size[0] = (size_t)(ceil((float)ne1/8)); |
| global_work_size[1] = (size_t)(ne01/4); |
| global_work_size[2] = (size_t)(1); |
|
|
| local_work_size[0] = (size_t)(1); |
| local_work_size[1] = (size_t)(128); |
| local_work_size[2] = (size_t)(1); |
|
|
| |
| if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) { |
| local_work_size[0] = 1; |
| local_work_size[1] = 128; |
| } else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) { |
| local_work_size[0] = 2; |
| local_work_size[1] = 64; |
| } else if (ne0 == 4096 && ne1 == 128 && ne10 == 11008) { |
| local_work_size[0] = 2; |
| local_work_size[1] = 64; |
| } else if (ne0 == 32000 && ne1 == 128 && ne10 == 4096) { |
| local_work_size[0] = 2; |
| local_work_size[1] = 64; |
| } |
|
|
| if (N == 1) { |
| size_t wavesize = backend_ctx->adreno_wave_size; |
| local_work_size[0] = wavesize; |
| local_work_size[1] = 4; |
| local_work_size[2] = 1; |
|
|
| global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize; |
| global_work_size[1] = 4; |
| global_work_size[2] = 1; |
| } |
| |
|
|
| |
| |
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| |
|
|
| |
| |
| CL_CHECK(clReleaseMemObject(A_image1d)); |
| CL_CHECK(clReleaseMemObject(B_sub_buffer)); |
| CL_CHECK(clReleaseMemObject(B_image1d)); |
|
|
| if (N != 1) { |
| CL_CHECK(clReleaseMemObject(B_d)); |
| CL_CHECK(clReleaseMemObject(B_d_input_image)); |
| CL_CHECK(clReleaseMemObject(C_d)); |
| } |
| |
|
|
| return; |
| } |
| } |
| #endif |
|
|
| if (!ggml_is_transposed(src0) && |
| !ggml_is_transposed(src1) && |
| src1t == GGML_TYPE_F32 && |
| ne00%32 == 0 && |
| ne11 > 2) { |
| #ifdef GGML_OPENCL_SOA_Q |
| |
| switch(src0t) { |
| case GGML_TYPE_Q4_0: |
| |
| GGML_ASSERT(ne11 == ne1); |
| GGML_ASSERT(ne01 == ne0); |
|
|
| if (backend_ctx->gpu_family == INTEL) { |
| nth0 = 16; |
| nth1 = 1; |
|
|
| kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat; |
| } else if (backend_ctx->gpu_family == ADRENO) { |
| nth0 = 64; |
| nth1 = 1; |
|
|
| kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat; |
| } else { |
| GGML_ASSERT(false && "TODO: Unknown GPU"); |
| } |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0)); |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1)); |
| CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2)); |
| CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3)); |
| break; |
| default: |
| break; |
| } |
|
|
| |
| if (src0t == GGML_TYPE_Q4_0) { |
| size_t global_work_size[] = {(size_t)(ne01 + 7)/8*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; |
| size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; |
|
|
| if (backend_ctx->gpu_family == INTEL) { |
| |
| global_work_size[0] = (size_t)(ne01 + 15)/16*nth0; |
| global_work_size[1] = (size_t)ne11*nth1; |
| global_work_size[2] = (size_t)ne12*ne13; |
| } |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| return; |
| } |
| #else |
| |
| #endif |
| } |
|
|
| |
| switch (src0t) { |
| case GGML_TYPE_F32: |
| |
| GGML_ASSERT(src1t == GGML_TYPE_F32); |
| kernel = backend_ctx->kernel_mul_mat_f32_f32; |
| nrows = 4; |
|
|
| if (backend_ctx->gpu_family == INTEL) { |
| nth0 = 32; |
| nth1 = 1; |
| } else if (backend_ctx->gpu_family == ADRENO) { |
| nth0 = 64; |
| nth1 = 1; |
| } else { |
| GGML_ASSERT(false && "TODO: Unknown GPU"); |
| } |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02)); |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03)); |
| CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); |
| CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11)); |
| CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12)); |
| CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10)); |
| CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11)); |
| CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12)); |
| CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13)); |
| CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0)); |
| CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1)); |
| CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2)); |
| CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3)); |
| break; |
| case GGML_TYPE_F16: |
| |
| if (backend_ctx->gpu_family == INTEL) { |
| nth0 = 32; |
| nth1 = 1; |
| } else if (backend_ctx->gpu_family == ADRENO) { |
| nth0 = 64; |
| nth1 = 1; |
| } else { |
| GGML_ASSERT(false && "TODO: Unknown GPU"); |
| } |
|
|
| if (src1t == GGML_TYPE_F32) { |
| if (ne11 * ne12 < 4) { |
| kernel = backend_ctx->kernel_mul_mat_f16_f32_1row; |
| } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { |
| kernel = backend_ctx->kernel_mul_mat_f16_f32_l4; |
| nrows = ne11; |
| } else { |
| kernel = backend_ctx->kernel_mul_mat_f16_f32; |
| nrows = 4; |
| } |
| } else { |
| kernel = backend_ctx->kernel_mul_mat_f16_f16; |
| nrows = 4; |
| } |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02)); |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03)); |
| CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); |
| CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11)); |
| CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12)); |
| CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10)); |
| CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11)); |
| CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12)); |
| CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13)); |
| CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0)); |
| CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1)); |
| CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2)); |
| CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3)); |
| break; |
| case GGML_TYPE_Q4_0: |
| |
| GGML_ASSERT(ne11 == ne1); |
| GGML_ASSERT(ne01 == ne0); |
|
|
| #ifdef GGML_OPENCL_SOA_Q |
| if (backend_ctx->gpu_family == INTEL) { |
| nth0 = 16; |
| nth1 = 1; |
|
|
| kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat; |
| ndst = 8; |
| } else if (backend_ctx->gpu_family == ADRENO) { |
| nth0 = 64; |
| nth1 = 1; |
|
|
| kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat; |
| ndst =8; |
| } else { |
| GGML_ASSERT(false && "TODO: Unknown GPU"); |
| } |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0)); |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1)); |
| CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2)); |
| CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3)); |
| #else |
| if (backend_ctx->gpu_family == INTEL) { |
| |
| |
| |
| |
| nth0 = 16; |
| nth1 = 1; |
|
|
| kernel = backend_ctx->kernel_mul_mat_q4_0_f32; |
| ndst = 4; |
| } else if (backend_ctx->gpu_family == ADRENO) { |
| nth0 = 64; |
| nth1 = 1; |
|
|
| kernel = backend_ctx->kernel_mul_mat_q4_0_f32_v; |
| ndst = 4; |
| } else { |
| GGML_ASSERT(false && "TODO: Unknown GPU"); |
| } |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0)); |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1)); |
| CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2)); |
| CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3)); |
| #endif |
| break; |
| case GGML_TYPE_Q4_1: |
| case GGML_TYPE_Q8_0: |
| case GGML_TYPE_Q2_K: |
| case GGML_TYPE_Q3_K: |
| case GGML_TYPE_Q4_K: |
| case GGML_TYPE_Q5_K: |
| case GGML_TYPE_Q6_K: |
| kernel = backend_ctx->kernel_mul_mv_q6_K_f32; |
|
|
| if (backend_ctx->gpu_family == INTEL) { |
| nth0 = 2; |
| nth1 = 16; |
| } else if (backend_ctx->gpu_family == ADRENO) { |
| nth0 = 2; |
| nth1 = 64; |
| } else { |
| GGML_ASSERT(false && "TODO: Unknown GPU"); |
| } |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0)); |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1)); |
| CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2)); |
| CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3)); |
| break; |
| default: |
| GGML_ASSERT(false && "not implemented"); |
| } |
|
|
| if (src0t == GGML_TYPE_Q4_0 || |
| src0t == GGML_TYPE_Q4_1 || |
| src0t == GGML_TYPE_Q8_0 || |
| src0t == GGML_TYPE_Q2_K) { |
| |
| |
| |
| |
| |
| |
| size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; |
| size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } else if (src0t == GGML_TYPE_Q4_K) { |
| GGML_ASSERT(false && "not implemented"); |
| } else if (src0t == GGML_TYPE_Q3_K) { |
| GGML_ASSERT(false && "not implemented"); |
| } else if (src0t == GGML_TYPE_Q5_K) { |
| GGML_ASSERT(false && "not implemented"); |
| } else if (src0t == GGML_TYPE_Q6_K) { |
| size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; |
| size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } else { |
| int64_t ny = (ne11 + nrows - 1)/nrows; |
|
|
| size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13}; |
| size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
| } |
|
|
| static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
| GGML_UNUSED(src1); |
|
|
| GGML_ASSERT(ggml_is_contiguous(src0)); |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| float scale; |
| memcpy(&scale, dst->op_params, sizeof(scale)); |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| cl_kernel kernel = backend_ctx->kernel_scale; |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale)); |
|
|
| int n = ggml_nelements(dst)/4; |
|
|
| size_t global_work_size[] = {(size_t)n, 1, 1}; |
| size_t local_work_size[] = {64, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
|
|
| static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(src1); |
| GGML_ASSERT(src1->extra); |
|
|
| |
| |
| UNUSED(dst); |
|
|
| const int ne00 = src0 ? src0->ne[0] : 0; |
| const int ne01 = src0 ? src0->ne[1] : 0; |
| const int ne02 = src0 ? src0->ne[2] : 0; |
| const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
| const cl_ulong nb00 = src0 ? src0->nb[0] : 0; |
| const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
| const cl_ulong nb02 = src0 ? src0->nb[2] : 0; |
| const cl_ulong nb03 = src0 ? src0->nb[3] : 0; |
|
|
| const int ne10 = src1 ? src1->ne[0] : 0; |
| const int ne11 = src1 ? src1->ne[1] : 0; |
| const int ne12 = src1 ? src1->ne[2] : 0; |
| const int ne13 = src1 ? src1->ne[3] : 0; |
|
|
| const cl_ulong nb10 = src1 ? src1->nb[0] : 0; |
| const cl_ulong nb11 = src1 ? src1->nb[1] : 0; |
| const cl_ulong nb12 = src1 ? src1->nb[2] : 0; |
| const cl_ulong nb13 = src1 ? src1->nb[3] : 0; |
|
|
| const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; |
| const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offset1 = extra1->offset + src1->view_offs; |
|
|
| cl_kernel kernel; |
|
|
| switch (src0t) { |
| case GGML_TYPE_F32: |
| switch (src1t) { |
| case GGML_TYPE_F16: |
| kernel = backend_ctx->kernel_cpy_f32_f16; |
| break; |
| case GGML_TYPE_F32: |
| kernel = backend_ctx->kernel_cpy_f32_f32; |
| break; |
| default: |
| GGML_ASSERT(false && "not implemented"); |
| } |
| break; |
| case GGML_TYPE_F16: |
| switch (src1t) { |
| case GGML_TYPE_F16: |
| kernel = backend_ctx->kernel_cpy_f16_f16; |
| break; |
| case GGML_TYPE_F32: |
| kernel = backend_ctx->kernel_cpy_f16_f32; |
| break; |
| default: |
| GGML_ASSERT(false && "not implemented"); |
| } |
| break; |
| default: |
| GGML_ASSERT(false && "not implemented"); |
| } |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03)); |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); |
| CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11)); |
| CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12)); |
| CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13)); |
| CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10)); |
| CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11)); |
| CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12)); |
| CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13)); |
|
|
| const int nth = MIN(64, ne00); |
|
|
| size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; |
| size_t local_work_size[] = {(size_t)nth, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
|
|
| static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| ggml_cl_cpy(backend, src0, dst, nullptr); |
| UNUSED(src1); |
| } |
|
|
| static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
|
|
| UNUSED(src1); |
|
|
| int n_past = ((int32_t *)(dst->op_params))[0]; |
|
|
| const int ne00 = src0 ? src0->ne[0] : 0; |
| const int ne01 = src0 ? src0->ne[1] : 0; |
| const int ne02 = src0 ? src0->ne[2] : 0; |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| cl_kernel kernel; |
|
|
| if (ne00%8 == 0) { |
| kernel = backend_ctx->kernel_diag_mask_inf_8; |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past)); |
|
|
| size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1}; |
| size_t local_work_size[] = {64, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } else { |
| kernel = backend_ctx->kernel_diag_mask_inf; |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past)); |
|
|
| size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02}; |
| size_t local_work_size[] = {64, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
| } |
|
|
| static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
|
|
| |
| |
| |
| |
| if (src1) { |
| GGML_ASSERT(src1); |
| GGML_ASSERT(src1->extra); |
| } |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0; |
|
|
| const int ne00 = src0 ? src0->ne[0] : 0; |
| const int ne01 = src0 ? src0->ne[1] : 0; |
| const int ne02 = src0 ? src0->ne[2] : 0; |
| const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
| float scale, max_bias; |
| memcpy(&scale, dst->op_params + 0, sizeof(float)); |
| memcpy(&max_bias, dst->op_params + 1, sizeof(float)); |
|
|
| const int nrows_x = ggml_nrows(src0); |
| const int nrows_y = src0->ne[1]; |
|
|
| const int n_head = nrows_x/nrows_y; |
| const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); |
|
|
| const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); |
| const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); |
|
|
| const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16); |
|
|
| |
| |
| int nth = MIN(32, ne00); |
|
|
| if (backend_ctx->gpu_family == INTEL) { |
| |
| nth = MIN(32, ne00); |
| } |
| else if (backend_ctx->gpu_family == ADRENO) { |
| nth = 64; |
| } else { |
| GGML_ASSERT(false && "TODO: Unknown GPU"); |
| } |
|
|
| cl_kernel kernel; |
|
|
| if (ne00%4 == 0) { |
| if (use_f16) { |
| kernel = backend_ctx->kernel_soft_max_4_f16; |
| } else { |
| kernel = backend_ctx->kernel_soft_max_4; |
| } |
| } else { |
| if (use_f16) { |
| kernel = backend_ctx->kernel_soft_max_f16; |
| } else { |
| kernel = backend_ctx->kernel_soft_max; |
| } |
| } |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), extra1 ? &extra1->data_device : &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(float), &scale)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float), &max_bias)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &m0)); |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &m1)); |
| CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &n_head_log2)); |
|
|
| size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; |
| size_t local_work_size[] = {(size_t)nth, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
|
|
| static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
| GGML_ASSERT(src0); |
| GGML_ASSERT(src0->extra); |
| GGML_ASSERT(src1); |
| GGML_ASSERT(src1->extra); |
| GGML_ASSERT(dst); |
| GGML_ASSERT(dst->extra); |
|
|
| ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
| cl_command_queue queue = backend_ctx->queue; |
|
|
| ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
| ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; |
| ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
| cl_ulong offset0 = extra0->offset + src0->view_offs; |
| cl_ulong offset1 = extra1->offset + src1->view_offs; |
| cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
| ggml_tensor * src2 = dst->src[2]; |
| ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr; |
|
|
| cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0; |
|
|
| const int ne00 = src0 ? src0->ne[0] : 0; |
| const int ne01 = src0 ? src0->ne[1] : 0; |
| const int ne02 = src0 ? src0->ne[2] : 0; |
| const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
| const cl_ulong nb00 = src0 ? src0->nb[0] : 0; |
| const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
| const cl_ulong nb02 = src0 ? src0->nb[2] : 0; |
| const cl_ulong nb03 = src0 ? src0->nb[3] : 0; |
|
|
| const int ne10 = src1 ? src1->ne[0] : 0; |
| const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11); |
| const int ne12 = src1 ? src1->ne[2] : 0; UNUSED(ne12); |
| const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); |
|
|
| const int ne0 = dst ? dst->ne[0] : 0; |
| const int ne1 = dst ? dst->ne[1] : 0; |
| const int ne2 = dst ? dst->ne[2] : 0; |
| const int ne3 = dst ? dst->ne[3] : 0; |
|
|
| const cl_ulong nb0 = dst ? dst->nb[0] : 0; |
| const cl_ulong nb1 = dst ? dst->nb[1] : 0; |
| const cl_ulong nb2 = dst ? dst->nb[2] : 0; |
| const cl_ulong nb3 = dst ? dst->nb[3] : 0; |
|
|
| GGML_ASSERT(ne10 % ne02 == 0); |
| GGML_ASSERT(ne10 >= ne02); |
|
|
| int nth = MIN(64, ne00); |
|
|
| const int n_past = ((int *) dst->op_params)[0]; |
| const int n_dims = ((int *) dst->op_params)[1]; |
| const int mode = ((int *) dst->op_params)[2]; |
| const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; |
|
|
| float freq_base; |
| float freq_scale; |
| float ext_factor; |
| float attn_factor; |
| float beta_fast; |
| float beta_slow; |
|
|
| memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); |
| memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); |
| memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); |
| memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); |
| memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); |
| memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); |
|
|
| const bool is_neox = mode & 2; |
|
|
| cl_kernel kernel; |
|
|
| if (!is_neox) { |
| switch (src0->type) { |
| case GGML_TYPE_F32: |
| kernel = backend_ctx->kernel_rope_norm_f32; |
| break; |
| case GGML_TYPE_F16: |
| kernel = backend_ctx->kernel_rope_norm_f16; |
| break; |
| default: |
| GGML_ASSERT(false); |
| }; |
| } else { |
| switch (src0->type) { |
| case GGML_TYPE_F32: |
| kernel = backend_ctx->kernel_rope_neox_f32; |
| break; |
| case GGML_TYPE_F16: |
| kernel = backend_ctx->kernel_rope_neox_f16; |
| break; |
| default: |
| GGML_ASSERT(false); |
| }; |
| } |
|
|
| CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
| CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
| CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), extra2 ? &extra2->data_device : &extra0->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2)); |
| CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); |
| CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); |
| CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); |
| CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); |
| CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02)); |
| CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne03)); |
| CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb00)); |
| CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb01)); |
| CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb02)); |
| CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb03)); |
| CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne0)); |
| CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne1)); |
| CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne2)); |
| CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne3)); |
| CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb0)); |
| CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb1)); |
| CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb2)); |
| CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &nb3)); |
| CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &n_past)); |
| CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &n_dims)); |
| CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &n_ctx_orig)); |
| CL_CHECK(clSetKernelArg(kernel, 27, sizeof(float), &freq_base)); |
| CL_CHECK(clSetKernelArg(kernel, 28, sizeof(float), &freq_scale)); |
| CL_CHECK(clSetKernelArg(kernel, 29, sizeof(float), &ext_factor)); |
| CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float), &attn_factor)); |
| CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float), &beta_fast)); |
| CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &beta_slow)); |
|
|
| size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; |
| size_t local_work_size[] = {(size_t)nth, 1, 1}; |
|
|
| #ifdef GGML_OPENCL_PROFILING |
| cl_event evt; |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
| g_profiling_info.emplace_back(); |
| populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
| #else |
| CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
| #endif |
| } |
|
|
| |
| |
| |
|
|
| typedef void (*ggml_cl_func_t)(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); |
|
|
| bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) { |
| ggml_cl_func_t func = nullptr; |
|
|
| ggml_tensor * src0 = tensor->src[0]; |
| ggml_tensor * src1 = tensor->src[1]; |
|
|
| const bool any_on_device = tensor->extra |
| || (src0 != nullptr && src0->extra) |
| || (src1 != nullptr && src1->extra); |
|
|
| switch (tensor->op) { |
| case GGML_OP_GET_ROWS: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_get_rows; |
| break; |
| case GGML_OP_CPY: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_cpy; |
| break; |
| case GGML_OP_DUP: |
| case GGML_OP_CONT: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_dup; |
| break; |
| case GGML_OP_ADD: |
| if (!any_on_device) { |
| return false; |
| } |
| GGML_ASSERT(ggml_is_contiguous(src0)); |
| GGML_ASSERT(ggml_is_contiguous(src1)); |
| func = ggml_cl_add; |
| break; |
| case GGML_OP_MUL: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_mul; |
| break; |
| case GGML_OP_UNARY: |
| switch (ggml_get_unary_op(tensor)) { |
| case GGML_UNARY_OP_GELU: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_gelu; |
| break; |
| case GGML_UNARY_OP_SILU: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_silu; |
| break; |
| case GGML_UNARY_OP_RELU: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_relu; |
| break; |
| default: |
| return false; |
| } break; |
| case GGML_OP_CLAMP: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_clamp; |
| break; |
| case GGML_OP_NORM: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_norm; |
| break; |
| case GGML_OP_RMS_NORM: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_rms_norm; |
| break; |
| case GGML_OP_MUL_MAT: |
| if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { |
| return false; |
| } |
| func = ggml_cl_mul_mat; |
| break; |
| case GGML_OP_SCALE: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_scale; |
| break; |
| case GGML_OP_RESHAPE: |
| case GGML_OP_VIEW: |
| case GGML_OP_PERMUTE: |
| case GGML_OP_TRANSPOSE: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_nop; |
| break; |
| case GGML_OP_DIAG_MASK_INF: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_diag_mask_inf; |
| break; |
| case GGML_OP_SOFT_MAX: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_soft_max; |
| break; |
| case GGML_OP_ROPE: |
| if (!any_on_device) { |
| return false; |
| } |
| func = ggml_cl_rope; |
| break; |
| default: |
| return false; |
| } |
|
|
| func(backend, tensor->src[0], tensor->src[1], tensor); |
| return true; |
| } |
|
|