| #include "model_adapter.h" |
| #include "otherarch/utils.h" |
|
|
| #include "whisper.cpp" |
|
|
| #define DR_WAV_IMPLEMENTATION |
| #include "dr_wav.h" |
|
|
| #include <cmath> |
| #include <fstream> |
| #include <cstdio> |
| #include <regex> |
| #include <string> |
| #include <thread> |
| #include <vector> |
| #include <cstring> |
| #include <mutex> |
| #include <cinttypes> |
|
|
| #define COMMON_SAMPLE_RATE 16000 |
|
|
| #if defined(_MSC_VER) |
| #pragma warning(disable: 4244 4267) |
| #endif |
|
|
| static int whisperdebugmode = 0; |
| static bool whisper_is_quiet = false; |
| static whisper_context * whisper_ctx = nullptr; |
| static std::string whisper_output_text = ""; |
|
|
| int total_transcribe_gens = 0; |
|
|
/**
 * Checks whether `buf` looks like one complete RIFF/WAVE file image.
 *
 * The buffer must be at least 12 bytes long, begin with "RIFF", carry "WAVE"
 * at offset 8, and the 32-bit little-endian chunk size stored at offset 4 must
 * equal the buffer length minus the 8-byte RIFF preamble.
 *
 * @param buf  Raw bytes to inspect (may contain embedded NULs).
 * @return true when all of the checks above hold, false otherwise.
 */
static bool is_wav_buffer(const std::string & buf) {
    // compare() inspects the bytes in place, so no temporary substrings are allocated.
    if (buf.size() < 12 || buf.compare(0, 4, "RIFF") != 0 || buf.compare(8, 4, "WAVE") != 0) {
        return false;
    }

    // Assemble the size one byte at a time: the string's storage carries no
    // alignment guarantee for uint32_t, and RIFF stores the field little-endian
    // regardless of the host byte order.
    const unsigned char * size_bytes = reinterpret_cast<const unsigned char *>(buf.data()) + 4;
    const uint32_t chunk_size =
        static_cast<uint32_t>(size_bytes[0])         |
        (static_cast<uint32_t>(size_bytes[1]) << 8)  |
        (static_cast<uint32_t>(size_bytes[2]) << 16) |
        (static_cast<uint32_t>(size_bytes[3]) << 24);

    // Widen before adding so a chunk size near UINT32_MAX cannot wrap around.
    return static_cast<uint64_t>(chunk_size) + 8 == buf.size();
}
|
|
| static bool read_wav(const std::string & b64data, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) |
| { |
| drwav wav; |
| std::vector<uint8_t> wav_data = kcpp_base64_decode(b64data); |
|
|
| if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) { |
| printf("error: failed to open WAV file from stdin\n"); |
| return false; |
| } |
|
|
| if (wav.channels != 1 && wav.channels != 2) { |
| printf("WAV file must be mono or stereo\n"); |
| drwav_uninit(&wav); |
| return false; |
| } |
|
|
| if (wav.bitsPerSample != 8 && wav.bitsPerSample != 16 && wav.bitsPerSample != 32) { |
| printf("WAV file must be 8-bit, 16-bit or 32-bit. Detected: %d\n",wav.bitsPerSample); |
| drwav_uninit(&wav); |
| return false; |
| } |
|
|
| const uint64_t n = wav_data.empty() ? wav.totalPCMFrameCount : wav_data.size()/(wav.channels*wav.bitsPerSample/8); |
|
|
| std::vector<int16_t> pcm16; |
| pcm16.resize(n*wav.channels); |
|
|
| if (wav.bitsPerSample == 8) { |
| |
| std::vector<uint8_t> pcm8(n * wav.channels); |
| drwav_read_pcm_frames(&wav, n, pcm8.data()); |
| drwav_u8_to_s16(pcm16.data(), pcm8.data(), n * wav.channels); |
| } else if (wav.bitsPerSample == 16) { |
| |
| drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); |
| } else if (wav.bitsPerSample == 32) { |
| |
| std::vector<int32_t> pcm32(n * wav.channels); |
| drwav_read_pcm_frames_s32(&wav, n, pcm32.data()); |
| for (uint64_t i = 0; i < n * wav.channels; ++i) { |
| pcm16[i] = static_cast<int16_t>(pcm32[i] >> 16); |
| } |
| } |
| drwav_uninit(&wav); |
|
|
| std::vector<float> raw_pcm; |
| raw_pcm.resize(n); |
|
|
| if(whisperdebugmode==1 && !whisper_is_quiet) |
| { |
| printf("\nwav_data_size: %d, n:%d",wav_data.size(),n); |
| } |
|
|
| |
| if (wav.channels == 1) { |
| for (uint64_t i = 0; i < n; i++) { |
| raw_pcm[i] = float(pcm16[i])/32768.0f; |
| } |
| } else { |
| for (uint64_t i = 0; i < n; i++) { |
| raw_pcm[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f; |
| } |
| } |
|
|
| if (wav.sampleRate != COMMON_SAMPLE_RATE) { |
| if(whisperdebugmode==1 && !whisper_is_quiet) |
| { |
| printf("\nResample wav from %" PRIu32 " to %" PRIu32 " (in size: %zu)", |
| wav.sampleRate, COMMON_SAMPLE_RATE, raw_pcm.size()); |
| } |
| raw_pcm = resample_wav(raw_pcm, wav.sampleRate, COMMON_SAMPLE_RATE); |
| } |
|
|
| uint64_t finalsize = raw_pcm.size(); |
| pcmf32.resize(finalsize); |
| for (uint64_t i = 0; i < finalsize; i++) { |
| pcmf32[i] = raw_pcm[i]; |
| } |
|
|
| return true; |
| } |
|
|
| static std::string output_txt(struct whisper_context * ctx, std::vector<std::vector<float>> pcmf32s) { |
|
|
| std::string outtxt = ""; |
| const int n_segments = whisper_full_n_segments(ctx); |
| for (int i = 0; i < n_segments; ++i) { |
| const char * text = whisper_full_get_segment_text(ctx, i); |
| outtxt += text; |
| } |
| return outtxt; |
| } |
|
|
| void cb_log_disable(enum ggml_log_level , const char * , void * ) { } |
|
|
| static std::string whisperplatformenv, whisperdeviceenv, whispervulkandeviceenv; |
| bool whispertype_load_model(const whisper_load_model_inputs inputs) |
| { |
| whisper_is_quiet = inputs.quiet; |
|
|
| |
| int cl_parseinfo = inputs.clblast_info; |
| std::string usingclblast = "GGML_OPENCL_CONFIGURED="+std::to_string(cl_parseinfo>0?1:0); |
| putenv((char*)usingclblast.c_str()); |
| cl_parseinfo = cl_parseinfo%100; |
| int platform = cl_parseinfo/10; |
| int devices = cl_parseinfo%10; |
| whisperplatformenv = "GGML_OPENCL_PLATFORM="+std::to_string(platform); |
| whisperdeviceenv = "GGML_OPENCL_DEVICE="+std::to_string(devices); |
| putenv((char*)whisperplatformenv.c_str()); |
| putenv((char*)whisperdeviceenv.c_str()); |
| std::string vulkan_info_raw = inputs.vulkan_info; |
| std::string vulkan_info_str = ""; |
| for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { |
| vulkan_info_str += vulkan_info_raw[i]; |
| if (i < vulkan_info_raw.length() - 1) { |
| vulkan_info_str += ","; |
| } |
| } |
| if(vulkan_info_str!="") |
| { |
| whispervulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; |
| putenv((char*)whispervulkandeviceenv.c_str()); |
| } |
|
|
|
|
| std::string modelfile = inputs.model_filename; |
| printf("\nLoading Whisper Model: %s",modelfile.c_str()); |
|
|
| whisperdebugmode = inputs.debugmode; |
| if (whisperdebugmode!=1) { |
| whisper_log_set(cb_log_disable, NULL); |
| } |
|
|
| |
| struct whisper_context_params cparams = whisper_context_default_params(); |
| cparams.use_gpu = true; |
| cparams.flash_attn = false; |
|
|
| whisper_ctx = whisper_init_from_file_with_params(modelfile.c_str(), cparams); |
|
|
| if (whisper_ctx == nullptr) { |
| printf("\nWhisper Load Error: Failed to initialize whisper context!\n"); |
| return false; |
| } |
|
|
| printf("\nWhisper Load Complete.\n"); |
|
|
| return true; |
| } |
|
|
| whisper_generation_outputs whispertype_generate(const whisper_generation_inputs inputs) |
| { |
| whisper_generation_outputs output; |
|
|
| if(whisper_ctx==nullptr) |
| { |
| printf("\nWarning: KCPP whisper not initialized!\n"); |
| output.text = ""; |
| output.status = 0; |
| return output; |
| } |
|
|
| if(!whisper_is_quiet) |
| { |
| printf("\nWhisper Transcribe Generating..."); |
| } |
|
|
| const std::string b64data = std::string(inputs.audio_data); |
| const std::string initprompt = std::string(inputs.prompt); |
| const std::string langcode = std::string(inputs.langcode); |
|
|
| std::vector<float> pcmf32; |
| std::vector<std::vector<float>> pcmf32s; |
|
|
| if (!::read_wav(b64data, pcmf32, pcmf32s, false)) { |
| printf("\nWhisper: Failed to read input wav data!\n"); |
| output.text = ""; |
| output.status = 0; |
| return output; |
| } |
|
|
| |
| whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); |
| wparams.strategy = WHISPER_SAMPLING_GREEDY; |
| wparams.print_realtime = false; |
| wparams.print_progress = false; |
| wparams.print_timestamps = false; |
| wparams.print_special = false; |
| wparams.translate = false; |
| wparams.language = langcode.c_str(); |
| wparams.detect_language = false; |
| wparams.n_threads = 4; |
| wparams.n_max_text_ctx = wparams.n_max_text_ctx; |
| wparams.offset_ms = 0; |
| wparams.duration_ms = 0; |
| wparams.token_timestamps = false; |
| wparams.thold_pt = 0.01f; |
| wparams.max_len = 100; |
| wparams.split_on_word = false; |
| wparams.audio_ctx = 0; |
| wparams.speed_up = false; |
| wparams.debug_mode = (whisperdebugmode==1); |
| wparams.tdrz_enable = false; |
| wparams.suppress_regex = nullptr; |
| wparams.suppress_non_speech_tokens = inputs.suppress_non_speech; |
| wparams.initial_prompt = initprompt.c_str(); |
| wparams.greedy.best_of = -1; |
| wparams.beam_search.beam_size = -1; |
| wparams.temperature_inc = 0.2f; |
| wparams.temperature = 0.0f; |
| wparams.entropy_thold = 2.40f; |
| wparams.logprob_thold = -1.00f; |
| wparams.no_timestamps = true; |
|
|
| if (whisper_full_parallel(whisper_ctx, wparams, pcmf32.data(), pcmf32.size(), 1) != 0) { |
| printf("\nWhisper: Failed to process audio!\n"); |
| output.text = ""; |
| output.status = 0; |
| return output; |
| } |
|
|
| if (!whisper_is_quiet && whisperdebugmode==1) { |
| whisper_print_timings(whisper_ctx); |
| } |
|
|
| |
| whisper_output_text = output_txt(whisper_ctx, pcmf32s); |
| std::string ts = get_timestamp_str(); |
| if(!whisper_is_quiet) |
| { |
| printf("\n[%s] Whisper Transcribe Output: %s",ts.c_str(),whisper_output_text.c_str()); |
| } else { |
| printf("\n[%s] Whisper Transcribe Done.",ts.c_str()); |
| } |
| output.text = whisper_output_text.c_str(); |
| output.status = 1; |
| total_transcribe_gens += 1; |
| return output; |
| } |
|
|