Spaces:
Running
Running
| // Copyright 2025 The ODML Authors. | |
| // | |
| // Licensed under the Apache License, Version 2.0 (the "License"); | |
| // you may not use this file except in compliance with the License. | |
| // You may obtain a copy of the License at | |
| // | |
| // http://www.apache.org/licenses/LICENSE-2.0 | |
| // | |
| // Unless required by applicable law or agreed to in writing, software | |
| // distributed under the License is distributed on an "AS IS" BASIS, | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| // See the License for the specific language governing permissions and | |
| // limitations under the License. | |
| namespace litert::lm { | |
| // Audio preprocessor implementation using MiniAudio library and kissfft | |
| // library. | |
| class AudioPreprocessorMiniAudio : public AudioPreprocessor { | |
| public: | |
| // Creates an AudioPreprocessorMiniAudio instance. | |
| // Args: | |
| // - config: The configuration of the audio preprocessor. | |
| // Returns: | |
| // A unique pointer to the AudioPreprocessorMiniAudio instance. | |
| static absl::StatusOr<std::unique_ptr<AudioPreprocessorMiniAudio>> Create( | |
| const AudioPreprocessorConfig& config); | |
| // Decodes the raw audio bytes to PCM frames using MiniAudio library. | |
| // Args: | |
| // - audio_bytes: The raw audio bytes read from the audio file to decode. | |
| // The supported audio format from MiniAudio library is WAV, FLAC and MP3. | |
| // - num_channels: The number of channels in the audio to be decoded. Only | |
| // mono audio (num_channels = 1) is supported for now. | |
| // - sample_rate_hz: The sample rate of the audio to be resampled to. | |
| // - pcm_frames: The decoded PCM frames. | |
| static absl::Status DecodeAudio(absl::string_view audio_bytes, | |
| int num_channels, int sample_rate_hz, | |
| std::vector<float>& pcm_frames); | |
| // Preprocesses the undecoded raw audio bytes and returns the preprocessed | |
| // audio mel spectrograms. | |
| // Args: | |
| // - input_audio: The input audio to preprocess. The supported audio | |
| // format from MiniAudio library is WAV, FLAC and MP3. | |
| // If the input audio is already preprocessed, it will be returned | |
| // directly. | |
| // Returns: | |
| // An InputAudio object containing the preprocessed audio mel spectrograms | |
| // with shape (1, num_frames, num_mel_bins). | |
| absl::StatusOr<InputAudio> Preprocess(const InputAudio& input_audio) override; | |
| // Resets the preprocessor to its initial state. | |
| void Reset() override { | |
| input_queue_.clear(); | |
| if (config_.GetSemicausalPadding()) { | |
| samples_to_next_step_ = config_.GetFrameLength() - config_.GetHopLength(); | |
| input_queue_.resize(config_.GetHopLength(), 0.0f); | |
| } else { | |
| samples_to_next_step_ = config_.GetFrameLength(); | |
| } | |
| } | |
| // Copy constructor for cloning the audio preprocessor. | |
| AudioPreprocessorMiniAudio(const AudioPreprocessorMiniAudio& other) | |
| : config_(other.config_), | |
| mel_filterbank_(nullptr), | |
| input_queue_(other.input_queue_), | |
| samples_to_next_step_(other.samples_to_next_step_) { | |
| mel_filterbank_ = std::make_unique<MelFilterbank>(); | |
| ABSL_CHECK_OK(mel_filterbank_->Initialize( | |
| other.config_.GetFftBins(), other.config_.GetSampleRateHz(), | |
| other.config_.GetNumMelBins(), other.config_.GetMelLowHz(), | |
| other.config_.GetMelHighHz())); | |
| } | |
| // Copy assignment operator for cloning the audio preprocessor. | |
| AudioPreprocessorMiniAudio& operator=( | |
| const AudioPreprocessorMiniAudio& other) { | |
| config_ = other.config_; | |
| mel_filterbank_ = std::make_unique<MelFilterbank>(); | |
| ABSL_CHECK_OK(mel_filterbank_->Initialize( | |
| other.config_.GetFftBins(), other.config_.GetSampleRateHz(), | |
| other.config_.GetNumMelBins(), other.config_.GetMelLowHz(), | |
| other.config_.GetMelHighHz())); | |
| input_queue_ = other.input_queue_; | |
| samples_to_next_step_ = other.samples_to_next_step_; | |
| return *this; | |
| } | |
| private: | |
| explicit AudioPreprocessorMiniAudio( | |
| const AudioPreprocessorConfig& config, | |
| std::unique_ptr<MelFilterbank> mel_filterbank) | |
| : config_(config), | |
| mel_filterbank_(std::move(mel_filterbank)), | |
| input_queue_(std::vector<float>()) { | |
| if (config.GetSemicausalPadding()) { | |
| samples_to_next_step_ = config.GetFrameLength() - config.GetHopLength(); | |
| input_queue_.resize(config.GetHopLength(), 0.0f); | |
| } else { | |
| samples_to_next_step_ = config.GetFrameLength(); | |
| } | |
| } | |
| absl::Status PcmFramesToSpectrogram(absl::Span<const float> pcm_frames, | |
| std::vector<float>& spectrograms); | |
| absl::Status ToLogMelSpectrogram(const std::vector<float>& spectrograms, | |
| std::vector<float>& log_mel_spectrograms); | |
| bool GetNextWindowOfSamples(const std::vector<float>& pcm_frames, | |
| int& input_start); | |
| AudioPreprocessorConfig config_; | |
| std::unique_ptr<MelFilterbank> mel_filterbank_; | |
| std::vector<float> input_queue_; | |
| int samples_to_next_step_; | |
| }; | |
| } // namespace litert::lm | |