Spaces:

FINAL-Bench
/

LiteRT-LM

Running

File size: 17,317 Bytes

5f923cd

// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_PREPROCESSOR_AUDIO_PREPROCESSOR_H_
#define THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_PREPROCESSOR_AUDIO_PREPROCESSOR_H_

#include <array>
#include <ostream>

#include "absl/status/statusor.h"  // from @com_google_absl
#include "runtime/engine/io_types.h"
#include "runtime/util/status_macros.h"  // IWYU pragma: keep

namespace litert::lm {

// Holds the tunable parameters of the audio preprocessing pipeline: audio
// decoding, framing / FFT, and Mel spectrogram settings.
class AudioPreprocessorConfig {
 public:
  // How a frame is fitted to the FFT frame length.
  enum FftPaddingType {
    // Right padding: the resulting FFT frame is zero-padded on the right, or
    // truncated, to the given FFT frame length.
    kRight = 0,
    // Center padding: the resulting FFT frame is zero-padded by the same
    // amount on the left and on the right, or truncated by the same amount on
    // both sides, to the given FFT frame length.
    kCenter = 1
  };

  // Returns the audio preprocessing configuration of Google's Universal
  // Speech Model (USM).
  static AudioPreprocessorConfig CreateDefaultUsmConfig() {
    return Create(/* sample_rate_hz= */ 16000,
                  /* num_channels= */ 1,
                  /* frame_length= */ 512,
                  /* hop_length= */ 160,
                  /* fft_length= */ 1024,
                  /* input_scale= */ 32768,
                  /* pre_emphasis_factor= */ 0.97,
                  /* num_mel_bins= */ 128,
                  /* mel_low_hz= */ 125.0,
                  /* mel_high_hz= */ 7500.0,
                  /* mel_floor= */ 1e-6,
                  /* normalize_mel= */ true,
                  /* add_floor_to_mel_before_log= */ false,
                  /* semicausal_padding= */ false,
                  /* non_zero_hanning= */ true,
                  /* periodic_hanning= */ true,
                  /* fft_padding_type= */ FftPaddingType::kRight);
  }

  // Builds a configuration from explicitly provided values. The number of FFT
  // bins is not a parameter; it is derived from `fft_length`.
  static AudioPreprocessorConfig Create(
      int sample_rate_hz, int num_channels, int frame_length, int hop_length,
      int fft_length, float input_scale, float pre_emphasis_factor,
      int num_mel_bins, float mel_low_hz, float mel_high_hz, float mel_floor,
      bool normalize_mel, bool add_floor_to_mel_before_log,
      bool semicausal_padding, bool non_zero_hanning, bool periodic_hanning,
      FftPaddingType fft_padding_type) {
    AudioPreprocessorConfig config(
        sample_rate_hz, num_channels, frame_length, hop_length, fft_length,
        input_scale, pre_emphasis_factor, num_mel_bins, mel_low_hz,
        mel_high_hz, mel_floor, normalize_mel, add_floor_to_mel_before_log,
        semicausal_padding, non_zero_hanning, periodic_hanning,
        fft_padding_type);
    return config;
  }

  // Writes the textual name of a padding type ("right", "center", or
  // "unknown" for any other value).
  friend std::ostream& operator<<(std::ostream& os,
                                  const FftPaddingType& padding_type) {
    const char* label = "unknown";
    if (padding_type == FftPaddingType::kRight) {
      label = "right";
    } else if (padding_type == FftPaddingType::kCenter) {
      label = "center";
    }
    return os << label;
  }

  // Allows logging of the config: writes one "name: value" line per field,
  // wrapped in "AudioPreprocessorConfig { ... }".
  friend std::ostream& operator<<(std::ostream& os,
                                  const AudioPreprocessorConfig& config) {
    return os << "AudioPreprocessorConfig {\n"
              << "  sample_rate_hz: " << config.sample_rate_hz_ << "\n"
              << "  num_channels: " << config.num_channels_ << "\n"
              << "  input_scale: " << config.input_scale_ << "\n"
              << "  pre_emphasis_factor: " << config.pre_emphasis_factor_
              << "\n"
              << "  fft_length: " << config.fft_length_ << "\n"
              << "  fft_bins: " << config.fft_bins_ << "\n"
              << "  frame_length: " << config.frame_length_ << "\n"
              << "  hop_length: " << config.hop_length_ << "\n"
              << "  num_mel_bins: " << config.num_mel_bins_ << "\n"
              << "  mel_low_hz: " << config.mel_low_hz_ << "\n"
              << "  mel_high_hz: " << config.mel_high_hz_ << "\n"
              << "  mel_floor: " << config.mel_floor_ << "\n"
              << "  normalize_mel: " << config.normalize_mel_ << "\n"
              << "  add_floor_to_mel_before_log: "
              << config.add_floor_to_mel_before_log_ << "\n"
              << "  semicausal_padding: " << config.semicausal_padding_ << "\n"
              << "  non_zero_hanning: " << config.non_zero_hanning_ << "\n"
              << "  periodic_hanning: " << config.periodic_hanning_ << "\n"
              << "  fft_padding_type: " << config.fft_padding_type_ << "\n"
              << "}";
  }

  // Accessors, grouped per field (getter first, then setter).

  // The sample rate used while loading the audio. The audio should be
  // resampled to the configured sample rate.
  int GetSampleRateHz() const { return sample_rate_hz_; }
  void SetSampleRateHz(int sample_rate_hz) { sample_rate_hz_ = sample_rate_hz; }

  // The number of audio channels the preprocessor expects from the audio
  // content.
  int GetNumChannels() const { return num_channels_; }
  void SetNumChannels(int num_channels) { num_channels_ = num_channels; }

  // The scale applied to the audio PCM frames before they are processed into
  // a spectrogram.
  float GetInputScale() const { return input_scale_; }
  void SetInputScale(float input_scale) { input_scale_ = input_scale; }

  // The pre-emphasis factor applied to the audio before it is processed into
  // a spectrogram.
  float GetPreEmphasisFactor() const { return pre_emphasis_factor_; }
  void SetPreEmphasisFactor(float pre_emphasis_factor) {
    pre_emphasis_factor_ = pre_emphasis_factor;
  }

  // The FFT length used for processing the audio. The FFT length must be even
  // for real FFT optimization. Setting it also refreshes the FFT bins, which
  // are derived from the FFT length as FFT length / 2 + 1.
  int GetFftLength() const { return fft_length_; }
  void SetFftLength(int fft_length) {
    fft_length_ = fft_length;
    fft_bins_ = FftBinsFor(fft_length);
  }

  // The number of FFT bins used for the real-sequence Fourier transform
  // (RFFT) and Mel spectrogram processing. Read-only: it is always derived
  // from the FFT length as FFT length / 2 + 1.
  int GetFftBins() const { return fft_bins_; }

  // The frame length used for each frame of the Short-Time Fourier Transform
  // (STFT).
  int GetFrameLength() const { return frame_length_; }
  void SetFrameLength(int frame_length) { frame_length_ = frame_length; }

  // The hop length of the sliding window of the Short-Time Fourier Transform
  // (STFT).
  int GetHopLength() const { return hop_length_; }
  void SetHopLength(int hop_length) { hop_length_ = hop_length; }

  // The number of Mel bins used for Mel spectrogram processing.
  int GetNumMelBins() const { return num_mel_bins_; }
  void SetNumMelBins(int num_mel_bins) { num_mel_bins_ = num_mel_bins; }

  // The lower bound of the Mel frequency range.
  float GetMelLowHz() const { return mel_low_hz_; }
  void SetMelLowHz(float mel_low_hz) { mel_low_hz_ = mel_low_hz; }

  // The upper bound of the Mel frequency range.
  float GetMelHighHz() const { return mel_high_hz_; }
  void SetMelHighHz(float mel_high_hz) { mel_high_hz_ = mel_high_hz; }

  // The floor value of the Mel spectrogram.
  float GetMelFloor() const { return mel_floor_; }
  void SetMelFloor(float mel_floor) { mel_floor_ = mel_floor; }

  // Whether to normalize the Mel spectrogram with the precalculated mean and
  // standard deviation.
  bool GetNormalizeMel() const { return normalize_mel_; }
  void SetNormalizeMel(bool normalize_mel) { normalize_mel_ = normalize_mel; }

  // Whether to add the floor value to the Mel spectrogram before taking the
  // logarithm.
  bool GetAddFloorToMelBeforeLog() const {
    return add_floor_to_mel_before_log_;
  }
  void SetAddFloorToMelBeforeLog(bool add_floor_to_mel_before_log) {
    add_floor_to_mel_before_log_ = add_floor_to_mel_before_log;
  }

  // Whether to use semicausal padding for the audio frames.
  bool GetSemicausalPadding() const { return semicausal_padding_; }
  void SetSemicausalPadding(bool semicausal_padding) {
    semicausal_padding_ = semicausal_padding;
  }

  // Whether to use a non-zero Hanning window for the FFT.
  bool GetNonZeroHanning() const { return non_zero_hanning_; }
  void SetNonZeroHanning(bool non_zero_hanning) {
    non_zero_hanning_ = non_zero_hanning;
  }

  // Whether to use the periodic Hanning window for the FFT.
  bool GetPeriodicHanning() const { return periodic_hanning_; }
  void SetPeriodicHanning(bool periodic_hanning) {
    periodic_hanning_ = periodic_hanning;
  }

  // The padding type used for the FFT.
  FftPaddingType GetFftPaddingType() const { return fft_padding_type_; }
  void SetFftPaddingType(FftPaddingType fft_padding_type) {
    fft_padding_type_ = fft_padding_type;
  }

  // Per-bin Mel spectrogram means (128 entries) used for the Universal Speech
  // Model (USM) during preprocessing.
  static constexpr std::array<float, 128> kUsmMelMean{
      6.398797734146062,  6.5292966718485665, 6.636971307272159,
      6.73283598251503,   6.83729192594687,   6.955722303271236,
      7.102944890730766,  7.114182036087843,  7.1506544101153,
      7.174958993259514,  7.1890256978077804, 7.196835788986042,
      7.211737590554171,  7.365040287042535,  7.350661707754529,
      7.34752702412618,   7.370936184320344,  7.552167274579683,
      7.4736985912567455, 7.461733145619613,  7.655010083032587,
      7.537023586741711,  7.59332033698754,   7.678828995158089,
      7.573545549481997,  7.721706263812856,  7.548489195294597,
      7.647480899467908,  7.546350507038094,  7.552359044394656,
      7.60142267532906,   7.510803537242497,  7.547512749381739,
      7.5734628575808145, 7.516065818981327,  7.544310572169082,
      7.556128732606547,  7.578428971230521,  7.565946473157099,
      7.565821431053628,  7.582146705201401,  7.5917054493764775,
      7.59647680034444,   7.612909043144701,  7.642191074647679,
      7.682020208604412,  7.669657702288002,  7.636762908696176,
      7.645613169792156,  7.687786852309006,  7.733375349074729,
      7.705414197270183,  7.773851002316419,  7.767855696186511,
      7.804625030416079,  7.8095583241565505, 7.845300151068656,
      7.832030482713495,  7.876477438621265,  7.886595835981996,
      7.907747879286325,  7.926010325946424,  7.927971987569718,
      7.94765994925662,   7.9609369675109205, 7.977485334083968,
      7.995276449058029,  8.020093867153456,  8.026893789702653,
      8.036394113138993,  8.072079269745391,  8.072009510709744,
      8.15832987882215,   8.169035932109242,  8.201262910500471,
      8.203176911295596,  8.237251381186532,  8.265968214462914,
      8.278791003594298,  8.279921657260331,  8.303751782080207,
      8.323985266369666,  8.358499418073363,  8.368121771923692,
      8.392162333974197,  8.40529917133684,   8.421934604788884,
      8.43307981480797,   8.416437732709245,  8.380481381138022,
      8.313028108945332,  8.172698101608145,  7.987087868524417,
      7.775018865353218,  7.587469885918491,  7.485680948258058,
      7.425561455270659,  7.426161453764725,  7.500171657170674,
      7.473711809407939,  7.497915553109761,  7.555291079941853,
      7.5404297094497155, 7.554637855844384,  7.5536294881940025,
      7.597411437015373,  7.620857310821611,  7.622024042245356,
      7.643684482318661,  7.651806604022742,  7.647768200868812,
      7.619968160658521,  7.663675433728041,  7.770133777809638,
      7.775737195054957,  7.756637821283381,  7.7958903182806445,
      7.824714343764584,  7.8699194044250325, 7.857690367947652,
      7.854133456399421,  7.83057312917979,   7.780062155284722,
      7.687571300835443,  7.626255596158039,  7.475138444832542,
      7.31241576045514,   7.162930372619685,
  };

  // Per-bin Mel spectrogram standard deviations (128 entries) used for the
  // Universal Speech Model (USM) during preprocessing.
  static constexpr std::array<float, 128> kUsmMelStdDev{
      1.6785894541269812, 1.6687138672328043, 1.6906522689607268,
      1.7375192957945016, 1.7755335232132188, 1.7945350399969586,
      1.8160038735261768, 1.8455822079478754, 1.854889301328728,
      1.8544058257314018, 1.8531530795826658, 1.8568193392072,
      1.8568580559801775, 1.8403822120311448, 1.8311156303932052,
      1.8381223837390877, 1.8582757939740133, 1.8751353033960765,
      1.8940031697532662, 1.9045566324594227, 1.9114104933328382,
      1.9234409916967738, 1.932244372950416,  1.9354540832886058,
      1.9196173248258872, 1.8884371698304272, 1.8666212011400265,
      1.851852265212217,  1.8466309429379515, 1.8370433682382064,
      1.8312948374209728, 1.8233918348681029, 1.8162900339615862,
      1.813554336166136,  1.7988012203002604, 1.7783664628243725,
      1.762995373099593,  1.754638830337111,  1.7562192553046327,
      1.7570134298011308, 1.748103676233597,  1.7420266564237143,
      1.7433799765791382, 1.7405273444710188, 1.7681605535143332,
      1.7928765468247894, 1.7832784911754684, 1.7556019331853459,
      1.734978397119943,  1.7251193027145706, 1.711577677561937,
      1.7077475454470532, 1.702793505675667,  1.7087228728780646,
      1.7055479598955696, 1.7048659481569446, 1.7136985315687527,
      1.7003759527643025, 1.7038510617369829, 1.712407460050622,
      1.7195395708962748, 1.715985369102956,  1.7047382463157097,
      1.6858892841332958, 1.6803980138770978, 1.6883086163746897,
      1.678822586089551,  1.6704169259147215, 1.6824154866833487,
      1.7002006169486261, 1.7095077608591729, 1.7127719919531275,
      1.7007540237588394, 1.7007030789334565, 1.7006801726721705,
      1.7084333739135957, 1.7080081837410785, 1.7088852843730529,
      1.7058124003569382, 1.7104967128913229, 1.7017088898161998,
      1.6946290530635235, 1.6886895951157692, 1.6913609136330663,
      1.6802034976166595, 1.6778644057956866, 1.6844856225324205,
      1.6919889285341483, 1.6918548241011255, 1.6771215766236411,
      1.6753742459089904, 1.6732896439517075, 1.665104739745144,
      1.682512689327978,  1.7001049276791989, 1.71496232533367,
      1.751371703351037,  1.7589949482516734, 1.7274831977280356,
      1.7428303906628124, 1.7427952258580872, 1.7072930970436015,
      1.72696991469254,   1.7128335116767701, 1.7266508365456639,
      1.699287147275948,  1.6860698274507981, 1.6862991003373358,
      1.683393071329867,  1.687619365543026,  1.7100825041856975,
      1.7407356256589301, 1.7218710733945026, 1.6776658140019411,
      1.6864518015922916, 1.7273244787326472, 1.6992470398169233,
      1.6800806970795965, 1.6579370965601807, 1.6647055065206582,
      1.65766768806214,   1.6294301234765352, 1.5918612004781831,
      1.5335441292387613, 1.3949765253217616, 1.2628815962896491,
      1.1053653031914006, 0.9263256925938697,
  };

 private:
  // Number of FFT bins derived from an FFT length: FFT length / 2 + 1.
  static constexpr int FftBinsFor(int fft_length) {
    return fft_length / 2 + 1;
  }

  // Stores every value as given; only `fft_bins_` is computed. Instances are
  // obtained through the static factory functions above.
  explicit AudioPreprocessorConfig(
      // Audio decoding parameters.
      int sample_rate_hz, int num_channels,
      // FFT parameters.
      int frame_length, int hop_length, int fft_length, float input_scale,
      float pre_emphasis_factor,
      // Mel spectrogram parameters.
      int num_mel_bins, float mel_low_hz, float mel_high_hz, float mel_floor,
      bool normalize_mel, bool add_floor_to_mel_before_log,
      bool semicausal_padding, bool non_zero_hanning, bool periodic_hanning,
      FftPaddingType fft_padding_type)
      // The initializers follow the member declaration order below, which
      // differs from the parameter order.
      : sample_rate_hz_(sample_rate_hz),
        num_channels_(num_channels),
        fft_length_(fft_length),
        fft_bins_(FftBinsFor(fft_length)),
        frame_length_(frame_length),
        hop_length_(hop_length),
        num_mel_bins_(num_mel_bins),
        mel_low_hz_(mel_low_hz),
        mel_high_hz_(mel_high_hz),
        mel_floor_(mel_floor),
        input_scale_(input_scale),
        pre_emphasis_factor_(pre_emphasis_factor),
        normalize_mel_(normalize_mel),
        add_floor_to_mel_before_log_(add_floor_to_mel_before_log),
        semicausal_padding_(semicausal_padding),
        non_zero_hanning_(non_zero_hanning),
        periodic_hanning_(periodic_hanning),
        fft_padding_type_(fft_padding_type) {}

  // Audio decoding.
  int sample_rate_hz_;
  int num_channels_;
  // Framing and FFT. `fft_bins_` is kept in sync with `fft_length_`.
  int fft_length_;
  int fft_bins_;
  int frame_length_;
  int hop_length_;
  // Mel spectrogram.
  int num_mel_bins_;
  float mel_low_hz_;
  float mel_high_hz_;
  float mel_floor_;
  // Signal conditioning applied before the spectrogram is computed.
  float input_scale_;
  float pre_emphasis_factor_;
  // Behavior switches.
  bool normalize_mel_;
  bool add_floor_to_mel_before_log_;
  bool semicausal_padding_;
  bool non_zero_hanning_;
  bool periodic_hanning_;
  FftPaddingType fft_padding_type_;
};

// Interface for audio preprocessing. It only declares pure virtual
// operations; concrete preprocessors implement Preprocess() and Reset().
class AudioPreprocessor {
 public:
  // Virtual so that an implementation can be destroyed through a pointer to
  // this interface.
  virtual ~AudioPreprocessor() = default;

  // Preprocesses the undecoded audio bytes and returns the preprocessed audio.
  // The result is wrapped in absl::StatusOr, so a failure is reported as a
  // non-OK status instead of an InputAudio value.
  virtual absl::StatusOr<InputAudio> Preprocess(
      const InputAudio& input_audio) = 0;

  // Resets the audio preprocessor to its initial state.
  // NOTE(review): which state is carried across Preprocess() calls depends on
  // the implementation and is not visible in this header.
  virtual void Reset() = 0;
};

// Namespace-scope declaration of the stream operator for
// AudioPreprocessorConfig::FftPaddingType. The operator is defined inline as
// a friend inside AudioPreprocessorConfig.
// NOTE(review): presumably re-declared here so that it can also be found by
// ordinary (non-ADL) name lookup; confirm before removing.
std::ostream& operator<<(
    std::ostream& os,
    const AudioPreprocessorConfig::FftPaddingType& padding_type);

}  // namespace litert::lm

#endif  // THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_PREPROCESSOR_AUDIO_PREPROCESSOR_H_