| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | from huggingface_hub import hf_hub_download |
| | from functools import lru_cache |
| | import os |
| |
|
# Copy every shared object from k2's lib directory into sherpa's lib directory.
# This runs before the `import k2` / `import sherpa` statements that follow it;
# presumably sherpa needs k2's .so files next to its own at import time
# (NOTE(review): confirm). The paths are hard-coded for a python3.8 user install.
os.system(
    "cp -v /home/user/.local/lib/python3.8/site-packages/k2/lib/*.so"
    " /home/user/.local/lib/python3.8/site-packages/sherpa/lib/"
)
| |
|
| | import k2 |
| | import sherpa |
| |
|
| |
|
# Sampling frequency written into `fbank_opts.frame_opts.samp_freq` by the
# recognizer loaders in this module.
sample_rate = 16_000
| |
|
| |
|
@lru_cache(maxsize=30)
def get_pretrained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
) -> sherpa.OfflineRecognizer:
    """Return the recognizer for ``repo_id`` by dispatching to its loader.

    The per-language registries defined at module level are probed in a fixed
    order; the first one containing ``repo_id`` supplies the loader. Results
    are memoized by ``lru_cache`` (up to 30 argument combinations).

    Args:
        repo_id: Hugging Face repo id of the model.
        decoding_method: Forwarded to the loader as a keyword argument.
        num_active_paths: Forwarded to the loader as a keyword argument.

    Returns:
        Whatever the selected loader returns (a ``sherpa.OfflineRecognizer``).

    Raises:
        ValueError: If ``repo_id`` is not present in any registry.
    """
    registries = (
        chinese_models,
        english_models,
        chinese_english_mixed_models,
        tibetan_models,
        arabic_models,
        german_models,
    )
    for registry in registries:
        if repo_id in registry:
            loader = registry[repo_id]
            # Keep repo_id positional and the rest as keywords: the loaders are
            # themselves lru_cache'd and the call form is part of the cache key.
            return loader(
                repo_id,
                decoding_method=decoding_method,
                num_active_paths=num_active_paths,
            )

    raise ValueError(f"Unsupported repo_id: {repo_id}")
| |
|
| |
|
def _get_nn_model_filename(
    repo_id: str,
    filename: str,
    subfolder: str = "exp",
) -> str:
    """Fetch a neural-network model file from a Hugging Face repo.

    Args:
        repo_id: Hugging Face repo id to download from.
        filename: Name of the model file inside ``subfolder``.
        subfolder: Folder within the repo that holds the file.

    Returns:
        The value returned by ``hf_hub_download`` for the requested file.
    """
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
| |
|
| |
|
def _get_bpe_model_filename(
    repo_id: str,
    filename: str = "bpe.model",
    subfolder: str = "data/lang_bpe_500",
) -> str:
    """Fetch a BPE model file from a Hugging Face repo.

    Args:
        repo_id: Hugging Face repo id to download from.
        filename: Name of the BPE model file inside ``subfolder``.
        subfolder: Folder within the repo that holds the file.

    Returns:
        The value returned by ``hf_hub_download`` for the requested file.
    """
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
| |
|
| |
|
def _get_token_filename(
    repo_id: str,
    filename: str = "tokens.txt",
    subfolder: str = "data/lang_char",
) -> str:
    """Fetch a token table file from a Hugging Face repo.

    Args:
        repo_id: Hugging Face repo id to download from.
        filename: Name of the tokens file inside ``subfolder``.
        subfolder: Folder within the repo that holds the file.

    Returns:
        The value returned by ``hf_hub_download`` for the requested file.
    """
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
| |
|
| |
|
@lru_cache(maxsize=10)
def _get_aishell2_pretrained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
) -> sherpa.OfflineRecognizer:
    """Build an offline recognizer for one of the two aishell2 repos.

    Fetches ``cpu_jit.pt`` and the default tokens file, configures fbank
    features (module ``sample_rate``, 80 mel bins, dither 0) and creates the
    recognizer with ``use_gpu=False``. Up to 10 results are memoized.

    Args:
        repo_id: One of the supported aishell2 repo ids.
        decoding_method: Passed to ``sherpa.OfflineRecognizerConfig``.
        num_active_paths: Passed to ``sherpa.OfflineRecognizerConfig``.

    Returns:
        The constructed ``sherpa.OfflineRecognizer``.

    Raises:
        AssertionError: If ``repo_id`` is not a supported repo.
    """
    supported_repos = [
        "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12",
        "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12",
    ]
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(repo_id=repo_id, filename="cpu_jit.pt")
    tokens_path = _get_token_filename(repo_id=repo_id)

    features = sherpa.FeatureConfig()
    fbank = features.fbank_opts
    fbank.frame_opts.samp_freq = sample_rate
    fbank.mel_opts.num_bins = 80
    fbank.frame_opts.dither = 0

    return sherpa.OfflineRecognizer(
        sherpa.OfflineRecognizerConfig(
            nn_model=model_path,
            tokens=tokens_path,
            use_gpu=False,
            feat_config=features,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    )
| |
|
| |
|
@lru_cache(maxsize=10)
def _get_gigaspeech_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
) -> sherpa.OfflineRecognizer:
    """Build an offline recognizer for the gigaspeech transducer repo.

    The model file is fetched from the repo, while the tokens file is the
    relative path ``./giga-tokens.txt`` (not downloaded by this function).
    Up to 10 results are memoized.

    Args:
        repo_id: The supported gigaspeech repo id.
        decoding_method: Passed to ``sherpa.OfflineRecognizerConfig``.
        num_active_paths: Passed to ``sherpa.OfflineRecognizerConfig``.

    Returns:
        The constructed ``sherpa.OfflineRecognizer``.

    Raises:
        AssertionError: If ``repo_id`` is not the supported repo.
    """
    supported_repos = [
        "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
    ]
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit-iter-3488000-avg-20.pt",
    )
    # Resolved against the current working directory at load time.
    tokens_path = "./giga-tokens.txt"

    features = sherpa.FeatureConfig()
    fbank = features.fbank_opts
    fbank.frame_opts.samp_freq = sample_rate
    fbank.mel_opts.num_bins = 80
    fbank.frame_opts.dither = 0

    return sherpa.OfflineRecognizer(
        sherpa.OfflineRecognizerConfig(
            nn_model=model_path,
            tokens=tokens_path,
            use_gpu=False,
            feat_config=features,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    )
| |
|
| |
|
@lru_cache(maxsize=10)
def _get_librispeech_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
) -> sherpa.OfflineRecognizer:
    """Build an offline recognizer for a supported librispeech repo.

    The model file is ``cpu_jit.pt`` except for two repos that publish it
    under a different name. Tokens come from ``data/lang_bpe_500``.
    Up to 10 results are memoized.

    Args:
        repo_id: One of the supported librispeech repo ids.
        decoding_method: Passed to ``sherpa.OfflineRecognizerConfig``.
        num_active_paths: Passed to ``sherpa.OfflineRecognizerConfig``.

    Returns:
        The constructed ``sherpa.OfflineRecognizer``.

    Raises:
        AssertionError: If ``repo_id`` is not a supported repo.
    """
    supported_repos = [
        "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02",
        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",
        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11",
        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14",
    ]
    assert repo_id in supported_repos, repo_id

    # Repos whose model file is not named with the default "cpu_jit.pt".
    filename_overrides = {
        "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": "cpu_jit-torch-1.10.0.pt",
        "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": "cpu_jit-torch-1.10.pt",
    }
    model_filename = filename_overrides.get(repo_id, "cpu_jit.pt")

    model_path = _get_nn_model_filename(repo_id=repo_id, filename=model_filename)
    tokens_path = _get_token_filename(
        repo_id=repo_id,
        subfolder="data/lang_bpe_500",
    )

    features = sherpa.FeatureConfig()
    fbank = features.fbank_opts
    fbank.frame_opts.samp_freq = sample_rate
    fbank.mel_opts.num_bins = 80
    fbank.frame_opts.dither = 0

    return sherpa.OfflineRecognizer(
        sherpa.OfflineRecognizerConfig(
            nn_model=model_path,
            tokens=tokens_path,
            use_gpu=False,
            feat_config=features,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    )
| |
|
| |
|
@lru_cache(maxsize=10)
def _get_wenetspeech_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Build an offline recognizer for the wenetspeech transducer repo.

    Fetches the model file and the default tokens file, configures fbank
    features (module ``sample_rate``, 80 mel bins, dither 0) and creates the
    recognizer with ``use_gpu=False``. Up to 10 results are memoized.

    Args:
        repo_id: The supported wenetspeech repo id.
        decoding_method: Passed to ``sherpa.OfflineRecognizerConfig``.
        num_active_paths: Passed to ``sherpa.OfflineRecognizerConfig``.

    Returns:
        The constructed ``sherpa.OfflineRecognizer``.

    Raises:
        AssertionError: If ``repo_id`` is not the supported repo.
    """
    supported_repos = [
        "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2",
    ]
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit_epoch_10_avg_2_torch_1.7.1.pt",
    )
    tokens_path = _get_token_filename(repo_id=repo_id)

    features = sherpa.FeatureConfig()
    fbank = features.fbank_opts
    fbank.frame_opts.samp_freq = sample_rate
    fbank.mel_opts.num_bins = 80
    fbank.frame_opts.dither = 0

    return sherpa.OfflineRecognizer(
        sherpa.OfflineRecognizerConfig(
            nn_model=model_path,
            tokens=tokens_path,
            use_gpu=False,
            feat_config=features,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    )
| |
|
| |
|
@lru_cache(maxsize=10)
def _get_chinese_english_mixed_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
) -> sherpa.OfflineRecognizer:
    """Build an offline recognizer for a Chinese+English mixed model repo.

    Each supported repo has its own model filename and tokens subfolder.
    These are kept in a single table that drives both the validity check and
    the lookup, so the two cannot drift apart and the values can never be
    left unbound. Up to 10 results are memoized.

    Args:
        repo_id: One of the supported mixed-language repo ids.
        decoding_method: Passed to ``sherpa.OfflineRecognizerConfig``.
        num_active_paths: Passed to ``sherpa.OfflineRecognizerConfig``.

    Returns:
        The constructed ``sherpa.OfflineRecognizer``.

    Raises:
        AssertionError: If ``repo_id`` is not a supported repo.
    """
    # repo id -> (model filename under "exp", subfolder holding tokens.txt)
    artifacts = {
        "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5": (
            "cpu_jit.pt",
            "data/lang_char",
        ),
        "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh": (
            "cpu_jit-epoch-11-avg-1.pt",
            "data/lang_char_bpe",
        ),
    }
    assert repo_id in artifacts, repo_id
    filename, subfolder = artifacts[repo_id]

    nn_model = _get_nn_model_filename(
        repo_id=repo_id,
        filename=filename,
    )
    tokens = _get_token_filename(repo_id=repo_id, subfolder=subfolder)

    feat_config = sherpa.FeatureConfig()
    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
    feat_config.fbank_opts.mel_opts.num_bins = 80
    feat_config.fbank_opts.frame_opts.dither = 0

    config = sherpa.OfflineRecognizerConfig(
        nn_model=nn_model,
        tokens=tokens,
        use_gpu=False,
        feat_config=feat_config,
        decoding_method=decoding_method,
        num_active_paths=num_active_paths,
    )

    return sherpa.OfflineRecognizer(config)
| |
|
| |
|
@lru_cache(maxsize=10)
def _get_alimeeting_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Build an offline recognizer for the alimeeting transducer repo.

    Fetches the model file and the default tokens file, configures fbank
    features (module ``sample_rate``, 80 mel bins, dither 0) and creates the
    recognizer with ``use_gpu=False``. Up to 10 results are memoized.

    Args:
        repo_id: The supported alimeeting repo id.
        decoding_method: Passed to ``sherpa.OfflineRecognizerConfig``.
        num_active_paths: Passed to ``sherpa.OfflineRecognizerConfig``.

    Returns:
        The constructed ``sherpa.OfflineRecognizer``.

    Raises:
        AssertionError: If ``repo_id`` is not the supported repo.
    """
    supported_repos = [
        "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2",
    ]
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit_torch_1.7.1.pt",
    )
    tokens_path = _get_token_filename(repo_id=repo_id)

    features = sherpa.FeatureConfig()
    fbank = features.fbank_opts
    fbank.frame_opts.samp_freq = sample_rate
    fbank.mel_opts.num_bins = 80
    fbank.frame_opts.dither = 0

    return sherpa.OfflineRecognizer(
        sherpa.OfflineRecognizerConfig(
            nn_model=model_path,
            tokens=tokens_path,
            use_gpu=False,
            feat_config=features,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    )
| |
|
| |
|
@lru_cache(maxsize=10)
def _get_wenet_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Build an offline recognizer for a supported wenet repo.

    Both files are requested with ``subfolder="."``: ``final.zip`` as the
    model and ``units.txt`` as the tokens. The feature config is created with
    ``normalize_samples=False``. Up to 10 results are memoized.

    Args:
        repo_id: One of the supported wenet repo ids.
        decoding_method: Passed to ``sherpa.OfflineRecognizerConfig``.
        num_active_paths: Passed to ``sherpa.OfflineRecognizerConfig``.

    Returns:
        The constructed ``sherpa.OfflineRecognizer``.

    Raises:
        AssertionError: If ``repo_id`` is not a supported repo.
    """
    supported_repos = [
        "csukuangfj/wenet-chinese-model",
        "csukuangfj/wenet-english-model",
    ]
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="final.zip",
        subfolder=".",
    )
    tokens_path = _get_token_filename(
        repo_id=repo_id,
        filename="units.txt",
        subfolder=".",
    )

    features = sherpa.FeatureConfig(normalize_samples=False)
    fbank = features.fbank_opts
    fbank.frame_opts.samp_freq = sample_rate
    fbank.mel_opts.num_bins = 80
    fbank.frame_opts.dither = 0

    return sherpa.OfflineRecognizer(
        sherpa.OfflineRecognizerConfig(
            nn_model=model_path,
            tokens=tokens_path,
            use_gpu=False,
            feat_config=features,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    )
| |
|
| |
|
@lru_cache(maxsize=10)
def _get_aidatatang_200zh_pretrained_mode(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Build an offline recognizer for the aidatatang-200zh transducer repo.

    Fetches the model file and the default tokens file, configures fbank
    features (module ``sample_rate``, 80 mel bins, dither 0) and creates the
    recognizer with ``use_gpu=False``. Up to 10 results are memoized.

    Args:
        repo_id: The supported aidatatang-200zh repo id.
        decoding_method: Passed to ``sherpa.OfflineRecognizerConfig``.
        num_active_paths: Passed to ``sherpa.OfflineRecognizerConfig``.

    Returns:
        The constructed ``sherpa.OfflineRecognizer``.

    Raises:
        AssertionError: If ``repo_id`` is not the supported repo.
    """
    supported_repos = [
        "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2",
    ]
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="cpu_jit_torch.1.7.1.pt",
    )
    tokens_path = _get_token_filename(repo_id=repo_id)

    features = sherpa.FeatureConfig()
    fbank = features.fbank_opts
    fbank.frame_opts.samp_freq = sample_rate
    fbank.mel_opts.num_bins = 80
    fbank.frame_opts.dither = 0

    return sherpa.OfflineRecognizer(
        sherpa.OfflineRecognizerConfig(
            nn_model=model_path,
            tokens=tokens_path,
            use_gpu=False,
            feat_config=features,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    )
| |
|
| |
|
@lru_cache(maxsize=10)
def _get_tibetan_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Build an offline recognizer for a supported xbmu-amdo31 repo.

    The stateless5 repo publishes its model under a different filename than
    the default ``cpu_jit.pt``. Tokens come from ``data/lang_bpe_500``.
    Up to 10 results are memoized.

    Args:
        repo_id: One of the supported xbmu-amdo31 repo ids.
        decoding_method: Passed to ``sherpa.OfflineRecognizerConfig``.
        num_active_paths: Passed to ``sherpa.OfflineRecognizerConfig``.

    Returns:
        The constructed ``sherpa.OfflineRecognizer``.

    Raises:
        AssertionError: If ``repo_id`` is not a supported repo.
    """
    supported_repos = [
        "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
        "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29",
    ]
    assert repo_id in supported_repos, repo_id

    stateless5_repo = (
        "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29"
    )
    model_filename = (
        "cpu_jit-epoch-28-avg-23-torch-1.10.0.pt"
        if repo_id == stateless5_repo
        else "cpu_jit.pt"
    )

    model_path = _get_nn_model_filename(repo_id=repo_id, filename=model_filename)
    tokens_path = _get_token_filename(
        repo_id=repo_id,
        subfolder="data/lang_bpe_500",
    )

    features = sherpa.FeatureConfig()
    fbank = features.fbank_opts
    fbank.frame_opts.samp_freq = sample_rate
    fbank.mel_opts.num_bins = 80
    fbank.frame_opts.dither = 0

    return sherpa.OfflineRecognizer(
        sherpa.OfflineRecognizerConfig(
            nn_model=model_path,
            tokens=tokens_path,
            use_gpu=False,
            feat_config=features,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    )
| |
|
| |
|
@lru_cache(maxsize=10)
def _get_arabic_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Build an offline recognizer for the mgb2 conformer-CTC repo.

    Fetches ``cpu_jit.pt`` and the tokens file from ``data/lang_bpe_5000``,
    configures fbank features (module ``sample_rate``, 80 mel bins, dither 0)
    and creates the recognizer with ``use_gpu=False``. Up to 10 results are
    memoized.

    Args:
        repo_id: The supported mgb2 repo id.
        decoding_method: Passed to ``sherpa.OfflineRecognizerConfig``.
        num_active_paths: Passed to ``sherpa.OfflineRecognizerConfig``.

    Returns:
        The constructed ``sherpa.OfflineRecognizer``.

    Raises:
        AssertionError: If ``repo_id`` is not the supported repo.
    """
    supported_repos = [
        "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06",
    ]
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(repo_id=repo_id, filename="cpu_jit.pt")
    tokens_path = _get_token_filename(
        repo_id=repo_id,
        subfolder="data/lang_bpe_5000",
    )

    features = sherpa.FeatureConfig()
    fbank = features.fbank_opts
    fbank.frame_opts.samp_freq = sample_rate
    fbank.mel_opts.num_bins = 80
    fbank.frame_opts.dither = 0

    return sherpa.OfflineRecognizer(
        sherpa.OfflineRecognizerConfig(
            nn_model=model_path,
            tokens=tokens_path,
            use_gpu=False,
            feat_config=features,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    )
| |
|
| |
|
@lru_cache(maxsize=10)
def _get_german_pre_trained_model(
    repo_id: str,
    decoding_method: str,
    num_active_paths: int,
):
    """Build an offline recognizer for the wav2vec2.0 torchaudio repo.

    Both files are requested with ``subfolder="."``. Unlike the other loaders
    in this module, no ``feat_config`` is passed to the recognizer config.
    Up to 10 results are memoized.

    Args:
        repo_id: The supported wav2vec2.0 repo id.
        decoding_method: Passed to ``sherpa.OfflineRecognizerConfig``.
        num_active_paths: Passed to ``sherpa.OfflineRecognizerConfig``.

    Returns:
        The constructed ``sherpa.OfflineRecognizer``.

    Raises:
        AssertionError: If ``repo_id`` is not the supported repo.
    """
    supported_repos = [
        "csukuangfj/wav2vec2.0-torchaudio",
    ]
    assert repo_id in supported_repos, repo_id

    model_path = _get_nn_model_filename(
        repo_id=repo_id,
        filename="voxpopuli_asr_base_10k_de.pt",
        subfolder=".",
    )
    tokens_path = _get_token_filename(
        repo_id=repo_id,
        filename="tokens-de.txt",
        subfolder=".",
    )

    return sherpa.OfflineRecognizer(
        sherpa.OfflineRecognizerConfig(
            nn_model=model_path,
            tokens=tokens_path,
            use_gpu=False,
            decoding_method=decoding_method,
            num_active_paths=num_active_paths,
        )
    )
| |
|
| |
|
# Per-language registries: Hugging Face repo id -> loader function that builds
# the recognizer for that repo. get_pretrained_model() dispatches through them.
chinese_models = {
    "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2": _get_wenetspeech_pre_trained_model,
    "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12": _get_aishell2_pretrained_model,
    "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12": _get_aishell2_pretrained_model,
    "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2": _get_aidatatang_200zh_pretrained_mode,
    "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2": _get_alimeeting_pre_trained_model,
    "csukuangfj/wenet-chinese-model": _get_wenet_model,
}

english_models = {
    "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2": _get_gigaspeech_pre_trained_model,
    "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": _get_librispeech_pre_trained_model,
    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14": _get_librispeech_pre_trained_model,
    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": _get_librispeech_pre_trained_model,
    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13": _get_librispeech_pre_trained_model,
    "csukuangfj/wenet-english-model": _get_wenet_model,
}

chinese_english_mixed_models = {
    "ptrnull/icefall-asr-conv-emformer-transducer-stateless2-zh": _get_chinese_english_mixed_model,
    "luomingshuang/icefall_asr_tal-csasr_pruned_transducer_stateless5": _get_chinese_english_mixed_model,
}

tibetan_models = {
    "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02": _get_tibetan_pre_trained_model,
    "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29": _get_tibetan_pre_trained_model,
}

arabic_models = {
    "AmirHussein/icefall-asr-mgb2-conformer_ctc-2022-27-06": _get_arabic_pre_trained_model,
}

german_models = {
    "csukuangfj/wav2vec2.0-torchaudio": _get_german_pre_trained_model,
}

# Union of all per-language registries, merged in the order listed.
all_models = {
    **chinese_models,
    **english_models,
    **chinese_english_mixed_models,
    **tibetan_models,
    **arabic_models,
    **german_models,
}

# Language label -> repo ids available for it, in registry insertion order.
language_to_models = {
    "Chinese": list(chinese_models),
    "English": list(english_models),
    "Chinese+English": list(chinese_english_mixed_models),
    "Tibetan": list(tibetan_models),
    "Arabic": list(arabic_models),
    "German": list(german_models),
}
| |
|