| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
| | import argparse |
| | import librosa |
| | import logging |
| | import soundfile as sf |
| | import sys |
| | from pathlib import Path |
| |
|
| |
|
# Make this file's directory and the bundled tokenizer/detokenizer folders
# importable. The entries are derived from this file's location (not the
# current working directory) and must be registered before the
# `semantic_tokenizer` / `semantic_detokenizer` imports that follow.
sub_modules = ["", "semantic_tokenizer/f40ms", "semantic_detokenizer"]
_repo_root = Path(__file__).parent
for sub in sub_modules:
    _module_dir = str((_repo_root / sub).absolute())
    # Skip directories that are already registered (for example the script's
    # own directory) so the search path does not accumulate duplicates.
    if _module_dir not in sys.path:
        sys.path.append(_module_dir)
| |
|
| | from semantic_tokenizer.f40ms.simple_tokenizer_infer import SpeechTokenizer, TOKENIZER_CFG_NAME |
| | from semantic_detokenizer.chunk_infer import SpeechDetokenizer |
| |
|
| |
|
class ReconstructionPipeline:
    """Tokenize a reference and an input recording, then re-synthesize audio.

    The pipeline owns a ``SpeechTokenizer`` and a ``SpeechDetokenizer``.
    ``reconstruct`` extracts token sequences for both recordings and asks the
    detokenizer to generate a waveform from the input tokens, using the
    reference recording and its tokens as conditioning inputs.
    """

    def __init__(
        self,
        detok_vocoder: str,
        tokenizer_cfg_name: str = TOKENIZER_CFG_NAME,
        tokenizer_cfg_path: str = str(
            (Path(__file__).parent / "semantic_tokenizer/f40ms/config").absolute()
        ),
        tokenizer_ckpt: str = str(
            (
                Path(__file__).parent / "semantic_tokenizer/f40ms/ckpt/model.pt"
            ).absolute()
        ),
        detok_model_cfg: str = str(
            (Path(__file__).parent / "semantic_detokenizer/ckpt/model.yaml").absolute()
        ),
        detok_ckpt: str = str(
            (Path(__file__).parent / "semantic_detokenizer/ckpt/model.pt").absolute()
        ),
        detok_vocab: str = str(
            (
                Path(__file__).parent / "semantic_detokenizer/ckpt/vocab_4096.txt"
            ).absolute()
        ),
        device: str = "cuda:0",
    ):
        """Build the tokenizer and detokenizer used for reconstruction.

        Args:
            detok_vocoder: Path to the vocoder, forwarded to the detokenizer.
            tokenizer_cfg_name: Config name forwarded to the tokenizer.
            tokenizer_cfg_path: Directory holding the tokenizer config.
            tokenizer_ckpt: Path to the tokenizer checkpoint.
            detok_model_cfg: Path to the detokenizer model config.
            detok_ckpt: Path to the detokenizer checkpoint.
            detok_vocab: Path to the detokenizer vocabulary file.
            device: Device string handed to the detokenizer. The tokenizer is
                constructed without a device argument, so this setting only
                affects the detokenizer.

        All path defaults point at files located next to this module.
        """
        self.tokenizer_cfg_name = tokenizer_cfg_name
        self.tokenizer = SpeechTokenizer(
            ckpt_path=tokenizer_ckpt,
            cfg_path=tokenizer_cfg_path,
            cfg_name=self.tokenizer_cfg_name,
        )

        self.device = device
        self.detoker = SpeechDetokenizer(
            vocoder_path=detok_vocoder,
            model_cfg=detok_model_cfg,
            ckpt_file=detok_ckpt,
            vocab_file=detok_vocab,
            device=self.device,
        )

        # Generation settings passed positionally to
        # ``SpeechDetokenizer.chunk_generate``. They are plain attributes so a
        # caller can adjust them after construction.
        # NOTE(review): units and exact semantics are defined by the
        # detokenizer implementation, which is not visible here.
        self.token_chunk_len = 75
        self.chunk_cond_proportion = 0.3
        self.chunk_look_ahead = 10
        self.max_ref_duration = 4.5
        self.ref_audio_cut_from_head = False

    def reconstruct(self, ref_wav, input_wav):
        """Generate a waveform for ``input_wav`` conditioned on ``ref_wav``.

        Args:
            ref_wav: Path to the reference recording.
            input_wav: Path to the recording to reconstruct.

        Returns:
            A ``(generated_wave, target_sample_rate)`` pair as returned by the
            detokenizer, or ``(None, None)`` when the detokenizer produced no
            waveform.
        """
        # Both recordings are loaded at 16 kHz and tokenized in one batch;
        # index 0 is the reference and index 1 is the input.
        waveforms = [
            librosa.load(path, sr=16000)[0] for path in (ref_wav, input_wav)
        ]
        _, token_info_list = self.tokenizer.extract(waveforms)
        ref_tokens = token_info_list[0]["reduced_unit_sequence"]
        input_tokens = token_info_list[1]["reduced_unit_sequence"]

        # Lazy logging arguments avoid formatting the (potentially long) token
        # strings when INFO logging is disabled.
        logging.info("tokens for ref wav: %s are [%s]", ref_wav, ref_tokens)
        logging.info("tokens for input wav: %s are [%s]", input_wav, input_tokens)

        # The token sequences are whitespace-separated strings; the
        # detokenizer receives them as lists of individual tokens.
        generated_wave, target_sample_rate = self.detoker.chunk_generate(
            ref_wav,
            ref_tokens.split(),
            input_tokens.split(),
            self.token_chunk_len,
            self.chunk_cond_proportion,
            self.chunk_look_ahead,
            self.max_ref_duration,
            self.ref_audio_cut_from_head,
        )

        if generated_wave is None:
            # Reported at ERROR level so the failure is visible with the
            # default logging configuration.
            logging.error("generation FAILED")
            return None, None
        return generated_wave, target_sample_rate
| |
|
| |
|
# Optional command-line options that map one-to-one onto keyword arguments of
# ``ReconstructionPipeline``. They are forwarded only when set, so the
# pipeline's own defaults stay in effect otherwise.
PIPELINE_OVERRIDE_OPTIONS = (
    "tokenizer_ckpt",
    "tokenizer_cfg_path",
    "detok_ckpt",
    "detok_model_cfg",
    "detok_vocab",
)


def build_arg_parser():
    """Create the command-line parser for the reconstruction script.

    Every optional path option defaults to ``None`` so that an omitted option
    falls back to the pipeline's own default. ``--tokenizer-cfg-path``
    previously carried a working-directory-relative default; that default is
    dropped so an omitted option no longer shadows the pipeline default.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--tokenizer-ckpt",
        required=False,
        help="path to ckpt",
    )
    parser.add_argument(
        "--tokenizer-cfg-path",
        required=False,
        help="path to config",
    )
    parser.add_argument(
        "--detok-ckpt",
        required=False,
        help="path to ckpt",
    )
    parser.add_argument(
        "--detok-model-cfg",
        required=False,
        help="path to model_cfg",
    )
    parser.add_argument(
        "--detok-vocab",
        required=False,
        help="path to vocab",
    )
    parser.add_argument(
        "--detok-vocoder",
        required=True,
        help="path to vocoder",
    )
    parser.add_argument(
        "--ref-wav",
        required=True,
        help="path to ref wav",
    )
    parser.add_argument(
        "--output-wav",
        required=True,
        help="path to output reconstructed wav",
    )
    parser.add_argument(
        "--input-wav",
        required=True,
        help="input wav to reconstruction",
    )
    return parser


def collect_pipeline_overrides(args):
    """Return the optional pipeline arguments that were actually supplied.

    Args:
        args: Parsed arguments (any object with attributes). Missing
            attributes are treated the same as ``None``.

    Returns:
        A dict mapping pipeline keyword names to their non-``None`` values.
    """
    candidates = (
        (name, getattr(args, name, None)) for name in PIPELINE_OVERRIDE_OPTIONS
    )
    return {name: value for name, value in candidates if value is not None}


def main(args):
    """Run one reconstruction and write the result to ``args.output_wav``.

    Args:
        args: Parsed arguments providing ``detok_vocoder``, ``ref_wav``,
            ``input_wav`` and ``output_wav``, plus any of the optional
            overrides listed in ``PIPELINE_OVERRIDE_OPTIONS``.

    Raises:
        RuntimeError: If the pipeline returns no waveform. Nothing is written
            in that case, so no empty output file is left behind.
    """
    reconstructor = ReconstructionPipeline(
        detok_vocoder=args.detok_vocoder,
        **collect_pipeline_overrides(args),
    )

    generated_wave, target_sample_rate = reconstructor.reconstruct(
        args.ref_wav, args.input_wav
    )
    if generated_wave is None:
        raise RuntimeError(
            f"generation FAILED for input wav: {args.input_wav}; "
            f"nothing written to: {args.output_wav}"
        )

    # Write by path directly: opening the file beforehand is redundant and
    # would leave an empty file behind if writing failed.
    sf.write(args.output_wav, generated_wave, target_sample_rate)
    logging.info("write output to: %s", args.output_wav)

    logging.info("Finished")
    return


if __name__ == "__main__":
    main(build_arg_parser().parse_args())
| |
|