---
library_name: transformers
base_model:
- k2-fsa/OmniVoice
---

This tiny model is intended for debugging. It is randomly initialized using the configuration adapted from [k2-fsa/OmniVoice](https://huggingface.co/k2-fsa/OmniVoice).

| File path | Size |
|------|------|
| model.safetensors | 5.5MB |
| audio_tokenizer/model.safetensors | 6.7MB |

### Example usage:

```python
from omnivoice import OmniVoice
import torch
import torchaudio

model_id = "tiny-random/omnivoice"
model = OmniVoice.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
)
audio = model.generate(
    text="Hello, this is test example 1",
    instruct="low pitch, british accent",
)
torchaudio.save("/tmp/example1.wav", audio[0], 24000)

audio2 = model.generate(
    text="Hello, this is test example 2",
    ref_audio="/tmp/example1.wav",
    ref_text="Hello, this is test example 1",
)
torchaudio.save("/tmp/example2.wav", audio2[0], 24000)
```

### Code to create this repo:
Click to expand

```python
# Builds the tiny debugging checkpoint under `save_folder`, starting from the
# configuration files of `source_model_id`. Statement order matters: seeding,
# model construction, re-initialization and saving all affect what is written.
import torch
import os
from transformers import (
    set_seed,
    AutoConfig,
    AutoTokenizer,
    HiggsAudioV2TokenizerModel,
    AutoFeatureExtractor,
)
from huggingface_hub import hf_hub_download
import json
from omnivoice import OmniVoice, OmniVoiceConfig

source_model_id = "k2-fsa/OmniVoice"
save_folder = "/tmp/tiny-random/omnivoice"

set_seed(42)

# Load the text tokenizer from the source repo and write it to the output folder.
tokenizer = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
tokenizer.save_pretrained(save_folder)

# --- Audio tokenizer -------------------------------------------------------
# Start from the source audio tokenizer config and override selected
# dimensions of its two sub-configs with small values.
with open(
    hf_hub_download(source_model_id, filename="audio_tokenizer/config.json", repo_type="model"),
    "r",
    encoding="utf-8",
) as f:
    config_dict = json.load(f)
config_dict["acoustic_model_config"].update(
    {
        "decoder_hidden_size": 32,
        "encoder_hidden_size": 4,
        "hidden_size": 4,
        "codebook_dim": 8,
    }
)
config_dict["semantic_model_config"].update(
    {
        "conv_dim": [8] * 7,
        "hidden_size": 16 * 4,
        "intermediate_size": 64,
        "num_attention_heads": 4,
        "num_hidden_layers": 2,
    }
)

# Write the edited config first so AutoConfig can load it back from disk below.
os.makedirs(os.path.join(save_folder, "audio_tokenizer"), exist_ok=True)
with open(os.path.join(save_folder, "audio_tokenizer/config.json"), "w", encoding="utf-8") as f:
    json.dump(config_dict, f, ensure_ascii=False, indent=2)

audio_tokenizer = HiggsAudioV2TokenizerModel(
    AutoConfig.from_pretrained(os.path.join(save_folder, "audio_tokenizer"))
)
# NOTE(review): save_pretrained() runs BEFORE the normal_(p, 0, 0.2) loop below,
# so the weights written to audio_tokenizer/ are the ones produced by the
# constructor, not the re-drawn ones. The main model further down is saved
# AFTER its re-initialization loop. Confirm which ordering is intended before
# changing anything: reordering would change the files this script produces.
audio_tokenizer.save_pretrained(os.path.join(save_folder, "audio_tokenizer"))
print(audio_tokenizer)

set_seed(42)
with torch.no_grad():
    # Parameters are visited in sorted-name order; each one is overwritten
    # in place with draws from a normal distribution (mean 0, std 0.2).
    for name, p in sorted(audio_tokenizer.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.2)
        print(name, p.shape)

# Load the audio feature extractor from the source repo and save it alongside
# the audio tokenizer.
feature_extractor = AutoFeatureExtractor.from_pretrained(source_model_id, subfolder="audio_tokenizer")
feature_extractor.save_pretrained(os.path.join(save_folder, "audio_tokenizer"))

# --- Main model ------------------------------------------------------------
# Start from the source top-level config and override the `llm_config` entries.
with open(
    hf_hub_download(source_model_id, filename="config.json", repo_type="model"),
    "r",
    encoding="utf-8",
) as f:
    config_dict = json.load(f)
config_dict["llm_config"].update(
    {
        "hidden_size": 8,
        # head_dim is set explicitly; it is not hidden_size / num_attention_heads (8 / 8).
        "head_dim": 32,
        "intermediate_size": 32,
        "num_attention_heads": 8,
        "num_key_value_heads": 4,
        "num_hidden_layers": 4,
        "max_window_layers": 2,
        # One entry per hidden layer (4 entries for num_hidden_layers = 4).
        "layer_types": ["full_attention"] * 4,
    }
)
config = OmniVoiceConfig.from_dict(config_dict)
model = OmniVoice(config).eval()

set_seed(42)
with torch.no_grad():
    # Same re-initialization as above: sorted-name order, normal(mean 0, std 0.2).
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.2)
        print(name, p.shape)

# Saved after re-initialization, so the re-drawn weights are what gets written.
model.save_pretrained(save_folder)
```