tiny-random
/

omnivoice

Model card Files Files and versions

omnivoice / README.md

yujiepan's picture

Upload folder using huggingface_hub

cdc2d96 verified 25 days ago

|

history blame contribute delete

3.65 kB

	---
	library_name: transformers
	base_model:
	- k2-fsa/OmniVoice
	---

	This tiny model is intended for debugging. It is randomly initialized using a configuration adapted from [k2-fsa/OmniVoice](https://huggingface.co/k2-fsa/OmniVoice).

	| File path | Size |
	|------|------|
	| model.safetensors | 5.5MB |
	| audio_tokenizer/model.safetensors | 6.7MB |


	### Example usage:

	```python
	from omnivoice import OmniVoice
	import torch
	import torchaudio

	# Load the tiny debugging checkpoint in bfloat16.
	model_id = "tiny-random/omnivoice"
	model = OmniVoice.from_pretrained(model_id, dtype=torch.bfloat16)

	# Sample rate passed to torchaudio.save for both output files.
	sample_rate = 24000

	# Example 1: generate from text plus a free-form `instruct` string.
	first_text = "Hello, this is test example 1"
	first_wav = "/tmp/example1.wav"
	audio = model.generate(text=first_text, instruct="low pitch, british accent")
	torchaudio.save(first_wav, audio[0], sample_rate)

	# Example 2: generate again, this time passing the file written above and
	# its transcript as `ref_audio` / `ref_text` instead of an instruction.
	audio2 = model.generate(
	    text="Hello, this is test example 2",
	    ref_audio=first_wav,
	    ref_text=first_text,
	)
	torchaudio.save("/tmp/example2.wav", audio2[0], sample_rate)
	```

	### Code to create this repo:

	<details>
	<summary>Click to expand</summary>

	```python
	import torch
	import os

	from transformers import (
	set_seed,
	AutoConfig,
	AutoTokenizer,
	HiggsAudioV2TokenizerModel,
	AutoFeatureExtractor,
	)
	from huggingface_hub import hf_hub_download
	import json
	from omnivoice import OmniVoice, OmniVoiceConfig

	source_model_id = "k2-fsa/OmniVoice"
	save_folder = "/tmp/tiny-random/omnivoice"
	# Sub-folder that receives the audio tokenizer config, weights and
	# feature extractor.
	audio_tokenizer_folder = os.path.join(save_folder, "audio_tokenizer")


	def _load_source_config(filename):
	    """Download `filename` from the source repo and return it parsed as JSON."""
	    local_path = hf_hub_download(source_model_id, filename=filename, repo_type="model")
	    with open(local_path, "r", encoding="utf-8") as f:
	        return json.load(f)


	def _randomize_parameters(module):
	    """Overwrite every parameter of `module` with N(0, 0.2) samples, seeded with 42.

	    Parameters are visited in name-sorted order and each name/shape is printed.
	    """
	    set_seed(42)
	    with torch.no_grad():
	        for name, p in sorted(module.named_parameters()):
	            torch.nn.init.normal_(p, 0, 0.2)
	            print(name, p.shape)


	# Text tokenizer: copied unchanged from the source repo.
	set_seed(42)
	tokenizer = AutoTokenizer.from_pretrained(source_model_id, trust_remote_code=True)
	tokenizer.save_pretrained(save_folder)

	# Audio tokenizer: start from the source config and override the size fields.
	config_dict = _load_source_config("audio_tokenizer/config.json")
	config_dict["acoustic_model_config"].update(
	    {
	        "decoder_hidden_size": 32,
	        "encoder_hidden_size": 4,
	        "hidden_size": 4,
	        "codebook_dim": 8,
	    }
	)
	config_dict["semantic_model_config"].update(
	    {
	        "conv_dim": [8] * 7,
	        "hidden_size": 16 * 4,
	        "intermediate_size": 64,
	        "num_attention_heads": 4,
	        "num_hidden_layers": 2,
	    }
	)
	os.makedirs(audio_tokenizer_folder, exist_ok=True)
	# The edited config is written to disk first so AutoConfig can load it back.
	with open(os.path.join(audio_tokenizer_folder, "config.json"), "w", encoding="utf-8") as f:
	    json.dump(config_dict, f, ensure_ascii=False, indent=2)
	audio_tokenizer = HiggsAudioV2TokenizerModel(
	    AutoConfig.from_pretrained(audio_tokenizer_folder)
	)
	print(audio_tokenizer)
	# Re-initialise BEFORE saving. Saving first would leave the seeded N(0, 0.2)
	# weights out of the checkpoint; this order matches the main model below.
	# NOTE(review): if the published weights were saved before re-initialisation,
	# this script will not reproduce them byte-for-byte - confirm.
	_randomize_parameters(audio_tokenizer)
	audio_tokenizer.save_pretrained(audio_tokenizer_folder)

	feature_extractor = AutoFeatureExtractor.from_pretrained(source_model_id, subfolder="audio_tokenizer")
	feature_extractor.save_pretrained(audio_tokenizer_folder)

	# Main model: override the size fields of the source `llm_config`.
	config_dict = _load_source_config("config.json")
	config_dict["llm_config"].update(
	    {
	        "hidden_size": 8,
	        "head_dim": 32,
	        "intermediate_size": 32,
	        "num_attention_heads": 8,
	        "num_key_value_heads": 4,
	        "num_hidden_layers": 4,
	        "max_window_layers": 2,
	        "layer_types": ["full_attention"] * 4,
	    }
	)
	config = OmniVoiceConfig.from_dict(config_dict)
	model = OmniVoice(config).eval()
	_randomize_parameters(model)
	model.save_pretrained(save_folder)
	```

	</details>