{
"_comment": "Runtime config for the StyleTTS2 (LibriTTS) CoreML bundle. Values mirror yl4579/StyleTTS2 Configs/config_libritts.yml at the conversion commit. The host application (Swift/Python) owns the sampler loop, alignment construction, and bucket routing; this file just documents the contract between the host and the shipped mlpackages.",
"model_type": "styletts2",
"checkpoint": "yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth",
"library": "coreml",
"format_versions": ["mlpackage", "mlmodelc"],
"minimum_deployment_target": {
"ios": "17.0",
"macos": "14.0"
},
"audio": {
"sample_rate": 24000,
"n_fft": 2048,
"win_length": 1200,
"hop_length": 300,
"n_mels": 80
},
"tokenizer": {
"type": "espeak-ng-ipa",
"vocab_file": "constants/text_cleaner_vocab.json",
"n_tokens": 178,
"pad_token": "$",
"pad_id": 0
},
"model": {
"style_dim": 128,
"hidden_dim": 512,
"n_layer": 3,
"max_dur": 50,
"ref_s_dim": 256,
"_ref_s_split_comment": "ref_s is concat([acoustic_ref (128,), prosody_ref (128,)]) — same convention as upstream Inference_LibriTTS.ipynb"
},
"sampler": {
"type": "ADPM2",
"schedule": "karras",
"num_steps": 5,
"classifier_free_guidance": true,
"cfg_scale_default": 1.0,
"_comment": "Sampler loop runs in the host. Each step calls styletts2_diffusion_step_512.mlpackage."
},
"stages": [
{
"name": "text_predictor",
"packages": [
"styletts2_text_predictor_32.mlpackage",
"styletts2_text_predictor_64.mlpackage",
"styletts2_text_predictor_128.mlpackage",
"styletts2_text_predictor_256.mlpackage",
"styletts2_text_predictor_512.mlpackage"
],
"bucket_axis": "tokens",
"bucket_sizes": [32, 64, 128, 256, 512],
"compute_unit_recommended": "ANE",
"precision": "fp16",
"calls_per_utterance": 1,
"inputs": ["tokens (1, T_tok) int32"],
"outputs": ["d_en (1, T_dur, hidden)", "s_pred (1, 256)", "duration_logits"]
},
{
"name": "diffusion_step",
"packages": ["styletts2_diffusion_step_512.mlpackage"],
"bucket_axis": "bert_dur",
"bucket_sizes": [512],
"compute_unit_recommended": "CPU_AND_GPU",
"precision": "fp16",
"calls_per_utterance_default": 5,
"inputs": ["x", "sigma", "embedding (bert_dur)", "features (ref_s)"],
"outputs": ["x_next"]
},
{
"name": "f0n_energy",
"packages": ["styletts2_f0n_energy.mlpackage"],
"bucket_axis": null,
"bucket_sizes": null,
"compute_unit_recommended": "ANE",
"precision": "fp16",
"calls_per_utterance": 1,
"inputs": ["en (1, 512, T_mel)", "s (1, 128)"],
"outputs": ["F0", "N"]
},
{
"name": "decoder",
"packages": [
"styletts2_decoder_256.mlpackage",
"styletts2_decoder_512.mlpackage",
"styletts2_decoder_1024.mlpackage",
"styletts2_decoder_2048.mlpackage",
"styletts2_decoder_4096.mlpackage"
],
"bucket_axis": "mel_frames",
"bucket_sizes": [256, 512, 1024, 2048, 4096],
"compute_unit_recommended": "CPU_AND_GPU",
"precision": "fp32",
"_precision_comment": "fp32 is required end-to-end. SineGen accumulates phase via cumsum × 2π × hop=300, reaching ~4000 mid-frame; fp16 precision at that magnitude scrambles the sine output. See PHASE6_FP16_DECODER.md.",
"calls_per_utterance": 1,
"inputs": ["asr", "F0", "N", "ref (1, 128)"],
"outputs": ["waveform (1, T_audio) @ 24kHz"]
}
],
"bucket_routing": {
"_comment": "Round each variable-length input UP to the next bucket. Pad with zeros.",
"text_predictor.tokens": [32, 64, 128, 256, 512],
"diffusion_step.embedding": [512],
"decoder.asr": [256, 512, 1024, 2048, 4096]
},
"performance": {
"rtfx_warm_m_series": 4.32,
"log_mel_cosine_vs_pytorch_fp32": 0.9687,
"ecapa_tdnn_cosine_to_reference": 0.18,
"_pytorch_fp32_ecapa_baseline": 0.29,
"on_disk_size_gb": 1.4
}
}