{
"_comment": "Runtime config for the StyleTTS2 (LibriTTS) CoreML bundle. Values mirror yl4579/StyleTTS2 Configs/config_libritts.yml at the conversion commit. The host application (Swift/Python) owns the sampler loop, alignment construction, and bucket routing; this file just documents the contract between the host and the shipped mlpackages.",
"model_type": "styletts2",
"checkpoint": "yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth",
"library": "coreml",
"format_versions": ["mlpackage", "mlmodelc"],
"minimum_deployment_target": {
"ios": "17.0",
"macos": "14.0"
},
"audio": {
"sample_rate": 24000,
"n_fft": 2048,
"win_length": 1200,
"hop_length": 300,
"n_mels": 80
},
"tokenizer": {
"type": "espeak-ng-ipa",
"vocab_file": "constants/text_cleaner_vocab.json",
"n_tokens": 178,
"pad_token": "$",
"pad_id": 0
},
"model": {
"style_dim": 128,
"hidden_dim": 512,
"n_layer": 3,
"max_dur": 50,
"ref_s_dim": 256,
"_ref_s_split_comment": "ref_s is concat([acoustic_ref (128,), prosody_ref (128,)]) — same convention as upstream Inference_LibriTTS.ipynb"
},
"sampler": {
"type": "ADPM2",
"schedule": "karras",
"num_steps": 5,
"classifier_free_guidance": true,
"cfg_scale_default": 1.0,
"_comment": "Sampler loop runs in the host. Each step calls styletts2_diffusion_step_512.mlpackage."
},
"stages": [
{
"name": "text_predictor",
"packages": [
"styletts2_text_predictor_32.mlpackage",
"styletts2_text_predictor_64.mlpackage",
"styletts2_text_predictor_128.mlpackage",
"styletts2_text_predictor_256.mlpackage",
"styletts2_text_predictor_512.mlpackage"
],
"bucket_axis": "tokens",
"bucket_sizes": [32, 64, 128, 256, 512],
"compute_unit_recommended": "ANE",
"precision": "fp16",
"calls_per_utterance": 1,
"inputs": ["tokens (1, T_tok) int32"],
"outputs": ["d_en (1, T_dur, hidden)", "s_pred (1, 256)", "duration_logits"]
},
{
"name": "diffusion_step",
"packages": ["styletts2_diffusion_step_512.mlpackage"],
"bucket_axis": "bert_dur",
"bucket_sizes": [512],
"compute_unit_recommended": "CPU_AND_GPU",
"precision": "fp16",
"calls_per_utterance_default": 5,
"inputs": ["x", "sigma", "embedding (bert_dur)", "features (ref_s)"],
"outputs": ["x_next"]
},
{
"name": "f0n_energy",
"packages": ["styletts2_f0n_energy.mlpackage"],
"bucket_axis": null,
"bucket_sizes": null,
"compute_unit_recommended": "ANE",
"precision": "fp16",
"calls_per_utterance": 1,
"inputs": ["en (1, 512, T_mel)", "s (1, 128)"],
"outputs": ["F0", "N"]
},
{
"name": "decoder",
"packages": [
"styletts2_decoder_256.mlpackage",
"styletts2_decoder_512.mlpackage",
"styletts2_decoder_1024.mlpackage",
"styletts2_decoder_2048.mlpackage",
"styletts2_decoder_4096.mlpackage"
],
"bucket_axis": "mel_frames",
"bucket_sizes": [256, 512, 1024, 2048, 4096],
"compute_unit_recommended": "CPU_AND_GPU",
"precision": "fp32",
"_precision_comment": "fp32 is required end-to-end. SineGen accumulates phase via cumsum × 2π × hop=300, reaching ~4000 mid-frame; fp16 precision at that magnitude scrambles the sine output. See PHASE6_FP16_DECODER.md.",
"calls_per_utterance": 1,
"inputs": ["asr", "F0", "N", "ref (1, 128)"],
"outputs": ["waveform (1, T_audio) @ 24kHz"]
}
],
"bucket_routing": {
"_comment": "Round each variable-length input UP to the next bucket. Pad with zeros.",
"text_predictor.tokens": [32, 64, 128, 256, 512],
"diffusion_step.embedding": [512],
"decoder.asr": [256, 512, 1024, 2048, 4096]
},
"performance": {
"rtfx_warm_m_series": 4.32,
"log_mel_cosine_vs_pytorch_fp32": 0.9687,
"ecapa_tdnn_cosine_to_reference": 0.18,
"_pytorch_fp32_ecapa_baseline": 0.29,
"on_disk_size_gb": 1.4
}
}