{ "_comment": "Runtime config for the StyleTTS2 (LibriTTS) CoreML bundle. Values mirror yl4579/StyleTTS2 Configs/config_libritts.yml at the conversion commit. The host application (Swift/Python) owns the sampler loop, alignment construction, and bucket routing; this file just documents the contract between the host and the shipped mlpackages.", "model_type": "styletts2", "checkpoint": "yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth", "library": "coreml", "format_versions": ["mlpackage", "mlmodelc"], "minimum_deployment_target": { "ios": "17.0", "macos": "14.0" }, "audio": { "sample_rate": 24000, "n_fft": 2048, "win_length": 1200, "hop_length": 300, "n_mels": 80 }, "tokenizer": { "type": "espeak-ng-ipa", "vocab_file": "constants/text_cleaner_vocab.json", "n_tokens": 178, "pad_token": "$", "pad_id": 0 }, "model": { "style_dim": 128, "hidden_dim": 512, "n_layer": 3, "max_dur": 50, "ref_s_dim": 256, "_ref_s_split_comment": "ref_s is concat([acoustic_ref (128,), prosody_ref (128,)]) — same convention as upstream Inference_LibriTTS.ipynb" }, "sampler": { "type": "ADPM2", "schedule": "karras", "num_steps": 5, "classifier_free_guidance": true, "cfg_scale_default": 1.0, "_comment": "Sampler loop runs in the host. Each step calls styletts2_diffusion_step_512.mlpackage." 
}, "stages": [ { "name": "text_predictor", "packages": [ "styletts2_text_predictor_32.mlpackage", "styletts2_text_predictor_64.mlpackage", "styletts2_text_predictor_128.mlpackage", "styletts2_text_predictor_256.mlpackage", "styletts2_text_predictor_512.mlpackage" ], "bucket_axis": "tokens", "bucket_sizes": [32, 64, 128, 256, 512], "compute_unit_recommended": "ANE", "precision": "fp16", "calls_per_utterance": 1, "inputs": ["tokens (1, T_tok) int32"], "outputs": ["d_en (1, T_dur, hidden)", "s_pred (1, 256)", "duration_logits"] }, { "name": "diffusion_step", "packages": ["styletts2_diffusion_step_512.mlpackage"], "bucket_axis": "bert_dur", "bucket_sizes": [512], "compute_unit_recommended": "CPU_AND_GPU", "precision": "fp16", "calls_per_utterance_default": 5, "inputs": ["x", "sigma", "embedding (bert_dur)", "features (ref_s)"], "outputs": ["x_next"] }, { "name": "f0n_energy", "packages": ["styletts2_f0n_energy.mlpackage"], "bucket_axis": null, "bucket_sizes": null, "compute_unit_recommended": "ANE", "precision": "fp16", "calls_per_utterance": 1, "inputs": ["en (1, 512, T_mel)", "s (1, 128)"], "outputs": ["F0", "N"] }, { "name": "decoder", "packages": [ "styletts2_decoder_256.mlpackage", "styletts2_decoder_512.mlpackage", "styletts2_decoder_1024.mlpackage", "styletts2_decoder_2048.mlpackage", "styletts2_decoder_4096.mlpackage" ], "bucket_axis": "mel_frames", "bucket_sizes": [256, 512, 1024, 2048, 4096], "compute_unit_recommended": "CPU_AND_GPU", "precision": "fp32", "_precision_comment": "fp32 is required end-to-end. SineGen accumulates phase via cumsum × 2π × hop=300, reaching ~4000 mid-frame; fp16 precision at that magnitude scrambles the sine output. See PHASE6_FP16_DECODER.md.", "calls_per_utterance": 1, "inputs": ["asr", "F0", "N", "ref (1, 128)"], "outputs": ["waveform (1, T_audio) @ 24kHz"] } ], "bucket_routing": { "_comment": "Round each variable-length input UP to the next bucket. Pad with zeros.", "text_predictor.tokens": [32, 64, 128, 256, 512], "diffusion_step.embedding": [512], "decoder.asr": [256, 512, 1024, 2048, 4096] }, "performance": { "rtfx_warm_m_series": 4.32, "log_mel_cosine_vs_pytorch_fp32": 0.9687, "ecapa_tdnn_cosine_to_reference": 0.18, "_pytorch_fp32_ecapa_baseline": 0.29, "on_disk_size_gb": 1.4 } }