{
  "_comment": "Runtime config for the StyleTTS2 (LibriTTS) CoreML bundle. Values mirror yl4579/StyleTTS2 Configs/config_libritts.yml at the conversion commit. The host application (Swift/Python) owns the sampler loop, alignment construction, and bucket routing; this file just documents the contract between the host and the shipped mlpackages.",

  "model_type": "styletts2",
  "checkpoint": "yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth",
  "library": "coreml",
  "format_versions": ["mlpackage", "mlmodelc"],
  "minimum_deployment_target": {
    "ios": "17.0",
    "macos": "14.0"
  },

  "audio": {
    "sample_rate": 24000,
    "n_fft": 2048,
    "win_length": 1200,
    "hop_length": 300,
    "n_mels": 80
  },

  "tokenizer": {
    "type": "espeak-ng-ipa",
    "vocab_file": "constants/text_cleaner_vocab.json",
    "n_tokens": 178,
    "pad_token": "$",
    "pad_id": 0
  },

  "model": {
    "style_dim": 128,
    "hidden_dim": 512,
    "n_layer": 3,
    "max_dur": 50,
    "ref_s_dim": 256,
    "_ref_s_split_comment": "ref_s is concat([acoustic_ref (128,), prosody_ref (128,)]) — same convention as upstream Inference_LibriTTS.ipynb"
  },

  "sampler": {
    "type": "ADPM2",
    "schedule": "karras",
    "num_steps": 5,
    "classifier_free_guidance": true,
    "cfg_scale_default": 1.0,
    "_comment": "Sampler loop runs in the host. Each step calls styletts2_diffusion_step_512.mlpackage."
  },

  "stages": [
    {
      "name": "text_predictor",
      "packages": [
        "styletts2_text_predictor_32.mlpackage",
        "styletts2_text_predictor_64.mlpackage",
        "styletts2_text_predictor_128.mlpackage",
        "styletts2_text_predictor_256.mlpackage",
        "styletts2_text_predictor_512.mlpackage"
      ],
      "bucket_axis": "tokens",
      "bucket_sizes": [32, 64, 128, 256, 512],
      "compute_unit_recommended": "ANE",
      "precision": "fp16",
      "calls_per_utterance": 1,
      "inputs": ["tokens (1, T_tok) int32"],
      "outputs": ["d_en (1, T_dur, hidden)", "s_pred (1, 256)", "duration_logits"]
    },
    {
      "name": "diffusion_step",
      "packages": ["styletts2_diffusion_step_512.mlpackage"],
      "bucket_axis": "bert_dur",
      "bucket_sizes": [512],
      "compute_unit_recommended": "CPU_AND_GPU",
      "precision": "fp16",
      "calls_per_utterance_default": 5,
      "inputs": ["x", "sigma", "embedding (bert_dur)", "features (ref_s)"],
      "outputs": ["x_next"]
    },
    {
      "name": "f0n_energy",
      "packages": ["styletts2_f0n_energy.mlpackage"],
      "bucket_axis": null,
      "bucket_sizes": null,
      "compute_unit_recommended": "ANE",
      "precision": "fp16",
      "calls_per_utterance": 1,
      "inputs": ["en (1, 512, T_mel)", "s (1, 128)"],
      "outputs": ["F0", "N"]
    },
    {
      "name": "decoder",
      "packages": [
        "styletts2_decoder_256.mlpackage",
        "styletts2_decoder_512.mlpackage",
        "styletts2_decoder_1024.mlpackage",
        "styletts2_decoder_2048.mlpackage",
        "styletts2_decoder_4096.mlpackage"
      ],
      "bucket_axis": "mel_frames",
      "bucket_sizes": [256, 512, 1024, 2048, 4096],
      "compute_unit_recommended": "CPU_AND_GPU",
      "precision": "fp32",
      "_precision_comment": "fp32 is required end-to-end. SineGen accumulates phase via cumsum × 2π × hop=300, reaching ~4000 mid-frame; fp16 precision at that magnitude scrambles the sine output. See PHASE6_FP16_DECODER.md.",
      "calls_per_utterance": 1,
      "inputs": ["asr", "F0", "N", "ref (1, 128)"],
      "outputs": ["waveform (1, T_audio) @ 24kHz"]
    }
  ],

  "bucket_routing": {
    "_comment": "Round each variable-length input UP to the next bucket. Pad with zeros.",
    "text_predictor.tokens": [32, 64, 128, 256, 512],
    "diffusion_step.embedding": [512],
    "decoder.asr": [256, 512, 1024, 2048, 4096]
  },

  "performance": {
    "rtfx_warm_m_series": 4.32,
    "log_mel_cosine_vs_pytorch_fp32": 0.9687,
    "ecapa_tdnn_cosine_to_reference": 0.18,
    "_pytorch_fp32_ecapa_baseline": 0.29,
    "on_disk_size_gb": 1.4
  }
}