{
  "_comment": "Runtime config for the StyleTTS2 (LibriTTS) CoreML bundle. Values mirror yl4579/StyleTTS2 Configs/config_libritts.yml at the conversion commit. The host application (Swift/Python) owns the sampler loop, alignment construction, and bucket routing; this file just documents the contract between the host and the shipped mlpackages.",

  "model_type": "styletts2",
  "checkpoint": "yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth",
  "library": "coreml",
  "format_versions": ["mlpackage", "mlmodelc"],
  "minimum_deployment_target": {
    "ios": "17.0",
    "macos": "14.0"
  },

  "audio": {
    "sample_rate": 24000,
    "n_fft": 2048,
    "win_length": 1200,
    "hop_length": 300,
    "n_mels": 80
  },

  "tokenizer": {
    "type": "espeak-ng-ipa",
    "vocab_file": "constants/text_cleaner_vocab.json",
    "n_tokens": 178,
    "pad_token": "$",
    "pad_id": 0
  },

  "model": {
    "style_dim": 128,
    "hidden_dim": 512,
    "n_layer": 3,
    "max_dur": 50,
    "ref_s_dim": 256,
    "_ref_s_split_comment": "ref_s is concat([acoustic_ref (128,), prosody_ref (128,)]) — same convention as upstream Inference_LibriTTS.ipynb"
  },

  "sampler": {
    "type": "ADPM2",
    "schedule": "karras",
    "num_steps": 5,
    "classifier_free_guidance": true,
    "cfg_scale_default": 1.0,
    "_comment": "Sampler loop runs in the host. Each step calls styletts2_diffusion_step_512.mlpackage."
  },

  "stages": [
    {
      "name": "text_predictor",
      "packages": [
        "styletts2_text_predictor_32.mlpackage",
        "styletts2_text_predictor_64.mlpackage",
        "styletts2_text_predictor_128.mlpackage",
        "styletts2_text_predictor_256.mlpackage",
        "styletts2_text_predictor_512.mlpackage"
      ],
      "bucket_axis": "tokens",
      "bucket_sizes": [32, 64, 128, 256, 512],
      "compute_unit_recommended": "ANE",
      "precision": "fp16",
      "calls_per_utterance": 1,
      "inputs": ["tokens (1, T_tok) int32"],
      "outputs": ["d_en (1, T_dur, hidden)", "s_pred (1, 256)", "duration_logits"]
    },
    {
      "name": "diffusion_step",
      "packages": ["styletts2_diffusion_step_512.mlpackage"],
      "bucket_axis": "bert_dur",
      "bucket_sizes": [512],
      "compute_unit_recommended": "CPU_AND_GPU",
      "precision": "fp16",
      "calls_per_utterance_default": 5,
      "inputs": ["x", "sigma", "embedding (bert_dur)", "features (ref_s)"],
      "outputs": ["x_next"]
    },
    {
      "name": "f0n_energy",
      "packages": ["styletts2_f0n_energy.mlpackage"],
      "bucket_axis": null,
      "bucket_sizes": null,
      "compute_unit_recommended": "ANE",
      "precision": "fp16",
      "calls_per_utterance": 1,
      "inputs": ["en (1, 512, T_mel)", "s (1, 128)"],
      "outputs": ["F0", "N"]
    },
    {
      "name": "decoder",
      "packages": [
        "styletts2_decoder_256.mlpackage",
        "styletts2_decoder_512.mlpackage",
        "styletts2_decoder_1024.mlpackage",
        "styletts2_decoder_2048.mlpackage",
        "styletts2_decoder_4096.mlpackage"
      ],
      "bucket_axis": "mel_frames",
      "bucket_sizes": [256, 512, 1024, 2048, 4096],
      "compute_unit_recommended": "CPU_AND_GPU",
      "precision": "fp32",
      "_precision_comment": "fp32 is required end-to-end. SineGen accumulates phase via cumsum × 2π × hop=300, reaching ~4000 mid-frame; fp16 precision at that magnitude scrambles the sine output. See PHASE6_FP16_DECODER.md.",
      "calls_per_utterance": 1,
      "inputs": ["asr", "F0", "N", "ref (1, 128)"],
      "outputs": ["waveform (1, T_audio) @ 24kHz"]
    }
  ],

  "bucket_routing": {
    "_comment": "Round each variable-length input UP to the next bucket. Pad with zeros.",
    "text_predictor.tokens": [32, 64, 128, 256, 512],
    "diffusion_step.embedding": [512],
    "decoder.asr": [256, 512, 1024, 2048, 4096]
  },

  "performance": {
    "rtfx_warm_m_series": 4.32,
    "log_mel_cosine_vs_pytorch_fp32": 0.9687,
    "ecapa_tdnn_cosine_to_reference": 0.18,
    "_pytorch_fp32_ecapa_baseline": 0.29,
    "on_disk_size_gb": 1.4
  }
}