Abdelrahman2922 committed 26 days ago

Commit

a4d9876

verified ·

1 Parent(s): de598c1

Add files using upload-large-folder tool

Browse files

Files changed (50) hide show

.github/ISSUE_TEMPLATE/bug_report.yml +50 -0
.github/ISSUE_TEMPLATE/config.yml +1 -0
.github/ISSUE_TEMPLATE/feature_request.yml +62 -0
.github/ISSUE_TEMPLATE/help_wanted.yml +52 -0
.github/ISSUE_TEMPLATE/question.yml +26 -0
.gitignore +30 -0
LICENSE +201 -0
README.md +322 -0
api.py +277 -0
docs/OmniVoice.ipynb +144 -0
docs/community-projects.md +46 -0
docs/data_preparation.md +182 -0
docs/data_preparation_advanced.md +67 -0
docs/evaluation.md +48 -0
docs/generation-parameters.md +68 -0
docs/lang_id_name_map.tsv +647 -0
docs/languages.md +659 -0
docs/tips.md +10 -0
docs/training.md +102 -0
docs/voice-design.md +129 -0
examples/README.md +120 -0
examples/config/data_config_emilia.json +36 -0
examples/config/data_config_finetune.json +12 -0
examples/config/ds_config_zero2.json +19 -0
examples/config/train_config_emilia.json +39 -0
examples/config/train_config_finetune.json +39 -0
examples/config/train_config_finetune_sdpa.json +43 -0
examples/config/train_config_multilingual.json +39 -0
examples/run_emilia.sh +115 -0
examples/run_eval.sh +283 -0
examples/run_finetune.sh +85 -0
exp_v1/omnivoice_finetune/checkpoint-4500/chat_template.jinja +89 -0
exp_v1/omnivoice_finetune/checkpoint-4500/config.json +101 -0
exp_v1/omnivoice_finetune/checkpoint-4500/tokenizer_config.json +24 -0
exp_v1/omnivoice_finetune/checkpoint-4500/train_config.json +57 -0
exp_v1/omnivoice_finetune/checkpoint-500/chat_template.jinja +89 -0
exp_v1/omnivoice_finetune/checkpoint-500/config.json +101 -0
exp_v1/omnivoice_finetune/checkpoint-500/train_config.json +57 -0
exp_v1/omnivoice_finetune/checkpoint-5000/chat_template.jinja +89 -0
exp_v1/omnivoice_finetune/checkpoint-5000/config.json +101 -0
exp_v1/omnivoice_finetune/checkpoint-5000/tokenizer_config.json +24 -0
exp_v1/omnivoice_finetune/checkpoint-5000/train_config.json +57 -0
exp_v1/omnivoice_finetune/initial_config.json +57 -0
infer.py +58 -0
omnivoice/__init__.py +28 -0
prepare_sync_data.py +90 -0
pyproject.toml +98 -0
ref_audio/women_ref_1.mp3 +0 -0
upload_to_hf.py +59 -0
uv.lock +0 -0

.github/ISSUE_TEMPLATE/bug_report.yml ADDED Viewed

	@@ -0,0 +1,50 @@

+name: "Bug Report"
+description: |
+  Please provide as much detail as possible to help address the issue more efficiently, including input, output, logs and screenshots.
+labels:
+  - bug
+body:
+  - type: checkboxes
+    attributes:
+      label: Checks
+      description: "To ensure timely help, please confirm the following:"
+      options:
+        - label: This template is only for bug reports; usage problems go under 'Help Wanted'.
+          required: true
+        - label: I have thoroughly reviewed the project documentation but couldn't find information to solve my problem.
+          required: true
+        - label: I have searched for existing issues, including closed ones, and couldn't find a solution.
+          required: true
+        - label: I am using English to submit this issue to facilitate community communication.
+          required: true
+  - type: textarea
+    attributes:
+      label: Environment Details
+      description: "Provide details including OS, GPU info, Python version, any relevant software or dependencies, and training/finetuning configuration (if applicable)."
+      placeholder: e.g., Ubuntu 20.04.6 LTS, 4 * H20, Python 3.13, torch==2.8.0+cu128, cuda 12.8
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: Steps to Reproduce
+      description: |
+        Include detailed steps, screenshots, and logs. Use the correct markdown syntax for code blocks.
+      placeholder: |
+        1. Clone the repo and install omnivoice with `uv sync`.
+        2. Run the command: `omnivoice-infer --text "This is a test." --ref_audio ref.wav --ref_text "Transcription." --output output.wav`
+        3. Got the following error message... (attach full logs).
+        4. Upload relevant audio files (e.g., ref.wav, output.wav) as .wav or packed in .zip.
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: ✔️ Expected Behavior
+      placeholder: Describe in detail what you expected to happen.
+    validations:
+      required: false
+  - type: textarea
+    attributes:
+      label: ❌ Actual Behavior
+      placeholder: Describe in detail what actually happened.
+    validations:
+      required: false

.github/ISSUE_TEMPLATE/config.yml ADDED Viewed

	@@ -0,0 +1 @@


1	+ blank_issues_enabled: false

.github/ISSUE_TEMPLATE/feature_request.yml ADDED Viewed

	@@ -0,0 +1,62 @@

+name: "Feature Request"
+description: |
+  Constructive suggestions and new ideas regarding the current repo.
+labels:
+  - enhancement
+body:
+  - type: checkboxes
+    attributes:
+      label: Checks
+      description: "To help us understand your request quickly, please confirm the following:"
+      options:
+        - label: This template is only for feature requests.
+          required: true
+        - label: I have thoroughly reviewed the project documentation but couldn't find any relevant information that meets my needs.
+          required: true
+        - label: I have searched for existing issues, including closed ones, and found no discussion yet.
+          required: true
+        - label: I am using English to submit this issue to facilitate community communication.
+          required: true
+  - type: textarea
+    attributes:
+      label: 1. Is this request related to a challenge you're experiencing? Tell us your story.
+      description: |
+        Describe the specific problem or scenario you're facing in detail. For example:
+        *"I was trying to use [feature] for [specific task], but encountered [issue]. This was frustrating because...."*
+      placeholder: Please describe the situation in as much detail as possible.
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: 2. What is your suggested solution?
+      description: |
+        Provide a clear description of the feature or enhancement you'd like to propose.
+        How would this feature solve your issue or improve the project?
+      placeholder: Describe your idea or proposed solution here.
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: 3. Additional context or comments
+      description: |
+        Any other relevant information, links, documents, or screenshots that provide clarity.
+        Use this section for anything not covered above.
+      placeholder: Add any extra details here.
+    validations:
+      required: false
+  - type: checkboxes
+    attributes:
+      label: 4. Can you help us with this feature?
+      description: |
+        Let us know if you're interested in contributing. This is not a commitment but a way to express interest in collaboration.
+      options:
+        - label: I am interested in contributing to this feature.
+          required: false
+  - type: markdown
+    attributes:
+      value: |
+        **Note:** Please submit only one request per issue to keep discussions focused and manageable.

.github/ISSUE_TEMPLATE/help_wanted.yml ADDED Viewed

	@@ -0,0 +1,52 @@

+name: "Help Wanted"
+description: |
+  Please provide as much detail as possible to help address the issue more efficiently, including input, output, logs and screenshots.
+labels:
+  - help wanted
+body:
+  - type: checkboxes
+    attributes:
+      label: Checks
+      description: "To ensure timely help, please confirm the following:"
+      options:
+        - label: This template is only for usage issues encountered.
+          required: true
+        - label: I have thoroughly reviewed the project documentation but couldn't find information to solve my problem.
+          required: true
+        - label: I have searched for existing issues, including closed ones, and couldn't find a solution.
+          required: true
+        - label: I am using English to submit this issue to facilitate community communication.
+          required: true
+  - type: textarea
+    attributes:
+      label: Environment Details
+      description: "Provide details such as OS, Python version, and any relevant software or dependencies."
+      placeholder: |
+        e.g., macOS 13.5, Python 3.13, torch==2.8.0
+        If training or finetuning related, provide detailed configuration including GPU info and training setup.
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: Steps to Reproduce
+      description: |
+        Include detailed steps, screenshots, and logs. Provide the prompt wav and text you used. Use the correct markdown syntax for code blocks.
+      placeholder: |
+        1. Clone the repo and install omnivoice with `uv sync`.
+        2. Run the command: `omnivoice-infer --text "This is a test." --ref_audio ref.wav --ref_text "Transcription." --output output.wav`
+        3. Stuck there with the following message... (attach logs and also error msg e.g. after ctrl-c).
+        4. Upload relevant audio files (e.g., ref.wav, output.wav) as .wav or packed in .zip.
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: ✔️ Expected Behavior
+      placeholder: Describe what you expected to happen in detail, e.g. output a generated audio.
+    validations:
+      required: false
+  - type: textarea
+    attributes:
+      label: ❌ Actual Behavior
+      placeholder: Describe what actually happened in detail, failure messages, etc.
+    validations:
+      required: false

.github/ISSUE_TEMPLATE/question.yml ADDED Viewed

	@@ -0,0 +1,26 @@

+name: "Question"
+description: |
+  Research questions or general inquiries about the project; usage issues go under "Help Wanted".
+labels:
+  - question
+body:
+  - type: checkboxes
+    attributes:
+      label: Checks
+      description: "To help us understand your question quickly, please confirm the following:"
+      options:
+        - label: This template is only for research questions, not usage problems, feature requests or bug reports.
+          required: true
+        - label: I have thoroughly reviewed the project documentation and read the related paper(s).
+          required: true
+        - label: I have searched for existing issues, including closed ones, and found no similar questions.
+          required: true
+        - label: I am using English to submit this issue to facilitate community communication.
+          required: true
+  - type: textarea
+    attributes:
+      label: Question details
+      description: |
+        Question details, clearly stated using proper markdown syntax.
+    validations:
+      required: true

.gitignore ADDED Viewed

	@@ -0,0 +1,30 @@

+__pycache__/
+*.pyc
+*.pyo
+*.egg-info/
+*.egg
+dist/
+build/
+.venv/
+.env
+.DS_Store
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+*.so
+/.cache*
+/exp*/
+/.tmp/
+/results/
+/data/
+/download
+/local/
+/run*
+example.py
+results/
+examples/data*
+examples/download*
+examples/exp*
+.claude/
+*.wav
+*.jsonl

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2026 Xiaomi Corp.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md ADDED Viewed

	@@ -0,0 +1,322 @@

+# OmniVoice 🌍
+<p align="center">
+  <img width="200" height="200" alt="OmniVoice" src="https://zhu-han.github.io/omnivoice/pics/omnivoice.jpg" />
+</p>
+<p align="center">
+  <a href="https://huggingface.co/k2-fsa/OmniVoice"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-FFD21E" alt="Hugging Face Model"></a>
+  &nbsp;
+  <a href="https://huggingface.co/spaces/k2-fsa/OmniVoice"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Space-blue" alt="Hugging Face Space"></a>
+  &nbsp;
+  <a href="https://arxiv.org/abs/2604.00688"><img src="https://img.shields.io/badge/arXiv-Paper-B31B1B.svg"></a>
+  &nbsp;
+  <a href="https://zhu-han.github.io/omnivoice"><img src="https://img.shields.io/badge/GitHub.io-Demo_Page-blue?logo=GitHub&style=flat-square"></a>
+  &nbsp;
+  <a href="https://colab.research.google.com/github/k2-fsa/OmniVoice/blob/master/docs/OmniVoice.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
+</p>
+OmniVoice is a state-of-the-art massively multilingual zero-shot text-to-speech (TTS) model supporting over 600 languages. Built on a novel diffusion language model-style architecture, it generates high-quality speech with superior inference speed, supporting voice cloning and voice design.
+**Contents**: [Key Features](#key-features) | [Installation](#installation) | [Quick Start](#quick-start) | [Python API](#python-api) | [Command-Line Tools](#command-line-tools) | [Training & Evaluation](#training--evaluation) | [Discussion](#discussion--communication) | [Citation](#citation)
+## Key Features
+- **600+ Languages Supported**: The broadest language coverage among zero-shot TTS models ([full list](docs/languages.md)).
+- **Voice Cloning**: State-of-the-art voice cloning quality.
+- **Voice Design**: Control voices via assigned speaker attributes (gender, age, pitch, dialect/accent, whisper, etc.).
+- **Fine-grained Control**: Non-verbal symbols (e.g., `[laughter]`) and pronunciation correction via pinyin or phonemes.
+- **Fast Inference**: RTF as low as 0.025 (40x faster than real-time).
+- **Diffusion Language Model-style Architecture**: A clean, streamlined, and scalable design that delivers both quality and speed.
+---
+## Installation
+Choose **one** of the following methods: **pip** or **uv**.
+### pip
+> We recommend using a fresh virtual environment (e.g., `conda`, `venv`, etc.) to avoid conflicts.
+**Step 1**: Install PyTorch
+<details>
+<summary>NVIDIA GPU</summary>
+```bash
+# Install pytorch with your CUDA version, e.g.
+pip install torch==2.8.0+cu128 torchaudio==2.8.0+cu128 --extra-index-url https://download.pytorch.org/whl/cu128
+```
+> See the [PyTorch official site](https://pytorch.org/get-started/locally/) for installing other versions.
+</details>
+<details>
+<summary>Apple Silicon</summary>
+```bash
+pip install torch==2.8.0 torchaudio==2.8.0
+```
+</details>
+**Step 2**: Install OmniVoice (choose one)
+```bash
+# From PyPI (stable release)
+pip install omnivoice
+# From the latest source on GitHub (no need to clone)
+pip install git+https://github.com/k2-fsa/OmniVoice.git
+# For development (clone first, editable install)
+git clone https://github.com/k2-fsa/OmniVoice.git
+cd OmniVoice
+pip install -e .
+```
+### uv
+Clone the repository and sync dependencies:
+```bash
+git clone https://github.com/k2-fsa/OmniVoice.git
+cd OmniVoice
+uv sync
+```
+> **Tip**: You can use a mirror with `uv sync --default-index "https://mirrors.aliyun.com/pypi/simple"`
+---
+## Quick Start
+Try OmniVoice without coding:
+- Launch the local web UI: `omnivoice-demo --ip 0.0.0.0 --port 8001`
+- Or try it directly on [HuggingFace Space](https://huggingface.co/spaces/k2-fsa/OmniVoice)
+- Or run it in Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/k2-fsa/OmniVoice/blob/master/docs/OmniVoice.ipynb)
+> If you have trouble connecting to HuggingFace when downloading the pre-trained models, set `export HF_ENDPOINT="https://hf-mirror.com"` before running.
+For full usage, see the [Python API](#python-api) and [Command-Line Tools](#command-line-tools) sections below.
+---
+## Python API
+OmniVoice supports three generation modes. All features in this section are also available via [command-line tools](#command-line-tools).
+### Voice Cloning
+Clone a voice from a short reference audio. Provide `ref_audio` and `ref_text`:
+```python
+from omnivoice import OmniVoice
+import soundfile as sf
+import torch
+model = OmniVoice.from_pretrained(
+    "k2-fsa/OmniVoice",
+    device_map="cuda:0",
+    dtype=torch.float16
+)
+# Apple Silicon users: use device_map="mps" instead
+audio = model.generate(
+    text="Hello, this is a test of zero-shot voice cloning.",
+    ref_audio="ref.wav",
+    ref_text="Transcription of the reference audio.",
+) # audio is a list of `np.ndarray` with shape (T,) at 24 kHz.
+# If you don't want to input `ref_text` manually, you can directly omit the `ref_text`.
+# The model will use Whisper ASR to auto-transcribe it.
+sf.write("out.wav", audio[0], 24000)
+```
+> **Tips**
+>
+> - Use a 3–10 second reference audio clip. Longer audio slows down inference and may degrade cloning quality.
+> - For standard pronunciation, use a reference audio in the **same language** as the target speech. In cross-lingual voice cloning (i.e., the reference audio and target speech are in different languages), the generated speech will carry an accent from the reference audio's language.
+> - For better results with Arabic numerals, normalize them to words first (e.g., "123" → "one hundred twenty-three") with text normalization tools (e.g., [WeTextProcessing](https://github.com/wenet-e2e/WeTextProcessing)).
+>
+> For more tips, see [docs/tips.md](docs/tips.md).
+### Voice Design
+Describe the desired voice with speaker attributes — no reference audio needed.
+Supported attributes: **gender** (male/female), **age** (child to elderly),
+**pitch** (very low to very high), **style** (whisper), **English accent**
+(American, British, etc.), and **Chinese dialect** (四川话, 陕西话, etc.).
+Attributes are comma-separated and freely combinable across categories.
+```python
+audio = model.generate(
+    text="Hello, this is a test of zero-shot voice design.",
+    instruct="female, low pitch, british accent",
+)
+```
+> **Note**: Voice design was trained on Chinese and English data only. It can generalize to other languages, but results can be unstable for some low-resource languages.
+See [docs/voice-design.md](docs/voice-design.md) for the full attribute
+reference, Chinese equivalents, and usage tips.
+### Auto Voice
+Let the model choose a voice automatically:
+```python
+audio = model.generate(text="This is a sentence without any voice prompt.")
+```
+### Generation Parameters
+All three modes above share the same `model.generate()` API. You can further control the generation behavior via keyword arguments:
+```python
+audio = model.generate(
+    text="...",
+    num_step=32,  # diffusion steps (or 16 for faster inference)
+    speed=1.0,     # speed factor (>1.0 faster, <1.0 slower)
+    duration=10.0, # fixed output duration in seconds (overrides speed)
+    # ... more options
+)
+```
+See more detailed control in [docs/generation-parameters.md](docs/generation-parameters.md).
+### Non-Verbal & Pronunciation Control
+OmniVoice supports inline **non-verbal symbols** and **pronunciation correction** within the input text.
+**Non-verbal symbols**: Insert tags like `[laughter]` directly in the text to add expressive non-verbal sounds.
+```python
+audio = model.generate(text="[laughter] You really got me. I didn't see that coming at all.")
+```
+Supported tags: `[laughter]`, `[sigh]`, `[confirmation-en]`, `[question-en]`, `[question-ah]`, `[question-oh]`, `[question-ei]`, `[question-yi]`, `[surprise-ah]`, `[surprise-oh]`, `[surprise-wa]`, `[surprise-yo]`, `[dissatisfaction-hnn]`.
+**Pronunciation control (Chinese)**: Use pinyin with tone numbers to correct specific character pronunciations.
+```python
+audio = model.generate(text="这批货物打ZHE2出售后他严重SHE2本了，再也经不起ZHE1腾了。")
+```
+**Pronunciation control (English)**: Use phonemes from the [CMU pronunciation dictionary](https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict.0.7a) (uppercase, in brackets) to override default English pronunciations.
+```python
+audio = model.generate(text="He plays the [B EY1 S] guitar while catching a [B AE1 S] fish.")
+```
+---
+## Command-Line Tools
+Three CLI entry points are provided. The CLI tools support all features available in the Python API (voice cloning, voice design, auto voice, generation parameters, etc.) — all controlled via command-line arguments.
+| Command | Description | Source |
+|---|---|---|
+| `omnivoice-demo` | Interactive Gradio web demo | [omnivoice/cli/demo.py](omnivoice/cli/demo.py) |
+| `omnivoice-infer` | Single-item inference | [omnivoice/cli/infer.py](omnivoice/cli/infer.py) |
+| `omnivoice-infer-batch` | Batch inference across multiple GPUs | [omnivoice/cli/infer_batch.py](omnivoice/cli/infer_batch.py) |
+### Demo
+```bash
+omnivoice-demo --ip 0.0.0.0 --port 8001
+```
+Provides a web UI for voice cloning and voice design. See `omnivoice-demo --help` for all options.
+### Single Inference
+```bash
+# Voice Cloning
+# ref_text can be omitted (Whisper will auto-transcribe ref_audio to get it).
+omnivoice-infer \
+    --model k2-fsa/OmniVoice \
+    --text "This is a test for text to speech." \
+    --ref_audio ref.wav \
+    --ref_text "Transcription of the reference audio." \
+    --output hello.wav
+# Voice Design
+omnivoice-infer --model k2-fsa/OmniVoice \
+    --text "This is a test for text to speech." \
+    --instruct "male, British accent" \
+    --output hello.wav
+# Auto Voice
+omnivoice-infer \
+    --model k2-fsa/OmniVoice \
+    --text "This is a test for text to speech."\
+    --output hello.wav
+```
+### Batch Inference
+`omnivoice-infer-batch` can distribute batch inference across multiple GPUs, designed for large-scale TTS tasks.
+```bash
+omnivoice-infer-batch \
+    --model k2-fsa/OmniVoice \
+    --test_list test.jsonl \
+    --res_dir results/
+```
+The test list is a JSONL file where each line is a JSON object:
+```json
+{"id": "sample_001", "text": "Hello world", "ref_audio": "/path/to/ref.wav", "ref_text": "Reference transcript", "instruct": "female, british accent", "language_id": "en", "duration": 10.0, "speed": 1.0}
+```
+Only `id` and `text` are mandatory fields. `ref_audio` and `ref_text` are used in voice cloning mode. `instruct` is used in voice design mode. If neither reference audio nor `instruct` is provided, the model will generate speech in a random voice.
+`language_id`, `duration`, and `speed` are optional. `duration` (in seconds) fixes the output length; `speed` controls the speaking rate. If `duration` and `speed` are both provided, `speed` will be ignored.
+---
+## Training & Evaluation
+See [examples/](examples/) for the complete pipeline — from data preparation to training, evaluation, and finetuning.
+---
+## Discussion & Communication
+You can directly discuss on [GitHub Issues](https://github.com/k2-fsa/OmniVoice/issues).
+You can also scan the QR code to join our wechat group or follow our wechat official account.
+| Wechat Group | Wechat Official Account |
+| ------------ | ----------------------- |
+|![wechat](https://k2-fsa.org/zh-CN/assets/pic/wechat_group.jpg) |![wechat](https://k2-fsa.org/zh-CN/assets/pic/wechat_account.jpg) |
+---
+## Community Projects
+OmniVoice is supported by a growing ecosystem of community projects.
+Explore them in [Community Projects](docs/community-projects.md).
+---
+## Citation
+```bibtex
+@article{zhu2026omnivoice,
+      title={OmniVoice: Towards Omnilingual Zero-Shot Text-to-Speech with Diffusion Language Models},
+      author={Zhu, Han and Ye, Lingxuan and Kang, Wei and Yao, Zengwei and Guo, Liyong and Kuang, Fangjun and Han, Zhifeng and Zhuang, Weiji and Lin, Long and Povey, Daniel},
+      journal={arXiv preprint arXiv:2604.00688},
+      year={2026}
+}
+```
+---
+## Disclaimer
+Users are strictly prohibited from using this model for unauthorized voice cloning, voice impersonation, fraud, scams, or any other illegal or unethical activities. All users shall ensure full compliance with applicable local laws, regulations, and ethical standards. The developers assume no liability for any misuse of this model and advocate for responsible AI development and use, encouraging the community to uphold safety and ethical principles in AI research and applications.

api.py ADDED Viewed

	@@ -0,0 +1,277 @@

+# api.py
+import asyncio
+import base64
+import io
+import json
+import time
+from typing import AsyncGenerator
+import numpy as np
+import soundfile as sf
+import torch
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+from omnivoice import OmniVoice
+# =========================================================
+# App
+# =========================================================
+app = FastAPI(title="OmniVoice OpenAI-Compatible TTS")
+# =========================================================
+# Constants
+# =========================================================
+# Output sample rate in Hz used for WAV encoding and duration metrics.
+SAMPLE_RATE = 24000
+# Mono output.
+NUM_CHANNELS = 1
+# 16-bit PCM -> 2 bytes per sample.
+BYTES_PER_SAMPLE = 2
+# Duration of one streamed PCM frame, in milliseconds.
+FRAME_MS = 20
+# Bytes per streamed PCM chunk: 24000 * 0.020 * 2 * 1 = 960 bytes (one 20 ms frame).
+CHUNK_SIZE = int(
+    SAMPLE_RATE * (FRAME_MS / 1000) * BYTES_PER_SAMPLE * NUM_CHANNELS
+)
+# =========================================================
+# Fixed Voice Config
+# =========================================================
+# Every request is synthesized with this single voice; clients cannot override it.
+# NOTE(review): relative path — presumably resolved against the server's working
+# directory; confirm how the service is launched.
+FIXED_REF_AUDIO = "ref_audio/women_ref_1.mp3"
+# Reference text passed to model.generate() as `ref_text` alongside FIXED_REF_AUDIO.
+FIXED_REF_TEXT = (
+    "شوفي يا حلوة هالكريم الجديد للبشرة، يخلي وجهك مثل القمر!"
+)
+# Voice-design attributes passed to model.generate() as `instruct`.
+FIXED_INSTRUCT = "female, young adult, high pitch"
+# =========================================================
+# Load Model
+# =========================================================
+# The model is loaded once at import time, so importing this module requires the
+# checkpoint directory below and the "cuda:0" device to be available.
+# NOTE(review): the checkpoint path is a hard-coded absolute path on one machine;
+# consider making it configurable.
+model = OmniVoice.from_pretrained(
+    "/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune/checkpoint-5000",
+    device_map="cuda:0",
+    dtype=torch.float16,
+)
+# Prevent concurrent GPU inference crashes: generate_audio() holds this lock
+# for the whole duration of a model.generate() call.
+generation_lock = asyncio.Lock()
+# =========================================================
+# Request Schema
+# =========================================================
+class SpeechRequest(BaseModel):
+    # Request body accepted by POST /v1/audio/speech.
+    # Model name supplied by the client; not read by any handler in this file.
+    model: str = "omnivoice"
+    # Text to synthesize.
+    input: str
+    # Speaking-rate factor, forwarded to model.generate() as `speed`.
+    speed: float = 1.1
+    # Encoding of the raw audio stream: "pcm" or "wav".
+    response_format: str = "pcm"
+    # Delivery mode: "sse" selects the server-sent-events stream; any other
+    # value (default "audio") streams raw audio bytes.
+    stream_format: str = "audio"
+# =========================================================
+# Audio Helpers
+# =========================================================
+def float32_to_pcm16(audio: np.ndarray) -> bytes:
+    audio = np.clip(audio, -1, 1)
+    pcm16 = (audio * 32767).astype(np.int16)
+    return pcm16.tobytes()
+# =========================================================
+# Generate Audio
+# =========================================================
+async def generate_audio(req: SpeechRequest) -> np.ndarray:
+    async with generation_lock:
+        def _generate():
+            with torch.inference_mode():
+                print("*" * 50)
+                print("user text : " , req.input)
+                print("*" * 50)
+                audio = model.generate(
+                    text=req.input,
+                    ref_audio=FIXED_REF_AUDIO,
+                    ref_text=FIXED_REF_TEXT,
+                    instruct=FIXED_INSTRUCT,
+                    speed=req.speed,
+                    num_step = 30,
+                    guidance_scale=2.0,
+                    t_shift=0.1,
+                    position_temperature=3,
+                    layer_penalty_factor=5.0,
+                )
+            return audio[0]
+        return await asyncio.to_thread(_generate)
+# =========================================================
+# Raw Audio Stream
+# =========================================================
+async def audio_stream_generator(
+    req: SpeechRequest,
+) -> AsyncGenerator[bytes, None]:
+    audio = await generate_audio(req)
+    if req.response_format == "pcm":
+        pcm_bytes = float32_to_pcm16(audio)
+        for i in range(0, len(pcm_bytes), CHUNK_SIZE):
+            yield pcm_bytes[i:i + CHUNK_SIZE]
+            await asyncio.sleep(0)
+    elif req.response_format == "wav":
+        buffer = io.BytesIO()
+        sf.write(
+            buffer,
+            audio,
+            SAMPLE_RATE,
+            format="WAV",
+        )
+        buffer.seek(0)
+        while True:
+            chunk = buffer.read(4096)
+            if not chunk:
+                break
+            yield chunk
+            await asyncio.sleep(0)
+    else:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unsupported response_format: {req.response_format}"
+        )
+# =========================================================
+# SSE Stream
+# =========================================================
+async def sse_stream_generator(
+    req: SpeechRequest,
+) -> AsyncGenerator[str, None]:
+    start_time = time.time()
+    audio = await generate_audio(req)
+    generation_time = time.time() - start_time
+    pcm_bytes = float32_to_pcm16(audio)
+    for i in range(0, len(pcm_bytes), CHUNK_SIZE):
+        chunk = pcm_bytes[i:i + CHUNK_SIZE]
+        b64_chunk = base64.b64encode(chunk).decode("utf-8")
+        event = {
+            "type": "speech.audio.delta",
+            "delta": b64_chunk,
+        }
+        yield f"data: {json.dumps(event)}\n\n"
+        await asyncio.sleep(0)
+    audio_duration = len(audio) / SAMPLE_RATE
+    usage = {
+        "input_tokens": len(req.input.split()),
+        "output_tokens": int(audio_duration * 50),
+    }
+    done_event = {
+        "type": "speech.audio.done",
+        "usage": usage,
+        "metrics": {
+            "generation_time_sec": generation_time,
+            "audio_duration_sec": audio_duration,
+            "rtf": round(generation_time / audio_duration, 4),
+        }
+    }
+    yield f"data: {json.dumps(done_event)}\n\n"
+    yield "data: [DONE]\n\n"
+# =========================================================
+# OpenAI-Compatible Endpoint
+# =========================================================
+@app.post("/v1/audio/speech")
+async def create_speech(req: SpeechRequest):
+    if req.stream_format == "sse":
+        return StreamingResponse(
+            sse_stream_generator(req),
+            media_type="text/event-stream",
+            headers={
+                "Cache-Control": "no-cache",
+                "Connection": "keep-alive",
+            },
+        )
+    media_type = (
+        "audio/pcm"
+        if req.response_format == "pcm"
+        else "audio/wav"
+    )
+    return StreamingResponse(
+        audio_stream_generator(req),
+        media_type=media_type,
+    )
+# =========================================================
+# Health
+# =========================================================
+@app.get("/health")
+async def health():
+    return {
+        "status": "ok",
+        "sample_rate": SAMPLE_RATE,
+        "voice": {
+            "ref_audio": FIXED_REF_AUDIO,
+            "instruct": FIXED_INSTRUCT,
+        }
+    }

docs/OmniVoice.ipynb ADDED Viewed

	@@ -0,0 +1,144 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "# OmniVoice Quick Start\n\n[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/k2-fsa/OmniVoice/blob/master/docs/OmniVoice.ipynb)\n\nThis notebook demonstrates the basic usage of [OmniVoice](https://github.com/k2-fsa/OmniVoice), a massively multilingual zero-shot TTS model supporting 600+ languages.\n\n**Contents:**\n1. Installation\n2. Option A — Gradio Demo (interactive web UI, no code needed)\n3. Option B — Python API\n   - 3.1 Load Model\n   - 3.2 Voice Cloning\n   - 3.3 Voice Design\n   - 3.4 Auto Voice"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Installation\n",
+    "\n",
+    "Colab already provides a compatible PyTorch + CUDA environment, so we only need to install OmniVoice."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install omnivoice"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "## 2. Option A — Gradio Demo\n\nLaunch an interactive web UI with a public Gradio link. The `--share` flag creates a temporary public URL so you can access the demo from any browser.\n\n> **If you prefer to use the Python API directly, skip to Option B below.**"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!omnivoice-demo --share"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "## 3. Option B — Python API\n\n### 3.1 Load Model"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "from omnivoice import OmniVoice\nimport soundfile as sf\nimport torch\nfrom IPython.display import Audio, display\n\nmodel = OmniVoice.from_pretrained(\n    \"k2-fsa/OmniVoice\",\n    device_map=\"cuda:0\",\n    dtype=torch.float16,\n    load_asr=True,\n)"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "### 3.2 Voice Cloning\n\nClone a voice from a short (3-10s) reference audio clip. Upload your own `ref.wav` or use any audio file.\n\n`ref_text` is optional — if omitted, the model uses Whisper ASR to auto-transcribe it."
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from google.colab import files\n",
+    "\n",
+    "print(\"Upload a reference audio file (wav/mp3/flac):\")\n",
+    "uploaded = files.upload()\n",
+    "ref_audio_path = list(uploaded.keys())[0]\n",
+    "print(f\"Uploaded: {ref_audio_path}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "audio = model.generate(\n",
+    "    text=\"Hello, this is a test of zero-shot voice cloning.\",\n",
+    "    ref_audio=ref_audio_path,\n",
+    "    # ref_text=\"Transcription of the reference audio.\",  # optional\n",
+    ")\n",
+    "\n",
+    "sf.write(\"clone_out.wav\", audio[0], 24000)\n",
+    "display(Audio(audio[0], rate=24000))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "### 3.3 Voice Design\n\nDescribe the desired voice with speaker attributes — no reference audio needed.\n\nSupported attributes: gender, age, pitch, style (whisper), English accent, Chinese dialect. See [docs/voice-design.md](https://github.com/k2-fsa/OmniVoice/blob/master/docs/voice-design.md) for the full list."
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "audio = model.generate(\n",
+    "    text=\"Hello, this is a test of zero-shot voice design.\",\n",
+    "    instruct=\"female, low pitch, british accent\",\n",
+    ")\n",
+    "\n",
+    "sf.write(\"design_out.wav\", audio[0], 24000)\n",
+    "display(Audio(audio[0], rate=24000))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "### 3.4 Auto Voice\n\nLet the model choose a voice automatically — no reference audio or instruct needed."
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "audio = model.generate(\n",
+    "    text=\"This is a sentence generated with automatic voice selection.\",\n",
+    ")\n",
+    "\n",
+    "sf.write(\"auto_out.wav\", audio[0], 24000)\n",
+    "display(Audio(audio[0], rate=24000))"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}

docs/community-projects.md ADDED Viewed

	@@ -0,0 +1,46 @@

+# Community Projects
+The following projects are built and maintained by the community. We appreciate all contributions! Note that these projects are not officially supported by the OmniVoice team.
+If you have a project you'd like to add, please open a PR.
+---
+- **[ComfyUI-OmniVoice-TTS](https://github.com/Saganaki22/ComfyUI-OmniVoice-TTS)** —
+  ComfyUI custom node for OmniVoice text-to-speech generation.
+- **[vLLM-Omni](https://github.com/vllm-project/vllm-omni)** —
+  A framework for efficient inference with omni-modality models. Supports OmniVoice serving.
+- **[pyVideoTrans](https://github.com/jianchang512/pyvideotrans)** —
+  Video translation tool with dubbing & subtitles. Supports OmniVoice as a TTS engine.
+- **[MLX-Audio](https://github.com/Blaizzy/mlx-audio)** —
+  TTS, STT, and STS library built on Apple's MLX framework. Supports
+  OmniVoice among other models for efficient speech processing on Apple Silicon.
+- **[RealtimeTTS](https://github.com/KoljaB/RealtimeTTS)** —
+  Converts text to speech in realtime. Supports OmniVoice as a TTS engine.
+- **[TTS-WebUI](https://github.com/rsxdalv/TTS-WebUI)** —
+  Gradio web UI for multiple TTS models. Supports OmniVoice as one of its backends.
+- **[OmniVoice-Studio](https://github.com/debpalash/OmniVoice-Studio)** —
+  Desktop application for OmniVoice voice generation.
+- **[omnivoice-server](https://github.com/maemreyo/omnivoice-server)** —
+  OpenAI-compatible HTTP server for serving OmniVoice via `/v1/audio/speech`.
+  Supports voice profiles for persistent cloning, sentence-level streaming,
+  and optional Bearer auth.
+- **[omnivoice-rs](https://github.com/FerrisMind/omnivoice-rs)** —
+  GPU-first Rust workspace for OmniVoice inference, parity validation, CLI
+  execution, and an OpenAI-compatible HTTP server built with Candle.
+- **[omnivoice-trtllm](https://github.com/tlitech/omnivoice-trtllm)** —
+  Deploy OmniVoice TTS model using TensorRT-LLM and Triton Inference Server
+  on Modal, faster than PyTorch.
+- **[Auris](https://github.com/nikhilprasanth/Auris)** —
+  Offline audiobook reader for EPUB, PDF, and TXT with local OmniVoice TTS, character-aware voices, and per-book narrator control.

docs/data_preparation.md ADDED Viewed

	@@ -0,0 +1,182 @@

+# Data Preparation
+OmniVoice trains on a custom WebDataset format where audio data is packed into **tar shards** with paired **JSONL metadata** files. Each tar shard contains hundreds to thousands of samples (as `.npy` audio token arrays), drastically reducing disk I/O during training. Keeping the metadata in separate JSONL files makes it easier to modify. This document explains the data format in detail and walks through the preparation pipeline.
+## 1. Input Format
+Prepare a JSONL file where each line is a JSON object:
+```jsonl
+{"id": "sample_001", "audio_path": "/data/audio/001.wav", "text": "Hello world", "language_id": "en"}
+{"id": "sample_002", "audio_path": "/data/audio/002.wav", "text": "你好世界", "language_id": "zh"}
+```
+Fields:
+- `id` — unique sample identifier (used to match samples across shards and label files)
+- `audio_path` — absolute path to the audio file (wav/flac/mp3, will be resampled to 24 kHz)
+- `text` — transcript text
+- `language_id` — (optional) language code, used for multilingual training, can be omitted
+## 2. Processing
+The tokenization script `extract_audio_tokens.py` converts audio into 8-layer discrete tokens and packs them into WebDataset shards.
+```bash
+export CUDA_VISIBLE_DEVICES="0,1,2,4"  # GPUs used for token extraction
+python -m omnivoice.scripts.extract_audio_tokens \
+    --input_jsonl data.jsonl \
+    --tar_output_pattern output/audios/shard-%06d.tar \
+    --jsonl_output_pattern output/txts/shard-%06d.jsonl \
+    --tokenizer_path eustlb/higgs-audio-v2-tokenizer \
+    --nj_per_gpu 3 \
+    --shuffle True
+```
+What it does:
+1. Reads your JSONL manifest
+2. Encodes each audio file into discrete tokens using audio tokenizer
+3. Packs tokens into WebDataset tar shards with paired jsonl metadata files
+4. Generates a `data.lst` manifest file
+<details>
+<summary><strong>Alternative:</strong> WebDataset Input (if you already have raw-audio tar shards)</summary>
+Pass the `data.lst` manifest instead of `--input_jsonl`:
+```bash
+export CUDA_VISIBLE_DEVICES="0,1,2,4"  # GPUs used for token extraction
+python -m omnivoice.scripts.extract_audio_tokens \
+    --input_manifest existing_data/data.lst \
+    --tar_output_pattern output/audios/shard-%06d.tar \
+    --jsonl_output_pattern output/txts/shard-%06d.jsonl \
+    --tokenizer_path eustlb/higgs-audio-v2-tokenizer \
+    --nj_per_gpu 3 \
+    --shuffle True
+```
+The existing_data/data.lst is generated with:
+```bash
+python -m omnivoice.scripts.jsonl_to_webdataset \
+    --input data.jsonl \
+    --output data/shards \
+    --sr 24000 \
+    --shard-size 1000
+```
+This resamples audio to the target sample rate and packs FLAC files into tar shards with paired jsonl metadata files.
+</details>
+### Explanation of the script's options:
+| Option | Default | Description |
+|---|---|---|
+| `--input_manifest` | None | Path to input dataset manifest (`data.lst`), mutually exclusive with `--input_jsonl` |
+| `--input_jsonl` | None | Path to raw JSONL file, mutually exclusive with `--input_manifest` |
+| `--tar_output_pattern` | (required) | Tar shard output pattern, e.g. `output/audios/shard-%06d.tar` |
+| `--jsonl_output_pattern` | (required) | JSONL shard output pattern, e.g. `output/txts/shard-%06d.jsonl` |
+| `--tokenizer_path` | `eustlb/higgs-audio-v2-tokenizer` | HuggingFace tokenizer path or local path |
+| `--nj_per_gpu` | 3 | Worker processes per GPU |
+| `--loader_workers` | 24 | DataLoader workers for streaming `IterableDataset` |
+| `--shuffle` | True | Shuffle samples before sharding |
+| `--shuffle-seed` | 42 | Random seed for shuffling |
+| `--samples_per_shard` | 1000 | Max samples per tar shard |
+| `--min_num_shards` | 32 | Minimum number of output shards (ensures shard count >= num\_gpu × num\_workers) |
+| `--min_length` | 0.0 | Skip audio shorter than this (seconds) |
+| `--max_length` | inf | Skip audio longer than this (seconds) |
+| `--skip_errors` | False | Continue on processing errors instead of aborting |
+| `--num_machines` | 1 | Total number of machines for distributed runs |
+| `--machine_index` | 0 | Zero-based machine index for distributed preprocessing |
+### Output Structure
+Output structure with the following output patterns
+```bash
+--tar_output_pattern output/audios/shard-%06d.tar \
+--jsonl_output_pattern output/txts/shard-%06d.jsonl
+```
+will be:
+```
+output/
+├── audios/                    # WebDataset tar shards (audio tokens)
+│   ├── shard-000000.tar       # Each tar packs ~1000 samples
+│   ├── shard-000001.tar
+│   └── ...
+├── txts/                      # Per-shard companion JSONL labels
+│   ├── shard-000000.jsonl     # One JSON line per sample in the corresponding tar
+│   ├── shard-000001.jsonl
+│   └── ...
+├── data.lst                   # Manifest linking tar ↔ jsonl shards
+└── errors.jsonl               # Samples that failed processing (if any)
+```
+`data.lst` and `errors.jsonl` are written to the **parent directory** of `audios/` and `txts/`.
+### The `data.lst` manifest
+Each line in `data.lst` describes one shard:
+```
+/path/to/shard-000000.tar /path/to/shard-000000.jsonl 1000 3600.500
+/path/to/shard-000001.tar /path/to/shard-000001.jsonl 800 2880.200
+```
+Format: `<tar_path> <jsonl_path> <num_samples> <total_duration_seconds>`
+- Paths are **absolute**
+- `.tar` file contains the audio tokens.
+- `.jsonl` file contains the metadata from the originally provided JSONL file, allowing easier access to and modification of the metadata without unpacking the tar file.
+- This manifest is what the training data config references.
+### Inside a tar shard
+Each `.tar` file packs **many samples** (default 1000 per shard) into a single archive. This is the key advantage of WebDataset: instead of reading thousands of tiny files, the dataloader reads sequentially from a few large tars, drastically reducing disk I/O pressure.
+Each sample in the tar is a pair of files with matching keys:
+```
+shard-000000.tar:
+  sample_001.npy    # Audio tokens: numpy array, shape [8, T], dtype int16
+  sample_002.npy
+  ...
+  sample_1000.npy
+```
+## 3. Data Config for Training
+After creating WebDataset shards, write a data config JSON that references them:
+```json
+{
+    "train": [
+        {
+            "language_id": "en",
+            "manifest_path": ["data/custom/tokens/train/data.lst"],
+            "repeat": 1
+        }
+    ],
+    "dev": [
+        {
+            "language_id": "en",
+            "manifest_path": ["data/custom/tokens/dev/data.lst"],
+            "repeat": 1
+        }
+    ]
+}
+```
+- `manifest_path` — list of `data.lst` files (one per shard directory)
+- `repeat` — how many times to repeat this dataset per epoch (useful for balancing languages)
+- `language_id` — not used; it is there only for better data organization.
+See [examples/config/](../examples/config/) for ready-to-use data config files.
+> See [docs/data_preparation_advanced.md](../docs/data_preparation_advanced.md) for denoising and noise augmentation.

docs/data_preparation_advanced.md ADDED Viewed

	@@ -0,0 +1,67 @@

+# Advanced Data Preparation
+The advanced pipeline adds **denoising** and **prompt noise augmentation** on top of the basic tokenization workflow. Each stage is optional.
+## Prerequisites
+- **Denoising**: Sidon model checkpoints (`feature_extractor_cuda.pt`, `decoder_cuda.pt`) from https://huggingface.co/sarulab-speech/sidon-v0.1/tree/main.
+- **Noise augmentation**: noise + RIR tar shards with `data.lst` manifests
+## Pipeline Overview
+```
+Step 1 (optional): Denoise
+  Raw audio → Sidon denoiser → clean audio
+Step 2: Tokenize (with optional noise augmentation)
+  Clean audio + noise augment on prefix → audio tokenizer → tokens
+```
+## Denoise
+Use the [Sidon](https://github.com/sarulab-speech/Sidon) speech enhancement model to remove background noise from raw audio.
+```bash
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+python -m omnivoice.scripts.denoise_audio \
+    --input_jsonl data.jsonl \
+    --tar_output_pattern data/denoised/audios/shard-%06d.tar \
+    --jsonl_output_pattern data/denoised/txts/shard-%06d.jsonl \
+    --feature_extractor_path /path/to/sidon_feature_extractor_cuda.pt \
+    --decoder_path /path/to/sidon_decoder_cuda.pt \
+    --target_sample_rate 24000 \
+    --batch_duration 200.0
+```
+What it does:
+1. Reads your JSONL manifest
+2. Runs Sidon denoiser on each audio file
+3. Outputs denoised audio as custom WebDataset tar/jsonl shards
+4. Generates a `data.lst` manifest in `data/denoised/`
+> You can also pass `--input_manifest /path/to/data.lst` if you already have a custom webdataset format dataset.
+> The next step is to pass the generated `data.lst` file with `--input_manifest` to `omnivoice.scripts.extract_audio_tokens` for token extraction.
+### Tokenize with noise augmentation
+Adds environmental noise and room reverb to **prompt audio** during tokenization, making the model robust to noisy reference audio at inference time. Note that in our model, we only add noise augmentation for a small proportion of data, making sure the model can also generate good audio with clean reference audio.
+You need two additional datasets in WebDataset format:
+- **Noise recordings**: environmental noise tar shards with a `data.lst` manifest
+- **Room impulse responses (RIR)**: RIR tar shards with a `data.lst` manifest
+```bash
+export CUDA_VISIBLE_DEVICES="0,1,2,4"
+python -m omnivoice.scripts.extract_audio_tokens_add_noise \
+    --input_jsonl data.jsonl \
+    --tar_output_pattern data/tokens/shard-%06d.tar \
+    --jsonl_output_pattern data/txts/shard-%06d.jsonl \
+    --tokenizer_path eustlb/higgs-audio-v2-tokenizer \
+    --noise_manifest data/noise_shards/data.lst \
+    --rir_manifest data/rir_shards/data.lst \
+    --nj_per_gpu 3
+```
+> You can also pass `--input_manifest /path/to/data.lst` if you already have a custom webdataset format dataset.

docs/evaluation.md ADDED Viewed

	@@ -0,0 +1,48 @@

+# Evaluation
+Evaluate OmniVoice models with standard TTS metrics: WER (intelligibility), SIM-o (speaker similarity), and UTMOS (naturalness).
+## Supported Test Sets
+| Test Set | Languages | WER Module | Metrics |
+|---|---|---|---|
+| **LibriSpeech-PC** | English | HuBERT WER | WER + Speaker Sim + MOS |
+| **Seed-TTS (en)** | English | Whisper WER | WER + MOS |
+| **Seed-TTS (zh)** | Chinese | Paraformer WER | WER + MOS |
+| **FLEURS** | 102 languages | Omnilingual-ASR WER | WER (per-language + macro-avg) |
+| **MiniMax Multilingual** | 24 languages | Whisper + Paraformer | WER + MOS |
+## Prerequisites
+```bash
+pip install omnivoice[eval]
+# or
+uv sync --extra eval
+```
+## Quick Start
+```bash
+cd examples
+bash run_eval.sh
+# run_eval.sh will
+# (1) download all required test sets and test models;
+# (2) run inference and evaluation on each test set.
+```
+## Metrics Explained
+### WER (Word Error Rate)
+Measures how intelligible the generated speech is by transcribing it with an ASR model and comparing to the reference text. Lower is better. Note that some languages actually use CER (Character Error Rate).
+- **LibriSpeech-PC**: HuBERT-based ASR
+- **Seed-TTS**: Whisper (en) or Paraformer (zh)
+- **MiniMax**: Whisper for non-Chinese, Paraformer for Chinese
+- **FLEURS**: Omnilingual-ASR multilingual model
+### Speaker Similarity
+Cosine similarity between speaker embeddings (ECAPA-TDNN + WavLM) of the reference and generated audio. Higher is better.
+### UTMOS (Predicted MOS)
+Neural network that predicts Mean Opinion Score from audio. Higher is better.

docs/generation-parameters.md ADDED Viewed

	@@ -0,0 +1,68 @@

+# Generation Parameters
+Parameters can be passed as keyword arguments to `model.generate(...)` or via the `OmniVoiceGenerationConfig` dataclass. See below for the full list and which category each belongs to.
+```python
+# 1) Direct keyword arguments
+audio = model.generate(text="Hello world", num_step=32, guidance_scale=2.0)
+# 2) Via OmniVoiceGenerationConfig dataclass
+from omnivoice import OmniVoiceGenerationConfig
+config = OmniVoiceGenerationConfig(num_step=32, guidance_scale=2.0)
+audio = model.generate(text="Hello world", generation_config=config)
+```
+## Decoding
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `num_step` | int | 32 | Number of iterative unmasking steps. Higher values improve quality but slow down generation. Use 16 for faster inference. |
+| `denoise` | bool | True | Prepend the `<|denoise|>` token to the input, which signals the model to produce cleaner speech. |
+| `guidance_scale` | float | 2.0 | Classifier-free guidance scale.|
+| `t_shift` | float | 0.1 | Time-step shift for the noise schedule. Smaller values emphasise earlier steps in decoding. |
+## Sampling
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `position_temperature` | float | 5.0 | Temperature for mask-position selection. 0 = greedy (deterministic). Higher values increase randomness. |
+| `class_temperature` | float | 0.0 | Temperature for token sampling at each step. 0 = greedy (deterministic). Higher values increase randomness. |
+| `layer_penalty_factor` | float | 5.0 | Penalty applied to deeper codebook layers, encouraging earlier (lower) layers to unmask first. |
+## Duration & Speed
+These accept a single value applied to all items, or a per-item list (useful in batch mode):
+```python
+# Fixed 10-second output
+audio = model.generate(text="Hello, this is a test of duration control", duration=10.0)
+# Faster speech (1.2x faster than estimated)
+audio = model.generate(text="Hello, this is a test of duration control", speed=1.2)
+```
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `duration` | float or list[float \| None] | None | Fixed output duration in seconds. Overrides `speed` when set. |
+| `speed` | float or list[float \| None] | None | Speed factor. Values > 1.0 produce shorter audio (faster); values < 1.0 produce longer audio (slower). Ignored when `duration` is set. Defaults to 1.0 when both are None. |
+Priority: `duration` > `speed`.
+> **Note:** When using `duration`, the default post-processing step may trim trailing silence, causing the actual output to be slightly shorter than the requested duration. If you need the output duration to **exactly** match the specified value, set `postprocess_output=False` to disable silence removal.
+## Pre/Post Processing
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `preprocess_prompt` | bool | True | Whether to apply preprocessing to the voice-clone prompt audio (remove long silences in the reference audio, add punctuation at the end of the reference text). |
+| `postprocess_output` | bool | True | Apply post-processing to generated audio (remove long silences). |
+## Long-Form Generation
+To support stable long-form speech generation with low VRAM consumption, the text is automatically split into smaller segments when the estimated duration of the generated speech exceeds `audio_chunk_duration`, with each segment producing approximately `audio_chunk_duration` seconds of audio. This approach allows the model to accept arbitrarily long text and generate arbitrarily long speech with near-constant VRAM consumption.
+| Parameter | Type | Default | Description |
+|---|---|---|---|
+| `audio_chunk_duration` | float | 15.0 | Target chunk duration (seconds) when splitting long text. |
+| `audio_chunk_threshold` | float | 30.0 | Estimated audio duration (seconds) above which chunking is activated. |

docs/lang_id_name_map.tsv ADDED Viewed

	@@ -0,0 +1,647 @@

+language_id	language_name	iso_639_3_id	train_data_duration
+aae	Arbëreshë Albanian	aae	6.11
+aal	Afade	aal	10.19
+aao	Algerian Saharan Arabic	aao	2.02
+ab	Abkhazian	abk	57.27
+abb	Bankon	abb	11.2
+abn	Abua	abn	10.27
+abr	Abron	abr	9.22
+abs	Ambonese Malay	abs	10.03
+abv	Baharna Arabic	abv	10.41
+acm	Mesopotamian Arabic	acm	3.78
+acw	Hijazi Arabic	acw	22.32
+acx	Omani Arabic	acx	22.03
+adf	Dhofari Arabic	adf	0.31
+adx	Amdo Tibetan	adx	56.94
+ady	Adyghe	ady	32.6
+aeb	Tunisian Arabic	aeb	21.63
+aec	Saidi Arabic	aec	9.28
+af	Afrikaans	afr	4.4
+afb	Gulf Arabic	afb	98.55
+afo	Eloyi	afo	11.21
+ahl	Igo	ahl	9.22
+ahs	Ashe	ahs	10.62
+ajg	Aja (Benin)	ajg	5.63
+aju	Judeo-Moroccan Arabic	aju	7.21
+ala	Alago	ala	11.04
+aln	Gheg Albanian	aln	3.92
+alo	Larike-Wakasihu	alo	9.97
+am	Amharic	amh	12.83
+amu	Guerrero Amuzgo	amu	10.1
+an	Aragonese	arg	16.4
+anc	Ngas	anc	10.14
+ank	Goemai	ank	10.0
+anp	Angika	anp	10.65
+anw	Anaang	anw	9.65
+aom	Ömie	aom	8.19
+apc	Levantine Arabic	apc	15.65
+apd	Sudanese Arabic	apd	9.93
+arb	Standard Arabic	arb	1483.53
+arq	Algerian Arabic	arq	9.64
+ars	Najdi Arabic	ars	203.54
+ary	Moroccan Arabic	ary	104.67
+arz	Egyptian Arabic	arz	23.23
+as	Assamese	asm	270.85
+ast	Asturian	ast	8.48
+avl	Eastern Egyptian Bedawi Arabic	avl	1.86
+awo	Awak	awo	10.22
+ayl	Libyan Arabic	ayl	20.13
+ayp	North Mesopotamian Arabic	ayp	10.92
+az	Azerbaijani	aze	9.84
+ba	Bashkir	bak	249.1
+bag	Tuki	bag	10.97
+bas	Basa (Cameroon)	bas	10.66
+bax	Bamun	bax	10.24
+bba	Baatonum	bba	10.53
+bbj	Ghomálá'	bbj	7.32
+bbl	Bats	bbl	11.22
+bbu	Kulung (Nigeria)	bbu	10.39
+bce	Bamenyam	bce	9.9
+bci	Baoulé	bci	10.21
+bcs	Kohumono	bcs	10.45
+bcy	Bacama	bcy	9.94
+bda	Bayot	bda	9.47
+bde	Bade	bde	9.89
+bdm	Buduma	bdm	10.17
+be	Belarusian	bel	1809.43
+beb	Bebele	beb	7.52
+bew	Betawi	bew	11.15
+bfd	Bafut	bfd	9.03
+bft	Balti	bft	16.28
+bg	Bulgarian	bul	2190.76
+bgp	Eastern Balochi	bgp	10.98
+bhb	Bhili	bhb	9.98
+bhh	Bukharic	bhh	11.38
+bho	Bhojpuri	bho	10.05
+bhp	Bima	bhp	10.67
+bhr	Bara Malagasy	bhr	12.14
+bjj	Kanauji	bjj	11.01
+bjk	Barok	bjk	10.16
+bjn	Banjar	bjn	11.68
+bjt	Balanta-Ganja	bjt	9.41
+bkh	Bakoko	bkh	6.0
+bkm	Kom (Cameroon)	bkm	10.76
+bky	Bokyi	bky	9.85
+bmm	Northern Betsimisaraka Malagasy	bmm	19.12
+bmq	Bomu	bmq	10.68
+bn	Bengali	ben	271.76
+bnm	Batanga	bnm	15.01
+bnn	Bunun	bnn	9.26
+bns	Bundeli	bns	10.88
+bo	Tibetan	bod	82.27
+bou	Bondei	bou	9.98
+bqg	Bago-Kusuntu	bqg	8.86
+br	Breton	bre	25.48
+bra	Braj	bra	10.68
+brh	Brahui	brh	19.89
+bri	Mokpwe	bri	7.53
+brx	Bodo	brx	231.57
+bs	Bosnian	bos	690.73
+bsh	Kati	bsh	8.77
+bsj	Bangwinji	bsj	10.0
+bsk	Burushaski	bsk	9.14
+btm	Batak Mandailing	btm	11.09
+btv	Bateri	btv	9.8
+bug	Buginese	bug	11.09
+bum	Bulu (Cameroon)	bum	9.06
+buo	Terei	buo	9.48
+bux	Boghom	bux	10.48
+bwr	Bura-Pabir	bwr	10.4
+bxf	Bilur	bxf	10.84
+byc	Ubaghara	byc	11.11
+bys	Burak	bys	9.92
+byv	Medumba	byv	10.95
+byx	Qaqet	byx	9.79
+bzc	Southern Betsimisaraka Malagasy	bzc	17.45
+bzw	Basa (Nigeria)	bzw	10.27
+ca	Catalan	cat	3358.6
+ccg	Samba Daka	ccg	10.11
+ceb	Cebuano	ceb	12.17
+cen	Cen	cen	9.85
+cfa	Dijim-Bwilim	cfa	10.32
+cgg	Chiga	cgg	10.84
+chq	Quiotepec Chinantec	chq	9.76
+cjk	Chokwe	cjk	11.01
+ckb	Central Kurdish	ckb	137.52
+ckl	Cibak	ckl	10.91
+ckr	Kairak	ckr	10.51
+cky	Cakfem-Mushere	cky	8.96
+cnh	Hakha Chin	cnh	2.24
+cpy	South Ucayali Ashéninka	cpy	9.15
+cs	Czech	ces	148.13
+cte	Tepinapa Chinantec	cte	9.54
+ctl	Tlacoatzintepec Chinantec	ctl	10.04
+cut	Teutila Cuicatec	cut	8.04
+cux	Tepeuxila Cuicatec	cux	7.83
+cv	Chuvash	chv	23.96
+cy	Welsh	cym	131.21
+da	Danish	dan	1665.98
+dag	Dagbani	dag	10.14
+dar	Dargwa	dar	1.22
+dav	Taita	dav	9.12
+dbd	Dadiya	dbd	9.61
+dcc	Deccan	dcc	10.38
+de	German	deu	21927.13
+deg	Degema	deg	11.07
+dgh	Dghwede	dgh	9.95
+dgo	Dogri	dgo	117.04
+dje	Zarma	dje	10.72
+dmk	Domaaki	dmk	6.38
+dml	Dameli	dml	9.18
+dru	Rukai	dru	9.26
+dty	Dotyali	dty	10.85
+dua	Duala	dua	12.13
+dv	Dhivehi	div	38.61
+dyu	Dyula	dyu	0.34
+dzg	Dazaga	dzg	9.96
+ebr	Ebrié	ebr	1.5
+ebu	Embu	ebu	9.81
+ego	Eggon	ego	9.95
+eiv	Askopan	eiv	10.44
+eko	Koti	eko	8.15
+ekr	Yace	ekr	10.76
+el	Greek	ell	2412.54
+elm	Eleme	elm	11.27
+en	English	eng	206061.1
+eo	Esperanto	epo	1396.64
+es	Spanish	spa	27559.74
+esu	Central Yupik	esu	2.18
+et	Estonian	est	960.37
+eto	Eton (Cameroon)	eto	7.43
+ets	Yekhee	ets	10.11
+etu	Ejagham	etu	10.3
+eu	Basque	eus	479.86
+ewo	Ewondo	ewo	12.71
+ext	Extremaduran	ext	13.59
+eyo	Keiyo	eyo	9.24
+fa	Persian	fas	366.07
+fan	Fang (Equatorial Guinea)	fan	3.51
+fat	Fanti	fat	11.38
+ff	Fulah	ful	13.84
+ffm	Maasina Fulfulde	ffm	10.46
+fi	Finnish	fin	468.62
+fia	Nobiin	fia	9.96
+fil	Filipino	fil	7.71
+fip	Fipa	fip	10.55
+fkk	Kirya-Konzəl	fkk	9.98
+fmp	Fe'fe'	fmp	9.86
+fr	French	fra	23675.32
+fub	Adamawa Fulfulde	fub	13.12
+fuc	Pulaar	fuc	14.77
+fue	Borgu Fulfulde	fue	20.1
+fuf	Pular	fuf	13.77
+fuh	Western Niger Fulfulde	fuh	9.69
+fui	Bagirmi Fulfulde	fui	15.04
+fuq	Central-Eastern Niger Fulfulde	fuq	9.28
+fuv	Nigerian Fulfulde	fuv	9.97
+fy	Western Frisian	fry	70.41
+ga	Irish	gle	21.4
+gbm	Garhwali	gbm	19.14
+gbr	Gbagyi	gbr	12.12
+gby	Gbari	gby	12.59
+gcc	Mali	gcc	9.87
+gdf	Guduf-Gava	gdf	12.21
+gej	Gen	gej	5.39
+ges	Geser-Gorom	ges	10.08
+ggg	Gurgula	ggg	7.12
+gid	Gidar	gid	10.06
+gig	Goaria	gig	9.41
+giz	South Giziga	giz	10.03
+gjk	Kachi Koli	gjk	20.83
+gju	Gujari	gju	8.66
+gl	Galician	glg	208.81
+glw	Glavda	glw	10.51
+gn	Guarani	grn	4.06
+gol	Gola	gol	9.26
+gom	Goan Konkani	gom	9.82
+gsl	Gusilay	gsl	10.0
+gu	Gujarati	guj	91.18
+gui	Eastern Bolivian Guaraní	gui	22.72
+gur	Farefare	gur	9.24
+guz	Gusii	guz	9.5
+gv	Manx	glv	10.07
+gwc	Gawri	gwc	10.83
+gwe	Gweno	gwe	8.87
+gwt	Gawar-Bati	gwt	12.16
+gya	Northwest Gbaya	gya	8.45
+gyz	Geji	gyz	10.49
+ha	Hausa	hau	17.75
+hah	Hahon	hah	9.64
+hao	Hakö	hao	8.56
+haw	Hawaiian	haw	11.79
+haz	Hazaragi	haz	9.69
+hbb	Huba	hbb	10.7
+he	Hebrew	heb	13.4
+hem	Hemba	hem	9.53
+hi	Hindi	hin	117.17
+hia	Lamang	hia	11.07
+hkk	Hunjara-Kaina Ke	hkk	8.69
+hla	Halia	hla	9.86
+hno	Northern Hindko	hno	20.04
+hoj	Hadothi	hoj	10.08
+hr	Croatian	hrv	2795.31
+hsb	Upper Sorbian	hsb	2.71
+ht	Haitian	hat	0.04
+hu	Hungarian	hun	255.83
+hue	San Francisco Del Mar Huave	hue	9.45
+hul	Hula	hul	10.33
+hux	Nüpode Huitoto	hux	9.04
+hwo	Hwana	hwo	11.23
+hy	Armenian	hye	42.15
+hz	Herero	her	9.59
+ia	Interlingua (International Auxiliary Language Association)	ina	13.48
+ibb	Ibibio	ibb	7.38
+id	Indonesian	ind	6327.87
+ida	Idakho-Isukha-Tiriki	ida	9.31
+idu	Idoma	idu	11.16
+ig	Igbo	ibo	13.69
+ijc	Izon	ijc	9.95
+ijn	Kalabari	ijn	11.04
+ik	Inupiaq	ipk	2.11
+ikw	Ikwere	ikw	10.0
+is	Icelandic	isl	647.29
+ish	Esan	ish	10.05
+iso	Isoko	iso	10.33
+it	Italian	ita	9402.46
+its	Isekiri	its	11.85
+itw	Ito	itw	9.19
+itz	Itzá	itz	7.08
+ja	Japanese	jpn	36914.4
+jal	Yalahatan	jal	11.18
+jax	Jambi Malay	jax	10.29
+jgo	Ngomba	jgo	10.15
+jmx	Western Juxtlahuaca Mixtec	jmx	10.01
+jns	Jaunsari	jns	11.25
+jqr	Jaqaru	jqr	9.32
+juk	Wapan	juk	10.22
+juo	Jiba	juo	10.43
+jv	Javanese	jav	11.19
+ka	Georgian	kat	156.96
+kab	Kabyle	kab	529.52
+kai	Karekare	kai	10.52
+kaj	Jju	kaj	10.16
+kam	Kamba	kam	14.72
+kbd	Kabardian	kbd	108.35
+kbl	Kanembu	kbl	10.19
+kbt	Abadi	kbt	9.73
+kcq	Kamo	kcq	10.49
+kdh	Tem	kdh	4.07
+kea	Kabuverdianu	kea	10.51
+keu	Akebu	keu	9.1
+kfe	Kota (India)	kfe	10.25
+kfk	Kinnauri	kfk	10.32
+kfp	Korwa	kfp	11.87
+khg	Khams Tibetan	khg	6.38
+khw	Khowar	khw	15.55
+kj	Kuanyama	kua	9.88
+kjc	Coastal Konjo	kjc	10.18
+kjk	Highland Konjo	kjk	10.21
+kk	Kazakh	kaz	1537.29
+kln	Kalenjin	kln	40.42
+kls	Kalasha	kls	9.11
+km	Khmer	khm	7.1
+kmr	Northern Kurdish	kmr	69.59
+kmy	Koma	kmy	10.28
+kn	Kannada	kan	128.06
+kna	Dera (Nigeria)	kna	11.91
+knn	Konkani	knn	112.83
+ko	Korean	kor	8609.28
+kol	Kol (Papua New Guinea)	kol	9.95
+koo	Konzo	koo	13.23
+kpo	Ikposo	kpo	7.83
+kqo	Eastern Krahn	kqo	9.28
+ks	Kashmiri	kas	110.42
+ksd	Kuanua	ksd	9.91
+ksf	Bafia	ksf	16.43
+kto	Kuot	kto	9.77
+kuh	Kushi	kuh	10.35
+kvx	Parkari Koli	kvx	11.04
+kw	Cornish	cor	12.15
+kwm	Kwambi	kwm	9.9
+kxp	Wadiyara Koli	kxp	20.0
+ky	Kirghiz	kir	46.63
+kyx	Rapoisi	kyx	9.17
+lag	Rangi	lag	9.47
+lb	Luxembourgish	ltz	8.46
+lcm	Tungag	lcm	9.77
+ldb	Dũya	ldb	11.31
+lg	Ganda	lug	447.82
+lij	Ligurian	lij	15.97
+lir	Liberian English	lir	10.26
+lkb	Kabras	lkb	9.99
+lla	Lala-Roba	lla	10.38
+ln	Lingala	lin	17.99
+lnu	Longuda	lnu	10.46
+lo	Lao	lao	7.63
+loa	Loloda	loa	9.31
+lrk	Loarki	lrk	10.5
+lss	Lasi	lss	6.53
+lt	Lithuanian	lit	2629.45
+ltg	Latgalian	ltg	27.23
+lto	Tsotso	lto	9.77
+lua	Luba-Lulua	lua	8.47
+luo	Luo	luo	36.17
+lus	Lushai	lus	20.24
+lv	Latvian	lav	1441.58
+lwg	Wanga	lwg	9.36
+mab	Yutanduchi Mixtec	mab	9.26
+maf	Mafa	maf	9.97
+mai	Maithili	mai	131.37
+mau	Huautla Mazatec	mau	6.39
+max	North Moluccan Malay	max	9.43
+mbo	Mbo (Cameroon)	mbo	9.51
+mcf	Matsés	mcf	9.61
+mcn	Masana	mcn	10.09
+mcx	Mpiemo	mcx	9.88
+mdd	Mbum	mdd	9.82
+mde	Maba (Chad)	mde	9.5
+mdf	Moksha	mdf	0.47
+mek	Mekeo	mek	9.18
+mer	Meru	mer	9.89
+meu	Motu	meu	9.88
+mfm	Marghi South	mfm	10.05
+mfn	Cross River Mbembe	mfn	10.03
+mfo	Mbe	mfo	10.24
+mfv	Mandjak	mfv	9.55
+mgg	Mpumpong	mgg	4.94
+mgi	Lijili	mgi	10.89
+mhk	Mungaka	mhk	7.53
+mhr	Eastern Mari	mhr	272.31
+mi	Maori	mri	18.02
+mig	San Miguel El Grande Mixtec	mig	9.66
+miu	Cacaloxtepec Mixtec	miu	9.18
+mk	Macedonian	mkd	27.21
+mkf	Miya	mkf	10.16
+mki	Dhatki	mki	8.83
+ml	Malayalam	mal	166.57
+mlq	Western Maninkakan	mlq	9.83
+mn	Mongolian	mon	269.08
+mne	Naba	mne	10.37
+mni	Manipuri	mni	44.46
+mqy	Manggarai	mqy	10.5
+mr	Marathi	mar	156.71
+mrj	Western Mari	mrj	32.26
+mrr	Maria (India)	mrr	11.0
+mrt	Marghi Central	mrt	10.36
+ms	Malay	msa	9.57
+mse	Musey	mse	7.21
+msh	Masikoro Malagasy	msh	14.16
+msw	Mansoanka	msw	9.32
+mt	Maltese	mlt	630.29
+mtr	Mewari	mtr	10.58
+mtu	Tututepec Mixtec	mtu	10.13
+mtx	Tidaá Mixtec	mtx	9.09
+mua	Mundang	mua	9.2
+mug	Musgu	mug	4.74
+mui	Musi	mui	10.52
+mve	Marwari (Pakistan)	mve	9.96
+mvy	Indus Kohistani	mvy	21.64
+mxs	Huitepec Mixtec	mxs	9.64
+mxu	Mada (Cameroon)	mxu	12.0
+mxy	Southeastern Nochixtlán Mixtec	mxy	9.48
+my	Burmese	mya	12.14
+myv	Erzya	myv	3.1
+mzl	Mazatlán Mixe	mzl	10.05
+nal	Nalik	nal	10.33
+nan	Min Nan Chinese	nan	17.55
+nap	Neapolitan	nap	9.97
+nb	Norwegian Bokmål	nob	12.7
+nbh	Ngamo	nbh	10.04
+ncf	Notsi	ncf	9.84
+nco	Sibe	nco	9.96
+ncx	Central Puebla Nahuatl	ncx	9.86
+ndi	Samba Leko	ndi	11.27
+ng	Ndonga	ndo	9.08
+ngi	Ngizim	ngi	10.06
+nhg	Tetelcingo Nahuatl	nhg	8.92
+nhi	Zacatlán-Ahuacatlán-Tepetzintla Nahuatl	nhi	0.05
+nhn	Central Nahuatl	nhn	9.51
+nhq	Huaxcaleca Nahuatl	nhq	5.07
+nja	Nzanyi	nja	10.02
+nl	Dutch	nld	2264.13
+nla	Ngombale	nla	8.79
+nlv	Orizaba Nahuatl	nlv	11.42
+nmg	Kwasio	nmg	10.39
+nmz	Nawdm	nmz	6.3
+nn	Norwegian Nynorsk	nno	1.54
+nnh	Ngiemboon	nnh	16.15
+no	Norwegian	nor	3849.8
+noe	Nimadi	noe	11.12
+npi	Nepali	npi	171.5
+nso	Pedi	nso	12.64
+ny	Chichewa	nya	10.8
+nyu	Nyungwe	nyu	8.98
+oc	Occitan	oci	16.8
+odk	Od	odk	20.26
+odu	Odual	odu	10.57
+ogo	Khana	ogo	10.51
+om	Oromo	orm	6.6
+orc	Orma	orc	22.01
+oru	Ormuri	oru	16.74
+ory	Odia	ory	144.81
+os	Iron Ossetic	oss	1.38
+pa	Panjabi	pan	147.37
+pbs	Central Pame	pbs	9.69
+pbt	Southern Pashto	pbt	11.6
+pbu	Northern Pashto	pbu	11.03
+pcm	Nigerian Pidgin	pcm	11.04
+pex	Petats	pex	10.2
+phl	Phalura	phl	20.69
+phr	Pahari-Potwari	phr	24.03
+pip	Pero	pip	9.85
+piy	Piya-Kwonci	piy	10.38
+pko	Pökoot	pko	10.4
+pl	Polish	pol	911.68
+plk	Kohistani Shina	plk	12.75
+plt	Plateau Malagasy	plt	19.39
+pmq	Northern Pame	pmq	10.24
+pms	Piemontese	pms	16.01
+pmy	Papuan Malay	pmy	10.17
+pnb	Western Panjabi	pnb	10.0
+poc	Poqomam	poc	9.63
+poe	San Juan Atzingo Popoloca	poe	10.01
+pow	San Felipe Otlaltepec Popoloca	pow	8.84
+prq	Ashéninka Perené	prq	7.16
+ps	Pushto	pus	88.62
+pst	Central Pashto	pst	11.4
+pt	Portuguese	por	16855.05
+pua	Western Highland Purepecha	pua	10.17
+pwn	Paiwan	pwn	13.76
+qug	Chimborazo Highland Quichua	qug	10.12
+qum	Sipacapense	qum	9.37
+qup	Southern Pastaza Quechua	qup	11.13
+qur	Yanahuanca Pasco Quechua	qur	9.95
+qus	Santiago del Estero Quichua	qus	9.55
+quv	Sacapulteco	quv	8.9
+qux	Yauyos Quechua	qux	9.35
+quy	Ayacucho Quechua	quy	0.05
+qva	Ambo-Pasco Quechua	qva	9.59
+qvi	Imbabura Highland Quichua	qvi	11.0
+qvj	Loja Highland Quichua	qvj	10.59
+qvl	Cajatambo North Lima Quechua	qvl	9.95
+qwa	Corongo Ancash Quechua	qwa	9.72
+qws	Sihuas Ancash Quechua	qws	10.18
+qxa	Chiquián Ancash Quechua	qxa	9.99
+qxp	Puno Quechua	qxp	9.81
+qxt	Santa Ana de Tusi Pasco Quechua	qxt	10.05
+qxu	Arequipa-La Unión Quechua	qxu	10.12
+qxw	Jauja Wanca Quechua	qxw	11.42
+rag	Logooli	rag	9.39
+rm	Romansh	roh	9.21
+ro	Romanian	ron	70.23
+rob	Tae'	rob	9.02
+rof	Rombo	rof	18.9
+roo	Rotokas	roo	9.07
+rth	Ratahan	rth	9.34
+ru	Russian	rus	20338.5
+rup	Macedo-Romanian	rup	0.02
+rw	Kinyarwanda	kin	2021.66
+sa	Sanskrit	san	84.44
+sah	Yakut	sah	16.08
+sat	Santali	sat	98.37
+sau	Saleman	sau	10.53
+say	Saya	say	10.02
+sbn	Sindhi Bhil	sbn	10.53
+sc	Sardinian	srd	2.77
+scl	Shina	scl	9.84
+scn	Sicilian	scn	13.35
+sd	Sindhi	snd	46.27
+sei	Seri	sei	9.81
+shu	Chadian Arabic	shu	2.29
+si	Sinhala	sin	11.98
+sip	Sikkimese	sip	10.07
+siw	Siwai	siw	10.47
+sjr	Siar-Lak	sjr	9.87
+sk	Slovak	slk	2478.46
+skg	Sakalava Malagasy	skg	9.02
+skr	Saraiki	skr	4.13
+sl	Slovenian	slv	1172.61
+sn	Shona	sna	9.96
+snc	Sinaugoro	snc	10.38
+snk	Soninke	snk	10.04
+so	Somali	som	13.22
+sol	Solos	sol	9.95
+sps	Saposa	sps	9.81
+sq	Albanian	sqi	8.59
+sr	Serbian	srp	1855.33
+src	Logudorese Sardinian	src	10.67
+sro	Campidanese Sardinian	sro	10.16
+ssi	Sansi	ssi	10.47
+ste	Liana-Seti	ste	10.43
+sua	Sulka	sua	10.12
+sv	Swedish	swe	2453.14
+sva	Svan	sva	15.11
+sw	Swahili	swa	418.41
+szy	Sakizaya	szy	11.47
+ta	Tamil	tam	423.09
+tan	Tangale	tan	10.14
+tar	Central Tarahumara	tar	9.73
+tay	Atayal	tay	7.02
+tbf	Mandara	tbf	10.01
+tcf	Malinaltepec Me'phaa	tcf	9.04
+tcy	Tulu	tcy	11.72
+tdn	Tondano	tdn	9.14
+tdx	Tandroy-Mahafaly Malagasy	tdx	3.81
+te	Telugu	tel	230.21
+tg	Tajik	tgk	9.23
+tgc	Tigak	tgc	9.71
+th	Thai	tha	10499.77
+the	Chitwania Tharu	the	10.06
+thq	Kochila Tharu	thq	10.28
+thr	Rana Tharu	thr	9.99
+thv	Tahaggart Tamahaq	thv	4.25
+ti	Tigrinya	tir	0.08
+tig	Tigre	tig	7.49
+tio	Teop	tio	9.85
+tk	Turkmen	tuk	2.86
+tkg	Tesaka Malagasy	tkg	17.86
+tkt	Kathoriya Tharu	tkt	10.64
+tli	Tlingit	tli	0.41
+tlp	Filomena Mata-Coahuitlán Totonac	tlp	11.35
+tn	Tswana	tsn	4.24
+tok	Toki Pona	tok	13.51
+tpl	Tlacoapa Me'phaa	tpl	9.28
+tpz	Tinputz	tpz	9.33
+tqp	Tomoip	tqp	10.1
+tr	Turkish	tur	125.36
+trp	Kok Borok	trp	10.74
+trq	San Martín Itunyoso Triqui	trq	8.29
+trv	Sediq	trv	7.77
+trw	Torwali	trw	14.98
+tt	Tatar	tat	30.03
+ttj	Tooro	ttj	10.31
+ttr	Tera	ttr	9.89
+ttu	Torau	ttu	9.87
+tui	Tupuri	tui	9.26
+tul	Tula	tul	9.79
+tuq	Tedaga	tuq	10.0
+tuv	Turkana	tuv	10.17
+tuy	Tugen	tuy	8.79
+tvo	Tidore	tvo	10.31
+tvu	Tunen	tvu	9.85
+tw	Twi	twi	0.25
+twu	Termanu	twu	11.45
+txs	Tonsea	txs	9.32
+txy	Tanosy Malagasy	txy	12.07
+udl	Wuzlam	udl	9.23
+ug	Uighur	uig	428.77
+uk	Ukrainian	ukr	1851.97
+uki	Kui (India)	uki	10.77
+umb	Umbundu	umb	10.59
+ur	Urdu	urd	211.27
+ush	Ushojo	ush	6.36
+uz	Uzbek	uzb	115.28
+uzn	Northern Uzbek	uzn	15.23
+vai	Vai	vai	8.76
+var	Huarijio	var	9.28
+ver	Mom Jango	ver	10.93
+vi	Vietnamese	vie	8481.98
+vmc	Juxtlahuaca Mixtec	vmc	9.43
+vmj	Ixtayutla Mixtec	vmj	10.17
+vmm	Mitlatongo Mixtec	vmm	9.95
+vmp	Soyaltepec Mazatec	vmp	10.17
+vmz	Mazatlán Mazatec	vmz	9.82
+vot	Votic	vot	0.1
+vro	Võro	vro	15.66
+wbl	Wakhi	wbl	11.67
+wci	Waci Gbe	wci	8.02
+weo	Wemale	weo	9.09
+wes	Cameroon Pidgin	wes	10.06
+wja	Waja	wja	10.22
+wji	Warji	wji	11.39
+wo	Wolof	wol	8.71
+wof	Gambian Wolof	wof	9.46
+xh	Xhosa	xho	13.35
+xhe	Khetrani	xhe	9.4
+xka	Kalkoti	xka	8.0
+xmf	Mingrelian	xmf	11.47
+xmv	Antankarana Malagasy	xmv	17.9
+xmw	Tsimihety Malagasy	xmw	11.53
+xpe	Liberia Kpelle	xpe	9.5
+xti	Sinicahua Mixtec	xti	9.5
+xtu	Cuyamecalco Mixtec	xtu	9.4
+yaq	Yaqui	yaq	9.93
+yav	Yangben	yav	8.7
+yay	Agwagwune	yay	8.26
+ydd	Eastern Yiddish	ydd	18.43
+ydg	Yidgha	ydg	9.89
+yer	Tarok	yer	10.08
+yes	Nyankpa	yes	10.26
+yi	Yiddish	yid	1.81
+yo	Yoruba	yor	15.66
+yue	Cantonese	yue	13302.38
+zga	Kinga	zga	9.5
+zgh	Standard Moroccan Tamazight	zgh	1.19
+zh	Chinese	cmn	111343.3
+zoc	Copainalá Zoque	zoc	10.07
+zoh	Chimalapa Zoque	zoh	9.35
+zor	Rayón Zoque	zor	9.04
+zpv	Chichicapan Zapotec	zpv	9.85
+zpy	Mazaltepec Zapotec	zpy	9.47
+ztg	Xanaguía Zapotec	ztg	9.86
+ztn	Santa Catarina Albarradas Zapotec	ztn	10.02
+ztp	Loxicha Zapotec	ztp	9.62
+zts	Tilquiapan Zapotec	zts	9.33
+ztu	Güilá Zapotec	ztu	9.17
+zu	Zulu	zul	14.83
+zza	Zaza	zza	1.52

docs/languages.md ADDED Viewed

	@@ -0,0 +1,659 @@

+# Supported Languages
+OmniVoice supports **646 languages** with a total of **581k hours** of training data.
+The table below lists each language with its OmniVoice language ID,
+ISO 639-3 code, and training data duration (hours).
+| # | Language | OmniVoice ID | ISO 639-3 | Duration (h) |
+|--:|----------|:------------:|:---------:|:------------:|
+| 1 | Abadi | kbt | kbt | 9.73 |
+| 2 | Abkhazian | ab | abk | 57.27 |
+| 3 | Abron | abr | abr | 9.22 |
+| 4 | Abua | abn | abn | 10.27 |
+| 5 | Adamawa Fulfulde | fub | fub | 13.12 |
+| 6 | Adyghe | ady | ady | 32.6 |
+| 7 | Afade | aal | aal | 10.19 |
+| 8 | Afrikaans | af | afr | 4.4 |
+| 9 | Agwagwune | yay | yay | 8.26 |
+| 10 | Aja (Benin) | ajg | ajg | 5.63 |
+| 11 | Akebu | keu | keu | 9.1 |
+| 12 | Alago | ala | ala | 11.04 |
+| 13 | Albanian | sq | sqi | 8.59 |
+| 14 | Algerian Arabic | arq | arq | 9.64 |
+| 15 | Algerian Saharan Arabic | aao | aao | 2.02 |
+| 16 | Ambo-Pasco Quechua | qva | qva | 9.59 |
+| 17 | Ambonese Malay | abs | abs | 10.03 |
+| 18 | Amdo Tibetan | adx | adx | 56.94 |
+| 19 | Amharic | am | amh | 12.83 |
+| 20 | Anaang | anw | anw | 9.65 |
+| 21 | Angika | anp | anp | 10.65 |
+| 22 | Antankarana Malagasy | xmv | xmv | 17.9 |
+| 23 | Aragonese | an | arg | 16.4 |
+| 24 | Arbëreshë Albanian | aae | aae | 6.11 |
+| 25 | Arequipa-La Unión Quechua | qxu | qxu | 10.12 |
+| 26 | Armenian | hy | hye | 42.15 |
+| 27 | Ashe | ahs | ahs | 10.62 |
+| 28 | Ashéninka Perené | prq | prq | 7.16 |
+| 29 | Askopan | eiv | eiv | 10.44 |
+| 30 | Assamese | as | asm | 270.85 |
+| 31 | Asturian | ast | ast | 8.48 |
+| 32 | Atayal | tay | tay | 7.02 |
+| 33 | Awak | awo | awo | 10.22 |
+| 34 | Ayacucho Quechua | quy | quy | 0.05 |
+| 35 | Azerbaijani | az | aze | 9.84 |
+| 36 | Baatonum | bba | bba | 10.53 |
+| 37 | Bacama | bcy | bcy | 9.94 |
+| 38 | Bade | bde | bde | 9.89 |
+| 39 | Bafia | ksf | ksf | 16.43 |
+| 40 | Bafut | bfd | bfd | 9.03 |
+| 41 | Bagirmi Fulfulde | fui | fui | 15.04 |
+| 42 | Bago-Kusuntu | bqg | bqg | 8.86 |
+| 43 | Baharna Arabic | abv | abv | 10.41 |
+| 44 | Bakoko | bkh | bkh | 6.0 |
+| 45 | Balanta-Ganja | bjt | bjt | 9.41 |
+| 46 | Balti | bft | bft | 16.28 |
+| 47 | Bamenyam | bce | bce | 9.9 |
+| 48 | Bamun | bax | bax | 10.24 |
+| 49 | Bangwinji | bsj | bsj | 10.0 |
+| 50 | Banjar | bjn | bjn | 11.68 |
+| 51 | Bankon | abb | abb | 11.2 |
+| 52 | Baoulé | bci | bci | 10.21 |
+| 53 | Bara Malagasy | bhr | bhr | 12.14 |
+| 54 | Barok | bjk | bjk | 10.16 |
+| 55 | Basa (Cameroon) | bas | bas | 10.66 |
+| 56 | Basa (Nigeria) | bzw | bzw | 10.27 |
+| 57 | Bashkir | ba | bak | 249.1 |
+| 58 | Basque | eu | eus | 479.86 |
+| 59 | Batak Mandailing | btm | btm | 11.09 |
+| 60 | Batanga | bnm | bnm | 15.01 |
+| 61 | Bateri | btv | btv | 9.8 |
+| 62 | Bats | bbl | bbl | 11.22 |
+| 63 | Bayot | bda | bda | 9.47 |
+| 64 | Bebele | beb | beb | 7.52 |
+| 65 | Belarusian | be | bel | 1809.43 |
+| 66 | Bengali | bn | ben | 271.76 |
+| 67 | Betawi | bew | bew | 11.15 |
+| 68 | Bhili | bhb | bhb | 9.98 |
+| 69 | Bhojpuri | bho | bho | 10.05 |
+| 70 | Bilur | bxf | bxf | 10.84 |
+| 71 | Bima | bhp | bhp | 10.67 |
+| 72 | Bodo | brx | brx | 231.57 |
+| 73 | Boghom | bux | bux | 10.48 |
+| 74 | Bokyi | bky | bky | 9.85 |
+| 75 | Bomu | bmq | bmq | 10.68 |
+| 76 | Bondei | bou | bou | 9.98 |
+| 77 | Borgu Fulfulde | fue | fue | 20.1 |
+| 78 | Bosnian | bs | bos | 690.73 |
+| 79 | Brahui | brh | brh | 19.89 |
+| 80 | Braj | bra | bra | 10.68 |
+| 81 | Breton | br | bre | 25.48 |
+| 82 | Buduma | bdm | bdm | 10.17 |
+| 83 | Buginese | bug | bug | 11.09 |
+| 84 | Bukharic | bhh | bhh | 11.38 |
+| 85 | Bulgarian | bg | bul | 2190.76 |
+| 86 | Bulu (Cameroon) | bum | bum | 9.06 |
+| 87 | Bundeli | bns | bns | 10.88 |
+| 88 | Bunun | bnn | bnn | 9.26 |
+| 89 | Bura-Pabir | bwr | bwr | 10.4 |
+| 90 | Burak | bys | bys | 9.92 |
+| 91 | Burmese | my | mya | 12.14 |
+| 92 | Burushaski | bsk | bsk | 9.14 |
+| 93 | Cacaloxtepec Mixtec | miu | miu | 9.18 |
+| 94 | Cajatambo North Lima Quechua | qvl | qvl | 9.95 |
+| 95 | Cakfem-Mushere | cky | cky | 8.96 |
+| 96 | Cameroon Pidgin | wes | wes | 10.06 |
+| 97 | Campidanese Sardinian | sro | sro | 10.16 |
+| 98 | Cantonese | yue | yue | 13302.38 |
+| 99 | Catalan | ca | cat | 3358.6 |
+| 100 | Cebuano | ceb | ceb | 12.17 |
+| 101 | Cen | cen | cen | 9.85 |
+| 102 | Central Kurdish | ckb | ckb | 137.52 |
+| 103 | Central Nahuatl | nhn | nhn | 9.51 |
+| 104 | Central Pame | pbs | pbs | 9.69 |
+| 105 | Central Pashto | pst | pst | 11.4 |
+| 106 | Central Puebla Nahuatl | ncx | ncx | 9.86 |
+| 107 | Central Tarahumara | tar | tar | 9.73 |
+| 108 | Central Yupik | esu | esu | 2.18 |
+| 109 | Central-Eastern Niger Fulfulde | fuq | fuq | 9.28 |
+| 110 | Chadian Arabic | shu | shu | 2.29 |
+| 111 | Chichewa | ny | nya | 10.8 |
+| 112 | Chichicapan Zapotec | zpv | zpv | 9.85 |
+| 113 | Chiga | cgg | cgg | 10.84 |
+| 114 | Chimalapa Zoque | zoh | zoh | 9.35 |
+| 115 | Chimborazo Highland Quichua | qug | qug | 10.12 |
+| 116 | Chinese | zh | cmn | 111343.3 |
+| 117 | Chiquián Ancash Quechua | qxa | qxa | 9.99 |
+| 118 | Chitwania Tharu | the | the | 10.06 |
+| 119 | Chokwe | cjk | cjk | 11.01 |
+| 120 | Chuvash | cv | chv | 23.96 |
+| 121 | Cibak | ckl | ckl | 10.91 |
+| 122 | Coastal Konjo | kjc | kjc | 10.18 |
+| 123 | Copainalá Zoque | zoc | zoc | 10.07 |
+| 124 | Cornish | kw | cor | 12.15 |
+| 125 | Corongo Ancash Quechua | qwa | qwa | 9.72 |
+| 126 | Croatian | hr | hrv | 2795.31 |
+| 127 | Cross River Mbembe | mfn | mfn | 10.03 |
+| 128 | Cuyamecalco Mixtec | xtu | xtu | 9.4 |
+| 129 | Czech | cs | ces | 148.13 |
+| 130 | Dadiya | dbd | dbd | 9.61 |
+| 131 | Dagbani | dag | dag | 10.14 |
+| 132 | Dameli | dml | dml | 9.18 |
+| 133 | Danish | da | dan | 1665.98 |
+| 134 | Dargwa | dar | dar | 1.22 |
+| 135 | Dazaga | dzg | dzg | 9.96 |
+| 136 | Deccan | dcc | dcc | 10.38 |
+| 137 | Degema | deg | deg | 11.07 |
+| 138 | Dera (Nigeria) | kna | kna | 11.91 |
+| 139 | Dghwede | dgh | dgh | 9.95 |
+| 140 | Dhatki | mki | mki | 8.83 |
+| 141 | Dhivehi | dv | div | 38.61 |
+| 142 | Dhofari Arabic | adf | adf | 0.31 |
+| 143 | Dijim-Bwilim | cfa | cfa | 10.32 |
+| 144 | Dogri | dgo | dgo | 117.04 |
+| 145 | Domaaki | dmk | dmk | 6.38 |
+| 146 | Dotyali | dty | dty | 10.85 |
+| 147 | Duala | dua | dua | 12.13 |
+| 148 | Dutch | nl | nld | 2264.13 |
+| 149 | Dũya | ldb | ldb | 11.31 |
+| 150 | Dyula | dyu | dyu | 0.34 |
+| 151 | Eastern Balochi | bgp | bgp | 10.98 |
+| 152 | Eastern Bolivian Guaraní | gui | gui | 22.72 |
+| 153 | Eastern Egyptian Bedawi Arabic | avl | avl | 1.86 |
+| 154 | Eastern Krahn | kqo | kqo | 9.28 |
+| 155 | Eastern Mari | mhr | mhr | 272.31 |
+| 156 | Eastern Yiddish | ydd | ydd | 18.43 |
+| 157 | Ebrié | ebr | ebr | 1.5 |
+| 158 | Eggon | ego | ego | 9.95 |
+| 159 | Egyptian Arabic | arz | arz | 23.23 |
+| 160 | Ejagham | etu | etu | 10.3 |
+| 161 | Eleme | elm | elm | 11.27 |
+| 162 | Eloyi | afo | afo | 11.21 |
+| 163 | Embu | ebu | ebu | 9.81 |
+| 164 | English | en | eng | 206061.1 |
+| 165 | Erzya | myv | myv | 3.1 |
+| 166 | Esan | ish | ish | 10.05 |
+| 167 | Esperanto | eo | epo | 1396.64 |
+| 168 | Estonian | et | est | 960.37 |
+| 169 | Eton (Cameroon) | eto | eto | 7.43 |
+| 170 | Ewondo | ewo | ewo | 12.71 |
+| 171 | Extremaduran | ext | ext | 13.59 |
+| 172 | Fang (Equatorial Guinea) | fan | fan | 3.51 |
+| 173 | Fanti | fat | fat | 11.38 |
+| 174 | Farefare | gur | gur | 9.24 |
+| 175 | Fe'fe' | fmp | fmp | 9.86 |
+| 176 | Filipino | fil | fil | 7.71 |
+| 177 | Filomena Mata-Coahuitlán Totonac | tlp | tlp | 11.35 |
+| 178 | Finnish | fi | fin | 468.62 |
+| 179 | Fipa | fip | fip | 10.55 |
+| 180 | French | fr | fra | 23675.32 |
+| 181 | Fulah | ff | ful | 13.84 |
+| 182 | Galician | gl | glg | 208.81 |
+| 183 | Gambian Wolof | wof | wof | 9.46 |
+| 184 | Ganda | lg | lug | 447.82 |
+| 185 | Garhwali | gbm | gbm | 19.14 |
+| 186 | Gawar-Bati | gwt | gwt | 12.16 |
+| 187 | Gawri | gwc | gwc | 10.83 |
+| 188 | Gbagyi | gbr | gbr | 12.12 |
+| 189 | Gbari | gby | gby | 12.59 |
+| 190 | Geji | gyz | gyz | 10.49 |
+| 191 | Gen | gej | gej | 5.39 |
+| 192 | Georgian | ka | kat | 156.96 |
+| 193 | German | de | deu | 21927.13 |
+| 194 | Geser-Gorom | ges | ges | 10.08 |
+| 195 | Gheg Albanian | aln | aln | 3.92 |
+| 196 | Ghomálá' | bbj | bbj | 7.32 |
+| 197 | Gidar | gid | gid | 10.06 |
+| 198 | Glavda | glw | glw | 10.51 |
+| 199 | Goan Konkani | gom | gom | 9.82 |
+| 200 | Goaria | gig | gig | 9.41 |
+| 201 | Goemai | ank | ank | 10.0 |
+| 202 | Gola | gol | gol | 9.26 |
+| 203 | Greek | el | ell | 2412.54 |
+| 204 | Guarani | gn | grn | 4.06 |
+| 205 | Guduf-Gava | gdf | gdf | 12.21 |
+| 206 | Guerrero Amuzgo | amu | amu | 10.1 |
+| 207 | Gujarati | gu | guj | 91.18 |
+| 208 | Gujari | gju | gju | 8.66 |
+| 209 | Gulf Arabic | afb | afb | 98.55 |
+| 210 | Gurgula | ggg | ggg | 7.12 |
+| 211 | Gusii | guz | guz | 9.5 |
+| 212 | Gusilay | gsl | gsl | 10.0 |
+| 213 | Gweno | gwe | gwe | 8.87 |
+| 214 | Güilá Zapotec | ztu | ztu | 9.17 |
+| 215 | Hadothi | hoj | hoj | 10.08 |
+| 216 | Hahon | hah | hah | 9.64 |
+| 217 | Haitian | ht | hat | 0.04 |
+| 218 | Hakha Chin | cnh | cnh | 2.24 |
+| 219 | Hakö | hao | hao | 8.56 |
+| 220 | Halia | hla | hla | 9.86 |
+| 221 | Hausa | ha | hau | 17.75 |
+| 222 | Hawaiian | haw | haw | 11.79 |
+| 223 | Hazaragi | haz | haz | 9.69 |
+| 224 | Hebrew | he | heb | 13.4 |
+| 225 | Hemba | hem | hem | 9.53 |
+| 226 | Herero | hz | her | 9.59 |
+| 227 | Highland Konjo | kjk | kjk | 10.21 |
+| 228 | Hijazi Arabic | acw | acw | 22.32 |
+| 229 | Hindi | hi | hin | 117.17 |
+| 230 | Huarijio | var | var | 9.28 |
+| 231 | Huautla Mazatec | mau | mau | 6.39 |
+| 232 | Huaxcaleca Nahuatl | nhq | nhq | 5.07 |
+| 233 | Huba | hbb | hbb | 10.7 |
+| 234 | Huitepec Mixtec | mxs | mxs | 9.64 |
+| 235 | Hula | hul | hul | 10.33 |
+| 236 | Hungarian | hu | hun | 255.83 |
+| 237 | Hunjara-Kaina Ke | hkk | hkk | 8.69 |
+| 238 | Hwana | hwo | hwo | 11.23 |
+| 239 | Ibibio | ibb | ibb | 7.38 |
+| 240 | Icelandic | is | isl | 647.29 |
+| 241 | Idakho-Isukha-Tiriki | ida | ida | 9.31 |
+| 242 | Idoma | idu | idu | 11.16 |
+| 243 | Igbo | ig | ibo | 13.69 |
+| 244 | Igo | ahl | ahl | 9.22 |
+| 245 | Ikposo | kpo | kpo | 7.83 |
+| 246 | Ikwere | ikw | ikw | 10.0 |
+| 247 | Imbabura Highland Quichua | qvi | qvi | 11.0 |
+| 248 | Indonesian | id | ind | 6327.87 |
+| 249 | Indus Kohistani | mvy | mvy | 21.64 |
+| 250 | Interlingua (International Auxiliary Language Association) | ia | ina | 13.48 |
+| 251 | Inupiaq | ik | ipk | 2.11 |
+| 252 | Irish | ga | gle | 21.4 |
+| 253 | Iron Ossetic | os | oss | 1.38 |
+| 254 | Isekiri | its | its | 11.85 |
+| 255 | Isoko | iso | iso | 10.33 |
+| 256 | Italian | it | ita | 9402.46 |
+| 257 | Ito | itw | itw | 9.19 |
+| 258 | Itzá | itz | itz | 7.08 |
+| 259 | Ixtayutla Mixtec | vmj | vmj | 10.17 |
+| 260 | Izon | ijc | ijc | 9.95 |
+| 261 | Jambi Malay | jax | jax | 10.29 |
+| 262 | Japanese | ja | jpn | 36914.4 |
+| 263 | Jaqaru | jqr | jqr | 9.32 |
+| 264 | Jauja Wanca Quechua | qxw | qxw | 11.42 |
+| 265 | Jaunsari | jns | jns | 11.25 |
+| 266 | Javanese | jv | jav | 11.19 |
+| 267 | Jiba | juo | juo | 10.43 |
+| 268 | Jju | kaj | kaj | 10.16 |
+| 269 | Judeo-Moroccan Arabic | aju | aju | 7.21 |
+| 270 | Juxtlahuaca Mixtec | vmc | vmc | 9.43 |
+| 271 | Kabardian | kbd | kbd | 108.35 |
+| 272 | Kabras | lkb | lkb | 9.99 |
+| 273 | Kabuverdianu | kea | kea | 10.51 |
+| 274 | Kabyle | kab | kab | 529.52 |
+| 275 | Kachi Koli | gjk | gjk | 20.83 |
+| 276 | Kairak | ckr | ckr | 10.51 |
+| 277 | Kalabari | ijn | ijn | 11.04 |
+| 278 | Kalasha | kls | kls | 9.11 |
+| 279 | Kalenjin | kln | kln | 40.42 |
+| 280 | Kalkoti | xka | xka | 8.0 |
+| 281 | Kamba | kam | kam | 14.72 |
+| 282 | Kamo | kcq | kcq | 10.49 |
+| 283 | Kanauji | bjj | bjj | 11.01 |
+| 284 | Kanembu | kbl | kbl | 10.19 |
+| 285 | Kannada | kn | kan | 128.06 |
+| 286 | Karekare | kai | kai | 10.52 |
+| 287 | Kashmiri | ks | kas | 110.42 |
+| 288 | Kathoriya Tharu | tkt | tkt | 10.64 |
+| 289 | Kati | bsh | bsh | 8.77 |
+| 290 | Kazakh | kk | kaz | 1537.29 |
+| 291 | Keiyo | eyo | eyo | 9.24 |
+| 292 | Khams Tibetan | khg | khg | 6.38 |
+| 293 | Khana | ogo | ogo | 10.51 |
+| 294 | Khetrani | xhe | xhe | 9.4 |
+| 295 | Khmer | km | khm | 7.1 |
+| 296 | Khowar | khw | khw | 15.55 |
+| 297 | Kinga | zga | zga | 9.5 |
+| 298 | Kinnauri | kfk | kfk | 10.32 |
+| 299 | Kinyarwanda | rw | kin | 2021.66 |
+| 300 | Kirghiz | ky | kir | 46.63 |
+| 301 | Kirya-Konzəl | fkk | fkk | 9.98 |
+| 302 | Kochila Tharu | thq | thq | 10.28 |
+| 303 | Kohistani Shina | plk | plk | 12.75 |
+| 304 | Kohumono | bcs | bcs | 10.45 |
+| 305 | Kok Borok | trp | trp | 10.74 |
+| 306 | Kol (Papua New Guinea) | kol | kol | 9.95 |
+| 307 | Kom (Cameroon) | bkm | bkm | 10.76 |
+| 308 | Koma | kmy | kmy | 10.28 |
+| 309 | Konkani | knn | knn | 112.83 |
+| 310 | Konzo | koo | koo | 13.23 |
+| 311 | Korean | ko | kor | 8609.28 |
+| 312 | Korwa | kfp | kfp | 11.87 |
+| 313 | Kota (India) | kfe | kfe | 10.25 |
+| 314 | Koti | eko | eko | 8.15 |
+| 315 | Kuanua | ksd | ksd | 9.91 |
+| 316 | Kuanyama | kj | kua | 9.88 |
+| 317 | Kui (India) | uki | uki | 10.77 |
+| 318 | Kulung (Nigeria) | bbu | bbu | 10.39 |
+| 319 | Kuot | kto | kto | 9.77 |
+| 320 | Kushi | kuh | kuh | 10.35 |
+| 321 | Kwambi | kwm | kwm | 9.9 |
+| 322 | Kwasio | nmg | nmg | 10.39 |
+| 323 | Lala-Roba | lla | lla | 10.38 |
+| 324 | Lamang | hia | hia | 11.07 |
+| 325 | Lao | lo | lao | 7.63 |
+| 326 | Larike-Wakasihu | alo | alo | 9.97 |
+| 327 | Lasi | lss | lss | 6.53 |
+| 328 | Latgalian | ltg | ltg | 27.23 |
+| 329 | Latvian | lv | lav | 1441.58 |
+| 330 | Levantine Arabic | apc | apc | 15.65 |
+| 331 | Liana-Seti | ste | ste | 10.43 |
+| 332 | Liberia Kpelle | xpe | xpe | 9.5 |
+| 333 | Liberian English | lir | lir | 10.26 |
+| 334 | Libyan Arabic | ayl | ayl | 20.13 |
+| 335 | Ligurian | lij | lij | 15.97 |
+| 336 | Lijili | mgi | mgi | 10.89 |
+| 337 | Lingala | ln | lin | 17.99 |
+| 338 | Lithuanian | lt | lit | 2629.45 |
+| 339 | Loarki | lrk | lrk | 10.5 |
+| 340 | Logooli | rag | rag | 9.39 |
+| 341 | Logudorese Sardinian | src | src | 10.67 |
+| 342 | Loja Highland Quichua | qvj | qvj | 10.59 |
+| 343 | Loloda | loa | loa | 9.31 |
+| 344 | Longuda | lnu | lnu | 10.46 |
+| 345 | Loxicha Zapotec | ztp | ztp | 9.62 |
+| 346 | Luba-Lulua | lua | lua | 8.47 |
+| 347 | Luo | luo | luo | 36.17 |
+| 348 | Lushai | lus | lus | 20.24 |
+| 349 | Luxembourgish | lb | ltz | 8.46 |
+| 350 | Maasina Fulfulde | ffm | ffm | 10.46 |
+| 351 | Maba (Chad) | mde | mde | 9.5 |
+| 352 | Macedo-Romanian | rup | rup | 0.02 |
+| 353 | Macedonian | mk | mkd | 27.21 |
+| 354 | Mada (Cameroon) | mxu | mxu | 12.0 |
+| 355 | Mafa | maf | maf | 9.97 |
+| 356 | Maithili | mai | mai | 131.37 |
+| 357 | Malay | ms | msa | 9.57 |
+| 358 | Malayalam | ml | mal | 166.57 |
+| 359 | Mali | gcc | gcc | 9.87 |
+| 360 | Malinaltepec Me'phaa | tcf | tcf | 9.04 |
+| 361 | Maltese | mt | mlt | 630.29 |
+| 362 | Mandara | tbf | tbf | 10.01 |
+| 363 | Mandjak | mfv | mfv | 9.55 |
+| 364 | Manggarai | mqy | mqy | 10.5 |
+| 365 | Manipuri | mni | mni | 44.46 |
+| 366 | Mansoanka | msw | msw | 9.32 |
+| 367 | Manx | gv | glv | 10.07 |
+| 368 | Maori | mi | mri | 18.02 |
+| 369 | Marathi | mr | mar | 156.71 |
+| 370 | Marghi Central | mrt | mrt | 10.36 |
+| 371 | Marghi South | mfm | mfm | 10.05 |
+| 372 | Maria (India) | mrr | mrr | 11.0 |
+| 373 | Marwari (Pakistan) | mve | mve | 9.96 |
+| 374 | Masana | mcn | mcn | 10.09 |
+| 375 | Masikoro Malagasy | msh | msh | 14.16 |
+| 376 | Matsés | mcf | mcf | 9.61 |
+| 377 | Mazaltepec Zapotec | zpy | zpy | 9.47 |
+| 378 | Mazatlán Mazatec | vmz | vmz | 9.82 |
+| 379 | Mazatlán Mixe | mzl | mzl | 10.05 |
+| 380 | Mbe | mfo | mfo | 10.24 |
+| 381 | Mbo (Cameroon) | mbo | mbo | 9.51 |
+| 382 | Mbum | mdd | mdd | 9.82 |
+| 383 | Medumba | byv | byv | 10.95 |
+| 384 | Mekeo | mek | mek | 9.18 |
+| 385 | Meru | mer | mer | 9.89 |
+| 386 | Mesopotamian Arabic | acm | acm | 3.78 |
+| 387 | Mewari | mtr | mtr | 10.58 |
+| 388 | Min Nan Chinese | nan | nan | 17.55 |
+| 389 | Mingrelian | xmf | xmf | 11.47 |
+| 390 | Mitlatongo Mixtec | vmm | vmm | 9.95 |
+| 391 | Miya | mkf | mkf | 10.16 |
+| 392 | Mokpwe | bri | bri | 7.53 |
+| 393 | Moksha | mdf | mdf | 0.47 |
+| 394 | Mom Jango | ver | ver | 10.93 |
+| 395 | Mongolian | mn | mon | 269.08 |
+| 396 | Moroccan Arabic | ary | ary | 104.67 |
+| 397 | Motu | meu | meu | 9.88 |
+| 398 | Mpiemo | mcx | mcx | 9.88 |
+| 399 | Mpumpong | mgg | mgg | 4.94 |
+| 400 | Mundang | mua | mua | 9.2 |
+| 401 | Mungaka | mhk | mhk | 7.53 |
+| 402 | Musey | mse | mse | 7.21 |
+| 403 | Musgu | mug | mug | 4.74 |
+| 404 | Musi | mui | mui | 10.52 |
+| 405 | Naba | mne | mne | 10.37 |
+| 406 | Najdi Arabic | ars | ars | 203.54 |
+| 407 | Nalik | nal | nal | 10.33 |
+| 408 | Nawdm | nmz | nmz | 6.3 |
+| 409 | Ndonga | ng | ndo | 9.08 |
+| 410 | Neapolitan | nap | nap | 9.97 |
+| 411 | Nepali | npi | npi | 171.5 |
+| 412 | Ngamo | nbh | nbh | 10.04 |
+| 413 | Ngas | anc | anc | 10.14 |
+| 414 | Ngiemboon | nnh | nnh | 16.15 |
+| 415 | Ngizim | ngi | ngi | 10.06 |
+| 416 | Ngomba | jgo | jgo | 10.15 |
+| 417 | Ngombale | nla | nla | 8.79 |
+| 418 | Nigerian Fulfulde | fuv | fuv | 9.97 |
+| 419 | Nigerian Pidgin | pcm | pcm | 11.04 |
+| 420 | Nimadi | noe | noe | 11.12 |
+| 421 | Nobiin | fia | fia | 9.96 |
+| 422 | North Mesopotamian Arabic | ayp | ayp | 10.92 |
+| 423 | North Moluccan Malay | max | max | 9.43 |
+| 424 | Northern Betsimisaraka Malagasy | bmm | bmm | 19.12 |
+| 425 | Northern Hindko | hno | hno | 20.04 |
+| 426 | Northern Kurdish | kmr | kmr | 69.59 |
+| 427 | Northern Pame | pmq | pmq | 10.24 |
+| 428 | Northern Pashto | pbu | pbu | 11.03 |
+| 429 | Northern Uzbek | uzn | uzn | 15.23 |
+| 430 | Northwest Gbaya | gya | gya | 8.45 |
+| 431 | Norwegian | no | nor | 3849.8 |
+| 432 | Norwegian Bokmål | nb | nob | 12.7 |
+| 433 | Norwegian Nynorsk | nn | nno | 1.54 |
+| 434 | Notsi | ncf | ncf | 9.84 |
+| 435 | Nyankpa | yes | yes | 10.26 |
+| 436 | Nyungwe | nyu | nyu | 8.98 |
+| 437 | Nzanyi | nja | nja | 10.02 |
+| 438 | Nüpode Huitoto | hux | hux | 9.04 |
+| 439 | Occitan | oc | oci | 16.8 |
+| 440 | Od | odk | odk | 20.26 |
+| 441 | Odia | ory | ory | 144.81 |
+| 442 | Odual | odu | odu | 10.57 |
+| 443 | Omani Arabic | acx | acx | 22.03 |
+| 444 | Orizaba Nahuatl | nlv | nlv | 11.42 |
+| 445 | Orma | orc | orc | 22.01 |
+| 446 | Ormuri | oru | oru | 16.74 |
+| 447 | Oromo | om | orm | 6.6 |
+| 448 | Pahari-Potwari | phr | phr | 24.03 |
+| 449 | Paiwan | pwn | pwn | 13.76 |
+| 450 | Panjabi | pa | pan | 147.37 |
+| 451 | Papuan Malay | pmy | pmy | 10.17 |
+| 452 | Parkari Koli | kvx | kvx | 11.04 |
+| 453 | Pedi | nso | nso | 12.64 |
+| 454 | Pero | pip | pip | 9.85 |
+| 455 | Persian | fa | fas | 366.07 |
+| 456 | Petats | pex | pex | 10.2 |
+| 457 | Phalura | phl | phl | 20.69 |
+| 458 | Piemontese | pms | pms | 16.01 |
+| 459 | Piya-Kwonci | piy | piy | 10.38 |
+| 460 | Plateau Malagasy | plt | plt | 19.39 |
+| 461 | Polish | pl | pol | 911.68 |
+| 462 | Poqomam | poc | poc | 9.63 |
+| 463 | Portuguese | pt | por | 16855.05 |
+| 464 | Pulaar | fuc | fuc | 14.77 |
+| 465 | Pular | fuf | fuf | 13.77 |
+| 466 | Puno Quechua | qxp | qxp | 9.81 |
+| 467 | Pushto | ps | pus | 88.62 |
+| 468 | Pökoot | pko | pko | 10.4 |
+| 469 | Qaqet | byx | byx | 9.79 |
+| 470 | Quiotepec Chinantec | chq | chq | 9.76 |
+| 471 | Rana Tharu | thr | thr | 9.99 |
+| 472 | Rangi | lag | lag | 9.47 |
+| 473 | Rapoisi | kyx | kyx | 9.17 |
+| 474 | Ratahan | rth | rth | 9.34 |
+| 475 | Rayón Zoque | zor | zor | 9.04 |
+| 476 | Romanian | ro | ron | 70.23 |
+| 477 | Romansh | rm | roh | 9.21 |
+| 478 | Rombo | rof | rof | 18.9 |
+| 479 | Rotokas | roo | roo | 9.07 |
+| 480 | Rukai | dru | dru | 9.26 |
+| 481 | Russian | ru | rus | 20338.5 |
+| 482 | Sacapulteco | quv | quv | 8.9 |
+| 483 | Saidi Arabic | aec | aec | 9.28 |
+| 484 | Sakalava Malagasy | skg | skg | 9.02 |
+| 485 | Sakizaya | szy | szy | 11.47 |
+| 486 | Saleman | sau | sau | 10.53 |
+| 487 | Samba Daka | ccg | ccg | 10.11 |
+| 488 | Samba Leko | ndi | ndi | 11.27 |
+| 489 | San Felipe Otlaltepec Popoloca | pow | pow | 8.84 |
+| 490 | San Francisco Del Mar Huave | hue | hue | 9.45 |
+| 491 | San Juan Atzingo Popoloca | poe | poe | 10.01 |
+| 492 | San Martín Itunyoso Triqui | trq | trq | 8.29 |
+| 493 | San Miguel El Grande Mixtec | mig | mig | 9.66 |
+| 494 | Sansi | ssi | ssi | 10.47 |
+| 495 | Sanskrit | sa | san | 84.44 |
+| 496 | Santa Ana de Tusi Pasco Quechua | qxt | qxt | 10.05 |
+| 497 | Santa Catarina Albarradas Zapotec | ztn | ztn | 10.02 |
+| 498 | Santali | sat | sat | 98.37 |
+| 499 | Santiago del Estero Quichua | qus | qus | 9.55 |
+| 500 | Saposa | sps | sps | 9.81 |
+| 501 | Saraiki | skr | skr | 4.13 |
+| 502 | Sardinian | sc | srd | 2.77 |
+| 503 | Saya | say | say | 10.02 |
+| 504 | Sediq | trv | trv | 7.77 |
+| 505 | Serbian | sr | srp | 1855.33 |
+| 506 | Seri | sei | sei | 9.81 |
+| 507 | Shina | scl | scl | 9.84 |
+| 508 | Shona | sn | sna | 9.96 |
+| 509 | Siar-Lak | sjr | sjr | 9.87 |
+| 510 | Sibe | nco | nco | 9.96 |
+| 511 | Sicilian | scn | scn | 13.35 |
+| 512 | Sihuas Ancash Quechua | qws | qws | 10.18 |
+| 513 | Sikkimese | sip | sip | 10.07 |
+| 514 | Sinaugoro | snc | snc | 10.38 |
+| 515 | Sindhi | sd | snd | 46.27 |
+| 516 | Sindhi Bhil | sbn | sbn | 10.53 |
+| 517 | Sinhala | si | sin | 11.98 |
+| 518 | Sinicahua Mixtec | xti | xti | 9.5 |
+| 519 | Sipacapense | qum | qum | 9.37 |
+| 520 | Siwai | siw | siw | 10.47 |
+| 521 | Slovak | sk | slk | 2478.46 |
+| 522 | Slovenian | sl | slv | 1172.61 |
+| 523 | Solos | sol | sol | 9.95 |
+| 524 | Somali | so | som | 13.22 |
+| 525 | Soninke | snk | snk | 10.04 |
+| 526 | South Giziga | giz | giz | 10.03 |
+| 527 | South Ucayali Ashéninka | cpy | cpy | 9.15 |
+| 528 | Southeastern Nochixtlán Mixtec | mxy | mxy | 9.48 |
+| 529 | Southern Betsimisaraka Malagasy | bzc | bzc | 17.45 |
+| 530 | Southern Pashto | pbt | pbt | 11.6 |
+| 531 | Southern Pastaza Quechua | qup | qup | 11.13 |
+| 532 | Soyaltepec Mazatec | vmp | vmp | 10.17 |
+| 533 | Spanish | es | spa | 27559.74 |
+| 534 | Standard Arabic | arb | arb | 1483.53 |
+| 535 | Standard Moroccan Tamazight | zgh | zgh | 1.19 |
+| 536 | Sudanese Arabic | apd | apd | 9.93 |
+| 537 | Sulka | sua | sua | 10.12 |
+| 538 | Svan | sva | sva | 15.11 |
+| 539 | Swahili | sw | swa | 418.41 |
+| 540 | Swedish | sv | swe | 2453.14 |
+| 541 | Tae' | rob | rob | 9.02 |
+| 542 | Tahaggart Tamahaq | thv | thv | 4.25 |
+| 543 | Taita | dav | dav | 9.12 |
+| 544 | Tajik | tg | tgk | 9.23 |
+| 545 | Tamil | ta | tam | 423.09 |
+| 546 | Tandroy-Mahafaly Malagasy | tdx | tdx | 3.81 |
+| 547 | Tangale | tan | tan | 10.14 |
+| 548 | Tanosy Malagasy | txy | txy | 12.07 |
+| 549 | Tarok | yer | yer | 10.08 |
+| 550 | Tatar | tt | tat | 30.03 |
+| 551 | Tedaga | tuq | tuq | 10.0 |
+| 552 | Telugu | te | tel | 230.21 |
+| 553 | Tem | kdh | kdh | 4.07 |
+| 554 | Teop | tio | tio | 9.85 |
+| 555 | Tepeuxila Cuicatec | cux | cux | 7.83 |
+| 556 | Tepinapa Chinantec | cte | cte | 9.54 |
+| 557 | Tera | ttr | ttr | 9.89 |
+| 558 | Terei | buo | buo | 9.48 |
+| 559 | Termanu | twu | twu | 11.45 |
+| 560 | Tesaka Malagasy | tkg | tkg | 17.86 |
+| 561 | Tetelcingo Nahuatl | nhg | nhg | 8.92 |
+| 562 | Teutila Cuicatec | cut | cut | 8.04 |
+| 563 | Thai | th | tha | 10499.77 |
+| 564 | Tibetan | bo | bod | 82.27 |
+| 565 | Tidaá Mixtec | mtx | mtx | 9.09 |
+| 566 | Tidore | tvo | tvo | 10.31 |
+| 567 | Tigak | tgc | tgc | 9.71 |
+| 568 | Tigre | tig | tig | 7.49 |
+| 569 | Tigrinya | ti | tir | 0.08 |
+| 570 | Tilquiapan Zapotec | zts | zts | 9.33 |
+| 571 | Tinputz | tpz | tpz | 9.33 |
+| 572 | Tlacoapa Me'phaa | tpl | tpl | 9.28 |
+| 573 | Tlacoatzintepec Chinantec | ctl | ctl | 10.04 |
+| 574 | Tlingit | tli | tli | 0.41 |
+| 575 | Toki Pona | tok | tok | 13.51 |
+| 576 | Tomoip | tqp | tqp | 10.1 |
+| 577 | Tondano | tdn | tdn | 9.14 |
+| 578 | Tonsea | txs | txs | 9.32 |
+| 579 | Tooro | ttj | ttj | 10.31 |
+| 580 | Torau | ttu | ttu | 9.87 |
+| 581 | Torwali | trw | trw | 14.98 |
+| 582 | Tsimihety Malagasy | xmw | xmw | 11.53 |
+| 583 | Tsotso | lto | lto | 9.77 |
+| 584 | Tswana | tn | tsn | 4.24 |
+| 585 | Tugen | tuy | tuy | 8.79 |
+| 586 | Tuki | bag | bag | 10.97 |
+| 587 | Tula | tul | tul | 9.79 |
+| 588 | Tulu | tcy | tcy | 11.72 |
+| 589 | Tunen | tvu | tvu | 9.85 |
+| 590 | Tungag | lcm | lcm | 9.77 |
+| 591 | Tunisian Arabic | aeb | aeb | 21.63 |
+| 592 | Tupuri | tui | tui | 9.26 |
+| 593 | Turkana | tuv | tuv | 10.17 |
+| 594 | Turkish | tr | tur | 125.36 |
+| 595 | Turkmen | tk | tuk | 2.86 |
+| 596 | Tututepec Mixtec | mtu | mtu | 10.13 |
+| 597 | Twi | tw | twi | 0.25 |
+| 598 | Ubaghara | byc | byc | 11.11 |
+| 599 | Uighur | ug | uig | 428.77 |
+| 600 | Ukrainian | uk | ukr | 1851.97 |
+| 601 | Umbundu | umb | umb | 10.59 |
+| 602 | Upper Sorbian | hsb | hsb | 2.71 |
+| 603 | Urdu | ur | urd | 211.27 |
+| 604 | Ushojo | ush | ush | 6.36 |
+| 605 | Uzbek | uz | uzb | 115.28 |
+| 606 | Vai | vai | vai | 8.76 |
+| 607 | Vietnamese | vi | vie | 8481.98 |
+| 608 | Votic | vot | vot | 0.1 |
+| 609 | Võro | vro | vro | 15.66 |
+| 610 | Waci Gbe | wci | wci | 8.02 |
+| 611 | Wadiyara Koli | kxp | kxp | 20.0 |
+| 612 | Waja | wja | wja | 10.22 |
+| 613 | Wakhi | wbl | wbl | 11.67 |
+| 614 | Wanga | lwg | lwg | 9.36 |
+| 615 | Wapan | juk | juk | 10.22 |
+| 616 | Warji | wji | wji | 11.39 |
+| 617 | Welsh | cy | cym | 131.21 |
+| 618 | Wemale | weo | weo | 9.09 |
+| 619 | Western Frisian | fy | fry | 70.41 |
+| 620 | Western Highland Purepecha | pua | pua | 10.17 |
+| 621 | Western Juxtlahuaca Mixtec | jmx | jmx | 10.01 |
+| 622 | Western Maninkakan | mlq | mlq | 9.83 |
+| 623 | Western Mari | mrj | mrj | 32.26 |
+| 624 | Western Niger Fulfulde | fuh | fuh | 9.69 |
+| 625 | Western Panjabi | pnb | pnb | 10.0 |
+| 626 | Wolof | wo | wol | 8.71 |
+| 627 | Wuzlam | udl | udl | 9.23 |
+| 628 | Xanaguía Zapotec | ztg | ztg | 9.86 |
+| 629 | Xhosa | xh | xho | 13.35 |
+| 630 | Yace | ekr | ekr | 10.76 |
+| 631 | Yakut | sah | sah | 16.08 |
+| 632 | Yalahatan | jal | jal | 11.18 |
+| 633 | Yanahuanca Pasco Quechua | qur | qur | 9.95 |
+| 634 | Yangben | yav | yav | 8.7 |
+| 635 | Yaqui | yaq | yaq | 9.93 |
+| 636 | Yauyos Quechua | qux | qux | 9.35 |
+| 637 | Yekhee | ets | ets | 10.11 |
+| 638 | Yiddish | yi | yid | 1.81 |
+| 639 | Yidgha | ydg | ydg | 9.89 |
+| 640 | Yoruba | yo | yor | 15.66 |
+| 641 | Yutanduchi Mixtec | mab | mab | 9.26 |
+| 642 | Zacatlán-Ahuacatlán-Tepetzintla Nahuatl | nhi | nhi | 0.05 |
+| 643 | Zarma | dje | dje | 10.72 |
+| 644 | Zaza | zza | zza | 1.52 |
+| 645 | Zulu | zu | zul | 14.83 |
+| 646 | Ömie | aom | aom | 8.19 |
+*646 languages, 581k hours total.*
+Data source: [docs/lang_id_name_map.tsv](lang_id_name_map.tsv)

docs/tips.md ADDED Viewed

	@@ -0,0 +1,10 @@

+# Tips & Notes
+- **Combination of `ref_audio` and `instruct`**:
+  When both `ref_audio` and `instruct` are provided and they **conflict**, the model will most likely follow the style of the reference audio. When the two are **consistent**, `instruct` can improve cloning stability for the attributes it describes. A typical example is **Chinese dialect cloning**: provide both dialect reference audio and a matching dialect instruct (e.g., `ref_audio="sichuan.wav", instruct="四川话"`) for more stable dialect output.
+- **Short Audio Generation**:
+  The model may not reliably generate short audio clips (e.g., 1–2 seconds) without reference audio. If you need to generate short clips, provide reference audio to the model.
+- **Min Nan Chinese (Hokkien) Input Format**:
+  Min Nan Chinese (闽南语, also known as Hokkien) can only be synthesized using [Tai-lo romanization](https://en.wikipedia.org/wiki/T%C3%A2i-l%C3%B4) as input; Chinese characters are not supported for Min Nan Chinese in the current model version.

docs/training.md ADDED Viewed

	@@ -0,0 +1,102 @@

+# Training
+## Training Config
+All training is controlled by a JSON training config file and a JSON data config file.
+See [examples/config/](../examples/config/) for ready-to-use configs.
+Training config file on Emilia is: [examples/config/train_config_emilia.json](../examples/config/train_config_emilia.json)
+Data config file for Emilia is: [examples/config/data_config_emilia.json](../examples/config/data_config_emilia.json)
+Key fields in training config file:
+| Field | Description | Default |
+|---|---|---|
+| `llm_name_or_path` | Local LLM path or Hugging Face model ID | Qwen/Qwen3-0.6B |
+| `steps` | Total training steps | 300,000 |
+| `learning_rate` | Peak learning rate | 1e-4 |
+| `batch_tokens` | Tokens per batch on each GPU | 8192 |
+| `attn_implementation` | Attention backend: `"flex_attention"` or `"sdpa"` | `"flex_attention"` |
+`output_dir` and `data_config` are passed via command line (see below).
+## Attention Implementation
+By default, training uses `flex_attention`, which requires PyTorch ≥ 2.5 and a compatible GPU (e.g. NVIDIA Ampere or newer). If your environment does not support `flex_attention`, set `attn_implementation` to `"sdpa"` in your training config. See [examples/config/train_config_finetune_sdpa.json](../examples/config/train_config_finetune_sdpa.json) for a ready-to-use SDPA config:
+```json
+{
+    "attn_implementation": "sdpa",
+    "max_sample_tokens": 2000,
+    "min_sample_tokens": 50,
+    "max_batch_size": 64
+}
+```
+`"sdpa"` uses PyTorch's built-in scaled dot-product attention and works on a wider range of hardware.
+The following fields only apply when `attn_implementation != "flex_attention"`:
+| Field | Description | Default |
+|---|---|---|
+| `max_sample_tokens` | Maximum token length per sample; longer samples are dropped | 2000 |
+| `min_sample_tokens` | Minimum token length per sample; shorter samples are dropped | 50 |
+| `max_batch_size` | Cap on the number of samples per batch | 64 |
+`batch_tokens` remains the primary control for memory usage — it sets the total token budget per batch. `max_batch_size` is a safety guard to prevent a batch of many short samples from creating an unusually large batch dimension.
+### Batching strategy
+The two backends use **different batching strategies**, which are selected automatically:
+| Backend | Batching strategy | Batch shape | Notes |
+|---|---|---|---|
+| `flex_attention` | Sequence packing | `[1, C, batch_tokens]` | Multiple samples concatenated into one long sequence; document boundaries tracked via `document_ids` |
+| `sdpa` | Length-grouped padding | `[B, C, max_len]` | Samples with similar token lengths are grouped into the same batch and padded to the local maximum length |
+**Why different strategies?**
+- With `flex_attention`, sequence packing is memory-efficient because a compact `BlockMask` (not a dense matrix) describes which tokens can attend to each other across document boundaries.
+- With `sdpa`, length-grouped padding is used instead: samples of similar token lengths are batched together and padded to the local maximum, so a lightweight `[B, 1, max_len, max_len]` boolean attention mask suffices with low overhead and minimal wasted padding.
+## Launching Training
+```bash
+accelerate launch \
+    --gpu_ids "0,1,2,3,4,5,6,7" \
+    --num_processes 8 \
+    -m omnivoice.cli.train \
+    --train_config config/train_config_emilia.json \
+    --data_config config/data_config_emilia.json \
+    --output_dir exp/omnivoice_emilia
+```
+## Resuming Training
+Set `resume_from_checkpoint` in your training config to resume from an existing checkpoint:
+```json
+{
+    "resume_from_checkpoint": "exp/omnivoice/checkpoint-100000"
+}
+```
+## Initializing from a Pretrained Model
+To start training from a pretrained OmniVoice checkpoint (for fine-tuning):
+```json
+{
+    "init_from_checkpoint": "exp/omnivoice/checkpoint-100000"
+}
+```
+## Monitoring
+Training logs to TensorBoard:
+```bash
+tensorboard --logdir exp/omnivoice_emilia/tensorboard
+```

docs/voice-design.md ADDED Viewed

	@@ -0,0 +1,129 @@

+# Voice Design
+Voice Design mode lets you describe the desired speaker through speaker attributes (`instruct` parameter) — no reference audio needed. The model
+generates a matching voice on the fly.
+## Quick Example
+```python
+import torch
+from omnivoice import OmniVoice
+model = OmniVoice.from_pretrained(
+    "k2-fsa/OmniVoice",
+    device_map="cuda:0",
+    dtype=torch.float16
+)
+audio = model.generate(
+    text="This is a test for voice design.",
+    instruct="female, young adult, high pitch, british accent",
+)
+```
+## How It Works
+The `instruct` parameter accepts a comma-separated string of speaker attributes.
+Each attribute belongs to a **category** (gender, age, pitch, style, accent,
+or dialect). Within a category, only one attribute may be selected at a time.
+Attributes from different categories can be freely combined.
+The model auto-detects the language of the instruct text and normalises it
+internally — you can write in English, Chinese, or a mix of both.
+## Supported Attributes
+### Gender
+| English | Chinese |
+|---------|---------|
+| male | 男 |
+| female | 女 |
+### Age
+| English | Chinese |
+|---------|---------|
+| child | 儿童 |
+| teenager | 少年 |
+| young adult | 青年 |
+| middle-aged | 中年 |
+| elderly | 老年 |
+### Pitch
+| English | Chinese |
+|---------|---------|
+| very low pitch | 极低音调 |
+| low pitch | 低音调 |
+| moderate pitch | 中音调 |
+| high pitch | 高音调 |
+| very high pitch | 极高音调 |
+### Style
+| English | Chinese |
+|---------|---------|
+| whisper | 耳语 |
+### English Accent
+Only effective when the synthesis text is in English.
+| Accent |
+|--------|
+| american accent |
+| british accent |
+| australian accent |
+| canadian accent |
+| indian accent |
+| chinese accent |
+| korean accent |
+| japanese accent |
+| portuguese accent |
+| russian accent |
+### Chinese Dialect
+Only effective when the synthesis text is in Chinese.
+| Dialect |
+|---------|
+| 河南话 |
+| 陕西话 |
+| 四川话 |
+| 贵州话 |
+| 云南话 |
+| 桂林话 |
+| 济南话 |
+| 石家庄话 |
+| 甘肃话 |
+| 宁夏话 |
+| 青岛话 |
+| 东北话 |
+## Writing Instruct Strings
+Separate attributes with commas (half-width `,` for English, full-width `，`
+for Chinese — the model auto-fixes mismatches).
+```
+# English
+"female, young adult, high pitch, british accent"
+# Chinese
+"女，青年，高音调，四川话"
+# Mixed (auto-normalised)
+"female, young adult, 四川话"
+```
+### Tips
+- **Combine freely** across categories: `"male, elderly, low pitch, whisper"`.
+- **Leave it to the model**: omit attributes you don't care about — the model
+  fills in the rest. For example `"female"` alone is valid.
+- **Case-insensitive**: `"Male"`, `"MALE"`, and `"male"` are all accepted; the code normalizes them to lowercase.
+- **Accent vs Dialect**: English accents are only applied to English speech; Chinese dialects are only applied to Chinese speech.
+- **Attribute combinations**: Due to training data limitations, some attribute combinations may not work well — the model may ignore certain attributes in a combination. If the output doesn't match your expectation, try simplifying the instruct string.

examples/README.md ADDED Viewed

	@@ -0,0 +1,120 @@

+# OmniVoice Examples
+This directory contains scripts and configs for training, fine-tuning, and evaluating OmniVoice.
+| Use Case | Script | Description |
+|---|---|---|
+| Training from scratch | [run_emilia.sh](run_emilia.sh) | Full pipeline on the Emilia dataset (data check, tokenization, training) |
+| Fine-tuning | [run_finetune.sh](run_finetune.sh) | Fine-tune from a pretrained checkpoint using your own JSONL data |
+| Evaluation | [run_eval.sh](run_eval.sh) | Evaluate WER, speaker similarity, and UTMOS on standard test sets |
+---
+## Training from Scratch (Emilia)
+[run_emilia.sh](run_emilia.sh) runs the full pipeline in 3 stages:
+| Stage | What it does |
+|---|---|
+| 0 | Verify the Emilia dataset and JSONL manifests are in place |
+| 1 | Tokenize audio into WebDataset shards |
+| 2 | Launch multi-GPU training with `accelerate` |
+**Prerequisites:**
+1. Download the Emilia dataset from [OpenXLab](https://openxlab.org.cn/datasets/Amphion/Emilia) and place it under `download/`:
+   ```
+   download/Amphion___Emilia
+   └── raw
+       ├── EN
+       └── ZH
+   ```
+2. Obtain JSONL manifests and place them in `data/emilia/manifests/`:
+   - `emilia_en_train.jsonl`, `emilia_en_dev.jsonl`
+   - `emilia_zh_train.jsonl`, `emilia_zh_dev.jsonl`
+   You can generate them from the raw data, or download pre-processed manifests from [HuggingFace](https://huggingface.co/datasets/zhu-han/Emilia-Manifests).
+**Run the full pipeline:**
+```bash
+bash examples/run_emilia.sh
+```
+Or run individual stages by setting `stage` and `stop_stage` at the top of the script (e.g. `stage=1`, `stop_stage=1` to only tokenize).
+> See [docs/training.md](../docs/training.md) for config details, checkpoint resuming, and TensorBoard monitoring.
+---
+## Fine-tuning
+[run_finetune.sh](run_finetune.sh) fine-tunes from a pretrained checkpoint on your own data.
+### Step 1: Prepare Your Data
+Create a JSONL manifest where each line describes one audio sample:
+```jsonl
+{"id": "sample_001", "audio_path": "/data/audio/001.wav", "text": "Hello world", "language_id": "en"}
+{"id": "sample_002", "audio_path": "/data/audio/002.wav", "text": "你好世界", "language_id": "zh"}
+```
+`id`, `audio_path`, and `text` are mandatory. `language_id` is optional.
+> See [docs/data_preparation.md](../docs/data_preparation.md) for the full data format specification.
+### Step 2: Configure the Script
+Edit the variables at the top of `run_finetune.sh`:
+```bash
+TRAIN_JSONL="data/my_data_train.jsonl"   # path to training JSONL
+DEV_JSONL="data/my_data_dev.jsonl"       # path to dev JSONL
+GPU_IDS="0,1"                            # GPUs to use
+NUM_GPUS=2
+OUTPUT_DIR="exp/omnivoice_finetune"      # output directory
+```
+### Step 3: Run
+```bash
+bash examples/run_finetune.sh
+```
+The script will:
+1. Tokenize your audio into WebDataset shards
+2. Launch fine-tuning with `accelerate`
+The main differences between the fine-tuning config ([config/train_config_finetune.json](config/train_config_finetune.json)) and the Emilia training config ([config/train_config_emilia.json](config/train_config_emilia.json)) are:
+| Parameter | Emilia (from scratch) | Fine-tune | Why |
+|---|---|---|---|
+| `init_from_checkpoint` | `null` | `"k2-fsa/OmniVoice"` | Load pretrained weights |
+| `steps` | 300,000 | 5,000 | Fewer steps for fine-tuning; can be tuned according to your data/task |
+| `learning_rate` | 1e-4 | 1e-5 | Lower LR for fine-tuning; can be tuned according to your data/task |
+To use a different pretrained checkpoint, modify `init_from_checkpoint` in the config file.
+If you encounter issues with `flex_attention` on your GPU, use [config/train_config_finetune_sdpa.json](config/train_config_finetune_sdpa.json) instead, which uses SDPA attention for broader compatibility. See [docs/training.md](../docs/training.md#attention-implementation) for details.
+---
+## Evaluation
+Install evaluation dependencies first:
+```bash
+pip install omnivoice[eval]
+# or
+uv sync --extra eval
+```
+Supported test sets: `librispeech_pc`, `seedtts_en`, `seedtts_zh`, `fleurs`, `minimax`.
+```bash
+bash examples/run_eval.sh
+```
+> See [docs/evaluation.md](../docs/evaluation.md) for metrics details, test set preparation, and running individual metrics.

examples/config/data_config_emilia.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+    "train":
+    [
+        {
+            "language_id": "en",
+            "manifest_path": [
+                "data/emilia/tokens/emilia_en_train/data.lst"
+            ],
+            "repeat": 1
+        },
+        {
+            "language_id": "zh",
+            "manifest_path": [
+                 "data/emilia/tokens/emilia_zh_train/data.lst"
+            ],
+            "repeat": 1
+        }
+    ],
+    "dev":
+    [
+        {
+            "language_id": "en",
+            "manifest_path": [
+                 "data/emilia/tokens/emilia_en_dev/data.lst"
+            ],
+            "repeat": 1
+        },
+        {
+            "language_id": "zh",
+            "manifest_path": [
+                 "data/emilia/tokens/emilia_zh_dev/data.lst"
+            ],
+            "repeat": 1
+        }
+    ]
+}

examples/config/data_config_finetune.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "train": [
+        {
+            "manifest_path": ["data/finetune/tokens/train/data.lst"]
+        }
+    ],
+    "dev": [
+        {
+            "manifest_path": ["data/finetune/tokens/dev/data.lst"]
+        }
+    ]
+}

examples/config/ds_config_zero2.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+    "steps_per_print": 100,
+    "zero_optimization": {
+        "stage": 2,
+        "allgather_partitions": true,
+        "allgather_bucket_size": 2e8,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 2e8,
+        "contiguous_gradients": true
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "bf16": {
+        "enabled": "auto"
+    }
+}

examples/config/train_config_emilia.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+    "llm_name_or_path": "Qwen/Qwen3-0.6B",
+    "audio_vocab_size": 1025,
+    "audio_mask_id": 1024,
+    "num_audio_codebook": 8,
+    "audio_codebook_weights": [8, 8, 6, 6, 4, 4, 2, 2],
+    "drop_cond_ratio": 0.1,
+    "prompt_ratio_range": [0.0, 0.3],
+    "mask_ratio_range": [0.0, 1.0],
+    "language_ratio": 0.0,
+    "use_pinyin_ratio": 0.0,
+    "instruct_ratio": 0.0,
+    "only_instruct_ratio": 0.0,
+    "resume_from_checkpoint": null,
+    "init_from_checkpoint": null,
+    "learning_rate": 1e-4,
+    "weight_decay": 0.01,
+    "max_grad_norm": 1.0,
+    "steps": 300000,
+    "seed": 42,
+    "warmup_type": "ratio",
+    "warmup_ratio": 0.03,
+    "warmup_steps": 0,
+    "batch_tokens": 8192,
+    "gradient_accumulation_steps": 1,
+    "num_workers": 4,
+    "mixed_precision": "bf16",
+    "allow_tf32": true,
+    "logging_steps": 100,
+    "eval_steps": 1000,
+    "save_steps": 10000,
+    "keep_last_n_checkpoints": -1
+}

examples/config/train_config_finetune.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+    "llm_name_or_path": "Qwen/Qwen3-0.6B",
+    "audio_vocab_size": 1025,
+    "audio_mask_id": 1024,
+    "num_audio_codebook": 8,
+    "audio_codebook_weights": [8, 8, 6, 6, 4, 4, 2, 2],
+    "drop_cond_ratio": 0.1,
+    "prompt_ratio_range": [0.0, 0.3],
+    "mask_ratio_range": [0.0, 1.0],
+    "language_ratio": 0.8,
+    "use_pinyin_ratio": 0.0,
+    "instruct_ratio": 0.0,
+    "only_instruct_ratio": 0.0,
+    "resume_from_checkpoint": null,
+    "init_from_checkpoint": "k2-fsa/OmniVoice",
+    "learning_rate": 1e-5,
+    "weight_decay": 0.01,
+    "max_grad_norm": 1.0,
+    "steps": 5000,
+    "seed": 42,
+    "warmup_type": "ratio",
+    "warmup_ratio": 0.01,
+    "warmup_steps": 0,
+    "batch_tokens": 8192,
+    "gradient_accumulation_steps": 1,
+    "num_workers": 2,
+    "mixed_precision": "bf16",
+    "allow_tf32": true,
+    "logging_steps": 50,
+    "eval_steps": 500,
+    "save_steps": 500,
+    "keep_last_n_checkpoints": -1
+}

examples/config/train_config_finetune_sdpa.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+    "llm_name_or_path": "Qwen/Qwen3-0.6B",
+    "audio_vocab_size": 1025,
+    "audio_mask_id": 1024,
+    "num_audio_codebook": 8,
+    "audio_codebook_weights": [8, 8, 6, 6, 4, 4, 2, 2],
+    "drop_cond_ratio": 0.1,
+    "prompt_ratio_range": [0.0, 0.3],
+    "mask_ratio_range": [0.0, 1.0],
+    "language_ratio": 0.8,
+    "use_pinyin_ratio": 0.0,
+    "instruct_ratio": 0.0,
+    "only_instruct_ratio": 0.0,
+    "resume_from_checkpoint": null,
+    "init_from_checkpoint": "k2-fsa/OmniVoice",
+    "learning_rate": 1e-5,
+    "weight_decay": 0.01,
+    "max_grad_norm": 1.0,
+    "steps": 5000,
+    "seed": 42,
+    "warmup_type": "ratio",
+    "warmup_ratio": 0.01,
+    "warmup_steps": 0,
+    "batch_tokens": 8192,
+    "gradient_accumulation_steps": 1,
+    "num_workers": 2,
+    "mixed_precision": "bf16",
+    "allow_tf32": true,
+    "attn_implementation": "sdpa",
+    "max_sample_tokens": 2000,
+    "min_sample_tokens": 50,
+    "max_batch_size": 64,
+    "logging_steps": 50,
+    "eval_steps": 500,
+    "save_steps": 500,
+    "keep_last_n_checkpoints": -1
+}

examples/config/train_config_multilingual.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+    "llm_name_or_path": "Qwen/Qwen3-0.6B",
+    "audio_vocab_size": 1025,
+    "audio_mask_id": 1024,
+    "num_audio_codebook": 8,
+    "audio_codebook_weights": [8, 8, 6, 6, 4, 4, 2, 2],
+    "drop_cond_ratio": 0.1,
+    "prompt_ratio_range": [0.0, 0.3],
+    "mask_ratio_range": [0.0, 1.0],
+    "language_ratio": 0.8,
+    "use_pinyin_ratio": 0.3,
+    "instruct_ratio": 1.0,
+    "only_instruct_ratio": 0.5,
+    "resume_from_checkpoint": null,
+    "init_from_checkpoint": null,
+    "learning_rate": 1e-4,
+    "weight_decay": 0.01,
+    "max_grad_norm": 1.0,
+    "steps": 2000000,
+    "seed": 42,
+    "warmup_type": "ratio",
+    "warmup_ratio": 0.03,
+    "warmup_steps": 0,
+    "batch_tokens": 8192,
+    "gradient_accumulation_steps": 1,
+    "num_workers": 4,
+    "mixed_precision": "bf16",
+    "allow_tf32": true,
+    "logging_steps": 100,
+    "eval_steps": 1000,
+    "save_steps": 10000,
+    "keep_last_n_checkpoints": -1
+}

examples/run_emilia.sh ADDED Viewed

	@@ -0,0 +1,115 @@

+#!/bin/bash
+# This script demonstrates how to run the full training pipeline on the Emilia dataset.
+set -euo pipefail
+stage=0
+stop_stage=2
+# ====== Modify as needed ======
+# GPUs to use
+GPU_IDS="0,1,2,3,4,5,6,7"
+NUM_GPUS=8
+# Download directory for raw Emilia data
+dl_dir="download"
+# Directory containing JSONL manifests for train/dev splits
+# Stage 0 will check for the presence of the following files:
+#   data/emilia/manifests/emilia_en_train.jsonl
+#   data/emilia/manifests/emilia_en_dev.jsonl
+#   data/emilia/manifests/emilia_zh_train.jsonl
+#   data/emilia/manifests/emilia_zh_dev.jsonl
+MANIFEST_DIR="data/emilia/manifests"
+# Directory to write tokenized WebDataset shards
+TOKEN_DIR="data/emilia/tokens"
+# Audio tokenizer model (HuggingFace repo or local path)
+TOKENIZER_PATH="eustlb/higgs-audio-v2-tokenizer"
+# Training config file
+TRAIN_CONFIG="config/train_config_emilia.json"
+# Data config file
+data_config="config/data_config_emilia.json"
+# Output directory for checkpoints
+OUTPUT_DIR="exp/omnivoice_emilia"
+# =================================
+export PYTHONPATH="$(cd "$(dirname "$0")/.." && pwd):${PYTHONPATH:-}"
+# Stage 0: Download data
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+    echo "Stage 0: Download data"
+    # You should manually download the Emilia dataset from
+    # https://openxlab.org.cn/datasets/Amphion/Emilia
+    # or https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07
+    # and place it in the download directory.
+    # Your download directory should at least contain the following structure:
+    #
+    #    download/Amphion___Emilia
+    #    ├── raw
+    #    │   ├── EN
+    #    │   └── ZH
+    if [ ! -d "$dl_dir"/Amphion___Emilia/raw ]; then
+        echo "Please refer https://openxlab.org.cn/datasets/Amphion/Emilia to download the dataset."
+        exit 1
+    fi
+    # We require JSONL manifests for the training and dev splits. You can
+    # either generate them yourself using the raw data and the provided
+    # metadata, or download our processed JSONL manifests from HuggingFace.
+    # https://huggingface.co/datasets/zhu-han/Emilia-Manifests
+    #
+    # Place them as data/emilia/manifests/{emilia_en_train,emilia_en_dev,emilia_zh_train,emilia_zh_dev}.jsonl
+    for split in emilia_en_dev emilia_zh_dev emilia_en_train emilia_zh_train; do
+        if [ ! -f "${MANIFEST_DIR}/${split}.jsonl" ]; then
+            echo "Please download the manifest for ${split} and place it in ${MANIFEST_DIR}/${split}.jsonl"
+            exit 1
+        fi
+    done
+    echo "  Done. All manifests and data are in place."
+fi
+# Stage 1: Tokenize splits into directories matching data_config_emilia.json
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+    echo "Stage 1: Tokenizing audio"
+    for split in emilia_en_dev emilia_zh_dev emilia_en_train emilia_zh_train; do
+        echo "  Tokenizing ${split} from ${MANIFEST_DIR}/${split}.jsonl"
+        CUDA_VISIBLE_DEVICES=${GPU_IDS} \
+            python -m omnivoice.scripts.extract_audio_tokens \
+            --input_jsonl "${MANIFEST_DIR}/${split}.jsonl" \
+            --tar_output_pattern "${TOKEN_DIR}/${split}/audios/shard-%06d.tar" \
+            --jsonl_output_pattern "${TOKEN_DIR}/${split}/txts/shard-%06d.jsonl" \
+            --tokenizer_path "${TOKENIZER_PATH}" \
+            --nj_per_gpu 3 \
+            --shuffle True
+        echo "  Done. Tokens written to ${TOKEN_DIR}/${split}"
+    done
+fi
+# Stage 2: Train
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+    echo "Stage 2: Training"
+    accelerate launch \
+        --gpu_ids "${GPU_IDS}" \
+        --num_processes ${NUM_GPUS} \
+        -m omnivoice.cli.train \
+        --train_config ${TRAIN_CONFIG} \
+        --data_config ${data_config} \
+        --output_dir ${OUTPUT_DIR}
+fi

examples/run_eval.sh ADDED Viewed

	@@ -0,0 +1,283 @@

+#!/bin/bash
+# Evaluate OmniVoice models on TTS benchmarks.
+# Stage 1: Download the test sets and evaluation models.
+# Stage 2: LibriSpeech-PC
+# Stage 3: seedtts_en
+# Stage 4: seedtts_zh
+# Stage 5: minimax
+# Stage 6: fleurs
+set -euo pipefail
+# Specify the stages to run by setting the `stage` and `stop_stage` variables.
+stage=1
+stop_stage=6
+# Available GPUs for evaluation. Adjust this according to your setup.
+export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+# Specify the checkpoint to evaluate.
+CHECKPOINT=k2-fsa/OmniVoice
+emilia_checkpoint=false
+# CHECKPOINT=k2-fsa/OmniVoice
+# emilia_checkpoint=true
+# For the OmniVoice-Emilia checkpoint, we set denoise to False and lang_id to None,
+# as the model is trained without prompt denoising or language id.
+if [ "${emilia_checkpoint}" = true ]; then
+    infer_options="--preprocess_prompt False \
+        --postprocess_output False \
+        --batch_duration 600 \
+        --denoise False \
+        --lang_id None \
+        --audio_chunk_threshold 1000"
+else
+    infer_options="--preprocess_prompt False \
+        --postprocess_output False \
+        --batch_duration 600 \
+        --audio_chunk_threshold 1000"
+fi
+export PYTHONPATH="$(cd "$(dirname "$0")/.." && pwd):${PYTHONPATH:-}"
+download_dir="download"
+TTS_EVAL_MODEL_DIR="${download_dir}/tts_eval_models/"
+TTS_EVAL_DATA_DIR="${download_dir}/tts_eval_datasets/"
+# Map test_name to its test.jsonl path.
+get_test_list() {
+    # Print the test-list JSONL path for the benchmark named in $1.
+    #
+    # Arguments:
+    #   $1 - benchmark name: librispeech_pc, seedtts_en, seedtts_zh, minimax or fleurs.
+    # Output:
+    #   The path under ${TTS_EVAL_DATA_DIR} on stdout. For an unrecognized name
+    #   an empty line is printed and the function returns status 1.
+    #
+    # NOTE: TTS_EVAL_DATA_DIR is defined with a trailing slash, so the printed
+    # paths contain a double slash ("…/tts_eval_datasets//file.jsonl").
+    case "$1" in
+        librispeech_pc) echo "${TTS_EVAL_DATA_DIR}/librispeech_pc_test_clean.jsonl" ;;
+        seedtts_en)     echo "${TTS_EVAL_DATA_DIR}/seedtts_test_en.jsonl" ;;
+        seedtts_zh)     echo "${TTS_EVAL_DATA_DIR}/seedtts_test_zh.jsonl" ;;
+        minimax)        echo "${TTS_EVAL_DATA_DIR}/minimax_multilingual_24.jsonl" ;;
+        fleurs)         echo "${TTS_EVAL_DATA_DIR}/fleurs_multilingual_102.jsonl" ;;
+        *)              echo ""; return 1 ;;
+    esac
+}
+# ============================================================
+# Stage 1: Prepare the test sets and evaluation models
+# ============================================================
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "Stage 1: Download test sets and evaluation models"
+    hf_repo=k2-fsa/TTS_eval_datasets
+    mkdir -p ${TTS_EVAL_DATA_DIR}/
+    for file in \
+        librispeech_pc_test_clean.jsonl \
+        librispeech_pc_test_clean_transcript.jsonl \
+        seedtts_test_en.jsonl \
+        seedtts_test_zh.jsonl \
+        minimax_multilingual_24.jsonl \
+        fleurs_multilingual_102.jsonl; do
+        echo "Downloading ${file}..."
+        huggingface-cli download \
+                --repo-type dataset \
+                --local-dir ${TTS_EVAL_DATA_DIR}/ \
+                ${hf_repo} \
+                ${file}
+    done
+    for file in \
+        librispeech_pc_testset.tar.gz \
+        seedtts_testset.tar.gz \
+        minimax_multilingual_24.tar.gz \
+        fleurs_multilingual_102.tar.gz; do
+        echo "Downloading ${file}..."
+        huggingface-cli download \
+                --repo-type dataset \
+                --local-dir ${TTS_EVAL_DATA_DIR}/ \
+                ${hf_repo} \
+                ${file}
+        echo "Extracting ${file}..."
+        tar -xzf ${TTS_EVAL_DATA_DIR}/${file} -C ${TTS_EVAL_DATA_DIR}/
+    done
+    echo "Download all evaluation models"
+    hf_repo=k2-fsa/TTS_eval_models
+    mkdir -p ${TTS_EVAL_MODEL_DIR}
+    huggingface-cli download \
+        --local-dir ${TTS_EVAL_MODEL_DIR} \
+        ${hf_repo}
+fi
+# ============================================================
+# Stage 2: Evaluation on LibriSpeech-PC
+# ============================================================
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "Stage 2: Evaluation on LibriSpeech-PC"
+    wav_path="results/librispeech_pc"
+    test_jsonl="$(get_test_list librispeech_pc)"
+    transcript_jsonl="${TTS_EVAL_DATA_DIR}/librispeech_pc_test_clean_transcript.jsonl"
+    python -m omnivoice.cli.infer_batch \
+        --model "${CHECKPOINT}" \
+        --test_list "${test_jsonl}" \
+        --res_dir "${wav_path}" ${infer_options}
+    python -m omnivoice.eval.speaker_similarity.sim \
+        --wav-path "${wav_path}" \
+        --test-list "${test_jsonl}" \
+        --decode-path "${wav_path}.sim.log" \
+        --model-dir "${TTS_EVAL_MODEL_DIR}"
+    python -m omnivoice.eval.wer.hubert \
+        --wav-path "${wav_path}" \
+        --test-list "${transcript_jsonl}" \
+        --decode-path "${wav_path}.wer.log" \
+        --model-dir "${TTS_EVAL_MODEL_DIR}"
+    python -m omnivoice.eval.mos.utmos \
+        --wav-path "${wav_path}" \
+        --test-list "${test_jsonl}" \
+        --decode-path "${wav_path}.mos.log" \
+        --model-dir "${TTS_EVAL_MODEL_DIR}"
+fi
+# ============================================================
+# Stage 3: Evaluation on Seed-TTS en
+# ============================================================
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "Stage 3: Evaluation on Seed-TTS en"
+    wav_path="results/seedtts_en"
+    test_jsonl="$(get_test_list seedtts_en)"
+    python -m omnivoice.cli.infer_batch \
+        --model "${CHECKPOINT}" \
+        --test_list "${test_jsonl}" \
+        --res_dir "${wav_path}"  ${infer_options}
+    python -m omnivoice.eval.speaker_similarity.sim \
+        --wav-path "${wav_path}" \
+        --test-list "${test_jsonl}" \
+        --decode-path "${wav_path}.sim.log" \
+        --model-dir "${TTS_EVAL_MODEL_DIR}"
+    python -m omnivoice.eval.wer.seedtts \
+        --wav-path "${wav_path}" \
+        --test-list "${test_jsonl}" \
+        --decode-path "${wav_path}.wer.log" \
+        --model-dir "${TTS_EVAL_MODEL_DIR}" \
+        --lang en
+    python -m omnivoice.eval.mos.utmos \
+        --wav-path "${wav_path}" \
+        --test-list "${test_jsonl}" \
+        --decode-path "${wav_path}.mos.log" \
+        --model-dir "${TTS_EVAL_MODEL_DIR}"
+fi
+# ============================================================
+# Stage 4: Evaluation on Seed-TTS zh
+# ============================================================
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "Stage 4: Evaluation on Seed-TTS zh"
+    wav_path="results/seedtts_zh"
+    test_jsonl="$(get_test_list seedtts_zh)"
+    python -m omnivoice.cli.infer_batch \
+        --model "${CHECKPOINT}" \
+        --test_list "${test_jsonl}" \
+        --res_dir "${wav_path}"  ${infer_options}
+    python -m omnivoice.eval.speaker_similarity.sim \
+        --wav-path "${wav_path}" \
+        --test-list "${test_jsonl}" \
+        --decode-path "${wav_path}.sim.log" \
+        --model-dir "${TTS_EVAL_MODEL_DIR}"
+    python -m omnivoice.eval.wer.seedtts \
+        --wav-path "${wav_path}" \
+        --test-list "${test_jsonl}" \
+        --decode-path "${wav_path}.wer.log" \
+        --model-dir "${TTS_EVAL_MODEL_DIR}" \
+        --lang zh
+    python -m omnivoice.eval.mos.utmos \
+        --wav-path "${wav_path}" \
+        --test-list "${test_jsonl}" \
+        --decode-path "${wav_path}.mos.log" \
+        --model-dir "${TTS_EVAL_MODEL_DIR}"
+fi
+# ============================================================
+# Stage 5: Evaluation on MiniMax multilingual
+# ============================================================
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    echo "Stage 5: Evaluation on MiniMax multilingual"
+    wav_path="results/minimax"
+    test_jsonl="$(get_test_list minimax)"
+    python -m omnivoice.cli.infer_batch \
+        --model "${CHECKPOINT}" \
+        --test_list "${test_jsonl}" \
+        --res_dir "${wav_path}"  ${infer_options}
+    python -m omnivoice.eval.speaker_similarity.sim \
+        --wav-path "${wav_path}" \
+        --test-list "${test_jsonl}" \
+        --decode-path "${wav_path}.sim.log" \
+        --model-dir "${TTS_EVAL_MODEL_DIR}"
+    python -m omnivoice.eval.wer.minimax \
+        --wav-path "${wav_path}" \
+        --test-list "${test_jsonl}" \
+        --decode-path "${wav_path}.wer.log" \
+        --model-dir "${TTS_EVAL_MODEL_DIR}"
+fi
+# ============================================================
+# Stage 6: Evaluation on FLEURS multilingual
+# ============================================================
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    echo "Stage 6: Evaluation on FLEURS multilingual"
+    # Directory that receives the synthesized audio; the *.sim.log / *.wer.log
+    # result files are written next to it.
+    wav_path="results/fleurs"
+    test_jsonl="$(get_test_list fleurs)"
+    # ${infer_options} is intentionally left unquoted so that it word-splits
+    # into separate command-line flags.
+    python -m omnivoice.cli.infer_batch \
+        --model "${CHECKPOINT}" \
+        --test_list "${test_jsonl}" \
+        --res_dir "${wav_path}"  ${infer_options}
+    python -m omnivoice.eval.speaker_similarity.sim \
+        --wav-path "${wav_path}" \
+        --test-list "${test_jsonl}" \
+        --decode-path "${wav_path}.sim.log" \
+        --model-dir "${TTS_EVAL_MODEL_DIR}"
+    # Evaluation on FLEURS requires omnilingual-asr, whose dependencies conflict
+    # with other packages in this project (at least the transformers package).
+    # To evaluate on FLEURS, we suggest setting up a separate virtual
+    # environment in which to install omnilingual-asr. Installation instructions
+    # can be found at https://github.com/facebookresearch/omnilingual-asr
+    #
+    # Resolve the repository root from this script's location (the same way
+    # PYTHONPATH is derived above) instead of from ${PWD}, so the path does not
+    # depend on the caller's working directory, and quote it so that paths
+    # containing whitespace are not word-split.
+    repo_root="$(cd "$(dirname "$0")/.." && pwd)"
+    python "${repo_root}/omnivoice/eval/wer/fleurs.py" \
+        --wav-path "${wav_path}" \
+        --test-list "${test_jsonl}" \
+        --decode-path "${wav_path}.wer.log" \
+        --model-card omniASR_LLM_Unlimited_7B_v2 \
+        --chunk-size 100 \
+        --batch-size 50
+fi

examples/run_finetune.sh ADDED Viewed

	@@ -0,0 +1,85 @@

+#!/bin/bash
+# This script demonstrates how to fine-tune OmniVoice from a JSONL manifest.
+set -euo pipefail
+stage=0
+stop_stage=1
+# ====== Modify as needed ======
+# GPUs to use
+GPU_IDS="0,1"
+NUM_GPUS=2
+# Path to your input JSONL file
+# (each line: {"id": ..., "audio_path": ..., "text": ..., "language_id": ...})
+TRAIN_JSONL="/home/riftuser/OmniVoice/sync_data/data/train_raw.jsonl"
+# Path to your dev JSONL file. Set to empty string to skip dev set.
+DEV_JSONL="/home/riftuser/OmniVoice/sync_data/data/dev_raw.jsonl"
+# Directory to write tokenized WebDataset shards
+TOKEN_DIR="/home/riftuser/OmniVoice/sync_data/tokens"
+# Audio tokenizer model (HuggingFace repo or local path)
+TOKENIZER_PATH="eustlb/higgs-audio-v2-tokenizer"
+# Training config file
+# If you encounter issues with flex_attention on your GPU, use the SDPA config instead:
+# TRAIN_CONFIG="config/train_config_finetune_sdpa.json"
+TRAIN_CONFIG="/home/riftuser/OmniVoice/sync_data/configs/config.json"
+# Data config file
+data_config="/home/riftuser/OmniVoice/sync_data/configs/data_saudi.json"
+# Output directory for fine-tuned checkpoints
+OUTPUT_DIR="/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune"
+# =================================
+export PYTHONPATH="$(cd "$(dirname "$0")/.." && pwd):${PYTHONPATH:-}"
+# Stage 0: Tokenize audio into WebDataset shards
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+    echo "Stage 0: Tokenizing audio"
+    # Iterate over the split names and look up the matching manifest path,
+    # rather than looping over the unquoted path variables. This keeps paths
+    # containing whitespace intact and makes the split label independent of
+    # comparing path strings (previously identical train/dev paths were both
+    # written to the "train" split).
+    for split in train dev; do
+        if [ "${split}" = "train" ]; then
+            split_jsonl_path="${TRAIN_JSONL}"
+        else
+            split_jsonl_path="${DEV_JSONL}"
+        fi
+        # An empty path disables the split (e.g. DEV_JSONL="" skips the dev set).
+        if [ -z "${split_jsonl_path}" ]; then
+            continue
+        fi
+        echo "  Tokenizing ${split} from ${split_jsonl_path}"
+        # Shards are written to ${TOKEN_DIR}/<split>/audios (tar) and
+        # ${TOKEN_DIR}/<split>/txts (jsonl).
+        CUDA_VISIBLE_DEVICES=${GPU_IDS} \
+            python -m omnivoice.scripts.extract_audio_tokens \
+            --input_jsonl "${split_jsonl_path}" \
+            --tar_output_pattern "${TOKEN_DIR}/${split}/audios/shard-%06d.tar" \
+            --jsonl_output_pattern "${TOKEN_DIR}/${split}/txts/shard-%06d.jsonl" \
+            --tokenizer_path "${TOKENIZER_PATH}" \
+            --nj_per_gpu 3 \
+            --shuffle True
+        echo "  Done. Manifest written to ${TOKEN_DIR}/${split}/data.lst"
+    done
+fi
+# Stage 1: Fine-tune
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+    echo "Stage 1: Fine-tuning"
+    accelerate launch \
+        --gpu_ids "${GPU_IDS}" \
+        --num_processes ${NUM_GPUS} \
+        -m omnivoice.cli.train \
+        --train_config ${TRAIN_CONFIG} \
+        --data_config ${data_config} \
+        --output_dir ${OUTPUT_DIR}
+fi

exp_v1/omnivoice_finetune/checkpoint-4500/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,89 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

exp_v1/omnivoice_finetune/checkpoint-4500/config.json ADDED Viewed

	@@ -0,0 +1,101 @@

+{
+  "architectures": [
+    "OmniVoice"
+  ],
+  "audio_codebook_weights": [
+    8,
+    8,
+    6,
+    6,
+    4,
+    4,
+    2,
+    2
+  ],
+  "audio_mask_id": 1024,
+  "audio_vocab_size": 1025,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "llm_config": {
+    "_name_or_path": "",
+    "architectures": [
+      "Qwen3ForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "chunk_size_feed_forward": 0,
+    "dtype": "float32",
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 40960,
+    "max_window_layers": 28,
+    "model_type": "qwen3",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 8,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "pad_token_id": null,
+    "problem_type": null,
+    "return_dict": true,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "rope_theta": 1000000,
+      "rope_type": "default"
+    },
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vocab_size": 151676
+  },
+  "model_type": "omnivoice",
+  "num_audio_codebook": 8,
+  "pad_token_id": 151643,
+  "transformers_version": "5.3.0"
+}

exp_v1/omnivoice_finetune/checkpoint-4500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|denoise|>",
+    "<|lang_start|>",
+    "<|lang_end|>",
+    "<|instruct_start|>",
+    "<|instruct_end|>",
+    "<|text_start|>",
+    "<|text_end|>"
+  ],
+  "is_local": true,
+  "local_files_only": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

exp_v1/omnivoice_finetune/checkpoint-4500/train_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+    "output_dir": "/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune",
+    "data_config": "/home/riftuser/OmniVoice/sync_data/configs/data_saudi.json",
+    "llm_name_or_path": "Qwen/Qwen3-0.6B",
+    "audio_vocab_size": 1025,
+    "audio_mask_id": 1024,
+    "num_audio_codebook": 8,
+    "audio_codebook_weights": [
+        8,
+        8,
+        6,
+        6,
+        4,
+        4,
+        2,
+        2
+    ],
+    "drop_cond_ratio": 0.1,
+    "prompt_ratio_range": [
+        0.0,
+        0.3
+    ],
+    "mask_ratio_range": [
+        0.0,
+        1.0
+    ],
+    "language_ratio": 0.8,
+    "use_pinyin_ratio": 0.0,
+    "instruct_ratio": 0.0,
+    "only_instruct_ratio": 0.0,
+    "resume_from_checkpoint": null,
+    "init_from_checkpoint": "oddadmix/lahgtna-omnivoice-v2",
+    "learning_rate": 1e-05,
+    "weight_decay": 0.01,
+    "max_grad_norm": 1.0,
+    "steps": 5000,
+    "seed": 42,
+    "lr_scheduler_type": "cosine",
+    "warmup_type": "ratio",
+    "warmup_ratio": 0.01,
+    "warmup_steps": 0,
+    "batch_tokens": 4096,
+    "gradient_accumulation_steps": 2,
+    "num_workers": 3,
+    "mixed_precision": "bf16",
+    "allow_tf32": true,
+    "use_deepspeed": false,
+    "deepspeed_config": null,
+    "attn_implementation": "sdpa",
+    "max_sample_tokens": 2000,
+    "min_sample_tokens": 50,
+    "max_batch_size": 64,
+    "logging_steps": 50,
+    "eval_steps": 500,
+    "save_steps": 500,
+    "keep_last_n_checkpoints": -1
+}

exp_v1/omnivoice_finetune/checkpoint-500/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,89 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

exp_v1/omnivoice_finetune/checkpoint-500/config.json ADDED Viewed

	@@ -0,0 +1,101 @@

+{
+  "architectures": [
+    "OmniVoice"
+  ],
+  "audio_codebook_weights": [
+    8,
+    8,
+    6,
+    6,
+    4,
+    4,
+    2,
+    2
+  ],
+  "audio_mask_id": 1024,
+  "audio_vocab_size": 1025,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "llm_config": {
+    "_name_or_path": "",
+    "architectures": [
+      "Qwen3ForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "chunk_size_feed_forward": 0,
+    "dtype": "float32",
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 40960,
+    "max_window_layers": 28,
+    "model_type": "qwen3",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 8,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "pad_token_id": null,
+    "problem_type": null,
+    "return_dict": true,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "rope_theta": 1000000,
+      "rope_type": "default"
+    },
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vocab_size": 151676
+  },
+  "model_type": "omnivoice",
+  "num_audio_codebook": 8,
+  "pad_token_id": 151643,
+  "transformers_version": "5.3.0"
+}

exp_v1/omnivoice_finetune/checkpoint-500/train_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+    "output_dir": "/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune",
+    "data_config": "/home/riftuser/OmniVoice/sync_data/configs/data_saudi.json",
+    "llm_name_or_path": "Qwen/Qwen3-0.6B",
+    "audio_vocab_size": 1025,
+    "audio_mask_id": 1024,
+    "num_audio_codebook": 8,
+    "audio_codebook_weights": [
+        8,
+        8,
+        6,
+        6,
+        4,
+        4,
+        2,
+        2
+    ],
+    "drop_cond_ratio": 0.1,
+    "prompt_ratio_range": [
+        0.0,
+        0.3
+    ],
+    "mask_ratio_range": [
+        0.0,
+        1.0
+    ],
+    "language_ratio": 0.8,
+    "use_pinyin_ratio": 0.0,
+    "instruct_ratio": 0.0,
+    "only_instruct_ratio": 0.0,
+    "resume_from_checkpoint": null,
+    "init_from_checkpoint": "oddadmix/lahgtna-omnivoice-v2",
+    "learning_rate": 1e-05,
+    "weight_decay": 0.01,
+    "max_grad_norm": 1.0,
+    "steps": 5000,
+    "seed": 42,
+    "lr_scheduler_type": "cosine",
+    "warmup_type": "ratio",
+    "warmup_ratio": 0.01,
+    "warmup_steps": 0,
+    "batch_tokens": 4096,
+    "gradient_accumulation_steps": 2,
+    "num_workers": 3,
+    "mixed_precision": "bf16",
+    "allow_tf32": true,
+    "use_deepspeed": false,
+    "deepspeed_config": null,
+    "attn_implementation": "sdpa",
+    "max_sample_tokens": 2000,
+    "min_sample_tokens": 50,
+    "max_batch_size": 64,
+    "logging_steps": 50,
+    "eval_steps": 500,
+    "save_steps": 500,
+    "keep_last_n_checkpoints": -1
+}

exp_v1/omnivoice_finetune/checkpoint-5000/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,89 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}

exp_v1/omnivoice_finetune/checkpoint-5000/config.json ADDED Viewed

	@@ -0,0 +1,101 @@

+{
+  "architectures": [
+    "OmniVoice"
+  ],
+  "audio_codebook_weights": [
+    8,
+    8,
+    6,
+    6,
+    4,
+    4,
+    2,
+    2
+  ],
+  "audio_mask_id": 1024,
+  "audio_vocab_size": 1025,
+  "bos_token_id": null,
+  "dtype": "float32",
+  "eos_token_id": 151645,
+  "llm_config": {
+    "_name_or_path": "",
+    "architectures": [
+      "Qwen3ForCausalLM"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "chunk_size_feed_forward": 0,
+    "dtype": "float32",
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 40960,
+    "max_window_layers": 28,
+    "model_type": "qwen3",
+    "num_attention_heads": 16,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 8,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "pad_token_id": null,
+    "problem_type": null,
+    "return_dict": true,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "rope_theta": 1000000,
+      "rope_type": "default"
+    },
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vocab_size": 151676
+  },
+  "model_type": "omnivoice",
+  "num_audio_codebook": 8,
+  "pad_token_id": 151643,
+  "transformers_version": "5.3.0"
+}

exp_v1/omnivoice_finetune/checkpoint-5000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|denoise|>",
+    "<|lang_start|>",
+    "<|lang_end|>",
+    "<|instruct_start|>",
+    "<|instruct_end|>",
+    "<|text_start|>",
+    "<|text_end|>"
+  ],
+  "is_local": true,
+  "local_files_only": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

exp_v1/omnivoice_finetune/checkpoint-5000/train_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+    "output_dir": "/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune",
+    "data_config": "/home/riftuser/OmniVoice/sync_data/configs/data_saudi.json",
+    "llm_name_or_path": "Qwen/Qwen3-0.6B",
+    "audio_vocab_size": 1025,
+    "audio_mask_id": 1024,
+    "num_audio_codebook": 8,
+    "audio_codebook_weights": [
+        8,
+        8,
+        6,
+        6,
+        4,
+        4,
+        2,
+        2
+    ],
+    "drop_cond_ratio": 0.1,
+    "prompt_ratio_range": [
+        0.0,
+        0.3
+    ],
+    "mask_ratio_range": [
+        0.0,
+        1.0
+    ],
+    "language_ratio": 0.8,
+    "use_pinyin_ratio": 0.0,
+    "instruct_ratio": 0.0,
+    "only_instruct_ratio": 0.0,
+    "resume_from_checkpoint": null,
+    "init_from_checkpoint": "oddadmix/lahgtna-omnivoice-v2",
+    "learning_rate": 1e-05,
+    "weight_decay": 0.01,
+    "max_grad_norm": 1.0,
+    "steps": 5000,
+    "seed": 42,
+    "lr_scheduler_type": "cosine",
+    "warmup_type": "ratio",
+    "warmup_ratio": 0.01,
+    "warmup_steps": 0,
+    "batch_tokens": 4096,
+    "gradient_accumulation_steps": 2,
+    "num_workers": 3,
+    "mixed_precision": "bf16",
+    "allow_tf32": true,
+    "use_deepspeed": false,
+    "deepspeed_config": null,
+    "attn_implementation": "sdpa",
+    "max_sample_tokens": 2000,
+    "min_sample_tokens": 50,
+    "max_batch_size": 64,
+    "logging_steps": 50,
+    "eval_steps": 500,
+    "save_steps": 500,
+    "keep_last_n_checkpoints": -1
+}

exp_v1/omnivoice_finetune/initial_config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+    "output_dir": "/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune",
+    "data_config": "/home/riftuser/OmniVoice/sync_data/configs/data_saudi.json",
+    "llm_name_or_path": "Qwen/Qwen3-0.6B",
+    "audio_vocab_size": 1025,
+    "audio_mask_id": 1024,
+    "num_audio_codebook": 8,
+    "audio_codebook_weights": [
+        8,
+        8,
+        6,
+        6,
+        4,
+        4,
+        2,
+        2
+    ],
+    "drop_cond_ratio": 0.1,
+    "prompt_ratio_range": [
+        0.0,
+        0.3
+    ],
+    "mask_ratio_range": [
+        0.0,
+        1.0
+    ],
+    "language_ratio": 0.8,
+    "use_pinyin_ratio": 0.0,
+    "instruct_ratio": 0.0,
+    "only_instruct_ratio": 0.0,
+    "resume_from_checkpoint": null,
+    "init_from_checkpoint": "oddadmix/lahgtna-omnivoice-v2",
+    "learning_rate": 1e-05,
+    "weight_decay": 0.01,
+    "max_grad_norm": 1.0,
+    "steps": 5000,
+    "seed": 42,
+    "lr_scheduler_type": "cosine",
+    "warmup_type": "ratio",
+    "warmup_ratio": 0.01,
+    "warmup_steps": 0,
+    "batch_tokens": 4096,
+    "gradient_accumulation_steps": 2,
+    "num_workers": 3,
+    "mixed_precision": "bf16",
+    "allow_tf32": true,
+    "use_deepspeed": false,
+    "deepspeed_config": null,
+    "attn_implementation": "sdpa",
+    "max_sample_tokens": 2000,
+    "min_sample_tokens": 50,
+    "max_batch_size": 64,
+    "logging_steps": 50,
+    "eval_steps": 500,
+    "save_steps": 500,
+    "keep_last_n_checkpoints": -1
+}

infer.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import time
+from omnivoice import OmniVoice
+import soundfile as sf
+import torch
+model = OmniVoice.from_pretrained(
+    "/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune/checkpoint-500",
+    device_map="cuda:0",
+    dtype=torch.float16
+)
+saudi_tts_text = """
+السلام عليكم يا شباب، كيف الحال؟
+اليوم عندي تقريبًا 3 meetings مهمة، وأول meeting بتبدأ الساعة 10:30 الصباح. [sigh] [sigh]
+بصراحة كنت ناوي أخلص الـ report بدري، لكن الـ internet صار بطيء بشكل مو طبيعي. [dissatisfaction-hnn] [sigh] [dissatisfaction-hnn]
+قلت خلاص، خلني آخذ coffee وأروق شوي قبل ما أبدأ الشغل. [laughter] [laughter] [confirmation-en]
+وبعدين اكتشفت إن الـ laptop يحتاج update من أمس! [surprise-oh] [dissatisfaction-hnn]
+قلت يا ساتر، شكله يوم طويل جدًا. [sigh] [laughter]
+لكن الحمد لله الأمور مشت تمام بالنهاية.
+"""
+# Measure generation time
+start_time = time.time()
+audio = model.generate(
+    text=saudi_tts_text,
+    ref_audio="ref_audio/women_ref_1.mp3",
+    ref_text="شوفي يا حلوة هالكريم الجديد للبشرة، يخلي وجهك مثل القمر! ",
+    instruct = "female, young adult, high pitch",
+    speed = 1.1,
+    num_step = 25,
+    guidance_scale=2.0,
+    t_shift=0.1,
+    position_temperature=3,
+    layer_penalty_factor=5.0,
+)
+generation_time = time.time() - start_time
+# Save audio
+sf.write("out_1.wav", audio[0], 24000)
+# Calculate audio duration
+audio_duration = len(audio[0]) / 24000
+# Calculate RTF
+rtf = generation_time / audio_duration
+print(f"Generation Time: {generation_time:.2f} sec")
+print(f"Audio Duration: {audio_duration:.2f} sec")
+print(f"RTF: {rtf:.4f}")

omnivoice/__init__.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import warnings
+from importlib.metadata import PackageNotFoundError, version
+# Silence known-noisy warnings from third-party dependencies. These filters are
+# installed before the package's own model import further down this file,
+# presumably so they are already active when those dependencies get imported.
+# Ignore every warning category attributed to torchaudio modules.
+warnings.filterwarnings("ignore", module="torchaudio")
+# Ignore only the "invalid escape sequence" SyntaxWarning attributed to pydub.utils.
+warnings.filterwarnings(
+    "ignore",
+    category=SyntaxWarning,
+    message="invalid escape sequence",
+    module="pydub.utils",
+)
+# Ignore FutureWarning attributed to torch's DDP communication-hook modules.
+warnings.filterwarnings(
+    "ignore",
+    category=FutureWarning,
+    module="torch.distributed.algorithms.ddp_comm_hooks",
+)
+# Expose the installed distribution's version; fall back to a placeholder when
+# no "omnivoice" distribution metadata can be found.
+try:
+    __version__ = version("omnivoice")
+except PackageNotFoundError:
+    __version__ = "0.0.0"
+from omnivoice.models.omnivoice import (
+    OmniVoice,
+    OmniVoiceConfig,
+    OmniVoiceGenerationConfig,
+)
+__all__ = ["OmniVoice", "OmniVoiceConfig", "OmniVoiceGenerationConfig"]

prepare_sync_data.py ADDED Viewed

	@@ -0,0 +1,90 @@

+import argparse
+import io
+import json
+import random
+import shutil
+from pathlib import Path
+import soundfile as sf
+from datasets import Audio, load_dataset
+from tqdm import tqdm
+DEFAULT_REPO = "saleh1312/syncing_data"
+MAX_DURATION = 10.0
+def main():
+    parser = argparse.ArgumentParser(description="Prepare data for OmniVoice Training")
+    parser.add_argument("--repo", default=DEFAULT_REPO, help="HF Dataset ID")
+    parser.add_argument("--out", default="sync_data", help="Output directory")
+    parser.add_argument("--seed", type=int, default=42, help="Random seed")
+    args = parser.parse_args()
+    out_root = Path(args.out).resolve()
+    out_root = out_root / "data"
+    if out_root.exists():
+        print(f"Cleaning up old directory: {out_root}")
+        shutil.rmtree(out_root)
+    wav_dir = out_root / "wavs"
+    wav_dir.mkdir(parents=True)
+    print(f"Loading dataset: {args.repo}")
+    ds = load_dataset(args.repo, split="train")
+    ds = ds.cast_column("audio", Audio(decode=False))
+    processed_records = []
+    skipped = 0
+    print("Processing audio files...")
+    for i, row in enumerate(tqdm(ds)):
+        audio_data = row["audio"]["bytes"]
+        if not audio_data:
+            continue
+        # Load audio to check duration
+        with io.BytesIO(audio_data) as f:
+            data, sr = sf.read(f)
+        duration = len(data) / sr
+        if duration > MAX_DURATION:
+            skipped += 1
+            continue
+        # Ensure Mono
+        if data.ndim > 1:
+            data = data.mean(axis=1)
+        sample_id = f"sample_{i:06d}"
+        wav_path = wav_dir / f"{sample_id}.wav"
+        sf.write(wav_path, data, sr, subtype='PCM_16')
+        tone = str(row.get("tone", "neutral")).strip().lower()
+        processed_records.append({
+            "id": sample_id,
+            "audio_path": str(wav_path.resolve()),
+            "text": row["text"],
+            "language_id": "ar",
+            "instruct": f"saudi, conversational, {tone}"
+        })
+    random.seed(args.seed)
+    random.shuffle(processed_records)
+    split_idx = int(len(processed_records) * 0.95)
+    train_data = processed_records[:split_idx]
+    dev_data = processed_records[split_idx:]
+    # Write input files for the tokenization script
+    for name, data in [("train_raw.jsonl", train_data), ("dev_raw.jsonl", dev_data)]:
+        out_path = out_root / name
+        with open(out_path, "w", encoding="utf-8") as f:
+            for rec in data:
+                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+        print(f"Created {out_path} ({len(data)} samples)")
+    print(f"\nPreparation Complete!")
+    print(f"Skipped {skipped} samples (> {MAX_DURATION}s)")
+    print(f"Next: Run the 'extract_audio_tokens.py' script using 'sync_data/train_raw.jsonl'")
+if __name__ == "__main__":
+    main()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,98 @@

+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "omnivoice"
+version = "0.1.5"
+description = "OmniVoice: Towards Omnilingual Zero-Shot Text-to-Speech with Diffusion Language Models"
+readme = "README.md"
+license = "Apache-2.0"
+requires-python = ">=3.10"
+authors = [{name = "Han Zhu"}]
+keywords = [
+    "tts",
+    "text-to-speech",
+    "speech-synthesis",
+    "zero-shot",
+    "multilingual",
+    "diffusion",
+    "voice-cloning",
+]
+classifiers = [
+    "Intended Audience :: Science/Research",
+    "Intended Audience :: Developers",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Multimedia :: Sound/Audio :: Speech",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+]
+dependencies = [
+    "torch>=2.4",
+    "torchaudio>=2.4",
+    "transformers>=5.3.0",
+    "accelerate",
+    "pydub",
+    "gradio",
+    "tensorboardX",
+    "webdataset",
+    "numpy",
+    "soundfile",
+    "librosa",
+    "uvicorn>=0.42.0",
+    "fastapi>=0.135.2",
+]
+[project.optional-dependencies]
+eval = [
+    "jiwer==3.1.0",       # WER
+    "s3prl",               # Speech representation (HuBERT etc.)
+    "funasr",              # ASR models
+    "zhconv",              # Chinese character normalization
+    "zhon",                # Chinese punctuation
+    "unidecode",            # Unicode normalization
+]
+[project.scripts]
+omnivoice-infer = "omnivoice.cli.infer:main"
+omnivoice-infer-batch = "omnivoice.cli.infer_batch:main"
+omnivoice-demo = "omnivoice.cli.demo:main"
+[project.urls]
+Homepage = "https://github.com/k2-fsa/OmniVoice"
+Repository = "https://github.com/k2-fsa/OmniVoice"
+"Bug Tracker" = "https://github.com/k2-fsa/OmniVoice/issues"
+[tool.uv.sources]
+# Install PyTorch with CUDA support on Linux/Windows (CUDA doesn't exist for Mac).
+# NOTE: We must explicitly request them as `dependencies` above. These improved
+# versions will not be selected if they're only third-party dependencies.
+torch = [
+  { index = "pytorch-cuda", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+]
+torchaudio = [
+  { index = "pytorch-cuda", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+]
+[[tool.uv.index]]
+name = "pytorch-cuda"
+# Use PyTorch built for NVIDIA Toolkit version 12.8.
+# Available versions: https://pytorch.org/get-started/locally/
+url = "https://download.pytorch.org/whl/cu128"
+# Only use this index when explicitly requested by `tool.uv.sources`.
+explicit = true
+[tool.uv]
+constraint-dependencies = [
+    "torch==2.8.0",
+    "torchaudio==2.8.0",
+]
+[tool.hatch.build.targets.sdist]
+include = ["omnivoice"]
+[tool.hatch.build.targets.wheel]
+packages = ["omnivoice"]

ref_audio/women_ref_1.mp3 ADDED Viewed

Binary file (79 kB). View file

upload_to_hf.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from huggingface_hub import HfApi, create_repo
+import os
+# =========================
+# CONFIG
+# =========================
+repo_name = "OmniVoice_sync_data_and_code"
+username = "TTS-ORG"
+local_path = os.path.expanduser("~/OmniVoice")
+repo_id = f"{username}/{repo_name}"
+# =========================
+# INIT
+# =========================
+api = HfApi()
+# =========================
+# CREATE REPO
+# =========================
+create_repo(
+    repo_id=repo_id,
+    repo_type="model",
+    exist_ok=True,
+)
+print(f"Repo ready: {repo_id}")
+# =========================
+# UPLOAD LARGE FOLDER
+# =========================
+api.upload_large_folder(
+    folder_path=local_path,
+    repo_id=repo_id,
+    repo_type="model",
+    # VERY IMPORTANT
+    ignore_patterns=[
+        ".git/*",
+        ".venv/*",
+        "__pycache__/*",
+        "*.pyc",
+        # huge optimizer states
+        "*.bin",
+        # cache/temp
+        "*.tmp",
+        "*.log",
+        # optional
+        "out.wav",
+    ],
+)
+print("Upload completed successfully 🚀")

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff