Add files using upload-large-folder tool
Browse files- .github/ISSUE_TEMPLATE/bug_report.yml +50 -0
- .github/ISSUE_TEMPLATE/config.yml +1 -0
- .github/ISSUE_TEMPLATE/feature_request.yml +62 -0
- .github/ISSUE_TEMPLATE/help_wanted.yml +52 -0
- .github/ISSUE_TEMPLATE/question.yml +26 -0
- .gitignore +30 -0
- LICENSE +201 -0
- README.md +322 -0
- api.py +277 -0
- docs/OmniVoice.ipynb +144 -0
- docs/community-projects.md +46 -0
- docs/data_preparation.md +182 -0
- docs/data_preparation_advanced.md +67 -0
- docs/evaluation.md +48 -0
- docs/generation-parameters.md +68 -0
- docs/lang_id_name_map.tsv +647 -0
- docs/languages.md +659 -0
- docs/tips.md +10 -0
- docs/training.md +102 -0
- docs/voice-design.md +129 -0
- examples/README.md +120 -0
- examples/config/data_config_emilia.json +36 -0
- examples/config/data_config_finetune.json +12 -0
- examples/config/ds_config_zero2.json +19 -0
- examples/config/train_config_emilia.json +39 -0
- examples/config/train_config_finetune.json +39 -0
- examples/config/train_config_finetune_sdpa.json +43 -0
- examples/config/train_config_multilingual.json +39 -0
- examples/run_emilia.sh +115 -0
- examples/run_eval.sh +283 -0
- examples/run_finetune.sh +85 -0
- exp_v1/omnivoice_finetune/checkpoint-4500/chat_template.jinja +89 -0
- exp_v1/omnivoice_finetune/checkpoint-4500/config.json +101 -0
- exp_v1/omnivoice_finetune/checkpoint-4500/tokenizer_config.json +24 -0
- exp_v1/omnivoice_finetune/checkpoint-4500/train_config.json +57 -0
- exp_v1/omnivoice_finetune/checkpoint-500/chat_template.jinja +89 -0
- exp_v1/omnivoice_finetune/checkpoint-500/config.json +101 -0
- exp_v1/omnivoice_finetune/checkpoint-500/train_config.json +57 -0
- exp_v1/omnivoice_finetune/checkpoint-5000/chat_template.jinja +89 -0
- exp_v1/omnivoice_finetune/checkpoint-5000/config.json +101 -0
- exp_v1/omnivoice_finetune/checkpoint-5000/tokenizer_config.json +24 -0
- exp_v1/omnivoice_finetune/checkpoint-5000/train_config.json +57 -0
- exp_v1/omnivoice_finetune/initial_config.json +57 -0
- infer.py +58 -0
- omnivoice/__init__.py +28 -0
- prepare_sync_data.py +90 -0
- pyproject.toml +98 -0
- ref_audio/women_ref_1.mp3 +0 -0
- upload_to_hf.py +59 -0
- uv.lock +0 -0
.github/ISSUE_TEMPLATE/bug_report.yml
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: "Bug Report"
|
| 2 |
+
description: |
|
| 3 |
+
Please provide as many details as possible to help address the issue more efficiently, including input, output, logs and screenshots.
|
| 4 |
+
labels:
|
| 5 |
+
- bug
|
| 6 |
+
body:
|
| 7 |
+
- type: checkboxes
|
| 8 |
+
attributes:
|
| 9 |
+
label: Checks
|
| 10 |
+
description: "To ensure timely help, please confirm the following:"
|
| 11 |
+
options:
|
| 12 |
+
- label: This template is only for bug reports; usage problems go with 'Help Wanted'.
|
| 13 |
+
required: true
|
| 14 |
+
- label: I have thoroughly reviewed the project documentation but couldn't find information to solve my problem.
|
| 15 |
+
required: true
|
| 16 |
+
- label: I have searched for existing issues, including closed ones, and couldn't find a solution.
|
| 17 |
+
required: true
|
| 18 |
+
- label: I am using English to submit this issue to facilitate community communication.
|
| 19 |
+
required: true
|
| 20 |
+
- type: textarea
|
| 21 |
+
attributes:
|
| 22 |
+
label: Environment Details
|
| 23 |
+
description: "Provide details including OS, GPU info, Python version, any relevant software or dependencies, and training/finetuning configuration (if applicable)."
|
| 24 |
+
placeholder: e.g., Ubuntu 20.04.6 LTS, 4 * H20, Python 3.13, torch==2.8.0+cu128, cuda 12.8
|
| 25 |
+
validations:
|
| 26 |
+
required: true
|
| 27 |
+
- type: textarea
|
| 28 |
+
attributes:
|
| 29 |
+
label: Steps to Reproduce
|
| 30 |
+
description: |
|
| 31 |
+
Include detailed steps, screenshots, and logs. Use the correct markdown syntax for code blocks.
|
| 32 |
+
placeholder: |
|
| 33 |
+
1. Clone the repo and install omnivoice with `uv sync`.
|
| 34 |
+
2. Run the command: `omnivoice-infer --text "This is a test." --ref_audio ref.wav --ref_text "Transcription." --output output.wav`
|
| 35 |
+
3. Got the following error message... (attach full logs).
|
| 36 |
+
4. Upload relevant audio files (e.g., ref.wav, output.wav) as .wav or packed in .zip.
|
| 37 |
+
validations:
|
| 38 |
+
required: true
|
| 39 |
+
- type: textarea
|
| 40 |
+
attributes:
|
| 41 |
+
label: ✔️ Expected Behavior
|
| 42 |
+
placeholder: Describe in detail what you expected to happen.
|
| 43 |
+
validations:
|
| 44 |
+
required: false
|
| 45 |
+
- type: textarea
|
| 46 |
+
attributes:
|
| 47 |
+
label: ❌ Actual Behavior
|
| 48 |
+
placeholder: Describe in detail what actually happened.
|
| 49 |
+
validations:
|
| 50 |
+
required: false
|
.github/ISSUE_TEMPLATE/config.yml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
blank_issues_enabled: false
|
.github/ISSUE_TEMPLATE/feature_request.yml
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: "Feature Request"
|
| 2 |
+
description: |
|
| 3 |
+
Some constructive suggestions and new ideas regarding the current repo.
|
| 4 |
+
labels:
|
| 5 |
+
- enhancement
|
| 6 |
+
body:
|
| 7 |
+
- type: checkboxes
|
| 8 |
+
attributes:
|
| 9 |
+
label: Checks
|
| 10 |
+
description: "To help us grasp quickly, please confirm the following:"
|
| 11 |
+
options:
|
| 12 |
+
- label: This template is only for feature requests.
|
| 13 |
+
required: true
|
| 14 |
+
- label: I have thoroughly reviewed the project documentation but couldn't find any relevant information that meets my needs.
|
| 15 |
+
required: true
|
| 16 |
+
- label: I have searched for existing issues, including closed ones, and found no discussion yet.
|
| 17 |
+
required: true
|
| 18 |
+
- label: I am using English to submit this issue to facilitate community communication.
|
| 19 |
+
required: true
|
| 20 |
+
- type: textarea
|
| 21 |
+
attributes:
|
| 22 |
+
label: 1. Is this request related to a challenge you're experiencing? Tell us your story.
|
| 23 |
+
description: |
|
| 24 |
+
Describe the specific problem or scenario you're facing in detail. For example:
|
| 25 |
+
*"I was trying to use [feature] for [specific task], but encountered [issue]. This was frustrating because...."*
|
| 26 |
+
placeholder: Please describe the situation in as much detail as possible.
|
| 27 |
+
validations:
|
| 28 |
+
required: true
|
| 29 |
+
|
| 30 |
+
- type: textarea
|
| 31 |
+
attributes:
|
| 32 |
+
label: 2. What is your suggested solution?
|
| 33 |
+
description: |
|
| 34 |
+
Provide a clear description of the feature or enhancement you'd like to propose.
|
| 35 |
+
How would this feature solve your issue or improve the project?
|
| 36 |
+
placeholder: Describe your idea or proposed solution here.
|
| 37 |
+
validations:
|
| 38 |
+
required: true
|
| 39 |
+
|
| 40 |
+
- type: textarea
|
| 41 |
+
attributes:
|
| 42 |
+
label: 3. Additional context or comments
|
| 43 |
+
description: |
|
| 44 |
+
Any other relevant information, links, documents, or screenshots that provide clarity.
|
| 45 |
+
Use this section for anything not covered above.
|
| 46 |
+
placeholder: Add any extra details here.
|
| 47 |
+
validations:
|
| 48 |
+
required: false
|
| 49 |
+
|
| 50 |
+
- type: checkboxes
|
| 51 |
+
attributes:
|
| 52 |
+
label: 4. Can you help us with this feature?
|
| 53 |
+
description: |
|
| 54 |
+
Let us know if you're interested in contributing. This is not a commitment but a way to express interest in collaboration.
|
| 55 |
+
options:
|
| 56 |
+
- label: I am interested in contributing to this feature.
|
| 57 |
+
required: false
|
| 58 |
+
|
| 59 |
+
- type: markdown
|
| 60 |
+
attributes:
|
| 61 |
+
value: |
|
| 62 |
+
**Note:** Please submit only one request per issue to keep discussions focused and manageable.
|
.github/ISSUE_TEMPLATE/help_wanted.yml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: "Help Wanted"
|
| 2 |
+
description: |
|
| 3 |
+
Please provide as many details as possible to help address the issue more efficiently, including input, output, logs and screenshots.
|
| 4 |
+
labels:
|
| 5 |
+
- help wanted
|
| 6 |
+
body:
|
| 7 |
+
- type: checkboxes
|
| 8 |
+
attributes:
|
| 9 |
+
label: Checks
|
| 10 |
+
description: "To ensure timely help, please confirm the following:"
|
| 11 |
+
options:
|
| 12 |
+
- label: This template is only for usage issues encountered.
|
| 13 |
+
required: true
|
| 14 |
+
- label: I have thoroughly reviewed the project documentation but couldn't find information to solve my problem.
|
| 15 |
+
required: true
|
| 16 |
+
- label: I have searched for existing issues, including closed ones, and couldn't find a solution.
|
| 17 |
+
required: true
|
| 18 |
+
- label: I am using English to submit this issue to facilitate community communication.
|
| 19 |
+
required: true
|
| 20 |
+
- type: textarea
|
| 21 |
+
attributes:
|
| 22 |
+
label: Environment Details
|
| 23 |
+
description: "Provide details such as OS, Python version, and any relevant software or dependencies."
|
| 24 |
+
placeholder: |
|
| 25 |
+
e.g., macOS 13.5, Python 3.13, torch==2.8.0
|
| 26 |
+
If training or finetuning related, provide detailed configuration including GPU info and training setup.
|
| 27 |
+
validations:
|
| 28 |
+
required: true
|
| 29 |
+
- type: textarea
|
| 30 |
+
attributes:
|
| 31 |
+
label: Steps to Reproduce
|
| 32 |
+
description: |
|
| 33 |
+
Include detailed steps, screenshots, and logs. Provide used prompt wav and text. Use the correct markdown syntax for code blocks.
|
| 34 |
+
placeholder: |
|
| 35 |
+
1. Clone the repo and install omnivoice with `uv sync`.
|
| 36 |
+
2. Run the command: `omnivoice-infer --text "This is a test." --ref_audio ref.wav --ref_text "Transcription." --output output.wav`
|
| 37 |
+
3. Stuck there with the following message... (attach logs and also error msg e.g. after ctrl-c).
|
| 38 |
+
4. Upload relevant audio files (e.g., ref.wav, output.wav) as .wav or packed in .zip.
|
| 39 |
+
validations:
|
| 40 |
+
required: true
|
| 41 |
+
- type: textarea
|
| 42 |
+
attributes:
|
| 43 |
+
label: ✔️ Expected Behavior
|
| 44 |
+
placeholder: Describe what you expected to happen in detail, e.g. output a generated audio.
|
| 45 |
+
validations:
|
| 46 |
+
required: false
|
| 47 |
+
- type: textarea
|
| 48 |
+
attributes:
|
| 49 |
+
label: ❌ Actual Behavior
|
| 50 |
+
placeholder: Describe what actually happened in detail, failure messages, etc.
|
| 51 |
+
validations:
|
| 52 |
+
required: false
|
.github/ISSUE_TEMPLATE/question.yml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: "Question"
|
| 2 |
+
description: |
|
| 3 |
+
Research questions or pure inquiries about the project; usage issues go with "Help Wanted".
|
| 4 |
+
labels:
|
| 5 |
+
- question
|
| 6 |
+
body:
|
| 7 |
+
- type: checkboxes
|
| 8 |
+
attributes:
|
| 9 |
+
label: Checks
|
| 10 |
+
description: "To help us grasp quickly, please confirm the following:"
|
| 11 |
+
options:
|
| 12 |
+
- label: This template is only for research questions, not usage problems, feature requests or bug reports.
|
| 13 |
+
required: true
|
| 14 |
+
- label: I have thoroughly reviewed the project documentation and read the related paper(s).
|
| 15 |
+
required: true
|
| 16 |
+
- label: I have searched for existing issues, including closed ones, and found no similar questions.
|
| 17 |
+
required: true
|
| 18 |
+
- label: I am using English to submit this issue to facilitate community communication.
|
| 19 |
+
required: true
|
| 20 |
+
- type: textarea
|
| 21 |
+
attributes:
|
| 22 |
+
label: Question details
|
| 23 |
+
description: |
|
| 24 |
+
Question details, clearly stated using proper markdown syntax.
|
| 25 |
+
validations:
|
| 26 |
+
required: true
|
.gitignore
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.egg-info/
|
| 5 |
+
*.egg
|
| 6 |
+
dist/
|
| 7 |
+
build/
|
| 8 |
+
.venv/
|
| 9 |
+
.env
|
| 10 |
+
.DS_Store
|
| 11 |
+
.pytest_cache/
|
| 12 |
+
.mypy_cache/
|
| 13 |
+
.ruff_cache/
|
| 14 |
+
*.so
|
| 15 |
+
/.cache*
|
| 16 |
+
/exp*/
|
| 17 |
+
/.tmp/
|
| 18 |
+
/results/
|
| 19 |
+
/data/
|
| 20 |
+
/download
|
| 21 |
+
/local/
|
| 22 |
+
/run*
|
| 23 |
+
example.py
|
| 24 |
+
results/
|
| 25 |
+
examples/data*
|
| 26 |
+
examples/download*
|
| 27 |
+
examples/exp*
|
| 28 |
+
.claude/
|
| 29 |
+
*.wav
|
| 30 |
+
*.jsonl
|
LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright 2026 Xiaomi Corp.
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
README.md
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OmniVoice 🌍
|
| 2 |
+
|
| 3 |
+
<p align="center">
|
| 4 |
+
<img width="200" height="200" alt="OmniVoice" src="https://zhu-han.github.io/omnivoice/pics/omnivoice.jpg" />
|
| 5 |
+
</p>
|
| 6 |
+
|
| 7 |
+
<p align="center">
|
| 8 |
+
<a href="https://huggingface.co/k2-fsa/OmniVoice"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-FFD21E" alt="Hugging Face Model"></a>
|
| 9 |
+
|
| 10 |
+
<a href="https://huggingface.co/spaces/k2-fsa/OmniVoice"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Space-blue" alt="Hugging Face Space"></a>
|
| 11 |
+
|
| 12 |
+
<a href="https://arxiv.org/abs/2604.00688"><img src="https://img.shields.io/badge/arXiv-Paper-B31B1B.svg"></a>
|
| 13 |
+
|
| 14 |
+
<a href="https://zhu-han.github.io/omnivoice"><img src="https://img.shields.io/badge/GitHub.io-Demo_Page-blue?logo=GitHub&style=flat-square"></a>
|
| 15 |
+
|
| 16 |
+
<a href="https://colab.research.google.com/github/k2-fsa/OmniVoice/blob/master/docs/OmniVoice.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>
|
| 17 |
+
</p>
|
| 18 |
+
|
| 19 |
+
OmniVoice is a state-of-the-art massively multilingual zero-shot text-to-speech (TTS) model supporting over 600 languages. Built on a novel diffusion language model-style architecture, it generates high-quality speech with superior inference speed, supporting voice cloning and voice design.
|
| 20 |
+
|
| 21 |
+
**Contents**: [Key Features](#key-features) | [Installation](#installation) | [Quick Start](#quick-start) | [Python API](#python-api) | [Command-Line Tools](#command-line-tools) | [Training & Evaluation](#training--evaluation) | [Discussion](#discussion--communication) | [Citation](#citation)
|
| 22 |
+
|
| 23 |
+
## Key Features
|
| 24 |
+
|
| 25 |
+
- **600+ Languages Supported**: The broadest language coverage among zero-shot TTS models ([full list](docs/languages.md)).
|
| 26 |
+
- **Voice Cloning**: State-of-the-art voice cloning quality.
|
| 27 |
+
- **Voice Design**: Control voices via assigned speaker attributes (gender, age, pitch, dialect/accent, whisper, etc.).
|
| 28 |
+
- **Fine-grained Control**: Non-verbal symbols (e.g., `[laughter]`) and pronunciation correction via pinyin or phonemes.
|
| 29 |
+
- **Fast Inference**: RTF as low as 0.025 (40x faster than real-time).
|
| 30 |
+
- **Diffusion Language Model-style Architecture**: A clean, streamlined, and scalable design that delivers both quality and speed.
|
| 31 |
+
|
| 32 |
+
---
|
| 33 |
+
|
| 34 |
+
## Installation
|
| 35 |
+
|
| 36 |
+
Choose **one** of the following methods: **pip** or **uv**.
|
| 37 |
+
|
| 38 |
+
### pip
|
| 39 |
+
|
| 40 |
+
> We recommend using a fresh virtual environment (e.g., `conda`, `venv`, etc.) to avoid conflicts.
|
| 41 |
+
|
| 42 |
+
**Step 1**: Install PyTorch
|
| 43 |
+
|
| 44 |
+
<details>
|
| 45 |
+
<summary>NVIDIA GPU</summary>
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
# Install pytorch with your CUDA version, e.g.
|
| 49 |
+
pip install torch==2.8.0+cu128 torchaudio==2.8.0+cu128 --extra-index-url https://download.pytorch.org/whl/cu128
|
| 50 |
+
```
|
| 51 |
+
> See the [PyTorch official site](https://pytorch.org/get-started/locally/) for installing other versions.
|
| 52 |
+
|
| 53 |
+
</details>
|
| 54 |
+
|
| 55 |
+
<details>
|
| 56 |
+
<summary>Apple Silicon</summary>
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
pip install torch==2.8.0 torchaudio==2.8.0
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
</details>
|
| 63 |
+
|
| 64 |
+
**Step 2**: Install OmniVoice (choose one)
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
# From PyPI (stable release)
|
| 68 |
+
pip install omnivoice
|
| 69 |
+
|
| 70 |
+
# From the latest source on GitHub (no need to clone)
|
| 71 |
+
pip install git+https://github.com/k2-fsa/OmniVoice.git
|
| 72 |
+
|
| 73 |
+
# For development (clone first, editable install)
|
| 74 |
+
git clone https://github.com/k2-fsa/OmniVoice.git
|
| 75 |
+
cd OmniVoice
|
| 76 |
+
pip install -e .
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
### uv
|
| 80 |
+
|
| 81 |
+
Clone the repository and sync dependencies:
|
| 82 |
+
|
| 83 |
+
```bash
|
| 84 |
+
git clone https://github.com/k2-fsa/OmniVoice.git
|
| 85 |
+
cd OmniVoice
|
| 86 |
+
uv sync
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
> **Tip**: You can use a mirror with `uv sync --default-index "https://mirrors.aliyun.com/pypi/simple"`
|
| 90 |
+
|
| 91 |
+
---
|
| 92 |
+
|
| 93 |
+
## Quick Start
|
| 94 |
+
|
| 95 |
+
Try OmniVoice without coding:
|
| 96 |
+
|
| 97 |
+
- Launch the local web UI: `omnivoice-demo --ip 0.0.0.0 --port 8001`
|
| 98 |
+
|
| 99 |
+
- Or try it directly on [HuggingFace Space](https://huggingface.co/spaces/k2-fsa/OmniVoice)
|
| 100 |
+
|
| 101 |
+
- Or run it in Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/k2-fsa/OmniVoice/blob/master/docs/OmniVoice.ipynb)
|
| 102 |
+
|
| 103 |
+
> If you have trouble connecting to HuggingFace when downloading the pre-trained models, set `export HF_ENDPOINT="https://hf-mirror.com"` before running.
|
| 104 |
+
|
| 105 |
+
For full usage, see the [Python API](#python-api) and [Command-Line Tools](#command-line-tools) sections below.
|
| 106 |
+
|
| 107 |
+
---
|
| 108 |
+
|
| 109 |
+
## Python API
|
| 110 |
+
|
| 111 |
+
OmniVoice supports three generation modes. All features in this section are also available via [command-line tools](#command-line-tools).
|
| 112 |
+
|
| 113 |
+
### Voice Cloning
|
| 114 |
+
|
| 115 |
+
Clone a voice from a short reference audio. Provide `ref_audio` and `ref_text`:
|
| 116 |
+
|
| 117 |
+
```python
|
| 118 |
+
from omnivoice import OmniVoice
|
| 119 |
+
import soundfile as sf
|
| 120 |
+
import torch
|
| 121 |
+
|
| 122 |
+
model = OmniVoice.from_pretrained(
|
| 123 |
+
"k2-fsa/OmniVoice",
|
| 124 |
+
device_map="cuda:0",
|
| 125 |
+
dtype=torch.float16
|
| 126 |
+
)
|
| 127 |
+
# Apple Silicon users: use device_map="mps" instead
|
| 128 |
+
|
| 129 |
+
audio = model.generate(
|
| 130 |
+
text="Hello, this is a test of zero-shot voice cloning.",
|
| 131 |
+
ref_audio="ref.wav",
|
| 132 |
+
ref_text="Transcription of the reference audio.",
|
| 133 |
+
) # audio is a list of `np.ndarray` with shape (T,) at 24 kHz.
|
| 134 |
+
|
| 135 |
+
# If you don't want to input `ref_text` manually, you can directly omit the `ref_text`.
|
| 136 |
+
# The model will use Whisper ASR to auto-transcribe it.
|
| 137 |
+
|
| 138 |
+
sf.write("out.wav", audio[0], 24000)
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
> **Tips**
|
| 142 |
+
>
|
| 143 |
+
> - Use a 3–10 second reference audio clip. Longer audio slows down inference and may degrade cloning quality.
|
| 144 |
+
> - For standard pronunciation, use a reference audio in the **same language** as the target speech. In cross-lingual voice cloning (i.e., the reference audio and target speech are in different languages), the generated speech will carry an accent from the reference audio's language.
|
| 145 |
+
> - For better results with Arabic numerals, normalize them to words first (e.g., "123" → "one hundred twenty-three") with text normalization tools (e.g., [WeTextProcessing](https://github.com/wenet-e2e/WeTextProcessing)).
|
| 146 |
+
>
|
| 147 |
+
> For more tips, see [docs/tips.md](docs/tips.md).
|
| 148 |
+
|
| 149 |
+
### Voice Design
|
| 150 |
+
|
| 151 |
+
Describe the desired voice with speaker attributes — no reference audio needed.
|
| 152 |
+
Supported attributes: **gender** (male/female), **age** (child to elderly),
|
| 153 |
+
**pitch** (very low to very high), **style** (whisper), **English accent**
|
| 154 |
+
(American, British, etc.), and **Chinese dialect** (四川话, 陕西话, etc.).
|
| 155 |
+
Attributes are comma-separated and freely combinable across categories.
|
| 156 |
+
|
| 157 |
+
```python
|
| 158 |
+
audio = model.generate(
|
| 159 |
+
text="Hello, this is a test of zero-shot voice design.",
|
| 160 |
+
instruct="female, low pitch, british accent",
|
| 161 |
+
)
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
> **Note**: Voice design was trained on Chinese and English data only. It can generalize to other languages, but results can be unstable for some low-resource languages.
|
| 165 |
+
|
| 166 |
+
See [docs/voice-design.md](docs/voice-design.md) for the full attribute
|
| 167 |
+
reference, Chinese equivalents, and usage tips.
|
| 168 |
+
|
| 169 |
+
### Auto Voice
|
| 170 |
+
|
| 171 |
+
Let the model choose a voice automatically:
|
| 172 |
+
|
| 173 |
+
```python
|
| 174 |
+
audio = model.generate(text="This is a sentence without any voice prompt.")
|
| 175 |
+
```
|
| 176 |
+
|
| 177 |
+
### Generation Parameters
|
| 178 |
+
|
| 179 |
+
All three modes above share the same `model.generate()` API. You can further control the generation behavior via keyword arguments:
|
| 180 |
+
|
| 181 |
+
```python
|
| 182 |
+
audio = model.generate(
|
| 183 |
+
text="...",
|
| 184 |
+
num_step=32, # diffusion steps (or 16 for faster inference)
|
| 185 |
+
speed=1.0, # speed factor (>1.0 faster, <1.0 slower)
|
| 186 |
+
duration=10.0, # fixed output duration in seconds (overrides speed)
|
| 187 |
+
# ... more options
|
| 188 |
+
)
|
| 189 |
+
```
|
| 190 |
+
See more detailed control in [docs/generation-parameters.md](docs/generation-parameters.md).
|
| 191 |
+
|
| 192 |
+
### Non-Verbal & Pronunciation Control
|
| 193 |
+
|
| 194 |
+
OmniVoice supports inline **non-verbal symbols** and **pronunciation correction** within the input text.
|
| 195 |
+
|
| 196 |
+
**Non-verbal symbols**: Insert tags like `[laughter]` directly in the text to add expressive non-verbal sounds.
|
| 197 |
+
|
| 198 |
+
```python
|
| 199 |
+
audio = model.generate(text="[laughter] You really got me. I didn't see that coming at all.")
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
Supported tags: `[laughter]`, `[sigh]`, `[confirmation-en]`, `[question-en]`, `[question-ah]`, `[question-oh]`, `[question-ei]`, `[question-yi]`, `[surprise-ah]`, `[surprise-oh]`, `[surprise-wa]`, `[surprise-yo]`, `[dissatisfaction-hnn]`.
|
| 203 |
+
|
| 204 |
+
**Pronunciation control (Chinese)**: Use pinyin with tone numbers to correct specific character pronunciations.
|
| 205 |
+
|
| 206 |
+
```python
|
| 207 |
+
audio = model.generate(text="这批货物打ZHE2出售后他严重SHE2本了,再也经不起ZHE1腾了。")
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
**Pronunciation control (English)**: Use [CMU pronunciation dictionary](https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict.0.7a) (uppercase, in brackets) to override default English pronunciations.
|
| 211 |
+
|
| 212 |
+
```python
|
| 213 |
+
audio = model.generate(text="He plays the [B EY1 S] guitar while catching a [B AE1 S] fish.")
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
---
|
| 217 |
+
|
| 218 |
+
## Command-Line Tools
|
| 219 |
+
|
| 220 |
+
Three CLI entry points are provided. The CLI tools support all features available in the Python API (voice cloning, voice design, auto voice, generation parameters, etc.) — all controlled via command-line arguments.
|
| 221 |
+
|
| 222 |
+
| Command | Description | Source |
|
| 223 |
+
|---|---|---|
|
| 224 |
+
| `omnivoice-demo` | Interactive Gradio web demo | [omnivoice/cli/demo.py](omnivoice/cli/demo.py) |
|
| 225 |
+
| `omnivoice-infer` | Single-item inference | [omnivoice/cli/infer.py](omnivoice/cli/infer.py) |
|
| 226 |
+
| `omnivoice-infer-batch` | Batch inference across multiple GPUs | [omnivoice/cli/infer_batch.py](omnivoice/cli/infer_batch.py) |
|
| 227 |
+
|
| 228 |
+
### Demo
|
| 229 |
+
|
| 230 |
+
```bash
|
| 231 |
+
omnivoice-demo --ip 0.0.0.0 --port 8001
|
| 232 |
+
```
|
| 233 |
+
|
| 234 |
+
Provides a web UI for voice cloning and voice design. See `omnivoice-demo --help` for all options.
|
| 235 |
+
|
| 236 |
+
### Single Inference
|
| 237 |
+
|
| 238 |
+
```bash
|
| 239 |
+
# Voice Cloning
|
| 240 |
+
# ref_text can be omitted (Whisper will auto-transcribe ref_audio to get it).
|
| 241 |
+
omnivoice-infer \
|
| 242 |
+
--model k2-fsa/OmniVoice \
|
| 243 |
+
--text "This is a test for text to speech." \
|
| 244 |
+
--ref_audio ref.wav \
|
| 245 |
+
--ref_text "Transcription of the reference audio." \
|
| 246 |
+
--output hello.wav
|
| 247 |
+
|
| 248 |
+
# Voice Design
|
| 249 |
+
omnivoice-infer --model k2-fsa/OmniVoice \
|
| 250 |
+
--text "This is a test for text to speech." \
|
| 251 |
+
--instruct "male, British accent" \
|
| 252 |
+
--output hello.wav
|
| 253 |
+
|
| 254 |
+
# Auto Voice
|
| 255 |
+
omnivoice-infer \
|
| 256 |
+
--model k2-fsa/OmniVoice \
|
| 257 |
+
--text "This is a test for text to speech." \
|
| 258 |
+
--output hello.wav
|
| 259 |
+
```
|
| 260 |
+
|
| 261 |
+
### Batch Inference
|
| 262 |
+
|
| 263 |
+
`omnivoice-infer-batch` can distribute batch inference across multiple GPUs, designed for large-scale TTS tasks.
|
| 264 |
+
|
| 265 |
+
```bash
|
| 266 |
+
omnivoice-infer-batch \
|
| 267 |
+
--model k2-fsa/OmniVoice \
|
| 268 |
+
--test_list test.jsonl \
|
| 269 |
+
--res_dir results/
|
| 270 |
+
```
|
| 271 |
+
|
| 272 |
+
The test list is a JSONL file where each line is a JSON object:
|
| 273 |
+
```json
|
| 274 |
+
{"id": "sample_001", "text": "Hello world", "ref_audio": "/path/to/ref.wav", "ref_text": "Reference transcript", "instruct": "female, british accent", "language_id": "en", "duration": 10.0, "speed": 1.0}
|
| 275 |
+
```
|
| 276 |
+
Only `id` and `text` are mandatory fields. `ref_audio` and `ref_text` are used in voice cloning mode. `instruct` is used in voice design mode. If neither a reference audio nor an `instruct` is provided, the model will generate speech in a random voice.
|
| 277 |
+
|
| 278 |
+
`language_id`, `duration`, and `speed` are optional. `duration` (in seconds) fixes the output length; `speed` controls the speaking rate. If `duration` and `speed` are both provided, `speed` will be ignored.
|
| 279 |
+
|
| 280 |
+
---
|
| 281 |
+
|
| 282 |
+
## Training & Evaluation
|
| 283 |
+
|
| 284 |
+
See [examples/](examples/) for the complete pipeline — from data preparation to training, evaluation, and finetuning.
|
| 285 |
+
|
| 286 |
+
---
|
| 287 |
+
|
| 288 |
+
## Discussion & Communication
|
| 289 |
+
|
| 290 |
+
You can directly discuss on [GitHub Issues](https://github.com/k2-fsa/OmniVoice/issues).
|
| 291 |
+
|
| 292 |
+
You can also scan the QR code to join our WeChat group or follow our WeChat official account.
|
| 293 |
+
|
| 294 |
+
| Wechat Group | Wechat Official Account |
|
| 295 |
+
| ------------ | ----------------------- |
|
| 296 |
+
| | |
|
| 297 |
+
|
| 298 |
+
---
|
| 299 |
+
|
| 300 |
+
## Community Projects
|
| 301 |
+
|
| 302 |
+
OmniVoice is supported by a growing ecosystem of community projects.
|
| 303 |
+
Explore them in [Community Projects](docs/community-projects.md).
|
| 304 |
+
|
| 305 |
+
---
|
| 306 |
+
|
| 307 |
+
## Citation
|
| 308 |
+
|
| 309 |
+
```bibtex
|
| 310 |
+
@article{zhu2026omnivoice,
|
| 311 |
+
title={OmniVoice: Towards Omnilingual Zero-Shot Text-to-Speech with Diffusion Language Models},
|
| 312 |
+
author={Zhu, Han and Ye, Lingxuan and Kang, Wei and Yao, Zengwei and Guo, Liyong and Kuang, Fangjun and Han, Zhifeng and Zhuang, Weiji and Lin, Long and Povey, Daniel},
|
| 313 |
+
journal={arXiv preprint arXiv:2604.00688},
|
| 314 |
+
year={2026}
|
| 315 |
+
}
|
| 316 |
+
```
|
| 317 |
+
|
| 318 |
+
---
|
| 319 |
+
|
| 320 |
+
## Disclaimer
|
| 321 |
+
|
| 322 |
+
Users are strictly prohibited from using this model for unauthorized voice cloning, voice impersonation, fraud, scams, or any other illegal or unethical activities. All users shall ensure full compliance with applicable local laws, regulations, and ethical standards. The developers assume no liability for any misuse of this model and advocate for responsible AI development and use, encouraging the community to uphold safety and ethical principles in AI research and applications.
|
api.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# api.py
|
| 2 |
+
|
| 3 |
+
import asyncio
|
| 4 |
+
import base64
|
| 5 |
+
import io
|
| 6 |
+
import json
|
| 7 |
+
import time
|
| 8 |
+
from typing import AsyncGenerator
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
import soundfile as sf
|
| 12 |
+
import torch
|
| 13 |
+
from fastapi import FastAPI, HTTPException
|
| 14 |
+
from fastapi.responses import StreamingResponse
|
| 15 |
+
from pydantic import BaseModel
|
| 16 |
+
from omnivoice import OmniVoice
|
| 17 |
+
|
| 18 |
+
# =========================================================
|
| 19 |
+
# App
|
| 20 |
+
# =========================================================
|
| 21 |
+
|
| 22 |
+
# ASGI application object; the route decorators below register their handlers on it.
app = FastAPI(title="OmniVoice OpenAI-Compatible TTS")
|
| 23 |
+
|
| 24 |
+
# =========================================================
|
| 25 |
+
# Constants
|
| 26 |
+
# =========================================================
|
| 27 |
+
|
| 28 |
+
SAMPLE_RATE = 24000  # Hz; used when writing WAV output and computing durations
NUM_CHANNELS = 1  # channel count used in the chunk-size computation
BYTES_PER_SAMPLE = 2  # width of one PCM sample (int16) in bytes

FRAME_MS = 20  # duration of audio carried by each streamed chunk, in milliseconds

# Number of PCM bytes per streamed chunk.
# Pure integer arithmetic: the sample count per frame is computed first, then
# scaled by the frame width, so the result is always a whole number of sample
# frames and never depends on float rounding followed by int() truncation.
CHUNK_SIZE = (SAMPLE_RATE * FRAME_MS // 1000) * BYTES_PER_SAMPLE * NUM_CHANNELS
|
| 37 |
+
|
| 38 |
+
# =========================================================
|
| 39 |
+
# Fixed Voice Config
|
| 40 |
+
# =========================================================
|
| 41 |
+
|
| 42 |
+
# Reference clip passed as `ref_audio` on every generation request.
FIXED_REF_AUDIO = "ref_audio/women_ref_1.mp3"

# Text passed as `ref_text` alongside the reference clip.
FIXED_REF_TEXT = "شوفي يا حلوة هالكريم الجديد للبشرة، يخلي وجهك مثل القمر!"

# Speaker-attribute string passed as `instruct` on every generation request.
FIXED_INSTRUCT = "female, young adult, high pitch"
|
| 49 |
+
|
| 50 |
+
# =========================================================
|
| 51 |
+
# Load Model
|
| 52 |
+
# =========================================================
|
| 53 |
+
|
| 54 |
+
# Checkpoint is loaded once at import time and shared by all requests.
# NOTE(review): the checkpoint location is a machine-specific absolute path.
model = OmniVoice.from_pretrained(
    "/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune/checkpoint-5000",
    device_map="cuda:0",
    dtype=torch.float16,
)

# Serializes calls into the model: prevents concurrent GPU inference crashes.
generation_lock = asyncio.Lock()
|
| 62 |
+
|
| 63 |
+
# =========================================================
|
| 64 |
+
# Request Schema
|
| 65 |
+
# =========================================================
|
| 66 |
+
|
| 67 |
+
# Request body accepted by POST /v1/audio/speech.
class SpeechRequest(BaseModel):
    # Model identifier sent by the client; not read anywhere in this file.
    model: str = "omnivoice"

    # Text to synthesize (required).
    input: str

    # Speed factor forwarded to `model.generate(speed=...)`.
    speed: float = 1.1

    # Audio encoding for the raw stream: "pcm" or "wav".
    response_format: str = "pcm"

    # Transport: "audio" streams raw bytes, "sse" streams server-sent events.
    stream_format: str = "audio"
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# =========================================================
|
| 82 |
+
# Audio Helpers
|
| 83 |
+
# =========================================================
|
| 84 |
+
|
| 85 |
+
def float32_to_pcm16(audio: np.ndarray) -> bytes:
    """Convert floating-point audio samples to raw signed 16-bit PCM bytes.

    Args:
        audio: Samples nominally in ``[-1.0, 1.0]``. Values outside that
            range are clipped. Any real-valued array-like is accepted.

    Returns:
        The samples encoded as little-endian int16, two bytes per sample.
    """
    # Promote to float32 first: in float16 the scale factor 32767 rounds up to
    # 32768, so a full-scale sample would overflow int16 and wrap to -32768.
    samples = np.asarray(audio, dtype=np.float32)

    # np.clip lets NaN through, and casting NaN/inf to an integer is undefined.
    # Map NaN to silence and infinities to full scale before clipping.
    samples = np.nan_to_num(samples, nan=0.0, posinf=1.0, neginf=-1.0)
    samples = np.clip(samples, -1.0, 1.0)

    # "<i2" pins the byte order to little-endian regardless of the host CPU.
    return (samples * 32767).astype("<i2").tobytes()
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# =========================================================
|
| 95 |
+
# Generate Audio
|
| 96 |
+
# =========================================================
|
| 97 |
+
|
| 98 |
+
async def generate_audio(req: SpeechRequest) -> np.ndarray:
    """Synthesize speech for ``req.input`` with the fixed voice settings.

    The blocking ``model.generate`` call runs in a worker thread via
    ``asyncio.to_thread`` while ``generation_lock`` is held, so only one
    generation is in flight at a time.

    Args:
        req: The parsed request; ``input`` and ``speed`` are read.

    Returns:
        The first element of the sequence returned by ``model.generate``.
    """

    def _run_inference():
        banner = "*" * 50

        with torch.inference_mode():
            # Echo the incoming text to stdout.
            print(banner)
            print("user text : " , req.input)
            print(banner)

            outputs = model.generate(
                text=req.input,
                ref_audio=FIXED_REF_AUDIO,
                ref_text=FIXED_REF_TEXT,
                instruct=FIXED_INSTRUCT,
                speed=req.speed,
                num_step=30,
                guidance_scale=2.0,
                t_shift=0.1,
                position_temperature=3,
                layer_penalty_factor=5.0,
            )

            return outputs[0]

    async with generation_lock:
        result = await asyncio.to_thread(_run_inference)
        return result
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# =========================================================
|
| 129 |
+
# Raw Audio Stream
|
| 130 |
+
# =========================================================
|
| 131 |
+
|
| 132 |
+
async def audio_stream_generator(
    req: SpeechRequest,
) -> AsyncGenerator[bytes, None]:
    """Generate the audio for ``req`` and yield it as raw byte chunks.

    Args:
        req: The parsed request. ``response_format`` selects the encoding:
            ``"pcm"`` yields int16 PCM in ``CHUNK_SIZE``-byte pieces,
            ``"wav"`` yields a complete WAV file in 4096-byte pieces.

    Yields:
        Consecutive chunks of the encoded audio.

    Raises:
        HTTPException: 400 if ``response_format`` is neither ``"pcm"`` nor
            ``"wav"``.
    """
    # Reject unsupported formats up front, before waiting on the generation
    # lock and spending a full synthesis on a request that cannot be served.
    # NOTE(review): this runs while the response body is being iterated, so
    # the framework has presumably already started the response; callers
    # that need a clean 400 should validate before building the response.
    if req.response_format not in ("pcm", "wav"):
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported response_format: {req.response_format}"
        )

    audio = await generate_audio(req)

    if req.response_format == "pcm":
        pcm_bytes = float32_to_pcm16(audio)

        for start in range(0, len(pcm_bytes), CHUNK_SIZE):
            yield pcm_bytes[start:start + CHUNK_SIZE]
            # Hand control back to the event loop between chunks.
            await asyncio.sleep(0)
        return

    # response_format == "wav": encode the whole clip in memory, then stream it.
    buffer = io.BytesIO()
    sf.write(
        buffer,
        audio,
        SAMPLE_RATE,
        format="WAV",
    )
    buffer.seek(0)

    while chunk := buffer.read(4096):
        yield chunk
        await asyncio.sleep(0)
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
# =========================================================
|
| 181 |
+
# SSE Stream
|
| 182 |
+
# =========================================================
|
| 183 |
+
|
| 184 |
+
async def sse_stream_generator(
    req: SpeechRequest,
) -> AsyncGenerator[str, None]:
    """Generate the audio for ``req`` and yield it as server-sent events.

    The audio is always encoded as int16 PCM here; ``req.response_format``
    is not consulted.

    Args:
        req: The parsed request.

    Yields:
        One ``speech.audio.delta`` event per ``CHUNK_SIZE`` bytes of PCM
        (base64-encoded), then a ``speech.audio.done`` event carrying usage
        and timing metrics, then the literal ``data: [DONE]`` terminator.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    audio = await generate_audio(req)
    generation_time = time.perf_counter() - start_time

    pcm_bytes = float32_to_pcm16(audio)

    for start in range(0, len(pcm_bytes), CHUNK_SIZE):
        chunk = pcm_bytes[start:start + CHUNK_SIZE]
        event = {
            "type": "speech.audio.delta",
            "delta": base64.b64encode(chunk).decode("utf-8"),
        }

        yield f"data: {json.dumps(event)}\n\n"
        # Hand control back to the event loop between events.
        await asyncio.sleep(0)

    audio_duration = len(audio) / SAMPLE_RATE

    # Rough counts: whitespace-separated words in, 50 units per second out.
    usage = {
        "input_tokens": len(req.input.split()),
        "output_tokens": int(audio_duration * 50),
    }

    # The real-time factor is undefined for empty audio; report null rather
    # than raising ZeroDivisionError after the deltas have been sent.
    rtf = round(generation_time / audio_duration, 4) if audio_duration > 0 else None

    done_event = {
        "type": "speech.audio.done",
        "usage": usage,
        "metrics": {
            "generation_time_sec": generation_time,
            "audio_duration_sec": audio_duration,
            "rtf": rtf,
        }
    }

    yield f"data: {json.dumps(done_event)}\n\n"
    yield "data: [DONE]\n\n"
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
# =========================================================
|
| 234 |
+
# OpenAI-Compatible Endpoint
|
| 235 |
+
# =========================================================
|
| 236 |
+
|
| 237 |
+
@app.post("/v1/audio/speech")
async def create_speech(req: SpeechRequest):
    """Handle a speech request and return a streaming response.

    Args:
        req: The parsed request body. ``stream_format == "sse"`` selects the
            server-sent-events stream; any other value selects the raw audio
            stream encoded according to ``response_format``.

    Returns:
        A ``StreamingResponse`` over the matching generator.

    Raises:
        HTTPException: 400 if the raw audio stream is requested with a
            ``response_format`` other than ``"pcm"`` or ``"wav"``.
    """
    if req.stream_format == "sse":
        return StreamingResponse(
            sse_stream_generator(req),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
            },
        )

    media_types = {
        "pcm": "audio/pcm",
        "wav": "audio/wav",
    }
    media_type = media_types.get(req.response_format)

    # Validate here, before the streaming response is created: an error raised
    # from inside the body generator comes too late to become a clean 400, and
    # an unknown format must not be mislabelled as audio/wav.
    if media_type is None:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported response_format: {req.response_format}"
        )

    return StreamingResponse(
        audio_stream_generator(req),
        media_type=media_type,
    )
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
# =========================================================
|
| 264 |
+
# Health
|
| 265 |
+
# =========================================================
|
| 266 |
+
|
| 267 |
+
@app.get("/health")
async def health():
    """Return a static status payload with the sample rate and voice settings."""
    voice = {
        "ref_audio": FIXED_REF_AUDIO,
        "instruct": FIXED_INSTRUCT,
    }
    payload = {
        "status": "ok",
        "sample_rate": SAMPLE_RATE,
        "voice": voice,
    }
    return payload
|
docs/OmniVoice.ipynb
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": "# OmniVoice Quick Start\n\n[](https://colab.research.google.com/github/k2-fsa/OmniVoice/blob/master/docs/OmniVoice.ipynb)\n\nThis notebook demonstrates the basic usage of [OmniVoice](https://github.com/k2-fsa/OmniVoice), a massively multilingual zero-shot TTS model supporting 600+ languages.\n\n**Contents:**\n1. Installation\n2. Option A — Gradio Demo (interactive web UI, no code needed)\n3. Option B — Python API\n - 3.1 Load Model\n - 3.2 Voice Cloning\n - 3.3 Voice Design\n - 3.4 Auto Voice"
|
| 7 |
+
},
|
| 8 |
+
{
|
| 9 |
+
"cell_type": "markdown",
|
| 10 |
+
"metadata": {},
|
| 11 |
+
"source": [
|
| 12 |
+
"## 1. Installation\n",
|
| 13 |
+
"\n",
|
| 14 |
+
"Colab already provides a compatible PyTorch + CUDA environment, so we only need to install OmniVoice."
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "code",
|
| 19 |
+
"execution_count": null,
|
| 20 |
+
"metadata": {},
|
| 21 |
+
"outputs": [],
|
| 22 |
+
"source": [
|
| 23 |
+
"!pip install omnivoice"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"cell_type": "markdown",
|
| 28 |
+
"metadata": {},
|
| 29 |
+
"source": "## 2. Option A — Gradio Demo\n\nLaunch an interactive web UI with a public Gradio link. The `--share` flag creates a temporary public URL so you can access the demo from any browser.\n\n> **If you prefer to use the Python API directly, skip to Option B below.**"
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"cell_type": "code",
|
| 33 |
+
"execution_count": null,
|
| 34 |
+
"metadata": {},
|
| 35 |
+
"outputs": [],
|
| 36 |
+
"source": [
|
| 37 |
+
"!omnivoice-demo --share"
|
| 38 |
+
]
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"cell_type": "markdown",
|
| 42 |
+
"metadata": {},
|
| 43 |
+
"source": "## 3. Option B — Python API\n\n### 3.1 Load Model"
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"cell_type": "code",
|
| 47 |
+
"execution_count": null,
|
| 48 |
+
"metadata": {},
|
| 49 |
+
"outputs": [],
|
| 50 |
+
"source": "from omnivoice import OmniVoice\nimport soundfile as sf\nimport torch\nfrom IPython.display import Audio, display\n\nmodel = OmniVoice.from_pretrained(\n \"k2-fsa/OmniVoice\",\n device_map=\"cuda:0\",\n dtype=torch.float16,\n load_asr=True,\n)"
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"cell_type": "markdown",
|
| 54 |
+
"metadata": {},
|
| 55 |
+
"source": "### 3.2 Voice Cloning\n\nClone a voice from a short (3-10s) reference audio clip. Upload your own `ref.wav` or use any audio file.\n\n`ref_text` is optional — if omitted, the model uses Whisper ASR to auto-transcribe it."
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"cell_type": "code",
|
| 59 |
+
"execution_count": null,
|
| 60 |
+
"metadata": {},
|
| 61 |
+
"outputs": [],
|
| 62 |
+
"source": [
|
| 63 |
+
"from google.colab import files\n",
|
| 64 |
+
"\n",
|
| 65 |
+
"print(\"Upload a reference audio file (wav/mp3/flac):\")\n",
|
| 66 |
+
"uploaded = files.upload()\n",
|
| 67 |
+
"ref_audio_path = list(uploaded.keys())[0]\n",
|
| 68 |
+
"print(f\"Uploaded: {ref_audio_path}\")"
|
| 69 |
+
]
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"cell_type": "code",
|
| 73 |
+
"execution_count": null,
|
| 74 |
+
"metadata": {},
|
| 75 |
+
"outputs": [],
|
| 76 |
+
"source": [
|
| 77 |
+
"audio = model.generate(\n",
|
| 78 |
+
" text=\"Hello, this is a test of zero-shot voice cloning.\",\n",
|
| 79 |
+
" ref_audio=ref_audio_path,\n",
|
| 80 |
+
" # ref_text=\"Transcription of the reference audio.\", # optional\n",
|
| 81 |
+
")\n",
|
| 82 |
+
"\n",
|
| 83 |
+
"sf.write(\"clone_out.wav\", audio[0], 24000)\n",
|
| 84 |
+
"display(Audio(audio[0], rate=24000))"
|
| 85 |
+
]
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"cell_type": "markdown",
|
| 89 |
+
"metadata": {},
|
| 90 |
+
"source": "### 3.3 Voice Design\n\nDescribe the desired voice with speaker attributes — no reference audio needed.\n\nSupported attributes: gender, age, pitch, style (whisper), English accent, Chinese dialect. See [docs/voice-design.md](https://github.com/k2-fsa/OmniVoice/blob/master/docs/voice-design.md) for the full list."
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"cell_type": "code",
|
| 94 |
+
"execution_count": null,
|
| 95 |
+
"metadata": {},
|
| 96 |
+
"outputs": [],
|
| 97 |
+
"source": [
|
| 98 |
+
"audio = model.generate(\n",
|
| 99 |
+
" text=\"Hello, this is a test of zero-shot voice design.\",\n",
|
| 100 |
+
" instruct=\"female, low pitch, british accent\",\n",
|
| 101 |
+
")\n",
|
| 102 |
+
"\n",
|
| 103 |
+
"sf.write(\"design_out.wav\", audio[0], 24000)\n",
|
| 104 |
+
"display(Audio(audio[0], rate=24000))"
|
| 105 |
+
]
|
| 106 |
+
},
|
| 107 |
+
{
|
| 108 |
+
"cell_type": "markdown",
|
| 109 |
+
"metadata": {},
|
| 110 |
+
"source": "### 3.4 Auto Voice\n\nLet the model choose a voice automatically — no reference audio or instruct needed."
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"cell_type": "code",
|
| 114 |
+
"execution_count": null,
|
| 115 |
+
"metadata": {},
|
| 116 |
+
"outputs": [],
|
| 117 |
+
"source": [
|
| 118 |
+
"audio = model.generate(\n",
|
| 119 |
+
" text=\"This is a sentence generated with automatic voice selection.\",\n",
|
| 120 |
+
")\n",
|
| 121 |
+
"\n",
|
| 122 |
+
"sf.write(\"auto_out.wav\", audio[0], 24000)\n",
|
| 123 |
+
"display(Audio(audio[0], rate=24000))"
|
| 124 |
+
]
|
| 125 |
+
}
|
| 126 |
+
],
|
| 127 |
+
"metadata": {
|
| 128 |
+
"accelerator": "GPU",
|
| 129 |
+
"colab": {
|
| 130 |
+
"gpuType": "T4",
|
| 131 |
+
"provenance": []
|
| 132 |
+
},
|
| 133 |
+
"kernelspec": {
|
| 134 |
+
"display_name": "Python 3",
|
| 135 |
+
"name": "python3"
|
| 136 |
+
},
|
| 137 |
+
"language_info": {
|
| 138 |
+
"name": "python",
|
| 139 |
+
"version": "3.10.0"
|
| 140 |
+
}
|
| 141 |
+
},
|
| 142 |
+
"nbformat": 4,
|
| 143 |
+
"nbformat_minor": 0
|
| 144 |
+
}
|
docs/community-projects.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Community Projects
|
| 2 |
+
|
| 3 |
+
The following projects are built and maintained by the community. We appreciate all contributions! Note that these projects are not officially supported by the OmniVoice team.
|
| 4 |
+
|
| 5 |
+
If you have a project you'd like to add, please open a PR.
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
- **[ComfyUI-OmniVoice-TTS](https://github.com/Saganaki22/ComfyUI-OmniVoice-TTS)** —
|
| 10 |
+
ComfyUI custom node for OmniVoice text-to-speech generation.
|
| 11 |
+
|
| 12 |
+
- **[vLLM-Omni](https://github.com/vllm-project/vllm-omni)** —
|
| 13 |
+
A framework for efficient inference with omni-modality models. Supports OmniVoice serving.
|
| 14 |
+
|
| 15 |
+
- **[pyVideoTrans](https://github.com/jianchang512/pyvideotrans)** —
|
| 16 |
+
Video translation tool with dubbing & subtitles. Supports OmniVoice as a TTS engine.
|
| 17 |
+
|
| 18 |
+
- **[MLX-Audio](https://github.com/Blaizzy/mlx-audio)** —
|
| 19 |
+
TTS, STT, and STS library built on Apple's MLX framework. Supports
|
| 20 |
+
OmniVoice among other models for efficient speech processing on Apple Silicon.
|
| 21 |
+
|
| 22 |
+
- **[RealtimeTTS](https://github.com/KoljaB/RealtimeTTS)** —
|
| 23 |
+
Converts text to speech in realtime. Supports OmniVoice as a TTS engine.
|
| 24 |
+
|
| 25 |
+
- **[TTS-WebUI](https://github.com/rsxdalv/TTS-WebUI)** —
|
| 26 |
+
Gradio web UI for multiple TTS models. Supports OmniVoice as one of its backends.
|
| 27 |
+
|
| 28 |
+
- **[OmniVoice-Studio](https://github.com/debpalash/OmniVoice-Studio)** —
|
| 29 |
+
Desktop application for OmniVoice voice generation.
|
| 30 |
+
|
| 31 |
+
- **[omnivoice-server](https://github.com/maemreyo/omnivoice-server)** —
|
| 32 |
+
OpenAI-compatible HTTP server for serving OmniVoice via `/v1/audio/speech`.
|
| 33 |
+
Supports voice profiles for persistent cloning, sentence-level streaming,
|
| 34 |
+
and optional Bearer auth.
|
| 35 |
+
|
| 36 |
+
- **[omnivoice-rs](https://github.com/FerrisMind/omnivoice-rs)** —
|
| 37 |
+
GPU-first Rust workspace for OmniVoice inference, parity validation, CLI
|
| 38 |
+
execution, and an OpenAI-compatible HTTP server built with Candle.
|
| 39 |
+
|
| 40 |
+
- **[omnivoice-trtllm](https://github.com/tlitech/omnivoice-trtllm)** —
|
| 41 |
+
Deploy OmniVoice TTS model using TensorRT-LLM and Triton Inference Server
|
| 42 |
+
on Modal, faster than PyTorch.
|
| 43 |
+
|
| 44 |
+
- **[Auris](https://github.com/nikhilprasanth/Auris)** —
|
| 45 |
+
Offline audiobook reader for EPUB, PDF, and TXT with local OmniVoice TTS, character-aware voices, and per-book narrator control.
|
| 46 |
+
|
docs/data_preparation.md
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Data Preparation
|
| 2 |
+
|
| 3 |
+
OmniVoice trains on a custom WebDataset format where audio data is packed into **tar shards** with paired **JSONL metadata** files. Each tar shard contains hundreds to thousands of samples (as `.npy` audio token arrays), drastically reducing disk I/O during training. Keeping the metadata in separate JSONL files makes it easier to modify. This document explains the data format in detail and walks through the preparation pipeline.
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
## 1. Input Format
|
| 7 |
+
|
| 8 |
+
Prepare a JSONL file where each line is a JSON object:
|
| 9 |
+
|
| 10 |
+
```jsonl
|
| 11 |
+
{"id": "sample_001", "audio_path": "/data/audio/001.wav", "text": "Hello world", "language_id": "en"}
|
| 12 |
+
{"id": "sample_002", "audio_path": "/data/audio/002.wav", "text": "你好世界", "language_id": "zh"}
|
| 13 |
+
```
|
| 14 |
+
|
| 15 |
+
Fields:
|
| 16 |
+
- `id` — unique sample identifier (used to match samples across shards and label files)
|
| 17 |
+
- `audio_path` — absolute path to the audio file (wav/flac/mp3, will be resampled to 24 kHz)
|
| 18 |
+
- `text` — transcript text
|
| 19 |
+
- `language_id` — (optional) language code, used for multilingual training, can be omitted
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
## 2. Processing
|
| 23 |
+
|
| 24 |
+
The tokenization script `extract_audio_tokens.py` converts audio into 8-layer discrete tokens and packs them into WebDataset shards.
|
| 25 |
+
|
| 26 |
+
```bash
|
| 27 |
+
export CUDA_VISIBLE_DEVICES="0,1,2,4" # GPUs used for token extraction
|
| 28 |
+
python -m omnivoice.scripts.extract_audio_tokens \
|
| 29 |
+
--input_jsonl data.jsonl \
|
| 30 |
+
--tar_output_pattern output/audios/shard-%06d.tar \
|
| 31 |
+
--jsonl_output_pattern output/txts/shard-%06d.jsonl \
|
| 32 |
+
--tokenizer_path eustlb/higgs-audio-v2-tokenizer \
|
| 33 |
+
--nj_per_gpu 3 \
|
| 34 |
+
--shuffle True
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
What it does:
|
| 38 |
+
1. Reads your JSONL manifest
|
| 39 |
+
2. Encodes each audio file into discrete tokens using the audio tokenizer
|
| 40 |
+
3. Packs tokens into WebDataset tar shards with paired jsonl metadata files
|
| 41 |
+
4. Generates a `data.lst` manifest file
|
| 42 |
+
|
| 43 |
+
<details>
|
| 44 |
+
<summary><strong>Alternative:</strong> WebDataset Input (if you already have raw-audio tar shards)</summary>
|
| 45 |
+
|
| 46 |
+
Pass the `data.lst` manifest instead of `--input_jsonl`:
|
| 47 |
+
|
| 48 |
+
```bash
|
| 49 |
+
export CUDA_VISIBLE_DEVICES="0,1,2,4" # GPUs used for token extraction
|
| 50 |
+
python -m omnivoice.scripts.extract_audio_tokens \
|
| 51 |
+
--input_manifest existing_data/data.lst \
|
| 52 |
+
--tar_output_pattern output/audios/shard-%06d.tar \
|
| 53 |
+
--jsonl_output_pattern output/txts/shard-%06d.jsonl \
|
| 54 |
+
--tokenizer_path eustlb/higgs-audio-v2-tokenizer \
|
| 55 |
+
--nj_per_gpu 3 \
|
| 56 |
+
--shuffle True
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
The `existing_data/data.lst` manifest is generated with:
|
| 60 |
+
```bash
|
| 61 |
+
python -m omnivoice.scripts.jsonl_to_webdataset \
|
| 62 |
+
--input data.jsonl \
|
| 63 |
+
--output data/shards \
|
| 64 |
+
--sr 24000 \
|
| 65 |
+
--shard-size 1000
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
This resamples audio to the target sample rate and packs FLAC files into tar shards with paired jsonl metadata files.
|
| 69 |
+
|
| 70 |
+
</details>
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
### Explanation of the script's options:
|
| 75 |
+
|
| 76 |
+
| Option | Default | Description |
|
| 77 |
+
|---|---|---|
|
| 78 |
+
| `--input_manifest` | None | Path to input dataset manifest (`data.lst`), mutually exclusive with `--input_jsonl` |
|
| 79 |
+
| `--input_jsonl` | None | Path to raw JSONL file, mutually exclusive with `--input_manifest` |
|
| 80 |
+
| `--tar_output_pattern` | (required) | Tar shard output pattern, e.g. `output/audios/shard-%06d.tar` |
|
| 81 |
+
| `--jsonl_output_pattern` | (required) | JSONL shard output pattern, e.g. `output/txts/shard-%06d.jsonl` |
|
| 82 |
+
| `--tokenizer_path` | `eustlb/higgs-audio-v2-tokenizer` | HuggingFace tokenizer path or local path |
|
| 83 |
+
| `--nj_per_gpu` | 3 | Worker processes per GPU |
|
| 84 |
+
| `--loader_workers` | 24 | DataLoader workers for streaming `IterableDataset` |
|
| 85 |
+
| `--shuffle` | True | Shuffle samples before sharding |
|
| 86 |
+
| `--shuffle-seed` | 42 | Random seed for shuffling |
|
| 87 |
+
| `--samples_per_shard` | 1000 | Max samples per tar shard |
|
| 88 |
+
| `--min_num_shards` | 32 | Minimum number of output shards (ensures shard count >= num\_gpu × num\_workers) |
|
| 89 |
+
| `--min_length` | 0.0 | Skip audio shorter than this (seconds) |
|
| 90 |
+
| `--max_length` | inf | Skip audio longer than this (seconds) |
|
| 91 |
+
| `--skip_errors` | False | Continue on processing errors instead of aborting |
|
| 92 |
+
| `--num_machines` | 1 | Total number of machines for distributed runs |
|
| 93 |
+
| `--machine_index` | 0 | Zero-based machine index for distributed preprocessing |
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
### Output Structure
|
| 97 |
+
|
| 98 |
+
Output structure with the following output patterns
|
| 99 |
+
|
| 100 |
+
```bash
|
| 101 |
+
--tar_output_pattern output/audios/shard-%06d.tar \
|
| 102 |
+
--jsonl_output_pattern output/txts/shard-%06d.jsonl
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
will be:
|
| 106 |
+
|
| 107 |
+
```
|
| 108 |
+
output/
|
| 109 |
+
├── audios/ # WebDataset tar shards (audio tokens)
|
| 110 |
+
│ ├── shard-000000.tar # Each tar packs ~1000 samples
|
| 111 |
+
│ ├── shard-000001.tar
|
| 112 |
+
│ └── ...
|
| 113 |
+
├── txts/ # Per-shard companion JSONL labels
|
| 114 |
+
│ ├── shard-000000.jsonl # One JSON line per sample in the corresponding tar
|
| 115 |
+
│ ├── shard-000001.jsonl
|
| 116 |
+
│ └── ...
|
| 117 |
+
├── data.lst # Manifest linking tar ↔ jsonl shards
|
| 118 |
+
└── errors.jsonl # Samples that failed processing (if any)
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
`data.lst` and `errors.jsonl` are written to the **parent directory** of `audios/` and `txts/`.
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
### The `data.lst` manifest
|
| 125 |
+
|
| 126 |
+
Each line in `data.lst` describes one shard:
|
| 127 |
+
|
| 128 |
+
```
|
| 129 |
+
/path/to/shard-000000.tar /path/to/shard-000000.jsonl 1000 3600.500
|
| 130 |
+
/path/to/shard-000001.tar /path/to/shard-000001.jsonl 800 2880.200
|
| 131 |
+
```
|
| 132 |
+
|
| 133 |
+
Format: `<tar_path> <jsonl_path> <num_samples> <total_duration_seconds>`
|
| 134 |
+
|
| 135 |
+
- Paths are **absolute**
|
| 136 |
+
- `.tar` file contains the audio tokens.
|
| 137 |
+
- The `.jsonl` file contains the metadata from the originally provided JSONL file, which allows easier access to and modification of the metadata without unpacking the tar file.
|
| 138 |
+
- This manifest is what the training data config references.
|
| 139 |
+
|
| 140 |
+
### Inside a tar shard
|
| 141 |
+
|
| 142 |
+
Each `.tar` file packs **many samples** (default 1000 per shard) into a single archive. This is the key advantage of WebDataset: instead of reading thousands of tiny files, the dataloader reads sequentially from a few large tars, drastically reducing disk I/O pressure.
|
| 143 |
+
|
| 144 |
+
Each sample in the tar is a `.npy` file whose name (key) is the sample `id`, matching the corresponding line in the companion JSONL file:
|
| 145 |
+
|
| 146 |
+
```
|
| 147 |
+
shard-000000.tar:
|
| 148 |
+
sample_001.npy # Audio tokens: numpy array, shape [8, T], dtype int16
|
| 149 |
+
sample_002.npy
|
| 150 |
+
...
|
| 151 |
+
sample_1000.npy
|
| 152 |
+
```
|
| 153 |
+
|
| 154 |
+
## 3. Data Config for Training
|
| 155 |
+
|
| 156 |
+
After creating WebDataset shards, write a data config JSON that references them:
|
| 157 |
+
|
| 158 |
+
```json
|
| 159 |
+
{
|
| 160 |
+
"train": [
|
| 161 |
+
{
|
| 162 |
+
"language_id": "en",
|
| 163 |
+
"manifest_path": ["data/custom/tokens/train/data.lst"],
|
| 164 |
+
"repeat": 1
|
| 165 |
+
}
|
| 166 |
+
],
|
| 167 |
+
"dev": [
|
| 168 |
+
{
|
| 169 |
+
"language_id": "en",
|
| 170 |
+
"manifest_path": ["data/custom/tokens/dev/data.lst"],
|
| 171 |
+
"repeat": 1
|
| 172 |
+
}
|
| 173 |
+
]
|
| 174 |
+
}
|
| 175 |
+
```
|
| 176 |
+
- `manifest_path` — list of `data.lst` files (one per shard directory)
|
| 177 |
+
- `repeat` — how many times to repeat this dataset per epoch (useful for balancing languages)
|
| 178 |
+
- `language_id` — not used by training; it is only there for better data organization
|
| 179 |
+
|
| 180 |
+
See [examples/config/](../examples/config/) for ready-to-use data config files.
|
| 181 |
+
|
| 182 |
+
> See [docs/data_preparation_advanced.md](../docs/data_preparation_advanced.md) for denoising and noise augmentation.
|
docs/data_preparation_advanced.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Advanced Data Preparation
|
| 2 |
+
|
| 3 |
+
The advanced pipeline adds **denoising** and **prompt noise augmentation** on top of the basic tokenization workflow. Each stage is optional.
|
| 4 |
+
|
| 5 |
+
## Prerequisites
|
| 6 |
+
|
| 7 |
+
- **Denoising**: Sidon model checkpoints (`feature_extractor_cuda.pt`, `decoder_cuda.pt`) from https://huggingface.co/sarulab-speech/sidon-v0.1/tree/main.
|
| 8 |
+
- **Noise augmentation**: noise + RIR tar shards with `data.lst` manifests
|
| 9 |
+
|
| 10 |
+
## Pipeline Overview
|
| 11 |
+
|
| 12 |
+
```
|
| 13 |
+
Step 1 (optional): Denoise
|
| 14 |
+
Raw audio → Sidon denoiser → clean audio
|
| 15 |
+
|
| 16 |
+
Step 2: Tokenize (with optional noise augmentation)
|
| 17 |
+
Clean audio + noise augment on prefix → audio tokenizer → tokens
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
## Denoise
|
| 22 |
+
|
| 23 |
+
Use the [Sidon](https://github.com/sarulab-speech/Sidon) speech enhancement model to remove background noise from raw audio.
|
| 24 |
+
|
| 25 |
+
```bash
|
| 26 |
+
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
| 27 |
+
python -m omnivoice.scripts.denoise_audio \
|
| 28 |
+
--input_jsonl data.jsonl \
|
| 29 |
+
--tar_output_pattern data/denoised/audios/shard-%06d.tar \
|
| 30 |
+
--jsonl_output_pattern data/denoised/txts/shard-%06d.jsonl \
|
| 31 |
+
--feature_extractor_path /path/to/sidon_feature_extractor_cuda.pt \
|
| 32 |
+
--decoder_path /path/to/sidon_decoder_cuda.pt \
|
| 33 |
+
--target_sample_rate 24000 \
|
| 34 |
+
--batch_duration 200.0
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
What it does:
|
| 38 |
+
1. Reads your JSONL manifest
|
| 39 |
+
2. Runs Sidon denoiser on each audio file
|
| 40 |
+
3. Outputs denoised audio as custom WebDataset tar/jsonl shards
|
| 41 |
+
4. Generates a `data.lst` manifest in `data/denoised/`
|
| 42 |
+
|
| 43 |
+
> You can also pass `--input_manifest /path/to/data.lst` if you already have a dataset in the custom WebDataset format.
|
| 44 |
+
> The next step is to pass the generated `data.lst` file via `--input_manifest` to `omnivoice.scripts.extract_audio_tokens` for token extraction.
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
## Tokenize with noise augmentation
|
| 48 |
+
|
| 49 |
+
Adds environmental noise and room reverb to **prompt audio** during tokenization, making the model robust to noisy reference audio at inference time. Note that for our model we apply noise augmentation to only a small proportion of the data, so that the model can still generate good audio from clean reference audio.
|
| 50 |
+
|
| 51 |
+
You need two additional datasets in WebDataset format:
|
| 52 |
+
- **Noise recordings**: environmental noise tar shards with a `data.lst` manifest
|
| 53 |
+
- **Room impulse responses (RIR)**: RIR tar shards with a `data.lst` manifest
|
| 54 |
+
|
| 55 |
+
```bash
|
| 56 |
+
export CUDA_VISIBLE_DEVICES="0,1,2,4"
|
| 57 |
+
python -m omnivoice.scripts.extract_audio_tokens_add_noise \
|
| 58 |
+
--input_jsonl data.jsonl \
|
| 59 |
+
--tar_output_pattern data/tokens/shard-%06d.tar \
|
| 60 |
+
--jsonl_output_pattern data/txts/shard-%06d.jsonl \
|
| 61 |
+
--tokenizer_path eustlb/higgs-audio-v2-tokenizer \
|
| 62 |
+
--noise_manifest data/noise_shards/data.lst \
|
| 63 |
+
--rir_manifest data/rir_shards/data.lst \
|
| 64 |
+
--nj_per_gpu 3
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
> You can also pass `--input_manifest /path/to/data.lst` if you already have a dataset in the custom WebDataset format.
|
docs/evaluation.md
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Evaluation
|
| 2 |
+
|
| 3 |
+
Evaluate OmniVoice models with standard TTS metrics: WER (intelligibility), SIM-o (speaker similarity), and UTMOS (naturalness).
|
| 4 |
+
|
| 5 |
+
## Supported Test Sets
|
| 6 |
+
|
| 7 |
+
| Test Set | Languages | WER Module | Metrics |
|
| 8 |
+
|---|---|---|---|
|
| 9 |
+
| **LibriSpeech-PC** | English | HuBERT WER | WER + Speaker Sim + MOS |
|
| 10 |
+
| **Seed-TTS (en)** | English | Whisper WER | WER + MOS |
|
| 11 |
+
| **Seed-TTS (zh)** | Chinese | Paraformer WER | WER + MOS |
|
| 12 |
+
| **FLEURS** | 102 languages | Omnilingual-ASR WER | WER (per-language + macro-avg) |
|
| 13 |
+
| **MiniMax Multilingual** | 24 languages | Whisper + Paraformer | WER + MOS |
|
| 14 |
+
|
| 15 |
+
## Prerequisites
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
pip install omnivoice[eval]
|
| 19 |
+
# or
|
| 20 |
+
uv sync --extra eval
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
## Quick Start
|
| 25 |
+
|
| 26 |
+
```bash
|
| 27 |
+
cd examples
|
| 28 |
+
bash run_eval.sh
|
| 29 |
+
# run_eval.sh will
|
| 30 |
+
# (1) download all required test sets and test models;
|
| 31 |
+
# (2) run inference and evaluation for each test set.
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
## Metrics Explained
|
| 35 |
+
|
| 36 |
+
### WER (Word Error Rate)
|
| 37 |
+
Measures how intelligible the generated speech is by transcribing it with an ASR model and comparing to the reference text. Lower is better. Note that some languages actually use CER (Character Error Rate).
|
| 38 |
+
|
| 39 |
+
- **LibriSpeech-PC**: HuBERT-based ASR
|
| 40 |
+
- **Seed-TTS**: Whisper (en) or Paraformer (zh)
|
| 41 |
+
- **MiniMax**: Whisper for non-Chinese, Paraformer for Chinese
|
| 42 |
+
- **FLEURS**: Omnilingual-ASR multilingual model
|
| 43 |
+
|
| 44 |
+
### Speaker Similarity
|
| 45 |
+
Cosine similarity between speaker embeddings (ECAPA-TDNN + WavLM) of the reference and generated audio. Higher is better.
|
| 46 |
+
|
| 47 |
+
### UTMOS (Predicted MOS)
|
| 48 |
+
Neural network that predicts Mean Opinion Score from audio. Higher is better.
|
docs/generation-parameters.md
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Generation Parameters
|
| 2 |
+
|
| 3 |
+
Parameters can be passed as keyword arguments to `model.generate(...)` or via the `OmniVoiceGenerationConfig` dataclass. See below for the full list and which category each belongs to.
|
| 4 |
+
|
| 5 |
+
```python
|
| 6 |
+
# 1) Direct keyword arguments
|
| 7 |
+
audio = model.generate(text="Hello world", num_step=32, guidance_scale=2.0)
|
| 8 |
+
|
| 9 |
+
# 2) Via OmniVoiceGenerationConfig dataclass
|
| 10 |
+
from omnivoice import OmniVoiceGenerationConfig
|
| 11 |
+
|
| 12 |
+
config = OmniVoiceGenerationConfig(num_step=32, guidance_scale=2.0)
|
| 13 |
+
audio = model.generate(text="Hello world", generation_config=config)
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
## Decoding
|
| 17 |
+
|
| 18 |
+
| Parameter | Type | Default | Description |
|
| 19 |
+
|---|---|---|---|
|
| 20 |
+
| `num_step` | int | 32 | Number of iterative unmasking steps. Higher values improve quality but slow down generation. Use 16 for faster inference. |
|
| 21 |
+
| `denoise` | bool | True | Prepend the `<|denoise|>` token to the input, which signals the model to produce cleaner speech. |
|
| 22 |
+
| `guidance_scale` | float | 2.0 | Classifier-free guidance scale.|
|
| 23 |
+
| `t_shift` | float | 0.1 | Time-step shift for the noise schedule. Smaller values emphasise earlier steps in decoding. |
|
| 24 |
+
|
| 25 |
+
## Sampling
|
| 26 |
+
|
| 27 |
+
| Parameter | Type | Default | Description |
|
| 28 |
+
|---|---|---|---|
|
| 29 |
+
| `position_temperature` | float | 5.0 | Temperature for mask-position selection. 0 = greedy (deterministic). Higher values increase randomness. |
|
| 30 |
+
| `class_temperature` | float | 0.0 | Temperature for token sampling at each step. 0 = greedy (deterministic). Higher values increase randomness. |
|
| 31 |
+
| `layer_penalty_factor` | float | 5.0 | Penalty applied to deeper codebook layers, encouraging earlier (lower) layers to unmask first. |
|
| 32 |
+
|
| 33 |
+
## Duration & Speed
|
| 34 |
+
|
| 35 |
+
These accept a single value applied to all items, or a per-item list (useful in batch mode):
|
| 36 |
+
|
| 37 |
+
```python
|
| 38 |
+
# Fixed 10-second output
|
| 39 |
+
audio = model.generate(text="Hello, this is a test of duration control", duration=10.0)
|
| 40 |
+
|
| 41 |
+
# Faster speech (1.2x faster than estimated)
|
| 42 |
+
audio = model.generate(text="Hello, this is a test of duration control", speed=1.2)
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
| Parameter | Type | Default | Description |
|
| 46 |
+
|---|---|---|---|
|
| 47 |
+
| `duration` | float or list[float \| None] | None | Fixed output duration in seconds. Overrides `speed` when set. |
|
| 48 |
+
| `speed` | float or list[float \| None] | None | Speed factor. Values > 1.0 produce shorter audio (faster); values < 1.0 produce longer audio (slower). Ignored when `duration` is set. Defaults to 1.0 when both are None. |
|
| 49 |
+
|
| 50 |
+
Priority: `duration` > `speed`.
|
| 51 |
+
|
| 52 |
+
> **Note:** When using `duration`, the default post-processing step may trim trailing silence, causing the actual output to be slightly shorter than the requested duration. If you need the output duration to **exactly** match the specified value, set `postprocess_output=False` to disable silence removal.
|
| 53 |
+
|
| 54 |
+
## Pre/Post Processing
|
| 55 |
+
|
| 56 |
+
| Parameter | Type | Default | Description |
|
| 57 |
+
|---|---|---|---|
|
| 58 |
+
| `preprocess_prompt` | bool | True | Whether to apply preprocessing to the voice-clone prompt audio (remove long silences in the reference audio, add punctuation at the end of the reference text). |
|
| 59 |
+
| `postprocess_output` | bool | True | Apply post-processing to generated audio (remove long silences). |
|
| 60 |
+
|
| 61 |
+
## Long-Form Generation
|
| 62 |
+
|
| 63 |
+
To support stable long-form speech generation with low VRAM consumption, the text is automatically split into smaller segments when the estimated duration of the generated speech exceeds `audio_chunk_threshold`, with each segment producing approximately `audio_chunk_duration` seconds of audio. This approach allows the model to accept arbitrarily long text and generate arbitrarily long speech with near-constant VRAM consumption.
|
| 64 |
+
|
| 65 |
+
| Parameter | Type | Default | Description |
|
| 66 |
+
|---|---|---|---|
|
| 67 |
+
| `audio_chunk_duration` | float | 15.0 | Target chunk duration (seconds) when splitting long text. |
|
| 68 |
+
| `audio_chunk_threshold` | float | 30.0 | Estimated audio duration (seconds) above which chunking is activated. |
|
docs/lang_id_name_map.tsv
ADDED
|
@@ -0,0 +1,647 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
language_id language_name iso_639_3_id train_data_duration
|
| 2 |
+
aae Arbëreshë Albanian aae 6.11
|
| 3 |
+
aal Afade aal 10.19
|
| 4 |
+
aao Algerian Saharan Arabic aao 2.02
|
| 5 |
+
ab Abkhazian abk 57.27
|
| 6 |
+
abb Bankon abb 11.2
|
| 7 |
+
abn Abua abn 10.27
|
| 8 |
+
abr Abron abr 9.22
|
| 9 |
+
abs Ambonese Malay abs 10.03
|
| 10 |
+
abv Baharna Arabic abv 10.41
|
| 11 |
+
acm Mesopotamian Arabic acm 3.78
|
| 12 |
+
acw Hijazi Arabic acw 22.32
|
| 13 |
+
acx Omani Arabic acx 22.03
|
| 14 |
+
adf Dhofari Arabic adf 0.31
|
| 15 |
+
adx Amdo Tibetan adx 56.94
|
| 16 |
+
ady Adyghe ady 32.6
|
| 17 |
+
aeb Tunisian Arabic aeb 21.63
|
| 18 |
+
aec Saidi Arabic aec 9.28
|
| 19 |
+
af Afrikaans afr 4.4
|
| 20 |
+
afb Gulf Arabic afb 98.55
|
| 21 |
+
afo Eloyi afo 11.21
|
| 22 |
+
ahl Igo ahl 9.22
|
| 23 |
+
ahs Ashe ahs 10.62
|
| 24 |
+
ajg Aja (Benin) ajg 5.63
|
| 25 |
+
aju Judeo-Moroccan Arabic aju 7.21
|
| 26 |
+
ala Alago ala 11.04
|
| 27 |
+
aln Gheg Albanian aln 3.92
|
| 28 |
+
alo Larike-Wakasihu alo 9.97
|
| 29 |
+
am Amharic amh 12.83
|
| 30 |
+
amu Guerrero Amuzgo amu 10.1
|
| 31 |
+
an Aragonese arg 16.4
|
| 32 |
+
anc Ngas anc 10.14
|
| 33 |
+
ank Goemai ank 10.0
|
| 34 |
+
anp Angika anp 10.65
|
| 35 |
+
anw Anaang anw 9.65
|
| 36 |
+
aom Ömie aom 8.19
|
| 37 |
+
apc Levantine Arabic apc 15.65
|
| 38 |
+
apd Sudanese Arabic apd 9.93
|
| 39 |
+
arb Standard Arabic arb 1483.53
|
| 40 |
+
arq Algerian Arabic arq 9.64
|
| 41 |
+
ars Najdi Arabic ars 203.54
|
| 42 |
+
ary Moroccan Arabic ary 104.67
|
| 43 |
+
arz Egyptian Arabic arz 23.23
|
| 44 |
+
as Assamese asm 270.85
|
| 45 |
+
ast Asturian ast 8.48
|
| 46 |
+
avl Eastern Egyptian Bedawi Arabic avl 1.86
|
| 47 |
+
awo Awak awo 10.22
|
| 48 |
+
ayl Libyan Arabic ayl 20.13
|
| 49 |
+
ayp North Mesopotamian Arabic ayp 10.92
|
| 50 |
+
az Azerbaijani aze 9.84
|
| 51 |
+
ba Bashkir bak 249.1
|
| 52 |
+
bag Tuki bag 10.97
|
| 53 |
+
bas Basa (Cameroon) bas 10.66
|
| 54 |
+
bax Bamun bax 10.24
|
| 55 |
+
bba Baatonum bba 10.53
|
| 56 |
+
bbj Ghomálá' bbj 7.32
|
| 57 |
+
bbl Bats bbl 11.22
|
| 58 |
+
bbu Kulung (Nigeria) bbu 10.39
|
| 59 |
+
bce Bamenyam bce 9.9
|
| 60 |
+
bci Baoulé bci 10.21
|
| 61 |
+
bcs Kohumono bcs 10.45
|
| 62 |
+
bcy Bacama bcy 9.94
|
| 63 |
+
bda Bayot bda 9.47
|
| 64 |
+
bde Bade bde 9.89
|
| 65 |
+
bdm Buduma bdm 10.17
|
| 66 |
+
be Belarusian bel 1809.43
|
| 67 |
+
beb Bebele beb 7.52
|
| 68 |
+
bew Betawi bew 11.15
|
| 69 |
+
bfd Bafut bfd 9.03
|
| 70 |
+
bft Balti bft 16.28
|
| 71 |
+
bg Bulgarian bul 2190.76
|
| 72 |
+
bgp Eastern Balochi bgp 10.98
|
| 73 |
+
bhb Bhili bhb 9.98
|
| 74 |
+
bhh Bukharic bhh 11.38
|
| 75 |
+
bho Bhojpuri bho 10.05
|
| 76 |
+
bhp Bima bhp 10.67
|
| 77 |
+
bhr Bara Malagasy bhr 12.14
|
| 78 |
+
bjj Kanauji bjj 11.01
|
| 79 |
+
bjk Barok bjk 10.16
|
| 80 |
+
bjn Banjar bjn 11.68
|
| 81 |
+
bjt Balanta-Ganja bjt 9.41
|
| 82 |
+
bkh Bakoko bkh 6.0
|
| 83 |
+
bkm Kom (Cameroon) bkm 10.76
|
| 84 |
+
bky Bokyi bky 9.85
|
| 85 |
+
bmm Northern Betsimisaraka Malagasy bmm 19.12
|
| 86 |
+
bmq Bomu bmq 10.68
|
| 87 |
+
bn Bengali ben 271.76
|
| 88 |
+
bnm Batanga bnm 15.01
|
| 89 |
+
bnn Bunun bnn 9.26
|
| 90 |
+
bns Bundeli bns 10.88
|
| 91 |
+
bo Tibetan bod 82.27
|
| 92 |
+
bou Bondei bou 9.98
|
| 93 |
+
bqg Bago-Kusuntu bqg 8.86
|
| 94 |
+
br Breton bre 25.48
|
| 95 |
+
bra Braj bra 10.68
|
| 96 |
+
brh Brahui brh 19.89
|
| 97 |
+
bri Mokpwe bri 7.53
|
| 98 |
+
brx Bodo brx 231.57
|
| 99 |
+
bs Bosnian bos 690.73
|
| 100 |
+
bsh Kati bsh 8.77
|
| 101 |
+
bsj Bangwinji bsj 10.0
|
| 102 |
+
bsk Burushaski bsk 9.14
|
| 103 |
+
btm Batak Mandailing btm 11.09
|
| 104 |
+
btv Bateri btv 9.8
|
| 105 |
+
bug Buginese bug 11.09
|
| 106 |
+
bum Bulu (Cameroon) bum 9.06
|
| 107 |
+
buo Terei buo 9.48
|
| 108 |
+
bux Boghom bux 10.48
|
| 109 |
+
bwr Bura-Pabir bwr 10.4
|
| 110 |
+
bxf Bilur bxf 10.84
|
| 111 |
+
byc Ubaghara byc 11.11
|
| 112 |
+
bys Burak bys 9.92
|
| 113 |
+
byv Medumba byv 10.95
|
| 114 |
+
byx Qaqet byx 9.79
|
| 115 |
+
bzc Southern Betsimisaraka Malagasy bzc 17.45
|
| 116 |
+
bzw Basa (Nigeria) bzw 10.27
|
| 117 |
+
ca Catalan cat 3358.6
|
| 118 |
+
ccg Samba Daka ccg 10.11
|
| 119 |
+
ceb Cebuano ceb 12.17
|
| 120 |
+
cen Cen cen 9.85
|
| 121 |
+
cfa Dijim-Bwilim cfa 10.32
|
| 122 |
+
cgg Chiga cgg 10.84
|
| 123 |
+
chq Quiotepec Chinantec chq 9.76
|
| 124 |
+
cjk Chokwe cjk 11.01
|
| 125 |
+
ckb Central Kurdish ckb 137.52
|
| 126 |
+
ckl Cibak ckl 10.91
|
| 127 |
+
ckr Kairak ckr 10.51
|
| 128 |
+
cky Cakfem-Mushere cky 8.96
|
| 129 |
+
cnh Hakha Chin cnh 2.24
|
| 130 |
+
cpy South Ucayali Ashéninka cpy 9.15
|
| 131 |
+
cs Czech ces 148.13
|
| 132 |
+
cte Tepinapa Chinantec cte 9.54
|
| 133 |
+
ctl Tlacoatzintepec Chinantec ctl 10.04
|
| 134 |
+
cut Teutila Cuicatec cut 8.04
|
| 135 |
+
cux Tepeuxila Cuicatec cux 7.83
|
| 136 |
+
cv Chuvash chv 23.96
|
| 137 |
+
cy Welsh cym 131.21
|
| 138 |
+
da Danish dan 1665.98
|
| 139 |
+
dag Dagbani dag 10.14
|
| 140 |
+
dar Dargwa dar 1.22
|
| 141 |
+
dav Taita dav 9.12
|
| 142 |
+
dbd Dadiya dbd 9.61
|
| 143 |
+
dcc Deccan dcc 10.38
|
| 144 |
+
de German deu 21927.13
|
| 145 |
+
deg Degema deg 11.07
|
| 146 |
+
dgh Dghwede dgh 9.95
|
| 147 |
+
dgo Dogri dgo 117.04
|
| 148 |
+
dje Zarma dje 10.72
|
| 149 |
+
dmk Domaaki dmk 6.38
|
| 150 |
+
dml Dameli dml 9.18
|
| 151 |
+
dru Rukai dru 9.26
|
| 152 |
+
dty Dotyali dty 10.85
|
| 153 |
+
dua Duala dua 12.13
|
| 154 |
+
dv Dhivehi div 38.61
|
| 155 |
+
dyu Dyula dyu 0.34
|
| 156 |
+
dzg Dazaga dzg 9.96
|
| 157 |
+
ebr Ebrié ebr 1.5
|
| 158 |
+
ebu Embu ebu 9.81
|
| 159 |
+
ego Eggon ego 9.95
|
| 160 |
+
eiv Askopan eiv 10.44
|
| 161 |
+
eko Koti eko 8.15
|
| 162 |
+
ekr Yace ekr 10.76
|
| 163 |
+
el Greek ell 2412.54
|
| 164 |
+
elm Eleme elm 11.27
|
| 165 |
+
en English eng 206061.1
|
| 166 |
+
eo Esperanto epo 1396.64
|
| 167 |
+
es Spanish spa 27559.74
|
| 168 |
+
esu Central Yupik esu 2.18
|
| 169 |
+
et Estonian est 960.37
|
| 170 |
+
eto Eton (Cameroon) eto 7.43
|
| 171 |
+
ets Yekhee ets 10.11
|
| 172 |
+
etu Ejagham etu 10.3
|
| 173 |
+
eu Basque eus 479.86
|
| 174 |
+
ewo Ewondo ewo 12.71
|
| 175 |
+
ext Extremaduran ext 13.59
|
| 176 |
+
eyo Keiyo eyo 9.24
|
| 177 |
+
fa Persian fas 366.07
|
| 178 |
+
fan Fang (Equatorial Guinea) fan 3.51
|
| 179 |
+
fat Fanti fat 11.38
|
| 180 |
+
ff Fulah ful 13.84
|
| 181 |
+
ffm Maasina Fulfulde ffm 10.46
|
| 182 |
+
fi Finnish fin 468.62
|
| 183 |
+
fia Nobiin fia 9.96
|
| 184 |
+
fil Filipino fil 7.71
|
| 185 |
+
fip Fipa fip 10.55
|
| 186 |
+
fkk Kirya-Konzəl fkk 9.98
|
| 187 |
+
fmp Fe'fe' fmp 9.86
|
| 188 |
+
fr French fra 23675.32
|
| 189 |
+
fub Adamawa Fulfulde fub 13.12
|
| 190 |
+
fuc Pulaar fuc 14.77
|
| 191 |
+
fue Borgu Fulfulde fue 20.1
|
| 192 |
+
fuf Pular fuf 13.77
|
| 193 |
+
fuh Western Niger Fulfulde fuh 9.69
|
| 194 |
+
fui Bagirmi Fulfulde fui 15.04
|
| 195 |
+
fuq Central-Eastern Niger Fulfulde fuq 9.28
|
| 196 |
+
fuv Nigerian Fulfulde fuv 9.97
|
| 197 |
+
fy Western Frisian fry 70.41
|
| 198 |
+
ga Irish gle 21.4
|
| 199 |
+
gbm Garhwali gbm 19.14
|
| 200 |
+
gbr Gbagyi gbr 12.12
|
| 201 |
+
gby Gbari gby 12.59
|
| 202 |
+
gcc Mali gcc 9.87
|
| 203 |
+
gdf Guduf-Gava gdf 12.21
|
| 204 |
+
gej Gen gej 5.39
|
| 205 |
+
ges Geser-Gorom ges 10.08
|
| 206 |
+
ggg Gurgula ggg 7.12
|
| 207 |
+
gid Gidar gid 10.06
|
| 208 |
+
gig Goaria gig 9.41
|
| 209 |
+
giz South Giziga giz 10.03
|
| 210 |
+
gjk Kachi Koli gjk 20.83
|
| 211 |
+
gju Gujari gju 8.66
|
| 212 |
+
gl Galician glg 208.81
|
| 213 |
+
glw Glavda glw 10.51
|
| 214 |
+
gn Guarani grn 4.06
|
| 215 |
+
gol Gola gol 9.26
|
| 216 |
+
gom Goan Konkani gom 9.82
|
| 217 |
+
gsl Gusilay gsl 10.0
|
| 218 |
+
gu Gujarati guj 91.18
|
| 219 |
+
gui Eastern Bolivian Guaraní gui 22.72
|
| 220 |
+
gur Farefare gur 9.24
|
| 221 |
+
guz Gusii guz 9.5
|
| 222 |
+
gv Manx glv 10.07
|
| 223 |
+
gwc Gawri gwc 10.83
|
| 224 |
+
gwe Gweno gwe 8.87
|
| 225 |
+
gwt Gawar-Bati gwt 12.16
|
| 226 |
+
gya Northwest Gbaya gya 8.45
|
| 227 |
+
gyz Geji gyz 10.49
|
| 228 |
+
ha Hausa hau 17.75
|
| 229 |
+
hah Hahon hah 9.64
|
| 230 |
+
hao Hakö hao 8.56
|
| 231 |
+
haw Hawaiian haw 11.79
|
| 232 |
+
haz Hazaragi haz 9.69
|
| 233 |
+
hbb Huba hbb 10.7
|
| 234 |
+
he Hebrew heb 13.4
|
| 235 |
+
hem Hemba hem 9.53
|
| 236 |
+
hi Hindi hin 117.17
|
| 237 |
+
hia Lamang hia 11.07
|
| 238 |
+
hkk Hunjara-Kaina Ke hkk 8.69
|
| 239 |
+
hla Halia hla 9.86
|
| 240 |
+
hno Northern Hindko hno 20.04
|
| 241 |
+
hoj Hadothi hoj 10.08
|
| 242 |
+
hr Croatian hrv 2795.31
|
| 243 |
+
hsb Upper Sorbian hsb 2.71
|
| 244 |
+
ht Haitian hat 0.04
|
| 245 |
+
hu Hungarian hun 255.83
|
| 246 |
+
hue San Francisco Del Mar Huave hue 9.45
|
| 247 |
+
hul Hula hul 10.33
|
| 248 |
+
hux Nüpode Huitoto hux 9.04
|
| 249 |
+
hwo Hwana hwo 11.23
|
| 250 |
+
hy Armenian hye 42.15
|
| 251 |
+
hz Herero her 9.59
|
| 252 |
+
ia Interlingua (International Auxiliary Language Association) ina 13.48
|
| 253 |
+
ibb Ibibio ibb 7.38
|
| 254 |
+
id Indonesian ind 6327.87
|
| 255 |
+
ida Idakho-Isukha-Tiriki ida 9.31
|
| 256 |
+
idu Idoma idu 11.16
|
| 257 |
+
ig Igbo ibo 13.69
|
| 258 |
+
ijc Izon ijc 9.95
|
| 259 |
+
ijn Kalabari ijn 11.04
|
| 260 |
+
ik Inupiaq ipk 2.11
|
| 261 |
+
ikw Ikwere ikw 10.0
|
| 262 |
+
is Icelandic isl 647.29
|
| 263 |
+
ish Esan ish 10.05
|
| 264 |
+
iso Isoko iso 10.33
|
| 265 |
+
it Italian ita 9402.46
|
| 266 |
+
its Isekiri its 11.85
|
| 267 |
+
itw Ito itw 9.19
|
| 268 |
+
itz Itzá itz 7.08
|
| 269 |
+
ja Japanese jpn 36914.4
|
| 270 |
+
jal Yalahatan jal 11.18
|
| 271 |
+
jax Jambi Malay jax 10.29
|
| 272 |
+
jgo Ngomba jgo 10.15
|
| 273 |
+
jmx Western Juxtlahuaca Mixtec jmx 10.01
|
| 274 |
+
jns Jaunsari jns 11.25
|
| 275 |
+
jqr Jaqaru jqr 9.32
|
| 276 |
+
juk Wapan juk 10.22
|
| 277 |
+
juo Jiba juo 10.43
|
| 278 |
+
jv Javanese jav 11.19
|
| 279 |
+
ka Georgian kat 156.96
|
| 280 |
+
kab Kabyle kab 529.52
|
| 281 |
+
kai Karekare kai 10.52
|
| 282 |
+
kaj Jju kaj 10.16
|
| 283 |
+
kam Kamba kam 14.72
|
| 284 |
+
kbd Kabardian kbd 108.35
|
| 285 |
+
kbl Kanembu kbl 10.19
|
| 286 |
+
kbt Abadi kbt 9.73
|
| 287 |
+
kcq Kamo kcq 10.49
|
| 288 |
+
kdh Tem kdh 4.07
|
| 289 |
+
kea Kabuverdianu kea 10.51
|
| 290 |
+
keu Akebu keu 9.1
|
| 291 |
+
kfe Kota (India) kfe 10.25
|
| 292 |
+
kfk Kinnauri kfk 10.32
|
| 293 |
+
kfp Korwa kfp 11.87
|
| 294 |
+
khg Khams Tibetan khg 6.38
|
| 295 |
+
khw Khowar khw 15.55
|
| 296 |
+
kj Kuanyama kua 9.88
|
| 297 |
+
kjc Coastal Konjo kjc 10.18
|
| 298 |
+
kjk Highland Konjo kjk 10.21
|
| 299 |
+
kk Kazakh kaz 1537.29
|
| 300 |
+
kln Kalenjin kln 40.42
|
| 301 |
+
kls Kalasha kls 9.11
|
| 302 |
+
km Khmer khm 7.1
|
| 303 |
+
kmr Northern Kurdish kmr 69.59
|
| 304 |
+
kmy Koma kmy 10.28
|
| 305 |
+
kn Kannada kan 128.06
|
| 306 |
+
kna Dera (Nigeria) kna 11.91
|
| 307 |
+
knn Konkani knn 112.83
|
| 308 |
+
ko Korean kor 8609.28
|
| 309 |
+
kol Kol (Papua New Guinea) kol 9.95
|
| 310 |
+
koo Konzo koo 13.23
|
| 311 |
+
kpo Ikposo kpo 7.83
|
| 312 |
+
kqo Eastern Krahn kqo 9.28
|
| 313 |
+
ks Kashmiri kas 110.42
|
| 314 |
+
ksd Kuanua ksd 9.91
|
| 315 |
+
ksf Bafia ksf 16.43
|
| 316 |
+
kto Kuot kto 9.77
|
| 317 |
+
kuh Kushi kuh 10.35
|
| 318 |
+
kvx Parkari Koli kvx 11.04
|
| 319 |
+
kw Cornish cor 12.15
|
| 320 |
+
kwm Kwambi kwm 9.9
|
| 321 |
+
kxp Wadiyara Koli kxp 20.0
|
| 322 |
+
ky Kirghiz kir 46.63
|
| 323 |
+
kyx Rapoisi kyx 9.17
|
| 324 |
+
lag Rangi lag 9.47
|
| 325 |
+
lb Luxembourgish ltz 8.46
|
| 326 |
+
lcm Tungag lcm 9.77
|
| 327 |
+
ldb Dũya ldb 11.31
|
| 328 |
+
lg Ganda lug 447.82
|
| 329 |
+
lij Ligurian lij 15.97
|
| 330 |
+
lir Liberian English lir 10.26
|
| 331 |
+
lkb Kabras lkb 9.99
|
| 332 |
+
lla Lala-Roba lla 10.38
|
| 333 |
+
ln Lingala lin 17.99
|
| 334 |
+
lnu Longuda lnu 10.46
|
| 335 |
+
lo Lao lao 7.63
|
| 336 |
+
loa Loloda loa 9.31
|
| 337 |
+
lrk Loarki lrk 10.5
|
| 338 |
+
lss Lasi lss 6.53
|
| 339 |
+
lt Lithuanian lit 2629.45
|
| 340 |
+
ltg Latgalian ltg 27.23
|
| 341 |
+
lto Tsotso lto 9.77
|
| 342 |
+
lua Luba-Lulua lua 8.47
|
| 343 |
+
luo Luo luo 36.17
|
| 344 |
+
lus Lushai lus 20.24
|
| 345 |
+
lv Latvian lav 1441.58
|
| 346 |
+
lwg Wanga lwg 9.36
|
| 347 |
+
mab Yutanduchi Mixtec mab 9.26
|
| 348 |
+
maf Mafa maf 9.97
|
| 349 |
+
mai Maithili mai 131.37
|
| 350 |
+
mau Huautla Mazatec mau 6.39
|
| 351 |
+
max North Moluccan Malay max 9.43
|
| 352 |
+
mbo Mbo (Cameroon) mbo 9.51
|
| 353 |
+
mcf Matsés mcf 9.61
|
| 354 |
+
mcn Masana mcn 10.09
|
| 355 |
+
mcx Mpiemo mcx 9.88
|
| 356 |
+
mdd Mbum mdd 9.82
|
| 357 |
+
mde Maba (Chad) mde 9.5
|
| 358 |
+
mdf Moksha mdf 0.47
|
| 359 |
+
mek Mekeo mek 9.18
|
| 360 |
+
mer Meru mer 9.89
|
| 361 |
+
meu Motu meu 9.88
|
| 362 |
+
mfm Marghi South mfm 10.05
|
| 363 |
+
mfn Cross River Mbembe mfn 10.03
|
| 364 |
+
mfo Mbe mfo 10.24
|
| 365 |
+
mfv Mandjak mfv 9.55
|
| 366 |
+
mgg Mpumpong mgg 4.94
|
| 367 |
+
mgi Lijili mgi 10.89
|
| 368 |
+
mhk Mungaka mhk 7.53
|
| 369 |
+
mhr Eastern Mari mhr 272.31
|
| 370 |
+
mi Maori mri 18.02
|
| 371 |
+
mig San Miguel El Grande Mixtec mig 9.66
|
| 372 |
+
miu Cacaloxtepec Mixtec miu 9.18
|
| 373 |
+
mk Macedonian mkd 27.21
|
| 374 |
+
mkf Miya mkf 10.16
|
| 375 |
+
mki Dhatki mki 8.83
|
| 376 |
+
ml Malayalam mal 166.57
|
| 377 |
+
mlq Western Maninkakan mlq 9.83
|
| 378 |
+
mn Mongolian mon 269.08
|
| 379 |
+
mne Naba mne 10.37
|
| 380 |
+
mni Manipuri mni 44.46
|
| 381 |
+
mqy Manggarai mqy 10.5
|
| 382 |
+
mr Marathi mar 156.71
|
| 383 |
+
mrj Western Mari mrj 32.26
|
| 384 |
+
mrr Maria (India) mrr 11.0
|
| 385 |
+
mrt Marghi Central mrt 10.36
|
| 386 |
+
ms Malay msa 9.57
|
| 387 |
+
mse Musey mse 7.21
|
| 388 |
+
msh Masikoro Malagasy msh 14.16
|
| 389 |
+
msw Mansoanka msw 9.32
|
| 390 |
+
mt Maltese mlt 630.29
|
| 391 |
+
mtr Mewari mtr 10.58
|
| 392 |
+
mtu Tututepec Mixtec mtu 10.13
|
| 393 |
+
mtx Tidaá Mixtec mtx 9.09
|
| 394 |
+
mua Mundang mua 9.2
|
| 395 |
+
mug Musgu mug 4.74
|
| 396 |
+
mui Musi mui 10.52
|
| 397 |
+
mve Marwari (Pakistan) mve 9.96
|
| 398 |
+
mvy Indus Kohistani mvy 21.64
|
| 399 |
+
mxs Huitepec Mixtec mxs 9.64
|
| 400 |
+
mxu Mada (Cameroon) mxu 12.0
|
| 401 |
+
mxy Southeastern Nochixtlán Mixtec mxy 9.48
|
| 402 |
+
my Burmese mya 12.14
|
| 403 |
+
myv Erzya myv 3.1
|
| 404 |
+
mzl Mazatlán Mixe mzl 10.05
|
| 405 |
+
nal Nalik nal 10.33
|
| 406 |
+
nan Min Nan Chinese nan 17.55
|
| 407 |
+
nap Neapolitan nap 9.97
|
| 408 |
+
nb Norwegian Bokmål nob 12.7
|
| 409 |
+
nbh Ngamo nbh 10.04
|
| 410 |
+
ncf Notsi ncf 9.84
|
| 411 |
+
nco Sibe nco 9.96
|
| 412 |
+
ncx Central Puebla Nahuatl ncx 9.86
|
| 413 |
+
ndi Samba Leko ndi 11.27
|
| 414 |
+
ng Ndonga ndo 9.08
|
| 415 |
+
ngi Ngizim ngi 10.06
|
| 416 |
+
nhg Tetelcingo Nahuatl nhg 8.92
|
| 417 |
+
nhi Zacatlán-Ahuacatlán-Tepetzintla Nahuatl nhi 0.05
|
| 418 |
+
nhn Central Nahuatl nhn 9.51
|
| 419 |
+
nhq Huaxcaleca Nahuatl nhq 5.07
|
| 420 |
+
nja Nzanyi nja 10.02
|
| 421 |
+
nl Dutch nld 2264.13
|
| 422 |
+
nla Ngombale nla 8.79
|
| 423 |
+
nlv Orizaba Nahuatl nlv 11.42
|
| 424 |
+
nmg Kwasio nmg 10.39
|
| 425 |
+
nmz Nawdm nmz 6.3
|
| 426 |
+
nn Norwegian Nynorsk nno 1.54
|
| 427 |
+
nnh Ngiemboon nnh 16.15
|
| 428 |
+
no Norwegian nor 3849.8
|
| 429 |
+
noe Nimadi noe 11.12
|
| 430 |
+
npi Nepali npi 171.5
|
| 431 |
+
nso Pedi nso 12.64
|
| 432 |
+
ny Chichewa nya 10.8
|
| 433 |
+
nyu Nyungwe nyu 8.98
|
| 434 |
+
oc Occitan oci 16.8
|
| 435 |
+
odk Od odk 20.26
|
| 436 |
+
odu Odual odu 10.57
|
| 437 |
+
ogo Khana ogo 10.51
|
| 438 |
+
om Oromo orm 6.6
|
| 439 |
+
orc Orma orc 22.01
|
| 440 |
+
oru Ormuri oru 16.74
|
| 441 |
+
ory Odia ory 144.81
|
| 442 |
+
os Iron Ossetic oss 1.38
|
| 443 |
+
pa Panjabi pan 147.37
|
| 444 |
+
pbs Central Pame pbs 9.69
|
| 445 |
+
pbt Southern Pashto pbt 11.6
|
| 446 |
+
pbu Northern Pashto pbu 11.03
|
| 447 |
+
pcm Nigerian Pidgin pcm 11.04
|
| 448 |
+
pex Petats pex 10.2
|
| 449 |
+
phl Phalura phl 20.69
|
| 450 |
+
phr Pahari-Potwari phr 24.03
|
| 451 |
+
pip Pero pip 9.85
|
| 452 |
+
piy Piya-Kwonci piy 10.38
|
| 453 |
+
pko Pökoot pko 10.4
|
| 454 |
+
pl Polish pol 911.68
|
| 455 |
+
plk Kohistani Shina plk 12.75
|
| 456 |
+
plt Plateau Malagasy plt 19.39
|
| 457 |
+
pmq Northern Pame pmq 10.24
|
| 458 |
+
pms Piemontese pms 16.01
|
| 459 |
+
pmy Papuan Malay pmy 10.17
|
| 460 |
+
pnb Western Panjabi pnb 10.0
|
| 461 |
+
poc Poqomam poc 9.63
|
| 462 |
+
poe San Juan Atzingo Popoloca poe 10.01
|
| 463 |
+
pow San Felipe Otlaltepec Popoloca pow 8.84
|
| 464 |
+
prq Ashéninka Perené prq 7.16
|
| 465 |
+
ps Pushto pus 88.62
|
| 466 |
+
pst Central Pashto pst 11.4
|
| 467 |
+
pt Portuguese por 16855.05
|
| 468 |
+
pua Western Highland Purepecha pua 10.17
|
| 469 |
+
pwn Paiwan pwn 13.76
|
| 470 |
+
qug Chimborazo Highland Quichua qug 10.12
|
| 471 |
+
qum Sipacapense qum 9.37
|
| 472 |
+
qup Southern Pastaza Quechua qup 11.13
|
| 473 |
+
qur Yanahuanca Pasco Quechua qur 9.95
|
| 474 |
+
qus Santiago del Estero Quichua qus 9.55
|
| 475 |
+
quv Sacapulteco quv 8.9
|
| 476 |
+
qux Yauyos Quechua qux 9.35
|
| 477 |
+
quy Ayacucho Quechua quy 0.05
|
| 478 |
+
qva Ambo-Pasco Quechua qva 9.59
|
| 479 |
+
qvi Imbabura Highland Quichua qvi 11.0
|
| 480 |
+
qvj Loja Highland Quichua qvj 10.59
|
| 481 |
+
qvl Cajatambo North Lima Quechua qvl 9.95
|
| 482 |
+
qwa Corongo Ancash Quechua qwa 9.72
|
| 483 |
+
qws Sihuas Ancash Quechua qws 10.18
|
| 484 |
+
qxa Chiquián Ancash Quechua qxa 9.99
|
| 485 |
+
qxp Puno Quechua qxp 9.81
|
| 486 |
+
qxt Santa Ana de Tusi Pasco Quechua qxt 10.05
|
| 487 |
+
qxu Arequipa-La Unión Quechua qxu 10.12
|
| 488 |
+
qxw Jauja Wanca Quechua qxw 11.42
|
| 489 |
+
rag Logooli rag 9.39
|
| 490 |
+
rm Romansh roh 9.21
|
| 491 |
+
ro Romanian ron 70.23
|
| 492 |
+
rob Tae' rob 9.02
|
| 493 |
+
rof Rombo rof 18.9
|
| 494 |
+
roo Rotokas roo 9.07
|
| 495 |
+
rth Ratahan rth 9.34
|
| 496 |
+
ru Russian rus 20338.5
|
| 497 |
+
rup Macedo-Romanian rup 0.02
|
| 498 |
+
rw Kinyarwanda kin 2021.66
|
| 499 |
+
sa Sanskrit san 84.44
|
| 500 |
+
sah Yakut sah 16.08
|
| 501 |
+
sat Santali sat 98.37
|
| 502 |
+
sau Saleman sau 10.53
|
| 503 |
+
say Saya say 10.02
|
| 504 |
+
sbn Sindhi Bhil sbn 10.53
|
| 505 |
+
sc Sardinian srd 2.77
|
| 506 |
+
scl Shina scl 9.84
|
| 507 |
+
scn Sicilian scn 13.35
|
| 508 |
+
sd Sindhi snd 46.27
|
| 509 |
+
sei Seri sei 9.81
|
| 510 |
+
shu Chadian Arabic shu 2.29
|
| 511 |
+
si Sinhala sin 11.98
|
| 512 |
+
sip Sikkimese sip 10.07
|
| 513 |
+
siw Siwai siw 10.47
|
| 514 |
+
sjr Siar-Lak sjr 9.87
|
| 515 |
+
sk Slovak slk 2478.46
|
| 516 |
+
skg Sakalava Malagasy skg 9.02
|
| 517 |
+
skr Saraiki skr 4.13
|
| 518 |
+
sl Slovenian slv 1172.61
|
| 519 |
+
sn Shona sna 9.96
|
| 520 |
+
snc Sinaugoro snc 10.38
|
| 521 |
+
snk Soninke snk 10.04
|
| 522 |
+
so Somali som 13.22
|
| 523 |
+
sol Solos sol 9.95
|
| 524 |
+
sps Saposa sps 9.81
|
| 525 |
+
sq Albanian sqi 8.59
|
| 526 |
+
sr Serbian srp 1855.33
|
| 527 |
+
src Logudorese Sardinian src 10.67
|
| 528 |
+
sro Campidanese Sardinian sro 10.16
|
| 529 |
+
ssi Sansi ssi 10.47
|
| 530 |
+
ste Liana-Seti ste 10.43
|
| 531 |
+
sua Sulka sua 10.12
|
| 532 |
+
sv Swedish swe 2453.14
|
| 533 |
+
sva Svan sva 15.11
|
| 534 |
+
sw Swahili swa 418.41
|
| 535 |
+
szy Sakizaya szy 11.47
|
| 536 |
+
ta Tamil tam 423.09
|
| 537 |
+
tan Tangale tan 10.14
|
| 538 |
+
tar Central Tarahumara tar 9.73
|
| 539 |
+
tay Atayal tay 7.02
|
| 540 |
+
tbf Mandara tbf 10.01
|
| 541 |
+
tcf Malinaltepec Me'phaa tcf 9.04
|
| 542 |
+
tcy Tulu tcy 11.72
|
| 543 |
+
tdn Tondano tdn 9.14
|
| 544 |
+
tdx Tandroy-Mahafaly Malagasy tdx 3.81
|
| 545 |
+
te Telugu tel 230.21
|
| 546 |
+
tg Tajik tgk 9.23
|
| 547 |
+
tgc Tigak tgc 9.71
|
| 548 |
+
th Thai tha 10499.77
|
| 549 |
+
the Chitwania Tharu the 10.06
|
| 550 |
+
thq Kochila Tharu thq 10.28
|
| 551 |
+
thr Rana Tharu thr 9.99
|
| 552 |
+
thv Tahaggart Tamahaq thv 4.25
|
| 553 |
+
ti Tigrinya tir 0.08
|
| 554 |
+
tig Tigre tig 7.49
|
| 555 |
+
tio Teop tio 9.85
|
| 556 |
+
tk Turkmen tuk 2.86
|
| 557 |
+
tkg Tesaka Malagasy tkg 17.86
|
| 558 |
+
tkt Kathoriya Tharu tkt 10.64
|
| 559 |
+
tli Tlingit tli 0.41
|
| 560 |
+
tlp Filomena Mata-Coahuitlán Totonac tlp 11.35
|
| 561 |
+
tn Tswana tsn 4.24
|
| 562 |
+
tok Toki Pona tok 13.51
|
| 563 |
+
tpl Tlacoapa Me'phaa tpl 9.28
|
| 564 |
+
tpz Tinputz tpz 9.33
|
| 565 |
+
tqp Tomoip tqp 10.1
|
| 566 |
+
tr Turkish tur 125.36
|
| 567 |
+
trp Kok Borok trp 10.74
|
| 568 |
+
trq San Martín Itunyoso Triqui trq 8.29
|
| 569 |
+
trv Sediq trv 7.77
|
| 570 |
+
trw Torwali trw 14.98
|
| 571 |
+
tt Tatar tat 30.03
|
| 572 |
+
ttj Tooro ttj 10.31
|
| 573 |
+
ttr Tera ttr 9.89
|
| 574 |
+
ttu Torau ttu 9.87
|
| 575 |
+
tui Tupuri tui 9.26
|
| 576 |
+
tul Tula tul 9.79
|
| 577 |
+
tuq Tedaga tuq 10.0
|
| 578 |
+
tuv Turkana tuv 10.17
|
| 579 |
+
tuy Tugen tuy 8.79
|
| 580 |
+
tvo Tidore tvo 10.31
|
| 581 |
+
tvu Tunen tvu 9.85
|
| 582 |
+
tw Twi twi 0.25
|
| 583 |
+
twu Termanu twu 11.45
|
| 584 |
+
txs Tonsea txs 9.32
|
| 585 |
+
txy Tanosy Malagasy txy 12.07
|
| 586 |
+
udl Wuzlam udl 9.23
|
| 587 |
+
ug Uighur uig 428.77
|
| 588 |
+
uk Ukrainian ukr 1851.97
|
| 589 |
+
uki Kui (India) uki 10.77
|
| 590 |
+
umb Umbundu umb 10.59
|
| 591 |
+
ur Urdu urd 211.27
|
| 592 |
+
ush Ushojo ush 6.36
|
| 593 |
+
uz Uzbek uzb 115.28
|
| 594 |
+
uzn Northern Uzbek uzn 15.23
|
| 595 |
+
vai Vai vai 8.76
|
| 596 |
+
var Huarijio var 9.28
|
| 597 |
+
ver Mom Jango ver 10.93
|
| 598 |
+
vi Vietnamese vie 8481.98
|
| 599 |
+
vmc Juxtlahuaca Mixtec vmc 9.43
|
| 600 |
+
vmj Ixtayutla Mixtec vmj 10.17
|
| 601 |
+
vmm Mitlatongo Mixtec vmm 9.95
|
| 602 |
+
vmp Soyaltepec Mazatec vmp 10.17
|
| 603 |
+
vmz Mazatlán Mazatec vmz 9.82
|
| 604 |
+
vot Votic vot 0.1
|
| 605 |
+
vro Võro vro 15.66
|
| 606 |
+
wbl Wakhi wbl 11.67
|
| 607 |
+
wci Waci Gbe wci 8.02
|
| 608 |
+
weo Wemale weo 9.09
|
| 609 |
+
wes Cameroon Pidgin wes 10.06
|
| 610 |
+
wja Waja wja 10.22
|
| 611 |
+
wji Warji wji 11.39
|
| 612 |
+
wo Wolof wol 8.71
|
| 613 |
+
wof Gambian Wolof wof 9.46
|
| 614 |
+
xh Xhosa xho 13.35
|
| 615 |
+
xhe Khetrani xhe 9.4
|
| 616 |
+
xka Kalkoti xka 8.0
|
| 617 |
+
xmf Mingrelian xmf 11.47
|
| 618 |
+
xmv Antankarana Malagasy xmv 17.9
|
| 619 |
+
xmw Tsimihety Malagasy xmw 11.53
|
| 620 |
+
xpe Liberia Kpelle xpe 9.5
|
| 621 |
+
xti Sinicahua Mixtec xti 9.5
|
| 622 |
+
xtu Cuyamecalco Mixtec xtu 9.4
|
| 623 |
+
yaq Yaqui yaq 9.93
|
| 624 |
+
yav Yangben yav 8.7
|
| 625 |
+
yay Agwagwune yay 8.26
|
| 626 |
+
ydd Eastern Yiddish ydd 18.43
|
| 627 |
+
ydg Yidgha ydg 9.89
|
| 628 |
+
yer Tarok yer 10.08
|
| 629 |
+
yes Nyankpa yes 10.26
|
| 630 |
+
yi Yiddish yid 1.81
|
| 631 |
+
yo Yoruba yor 15.66
|
| 632 |
+
yue Cantonese yue 13302.38
|
| 633 |
+
zga Kinga zga 9.5
|
| 634 |
+
zgh Standard Moroccan Tamazight zgh 1.19
|
| 635 |
+
zh Chinese cmn 111343.3
|
| 636 |
+
zoc Copainalá Zoque zoc 10.07
|
| 637 |
+
zoh Chimalapa Zoque zoh 9.35
|
| 638 |
+
zor Rayón Zoque zor 9.04
|
| 639 |
+
zpv Chichicapan Zapotec zpv 9.85
|
| 640 |
+
zpy Mazaltepec Zapotec zpy 9.47
|
| 641 |
+
ztg Xanaguía Zapotec ztg 9.86
|
| 642 |
+
ztn Santa Catarina Albarradas Zapotec ztn 10.02
|
| 643 |
+
ztp Loxicha Zapotec ztp 9.62
|
| 644 |
+
zts Tilquiapan Zapotec zts 9.33
|
| 645 |
+
ztu Güilá Zapotec ztu 9.17
|
| 646 |
+
zu Zulu zul 14.83
|
| 647 |
+
zza Zaza zza 1.52
|
docs/languages.md
ADDED
|
@@ -0,0 +1,659 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Supported Languages
|
| 2 |
+
|
| 3 |
+
OmniVoice supports **646 languages** with a total of **581k hours** of training data.
|
| 4 |
+
|
| 5 |
+
The table below lists each language with its OmniVoice language ID,
|
| 6 |
+
ISO 639-3 code, and training data duration (hours).
|
| 7 |
+
|
| 8 |
+
| # | Language | OmniVoice ID | ISO 639-3 | Duration (h) |
|
| 9 |
+
|--:|----------|:------------:|:---------:|:------------:|
|
| 10 |
+
| 1 | Abadi | kbt | kbt | 9.73 |
|
| 11 |
+
| 2 | Abkhazian | ab | abk | 57.27 |
|
| 12 |
+
| 3 | Abron | abr | abr | 9.22 |
|
| 13 |
+
| 4 | Abua | abn | abn | 10.27 |
|
| 14 |
+
| 5 | Adamawa Fulfulde | fub | fub | 13.12 |
|
| 15 |
+
| 6 | Adyghe | ady | ady | 32.6 |
|
| 16 |
+
| 7 | Afade | aal | aal | 10.19 |
|
| 17 |
+
| 8 | Afrikaans | af | afr | 4.4 |
|
| 18 |
+
| 9 | Agwagwune | yay | yay | 8.26 |
|
| 19 |
+
| 10 | Aja (Benin) | ajg | ajg | 5.63 |
|
| 20 |
+
| 11 | Akebu | keu | keu | 9.1 |
|
| 21 |
+
| 12 | Alago | ala | ala | 11.04 |
|
| 22 |
+
| 13 | Albanian | sq | sqi | 8.59 |
|
| 23 |
+
| 14 | Algerian Arabic | arq | arq | 9.64 |
|
| 24 |
+
| 15 | Algerian Saharan Arabic | aao | aao | 2.02 |
|
| 25 |
+
| 16 | Ambo-Pasco Quechua | qva | qva | 9.59 |
|
| 26 |
+
| 17 | Ambonese Malay | abs | abs | 10.03 |
|
| 27 |
+
| 18 | Amdo Tibetan | adx | adx | 56.94 |
|
| 28 |
+
| 19 | Amharic | am | amh | 12.83 |
|
| 29 |
+
| 20 | Anaang | anw | anw | 9.65 |
|
| 30 |
+
| 21 | Angika | anp | anp | 10.65 |
|
| 31 |
+
| 22 | Antankarana Malagasy | xmv | xmv | 17.9 |
|
| 32 |
+
| 23 | Aragonese | an | arg | 16.4 |
|
| 33 |
+
| 24 | Arbëreshë Albanian | aae | aae | 6.11 |
|
| 34 |
+
| 25 | Arequipa-La Unión Quechua | qxu | qxu | 10.12 |
|
| 35 |
+
| 26 | Armenian | hy | hye | 42.15 |
|
| 36 |
+
| 27 | Ashe | ahs | ahs | 10.62 |
|
| 37 |
+
| 28 | Ashéninka Perené | prq | prq | 7.16 |
|
| 38 |
+
| 29 | Askopan | eiv | eiv | 10.44 |
|
| 39 |
+
| 30 | Assamese | as | asm | 270.85 |
|
| 40 |
+
| 31 | Asturian | ast | ast | 8.48 |
|
| 41 |
+
| 32 | Atayal | tay | tay | 7.02 |
|
| 42 |
+
| 33 | Awak | awo | awo | 10.22 |
|
| 43 |
+
| 34 | Ayacucho Quechua | quy | quy | 0.05 |
|
| 44 |
+
| 35 | Azerbaijani | az | aze | 9.84 |
|
| 45 |
+
| 36 | Baatonum | bba | bba | 10.53 |
|
| 46 |
+
| 37 | Bacama | bcy | bcy | 9.94 |
|
| 47 |
+
| 38 | Bade | bde | bde | 9.89 |
|
| 48 |
+
| 39 | Bafia | ksf | ksf | 16.43 |
|
| 49 |
+
| 40 | Bafut | bfd | bfd | 9.03 |
|
| 50 |
+
| 41 | Bagirmi Fulfulde | fui | fui | 15.04 |
|
| 51 |
+
| 42 | Bago-Kusuntu | bqg | bqg | 8.86 |
|
| 52 |
+
| 43 | Baharna Arabic | abv | abv | 10.41 |
|
| 53 |
+
| 44 | Bakoko | bkh | bkh | 6.0 |
|
| 54 |
+
| 45 | Balanta-Ganja | bjt | bjt | 9.41 |
|
| 55 |
+
| 46 | Balti | bft | bft | 16.28 |
|
| 56 |
+
| 47 | Bamenyam | bce | bce | 9.9 |
|
| 57 |
+
| 48 | Bamun | bax | bax | 10.24 |
|
| 58 |
+
| 49 | Bangwinji | bsj | bsj | 10.0 |
|
| 59 |
+
| 50 | Banjar | bjn | bjn | 11.68 |
|
| 60 |
+
| 51 | Bankon | abb | abb | 11.2 |
|
| 61 |
+
| 52 | Baoulé | bci | bci | 10.21 |
|
| 62 |
+
| 53 | Bara Malagasy | bhr | bhr | 12.14 |
|
| 63 |
+
| 54 | Barok | bjk | bjk | 10.16 |
|
| 64 |
+
| 55 | Basa (Cameroon) | bas | bas | 10.66 |
|
| 65 |
+
| 56 | Basa (Nigeria) | bzw | bzw | 10.27 |
|
| 66 |
+
| 57 | Bashkir | ba | bak | 249.1 |
|
| 67 |
+
| 58 | Basque | eu | eus | 479.86 |
|
| 68 |
+
| 59 | Batak Mandailing | btm | btm | 11.09 |
|
| 69 |
+
| 60 | Batanga | bnm | bnm | 15.01 |
|
| 70 |
+
| 61 | Bateri | btv | btv | 9.8 |
|
| 71 |
+
| 62 | Bats | bbl | bbl | 11.22 |
|
| 72 |
+
| 63 | Bayot | bda | bda | 9.47 |
|
| 73 |
+
| 64 | Bebele | beb | beb | 7.52 |
|
| 74 |
+
| 65 | Belarusian | be | bel | 1809.43 |
|
| 75 |
+
| 66 | Bengali | bn | ben | 271.76 |
|
| 76 |
+
| 67 | Betawi | bew | bew | 11.15 |
|
| 77 |
+
| 68 | Bhili | bhb | bhb | 9.98 |
|
| 78 |
+
| 69 | Bhojpuri | bho | bho | 10.05 |
|
| 79 |
+
| 70 | Bilur | bxf | bxf | 10.84 |
|
| 80 |
+
| 71 | Bima | bhp | bhp | 10.67 |
|
| 81 |
+
| 72 | Bodo | brx | brx | 231.57 |
|
| 82 |
+
| 73 | Boghom | bux | bux | 10.48 |
|
| 83 |
+
| 74 | Bokyi | bky | bky | 9.85 |
|
| 84 |
+
| 75 | Bomu | bmq | bmq | 10.68 |
|
| 85 |
+
| 76 | Bondei | bou | bou | 9.98 |
|
| 86 |
+
| 77 | Borgu Fulfulde | fue | fue | 20.1 |
|
| 87 |
+
| 78 | Bosnian | bs | bos | 690.73 |
|
| 88 |
+
| 79 | Brahui | brh | brh | 19.89 |
|
| 89 |
+
| 80 | Braj | bra | bra | 10.68 |
|
| 90 |
+
| 81 | Breton | br | bre | 25.48 |
|
| 91 |
+
| 82 | Buduma | bdm | bdm | 10.17 |
|
| 92 |
+
| 83 | Buginese | bug | bug | 11.09 |
|
| 93 |
+
| 84 | Bukharic | bhh | bhh | 11.38 |
|
| 94 |
+
| 85 | Bulgarian | bg | bul | 2190.76 |
|
| 95 |
+
| 86 | Bulu (Cameroon) | bum | bum | 9.06 |
|
| 96 |
+
| 87 | Bundeli | bns | bns | 10.88 |
|
| 97 |
+
| 88 | Bunun | bnn | bnn | 9.26 |
|
| 98 |
+
| 89 | Bura-Pabir | bwr | bwr | 10.4 |
|
| 99 |
+
| 90 | Burak | bys | bys | 9.92 |
|
| 100 |
+
| 91 | Burmese | my | mya | 12.14 |
|
| 101 |
+
| 92 | Burushaski | bsk | bsk | 9.14 |
|
| 102 |
+
| 93 | Cacaloxtepec Mixtec | miu | miu | 9.18 |
|
| 103 |
+
| 94 | Cajatambo North Lima Quechua | qvl | qvl | 9.95 |
|
| 104 |
+
| 95 | Cakfem-Mushere | cky | cky | 8.96 |
|
| 105 |
+
| 96 | Cameroon Pidgin | wes | wes | 10.06 |
|
| 106 |
+
| 97 | Campidanese Sardinian | sro | sro | 10.16 |
|
| 107 |
+
| 98 | Cantonese | yue | yue | 13302.38 |
|
| 108 |
+
| 99 | Catalan | ca | cat | 3358.6 |
|
| 109 |
+
| 100 | Cebuano | ceb | ceb | 12.17 |
|
| 110 |
+
| 101 | Cen | cen | cen | 9.85 |
|
| 111 |
+
| 102 | Central Kurdish | ckb | ckb | 137.52 |
|
| 112 |
+
| 103 | Central Nahuatl | nhn | nhn | 9.51 |
|
| 113 |
+
| 104 | Central Pame | pbs | pbs | 9.69 |
|
| 114 |
+
| 105 | Central Pashto | pst | pst | 11.4 |
|
| 115 |
+
| 106 | Central Puebla Nahuatl | ncx | ncx | 9.86 |
|
| 116 |
+
| 107 | Central Tarahumara | tar | tar | 9.73 |
|
| 117 |
+
| 108 | Central Yupik | esu | esu | 2.18 |
|
| 118 |
+
| 109 | Central-Eastern Niger Fulfulde | fuq | fuq | 9.28 |
|
| 119 |
+
| 110 | Chadian Arabic | shu | shu | 2.29 |
|
| 120 |
+
| 111 | Chichewa | ny | nya | 10.8 |
|
| 121 |
+
| 112 | Chichicapan Zapotec | zpv | zpv | 9.85 |
|
| 122 |
+
| 113 | Chiga | cgg | cgg | 10.84 |
|
| 123 |
+
| 114 | Chimalapa Zoque | zoh | zoh | 9.35 |
|
| 124 |
+
| 115 | Chimborazo Highland Quichua | qug | qug | 10.12 |
|
| 125 |
+
| 116 | Chinese | zh | cmn | 111343.3 |
|
| 126 |
+
| 117 | Chiquián Ancash Quechua | qxa | qxa | 9.99 |
|
| 127 |
+
| 118 | Chitwania Tharu | the | the | 10.06 |
|
| 128 |
+
| 119 | Chokwe | cjk | cjk | 11.01 |
|
| 129 |
+
| 120 | Chuvash | cv | chv | 23.96 |
|
| 130 |
+
| 121 | Cibak | ckl | ckl | 10.91 |
|
| 131 |
+
| 122 | Coastal Konjo | kjc | kjc | 10.18 |
|
| 132 |
+
| 123 | Copainalá Zoque | zoc | zoc | 10.07 |
|
| 133 |
+
| 124 | Cornish | kw | cor | 12.15 |
|
| 134 |
+
| 125 | Corongo Ancash Quechua | qwa | qwa | 9.72 |
|
| 135 |
+
| 126 | Croatian | hr | hrv | 2795.31 |
|
| 136 |
+
| 127 | Cross River Mbembe | mfn | mfn | 10.03 |
|
| 137 |
+
| 128 | Cuyamecalco Mixtec | xtu | xtu | 9.4 |
|
| 138 |
+
| 129 | Czech | cs | ces | 148.13 |
|
| 139 |
+
| 130 | Dadiya | dbd | dbd | 9.61 |
|
| 140 |
+
| 131 | Dagbani | dag | dag | 10.14 |
|
| 141 |
+
| 132 | Dameli | dml | dml | 9.18 |
|
| 142 |
+
| 133 | Danish | da | dan | 1665.98 |
|
| 143 |
+
| 134 | Dargwa | dar | dar | 1.22 |
|
| 144 |
+
| 135 | Dazaga | dzg | dzg | 9.96 |
|
| 145 |
+
| 136 | Deccan | dcc | dcc | 10.38 |
|
| 146 |
+
| 137 | Degema | deg | deg | 11.07 |
|
| 147 |
+
| 138 | Dera (Nigeria) | kna | kna | 11.91 |
|
| 148 |
+
| 139 | Dghwede | dgh | dgh | 9.95 |
|
| 149 |
+
| 140 | Dhatki | mki | mki | 8.83 |
|
| 150 |
+
| 141 | Dhivehi | dv | div | 38.61 |
|
| 151 |
+
| 142 | Dhofari Arabic | adf | adf | 0.31 |
|
| 152 |
+
| 143 | Dijim-Bwilim | cfa | cfa | 10.32 |
|
| 153 |
+
| 144 | Dogri | dgo | dgo | 117.04 |
|
| 154 |
+
| 145 | Domaaki | dmk | dmk | 6.38 |
|
| 155 |
+
| 146 | Dotyali | dty | dty | 10.85 |
|
| 156 |
+
| 147 | Duala | dua | dua | 12.13 |
|
| 157 |
+
| 148 | Dutch | nl | nld | 2264.13 |
|
| 158 |
+
| 149 | Dũya | ldb | ldb | 11.31 |
|
| 159 |
+
| 150 | Dyula | dyu | dyu | 0.34 |
|
| 160 |
+
| 151 | Eastern Balochi | bgp | bgp | 10.98 |
|
| 161 |
+
| 152 | Eastern Bolivian Guaraní | gui | gui | 22.72 |
|
| 162 |
+
| 153 | Eastern Egyptian Bedawi Arabic | avl | avl | 1.86 |
|
| 163 |
+
| 154 | Eastern Krahn | kqo | kqo | 9.28 |
|
| 164 |
+
| 155 | Eastern Mari | mhr | mhr | 272.31 |
|
| 165 |
+
| 156 | Eastern Yiddish | ydd | ydd | 18.43 |
|
| 166 |
+
| 157 | Ebrié | ebr | ebr | 1.5 |
|
| 167 |
+
| 158 | Eggon | ego | ego | 9.95 |
|
| 168 |
+
| 159 | Egyptian Arabic | arz | arz | 23.23 |
|
| 169 |
+
| 160 | Ejagham | etu | etu | 10.3 |
|
| 170 |
+
| 161 | Eleme | elm | elm | 11.27 |
|
| 171 |
+
| 162 | Eloyi | afo | afo | 11.21 |
|
| 172 |
+
| 163 | Embu | ebu | ebu | 9.81 |
|
| 173 |
+
| 164 | English | en | eng | 206061.1 |
|
| 174 |
+
| 165 | Erzya | myv | myv | 3.1 |
|
| 175 |
+
| 166 | Esan | ish | ish | 10.05 |
|
| 176 |
+
| 167 | Esperanto | eo | epo | 1396.64 |
|
| 177 |
+
| 168 | Estonian | et | est | 960.37 |
|
| 178 |
+
| 169 | Eton (Cameroon) | eto | eto | 7.43 |
|
| 179 |
+
| 170 | Ewondo | ewo | ewo | 12.71 |
|
| 180 |
+
| 171 | Extremaduran | ext | ext | 13.59 |
|
| 181 |
+
| 172 | Fang (Equatorial Guinea) | fan | fan | 3.51 |
|
| 182 |
+
| 173 | Fanti | fat | fat | 11.38 |
|
| 183 |
+
| 174 | Farefare | gur | gur | 9.24 |
|
| 184 |
+
| 175 | Fe'fe' | fmp | fmp | 9.86 |
|
| 185 |
+
| 176 | Filipino | fil | fil | 7.71 |
|
| 186 |
+
| 177 | Filomena Mata-Coahuitlán Totonac | tlp | tlp | 11.35 |
|
| 187 |
+
| 178 | Finnish | fi | fin | 468.62 |
|
| 188 |
+
| 179 | Fipa | fip | fip | 10.55 |
|
| 189 |
+
| 180 | French | fr | fra | 23675.32 |
|
| 190 |
+
| 181 | Fulah | ff | ful | 13.84 |
|
| 191 |
+
| 182 | Galician | gl | glg | 208.81 |
|
| 192 |
+
| 183 | Gambian Wolof | wof | wof | 9.46 |
|
| 193 |
+
| 184 | Ganda | lg | lug | 447.82 |
|
| 194 |
+
| 185 | Garhwali | gbm | gbm | 19.14 |
|
| 195 |
+
| 186 | Gawar-Bati | gwt | gwt | 12.16 |
|
| 196 |
+
| 187 | Gawri | gwc | gwc | 10.83 |
|
| 197 |
+
| 188 | Gbagyi | gbr | gbr | 12.12 |
|
| 198 |
+
| 189 | Gbari | gby | gby | 12.59 |
|
| 199 |
+
| 190 | Geji | gyz | gyz | 10.49 |
|
| 200 |
+
| 191 | Gen | gej | gej | 5.39 |
|
| 201 |
+
| 192 | Georgian | ka | kat | 156.96 |
|
| 202 |
+
| 193 | German | de | deu | 21927.13 |
|
| 203 |
+
| 194 | Geser-Gorom | ges | ges | 10.08 |
|
| 204 |
+
| 195 | Gheg Albanian | aln | aln | 3.92 |
|
| 205 |
+
| 196 | Ghomálá' | bbj | bbj | 7.32 |
|
| 206 |
+
| 197 | Gidar | gid | gid | 10.06 |
|
| 207 |
+
| 198 | Glavda | glw | glw | 10.51 |
|
| 208 |
+
| 199 | Goan Konkani | gom | gom | 9.82 |
|
| 209 |
+
| 200 | Goaria | gig | gig | 9.41 |
|
| 210 |
+
| 201 | Goemai | ank | ank | 10.0 |
|
| 211 |
+
| 202 | Gola | gol | gol | 9.26 |
|
| 212 |
+
| 203 | Greek | el | ell | 2412.54 |
|
| 213 |
+
| 204 | Guarani | gn | grn | 4.06 |
|
| 214 |
+
| 205 | Guduf-Gava | gdf | gdf | 12.21 |
|
| 215 |
+
| 206 | Guerrero Amuzgo | amu | amu | 10.1 |
|
| 216 |
+
| 207 | Gujarati | gu | guj | 91.18 |
|
| 217 |
+
| 208 | Gujari | gju | gju | 8.66 |
|
| 218 |
+
| 209 | Gulf Arabic | afb | afb | 98.55 |
|
| 219 |
+
| 210 | Gurgula | ggg | ggg | 7.12 |
|
| 220 |
+
| 211 | Gusii | guz | guz | 9.5 |
|
| 221 |
+
| 212 | Gusilay | gsl | gsl | 10.0 |
|
| 222 |
+
| 213 | Gweno | gwe | gwe | 8.87 |
|
| 223 |
+
| 214 | Güilá Zapotec | ztu | ztu | 9.17 |
|
| 224 |
+
| 215 | Hadothi | hoj | hoj | 10.08 |
|
| 225 |
+
| 216 | Hahon | hah | hah | 9.64 |
|
| 226 |
+
| 217 | Haitian | ht | hat | 0.04 |
|
| 227 |
+
| 218 | Hakha Chin | cnh | cnh | 2.24 |
|
| 228 |
+
| 219 | Hakö | hao | hao | 8.56 |
|
| 229 |
+
| 220 | Halia | hla | hla | 9.86 |
|
| 230 |
+
| 221 | Hausa | ha | hau | 17.75 |
|
| 231 |
+
| 222 | Hawaiian | haw | haw | 11.79 |
|
| 232 |
+
| 223 | Hazaragi | haz | haz | 9.69 |
|
| 233 |
+
| 224 | Hebrew | he | heb | 13.4 |
|
| 234 |
+
| 225 | Hemba | hem | hem | 9.53 |
|
| 235 |
+
| 226 | Herero | hz | her | 9.59 |
|
| 236 |
+
| 227 | Highland Konjo | kjk | kjk | 10.21 |
|
| 237 |
+
| 228 | Hijazi Arabic | acw | acw | 22.32 |
|
| 238 |
+
| 229 | Hindi | hi | hin | 117.17 |
|
| 239 |
+
| 230 | Huarijio | var | var | 9.28 |
|
| 240 |
+
| 231 | Huautla Mazatec | mau | mau | 6.39 |
|
| 241 |
+
| 232 | Huaxcaleca Nahuatl | nhq | nhq | 5.07 |
|
| 242 |
+
| 233 | Huba | hbb | hbb | 10.7 |
|
| 243 |
+
| 234 | Huitepec Mixtec | mxs | mxs | 9.64 |
|
| 244 |
+
| 235 | Hula | hul | hul | 10.33 |
|
| 245 |
+
| 236 | Hungarian | hu | hun | 255.83 |
|
| 246 |
+
| 237 | Hunjara-Kaina Ke | hkk | hkk | 8.69 |
|
| 247 |
+
| 238 | Hwana | hwo | hwo | 11.23 |
|
| 248 |
+
| 239 | Ibibio | ibb | ibb | 7.38 |
|
| 249 |
+
| 240 | Icelandic | is | isl | 647.29 |
|
| 250 |
+
| 241 | Idakho-Isukha-Tiriki | ida | ida | 9.31 |
|
| 251 |
+
| 242 | Idoma | idu | idu | 11.16 |
|
| 252 |
+
| 243 | Igbo | ig | ibo | 13.69 |
|
| 253 |
+
| 244 | Igo | ahl | ahl | 9.22 |
|
| 254 |
+
| 245 | Ikposo | kpo | kpo | 7.83 |
|
| 255 |
+
| 246 | Ikwere | ikw | ikw | 10.0 |
|
| 256 |
+
| 247 | Imbabura Highland Quichua | qvi | qvi | 11.0 |
|
| 257 |
+
| 248 | Indonesian | id | ind | 6327.87 |
|
| 258 |
+
| 249 | Indus Kohistani | mvy | mvy | 21.64 |
|
| 259 |
+
| 250 | Interlingua (International Auxiliary Language Association) | ia | ina | 13.48 |
|
| 260 |
+
| 251 | Inupiaq | ik | ipk | 2.11 |
|
| 261 |
+
| 252 | Irish | ga | gle | 21.4 |
|
| 262 |
+
| 253 | Iron Ossetic | os | oss | 1.38 |
|
| 263 |
+
| 254 | Isekiri | its | its | 11.85 |
|
| 264 |
+
| 255 | Isoko | iso | iso | 10.33 |
|
| 265 |
+
| 256 | Italian | it | ita | 9402.46 |
|
| 266 |
+
| 257 | Ito | itw | itw | 9.19 |
|
| 267 |
+
| 258 | Itzá | itz | itz | 7.08 |
|
| 268 |
+
| 259 | Ixtayutla Mixtec | vmj | vmj | 10.17 |
|
| 269 |
+
| 260 | Izon | ijc | ijc | 9.95 |
|
| 270 |
+
| 261 | Jambi Malay | jax | jax | 10.29 |
|
| 271 |
+
| 262 | Japanese | ja | jpn | 36914.4 |
|
| 272 |
+
| 263 | Jaqaru | jqr | jqr | 9.32 |
|
| 273 |
+
| 264 | Jauja Wanca Quechua | qxw | qxw | 11.42 |
|
| 274 |
+
| 265 | Jaunsari | jns | jns | 11.25 |
|
| 275 |
+
| 266 | Javanese | jv | jav | 11.19 |
|
| 276 |
+
| 267 | Jiba | juo | juo | 10.43 |
|
| 277 |
+
| 268 | Jju | kaj | kaj | 10.16 |
|
| 278 |
+
| 269 | Judeo-Moroccan Arabic | aju | aju | 7.21 |
|
| 279 |
+
| 270 | Juxtlahuaca Mixtec | vmc | vmc | 9.43 |
|
| 280 |
+
| 271 | Kabardian | kbd | kbd | 108.35 |
|
| 281 |
+
| 272 | Kabras | lkb | lkb | 9.99 |
|
| 282 |
+
| 273 | Kabuverdianu | kea | kea | 10.51 |
|
| 283 |
+
| 274 | Kabyle | kab | kab | 529.52 |
|
| 284 |
+
| 275 | Kachi Koli | gjk | gjk | 20.83 |
|
| 285 |
+
| 276 | Kairak | ckr | ckr | 10.51 |
|
| 286 |
+
| 277 | Kalabari | ijn | ijn | 11.04 |
|
| 287 |
+
| 278 | Kalasha | kls | kls | 9.11 |
|
| 288 |
+
| 279 | Kalenjin | kln | kln | 40.42 |
|
| 289 |
+
| 280 | Kalkoti | xka | xka | 8.0 |
|
| 290 |
+
| 281 | Kamba | kam | kam | 14.72 |
|
| 291 |
+
| 282 | Kamo | kcq | kcq | 10.49 |
|
| 292 |
+
| 283 | Kanauji | bjj | bjj | 11.01 |
|
| 293 |
+
| 284 | Kanembu | kbl | kbl | 10.19 |
|
| 294 |
+
| 285 | Kannada | kn | kan | 128.06 |
|
| 295 |
+
| 286 | Karekare | kai | kai | 10.52 |
|
| 296 |
+
| 287 | Kashmiri | ks | kas | 110.42 |
|
| 297 |
+
| 288 | Kathoriya Tharu | tkt | tkt | 10.64 |
|
| 298 |
+
| 289 | Kati | bsh | bsh | 8.77 |
|
| 299 |
+
| 290 | Kazakh | kk | kaz | 1537.29 |
|
| 300 |
+
| 291 | Keiyo | eyo | eyo | 9.24 |
|
| 301 |
+
| 292 | Khams Tibetan | khg | khg | 6.38 |
|
| 302 |
+
| 293 | Khana | ogo | ogo | 10.51 |
|
| 303 |
+
| 294 | Khetrani | xhe | xhe | 9.4 |
|
| 304 |
+
| 295 | Khmer | km | khm | 7.1 |
|
| 305 |
+
| 296 | Khowar | khw | khw | 15.55 |
|
| 306 |
+
| 297 | Kinga | zga | zga | 9.5 |
|
| 307 |
+
| 298 | Kinnauri | kfk | kfk | 10.32 |
|
| 308 |
+
| 299 | Kinyarwanda | rw | kin | 2021.66 |
|
| 309 |
+
| 300 | Kirghiz | ky | kir | 46.63 |
|
| 310 |
+
| 301 | Kirya-Konzəl | fkk | fkk | 9.98 |
|
| 311 |
+
| 302 | Kochila Tharu | thq | thq | 10.28 |
|
| 312 |
+
| 303 | Kohistani Shina | plk | plk | 12.75 |
|
| 313 |
+
| 304 | Kohumono | bcs | bcs | 10.45 |
|
| 314 |
+
| 305 | Kok Borok | trp | trp | 10.74 |
|
| 315 |
+
| 306 | Kol (Papua New Guinea) | kol | kol | 9.95 |
|
| 316 |
+
| 307 | Kom (Cameroon) | bkm | bkm | 10.76 |
|
| 317 |
+
| 308 | Koma | kmy | kmy | 10.28 |
|
| 318 |
+
| 309 | Konkani | knn | knn | 112.83 |
|
| 319 |
+
| 310 | Konzo | koo | koo | 13.23 |
|
| 320 |
+
| 311 | Korean | ko | kor | 8609.28 |
|
| 321 |
+
| 312 | Korwa | kfp | kfp | 11.87 |
|
| 322 |
+
| 313 | Kota (India) | kfe | kfe | 10.25 |
|
| 323 |
+
| 314 | Koti | eko | eko | 8.15 |
|
| 324 |
+
| 315 | Kuanua | ksd | ksd | 9.91 |
|
| 325 |
+
| 316 | Kuanyama | kj | kua | 9.88 |
|
| 326 |
+
| 317 | Kui (India) | uki | uki | 10.77 |
|
| 327 |
+
| 318 | Kulung (Nigeria) | bbu | bbu | 10.39 |
|
| 328 |
+
| 319 | Kuot | kto | kto | 9.77 |
|
| 329 |
+
| 320 | Kushi | kuh | kuh | 10.35 |
|
| 330 |
+
| 321 | Kwambi | kwm | kwm | 9.9 |
|
| 331 |
+
| 322 | Kwasio | nmg | nmg | 10.39 |
|
| 332 |
+
| 323 | Lala-Roba | lla | lla | 10.38 |
|
| 333 |
+
| 324 | Lamang | hia | hia | 11.07 |
|
| 334 |
+
| 325 | Lao | lo | lao | 7.63 |
|
| 335 |
+
| 326 | Larike-Wakasihu | alo | alo | 9.97 |
|
| 336 |
+
| 327 | Lasi | lss | lss | 6.53 |
|
| 337 |
+
| 328 | Latgalian | ltg | ltg | 27.23 |
|
| 338 |
+
| 329 | Latvian | lv | lav | 1441.58 |
|
| 339 |
+
| 330 | Levantine Arabic | apc | apc | 15.65 |
|
| 340 |
+
| 331 | Liana-Seti | ste | ste | 10.43 |
|
| 341 |
+
| 332 | Liberia Kpelle | xpe | xpe | 9.5 |
|
| 342 |
+
| 333 | Liberian English | lir | lir | 10.26 |
|
| 343 |
+
| 334 | Libyan Arabic | ayl | ayl | 20.13 |
|
| 344 |
+
| 335 | Ligurian | lij | lij | 15.97 |
|
| 345 |
+
| 336 | Lijili | mgi | mgi | 10.89 |
|
| 346 |
+
| 337 | Lingala | ln | lin | 17.99 |
|
| 347 |
+
| 338 | Lithuanian | lt | lit | 2629.45 |
|
| 348 |
+
| 339 | Loarki | lrk | lrk | 10.5 |
|
| 349 |
+
| 340 | Logooli | rag | rag | 9.39 |
|
| 350 |
+
| 341 | Logudorese Sardinian | src | src | 10.67 |
|
| 351 |
+
| 342 | Loja Highland Quichua | qvj | qvj | 10.59 |
|
| 352 |
+
| 343 | Loloda | loa | loa | 9.31 |
|
| 353 |
+
| 344 | Longuda | lnu | lnu | 10.46 |
|
| 354 |
+
| 345 | Loxicha Zapotec | ztp | ztp | 9.62 |
|
| 355 |
+
| 346 | Luba-Lulua | lua | lua | 8.47 |
|
| 356 |
+
| 347 | Luo | luo | luo | 36.17 |
|
| 357 |
+
| 348 | Lushai | lus | lus | 20.24 |
|
| 358 |
+
| 349 | Luxembourgish | lb | ltz | 8.46 |
|
| 359 |
+
| 350 | Maasina Fulfulde | ffm | ffm | 10.46 |
|
| 360 |
+
| 351 | Maba (Chad) | mde | mde | 9.5 |
|
| 361 |
+
| 352 | Macedo-Romanian | rup | rup | 0.02 |
|
| 362 |
+
| 353 | Macedonian | mk | mkd | 27.21 |
|
| 363 |
+
| 354 | Mada (Cameroon) | mxu | mxu | 12.0 |
|
| 364 |
+
| 355 | Mafa | maf | maf | 9.97 |
|
| 365 |
+
| 356 | Maithili | mai | mai | 131.37 |
|
| 366 |
+
| 357 | Malay | ms | msa | 9.57 |
|
| 367 |
+
| 358 | Malayalam | ml | mal | 166.57 |
|
| 368 |
+
| 359 | Mali | gcc | gcc | 9.87 |
|
| 369 |
+
| 360 | Malinaltepec Me'phaa | tcf | tcf | 9.04 |
|
| 370 |
+
| 361 | Maltese | mt | mlt | 630.29 |
|
| 371 |
+
| 362 | Mandara | tbf | tbf | 10.01 |
|
| 372 |
+
| 363 | Mandjak | mfv | mfv | 9.55 |
|
| 373 |
+
| 364 | Manggarai | mqy | mqy | 10.5 |
|
| 374 |
+
| 365 | Manipuri | mni | mni | 44.46 |
|
| 375 |
+
| 366 | Mansoanka | msw | msw | 9.32 |
|
| 376 |
+
| 367 | Manx | gv | glv | 10.07 |
|
| 377 |
+
| 368 | Maori | mi | mri | 18.02 |
|
| 378 |
+
| 369 | Marathi | mr | mar | 156.71 |
|
| 379 |
+
| 370 | Marghi Central | mrt | mrt | 10.36 |
|
| 380 |
+
| 371 | Marghi South | mfm | mfm | 10.05 |
|
| 381 |
+
| 372 | Maria (India) | mrr | mrr | 11.0 |
|
| 382 |
+
| 373 | Marwari (Pakistan) | mve | mve | 9.96 |
|
| 383 |
+
| 374 | Masana | mcn | mcn | 10.09 |
|
| 384 |
+
| 375 | Masikoro Malagasy | msh | msh | 14.16 |
|
| 385 |
+
| 376 | Matsés | mcf | mcf | 9.61 |
|
| 386 |
+
| 377 | Mazaltepec Zapotec | zpy | zpy | 9.47 |
|
| 387 |
+
| 378 | Mazatlán Mazatec | vmz | vmz | 9.82 |
|
| 388 |
+
| 379 | Mazatlán Mixe | mzl | mzl | 10.05 |
|
| 389 |
+
| 380 | Mbe | mfo | mfo | 10.24 |
|
| 390 |
+
| 381 | Mbo (Cameroon) | mbo | mbo | 9.51 |
|
| 391 |
+
| 382 | Mbum | mdd | mdd | 9.82 |
|
| 392 |
+
| 383 | Medumba | byv | byv | 10.95 |
|
| 393 |
+
| 384 | Mekeo | mek | mek | 9.18 |
|
| 394 |
+
| 385 | Meru | mer | mer | 9.89 |
|
| 395 |
+
| 386 | Mesopotamian Arabic | acm | acm | 3.78 |
|
| 396 |
+
| 387 | Mewari | mtr | mtr | 10.58 |
|
| 397 |
+
| 388 | Min Nan Chinese | nan | nan | 17.55 |
|
| 398 |
+
| 389 | Mingrelian | xmf | xmf | 11.47 |
|
| 399 |
+
| 390 | Mitlatongo Mixtec | vmm | vmm | 9.95 |
|
| 400 |
+
| 391 | Miya | mkf | mkf | 10.16 |
|
| 401 |
+
| 392 | Mokpwe | bri | bri | 7.53 |
|
| 402 |
+
| 393 | Moksha | mdf | mdf | 0.47 |
|
| 403 |
+
| 394 | Mom Jango | ver | ver | 10.93 |
|
| 404 |
+
| 395 | Mongolian | mn | mon | 269.08 |
|
| 405 |
+
| 396 | Moroccan Arabic | ary | ary | 104.67 |
|
| 406 |
+
| 397 | Motu | meu | meu | 9.88 |
|
| 407 |
+
| 398 | Mpiemo | mcx | mcx | 9.88 |
|
| 408 |
+
| 399 | Mpumpong | mgg | mgg | 4.94 |
|
| 409 |
+
| 400 | Mundang | mua | mua | 9.2 |
|
| 410 |
+
| 401 | Mungaka | mhk | mhk | 7.53 |
|
| 411 |
+
| 402 | Musey | mse | mse | 7.21 |
|
| 412 |
+
| 403 | Musgu | mug | mug | 4.74 |
|
| 413 |
+
| 404 | Musi | mui | mui | 10.52 |
|
| 414 |
+
| 405 | Naba | mne | mne | 10.37 |
|
| 415 |
+
| 406 | Najdi Arabic | ars | ars | 203.54 |
|
| 416 |
+
| 407 | Nalik | nal | nal | 10.33 |
|
| 417 |
+
| 408 | Nawdm | nmz | nmz | 6.3 |
|
| 418 |
+
| 409 | Ndonga | ng | ndo | 9.08 |
|
| 419 |
+
| 410 | Neapolitan | nap | nap | 9.97 |
|
| 420 |
+
| 411 | Nepali | npi | npi | 171.5 |
|
| 421 |
+
| 412 | Ngamo | nbh | nbh | 10.04 |
|
| 422 |
+
| 413 | Ngas | anc | anc | 10.14 |
|
| 423 |
+
| 414 | Ngiemboon | nnh | nnh | 16.15 |
|
| 424 |
+
| 415 | Ngizim | ngi | ngi | 10.06 |
|
| 425 |
+
| 416 | Ngomba | jgo | jgo | 10.15 |
|
| 426 |
+
| 417 | Ngombale | nla | nla | 8.79 |
|
| 427 |
+
| 418 | Nigerian Fulfulde | fuv | fuv | 9.97 |
|
| 428 |
+
| 419 | Nigerian Pidgin | pcm | pcm | 11.04 |
|
| 429 |
+
| 420 | Nimadi | noe | noe | 11.12 |
|
| 430 |
+
| 421 | Nobiin | fia | fia | 9.96 |
|
| 431 |
+
| 422 | North Mesopotamian Arabic | ayp | ayp | 10.92 |
|
| 432 |
+
| 423 | North Moluccan Malay | max | max | 9.43 |
|
| 433 |
+
| 424 | Northern Betsimisaraka Malagasy | bmm | bmm | 19.12 |
|
| 434 |
+
| 425 | Northern Hindko | hno | hno | 20.04 |
|
| 435 |
+
| 426 | Northern Kurdish | kmr | kmr | 69.59 |
|
| 436 |
+
| 427 | Northern Pame | pmq | pmq | 10.24 |
|
| 437 |
+
| 428 | Northern Pashto | pbu | pbu | 11.03 |
|
| 438 |
+
| 429 | Northern Uzbek | uzn | uzn | 15.23 |
|
| 439 |
+
| 430 | Northwest Gbaya | gya | gya | 8.45 |
|
| 440 |
+
| 431 | Norwegian | no | nor | 3849.8 |
|
| 441 |
+
| 432 | Norwegian Bokmål | nb | nob | 12.7 |
|
| 442 |
+
| 433 | Norwegian Nynorsk | nn | nno | 1.54 |
|
| 443 |
+
| 434 | Notsi | ncf | ncf | 9.84 |
|
| 444 |
+
| 435 | Nyankpa | yes | yes | 10.26 |
|
| 445 |
+
| 436 | Nyungwe | nyu | nyu | 8.98 |
|
| 446 |
+
| 437 | Nzanyi | nja | nja | 10.02 |
|
| 447 |
+
| 438 | Nüpode Huitoto | hux | hux | 9.04 |
|
| 448 |
+
| 439 | Occitan | oc | oci | 16.8 |
|
| 449 |
+
| 440 | Od | odk | odk | 20.26 |
|
| 450 |
+
| 441 | Odia | ory | ory | 144.81 |
|
| 451 |
+
| 442 | Odual | odu | odu | 10.57 |
|
| 452 |
+
| 443 | Omani Arabic | acx | acx | 22.03 |
|
| 453 |
+
| 444 | Orizaba Nahuatl | nlv | nlv | 11.42 |
|
| 454 |
+
| 445 | Orma | orc | orc | 22.01 |
|
| 455 |
+
| 446 | Ormuri | oru | oru | 16.74 |
|
| 456 |
+
| 447 | Oromo | om | orm | 6.6 |
|
| 457 |
+
| 448 | Pahari-Potwari | phr | phr | 24.03 |
|
| 458 |
+
| 449 | Paiwan | pwn | pwn | 13.76 |
|
| 459 |
+
| 450 | Panjabi | pa | pan | 147.37 |
|
| 460 |
+
| 451 | Papuan Malay | pmy | pmy | 10.17 |
|
| 461 |
+
| 452 | Parkari Koli | kvx | kvx | 11.04 |
|
| 462 |
+
| 453 | Pedi | nso | nso | 12.64 |
|
| 463 |
+
| 454 | Pero | pip | pip | 9.85 |
|
| 464 |
+
| 455 | Persian | fa | fas | 366.07 |
|
| 465 |
+
| 456 | Petats | pex | pex | 10.2 |
|
| 466 |
+
| 457 | Phalura | phl | phl | 20.69 |
|
| 467 |
+
| 458 | Piemontese | pms | pms | 16.01 |
|
| 468 |
+
| 459 | Piya-Kwonci | piy | piy | 10.38 |
|
| 469 |
+
| 460 | Plateau Malagasy | plt | plt | 19.39 |
|
| 470 |
+
| 461 | Polish | pl | pol | 911.68 |
|
| 471 |
+
| 462 | Poqomam | poc | poc | 9.63 |
|
| 472 |
+
| 463 | Portuguese | pt | por | 16855.05 |
|
| 473 |
+
| 464 | Pulaar | fuc | fuc | 14.77 |
|
| 474 |
+
| 465 | Pular | fuf | fuf | 13.77 |
|
| 475 |
+
| 466 | Puno Quechua | qxp | qxp | 9.81 |
|
| 476 |
+
| 467 | Pushto | ps | pus | 88.62 |
|
| 477 |
+
| 468 | Pökoot | pko | pko | 10.4 |
|
| 478 |
+
| 469 | Qaqet | byx | byx | 9.79 |
|
| 479 |
+
| 470 | Quiotepec Chinantec | chq | chq | 9.76 |
|
| 480 |
+
| 471 | Rana Tharu | thr | thr | 9.99 |
|
| 481 |
+
| 472 | Rangi | lag | lag | 9.47 |
|
| 482 |
+
| 473 | Rapoisi | kyx | kyx | 9.17 |
|
| 483 |
+
| 474 | Ratahan | rth | rth | 9.34 |
|
| 484 |
+
| 475 | Rayón Zoque | zor | zor | 9.04 |
|
| 485 |
+
| 476 | Romanian | ro | ron | 70.23 |
|
| 486 |
+
| 477 | Romansh | rm | roh | 9.21 |
|
| 487 |
+
| 478 | Rombo | rof | rof | 18.9 |
|
| 488 |
+
| 479 | Rotokas | roo | roo | 9.07 |
|
| 489 |
+
| 480 | Rukai | dru | dru | 9.26 |
|
| 490 |
+
| 481 | Russian | ru | rus | 20338.5 |
|
| 491 |
+
| 482 | Sacapulteco | quv | quv | 8.9 |
|
| 492 |
+
| 483 | Saidi Arabic | aec | aec | 9.28 |
|
| 493 |
+
| 484 | Sakalava Malagasy | skg | skg | 9.02 |
|
| 494 |
+
| 485 | Sakizaya | szy | szy | 11.47 |
|
| 495 |
+
| 486 | Saleman | sau | sau | 10.53 |
|
| 496 |
+
| 487 | Samba Daka | ccg | ccg | 10.11 |
|
| 497 |
+
| 488 | Samba Leko | ndi | ndi | 11.27 |
|
| 498 |
+
| 489 | San Felipe Otlaltepec Popoloca | pow | pow | 8.84 |
|
| 499 |
+
| 490 | San Francisco Del Mar Huave | hue | hue | 9.45 |
|
| 500 |
+
| 491 | San Juan Atzingo Popoloca | poe | poe | 10.01 |
|
| 501 |
+
| 492 | San Martín Itunyoso Triqui | trq | trq | 8.29 |
|
| 502 |
+
| 493 | San Miguel El Grande Mixtec | mig | mig | 9.66 |
|
| 503 |
+
| 494 | Sansi | ssi | ssi | 10.47 |
|
| 504 |
+
| 495 | Sanskrit | sa | san | 84.44 |
|
| 505 |
+
| 496 | Santa Ana de Tusi Pasco Quechua | qxt | qxt | 10.05 |
|
| 506 |
+
| 497 | Santa Catarina Albarradas Zapotec | ztn | ztn | 10.02 |
|
| 507 |
+
| 498 | Santali | sat | sat | 98.37 |
|
| 508 |
+
| 499 | Santiago del Estero Quichua | qus | qus | 9.55 |
|
| 509 |
+
| 500 | Saposa | sps | sps | 9.81 |
|
| 510 |
+
| 501 | Saraiki | skr | skr | 4.13 |
|
| 511 |
+
| 502 | Sardinian | sc | srd | 2.77 |
|
| 512 |
+
| 503 | Saya | say | say | 10.02 |
|
| 513 |
+
| 504 | Sediq | trv | trv | 7.77 |
|
| 514 |
+
| 505 | Serbian | sr | srp | 1855.33 |
|
| 515 |
+
| 506 | Seri | sei | sei | 9.81 |
|
| 516 |
+
| 507 | Shina | scl | scl | 9.84 |
|
| 517 |
+
| 508 | Shona | sn | sna | 9.96 |
|
| 518 |
+
| 509 | Siar-Lak | sjr | sjr | 9.87 |
|
| 519 |
+
| 510 | Sibe | nco | nco | 9.96 |
|
| 520 |
+
| 511 | Sicilian | scn | scn | 13.35 |
|
| 521 |
+
| 512 | Sihuas Ancash Quechua | qws | qws | 10.18 |
|
| 522 |
+
| 513 | Sikkimese | sip | sip | 10.07 |
|
| 523 |
+
| 514 | Sinaugoro | snc | snc | 10.38 |
|
| 524 |
+
| 515 | Sindhi | sd | snd | 46.27 |
|
| 525 |
+
| 516 | Sindhi Bhil | sbn | sbn | 10.53 |
|
| 526 |
+
| 517 | Sinhala | si | sin | 11.98 |
|
| 527 |
+
| 518 | Sinicahua Mixtec | xti | xti | 9.5 |
|
| 528 |
+
| 519 | Sipacapense | qum | qum | 9.37 |
|
| 529 |
+
| 520 | Siwai | siw | siw | 10.47 |
|
| 530 |
+
| 521 | Slovak | sk | slk | 2478.46 |
|
| 531 |
+
| 522 | Slovenian | sl | slv | 1172.61 |
|
| 532 |
+
| 523 | Solos | sol | sol | 9.95 |
|
| 533 |
+
| 524 | Somali | so | som | 13.22 |
|
| 534 |
+
| 525 | Soninke | snk | snk | 10.04 |
|
| 535 |
+
| 526 | South Giziga | giz | giz | 10.03 |
|
| 536 |
+
| 527 | South Ucayali Ashéninka | cpy | cpy | 9.15 |
|
| 537 |
+
| 528 | Southeastern Nochixtlán Mixtec | mxy | mxy | 9.48 |
|
| 538 |
+
| 529 | Southern Betsimisaraka Malagasy | bzc | bzc | 17.45 |
|
| 539 |
+
| 530 | Southern Pashto | pbt | pbt | 11.6 |
|
| 540 |
+
| 531 | Southern Pastaza Quechua | qup | qup | 11.13 |
|
| 541 |
+
| 532 | Soyaltepec Mazatec | vmp | vmp | 10.17 |
|
| 542 |
+
| 533 | Spanish | es | spa | 27559.74 |
|
| 543 |
+
| 534 | Standard Arabic | arb | arb | 1483.53 |
|
| 544 |
+
| 535 | Standard Moroccan Tamazight | zgh | zgh | 1.19 |
|
| 545 |
+
| 536 | Sudanese Arabic | apd | apd | 9.93 |
|
| 546 |
+
| 537 | Sulka | sua | sua | 10.12 |
|
| 547 |
+
| 538 | Svan | sva | sva | 15.11 |
|
| 548 |
+
| 539 | Swahili | sw | swa | 418.41 |
|
| 549 |
+
| 540 | Swedish | sv | swe | 2453.14 |
|
| 550 |
+
| 541 | Tae' | rob | rob | 9.02 |
|
| 551 |
+
| 542 | Tahaggart Tamahaq | thv | thv | 4.25 |
|
| 552 |
+
| 543 | Taita | dav | dav | 9.12 |
|
| 553 |
+
| 544 | Tajik | tg | tgk | 9.23 |
|
| 554 |
+
| 545 | Tamil | ta | tam | 423.09 |
|
| 555 |
+
| 546 | Tandroy-Mahafaly Malagasy | tdx | tdx | 3.81 |
|
| 556 |
+
| 547 | Tangale | tan | tan | 10.14 |
|
| 557 |
+
| 548 | Tanosy Malagasy | txy | txy | 12.07 |
|
| 558 |
+
| 549 | Tarok | yer | yer | 10.08 |
|
| 559 |
+
| 550 | Tatar | tt | tat | 30.03 |
|
| 560 |
+
| 551 | Tedaga | tuq | tuq | 10.0 |
|
| 561 |
+
| 552 | Telugu | te | tel | 230.21 |
|
| 562 |
+
| 553 | Tem | kdh | kdh | 4.07 |
|
| 563 |
+
| 554 | Teop | tio | tio | 9.85 |
|
| 564 |
+
| 555 | Tepeuxila Cuicatec | cux | cux | 7.83 |
|
| 565 |
+
| 556 | Tepinapa Chinantec | cte | cte | 9.54 |
|
| 566 |
+
| 557 | Tera | ttr | ttr | 9.89 |
|
| 567 |
+
| 558 | Terei | buo | buo | 9.48 |
|
| 568 |
+
| 559 | Termanu | twu | twu | 11.45 |
|
| 569 |
+
| 560 | Tesaka Malagasy | tkg | tkg | 17.86 |
|
| 570 |
+
| 561 | Tetelcingo Nahuatl | nhg | nhg | 8.92 |
|
| 571 |
+
| 562 | Teutila Cuicatec | cut | cut | 8.04 |
|
| 572 |
+
| 563 | Thai | th | tha | 10499.77 |
|
| 573 |
+
| 564 | Tibetan | bo | bod | 82.27 |
|
| 574 |
+
| 565 | Tidaá Mixtec | mtx | mtx | 9.09 |
|
| 575 |
+
| 566 | Tidore | tvo | tvo | 10.31 |
|
| 576 |
+
| 567 | Tigak | tgc | tgc | 9.71 |
|
| 577 |
+
| 568 | Tigre | tig | tig | 7.49 |
|
| 578 |
+
| 569 | Tigrinya | ti | tir | 0.08 |
|
| 579 |
+
| 570 | Tilquiapan Zapotec | zts | zts | 9.33 |
|
| 580 |
+
| 571 | Tinputz | tpz | tpz | 9.33 |
|
| 581 |
+
| 572 | Tlacoapa Me'phaa | tpl | tpl | 9.28 |
|
| 582 |
+
| 573 | Tlacoatzintepec Chinantec | ctl | ctl | 10.04 |
|
| 583 |
+
| 574 | Tlingit | tli | tli | 0.41 |
|
| 584 |
+
| 575 | Toki Pona | tok | tok | 13.51 |
|
| 585 |
+
| 576 | Tomoip | tqp | tqp | 10.1 |
|
| 586 |
+
| 577 | Tondano | tdn | tdn | 9.14 |
|
| 587 |
+
| 578 | Tonsea | txs | txs | 9.32 |
|
| 588 |
+
| 579 | Tooro | ttj | ttj | 10.31 |
|
| 589 |
+
| 580 | Torau | ttu | ttu | 9.87 |
|
| 590 |
+
| 581 | Torwali | trw | trw | 14.98 |
|
| 591 |
+
| 582 | Tsimihety Malagasy | xmw | xmw | 11.53 |
|
| 592 |
+
| 583 | Tsotso | lto | lto | 9.77 |
|
| 593 |
+
| 584 | Tswana | tn | tsn | 4.24 |
|
| 594 |
+
| 585 | Tugen | tuy | tuy | 8.79 |
|
| 595 |
+
| 586 | Tuki | bag | bag | 10.97 |
|
| 596 |
+
| 587 | Tula | tul | tul | 9.79 |
|
| 597 |
+
| 588 | Tulu | tcy | tcy | 11.72 |
|
| 598 |
+
| 589 | Tunen | tvu | tvu | 9.85 |
|
| 599 |
+
| 590 | Tungag | lcm | lcm | 9.77 |
|
| 600 |
+
| 591 | Tunisian Arabic | aeb | aeb | 21.63 |
|
| 601 |
+
| 592 | Tupuri | tui | tui | 9.26 |
|
| 602 |
+
| 593 | Turkana | tuv | tuv | 10.17 |
|
| 603 |
+
| 594 | Turkish | tr | tur | 125.36 |
|
| 604 |
+
| 595 | Turkmen | tk | tuk | 2.86 |
|
| 605 |
+
| 596 | Tututepec Mixtec | mtu | mtu | 10.13 |
|
| 606 |
+
| 597 | Twi | tw | twi | 0.25 |
|
| 607 |
+
| 598 | Ubaghara | byc | byc | 11.11 |
|
| 608 |
+
| 599 | Uighur | ug | uig | 428.77 |
|
| 609 |
+
| 600 | Ukrainian | uk | ukr | 1851.97 |
|
| 610 |
+
| 601 | Umbundu | umb | umb | 10.59 |
|
| 611 |
+
| 602 | Upper Sorbian | hsb | hsb | 2.71 |
|
| 612 |
+
| 603 | Urdu | ur | urd | 211.27 |
|
| 613 |
+
| 604 | Ushojo | ush | ush | 6.36 |
|
| 614 |
+
| 605 | Uzbek | uz | uzb | 115.28 |
|
| 615 |
+
| 606 | Vai | vai | vai | 8.76 |
|
| 616 |
+
| 607 | Vietnamese | vi | vie | 8481.98 |
|
| 617 |
+
| 608 | Votic | vot | vot | 0.1 |
|
| 618 |
+
| 609 | Võro | vro | vro | 15.66 |
|
| 619 |
+
| 610 | Waci Gbe | wci | wci | 8.02 |
|
| 620 |
+
| 611 | Wadiyara Koli | kxp | kxp | 20.0 |
|
| 621 |
+
| 612 | Waja | wja | wja | 10.22 |
|
| 622 |
+
| 613 | Wakhi | wbl | wbl | 11.67 |
|
| 623 |
+
| 614 | Wanga | lwg | lwg | 9.36 |
|
| 624 |
+
| 615 | Wapan | juk | juk | 10.22 |
|
| 625 |
+
| 616 | Warji | wji | wji | 11.39 |
|
| 626 |
+
| 617 | Welsh | cy | cym | 131.21 |
|
| 627 |
+
| 618 | Wemale | weo | weo | 9.09 |
|
| 628 |
+
| 619 | Western Frisian | fy | fry | 70.41 |
|
| 629 |
+
| 620 | Western Highland Purepecha | pua | pua | 10.17 |
|
| 630 |
+
| 621 | Western Juxtlahuaca Mixtec | jmx | jmx | 10.01 |
|
| 631 |
+
| 622 | Western Maninkakan | mlq | mlq | 9.83 |
|
| 632 |
+
| 623 | Western Mari | mrj | mrj | 32.26 |
|
| 633 |
+
| 624 | Western Niger Fulfulde | fuh | fuh | 9.69 |
|
| 634 |
+
| 625 | Western Panjabi | pnb | pnb | 10.0 |
|
| 635 |
+
| 626 | Wolof | wo | wol | 8.71 |
|
| 636 |
+
| 627 | Wuzlam | udl | udl | 9.23 |
|
| 637 |
+
| 628 | Xanaguía Zapotec | ztg | ztg | 9.86 |
|
| 638 |
+
| 629 | Xhosa | xh | xho | 13.35 |
|
| 639 |
+
| 630 | Yace | ekr | ekr | 10.76 |
|
| 640 |
+
| 631 | Yakut | sah | sah | 16.08 |
|
| 641 |
+
| 632 | Yalahatan | jal | jal | 11.18 |
|
| 642 |
+
| 633 | Yanahuanca Pasco Quechua | qur | qur | 9.95 |
|
| 643 |
+
| 634 | Yangben | yav | yav | 8.7 |
|
| 644 |
+
| 635 | Yaqui | yaq | yaq | 9.93 |
|
| 645 |
+
| 636 | Yauyos Quechua | qux | qux | 9.35 |
|
| 646 |
+
| 637 | Yekhee | ets | ets | 10.11 |
|
| 647 |
+
| 638 | Yiddish | yi | yid | 1.81 |
|
| 648 |
+
| 639 | Yidgha | ydg | ydg | 9.89 |
|
| 649 |
+
| 640 | Yoruba | yo | yor | 15.66 |
|
| 650 |
+
| 641 | Yutanduchi Mixtec | mab | mab | 9.26 |
|
| 651 |
+
| 642 | Zacatlán-Ahuacatlán-Tepetzintla Nahuatl | nhi | nhi | 0.05 |
|
| 652 |
+
| 643 | Zarma | dje | dje | 10.72 |
|
| 653 |
+
| 644 | Zaza | zza | zza | 1.52 |
|
| 654 |
+
| 645 | Zulu | zu | zul | 14.83 |
|
| 655 |
+
| 646 | Ömie | aom | aom | 8.19 |
|
| 656 |
+
|
| 657 |
+
*646 languages, 581k hours total.*
|
| 658 |
+
|
| 659 |
+
Data source: [docs/lang_id_name_map.tsv](lang_id_name_map.tsv)
|
docs/tips.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Tips & Notes
|
| 2 |
+
|
| 3 |
+
- **Combination of `ref_audio` and `instruct`**:
|
| 4 |
+
When both `ref_audio` and `instruct` are provided and they **conflict**, the model will most likely follow the style of the reference audio. When the two are **consistent**, `instruct` can improve cloning stability for the attributes it describes. A typical example is **Chinese dialect cloning**: provide both dialect reference audio and a matching dialect instruct (e.g., `ref_audio="sichuan.wav", instruct="四川话"`) for more stable dialect output.
|
| 5 |
+
|
| 6 |
+
- **Short Audio Generation**:
|
| 7 |
+
The model may not reliably generate short audio clips (e.g., 1–2 seconds) without reference audio. If you need to generate short clips, provide reference audio to the model.
|
| 8 |
+
|
| 9 |
+
- **Min Nan Chinese (Hokkien) Input Format**:
|
| 10 |
+
Min Nan Chinese (闽南语, also known as Hokkien) can only be synthesized using [Tai-lo romanization](https://en.wikipedia.org/wiki/T%C3%A2i-l%C3%B4) as input; Chinese characters are not supported for Min Nan Chinese in the current model version.
|
docs/training.md
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Training
|
| 2 |
+
|
| 3 |
+
## Training Config
|
| 4 |
+
|
| 5 |
+
All training is controlled by a JSON training config file and a JSON data config file.
|
| 6 |
+
|
| 7 |
+
See [examples/config/](../examples/config/) for ready-to-use configs.
|
| 8 |
+
|
| 9 |
+
Training config file for Emilia is: [examples/config/train_config_emilia.json](../examples/config/train_config_emilia.json)
|
| 10 |
+
|
| 11 |
+
Data config file for Emilia is: [examples/config/data_config_emilia.json](../examples/config/data_config_emilia.json)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
Key fields in training config file:
|
| 15 |
+
|
| 16 |
+
| Field | Description | Default |
|
| 17 |
+
|---|---|---|
|
| 18 |
+
| `llm_name_or_path` | Local LLM path or Hugging Face model ID | Qwen/Qwen3-0.6B |
|
| 19 |
+
| `steps` | Total training steps | 300,000 |
|
| 20 |
+
| `learning_rate` | Peak learning rate | 1e-4 |
|
| 21 |
+
| `batch_tokens` | Tokens per batch on each GPU | 8192 |
|
| 22 |
+
| `attn_implementation` | Attention backend: `"flex_attention"` or `"sdpa"` | `"flex_attention"` |
|
| 23 |
+
|
| 24 |
+
`output_dir` and `data_config` are passed via command line (see below).
|
| 25 |
+
|
| 26 |
+
## Attention Implementation
|
| 27 |
+
|
| 28 |
+
By default, training uses `flex_attention`, which requires PyTorch ≥ 2.5 and a compatible GPU (e.g. NVIDIA Ampere or newer). If your environment does not support `flex_attention`, set `attn_implementation` to `"sdpa"` in your training config. See [examples/config/train_config_finetune_sdpa.json](../examples/config/train_config_finetune_sdpa.json) for a ready-to-use SDPA config:
|
| 29 |
+
|
| 30 |
+
```json
|
| 31 |
+
{
|
| 32 |
+
"attn_implementation": "sdpa",
|
| 33 |
+
"max_sample_tokens": 2000,
|
| 34 |
+
"min_sample_tokens": 50,
|
| 35 |
+
"max_batch_size": 64
|
| 36 |
+
}
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
`"sdpa"` uses PyTorch's built-in scaled dot-product attention and works on a wider range of hardware.
|
| 40 |
+
|
| 41 |
+
The following fields only apply when `attn_implementation != "flex_attention"`:
|
| 42 |
+
|
| 43 |
+
| Field | Description | Default |
|
| 44 |
+
|---|---|---|
|
| 45 |
+
| `max_sample_tokens` | Maximum token length per sample; longer samples are dropped | 2000 |
|
| 46 |
+
| `min_sample_tokens` | Minimum token length per sample; shorter samples are dropped | 50 |
|
| 47 |
+
| `max_batch_size` | Cap on the number of samples per batch | 64 |
|
| 48 |
+
|
| 49 |
+
`batch_tokens` remains the primary control for memory usage — it sets the total token budget per batch. `max_batch_size` is a safety guard to prevent a batch of many short samples from creating an unusually large batch dimension.
|
| 50 |
+
|
| 51 |
+
### Batching strategy
|
| 52 |
+
|
| 53 |
+
The two backends use **different batching strategies**, which are selected automatically:
|
| 54 |
+
|
| 55 |
+
| Backend | Batching strategy | Batch shape | Notes |
|
| 56 |
+
|---|---|---|---|
|
| 57 |
+
| `flex_attention` | Sequence packing | `[1, C, batch_tokens]` | Multiple samples concatenated into one long sequence; document boundaries tracked via `document_ids` |
|
| 58 |
+
| `sdpa` | Length-grouped padding | `[B, C, max_len]` | Samples with similar token lengths are grouped into the same batch and padded to the local maximum length |
|
| 59 |
+
|
| 60 |
+
**Why different strategies?**
|
| 61 |
+
|
| 62 |
+
- With `flex_attention`, sequence packing is memory-efficient because a compact `BlockMask` (not a dense matrix) describes which tokens can attend to each other across document boundaries.
|
| 63 |
+
- With `sdpa`, length-grouped padding is used instead: samples of similar token lengths are batched together and padded to the local maximum, so a lightweight `[B, 1, max_len, max_len]` boolean attention mask suffices with low overhead and minimal wasted padding.
|
| 64 |
+
|
| 65 |
+
## Launching Training
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
accelerate launch \
|
| 69 |
+
--gpu_ids "0,1,2,3,4,5,6,7" \
|
| 70 |
+
--num_processes 8 \
|
| 71 |
+
-m omnivoice.cli.train \
|
| 72 |
+
--train_config config/train_config_emilia.json \
|
| 73 |
+
--data_config config/data_config_emilia.json \
|
| 74 |
+
--output_dir exp/omnivoice_emilia
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
## Resuming Training
|
| 78 |
+
|
| 79 |
+
Set `resume_from_checkpoint` in your training config to resume from an existing checkpoint:
|
| 80 |
+
|
| 81 |
+
```json
|
| 82 |
+
{
|
| 83 |
+
"resume_from_checkpoint": "exp/omnivoice/checkpoint-100000"
|
| 84 |
+
}
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
## Initializing from a Pretrained Model
|
| 88 |
+
|
| 89 |
+
To start training from a pretrained OmniVoice checkpoint (for fine-tuning):
|
| 90 |
+
|
| 91 |
+
```json
|
| 92 |
+
{
|
| 93 |
+
"init_from_checkpoint": "exp/omnivoice/checkpoint-100000"
|
| 94 |
+
}
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
## Monitoring
|
| 98 |
+
|
| 99 |
+
Training progress is logged to TensorBoard. To view it, run:
|
| 100 |
+
```bash
|
| 101 |
+
tensorboard --logdir exp/omnivoice_emilia/tensorboard
|
| 102 |
+
```
|
docs/voice-design.md
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Voice Design
|
| 2 |
+
|
| 3 |
+
Voice Design mode lets you describe the desired speaker through speaker attributes (`instruct` parameter) — no reference audio needed. The model
|
| 4 |
+
generates a matching voice on the fly.
|
| 5 |
+
|
| 6 |
+
## Quick Example
|
| 7 |
+
|
| 8 |
+
```python
|
| 9 |
+
import torch
|
| 10 |
+
from omnivoice import OmniVoice
|
| 11 |
+
|
| 12 |
+
model = OmniVoice.from_pretrained(
|
| 13 |
+
"k2-fsa/OmniVoice",
|
| 14 |
+
device_map="cuda:0",
|
| 15 |
+
dtype=torch.float16
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
audio = model.generate(
|
| 19 |
+
text="This is a test for voice design.",
|
| 20 |
+
instruct="female, young adult, high pitch, british accent",
|
| 21 |
+
)
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
## How It Works
|
| 25 |
+
|
| 26 |
+
The `instruct` parameter accepts a comma-separated string of speaker attributes.
|
| 27 |
+
Each attribute belongs to a **category** (gender, age, pitch, style, accent,
|
| 28 |
+
or dialect). Within a category, only one attribute may be selected at a time.
|
| 29 |
+
Attributes from different categories can be freely combined.
|
| 30 |
+
|
| 31 |
+
The model auto-detects the language of the instruct text and normalises it
|
| 32 |
+
internally — you can write in English, Chinese, or a mix of both.
|
| 33 |
+
|
| 34 |
+
## Supported Attributes
|
| 35 |
+
|
| 36 |
+
### Gender
|
| 37 |
+
|
| 38 |
+
| English | Chinese |
|
| 39 |
+
|---------|---------|
|
| 40 |
+
| male | 男 |
|
| 41 |
+
| female | 女 |
|
| 42 |
+
|
| 43 |
+
### Age
|
| 44 |
+
|
| 45 |
+
| English | Chinese |
|
| 46 |
+
|---------|---------|
|
| 47 |
+
| child | 儿童 |
|
| 48 |
+
| teenager | 少年 |
|
| 49 |
+
| young adult | 青年 |
|
| 50 |
+
| middle-aged | 中年 |
|
| 51 |
+
| elderly | 老年 |
|
| 52 |
+
|
| 53 |
+
### Pitch
|
| 54 |
+
|
| 55 |
+
| English | Chinese |
|
| 56 |
+
|---------|---------|
|
| 57 |
+
| very low pitch | 极低音调 |
|
| 58 |
+
| low pitch | 低音调 |
|
| 59 |
+
| moderate pitch | 中音调 |
|
| 60 |
+
| high pitch | 高音调 |
|
| 61 |
+
| very high pitch | 极高音调 |
|
| 62 |
+
|
| 63 |
+
### Style
|
| 64 |
+
|
| 65 |
+
| English | Chinese |
|
| 66 |
+
|---------|---------|
|
| 67 |
+
| whisper | 耳语 |
|
| 68 |
+
|
| 69 |
+
### English Accent
|
| 70 |
+
|
| 71 |
+
Only effective when the synthesis text is in English.
|
| 72 |
+
|
| 73 |
+
| Accent |
|
| 74 |
+
|--------|
|
| 75 |
+
| american accent |
|
| 76 |
+
| british accent |
|
| 77 |
+
| australian accent |
|
| 78 |
+
| canadian accent |
|
| 79 |
+
| indian accent |
|
| 80 |
+
| chinese accent |
|
| 81 |
+
| korean accent |
|
| 82 |
+
| japanese accent |
|
| 83 |
+
| portuguese accent |
|
| 84 |
+
| russian accent |
|
| 85 |
+
|
| 86 |
+
### Chinese Dialect
|
| 87 |
+
|
| 88 |
+
Only effective when the synthesis text is in Chinese.
|
| 89 |
+
|
| 90 |
+
| Dialect |
|
| 91 |
+
|---------|
|
| 92 |
+
| 河南话 |
|
| 93 |
+
| 陕西话 |
|
| 94 |
+
| 四川话 |
|
| 95 |
+
| 贵州话 |
|
| 96 |
+
| 云南话 |
|
| 97 |
+
| 桂林话 |
|
| 98 |
+
| 济南话 |
|
| 99 |
+
| 石家庄话 |
|
| 100 |
+
| 甘肃话 |
|
| 101 |
+
| 宁夏话 |
|
| 102 |
+
| 青岛话 |
|
| 103 |
+
| 东北话 |
|
| 104 |
+
|
| 105 |
+
## Writing Instruct Strings
|
| 106 |
+
|
| 107 |
+
Separate attributes with commas (half-width `,` for English, full-width `,`
|
| 108 |
+
for Chinese — the model auto-fixes mismatches).
|
| 109 |
+
|
| 110 |
+
```
|
| 111 |
+
# English
|
| 112 |
+
"female, young adult, high pitch, british accent"
|
| 113 |
+
|
| 114 |
+
# Chinese
|
| 115 |
+
"女,青年,高音调,四川话"
|
| 116 |
+
|
| 117 |
+
# Mixed (auto-normalised)
|
| 118 |
+
"female, young adult, 四川话"
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
### Tips
|
| 122 |
+
|
| 123 |
+
- **Combine freely** across categories: `"male, elderly, low pitch, whisper"`.
|
| 124 |
+
- **Leave it to the model**: omit attributes you don't care about — the model
|
| 125 |
+
fills in the rest. For example `"female"` alone is valid.
|
| 126 |
+
- **Case-insensitive**: `"Male"`, `"MALE"`, and `"male"` are all accepted; the code normalizes them to lowercase.
|
| 127 |
+
|
| 128 |
+
- **Accent vs Dialect**: English accents are only applied to English speech; Chinese dialects are only applied to Chinese speech.
|
| 129 |
+
- **Attribute combinations**: Due to training data limitations, some attribute combinations may not work well — the model may ignore certain attributes in a combination. If the output doesn't match your expectation, try simplifying the instruct string.
|
examples/README.md
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OmniVoice Examples
|
| 2 |
+
|
| 3 |
+
This directory contains scripts and configs for training, fine-tuning, and evaluating OmniVoice.
|
| 4 |
+
|
| 5 |
+
| Use Case | Script | Description |
|
| 6 |
+
|---|---|---|
|
| 7 |
+
| Training from scratch | [run_emilia.sh](run_emilia.sh) | Full pipeline on the Emilia dataset (data check, tokenization, training) |
|
| 8 |
+
| Fine-tuning | [run_finetune.sh](run_finetune.sh) | Fine-tune from a pretrained checkpoint using your own JSONL data |
|
| 9 |
+
| Evaluation | [run_eval.sh](run_eval.sh) | Evaluate WER, speaker similarity, and UTMOS on standard test sets |
|
| 10 |
+
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
## Training from Scratch (Emilia)
|
| 14 |
+
|
| 15 |
+
[run_emilia.sh](run_emilia.sh) runs the full pipeline in 3 stages:
|
| 16 |
+
|
| 17 |
+
| Stage | What it does |
|
| 18 |
+
|---|---|
|
| 19 |
+
| 0 | Verify the Emilia dataset and JSONL manifests are in place |
|
| 20 |
+
| 1 | Tokenize audio into WebDataset shards |
|
| 21 |
+
| 2 | Launch multi-GPU training with `accelerate` |
|
| 22 |
+
|
| 23 |
+
**Prerequisites:**
|
| 24 |
+
|
| 25 |
+
1. Download the Emilia dataset from [OpenXLab](https://openxlab.org.cn/datasets/Amphion/Emilia) and place it under `download/`:
|
| 26 |
+
```
|
| 27 |
+
download/Amphion___Emilia
|
| 28 |
+
└── raw
|
| 29 |
+
├── EN
|
| 30 |
+
└── ZH
|
| 31 |
+
```
|
| 32 |
+
2. Obtain JSONL manifests and place them in `data/emilia/manifests/`:
|
| 33 |
+
- `emilia_en_train.jsonl`, `emilia_en_dev.jsonl`
|
| 34 |
+
- `emilia_zh_train.jsonl`, `emilia_zh_dev.jsonl`
|
| 35 |
+
|
| 36 |
+
You can generate them from the raw data, or download pre-processed manifests from [HuggingFace](https://huggingface.co/datasets/zhu-han/Emilia-Manifests).
|
| 37 |
+
|
| 38 |
+
**Run the full pipeline:**
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
bash examples/run_emilia.sh
|
| 42 |
+
```
|
| 43 |
+
|
| 44 |
+
Or run individual stages by setting `stage` and `stop_stage` at the top of the script (e.g. `stage=1`, `stop_stage=1` to only tokenize).
|
| 45 |
+
|
| 46 |
+
> See [docs/training.md](../docs/training.md) for config details, checkpoint resuming, and TensorBoard monitoring.
|
| 47 |
+
|
| 48 |
+
---
|
| 49 |
+
|
| 50 |
+
## Fine-tuning
|
| 51 |
+
|
| 52 |
+
[run_finetune.sh](run_finetune.sh) fine-tunes from a pretrained checkpoint on your own data.
|
| 53 |
+
|
| 54 |
+
### Step 1: Prepare Your Data
|
| 55 |
+
|
| 56 |
+
Create a JSONL manifest where each line describes one audio sample:
|
| 57 |
+
|
| 58 |
+
```jsonl
|
| 59 |
+
{"id": "sample_001", "audio_path": "/data/audio/001.wav", "text": "Hello world", "language_id": "en"}
|
| 60 |
+
{"id": "sample_002", "audio_path": "/data/audio/002.wav", "text": "你好世界", "language_id": "zh"}
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
`id`, `audio_path`, and `text` are mandatory. `language_id` is optional.
|
| 64 |
+
|
| 65 |
+
> See [docs/data_preparation.md](../docs/data_preparation.md) for the full data format specification.
|
| 66 |
+
|
| 67 |
+
### Step 2: Configure the Script
|
| 68 |
+
|
| 69 |
+
Edit the variables at the top of `run_finetune.sh`:
|
| 70 |
+
|
| 71 |
+
```bash
|
| 72 |
+
TRAIN_JSONL="data/my_data_train.jsonl" # path to training JSONL
|
| 73 |
+
DEV_JSONL="data/my_data_dev.jsonl" # path to dev JSONL
|
| 74 |
+
GPU_IDS="0,1" # GPUs to use
|
| 75 |
+
NUM_GPUS=2
|
| 76 |
+
OUTPUT_DIR="exp/omnivoice_finetune" # output directory
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
### Step 3: Run
|
| 80 |
+
|
| 81 |
+
```bash
|
| 82 |
+
bash examples/run_finetune.sh
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
The script will:
|
| 86 |
+
1. Tokenize your audio into WebDataset shards
|
| 87 |
+
2. Launch fine-tuning with `accelerate`
|
| 88 |
+
|
| 89 |
+
The main differences between the fine-tuning config ([config/train_config_finetune.json](config/train_config_finetune.json)) and the Emilia training config ([config/train_config_emilia.json](config/train_config_emilia.json)) are:
|
| 90 |
+
|
| 91 |
+
| Parameter | Emilia (from scratch) | Fine-tune | Why |
|
| 92 |
+
|---|---|---|---|
|
| 93 |
+
| `init_from_checkpoint` | `null` | `"k2-fsa/OmniVoice"` | Load pretrained weights |
|
| 94 |
+
| `steps` | 300,000 | 5,000 | Fewer steps for fine-tuning, can be tuned according to your data/task. |
|
| 95 |
+
| `learning_rate` | 1e-4 | 1e-5 | Lower LR for fine-tuning, can be tuned according to your data/task |
|
| 96 |
+
|
| 97 |
+
To use a different pretrained checkpoint, modify `init_from_checkpoint` in the config file.
|
| 98 |
+
|
| 99 |
+
If you encounter issues with `flex_attention` on your GPU, use [config/train_config_finetune_sdpa.json](config/train_config_finetune_sdpa.json) instead, which uses SDPA attention for broader compatibility. See [docs/training.md](../docs/training.md#attention-implementation) for details.
|
| 100 |
+
|
| 101 |
+
---
|
| 102 |
+
|
| 103 |
+
## Evaluation
|
| 104 |
+
|
| 105 |
+
Install evaluation dependencies first:
|
| 106 |
+
|
| 107 |
+
```bash
|
| 108 |
+
pip install omnivoice[eval]
|
| 109 |
+
# or
|
| 110 |
+
uv sync --extra eval
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
Supported test sets: `librispeech_pc`, `seedtts_en`, `seedtts_zh`, `fleurs`, `minimax`.
|
| 114 |
+
|
| 115 |
+
```bash
|
| 116 |
+
bash examples/run_eval.sh
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
> See [docs/evaluation.md](../docs/evaluation.md) for metrics details, test set preparation, and running individual metrics.
|
| 120 |
+
|
examples/config/data_config_emilia.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"train":
|
| 3 |
+
[
|
| 4 |
+
{
|
| 5 |
+
"language_id": "en",
|
| 6 |
+
"manifest_path": [
|
| 7 |
+
"data/emilia/tokens/emilia_en_train/data.lst"
|
| 8 |
+
],
|
| 9 |
+
"repeat": 1
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"language_id": "zh",
|
| 13 |
+
"manifest_path": [
|
| 14 |
+
"data/emilia/tokens/emilia_zh_train/data.lst"
|
| 15 |
+
],
|
| 16 |
+
"repeat": 1
|
| 17 |
+
}
|
| 18 |
+
],
|
| 19 |
+
"dev":
|
| 20 |
+
[
|
| 21 |
+
{
|
| 22 |
+
"language_id": "en",
|
| 23 |
+
"manifest_path": [
|
| 24 |
+
"data/emilia/tokens/emilia_en_dev/data.lst"
|
| 25 |
+
],
|
| 26 |
+
"repeat": 1
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"language_id": "zh",
|
| 30 |
+
"manifest_path": [
|
| 31 |
+
"data/emilia/tokens/emilia_zh_dev/data.lst"
|
| 32 |
+
],
|
| 33 |
+
"repeat": 1
|
| 34 |
+
}
|
| 35 |
+
]
|
| 36 |
+
}
|
examples/config/data_config_finetune.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"train": [
|
| 3 |
+
{
|
| 4 |
+
"manifest_path": ["data/finetune/tokens/train/data.lst"]
|
| 5 |
+
}
|
| 6 |
+
],
|
| 7 |
+
"dev": [
|
| 8 |
+
{
|
| 9 |
+
"manifest_path": ["data/finetune/tokens/dev/data.lst"]
|
| 10 |
+
}
|
| 11 |
+
]
|
| 12 |
+
}
|
examples/config/ds_config_zero2.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"steps_per_print": 100,
|
| 3 |
+
"zero_optimization": {
|
| 4 |
+
"stage": 2,
|
| 5 |
+
"allgather_partitions": true,
|
| 6 |
+
"allgather_bucket_size": 2e8,
|
| 7 |
+
"overlap_comm": true,
|
| 8 |
+
"reduce_scatter": true,
|
| 9 |
+
"reduce_bucket_size": 2e8,
|
| 10 |
+
"contiguous_gradients": true
|
| 11 |
+
},
|
| 12 |
+
"gradient_accumulation_steps": "auto",
|
| 13 |
+
"gradient_clipping": "auto",
|
| 14 |
+
"train_batch_size": "auto",
|
| 15 |
+
"train_micro_batch_size_per_gpu": "auto",
|
| 16 |
+
"bf16": {
|
| 17 |
+
"enabled": "auto"
|
| 18 |
+
}
|
| 19 |
+
}
|
examples/config/train_config_emilia.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"llm_name_or_path": "Qwen/Qwen3-0.6B",
|
| 3 |
+
"audio_vocab_size": 1025,
|
| 4 |
+
"audio_mask_id": 1024,
|
| 5 |
+
"num_audio_codebook": 8,
|
| 6 |
+
|
| 7 |
+
"audio_codebook_weights": [8, 8, 6, 6, 4, 4, 2, 2],
|
| 8 |
+
"drop_cond_ratio": 0.1,
|
| 9 |
+
"prompt_ratio_range": [0.0, 0.3],
|
| 10 |
+
"mask_ratio_range": [0.0, 1.0],
|
| 11 |
+
"language_ratio": 0.0,
|
| 12 |
+
"use_pinyin_ratio": 0.0,
|
| 13 |
+
"instruct_ratio": 0.0,
|
| 14 |
+
"only_instruct_ratio": 0.0,
|
| 15 |
+
|
| 16 |
+
"resume_from_checkpoint": null,
|
| 17 |
+
"init_from_checkpoint": null,
|
| 18 |
+
|
| 19 |
+
"learning_rate": 1e-4,
|
| 20 |
+
"weight_decay": 0.01,
|
| 21 |
+
"max_grad_norm": 1.0,
|
| 22 |
+
"steps": 300000,
|
| 23 |
+
"seed": 42,
|
| 24 |
+
"warmup_type": "ratio",
|
| 25 |
+
"warmup_ratio": 0.03,
|
| 26 |
+
"warmup_steps": 0,
|
| 27 |
+
|
| 28 |
+
"batch_tokens": 8192,
|
| 29 |
+
"gradient_accumulation_steps": 1,
|
| 30 |
+
"num_workers": 4,
|
| 31 |
+
|
| 32 |
+
"mixed_precision": "bf16",
|
| 33 |
+
"allow_tf32": true,
|
| 34 |
+
|
| 35 |
+
"logging_steps": 100,
|
| 36 |
+
"eval_steps": 1000,
|
| 37 |
+
"save_steps": 10000,
|
| 38 |
+
"keep_last_n_checkpoints": -1
|
| 39 |
+
}
|
examples/config/train_config_finetune.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"llm_name_or_path": "Qwen/Qwen3-0.6B",
|
| 3 |
+
"audio_vocab_size": 1025,
|
| 4 |
+
"audio_mask_id": 1024,
|
| 5 |
+
"num_audio_codebook": 8,
|
| 6 |
+
|
| 7 |
+
"audio_codebook_weights": [8, 8, 6, 6, 4, 4, 2, 2],
|
| 8 |
+
"drop_cond_ratio": 0.1,
|
| 9 |
+
"prompt_ratio_range": [0.0, 0.3],
|
| 10 |
+
"mask_ratio_range": [0.0, 1.0],
|
| 11 |
+
"language_ratio": 0.8,
|
| 12 |
+
"use_pinyin_ratio": 0.0,
|
| 13 |
+
"instruct_ratio": 0.0,
|
| 14 |
+
"only_instruct_ratio": 0.0,
|
| 15 |
+
|
| 16 |
+
"resume_from_checkpoint": null,
|
| 17 |
+
"init_from_checkpoint": "k2-fsa/OmniVoice",
|
| 18 |
+
|
| 19 |
+
"learning_rate": 1e-5,
|
| 20 |
+
"weight_decay": 0.01,
|
| 21 |
+
"max_grad_norm": 1.0,
|
| 22 |
+
"steps": 5000,
|
| 23 |
+
"seed": 42,
|
| 24 |
+
"warmup_type": "ratio",
|
| 25 |
+
"warmup_ratio": 0.01,
|
| 26 |
+
"warmup_steps": 0,
|
| 27 |
+
|
| 28 |
+
"batch_tokens": 8192,
|
| 29 |
+
"gradient_accumulation_steps": 1,
|
| 30 |
+
"num_workers": 2,
|
| 31 |
+
|
| 32 |
+
"mixed_precision": "bf16",
|
| 33 |
+
"allow_tf32": true,
|
| 34 |
+
|
| 35 |
+
"logging_steps": 50,
|
| 36 |
+
"eval_steps": 500,
|
| 37 |
+
"save_steps": 500,
|
| 38 |
+
"keep_last_n_checkpoints": -1
|
| 39 |
+
}
|
examples/config/train_config_finetune_sdpa.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"llm_name_or_path": "Qwen/Qwen3-0.6B",
|
| 3 |
+
"audio_vocab_size": 1025,
|
| 4 |
+
"audio_mask_id": 1024,
|
| 5 |
+
"num_audio_codebook": 8,
|
| 6 |
+
|
| 7 |
+
"audio_codebook_weights": [8, 8, 6, 6, 4, 4, 2, 2],
|
| 8 |
+
"drop_cond_ratio": 0.1,
|
| 9 |
+
"prompt_ratio_range": [0.0, 0.3],
|
| 10 |
+
"mask_ratio_range": [0.0, 1.0],
|
| 11 |
+
"language_ratio": 0.8,
|
| 12 |
+
"use_pinyin_ratio": 0.0,
|
| 13 |
+
"instruct_ratio": 0.0,
|
| 14 |
+
"only_instruct_ratio": 0.0,
|
| 15 |
+
|
| 16 |
+
"resume_from_checkpoint": null,
|
| 17 |
+
"init_from_checkpoint": "k2-fsa/OmniVoice",
|
| 18 |
+
|
| 19 |
+
"learning_rate": 1e-5,
|
| 20 |
+
"weight_decay": 0.01,
|
| 21 |
+
"max_grad_norm": 1.0,
|
| 22 |
+
"steps": 5000,
|
| 23 |
+
"seed": 42,
|
| 24 |
+
"warmup_type": "ratio",
|
| 25 |
+
"warmup_ratio": 0.01,
|
| 26 |
+
"warmup_steps": 0,
|
| 27 |
+
|
| 28 |
+
"batch_tokens": 8192,
|
| 29 |
+
"gradient_accumulation_steps": 1,
|
| 30 |
+
"num_workers": 2,
|
| 31 |
+
|
| 32 |
+
"mixed_precision": "bf16",
|
| 33 |
+
"allow_tf32": true,
|
| 34 |
+
"attn_implementation": "sdpa",
|
| 35 |
+
"max_sample_tokens": 2000,
|
| 36 |
+
"min_sample_tokens": 50,
|
| 37 |
+
"max_batch_size": 64,
|
| 38 |
+
|
| 39 |
+
"logging_steps": 50,
|
| 40 |
+
"eval_steps": 500,
|
| 41 |
+
"save_steps": 500,
|
| 42 |
+
"keep_last_n_checkpoints": -1
|
| 43 |
+
}
|
examples/config/train_config_multilingual.json
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"llm_name_or_path": "Qwen/Qwen3-0.6B",
|
| 3 |
+
"audio_vocab_size": 1025,
|
| 4 |
+
"audio_mask_id": 1024,
|
| 5 |
+
"num_audio_codebook": 8,
|
| 6 |
+
|
| 7 |
+
"audio_codebook_weights": [8, 8, 6, 6, 4, 4, 2, 2],
|
| 8 |
+
"drop_cond_ratio": 0.1,
|
| 9 |
+
"prompt_ratio_range": [0.0, 0.3],
|
| 10 |
+
"mask_ratio_range": [0.0, 1.0],
|
| 11 |
+
"language_ratio": 0.8,
|
| 12 |
+
"use_pinyin_ratio": 0.3,
|
| 13 |
+
"instruct_ratio": 1.0,
|
| 14 |
+
"only_instruct_ratio": 0.5,
|
| 15 |
+
|
| 16 |
+
"resume_from_checkpoint": null,
|
| 17 |
+
"init_from_checkpoint": null,
|
| 18 |
+
|
| 19 |
+
"learning_rate": 1e-4,
|
| 20 |
+
"weight_decay": 0.01,
|
| 21 |
+
"max_grad_norm": 1.0,
|
| 22 |
+
"steps": 2000000,
|
| 23 |
+
"seed": 42,
|
| 24 |
+
"warmup_type": "ratio",
|
| 25 |
+
"warmup_ratio": 0.03,
|
| 26 |
+
"warmup_steps": 0,
|
| 27 |
+
|
| 28 |
+
"batch_tokens": 8192,
|
| 29 |
+
"gradient_accumulation_steps": 1,
|
| 30 |
+
"num_workers": 4,
|
| 31 |
+
|
| 32 |
+
"mixed_precision": "bf16",
|
| 33 |
+
"allow_tf32": true,
|
| 34 |
+
|
| 35 |
+
"logging_steps": 100,
|
| 36 |
+
"eval_steps": 1000,
|
| 37 |
+
"save_steps": 10000,
|
| 38 |
+
"keep_last_n_checkpoints": -1
|
| 39 |
+
}
|
examples/run_emilia.sh
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# This script demonstrates how to run the full training pipeline on the Emilia dataset.
|
| 4 |
+
|
| 5 |
+
set -euo pipefail
|
| 6 |
+
|
| 7 |
+
stage=0
|
| 8 |
+
stop_stage=2
|
| 9 |
+
|
| 10 |
+
# ====== Modify as needed ======
|
| 11 |
+
# GPUs to use
|
| 12 |
+
GPU_IDS="0,1,2,3,4,5,6,7"
|
| 13 |
+
NUM_GPUS=8
|
| 14 |
+
|
| 15 |
+
# Download directory for raw Emilia data
|
| 16 |
+
dl_dir="download"
|
| 17 |
+
|
| 18 |
+
# Directory containing JSONL manifests for train/dev splits
|
| 19 |
+
# Stage 0 will check for the presence of the following files:
|
| 20 |
+
# data/emilia/manifests/emilia_en_train.jsonl
|
| 21 |
+
# data/emilia/manifests/emilia_en_dev.jsonl
|
| 22 |
+
# data/emilia/manifests/emilia_zh_train.jsonl
|
| 23 |
+
# data/emilia/manifests/emilia_zh_dev.jsonl
|
| 24 |
+
MANIFEST_DIR="data/emilia/manifests"
|
| 25 |
+
|
| 26 |
+
# Directory to write tokenized WebDataset shards
|
| 27 |
+
TOKEN_DIR="data/emilia/tokens"
|
| 28 |
+
|
| 29 |
+
# Audio tokenizer model (HuggingFace repo or local path)
|
| 30 |
+
TOKENIZER_PATH="eustlb/higgs-audio-v2-tokenizer"
|
| 31 |
+
|
| 32 |
+
# Training config file
|
| 33 |
+
TRAIN_CONFIG="config/train_config_emilia.json"
|
| 34 |
+
|
| 35 |
+
# Data config file
|
| 36 |
+
data_config="config/data_config_emilia.json"
|
| 37 |
+
|
| 38 |
+
# Output directory for checkpoints
|
| 39 |
+
OUTPUT_DIR="exp/omnivoice_emilia"
|
| 40 |
+
# =================================
|
| 41 |
+
|
| 42 |
+
export PYTHONPATH="$(cd "$(dirname "$0")/.." && pwd):${PYTHONPATH:-}"
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# Stage 0: Verify that the manually downloaded data and the JSONL manifests are in place
|
| 46 |
+
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
| 47 |
+
echo "Stage 0: Download data"
|
| 48 |
+
|
| 49 |
+
# You should manually download the Emilia dataset from
|
| 50 |
+
# https://openxlab.org.cn/datasets/Amphion/Emilia
|
| 51 |
+
# or https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07
|
| 52 |
+
# and place it in the download directory.
|
| 53 |
+
# Your download directory should at least contain the following structure:
|
| 54 |
+
#
|
| 55 |
+
# download/Amphion___Emilia
|
| 56 |
+
# ├── raw
|
| 57 |
+
# │ ├── EN
|
| 58 |
+
# │ └── ZH
|
| 59 |
+
|
| 60 |
+
if [ ! -d "$dl_dir"/Amphion___Emilia/raw ]; then
|
| 61 |
+
echo "Please refer https://openxlab.org.cn/datasets/Amphion/Emilia to download the dataset."
|
| 62 |
+
exit 1
|
| 63 |
+
fi
|
| 64 |
+
|
| 65 |
+
# We require JSONL manifests for the training and dev splits. You can
|
| 66 |
+
# either generate them yourself using the raw data and the provided
|
| 67 |
+
# metadata, or download our processed JSONL manifests from HuggingFace.
|
| 68 |
+
# https://huggingface.co/datasets/zhu-han/Emilia-Manifests
|
| 69 |
+
#
|
| 70 |
+
# Place them as data/emilia/manifests/{emilia_en_train,emilia_en_dev,emilia_zh_train,emilia_zh_dev}.jsonl
|
| 71 |
+
|
| 72 |
+
for split in emilia_en_dev emilia_zh_dev emilia_en_train emilia_zh_train; do
|
| 73 |
+
if [ ! -f "${MANIFEST_DIR}/${split}.jsonl" ]; then
|
| 74 |
+
echo "Please download the manifest for ${split} and place it in ${MANIFEST_DIR}/${split}.jsonl"
|
| 75 |
+
exit 1
|
| 76 |
+
fi
|
| 77 |
+
done
|
| 78 |
+
|
| 79 |
+
echo " Done. All manifests and data are in place."
|
| 80 |
+
fi
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# Stage 1: Tokenize splits into directories matching data_config_emilia.json
|
| 84 |
+
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
| 85 |
+
echo "Stage 1: Tokenizing audio"
|
| 86 |
+
|
| 87 |
+
for split in emilia_en_dev emilia_zh_dev emilia_en_train emilia_zh_train; do
|
| 88 |
+
echo " Tokenizing ${split} from ${MANIFEST_DIR}/${split}.jsonl"
|
| 89 |
+
|
| 90 |
+
CUDA_VISIBLE_DEVICES=${GPU_IDS} \
|
| 91 |
+
python -m omnivoice.scripts.extract_audio_tokens \
|
| 92 |
+
--input_jsonl "${MANIFEST_DIR}/${split}.jsonl" \
|
| 93 |
+
--tar_output_pattern "${TOKEN_DIR}/${split}/audios/shard-%06d.tar" \
|
| 94 |
+
--jsonl_output_pattern "${TOKEN_DIR}/${split}/txts/shard-%06d.jsonl" \
|
| 95 |
+
--tokenizer_path "${TOKENIZER_PATH}" \
|
| 96 |
+
--nj_per_gpu 3 \
|
| 97 |
+
--shuffle True
|
| 98 |
+
|
| 99 |
+
echo " Done. Tokens written to ${TOKEN_DIR}/${split}"
|
| 100 |
+
done
|
| 101 |
+
fi
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# Stage 2: Train
|
| 105 |
+
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
| 106 |
+
echo "Stage 2: Training"
|
| 107 |
+
|
| 108 |
+
accelerate launch \
|
| 109 |
+
--gpu_ids "${GPU_IDS}" \
|
| 110 |
+
--num_processes ${NUM_GPUS} \
|
| 111 |
+
-m omnivoice.cli.train \
|
| 112 |
+
--train_config ${TRAIN_CONFIG} \
|
| 113 |
+
--data_config ${data_config} \
|
| 114 |
+
--output_dir ${OUTPUT_DIR}
|
| 115 |
+
fi
|
examples/run_eval.sh
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# Evaluate OmniVoice models on TTS benchmarks.
|
| 4 |
+
|
| 5 |
+
# Stage 1: Download the test sets and evaluation models.
|
| 6 |
+
# Stage 2: LibriSpeech-PC
|
| 7 |
+
# Stage 3: seedtts_en
|
| 8 |
+
# Stage 4: seedtts_zh
|
| 9 |
+
# Stage 5: minimax
|
| 10 |
+
# Stage 6: fleurs
|
| 11 |
+
|
| 12 |
+
set -euo pipefail
|
| 13 |
+
|
| 14 |
+
# Specify the stages to run by setting the `stage` and `stop_stage` variables.
|
| 15 |
+
stage=1
|
| 16 |
+
stop_stage=6
|
| 17 |
+
|
| 18 |
+
# Available GPUs for evaluation. Adjust this according to your setup.
|
| 19 |
+
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
|
| 20 |
+
|
| 21 |
+
# Specify the checkpoint to evaluate.
|
| 22 |
+
CHECKPOINT=k2-fsa/OmniVoice
|
| 23 |
+
emilia_checkpoint=false
|
| 24 |
+
|
| 25 |
+
# CHECKPOINT=k2-fsa/OmniVoice
|
| 26 |
+
# emilia_checkpoint=true
|
| 27 |
+
|
| 28 |
+
# For the OmniVoice-Emilia checkpoint, we set denoise to False and lang_id to None
|
| 29 |
+
# because the model was trained without prompt denoising or language IDs.
|
| 30 |
+
|
| 31 |
+
if [ "${emilia_checkpoint}" = true ]; then
|
| 32 |
+
infer_options="--preprocess_prompt False \
|
| 33 |
+
--postprocess_output False \
|
| 34 |
+
--batch_duration 600 \
|
| 35 |
+
--denoise False \
|
| 36 |
+
--lang_id None \
|
| 37 |
+
--audio_chunk_threshold 1000"
|
| 38 |
+
else
|
| 39 |
+
infer_options="--preprocess_prompt False \
|
| 40 |
+
--postprocess_output False \
|
| 41 |
+
--batch_duration 600 \
|
| 42 |
+
--audio_chunk_threshold 1000"
|
| 43 |
+
fi
|
| 44 |
+
|
| 45 |
+
export PYTHONPATH="$(cd "$(dirname "$0")/.." && pwd):${PYTHONPATH:-}"
|
| 46 |
+
|
| 47 |
+
download_dir="download"
|
| 48 |
+
TTS_EVAL_MODEL_DIR="${download_dir}/tts_eval_models/"
|
| 49 |
+
TTS_EVAL_DATA_DIR="${download_dir}/tts_eval_datasets/"
|
| 50 |
+
|
| 51 |
+
# Map test_name to its test.jsonl path.
|
| 52 |
+
get_test_list() {
|
| 53 |
+
case "$1" in
|
| 54 |
+
librispeech_pc) echo "${TTS_EVAL_DATA_DIR}/librispeech_pc_test_clean.jsonl" ;;
|
| 55 |
+
seedtts_en) echo "${TTS_EVAL_DATA_DIR}/seedtts_test_en.jsonl" ;;
|
| 56 |
+
seedtts_zh) echo "${TTS_EVAL_DATA_DIR}/seedtts_test_zh.jsonl" ;;
|
| 57 |
+
minimax) echo "${TTS_EVAL_DATA_DIR}/minimax_multilingual_24.jsonl" ;;
|
| 58 |
+
fleurs) echo "${TTS_EVAL_DATA_DIR}/fleurs_multilingual_102.jsonl" ;;
|
| 59 |
+
*) echo ""; return 1 ;;
|
| 60 |
+
esac
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
# ============================================================
|
| 64 |
+
# Stage 1: Prepare the test sets and evaluation models
|
| 65 |
+
# ============================================================
|
| 66 |
+
|
| 67 |
+
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
| 68 |
+
echo "Stage 1: Download test sets and evaluation models"
|
| 69 |
+
|
| 70 |
+
hf_repo=k2-fsa/TTS_eval_datasets
|
| 71 |
+
mkdir -p ${TTS_EVAL_DATA_DIR}/
|
| 72 |
+
for file in \
|
| 73 |
+
librispeech_pc_test_clean.jsonl \
|
| 74 |
+
librispeech_pc_test_clean_transcript.jsonl \
|
| 75 |
+
seedtts_test_en.jsonl \
|
| 76 |
+
seedtts_test_zh.jsonl \
|
| 77 |
+
minimax_multilingual_24.jsonl \
|
| 78 |
+
fleurs_multilingual_102.jsonl; do
|
| 79 |
+
echo "Downloading ${file}..."
|
| 80 |
+
huggingface-cli download \
|
| 81 |
+
--repo-type dataset \
|
| 82 |
+
--local-dir ${TTS_EVAL_DATA_DIR}/ \
|
| 83 |
+
${hf_repo} \
|
| 84 |
+
${file}
|
| 85 |
+
done
|
| 86 |
+
|
| 87 |
+
for file in \
|
| 88 |
+
librispeech_pc_testset.tar.gz \
|
| 89 |
+
seedtts_testset.tar.gz \
|
| 90 |
+
minimax_multilingual_24.tar.gz \
|
| 91 |
+
fleurs_multilingual_102.tar.gz; do
|
| 92 |
+
echo "Downloading ${file}..."
|
| 93 |
+
huggingface-cli download \
|
| 94 |
+
--repo-type dataset \
|
| 95 |
+
--local-dir ${TTS_EVAL_DATA_DIR}/ \
|
| 96 |
+
${hf_repo} \
|
| 97 |
+
${file}
|
| 98 |
+
|
| 99 |
+
echo "Extracting ${file}..."
|
| 100 |
+
tar -xzf ${TTS_EVAL_DATA_DIR}/${file} -C ${TTS_EVAL_DATA_DIR}/
|
| 101 |
+
done
|
| 102 |
+
|
| 103 |
+
echo "Download all evaluation models"
|
| 104 |
+
hf_repo=k2-fsa/TTS_eval_models
|
| 105 |
+
mkdir -p ${TTS_EVAL_MODEL_DIR}
|
| 106 |
+
huggingface-cli download \
|
| 107 |
+
--local-dir ${TTS_EVAL_MODEL_DIR} \
|
| 108 |
+
${hf_repo}
|
| 109 |
+
fi
|
| 110 |
+
|
| 111 |
+
# ============================================================
|
| 112 |
+
# Stage 2: Evaluation on LibriSpeech-PC
|
| 113 |
+
# ============================================================
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
| 117 |
+
echo "Stage 2: Evaluation on LibriSpeech-PC"
|
| 118 |
+
wav_path="results/librispeech_pc"
|
| 119 |
+
test_jsonl="$(get_test_list librispeech_pc)"
|
| 120 |
+
transcript_jsonl="${TTS_EVAL_DATA_DIR}/librispeech_pc_test_clean_transcript.jsonl"
|
| 121 |
+
|
| 122 |
+
python -m omnivoice.cli.infer_batch \
|
| 123 |
+
--model "${CHECKPOINT}" \
|
| 124 |
+
--test_list "${test_jsonl}" \
|
| 125 |
+
--res_dir "${wav_path}" ${infer_options}
|
| 126 |
+
|
| 127 |
+
python -m omnivoice.eval.speaker_similarity.sim \
|
| 128 |
+
--wav-path "${wav_path}" \
|
| 129 |
+
--test-list "${test_jsonl}" \
|
| 130 |
+
--decode-path "${wav_path}.sim.log" \
|
| 131 |
+
--model-dir "${TTS_EVAL_MODEL_DIR}"
|
| 132 |
+
|
| 133 |
+
python -m omnivoice.eval.wer.hubert \
|
| 134 |
+
--wav-path "${wav_path}" \
|
| 135 |
+
--test-list "${transcript_jsonl}" \
|
| 136 |
+
--decode-path "${wav_path}.wer.log" \
|
| 137 |
+
--model-dir "${TTS_EVAL_MODEL_DIR}"
|
| 138 |
+
|
| 139 |
+
python -m omnivoice.eval.mos.utmos \
|
| 140 |
+
--wav-path "${wav_path}" \
|
| 141 |
+
--test-list "${test_jsonl}" \
|
| 142 |
+
--decode-path "${wav_path}.mos.log" \
|
| 143 |
+
--model-dir "${TTS_EVAL_MODEL_DIR}"
|
| 144 |
+
fi
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
# ============================================================
|
| 148 |
+
# Stage 3: Evaluation on Seed-TTS en
|
| 149 |
+
# ============================================================
|
| 150 |
+
|
| 151 |
+
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
| 152 |
+
echo "Stage 3: Evaluation on Seed-TTS en"
|
| 153 |
+
wav_path="results/seedtts_en"
|
| 154 |
+
test_jsonl="$(get_test_list seedtts_en)"
|
| 155 |
+
|
| 156 |
+
python -m omnivoice.cli.infer_batch \
|
| 157 |
+
--model "${CHECKPOINT}" \
|
| 158 |
+
--test_list "${test_jsonl}" \
|
| 159 |
+
--res_dir "${wav_path}" ${infer_options}
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
python -m omnivoice.eval.speaker_similarity.sim \
|
| 163 |
+
--wav-path "${wav_path}" \
|
| 164 |
+
--test-list "${test_jsonl}" \
|
| 165 |
+
--decode-path "${wav_path}.sim.log" \
|
| 166 |
+
--model-dir "${TTS_EVAL_MODEL_DIR}"
|
| 167 |
+
|
| 168 |
+
python -m omnivoice.eval.wer.seedtts \
|
| 169 |
+
--wav-path "${wav_path}" \
|
| 170 |
+
--test-list "${test_jsonl}" \
|
| 171 |
+
--decode-path "${wav_path}.wer.log" \
|
| 172 |
+
--model-dir "${TTS_EVAL_MODEL_DIR}" \
|
| 173 |
+
--lang en
|
| 174 |
+
|
| 175 |
+
python -m omnivoice.eval.mos.utmos \
|
| 176 |
+
--wav-path "${wav_path}" \
|
| 177 |
+
--test-list "${test_jsonl}" \
|
| 178 |
+
--decode-path "${wav_path}.mos.log" \
|
| 179 |
+
--model-dir "${TTS_EVAL_MODEL_DIR}"
|
| 180 |
+
fi
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# ============================================================
|
| 184 |
+
# Stage 4: Evaluation on Seed-TTS zh
|
| 185 |
+
# ============================================================
|
| 186 |
+
|
| 187 |
+
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
|
| 188 |
+
echo "Stage 4: Evaluation on Seed-TTS zh"
|
| 189 |
+
wav_path="results/seedtts_zh"
|
| 190 |
+
test_jsonl="$(get_test_list seedtts_zh)"
|
| 191 |
+
|
| 192 |
+
python -m omnivoice.cli.infer_batch \
|
| 193 |
+
--model "${CHECKPOINT}" \
|
| 194 |
+
--test_list "${test_jsonl}" \
|
| 195 |
+
--res_dir "${wav_path}" ${infer_options}
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
python -m omnivoice.eval.speaker_similarity.sim \
|
| 199 |
+
--wav-path "${wav_path}" \
|
| 200 |
+
--test-list "${test_jsonl}" \
|
| 201 |
+
--decode-path "${wav_path}.sim.log" \
|
| 202 |
+
--model-dir "${TTS_EVAL_MODEL_DIR}"
|
| 203 |
+
|
| 204 |
+
python -m omnivoice.eval.wer.seedtts \
|
| 205 |
+
--wav-path "${wav_path}" \
|
| 206 |
+
--test-list "${test_jsonl}" \
|
| 207 |
+
--decode-path "${wav_path}.wer.log" \
|
| 208 |
+
--model-dir "${TTS_EVAL_MODEL_DIR}" \
|
| 209 |
+
--lang zh
|
| 210 |
+
|
| 211 |
+
python -m omnivoice.eval.mos.utmos \
|
| 212 |
+
--wav-path "${wav_path}" \
|
| 213 |
+
--test-list "${test_jsonl}" \
|
| 214 |
+
--decode-path "${wav_path}.mos.log" \
|
| 215 |
+
--model-dir "${TTS_EVAL_MODEL_DIR}"
|
| 216 |
+
fi
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
# ============================================================
|
| 221 |
+
# Stage 5: Evaluation on MiniMax multilingual
|
| 222 |
+
# ============================================================
|
| 223 |
+
|
| 224 |
+
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
|
| 225 |
+
echo "Stage 5: Evaluation on MiniMax multilingual"
|
| 226 |
+
wav_path="results/minimax"
|
| 227 |
+
test_jsonl="$(get_test_list minimax)"
|
| 228 |
+
|
| 229 |
+
python -m omnivoice.cli.infer_batch \
|
| 230 |
+
--model "${CHECKPOINT}" \
|
| 231 |
+
--test_list "${test_jsonl}" \
|
| 232 |
+
--res_dir "${wav_path}" ${infer_options}
|
| 233 |
+
|
| 234 |
+
python -m omnivoice.eval.speaker_similarity.sim \
|
| 235 |
+
--wav-path "${wav_path}" \
|
| 236 |
+
--test-list "${test_jsonl}" \
|
| 237 |
+
--decode-path "${wav_path}.sim.log" \
|
| 238 |
+
--model-dir "${TTS_EVAL_MODEL_DIR}"
|
| 239 |
+
|
| 240 |
+
python -m omnivoice.eval.wer.minimax \
|
| 241 |
+
--wav-path "${wav_path}" \
|
| 242 |
+
--test-list "${test_jsonl}" \
|
| 243 |
+
--decode-path "${wav_path}.wer.log" \
|
| 244 |
+
--model-dir "${TTS_EVAL_MODEL_DIR}"
|
| 245 |
+
fi
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
# ============================================================
|
| 249 |
+
# Stage 6: Evaluation on FLEURS multilingual
|
| 250 |
+
# ============================================================
|
| 251 |
+
|
| 252 |
+
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
|
| 253 |
+
echo "Stage 6: Evaluation on FLEURS multilingual"
|
| 254 |
+
wav_path="results/fleurs"
|
| 255 |
+
test_jsonl="$(get_test_list fleurs)"
|
| 256 |
+
|
| 257 |
+
python -m omnivoice.cli.infer_batch \
|
| 258 |
+
--model "${CHECKPOINT}" \
|
| 259 |
+
--test_list "${test_jsonl}" \
|
| 260 |
+
--res_dir "${wav_path}" ${infer_options}
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
python -m omnivoice.eval.speaker_similarity.sim \
|
| 264 |
+
--wav-path "${wav_path}" \
|
| 265 |
+
--test-list "${test_jsonl}" \
|
| 266 |
+
--decode-path "${wav_path}.sim.log" \
|
| 267 |
+
--model-dir "${TTS_EVAL_MODEL_DIR}"
|
| 268 |
+
|
| 269 |
+
# Evaluation on FLEURS requires omnilingual-asr, which has dependencies that
|
| 270 |
+
# conflict with other packages (at least the transformers package) in our project.
|
| 271 |
+
|
| 272 |
+
# To evaluate on FLEURS, we suggest that users set up a separate virtual
|
| 273 |
+
# environment to install omnilingual-asr. Install instructions can be found in
|
| 274 |
+
# https://github.com/facebookresearch/omnilingual-asr
|
| 275 |
+
|
| 276 |
+
python ${PWD}/../omnivoice/eval/wer/fleurs.py \
|
| 277 |
+
--wav-path "${wav_path}" \
|
| 278 |
+
--test-list "${test_jsonl}" \
|
| 279 |
+
--decode-path "${wav_path}.wer.log" \
|
| 280 |
+
--model-card omniASR_LLM_Unlimited_7B_v2 \
|
| 281 |
+
--chunk-size 100 \
|
| 282 |
+
--batch-size 50
|
| 283 |
+
fi
|
examples/run_finetune.sh
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# This script demonstrates how to fine-tune OmniVoice from a JSONL manifest.
|
| 4 |
+
|
| 5 |
+
set -euo pipefail
|
| 6 |
+
|
| 7 |
+
stage=0
|
| 8 |
+
stop_stage=1
|
| 9 |
+
|
| 10 |
+
# ====== Modify as needed ======
|
| 11 |
+
# GPUs to use
|
| 12 |
+
GPU_IDS="0,1"
|
| 13 |
+
NUM_GPUS=2
|
| 14 |
+
|
| 15 |
+
# Path to your input JSONL file
|
| 16 |
+
# (each line: {"id": ..., "audio_path": ..., "text": ..., "language_id": ...})
|
| 17 |
+
TRAIN_JSONL="/home/riftuser/OmniVoice/sync_data/data/train_raw.jsonl"
|
| 18 |
+
|
| 19 |
+
# Path to your dev JSONL file. Set to an empty string to skip the dev set.
|
| 20 |
+
DEV_JSONL="/home/riftuser/OmniVoice/sync_data/data/dev_raw.jsonl"
|
| 21 |
+
|
| 22 |
+
# Directory to write tokenized WebDataset shards
|
| 23 |
+
TOKEN_DIR="/home/riftuser/OmniVoice/sync_data/tokens"
|
| 24 |
+
|
| 25 |
+
# Audio tokenizer model (HuggingFace repo or local path)
|
| 26 |
+
TOKENIZER_PATH="eustlb/higgs-audio-v2-tokenizer"
|
| 27 |
+
|
| 28 |
+
# Training config file
|
| 29 |
+
# If you encounter issues with flex_attention on your GPU, use the SDPA config instead:
|
| 30 |
+
# TRAIN_CONFIG="config/train_config_finetune_sdpa.json"
|
| 31 |
+
TRAIN_CONFIG="/home/riftuser/OmniVoice/sync_data/configs/config.json"
|
| 32 |
+
|
| 33 |
+
# Data config file
|
| 34 |
+
data_config="/home/riftuser/OmniVoice/sync_data/configs/data_saudi.json"
|
| 35 |
+
|
| 36 |
+
# Output directory for fine-tuned checkpoints
|
| 37 |
+
OUTPUT_DIR="/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune"
|
| 38 |
+
# =================================
|
| 39 |
+
|
| 40 |
+
export PYTHONPATH="$(cd "$(dirname "$0")/.." && pwd):${PYTHONPATH:-}"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# Stage 0: Tokenize audio into WebDataset shards
|
| 44 |
+
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
| 45 |
+
echo "Stage 0: Tokenizing audio"
|
| 46 |
+
|
| 47 |
+
for split_jsonl_path in ${TRAIN_JSONL} ${DEV_JSONL}; do
|
| 48 |
+
if [ -z "${split_jsonl_path}" ]; then
|
| 49 |
+
continue
|
| 50 |
+
fi
|
| 51 |
+
|
| 52 |
+
if [ "${split_jsonl_path}" = "${TRAIN_JSONL}" ]; then
|
| 53 |
+
split="train"
|
| 54 |
+
else
|
| 55 |
+
split="dev"
|
| 56 |
+
fi
|
| 57 |
+
|
| 58 |
+
echo " Tokenizing ${split} from ${split_jsonl_path}"
|
| 59 |
+
|
| 60 |
+
CUDA_VISIBLE_DEVICES=${GPU_IDS} \
|
| 61 |
+
python -m omnivoice.scripts.extract_audio_tokens \
|
| 62 |
+
--input_jsonl "${split_jsonl_path}" \
|
| 63 |
+
--tar_output_pattern "${TOKEN_DIR}/${split}/audios/shard-%06d.tar" \
|
| 64 |
+
--jsonl_output_pattern "${TOKEN_DIR}/${split}/txts/shard-%06d.jsonl" \
|
| 65 |
+
--tokenizer_path "${TOKENIZER_PATH}" \
|
| 66 |
+
--nj_per_gpu 3 \
|
| 67 |
+
--shuffle True
|
| 68 |
+
|
| 69 |
+
echo " Done. Manifest written to ${TOKEN_DIR}/${split}/data.lst"
|
| 70 |
+
done
|
| 71 |
+
fi
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# Stage 1: Fine-tune
|
| 75 |
+
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
| 76 |
+
echo "Stage 1: Fine-tuning"
|
| 77 |
+
|
| 78 |
+
accelerate launch \
|
| 79 |
+
--gpu_ids "${GPU_IDS}" \
|
| 80 |
+
--num_processes ${NUM_GPUS} \
|
| 81 |
+
-m omnivoice.cli.train \
|
| 82 |
+
--train_config ${TRAIN_CONFIG} \
|
| 83 |
+
--data_config ${data_config} \
|
| 84 |
+
--output_dir ${OUTPUT_DIR}
|
| 85 |
+
fi
|
exp_v1/omnivoice_finetune/checkpoint-4500/chat_template.jinja
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if message.content is string %}
|
| 27 |
+
{%- set content = message.content %}
|
| 28 |
+
{%- else %}
|
| 29 |
+
{%- set content = '' %}
|
| 30 |
+
{%- endif %}
|
| 31 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 32 |
+
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
| 33 |
+
{%- elif message.role == "assistant" %}
|
| 34 |
+
{%- set reasoning_content = '' %}
|
| 35 |
+
{%- if message.reasoning_content is string %}
|
| 36 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 37 |
+
{%- else %}
|
| 38 |
+
{%- if '</think>' in content %}
|
| 39 |
+
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 40 |
+
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
|
| 41 |
+
{%- endif %}
|
| 42 |
+
{%- endif %}
|
| 43 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 44 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 45 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 46 |
+
{%- else %}
|
| 47 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 48 |
+
{%- endif %}
|
| 49 |
+
{%- else %}
|
| 50 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 51 |
+
{%- endif %}
|
| 52 |
+
{%- if message.tool_calls %}
|
| 53 |
+
{%- for tool_call in message.tool_calls %}
|
| 54 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 55 |
+
{{- '\n' }}
|
| 56 |
+
{%- endif %}
|
| 57 |
+
{%- if tool_call.function %}
|
| 58 |
+
{%- set tool_call = tool_call.function %}
|
| 59 |
+
{%- endif %}
|
| 60 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 61 |
+
{{- tool_call.name }}
|
| 62 |
+
{{- '", "arguments": ' }}
|
| 63 |
+
{%- if tool_call.arguments is string %}
|
| 64 |
+
{{- tool_call.arguments }}
|
| 65 |
+
{%- else %}
|
| 66 |
+
{{- tool_call.arguments | tojson }}
|
| 67 |
+
{%- endif %}
|
| 68 |
+
{{- '}\n</tool_call>' }}
|
| 69 |
+
{%- endfor %}
|
| 70 |
+
{%- endif %}
|
| 71 |
+
{{- '<|im_end|>\n' }}
|
| 72 |
+
{%- elif message.role == "tool" %}
|
| 73 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 74 |
+
{{- '<|im_start|>user' }}
|
| 75 |
+
{%- endif %}
|
| 76 |
+
{{- '\n<tool_response>\n' }}
|
| 77 |
+
{{- content }}
|
| 78 |
+
{{- '\n</tool_response>' }}
|
| 79 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 80 |
+
{{- '<|im_end|>\n' }}
|
| 81 |
+
{%- endif %}
|
| 82 |
+
{%- endif %}
|
| 83 |
+
{%- endfor %}
|
| 84 |
+
{%- if add_generation_prompt %}
|
| 85 |
+
{{- '<|im_start|>assistant\n' }}
|
| 86 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 87 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 88 |
+
{%- endif %}
|
| 89 |
+
{%- endif %}
|
exp_v1/omnivoice_finetune/checkpoint-4500/config.json
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"OmniVoice"
|
| 4 |
+
],
|
| 5 |
+
"audio_codebook_weights": [
|
| 6 |
+
8,
|
| 7 |
+
8,
|
| 8 |
+
6,
|
| 9 |
+
6,
|
| 10 |
+
4,
|
| 11 |
+
4,
|
| 12 |
+
2,
|
| 13 |
+
2
|
| 14 |
+
],
|
| 15 |
+
"audio_mask_id": 1024,
|
| 16 |
+
"audio_vocab_size": 1025,
|
| 17 |
+
"bos_token_id": null,
|
| 18 |
+
"dtype": "float32",
|
| 19 |
+
"eos_token_id": 151645,
|
| 20 |
+
"llm_config": {
|
| 21 |
+
"_name_or_path": "",
|
| 22 |
+
"architectures": [
|
| 23 |
+
"Qwen3ForCausalLM"
|
| 24 |
+
],
|
| 25 |
+
"attention_bias": false,
|
| 26 |
+
"attention_dropout": 0.0,
|
| 27 |
+
"bos_token_id": 151643,
|
| 28 |
+
"chunk_size_feed_forward": 0,
|
| 29 |
+
"dtype": "float32",
|
| 30 |
+
"eos_token_id": 151645,
|
| 31 |
+
"head_dim": 128,
|
| 32 |
+
"hidden_act": "silu",
|
| 33 |
+
"hidden_size": 1024,
|
| 34 |
+
"id2label": {
|
| 35 |
+
"0": "LABEL_0",
|
| 36 |
+
"1": "LABEL_1"
|
| 37 |
+
},
|
| 38 |
+
"initializer_range": 0.02,
|
| 39 |
+
"intermediate_size": 3072,
|
| 40 |
+
"is_encoder_decoder": false,
|
| 41 |
+
"label2id": {
|
| 42 |
+
"LABEL_0": 0,
|
| 43 |
+
"LABEL_1": 1
|
| 44 |
+
},
|
| 45 |
+
"layer_types": [
|
| 46 |
+
"full_attention",
|
| 47 |
+
"full_attention",
|
| 48 |
+
"full_attention",
|
| 49 |
+
"full_attention",
|
| 50 |
+
"full_attention",
|
| 51 |
+
"full_attention",
|
| 52 |
+
"full_attention",
|
| 53 |
+
"full_attention",
|
| 54 |
+
"full_attention",
|
| 55 |
+
"full_attention",
|
| 56 |
+
"full_attention",
|
| 57 |
+
"full_attention",
|
| 58 |
+
"full_attention",
|
| 59 |
+
"full_attention",
|
| 60 |
+
"full_attention",
|
| 61 |
+
"full_attention",
|
| 62 |
+
"full_attention",
|
| 63 |
+
"full_attention",
|
| 64 |
+
"full_attention",
|
| 65 |
+
"full_attention",
|
| 66 |
+
"full_attention",
|
| 67 |
+
"full_attention",
|
| 68 |
+
"full_attention",
|
| 69 |
+
"full_attention",
|
| 70 |
+
"full_attention",
|
| 71 |
+
"full_attention",
|
| 72 |
+
"full_attention",
|
| 73 |
+
"full_attention"
|
| 74 |
+
],
|
| 75 |
+
"max_position_embeddings": 40960,
|
| 76 |
+
"max_window_layers": 28,
|
| 77 |
+
"model_type": "qwen3",
|
| 78 |
+
"num_attention_heads": 16,
|
| 79 |
+
"num_hidden_layers": 28,
|
| 80 |
+
"num_key_value_heads": 8,
|
| 81 |
+
"output_attentions": false,
|
| 82 |
+
"output_hidden_states": false,
|
| 83 |
+
"pad_token_id": null,
|
| 84 |
+
"problem_type": null,
|
| 85 |
+
"return_dict": true,
|
| 86 |
+
"rms_norm_eps": 1e-06,
|
| 87 |
+
"rope_parameters": {
|
| 88 |
+
"rope_theta": 1000000,
|
| 89 |
+
"rope_type": "default"
|
| 90 |
+
},
|
| 91 |
+
"sliding_window": null,
|
| 92 |
+
"tie_word_embeddings": true,
|
| 93 |
+
"use_cache": true,
|
| 94 |
+
"use_sliding_window": false,
|
| 95 |
+
"vocab_size": 151676
|
| 96 |
+
},
|
| 97 |
+
"model_type": "omnivoice",
|
| 98 |
+
"num_audio_codebook": 8,
|
| 99 |
+
"pad_token_id": 151643,
|
| 100 |
+
"transformers_version": "5.3.0"
|
| 101 |
+
}
|
exp_v1/omnivoice_finetune/checkpoint-4500/tokenizer_config.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|im_end|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"extra_special_tokens": [
|
| 9 |
+
"<|denoise|>",
|
| 10 |
+
"<|lang_start|>",
|
| 11 |
+
"<|lang_end|>",
|
| 12 |
+
"<|instruct_start|>",
|
| 13 |
+
"<|instruct_end|>",
|
| 14 |
+
"<|text_start|>",
|
| 15 |
+
"<|text_end|>"
|
| 16 |
+
],
|
| 17 |
+
"is_local": true,
|
| 18 |
+
"local_files_only": false,
|
| 19 |
+
"model_max_length": 131072,
|
| 20 |
+
"pad_token": "<|endoftext|>",
|
| 21 |
+
"split_special_tokens": false,
|
| 22 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 23 |
+
"unk_token": null
|
| 24 |
+
}
|
exp_v1/omnivoice_finetune/checkpoint-4500/train_config.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune",
|
| 3 |
+
"data_config": "/home/riftuser/OmniVoice/sync_data/configs/data_saudi.json",
|
| 4 |
+
"llm_name_or_path": "Qwen/Qwen3-0.6B",
|
| 5 |
+
"audio_vocab_size": 1025,
|
| 6 |
+
"audio_mask_id": 1024,
|
| 7 |
+
"num_audio_codebook": 8,
|
| 8 |
+
"audio_codebook_weights": [
|
| 9 |
+
8,
|
| 10 |
+
8,
|
| 11 |
+
6,
|
| 12 |
+
6,
|
| 13 |
+
4,
|
| 14 |
+
4,
|
| 15 |
+
2,
|
| 16 |
+
2
|
| 17 |
+
],
|
| 18 |
+
"drop_cond_ratio": 0.1,
|
| 19 |
+
"prompt_ratio_range": [
|
| 20 |
+
0.0,
|
| 21 |
+
0.3
|
| 22 |
+
],
|
| 23 |
+
"mask_ratio_range": [
|
| 24 |
+
0.0,
|
| 25 |
+
1.0
|
| 26 |
+
],
|
| 27 |
+
"language_ratio": 0.8,
|
| 28 |
+
"use_pinyin_ratio": 0.0,
|
| 29 |
+
"instruct_ratio": 0.0,
|
| 30 |
+
"only_instruct_ratio": 0.0,
|
| 31 |
+
"resume_from_checkpoint": null,
|
| 32 |
+
"init_from_checkpoint": "oddadmix/lahgtna-omnivoice-v2",
|
| 33 |
+
"learning_rate": 1e-05,
|
| 34 |
+
"weight_decay": 0.01,
|
| 35 |
+
"max_grad_norm": 1.0,
|
| 36 |
+
"steps": 5000,
|
| 37 |
+
"seed": 42,
|
| 38 |
+
"lr_scheduler_type": "cosine",
|
| 39 |
+
"warmup_type": "ratio",
|
| 40 |
+
"warmup_ratio": 0.01,
|
| 41 |
+
"warmup_steps": 0,
|
| 42 |
+
"batch_tokens": 4096,
|
| 43 |
+
"gradient_accumulation_steps": 2,
|
| 44 |
+
"num_workers": 3,
|
| 45 |
+
"mixed_precision": "bf16",
|
| 46 |
+
"allow_tf32": true,
|
| 47 |
+
"use_deepspeed": false,
|
| 48 |
+
"deepspeed_config": null,
|
| 49 |
+
"attn_implementation": "sdpa",
|
| 50 |
+
"max_sample_tokens": 2000,
|
| 51 |
+
"min_sample_tokens": 50,
|
| 52 |
+
"max_batch_size": 64,
|
| 53 |
+
"logging_steps": 50,
|
| 54 |
+
"eval_steps": 500,
|
| 55 |
+
"save_steps": 500,
|
| 56 |
+
"keep_last_n_checkpoints": -1
|
| 57 |
+
}
|
exp_v1/omnivoice_finetune/checkpoint-500/chat_template.jinja
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if message.content is string %}
|
| 27 |
+
{%- set content = message.content %}
|
| 28 |
+
{%- else %}
|
| 29 |
+
{%- set content = '' %}
|
| 30 |
+
{%- endif %}
|
| 31 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 32 |
+
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
| 33 |
+
{%- elif message.role == "assistant" %}
|
| 34 |
+
{%- set reasoning_content = '' %}
|
| 35 |
+
{%- if message.reasoning_content is string %}
|
| 36 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 37 |
+
{%- else %}
|
| 38 |
+
{%- if '</think>' in content %}
|
| 39 |
+
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 40 |
+
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
|
| 41 |
+
{%- endif %}
|
| 42 |
+
{%- endif %}
|
| 43 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 44 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 45 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 46 |
+
{%- else %}
|
| 47 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 48 |
+
{%- endif %}
|
| 49 |
+
{%- else %}
|
| 50 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 51 |
+
{%- endif %}
|
| 52 |
+
{%- if message.tool_calls %}
|
| 53 |
+
{%- for tool_call in message.tool_calls %}
|
| 54 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 55 |
+
{{- '\n' }}
|
| 56 |
+
{%- endif %}
|
| 57 |
+
{%- if tool_call.function %}
|
| 58 |
+
{%- set tool_call = tool_call.function %}
|
| 59 |
+
{%- endif %}
|
| 60 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 61 |
+
{{- tool_call.name }}
|
| 62 |
+
{{- '", "arguments": ' }}
|
| 63 |
+
{%- if tool_call.arguments is string %}
|
| 64 |
+
{{- tool_call.arguments }}
|
| 65 |
+
{%- else %}
|
| 66 |
+
{{- tool_call.arguments | tojson }}
|
| 67 |
+
{%- endif %}
|
| 68 |
+
{{- '}\n</tool_call>' }}
|
| 69 |
+
{%- endfor %}
|
| 70 |
+
{%- endif %}
|
| 71 |
+
{{- '<|im_end|>\n' }}
|
| 72 |
+
{%- elif message.role == "tool" %}
|
| 73 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 74 |
+
{{- '<|im_start|>user' }}
|
| 75 |
+
{%- endif %}
|
| 76 |
+
{{- '\n<tool_response>\n' }}
|
| 77 |
+
{{- content }}
|
| 78 |
+
{{- '\n</tool_response>' }}
|
| 79 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 80 |
+
{{- '<|im_end|>\n' }}
|
| 81 |
+
{%- endif %}
|
| 82 |
+
{%- endif %}
|
| 83 |
+
{%- endfor %}
|
| 84 |
+
{%- if add_generation_prompt %}
|
| 85 |
+
{{- '<|im_start|>assistant\n' }}
|
| 86 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 87 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 88 |
+
{%- endif %}
|
| 89 |
+
{%- endif %}
|
exp_v1/omnivoice_finetune/checkpoint-500/config.json
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"OmniVoice"
|
| 4 |
+
],
|
| 5 |
+
"audio_codebook_weights": [
|
| 6 |
+
8,
|
| 7 |
+
8,
|
| 8 |
+
6,
|
| 9 |
+
6,
|
| 10 |
+
4,
|
| 11 |
+
4,
|
| 12 |
+
2,
|
| 13 |
+
2
|
| 14 |
+
],
|
| 15 |
+
"audio_mask_id": 1024,
|
| 16 |
+
"audio_vocab_size": 1025,
|
| 17 |
+
"bos_token_id": null,
|
| 18 |
+
"dtype": "float32",
|
| 19 |
+
"eos_token_id": 151645,
|
| 20 |
+
"llm_config": {
|
| 21 |
+
"_name_or_path": "",
|
| 22 |
+
"architectures": [
|
| 23 |
+
"Qwen3ForCausalLM"
|
| 24 |
+
],
|
| 25 |
+
"attention_bias": false,
|
| 26 |
+
"attention_dropout": 0.0,
|
| 27 |
+
"bos_token_id": 151643,
|
| 28 |
+
"chunk_size_feed_forward": 0,
|
| 29 |
+
"dtype": "float32",
|
| 30 |
+
"eos_token_id": 151645,
|
| 31 |
+
"head_dim": 128,
|
| 32 |
+
"hidden_act": "silu",
|
| 33 |
+
"hidden_size": 1024,
|
| 34 |
+
"id2label": {
|
| 35 |
+
"0": "LABEL_0",
|
| 36 |
+
"1": "LABEL_1"
|
| 37 |
+
},
|
| 38 |
+
"initializer_range": 0.02,
|
| 39 |
+
"intermediate_size": 3072,
|
| 40 |
+
"is_encoder_decoder": false,
|
| 41 |
+
"label2id": {
|
| 42 |
+
"LABEL_0": 0,
|
| 43 |
+
"LABEL_1": 1
|
| 44 |
+
},
|
| 45 |
+
"layer_types": [
|
| 46 |
+
"full_attention",
|
| 47 |
+
"full_attention",
|
| 48 |
+
"full_attention",
|
| 49 |
+
"full_attention",
|
| 50 |
+
"full_attention",
|
| 51 |
+
"full_attention",
|
| 52 |
+
"full_attention",
|
| 53 |
+
"full_attention",
|
| 54 |
+
"full_attention",
|
| 55 |
+
"full_attention",
|
| 56 |
+
"full_attention",
|
| 57 |
+
"full_attention",
|
| 58 |
+
"full_attention",
|
| 59 |
+
"full_attention",
|
| 60 |
+
"full_attention",
|
| 61 |
+
"full_attention",
|
| 62 |
+
"full_attention",
|
| 63 |
+
"full_attention",
|
| 64 |
+
"full_attention",
|
| 65 |
+
"full_attention",
|
| 66 |
+
"full_attention",
|
| 67 |
+
"full_attention",
|
| 68 |
+
"full_attention",
|
| 69 |
+
"full_attention",
|
| 70 |
+
"full_attention",
|
| 71 |
+
"full_attention",
|
| 72 |
+
"full_attention",
|
| 73 |
+
"full_attention"
|
| 74 |
+
],
|
| 75 |
+
"max_position_embeddings": 40960,
|
| 76 |
+
"max_window_layers": 28,
|
| 77 |
+
"model_type": "qwen3",
|
| 78 |
+
"num_attention_heads": 16,
|
| 79 |
+
"num_hidden_layers": 28,
|
| 80 |
+
"num_key_value_heads": 8,
|
| 81 |
+
"output_attentions": false,
|
| 82 |
+
"output_hidden_states": false,
|
| 83 |
+
"pad_token_id": null,
|
| 84 |
+
"problem_type": null,
|
| 85 |
+
"return_dict": true,
|
| 86 |
+
"rms_norm_eps": 1e-06,
|
| 87 |
+
"rope_parameters": {
|
| 88 |
+
"rope_theta": 1000000,
|
| 89 |
+
"rope_type": "default"
|
| 90 |
+
},
|
| 91 |
+
"sliding_window": null,
|
| 92 |
+
"tie_word_embeddings": true,
|
| 93 |
+
"use_cache": true,
|
| 94 |
+
"use_sliding_window": false,
|
| 95 |
+
"vocab_size": 151676
|
| 96 |
+
},
|
| 97 |
+
"model_type": "omnivoice",
|
| 98 |
+
"num_audio_codebook": 8,
|
| 99 |
+
"pad_token_id": 151643,
|
| 100 |
+
"transformers_version": "5.3.0"
|
| 101 |
+
}
|
exp_v1/omnivoice_finetune/checkpoint-500/train_config.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune",
|
| 3 |
+
"data_config": "/home/riftuser/OmniVoice/sync_data/configs/data_saudi.json",
|
| 4 |
+
"llm_name_or_path": "Qwen/Qwen3-0.6B",
|
| 5 |
+
"audio_vocab_size": 1025,
|
| 6 |
+
"audio_mask_id": 1024,
|
| 7 |
+
"num_audio_codebook": 8,
|
| 8 |
+
"audio_codebook_weights": [
|
| 9 |
+
8,
|
| 10 |
+
8,
|
| 11 |
+
6,
|
| 12 |
+
6,
|
| 13 |
+
4,
|
| 14 |
+
4,
|
| 15 |
+
2,
|
| 16 |
+
2
|
| 17 |
+
],
|
| 18 |
+
"drop_cond_ratio": 0.1,
|
| 19 |
+
"prompt_ratio_range": [
|
| 20 |
+
0.0,
|
| 21 |
+
0.3
|
| 22 |
+
],
|
| 23 |
+
"mask_ratio_range": [
|
| 24 |
+
0.0,
|
| 25 |
+
1.0
|
| 26 |
+
],
|
| 27 |
+
"language_ratio": 0.8,
|
| 28 |
+
"use_pinyin_ratio": 0.0,
|
| 29 |
+
"instruct_ratio": 0.0,
|
| 30 |
+
"only_instruct_ratio": 0.0,
|
| 31 |
+
"resume_from_checkpoint": null,
|
| 32 |
+
"init_from_checkpoint": "oddadmix/lahgtna-omnivoice-v2",
|
| 33 |
+
"learning_rate": 1e-05,
|
| 34 |
+
"weight_decay": 0.01,
|
| 35 |
+
"max_grad_norm": 1.0,
|
| 36 |
+
"steps": 5000,
|
| 37 |
+
"seed": 42,
|
| 38 |
+
"lr_scheduler_type": "cosine",
|
| 39 |
+
"warmup_type": "ratio",
|
| 40 |
+
"warmup_ratio": 0.01,
|
| 41 |
+
"warmup_steps": 0,
|
| 42 |
+
"batch_tokens": 4096,
|
| 43 |
+
"gradient_accumulation_steps": 2,
|
| 44 |
+
"num_workers": 3,
|
| 45 |
+
"mixed_precision": "bf16",
|
| 46 |
+
"allow_tf32": true,
|
| 47 |
+
"use_deepspeed": false,
|
| 48 |
+
"deepspeed_config": null,
|
| 49 |
+
"attn_implementation": "sdpa",
|
| 50 |
+
"max_sample_tokens": 2000,
|
| 51 |
+
"min_sample_tokens": 50,
|
| 52 |
+
"max_batch_size": 64,
|
| 53 |
+
"logging_steps": 50,
|
| 54 |
+
"eval_steps": 500,
|
| 55 |
+
"save_steps": 500,
|
| 56 |
+
"keep_last_n_checkpoints": -1
|
| 57 |
+
}
|
exp_v1/omnivoice_finetune/checkpoint-5000/chat_template.jinja
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0].role == 'system' %}
|
| 4 |
+
{{- messages[0].content + '\n\n' }}
|
| 5 |
+
{%- endif %}
|
| 6 |
+
{{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 7 |
+
{%- for tool in tools %}
|
| 8 |
+
{{- "\n" }}
|
| 9 |
+
{{- tool | tojson }}
|
| 10 |
+
{%- endfor %}
|
| 11 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 12 |
+
{%- else %}
|
| 13 |
+
{%- if messages[0].role == 'system' %}
|
| 14 |
+
{{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
|
| 15 |
+
{%- endif %}
|
| 16 |
+
{%- endif %}
|
| 17 |
+
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
|
| 18 |
+
{%- for message in messages[::-1] %}
|
| 19 |
+
{%- set index = (messages|length - 1) - loop.index0 %}
|
| 20 |
+
{%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
|
| 21 |
+
{%- set ns.multi_step_tool = false %}
|
| 22 |
+
{%- set ns.last_query_index = index %}
|
| 23 |
+
{%- endif %}
|
| 24 |
+
{%- endfor %}
|
| 25 |
+
{%- for message in messages %}
|
| 26 |
+
{%- if message.content is string %}
|
| 27 |
+
{%- set content = message.content %}
|
| 28 |
+
{%- else %}
|
| 29 |
+
{%- set content = '' %}
|
| 30 |
+
{%- endif %}
|
| 31 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
|
| 32 |
+
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
|
| 33 |
+
{%- elif message.role == "assistant" %}
|
| 34 |
+
{%- set reasoning_content = '' %}
|
| 35 |
+
{%- if message.reasoning_content is string %}
|
| 36 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 37 |
+
{%- else %}
|
| 38 |
+
{%- if '</think>' in content %}
|
| 39 |
+
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
|
| 40 |
+
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
|
| 41 |
+
{%- endif %}
|
| 42 |
+
{%- endif %}
|
| 43 |
+
{%- if loop.index0 > ns.last_query_index %}
|
| 44 |
+
{%- if loop.last or (not loop.last and reasoning_content) %}
|
| 45 |
+
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
|
| 46 |
+
{%- else %}
|
| 47 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 48 |
+
{%- endif %}
|
| 49 |
+
{%- else %}
|
| 50 |
+
{{- '<|im_start|>' + message.role + '\n' + content }}
|
| 51 |
+
{%- endif %}
|
| 52 |
+
{%- if message.tool_calls %}
|
| 53 |
+
{%- for tool_call in message.tool_calls %}
|
| 54 |
+
{%- if (loop.first and content) or (not loop.first) %}
|
| 55 |
+
{{- '\n' }}
|
| 56 |
+
{%- endif %}
|
| 57 |
+
{%- if tool_call.function %}
|
| 58 |
+
{%- set tool_call = tool_call.function %}
|
| 59 |
+
{%- endif %}
|
| 60 |
+
{{- '<tool_call>\n{"name": "' }}
|
| 61 |
+
{{- tool_call.name }}
|
| 62 |
+
{{- '", "arguments": ' }}
|
| 63 |
+
{%- if tool_call.arguments is string %}
|
| 64 |
+
{{- tool_call.arguments }}
|
| 65 |
+
{%- else %}
|
| 66 |
+
{{- tool_call.arguments | tojson }}
|
| 67 |
+
{%- endif %}
|
| 68 |
+
{{- '}\n</tool_call>' }}
|
| 69 |
+
{%- endfor %}
|
| 70 |
+
{%- endif %}
|
| 71 |
+
{{- '<|im_end|>\n' }}
|
| 72 |
+
{%- elif message.role == "tool" %}
|
| 73 |
+
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
|
| 74 |
+
{{- '<|im_start|>user' }}
|
| 75 |
+
{%- endif %}
|
| 76 |
+
{{- '\n<tool_response>\n' }}
|
| 77 |
+
{{- content }}
|
| 78 |
+
{{- '\n</tool_response>' }}
|
| 79 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 80 |
+
{{- '<|im_end|>\n' }}
|
| 81 |
+
{%- endif %}
|
| 82 |
+
{%- endif %}
|
| 83 |
+
{%- endfor %}
|
| 84 |
+
{%- if add_generation_prompt %}
|
| 85 |
+
{{- '<|im_start|>assistant\n' }}
|
| 86 |
+
{%- if enable_thinking is defined and enable_thinking is false %}
|
| 87 |
+
{{- '<think>\n\n</think>\n\n' }}
|
| 88 |
+
{%- endif %}
|
| 89 |
+
{%- endif %}
|
exp_v1/omnivoice_finetune/checkpoint-5000/config.json
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"OmniVoice"
|
| 4 |
+
],
|
| 5 |
+
"audio_codebook_weights": [
|
| 6 |
+
8,
|
| 7 |
+
8,
|
| 8 |
+
6,
|
| 9 |
+
6,
|
| 10 |
+
4,
|
| 11 |
+
4,
|
| 12 |
+
2,
|
| 13 |
+
2
|
| 14 |
+
],
|
| 15 |
+
"audio_mask_id": 1024,
|
| 16 |
+
"audio_vocab_size": 1025,
|
| 17 |
+
"bos_token_id": null,
|
| 18 |
+
"dtype": "float32",
|
| 19 |
+
"eos_token_id": 151645,
|
| 20 |
+
"llm_config": {
|
| 21 |
+
"_name_or_path": "",
|
| 22 |
+
"architectures": [
|
| 23 |
+
"Qwen3ForCausalLM"
|
| 24 |
+
],
|
| 25 |
+
"attention_bias": false,
|
| 26 |
+
"attention_dropout": 0.0,
|
| 27 |
+
"bos_token_id": 151643,
|
| 28 |
+
"chunk_size_feed_forward": 0,
|
| 29 |
+
"dtype": "float32",
|
| 30 |
+
"eos_token_id": 151645,
|
| 31 |
+
"head_dim": 128,
|
| 32 |
+
"hidden_act": "silu",
|
| 33 |
+
"hidden_size": 1024,
|
| 34 |
+
"id2label": {
|
| 35 |
+
"0": "LABEL_0",
|
| 36 |
+
"1": "LABEL_1"
|
| 37 |
+
},
|
| 38 |
+
"initializer_range": 0.02,
|
| 39 |
+
"intermediate_size": 3072,
|
| 40 |
+
"is_encoder_decoder": false,
|
| 41 |
+
"label2id": {
|
| 42 |
+
"LABEL_0": 0,
|
| 43 |
+
"LABEL_1": 1
|
| 44 |
+
},
|
| 45 |
+
"layer_types": [
|
| 46 |
+
"full_attention",
|
| 47 |
+
"full_attention",
|
| 48 |
+
"full_attention",
|
| 49 |
+
"full_attention",
|
| 50 |
+
"full_attention",
|
| 51 |
+
"full_attention",
|
| 52 |
+
"full_attention",
|
| 53 |
+
"full_attention",
|
| 54 |
+
"full_attention",
|
| 55 |
+
"full_attention",
|
| 56 |
+
"full_attention",
|
| 57 |
+
"full_attention",
|
| 58 |
+
"full_attention",
|
| 59 |
+
"full_attention",
|
| 60 |
+
"full_attention",
|
| 61 |
+
"full_attention",
|
| 62 |
+
"full_attention",
|
| 63 |
+
"full_attention",
|
| 64 |
+
"full_attention",
|
| 65 |
+
"full_attention",
|
| 66 |
+
"full_attention",
|
| 67 |
+
"full_attention",
|
| 68 |
+
"full_attention",
|
| 69 |
+
"full_attention",
|
| 70 |
+
"full_attention",
|
| 71 |
+
"full_attention",
|
| 72 |
+
"full_attention",
|
| 73 |
+
"full_attention"
|
| 74 |
+
],
|
| 75 |
+
"max_position_embeddings": 40960,
|
| 76 |
+
"max_window_layers": 28,
|
| 77 |
+
"model_type": "qwen3",
|
| 78 |
+
"num_attention_heads": 16,
|
| 79 |
+
"num_hidden_layers": 28,
|
| 80 |
+
"num_key_value_heads": 8,
|
| 81 |
+
"output_attentions": false,
|
| 82 |
+
"output_hidden_states": false,
|
| 83 |
+
"pad_token_id": null,
|
| 84 |
+
"problem_type": null,
|
| 85 |
+
"return_dict": true,
|
| 86 |
+
"rms_norm_eps": 1e-06,
|
| 87 |
+
"rope_parameters": {
|
| 88 |
+
"rope_theta": 1000000,
|
| 89 |
+
"rope_type": "default"
|
| 90 |
+
},
|
| 91 |
+
"sliding_window": null,
|
| 92 |
+
"tie_word_embeddings": true,
|
| 93 |
+
"use_cache": true,
|
| 94 |
+
"use_sliding_window": false,
|
| 95 |
+
"vocab_size": 151676
|
| 96 |
+
},
|
| 97 |
+
"model_type": "omnivoice",
|
| 98 |
+
"num_audio_codebook": 8,
|
| 99 |
+
"pad_token_id": 151643,
|
| 100 |
+
"transformers_version": "5.3.0"
|
| 101 |
+
}
|
exp_v1/omnivoice_finetune/checkpoint-5000/tokenizer_config.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": null,
|
| 5 |
+
"clean_up_tokenization_spaces": false,
|
| 6 |
+
"eos_token": "<|im_end|>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"extra_special_tokens": [
|
| 9 |
+
"<|denoise|>",
|
| 10 |
+
"<|lang_start|>",
|
| 11 |
+
"<|lang_end|>",
|
| 12 |
+
"<|instruct_start|>",
|
| 13 |
+
"<|instruct_end|>",
|
| 14 |
+
"<|text_start|>",
|
| 15 |
+
"<|text_end|>"
|
| 16 |
+
],
|
| 17 |
+
"is_local": true,
|
| 18 |
+
"local_files_only": false,
|
| 19 |
+
"model_max_length": 131072,
|
| 20 |
+
"pad_token": "<|endoftext|>",
|
| 21 |
+
"split_special_tokens": false,
|
| 22 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 23 |
+
"unk_token": null
|
| 24 |
+
}
|
exp_v1/omnivoice_finetune/checkpoint-5000/train_config.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune",
|
| 3 |
+
"data_config": "/home/riftuser/OmniVoice/sync_data/configs/data_saudi.json",
|
| 4 |
+
"llm_name_or_path": "Qwen/Qwen3-0.6B",
|
| 5 |
+
"audio_vocab_size": 1025,
|
| 6 |
+
"audio_mask_id": 1024,
|
| 7 |
+
"num_audio_codebook": 8,
|
| 8 |
+
"audio_codebook_weights": [
|
| 9 |
+
8,
|
| 10 |
+
8,
|
| 11 |
+
6,
|
| 12 |
+
6,
|
| 13 |
+
4,
|
| 14 |
+
4,
|
| 15 |
+
2,
|
| 16 |
+
2
|
| 17 |
+
],
|
| 18 |
+
"drop_cond_ratio": 0.1,
|
| 19 |
+
"prompt_ratio_range": [
|
| 20 |
+
0.0,
|
| 21 |
+
0.3
|
| 22 |
+
],
|
| 23 |
+
"mask_ratio_range": [
|
| 24 |
+
0.0,
|
| 25 |
+
1.0
|
| 26 |
+
],
|
| 27 |
+
"language_ratio": 0.8,
|
| 28 |
+
"use_pinyin_ratio": 0.0,
|
| 29 |
+
"instruct_ratio": 0.0,
|
| 30 |
+
"only_instruct_ratio": 0.0,
|
| 31 |
+
"resume_from_checkpoint": null,
|
| 32 |
+
"init_from_checkpoint": "oddadmix/lahgtna-omnivoice-v2",
|
| 33 |
+
"learning_rate": 1e-05,
|
| 34 |
+
"weight_decay": 0.01,
|
| 35 |
+
"max_grad_norm": 1.0,
|
| 36 |
+
"steps": 5000,
|
| 37 |
+
"seed": 42,
|
| 38 |
+
"lr_scheduler_type": "cosine",
|
| 39 |
+
"warmup_type": "ratio",
|
| 40 |
+
"warmup_ratio": 0.01,
|
| 41 |
+
"warmup_steps": 0,
|
| 42 |
+
"batch_tokens": 4096,
|
| 43 |
+
"gradient_accumulation_steps": 2,
|
| 44 |
+
"num_workers": 3,
|
| 45 |
+
"mixed_precision": "bf16",
|
| 46 |
+
"allow_tf32": true,
|
| 47 |
+
"use_deepspeed": false,
|
| 48 |
+
"deepspeed_config": null,
|
| 49 |
+
"attn_implementation": "sdpa",
|
| 50 |
+
"max_sample_tokens": 2000,
|
| 51 |
+
"min_sample_tokens": 50,
|
| 52 |
+
"max_batch_size": 64,
|
| 53 |
+
"logging_steps": 50,
|
| 54 |
+
"eval_steps": 500,
|
| 55 |
+
"save_steps": 500,
|
| 56 |
+
"keep_last_n_checkpoints": -1
|
| 57 |
+
}
|
exp_v1/omnivoice_finetune/initial_config.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"output_dir": "/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune",
|
| 3 |
+
"data_config": "/home/riftuser/OmniVoice/sync_data/configs/data_saudi.json",
|
| 4 |
+
"llm_name_or_path": "Qwen/Qwen3-0.6B",
|
| 5 |
+
"audio_vocab_size": 1025,
|
| 6 |
+
"audio_mask_id": 1024,
|
| 7 |
+
"num_audio_codebook": 8,
|
| 8 |
+
"audio_codebook_weights": [
|
| 9 |
+
8,
|
| 10 |
+
8,
|
| 11 |
+
6,
|
| 12 |
+
6,
|
| 13 |
+
4,
|
| 14 |
+
4,
|
| 15 |
+
2,
|
| 16 |
+
2
|
| 17 |
+
],
|
| 18 |
+
"drop_cond_ratio": 0.1,
|
| 19 |
+
"prompt_ratio_range": [
|
| 20 |
+
0.0,
|
| 21 |
+
0.3
|
| 22 |
+
],
|
| 23 |
+
"mask_ratio_range": [
|
| 24 |
+
0.0,
|
| 25 |
+
1.0
|
| 26 |
+
],
|
| 27 |
+
"language_ratio": 0.8,
|
| 28 |
+
"use_pinyin_ratio": 0.0,
|
| 29 |
+
"instruct_ratio": 0.0,
|
| 30 |
+
"only_instruct_ratio": 0.0,
|
| 31 |
+
"resume_from_checkpoint": null,
|
| 32 |
+
"init_from_checkpoint": "oddadmix/lahgtna-omnivoice-v2",
|
| 33 |
+
"learning_rate": 1e-05,
|
| 34 |
+
"weight_decay": 0.01,
|
| 35 |
+
"max_grad_norm": 1.0,
|
| 36 |
+
"steps": 5000,
|
| 37 |
+
"seed": 42,
|
| 38 |
+
"lr_scheduler_type": "cosine",
|
| 39 |
+
"warmup_type": "ratio",
|
| 40 |
+
"warmup_ratio": 0.01,
|
| 41 |
+
"warmup_steps": 0,
|
| 42 |
+
"batch_tokens": 4096,
|
| 43 |
+
"gradient_accumulation_steps": 2,
|
| 44 |
+
"num_workers": 3,
|
| 45 |
+
"mixed_precision": "bf16",
|
| 46 |
+
"allow_tf32": true,
|
| 47 |
+
"use_deepspeed": false,
|
| 48 |
+
"deepspeed_config": null,
|
| 49 |
+
"attn_implementation": "sdpa",
|
| 50 |
+
"max_sample_tokens": 2000,
|
| 51 |
+
"min_sample_tokens": 50,
|
| 52 |
+
"max_batch_size": 64,
|
| 53 |
+
"logging_steps": 50,
|
| 54 |
+
"eval_steps": 500,
|
| 55 |
+
"save_steps": 500,
|
| 56 |
+
"keep_last_n_checkpoints": -1
|
| 57 |
+
}
|
infer.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from omnivoice import OmniVoice
|
| 3 |
+
import soundfile as sf
|
| 4 |
+
import torch
|
| 5 |
+
|
| 6 |
+
# Smoke-test script: load a fine-tuned checkpoint, synthesize one Saudi-dialect
# passage with a reference voice, save it, and report the real-time factor.

# Sample rate used both when writing the WAV and when converting the sample
# count to seconds. NOTE(review): presumably the model's native output rate —
# confirm against the checkpoint's audio tokenizer.
SAMPLE_RATE = 24000

model = OmniVoice.from_pretrained(
    "/home/riftuser/OmniVoice/exp_v1/omnivoice_finetune/checkpoint-500",
    device_map="cuda:0",
    dtype=torch.float16,
)

saudi_tts_text = """
السلام عليكم يا شباب، كيف الحال؟

اليوم عندي تقريبًا 3 meetings مهمة، وأول meeting بتبدأ الساعة 10:30 الصباح. [sigh] [sigh]

بصراحة كنت ناوي أخلص الـ report بدري، لكن الـ internet صار بطيء بشكل مو طبيعي. [dissatisfaction-hnn] [sigh] [dissatisfaction-hnn]

قلت خلاص، خلني آخذ coffee وأروق شوي قبل ما أبدأ الشغل. [laughter] [laughter] [confirmation-en]

وبعدين اكتشفت إن الـ laptop يحتاج update من أمس! [surprise-oh] [dissatisfaction-hnn]

قلت يا ساتر، شكله يوم طويل جدًا. [sigh] [laughter]

لكن الحمد لله الأمور مشت تمام بالنهاية.
"""

# Measure generation time. perf_counter() is monotonic and high-resolution,
# so the interval cannot be skewed by wall-clock adjustments.
start_time = time.perf_counter()

audio = model.generate(
    text=saudi_tts_text,
    ref_audio="ref_audio/women_ref_1.mp3",
    ref_text="شوفي يا حلوة هالكريم الجديد للبشرة، يخلي وجهك مثل القمر! ",
    instruct="female, young adult, high pitch",
    speed=1.1,
    num_step=25,
    guidance_scale=2.0,
    t_shift=0.1,
    position_temperature=3,
    layer_penalty_factor=5.0,
)

generation_time = time.perf_counter() - start_time

# Save audio (only the first returned item is written).
sf.write("out_1.wav", audio[0], SAMPLE_RATE)

# Calculate audio duration in seconds from the number of samples.
audio_duration = len(audio[0]) / SAMPLE_RATE

# Calculate RTF (generation time per second of audio). An empty result would
# otherwise raise ZeroDivisionError and hide the timing that was just measured.
rtf = generation_time / audio_duration if audio_duration > 0 else float("inf")

print(f"Generation Time: {generation_time:.2f} sec")
print(f"Audio Duration: {audio_duration:.2f} sec")
print(f"RTF: {rtf:.4f}")
|
omnivoice/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import warnings
|
| 2 |
+
from importlib.metadata import PackageNotFoundError, version
|
| 3 |
+
|
| 4 |
+
# Warning filters are registered first, ahead of the package import at the
# bottom of this module; the registration order matches the original.
warnings.filterwarnings(action="ignore", module="torchaudio")
warnings.filterwarnings(
    action="ignore",
    message="invalid escape sequence",
    category=SyntaxWarning,
    module="pydub.utils",
)
warnings.filterwarnings(
    action="ignore",
    category=FutureWarning,
    module="torch.distributed.algorithms.ddp_comm_hooks",
)

# Expose the installed distribution's version; fall back to a placeholder
# when no "omnivoice" distribution metadata can be found.
try:
    __version__ = version("omnivoice")
except PackageNotFoundError:
    __version__ = "0.0.0"

# Deliberately imported after the filters above have been installed.
from omnivoice.models.omnivoice import (
    OmniVoice,
    OmniVoiceConfig,
    OmniVoiceGenerationConfig,
)

# Public API re-exported at package level.
__all__ = [
    "OmniVoice",
    "OmniVoiceConfig",
    "OmniVoiceGenerationConfig",
]
|
prepare_sync_data.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import io
|
| 3 |
+
import json
|
| 4 |
+
import random
|
| 5 |
+
import shutil
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
import soundfile as sf
|
| 9 |
+
from datasets import Audio, load_dataset
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
|
| 12 |
+
DEFAULT_REPO = "saleh1312/syncing_data"
|
| 13 |
+
MAX_DURATION = 10.0
|
| 14 |
+
|
| 15 |
+
def _write_jsonl(path, records):
    """Write *records* to *path* as UTF-8 JSON Lines, one object per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)


def main():
    """Export a Hugging Face audio dataset as WAV files plus raw JSONL manifests.

    Command-line options:
        --repo  Dataset ID passed to ``load_dataset`` (default ``DEFAULT_REPO``).
        --out   Output directory; everything is written under ``<out>/data``.
        --seed  Seed for the train/dev shuffle.

    Steps:
        1. Delete and recreate ``<out>/data/wavs``.
        2. For every row of the ``train`` split, read the undecoded audio
           bytes, drop clips longer than ``MAX_DURATION`` seconds, down-mix
           multi-channel audio by averaging channels, and write a 16-bit PCM
           WAV at the clip's own sample rate.
        3. Shuffle the kept records with the given seed and split them 95/5
           into ``train_raw.jsonl`` and ``dev_raw.jsonl``.
    """
    parser = argparse.ArgumentParser(description="Prepare data for OmniVoice Training")
    parser.add_argument("--repo", default=DEFAULT_REPO, help="HF Dataset ID")
    parser.add_argument("--out", default="sync_data", help="Output directory")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    args = parser.parse_args()

    out_root = Path(args.out).resolve() / "data"
    if out_root.exists():
        # Start from a clean slate so stale WAVs/manifests never leak into a new run.
        print(f"Cleaning up old directory: {out_root}")
        shutil.rmtree(out_root)

    wav_dir = out_root / "wavs"
    wav_dir.mkdir(parents=True)

    print(f"Loading dataset: {args.repo}")
    ds = load_dataset(args.repo, split="train")
    # decode=False keeps the raw encoded bytes so soundfile can read them directly.
    ds = ds.cast_column("audio", Audio(decode=False))

    processed_records = []
    skipped = 0
    missing_audio = 0

    print("Processing audio files...")
    for i, row in enumerate(tqdm(ds)):
        audio_bytes = row["audio"]["bytes"]
        if not audio_bytes:
            # Previously dropped silently; count them so the summary is honest.
            missing_audio += 1
            continue

        # Load audio to check duration
        with io.BytesIO(audio_bytes) as buf:
            samples, sr = sf.read(buf)

        # len() is the frame count for both mono and multi-channel arrays.
        if len(samples) / sr > MAX_DURATION:
            skipped += 1
            continue

        # Ensure mono by averaging across the channel axis.
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        # The ID uses the dataset row index, so IDs stay stable across runs
        # even though some rows are skipped.
        sample_id = f"sample_{i:06d}"
        wav_path = wav_dir / f"{sample_id}.wav"
        sf.write(wav_path, samples, sr, subtype='PCM_16')

        # A missing, None, or blank tone falls back to "neutral" instead of
        # leaking the literal string "none" into the instruct prompt.
        tone = str(row.get("tone") or "").strip().lower() or "neutral"
        processed_records.append({
            "id": sample_id,
            "audio_path": str(wav_path.resolve()),
            "text": row["text"],
            "language_id": "ar",
            "instruct": f"saudi, conversational, {tone}",
        })

    # A private generator gives the same permutation as seeding the global
    # one with this seed, without mutating process-wide RNG state.
    random.Random(args.seed).shuffle(processed_records)

    split_idx = int(len(processed_records) * 0.95)
    splits = {
        "train_raw.jsonl": processed_records[:split_idx],
        "dev_raw.jsonl": processed_records[split_idx:],
    }

    # Write input files for the tokenization script
    for name, records in splits.items():
        out_path = out_root / name
        _write_jsonl(out_path, records)
        print(f"Created {out_path} ({len(records)} samples)")

    train_path = out_root / "train_raw.jsonl"
    print("\nPreparation Complete!")
    print(f"Skipped {skipped} samples (> {MAX_DURATION}s)")
    if missing_audio:
        print(f"Skipped {missing_audio} samples (no audio bytes)")
    # Point at the file that was actually written (it lives under <out>/data).
    print(f"Next: Run the 'extract_audio_tokens.py' script using '{train_path}'")
|
| 88 |
+
|
| 89 |
+
if __name__ == "__main__":
|
| 90 |
+
main()
|
pyproject.toml
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "omnivoice"
|
| 7 |
+
version = "0.1.5"
|
| 8 |
+
description = "OmniVoice: Towards Omnilingual Zero-Shot Text-to-Speech with Diffusion Language Models"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
license = "Apache-2.0"
|
| 11 |
+
requires-python = ">=3.10"
|
| 12 |
+
authors = [{name = "Han Zhu"}]
|
| 13 |
+
keywords = [
|
| 14 |
+
"tts",
|
| 15 |
+
"text-to-speech",
|
| 16 |
+
"speech-synthesis",
|
| 17 |
+
"zero-shot",
|
| 18 |
+
"multilingual",
|
| 19 |
+
"diffusion",
|
| 20 |
+
"voice-cloning",
|
| 21 |
+
]
|
| 22 |
+
classifiers = [
|
| 23 |
+
"Intended Audience :: Science/Research",
|
| 24 |
+
"Intended Audience :: Developers",
|
| 25 |
+
|
| 26 |
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
| 27 |
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
| 28 |
+
|
| 29 |
+
"Operating System :: OS Independent",
|
| 30 |
+
"Programming Language :: Python :: 3",
|
| 31 |
+
]
|
| 32 |
+
dependencies = [
|
| 33 |
+
"torch>=2.4",
|
| 34 |
+
"torchaudio>=2.4",
|
| 35 |
+
"transformers>=5.3.0",
|
| 36 |
+
"accelerate",
|
| 37 |
+
"pydub",
|
| 38 |
+
"gradio",
|
| 39 |
+
"tensorboardX",
|
| 40 |
+
"webdataset",
|
| 41 |
+
"numpy",
|
| 42 |
+
"soundfile",
|
| 43 |
+
"librosa",
|
| 44 |
+
"uvicorn>=0.42.0",
|
| 45 |
+
"fastapi>=0.135.2",
|
| 46 |
+
]
|
| 47 |
+
|
| 48 |
+
[project.optional-dependencies]
|
| 49 |
+
|
| 50 |
+
eval = [
|
| 51 |
+
"jiwer==3.1.0", # WER
|
| 52 |
+
"s3prl", # Speech representation (HuBERT etc.)
|
| 53 |
+
"funasr", # ASR models
|
| 54 |
+
"zhconv", # Chinese character normalization
|
| 55 |
+
"zhon", # Chinese punctuation
|
| 56 |
+
"unidecode", # Unicode normalization
|
| 57 |
+
]
|
| 58 |
+
|
| 59 |
+
[project.scripts]
|
| 60 |
+
omnivoice-infer = "omnivoice.cli.infer:main"
|
| 61 |
+
omnivoice-infer-batch = "omnivoice.cli.infer_batch:main"
|
| 62 |
+
omnivoice-demo = "omnivoice.cli.demo:main"
|
| 63 |
+
|
| 64 |
+
[project.urls]
|
| 65 |
+
Homepage = "https://github.com/k2-fsa/OmniVoice"
|
| 66 |
+
Repository = "https://github.com/k2-fsa/OmniVoice"
|
| 67 |
+
"Bug Tracker" = "https://github.com/k2-fsa/OmniVoice/issues"
|
| 68 |
+
|
| 69 |
+
[tool.uv.sources]
|
| 70 |
+
# Install PyTorch with CUDA support on Linux/Windows (CUDA doesn't exist for Mac).
|
| 71 |
+
# NOTE: We must explicitly request them as `dependencies` above. These improved
|
| 72 |
+
# versions will not be selected if they're only third-party dependencies.
|
| 73 |
+
torch = [
|
| 74 |
+
{ index = "pytorch-cuda", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
|
| 75 |
+
]
|
| 76 |
+
torchaudio = [
|
| 77 |
+
{ index = "pytorch-cuda", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
|
| 78 |
+
]
|
| 79 |
+
|
| 80 |
+
[[tool.uv.index]]
|
| 81 |
+
name = "pytorch-cuda"
|
| 82 |
+
# Use PyTorch built for NVIDIA Toolkit version 12.8.
|
| 83 |
+
# Available versions: https://pytorch.org/get-started/locally/
|
| 84 |
+
url = "https://download.pytorch.org/whl/cu128"
|
| 85 |
+
# Only use this index when explicitly requested by `tool.uv.sources`.
|
| 86 |
+
explicit = true
|
| 87 |
+
|
| 88 |
+
[tool.uv]
|
| 89 |
+
constraint-dependencies = [
|
| 90 |
+
"torch==2.8.0",
|
| 91 |
+
"torchaudio==2.8.0",
|
| 92 |
+
]
|
| 93 |
+
|
| 94 |
+
[tool.hatch.build.targets.sdist]
|
| 95 |
+
include = ["omnivoice"]
|
| 96 |
+
|
| 97 |
+
[tool.hatch.build.targets.wheel]
|
| 98 |
+
packages = ["omnivoice"]
|
ref_audio/women_ref_1.mp3
ADDED
|
Binary file (79 kB). View file
|
|
|
upload_to_hf.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from huggingface_hub import HfApi, create_repo
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# Upload the local OmniVoice working tree (code, data, checkpoints) to a
# Hugging Face model repo, skipping VCS/venv/cache files and generated audio.

# =========================
# CONFIG
# =========================
repo_name = "OmniVoice_sync_data_and_code"
username = "TTS-ORG"

local_path = os.path.expanduser("~/OmniVoice")

repo_id = f"{username}/{repo_name}"

# =========================
# INIT
# =========================
api = HfApi()

# =========================
# CREATE REPO
# =========================
# exist_ok=True makes re-running the script safe when the repo already exists.
create_repo(
    repo_id=repo_id,
    repo_type="model",
    exist_ok=True,
)

print(f"Repo ready: {repo_id}")

# =========================
# UPLOAD LARGE FOLDER
# =========================
api.upload_large_folder(
    folder_path=local_path,
    repo_id=repo_id,
    repo_type="model",

    # VERY IMPORTANT
    ignore_patterns=[
        ".git/*",
        ".venv/*",
        "__pycache__/*",
        "*.pyc",

        # huge optimizer states
        "*.bin",

        # cache/temp
        "*.tmp",
        "*.log",

        # optional: generated inference output. infer.py writes "out_1.wav",
        # which the original "out.wav" pattern alone did not cover.
        "out.wav",
        "out_*.wav",
    ],
)

print("Upload completed successfully 🚀")
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|