diff --git a/omnivoice/cli/__init__.py b/omnivoice/cli/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/omnivoice/cli/demo.py b/omnivoice/cli/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..f45ac775ed4a984282d64e72334a8e6d6504e520
--- /dev/null
+++ b/omnivoice/cli/demo.py
@@ -0,0 +1,548 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Gradio demo for OmniVoice.
+
+Supports voice cloning and voice design.
+
+Usage:
+ omnivoice-demo --model /path/to/checkpoint --port 8000
+"""
+
+import argparse
+import logging
+from typing import Any, Dict
+
+import gradio as gr
+import numpy as np
+import torch
+
+from omnivoice import OmniVoice, OmniVoiceGenerationConfig
+from omnivoice.utils.lang_map import LANG_NAMES, lang_display_name
+
+
+def get_best_device():
+    """Pick the torch device name: "cuda" if usable, else "mps", else "cpu"."""
+    if torch.cuda.is_available():
+        return "cuda"
+    # MPS is only probed when CUDA is unavailable.
+    has_mps = torch.backends.mps.is_available()
+    return "mps" if has_mps else "cpu"
+
+
+# ---------------------------------------------------------------------------
+# Language dropdown choices: "Auto" first, then all display names, sorted
+# ---------------------------------------------------------------------------
+_ALL_LANGUAGES = ["Auto"] + sorted(lang_display_name(n) for n in LANG_NAMES)
+
+
+# ---------------------------------------------------------------------------
+# Voice Design instruction templates
+# ---------------------------------------------------------------------------
+# Each option is displayed as "English / 中文". build_demo() sends the English
+# half to the model, except for "... Dialect" options (Chinese half).
+_CATEGORIES = {
+    "Gender / 性别": ["Male / 男", "Female / 女"],
+    "Age / 年龄": [
+        "Child / 儿童",
+        "Teenager / 少年",
+        "Young Adult / 青年",
+        "Middle-aged / 中年",
+        "Elderly / 老年",
+    ],
+    "Pitch / 音调": [
+        "Very Low Pitch / 极低音调",
+        "Low Pitch / 低音调",
+        "Moderate Pitch / 中音调",
+        "High Pitch / 高音调",
+        "Very High Pitch / 极高音调",
+    ],
+    "Style / 风格": ["Whisper / 耳语"],
+    "English Accent / 英文口音": [
+        "American Accent / 美式口音",
+        "Australian Accent / 澳大利亚口音",
+        "British Accent / 英国口音",
+        "Chinese Accent / 中国口音",
+        "Canadian Accent / 加拿大口音",
+        "Indian Accent / 印度口音",
+        "Korean Accent / 韩国口音",
+        "Portuguese Accent / 葡萄牙口音",
+        "Russian Accent / 俄罗斯口音",
+        "Japanese Accent / 日本口音",
+    ],
+    "Chinese Dialect / 中文方言": [
+        "Henan Dialect / 河南话",
+        "Shaanxi Dialect / 陕西话",
+        "Sichuan Dialect / 四川话",
+        "Guizhou Dialect / 贵州话",
+        "Yunnan Dialect / 云南话",
+        "Guilin Dialect / 桂林话",
+        "Jinan Dialect / 济南话",
+        "Shijiazhuang Dialect / 石家庄话",
+        "Gansu Dialect / 甘肃话",
+        "Ningxia Dialect / 宁夏话",
+        "Qingdao Dialect / 青岛话",
+        "Northeast Dialect / 东北话",
+    ],
+}
+
+_ATTR_INFO = {  # category label -> helper text passed as the dropdown's `info`
+    "English Accent / 英文口音": "Only effective for English speech.",
+    "Chinese Dialect / 中文方言": "Only effective for Chinese speech.",
+}
+
+# ---------------------------------------------------------------------------
+# Argument parser
+# ---------------------------------------------------------------------------
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Create the argument parser for the ``omnivoice-demo`` command."""
+    cli = argparse.ArgumentParser(
+        prog="omnivoice-demo",
+        description="Launch a Gradio demo for OmniVoice.",
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    # Bound-method alias keeps the option table below compact.
+    add = cli.add_argument
+
+    # Model and device selection.
+    add(
+        "--model",
+        default="k2-fsa/OmniVoice",
+        help="Model checkpoint path or HuggingFace repo id.",
+    )
+    add("--device", default=None, help="Device to use. Auto-detected if not specified.")
+
+    # Server settings.
+    add("--ip", default="0.0.0.0", help="Server IP (default: 0.0.0.0).")
+    add("--port", type=int, default=7860, help="Server port (default: 7860).")
+    add("--root-path", default=None, help="Root path for reverse proxy.")
+    add("--share", action="store_true", default=False, help="Create public link.")
+
+    # Optional Whisper ASR model.
+    add(
+        "--no-asr",
+        action="store_true",
+        default=False,
+        help="Skip loading Whisper ASR model. Reference text auto-transcription"
+        " will be unavailable.",
+    )
+    add(
+        "--asr-model",
+        default="openai/whisper-large-v3-turbo",
+        help="ASR model path or HuggingFace repo id"
+        " (default: openai/whisper-large-v3-turbo).",
+    )
+
+    return cli
+
+
+# ---------------------------------------------------------------------------
+# Build demo
+# ---------------------------------------------------------------------------
+
+
+def build_demo(
+    model: OmniVoice,
+    checkpoint: str,  # currently unused inside this function
+    generate_fn=None,
+) -> gr.Blocks:
+    """Build the Gradio UI (Voice Clone / Voice Design tabs) for ``model``."""
+    sampling_rate = model.sampling_rate
+
+    # -- shared generation core: returns (audio_or_None, status_message) --
+    def _gen_core(
+        text,
+        language,
+        ref_audio,
+        instruct,
+        num_step,
+        guidance_scale,
+        denoise,
+        speed,
+        duration,
+        preprocess_prompt,
+        postprocess_output,
+        mode,
+        ref_text=None,
+    ):
+        if not text or not text.strip():
+            return None, "Please enter the text to synthesize."
+
+        gen_config = OmniVoiceGenerationConfig(
+            num_step=int(num_step or 32),
+            guidance_scale=float(guidance_scale) if guidance_scale is not None else 2.0,
+            denoise=bool(denoise) if denoise is not None else True,
+            preprocess_prompt=bool(preprocess_prompt),
+            postprocess_output=bool(postprocess_output),
+        )
+
+        lang = language if (language and language != "Auto") else None
+
+        kw: Dict[str, Any] = dict(
+            text=text.strip(), language=lang, generation_config=gen_config
+        )
+
+        if speed is not None and float(speed) != 1.0:
+            kw["speed"] = float(speed)
+        if duration is not None and float(duration) > 0:
+            kw["duration"] = float(duration)
+
+        if mode == "clone" and not ref_audio:
+            return None, "Please upload a reference audio."
+        if instruct and instruct.strip():
+            kw["instruct"] = instruct.strip()
+
+        try:
+            if mode == "clone":
+                # Inside the try so prompt errors also reach the status box.
+                kw["voice_clone_prompt"] = model.create_voice_clone_prompt(
+                    ref_audio=ref_audio, ref_text=ref_text
+                )
+            audio = model.generate(**kw)
+        except Exception as e:
+            return None, f"Error: {type(e).__name__}: {e}"
+
+        # Clip first: samples outside [-1, 1] would wrap around in int16.
+        waveform = (np.clip(audio[0], -1.0, 1.0) * 32767).astype(np.int16)
+        return (sampling_rate, waveform), "Done."
+
+    # Allow external wrappers (e.g. spaces.GPU for ZeroGPU Spaces)
+    _gen = generate_fn if generate_fn is not None else _gen_core
+
+    # =====================================================================
+    # UI
+    # =====================================================================
+    theme = gr.themes.Soft(
+        font=["Inter", "Arial", "sans-serif"],
+    )
+    css = """
+    .gradio-container {max-width: 100% !important; font-size: 16px !important;}
+    .gradio-container h1 {font-size: 1.5em !important;}
+    .gradio-container .prose {font-size: 1.1em !important;}
+    .compact-audio audio {height: 60px !important;}
+    .compact-audio .waveform {min-height: 80px !important;}
+    """
+
+    # Reusable: language dropdown component
+    def _lang_dropdown(label="Language (optional) / 语种 (可选)", value="Auto"):
+        return gr.Dropdown(
+            label=label,
+            choices=_ALL_LANGUAGES,
+            value=value,
+            allow_custom_value=False,
+            interactive=True,
+            info="Keep as Auto to auto-detect the language.",
+        )
+
+    # Reusable: optional generation settings accordion
+    def _gen_settings():
+        with gr.Accordion("Generation Settings (optional)", open=False):
+            sp = gr.Slider(
+                0.5,
+                1.5,
+                value=1.0,
+                step=0.05,
+                label="Speed",
+                info="1.0 = normal. >1 faster, <1 slower. Ignored if Duration is set.",
+            )
+            du = gr.Number(
+                value=None,
+                label="Duration (seconds)",
+                info=(
+                    "Leave empty to use speed."
+                    " Set a fixed duration to override speed."
+                ),
+            )
+            ns = gr.Slider(
+                4,
+                64,
+                value=32,
+                step=1,
+                label="Inference Steps",
+                info="Default: 32. Lower = faster, higher = better quality.",
+            )
+            dn = gr.Checkbox(
+                label="Denoise",
+                value=True,
+                info="Default: enabled. Uncheck to disable denoising.",
+            )
+            gs = gr.Slider(
+                0.0,
+                4.0,
+                value=2.0,
+                step=0.1,
+                label="Guidance Scale (CFG)",
+                info="Default: 2.0.",
+            )
+            pp = gr.Checkbox(
+                label="Preprocess Prompt",
+                value=True,
+                info="apply silence removal and trimming to the reference "
+                "audio, add punctuation in the end of reference text (if not already)",
+            )
+            po = gr.Checkbox(
+                label="Postprocess Output",
+                value=True,
+                info="Remove long silences from generated audio.",
+            )
+        return ns, gs, dn, sp, du, pp, po
+
+    with gr.Blocks(theme=theme, css=css, title="OmniVoice Demo") as demo:
+        gr.Markdown(
+            """
+# OmniVoice Demo
+
+State-of-the-art text-to-speech model for **600+ languages**, supporting:
+
+- **Voice Clone** — Clone any voice from a reference audio
+- **Voice Design** — Create custom voices with speaker attributes
+
+Built with [OmniVoice](https://github.com/k2-fsa/OmniVoice)
+by Xiaomi AI Lab Next-gen Kaldi team.
+"""
+        )
+
+        with gr.Tabs():
+            # ==============================================================
+            # Voice Clone
+            # ==============================================================
+            with gr.TabItem("Voice Clone"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        vc_text = gr.Textbox(
+                            label="Text to Synthesize / 待合成文本",
+                            lines=4,
+                            placeholder="Enter the text you want to synthesize...",
+                        )
+                        vc_ref_audio = gr.Audio(
+                            label="Reference Audio / 参考音频",
+                            type="filepath",
+                            elem_classes="compact-audio",
+                        )
+                        gr.Markdown(
+                            ""
+                            "Recommended: 3–10 seconds audio. "
+                            ""
+                        )
+                        vc_ref_text = gr.Textbox(
+                            label=("Reference Text (optional)" " / 参考音频文本(可选)"),
+                            lines=2,
+                            placeholder="Transcript of the reference audio. Leave empty"
+                            " to auto-transcribe via ASR models.",
+                        )
+                        vc_lang = _lang_dropdown("Language (optional) / 语种 (可选)")
+                        with gr.Accordion("Instruct (optional)", open=False):
+                            vc_instruct = gr.Textbox(label="Instruct", lines=2)
+                        (
+                            vc_ns,
+                            vc_gs,
+                            vc_dn,
+                            vc_sp,
+                            vc_du,
+                            vc_pp,
+                            vc_po,
+                        ) = _gen_settings()
+                        vc_btn = gr.Button("Generate / 生成", variant="primary")
+                    with gr.Column(scale=1):
+                        vc_audio = gr.Audio(
+                            label="Output Audio / 合成结果",
+                            type="numpy",
+                        )
+                        vc_status = gr.Textbox(label="Status / 状态", lines=2)
+
+                def _clone_fn(
+                    text, lang, ref_aud, ref_text, instruct, ns, gs, dn, sp, du, pp, po
+                ):
+                    return _gen(
+                        text,
+                        lang,
+                        ref_aud,
+                        instruct,
+                        ns,
+                        gs,
+                        dn,
+                        sp,
+                        du,
+                        pp,
+                        po,
+                        mode="clone",
+                        ref_text=ref_text or None,
+                    )
+
+                vc_btn.click(
+                    _clone_fn,
+                    inputs=[
+                        vc_text,
+                        vc_lang,
+                        vc_ref_audio,
+                        vc_ref_text,
+                        vc_instruct,
+                        vc_ns,
+                        vc_gs,
+                        vc_dn,
+                        vc_sp,
+                        vc_du,
+                        vc_pp,
+                        vc_po,
+                    ],
+                    outputs=[vc_audio, vc_status],
+                )
+
+            # ==============================================================
+            # Voice Design
+            # ==============================================================
+            with gr.TabItem("Voice Design"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        vd_text = gr.Textbox(
+                            label="Text to Synthesize / 待合成文本",
+                            lines=4,
+                            placeholder="Enter the text you want to synthesize...",
+                        )
+                        vd_lang = _lang_dropdown()
+
+                        _AUTO = "Auto"
+                        vd_groups = []
+                        for _cat, _choices in _CATEGORIES.items():
+                            vd_groups.append(
+                                gr.Dropdown(
+                                    label=_cat,
+                                    choices=[_AUTO] + _choices,
+                                    value=_AUTO,
+                                    info=_ATTR_INFO.get(_cat),
+                                )
+                            )
+
+                        (
+                            vd_ns,
+                            vd_gs,
+                            vd_dn,
+                            vd_sp,
+                            vd_du,
+                            vd_pp,
+                            vd_po,
+                        ) = _gen_settings()
+                        vd_btn = gr.Button("Generate / 生成", variant="primary")
+                    with gr.Column(scale=1):
+                        vd_audio = gr.Audio(
+                            label="Output Audio / 合成结果",
+                            type="numpy",
+                        )
+                        vd_status = gr.Textbox(label="Status / 状态", lines=2)
+
+                def _build_instruct(groups):
+                    """Extract instruct text from UI dropdowns.
+
+                    Language unification and validation is handled by
+                    _resolve_instruct inside _preprocess_all.
+                    """
+                    selected = [g for g in groups if g and g != "Auto"]
+                    if not selected:
+                        return None
+                    parts = []
+                    for v in selected:
+                        if " / " in v:
+                            en, zh = v.split(" / ", 1)
+                            # Dialects have no English equivalent
+                            if "Dialect" in en:
+                                parts.append(zh.strip())
+                            else:
+                                parts.append(en.strip())
+                        else:
+                            parts.append(v)
+                    return ", ".join(parts)
+
+                def _design_fn(text, lang, ns, gs, dn, sp, du, pp, po, *groups):
+                    return _gen(
+                        text,
+                        lang,
+                        None,
+                        _build_instruct(groups),
+                        ns,
+                        gs,
+                        dn,
+                        sp,
+                        du,
+                        pp,
+                        po,
+                        mode="design",
+                    )
+
+                vd_btn.click(
+                    _design_fn,
+                    inputs=[
+                        vd_text,
+                        vd_lang,
+                        vd_ns,
+                        vd_gs,
+                        vd_dn,
+                        vd_sp,
+                        vd_du,
+                        vd_pp,
+                        vd_po,
+                    ]
+                    + vd_groups,
+                    outputs=[vd_audio, vd_status],
+                )
+
+    return demo
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
+def main(argv=None) -> int:
+    """Parse CLI arguments, load the model and serve the Gradio demo; return 0."""
+    log_format = "%(asctime)s %(name)s %(levelname)s: %(message)s"
+    logging.basicConfig(level=logging.INFO, format=log_format)
+
+    cli = build_parser()
+    opts = cli.parse_args(argv)
+    device = opts.device if opts.device else get_best_device()
+
+    checkpoint = opts.model
+    if not checkpoint:
+        # An empty --model value leaves nothing to load: show usage and exit.
+        cli.print_help()
+        return 0
+
+    logging.info(f"Loading model from {checkpoint}, device={device} ...")
+    load_options = dict(
+        device_map=device,
+        dtype=torch.float16,
+        load_asr=not opts.no_asr,
+        asr_model_name=opts.asr_model,
+    )
+    model = OmniVoice.from_pretrained(checkpoint, **load_options)
+    print("Model loaded.")
+
+    app = build_demo(model, checkpoint).queue()
+    app.launch(
+        server_name=opts.ip,
+        server_port=opts.port,
+        share=opts.share,
+        root_path=opts.root_path,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the exit status
diff --git a/omnivoice/cli/infer.py b/omnivoice/cli/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..32447b91527a6a247b904b312ca88f09a5908277
--- /dev/null
+++ b/omnivoice/cli/infer.py
@@ -0,0 +1,158 @@
+"""Single-item inference CLI for OmniVoice.
+
+Generates audio from a single text input using voice cloning,
+voice design, or auto voice.
+
+Usage:
+ # Voice cloning
+ omnivoice-infer --model k2-fsa/OmniVoice \
+ --text "Hello, this is a text for text-to-speech." \
+ --ref_audio ref.wav --ref_text "Reference transcript." --output out.wav
+
+ # Voice design
+ omnivoice-infer --model k2-fsa/OmniVoice \
+ --text "Hello, this is a text for text-to-speech." \
+ --instruct "male, British accent" --output out.wav
+
+ # Auto voice
+ omnivoice-infer --model k2-fsa/OmniVoice \
+ --text "Hello, this is a text for text-to-speech." --output out.wav
+"""
+
+import argparse
+import logging
+
+import torch
+
+import soundfile as sf
+
+from omnivoice.models.omnivoice import OmniVoice
+from omnivoice.utils.common import str2bool
+
+
+def get_best_device():
+    """Pick the torch device name: "cuda" if usable, else "mps", else "cpu"."""
+    if torch.cuda.is_available():
+        return "cuda"
+    # MPS is only probed when CUDA is unavailable.
+    has_mps = torch.backends.mps.is_available()
+    return "mps" if has_mps else "cpu"
+
+
+def get_parser() -> argparse.ArgumentParser:
+    """Create the argument parser for single-item OmniVoice inference.
+
+    Only ``--text`` and ``--output`` are required; every other option has a
+    default.
+    """
+    cli = argparse.ArgumentParser(
+        description="OmniVoice single-item inference",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    add = cli.add_argument
+
+    # Model and required inputs
+    add(
+        "--model",
+        type=str,
+        default="k2-fsa/OmniVoice",
+        help="Model checkpoint path or HuggingFace repo id.",
+    )
+    add("--text", type=str, required=True, help="Text to synthesize.")
+    add("--output", type=str, required=True, help="Output WAV file path.")
+
+    # Voice cloning
+    add(
+        "--ref_audio",
+        type=str,
+        default=None,
+        help="Reference audio file path for voice cloning.",
+    )
+    add(
+        "--ref_text",
+        type=str,
+        default=None,
+        help="Reference text describing the reference audio.",
+    )
+
+    # Voice design
+    add(
+        "--instruct",
+        type=str,
+        default=None,
+        help="Style instruction for voice design mode.",
+    )
+    add(
+        "--language",
+        type=str,
+        default=None,
+        help="Language name (e.g. 'English') or code (e.g. 'en').",
+    )
+
+    # Generation parameters
+    add("--num_step", type=int, default=32)
+    add("--guidance_scale", type=float, default=2.0)
+    add("--speed", type=float, default=1.0)
+    add(
+        "--duration",
+        type=float,
+        default=None,
+        help="Fixed output duration in seconds. If set, overrides the "
+        "model's duration estimation. The speed factor is automatically "
+        "adjusted to match while preserving language-aware pacing.",
+    )
+    add("--t_shift", type=float, default=0.1)
+    add("--denoise", type=str2bool, default=True)
+    add("--postprocess_output", type=str2bool, default=True)
+    add("--layer_penalty_factor", type=float, default=5.0)
+    add("--position_temperature", type=float, default=5.0)
+    add("--class_temperature", type=float, default=0.0)
+
+    # Device selection
+    add(
+        "--device",
+        type=str,
+        default=None,
+        help="Device to use for inference. Auto-detected if not specified.",
+    )
+
+    return cli
+
+
+def main():
+    """Generate audio for ``--text`` and save it to ``--output``."""
+    import os  # function-scope import: only used for the directory setup
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
+    args = get_parser().parse_args()
+    # Create the output folder up front so sf.write() cannot fail on a missing one.
+    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
+    device = args.device or get_best_device()
+    logging.info(f"Loading model from {args.model} on {device} ...")
+    model = OmniVoice.from_pretrained(
+        args.model, device_map=device, dtype=torch.float16
+    )
+    logging.info(f"Generating audio for: {args.text[:80]}...")
+    audios = model.generate(
+        text=args.text,
+        language=args.language,
+        ref_audio=args.ref_audio,
+        ref_text=args.ref_text,
+        instruct=args.instruct,
+        duration=args.duration,
+        num_step=args.num_step,
+        guidance_scale=args.guidance_scale,
+        speed=args.speed,
+        t_shift=args.t_shift,
+        denoise=args.denoise,
+        postprocess_output=args.postprocess_output,
+        layer_penalty_factor=args.layer_penalty_factor,
+        position_temperature=args.position_temperature,
+        class_temperature=args.class_temperature,
+    )
+    sf.write(args.output, audios[0], model.sampling_rate)
+    logging.info(f"Saved to {args.output}")
+
+
+if __name__ == "__main__":
+    main()  # run the CLI only when executed as a script, not on import
diff --git a/omnivoice/cli/infer_batch.py b/omnivoice/cli/infer_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b24912bbb538b24429fb78c0cc191bd7010d429
--- /dev/null
+++ b/omnivoice/cli/infer_batch.py
@@ -0,0 +1,545 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Batch inference CLI for OmniVoice.
+
+Distributes TTS generation across multiple GPUs for large-scale tasks.
+Reads a JSONL test list, generates audio in parallel, and saves results.
+
+Usage:
+ omnivoice-infer-batch --model k2-fsa/OmniVoice \
+ --test_list test.jsonl --res_dir results/
+
+Test list format (JSONL, one JSON object per line):
+ Required fields: "id", "text"
+ Voice cloning: "ref_audio", "ref_text"
+ Voice design: "instruct"
+ Optional: "language_id", "duration", "speed"
+"""
+
+import argparse
+import logging
+import multiprocessing as mp
+import os
+import signal
+import time
+import traceback
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from typing import List, Optional, Tuple
+
+import torch
+from tqdm import tqdm
+
+from omnivoice.models.omnivoice import OmniVoice
+import soundfile as sf
+
+from omnivoice.utils.audio import load_audio
+from omnivoice.utils.common import str2bool
+from omnivoice.utils.data_utils import read_test_list
+from omnivoice.utils.duration import RuleDurationEstimator
+
+
+def get_best_device():
+    """Return ``(device_type, device_count)``, preferring CUDA, then MPS, then CPU."""
+    if torch.cuda.is_available():
+        return "cuda", torch.cuda.device_count()
+    # MPS and CPU are each reported as a single device.
+    fallback = "mps" if torch.backends.mps.is_available() else "cpu"
+    return fallback, 1
+
+
+worker_model = None  # per-process model instance, assigned by process_init()
+SAMPLING_RATE = 24000  # samples per second used for reference/warmup audio
+
+
+def get_parser():
+    """Create the argument parser for the batch inference CLI."""
+    cli = argparse.ArgumentParser(description="Infer OmniVoice Model")
+    add = cli.add_argument
+    add(
+        "--model",
+        type=str,
+        default="k2-fsa/OmniVoice",
+        help="Path to the model checkpoint (local dir or HF repo id). "
+        "Audio tokenizer is expected at /audio_tokenizer/.",
+    )
+    add(
+        "--test_list",
+        type=str,
+        required=True,
+        help="Path to the JSONL file containing test samples. "
+        "Each line is a JSON object with the following fields: "
+        '"id" (str, required): unique name for the output file; '
+        '"text" (str, required): text to synthesize; '
+        '"ref_audio" (str): path to reference audio for voice cloning; '
+        '"ref_text" (str): transcript of the reference audio; '
+        '"instruct" (str): instruction for voice design (used when ref_audio is absent); '
+        '"language_id" (str): language code, e.g. "en"; '
+        '"duration" (float): target duration in seconds; '
+        '"speed" (float): speaking speed multiplier. '
+        "Only id and text are required; all other fields are optional.",
+    )
+    add(
+        "--res_dir",
+        type=str,
+        required=True,
+        help="Directory to save the generated audio files.",
+    )
+    add(
+        "--num_step",
+        type=int,
+        default=32,
+        help="Number of steps for iterative decoding.",
+    )
+    add(
+        "--guidance_scale",
+        type=float,
+        default=2.0,
+        help="Scale for Classifier-Free Guidance.",
+    )
+    add(
+        "--t_shift",
+        type=float,
+        default=0.1,
+        help="Shift t to smaller ones if t_shift < 1.0",
+    )
+    add(
+        "--nj_per_gpu",
+        type=int,
+        default=1,
+        help="Number of worker processes to spawn per GPU.",
+    )
+    add(
+        "--audio_chunk_duration",
+        type=float,
+        default=15.0,
+        help="Maximum duration of audio chunk (in seconds) for splitting. "
+        '"Not split" if <= 0.',
+    )
+    add(
+        "--audio_chunk_threshold",
+        type=float,
+        default=30.0,
+        help="The duration threshold (in seconds) to decide"
+        " whether to split audio into chunks.",
+    )
+    add(
+        "--batch_duration",
+        type=float,
+        default=1000.0,
+        help="Maximum total duration (reference + generated) per batch (seconds).",
+    )
+    add(
+        "--batch_size",
+        type=int,
+        default=0,
+        help="Fixed batch size (number of samples per batch). "
+        "If > 0, use fixed-size batching instead of duration-based batching.",
+    )
+    add(
+        "--warmup",
+        type=int,
+        default=0,
+        help="Number of dummy inference runs per worker before real inference "
+        "starts, to warm up CUDA kernels and caches.",
+    )
+    add(
+        "--preprocess_prompt",
+        type=str2bool,
+        default=True,
+        help="Whether to preprocess reference audio (silence removal, trimming). "
+        "Set to False to keep raw audio.",
+    )
+    add(
+        "--postprocess_output",
+        type=str2bool,
+        default=True,
+        help="Whether to post-process generated audio (remove silence).",
+    )
+    add(
+        "--layer_penalty_factor",
+        type=float,
+        default=5.0,
+        help="The penalty factor for layer-wise sampling.",
+    )
+    add(
+        "--position_temperature",
+        type=float,
+        default=5.0,
+        help="The temperature for position selection.",
+    )
+    add(
+        "--class_temperature",
+        type=float,
+        default=0.0,
+        help="The temperature for class token sampling.",
+    )
+    add(
+        "--denoise",
+        type=str2bool,
+        default=True,
+        help="Whether to add <|denoise|> token in the reference.",
+    )
+    add(
+        "--lang_id",
+        type=str,
+        default=None,
+        help="Language id to use when test_list JSONL entries do not contain "
+        "a language_id field.",
+    )
+    return cli
+
+
+def process_init(rank_queue, model_checkpoint, warmup=0):
+    """Set up one worker process: logging, device binding and model load.
+
+    Intended as a process-pool ``initializer``. The loaded model is stored in
+    the module-level ``worker_model`` so that later tasks can reuse it.
+
+    Args:
+        rank_queue: queue yielding one ``(device_type, device_id)`` pair.
+        model_checkpoint: checkpoint passed to ``OmniVoice.from_pretrained``.
+        warmup: number of throw-away ``generate()`` calls run after loading.
+    """
+    global worker_model
+
+    # Cap the intra-op and inter-op CPU thread pools of this worker.
+    torch.set_num_threads(2)
+    torch.set_num_interop_threads(2)
+
+    log_format = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] "
+        "[Worker %(process)d] %(message)s"
+    )
+    logging.basicConfig(format=log_format, level=logging.INFO, force=True)
+
+    device_type, device_id = rank_queue.get()
+    # Only CUDA devices are addressed by index; "cpu" and "mps" are used as-is.
+    if device_type in ("cpu", "mps"):
+        worker_device = device_type
+    else:
+        worker_device = f"cuda:{device_id}"
+
+    logging.info(f"Initializing worker on device: {worker_device}")
+
+    worker_model = OmniVoice.from_pretrained(
+        model_checkpoint, device_map=worker_device, dtype=torch.float16
+    )
+
+    if warmup > 0:
+        logging.info(f"Running {warmup} warmup iterations on {worker_device}")
+        # One second of random noise, paired with its sampling rate.
+        dummy_ref_audio = (torch.randn(1, SAMPLING_RATE), SAMPLING_RATE)
+        for _ in range(warmup):
+            worker_model.generate(
+                text=["hello"],
+                language=["en"],
+                ref_audio=[dummy_ref_audio],
+                ref_text=["hello"],
+            )
+        logging.info(f"Warmup complete on {worker_device}")
+
+    logging.info(f"Worker on {worker_device} initialized successfully.")
+
+
+def estimate_sample_total_duration(
+    duration_estimator: RuleDurationEstimator,
+    text: str,
+    ref_text: Optional[str],
+    ref_audio_path: Optional[str],
+    gen_duration: Optional[float] = None,
+) -> float:
+    """Return reference duration plus (given or estimated) generated duration.
+
+    Without ``ref_audio_path`` (instruct / voice-design mode) the reference part
+    is 0 and the estimate uses a fixed placeholder prompt. An explicit
+    ``gen_duration`` skips estimation.
+    """
+    has_reference = ref_audio_path is not None
+    ref_duration = 0
+    if has_reference:
+        # Reference length in seconds = number of samples / sampling rate.
+        ref_wav = load_audio(ref_audio_path, SAMPLING_RATE)
+        ref_duration = ref_wav.shape[-1] / SAMPLING_RATE
+
+    if gen_duration is not None:
+        return ref_duration + gen_duration
+
+    if has_reference:
+        prompt_text, prompt_duration = ref_text or "", ref_duration
+    else:
+        prompt_text, prompt_duration = "Nice to meet you.", 0.5
+    estimated = duration_estimator.estimate_duration(
+        text, prompt_text, prompt_duration, low_threshold=2.0
+    )
+    return ref_duration + estimated
+
+
+def _sort_samples_by_duration(
+    samples: List[Tuple],
+    duration_estimator: RuleDurationEstimator,
+) -> List[Tuple[Tuple, float]]:
+    """Pair each sample with its estimated total duration, longest first."""
+
+    def _total(sample: Tuple) -> float:
+        _, ref_text, ref_audio_path, text, _, dur, _, _ = sample
+        return estimate_sample_total_duration(
+            duration_estimator, text, ref_text, ref_audio_path, gen_duration=dur
+        )
+
+    paired = [(sample, _total(sample)) for sample in samples]
+    return sorted(paired, key=lambda item: item[1], reverse=True)
+
+
+def cluster_samples_by_duration(
+    samples: List[Tuple],
+    duration_estimator: RuleDurationEstimator,
+    batch_duration: float,
+) -> List[List[Tuple]]:
+    """Greedily pack samples, longest first, into batches of total <= batch_duration."""
+    batches = []
+    pending = []
+    pending_total = 0.0
+
+    for sample, duration in _sort_samples_by_duration(samples, duration_estimator):
+        if duration > batch_duration:
+            # Oversized sample: emitted alone, the open batch is left untouched.
+            batches.append([sample])
+        elif pending_total + duration <= batch_duration:
+            pending.append(sample)
+            pending_total += duration
+        else:
+            # Close the full batch and start a new one with this sample.
+            batches.append(pending)
+            pending = [sample]
+            pending_total = duration
+
+    if pending:
+        batches.append(pending)
+
+    logging.info(f"Clustered {len(samples)} samples into {len(batches)} batches")
+    return batches
+
+
+def cluster_samples_by_batch_size(
+    samples: List[Tuple],
+    duration_estimator: RuleDurationEstimator,
+    batch_size: int,
+) -> List[List[Tuple]]:
+    """Chunk samples, sorted by duration to limit padding, into fixed-size batches."""
+    ordered = [
+        sample for sample, _ in _sort_samples_by_duration(samples, duration_estimator)
+    ]
+    batches = []
+    for start in range(0, len(ordered), batch_size):
+        batches.append(ordered[start : start + batch_size])
+
+    logging.info(
+        f"Split {len(samples)} samples into {len(batches)} batches "
+        f"(fixed batch_size={batch_size}, sorted by duration)"
+    )
+    return batches
+
+
+def run_inference_batch(
+    batch_samples: List[Tuple],
+    res_dir: str,
+    **gen_kwargs,
+) -> List[Tuple]:
+    """Synthesize one batch with this worker's model and save each clip as WAV.
+
+    Args:
+        batch_samples: tuples of ``(save_name, ref_text, ref_audio_path, text,
+            lang_id, duration, speed, instruct)``.
+        res_dir: directory receiving the ``<save_name>.wav`` files.
+        **gen_kwargs: forwarded unchanged to ``worker_model.generate()``.
+
+    Returns:
+        One ``(save_name, synth_time, audio_duration, "success")`` tuple per
+        generated clip; ``synth_time`` is the batch wall time split evenly
+        across the samples of the batch.
+    """
+    global worker_model
+
+    def _column(index):
+        return [sample[index] for sample in batch_samples]
+
+    def _or_none(values):
+        # A column holding only None is passed to generate() as None.
+        return values if any(v is not None for v in values) else None
+
+    save_names, ref_texts, ref_audio_paths, texts = (_column(i) for i in range(4))
+    langs, durations, speeds, instructs = (_column(i) for i in range(4, 8))
+
+    # NOTE(review): main() forwards every CLI option through gen_kwargs, not
+    # only generation settings; confirm generate() accepts the extra keys.
+    started = time.time()
+    audios = worker_model.generate(
+        text=texts,
+        language=langs,
+        ref_audio=_or_none(ref_audio_paths),
+        ref_text=_or_none(ref_texts),
+        duration=_or_none(durations),
+        speed=_or_none(speeds),
+        instruct=_or_none(instructs),
+        **gen_kwargs,
+    )
+    batch_synth_time = time.time() - started
+
+    results = []
+    for save_name, audio in zip(save_names, audios):
+        save_path = os.path.join(res_dir, save_name + ".wav")
+        sf.write(save_path, audio, worker_model.sampling_rate)
+        audio_duration = audio.shape[-1] / worker_model.sampling_rate
+        # Every clip is credited an equal share of the batch wall time.
+        per_sample_time = batch_synth_time / len(batch_samples)
+        results.append((save_name, per_sample_time, audio_duration, "success"))
+
+    return results
+
+
+def main():
+    """Run batch inference over ``--test_list`` and log an RTF summary."""
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
+    mp.set_start_method("spawn", force=True)
+
+    args = get_parser().parse_args()
+    os.makedirs(args.res_dir, exist_ok=True)
+
+    device_type, num_devices = get_best_device()
+    if device_type == "cpu":
+        logging.warning(
+            "No GPU found. Falling back to CPU inference. This might be slow."
+        )
+
+    num_processes = num_devices * args.nj_per_gpu
+    logging.info(
+        f"Using {device_type} ({num_devices} device(s))."
+        f" Spawning {num_processes} worker processes."
+    )
+
+    manager = mp.Manager()
+    rank_queue = manager.Queue()
+    for rank in list(range(num_devices)) * args.nj_per_gpu:
+        rank_queue.put((device_type, rank))
+
+    samples_raw = read_test_list(args.test_list)
+    samples = []
+    for s in samples_raw:
+        # A per-sample "language_id" wins; --lang_id is only the fallback.
+        sample_lang = s.get("language_id")
+        lang_id = args.lang_id if sample_lang is None else sample_lang
+        sample = (
+            s["id"],
+            s.get("ref_text"),
+            s.get("ref_audio"),
+            s["text"],
+            lang_id,
+            s.get("duration"),
+            s.get("speed"),
+            s.get("instruct"),
+        )
+        samples.append(sample)
+
+    total_synthesis_time = []
+    total_audio_duration = []
+
+    try:
+        with ProcessPoolExecutor(
+            max_workers=num_processes,
+            initializer=process_init,
+            initargs=(rank_queue, args.model, args.warmup),
+        ) as executor:
+            futures = []
+
+            logging.info("Running batch inference")
+
+            # Split samples by mode (voice-clone vs non-voice-clone) before
+            # clustering so that each batch is homogeneous. Mixing ref_audio
+            # and non-ref_audio samples in the same batch would crash in
+            # generate() → create_voice_clone_prompt().
+            clone_samples = [s for s in samples if s[2] is not None]
+            other_samples = [s for s in samples if s[2] is None]
+
+            duration_estimator = RuleDurationEstimator()
+            batches = []
+            for subset in (clone_samples, other_samples):
+                if not subset:
+                    continue
+                if args.batch_size > 0:
+                    batches.extend(
+                        cluster_samples_by_batch_size(
+                            subset, duration_estimator, args.batch_size
+                        )
+                    )
+                else:
+                    batches.extend(
+                        cluster_samples_by_duration(
+                            subset, duration_estimator, args.batch_duration
+                        )
+                    )
+
+            args_dict = vars(args)
+
+            for batch in batches:
+                future = executor.submit(
+                    run_inference_batch, batch_samples=batch, **args_dict
+                )
+                futures.append(future)
+
+            for future in tqdm(
+                as_completed(futures), total=len(futures), desc="Processing samples"
+            ):
+                try:
+                    result = future.result()
+                    for s_name, synth_time, audio_dur, status in result:
+                        total_synthesis_time.append(synth_time)
+                        total_audio_duration.append(audio_dur)
+                        rtf = synth_time / audio_dur if audio_dur > 0 else float("inf")
+                        logging.debug(
+                            f"Processed {s_name}: Audio Duration={audio_dur:.2f}s, "
+                            f"Synthesis Time={synth_time:.2f}s, RTF={rtf:.4f}"
+                        )
+                except Exception as e:
+                    logging.error(f"Failed to process sample: {e}")
+                    logging.error(f"Detailed error: {traceback.format_exc()}")
+
+    except (Exception, KeyboardInterrupt) as e:
+        logging.critical(
+            f"An unrecoverable error occurred: {e}. Terminating all processes."
+        )
+        logging.error(f"--- DETAILED TRACEBACK ---\n{traceback.format_exc()}")
+        # SIGKILL the whole process group so that no worker is left running.
+        os.killpg(os.getpgid(os.getpid()), signal.SIGKILL)
+
+    total_synthesis_time = sum(total_synthesis_time)
+    total_audio_duration = sum(total_audio_duration)
+    logging.info("--- Summary ---")
+    logging.info(f"Total audio duration: {total_audio_duration:.2f}s")
+    logging.info(f"Total synthesis time: {total_synthesis_time:.2f}s")
+    if total_audio_duration > 0:
+        average_rtf = total_synthesis_time / total_audio_duration
+        logging.info(f"Average RTF: {average_rtf:.4f}")
+    else:
+        logging.warning("No speech was generated. RTF cannot be computed.")
+
+    logging.info("Done!")
+
+
+if __name__ == "__main__":
+    main()  # run the CLI only when executed as a script, not on import
diff --git a/omnivoice/cli/train.py b/omnivoice/cli/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..947746e16e8cb7a15040596618df7fbeb4f29c0b
--- /dev/null
+++ b/omnivoice/cli/train.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Training CLI for OmniVoice.
+
+Launches distributed training via HuggingFace Accelerate.
+Supports pre-training on Emilia data and finetuning on custom data.
+
+Usage:
+ accelerate launch --gpu_ids 0,1,2,3 --num_processes 4 \\
+ -m omnivoice.cli.train \\
+ --train_config train_config.json \\
+ --data_config data_config.json \\
+ --output_dir output/
+
+See examples/run_emilia.sh and examples/run_finetune.sh for full pipelines.
+"""
+
+import argparse
+
+from omnivoice.training.builder import build_dataloaders, build_model_and_tokenizer
+from omnivoice.training.config import TrainingConfig
+from omnivoice.training.trainer import OmniTrainer
+
+
def main():
    """Parse the command line, build the training components, and train.

    Three required options are read: ``--train_config`` (training config
    JSON), ``--output_dir`` (checkpoint directory) and ``--data_config``
    (data config JSON). The two paths given on the command line are written
    onto the loaded ``TrainingConfig`` before any component is built.
    """
    cli = argparse.ArgumentParser(description="OmniVoice Training Entry Point")
    # (flag, help text) in the order they should appear in ``--help``.
    required_options = (
        ("--train_config", "Path to config JSON"),
        ("--output_dir", "Where to save checkpoints"),
        ("--data_config", "Path to data config JSON"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)
    options = cli.parse_args()

    # 1. Load the configuration and attach the command-line paths to it.
    training_config = TrainingConfig.from_json(options.train_config)
    training_config.output_dir = options.output_dir
    training_config.data_config = options.data_config

    # 2. Build the model, tokenizer and data loaders from that configuration.
    model, tokenizer = build_model_and_tokenizer(training_config)
    train_loader, eval_loader = build_dataloaders(training_config, tokenizer)

    # 3. Hand everything to the trainer and start training.
    OmniTrainer(
        model=model,
        config=training_config,
        train_dataloader=train_loader,
        eval_dataloader=eval_loader,
        tokenizer=tokenizer,
    ).train()
+
+
# Script entry point: start training only when this module is run directly
# (e.g. ``python -m omnivoice.cli.train``), not when it is imported.
if __name__ == "__main__":
    main()
diff --git a/omnivoice/data/__init__.py b/omnivoice/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/omnivoice/data/batching.py b/omnivoice/data/batching.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f78cbf6abd1368f315c007db56544f988f8942e
--- /dev/null
+++ b/omnivoice/data/batching.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Batching strategies for streaming/iterable datasets.
+
+Provides length-based grouping and packing for efficient training with
+variable-length audio.
+
+Key classes:
+- ``PackingIterableDataset``: Packs multiple samples into fixed-length sequences
+ for training. Used by ``omnivoice.training.builder`` with flex_attention.
+- ``StreamLengthGroupDataset``: Groups samples by length into buckets. Used by
+ data processing scripts (e.g. ``omnivoice/scripts/``) and by
+ ``omnivoice.training.builder`` when ``attn_implementation != "flex_attention"``.
+"""
+
+import bisect
+import logging
+from typing import Any, Dict, Iterator, List, Optional
+
+import numpy as np
+
+from omnivoice.data.dataset import IterableDataReader, WrappedIterableDataset
+
+
class StreamLengthGroupDataset(WrappedIterableDataset):
    """Streams batches of similar-length samples built from length buckets.

    The range ``[min_length, max_length]`` is split into ``num_buckets``
    equal-width buckets. Each incoming sample is appended to the bucket that
    matches its length, and a bucket is emitted as one batch (a list of
    samples) as soon as it is considered full.

    Length defaults to the audio duration in seconds, computed from the last
    dimension of ``sample[audio_key]`` and ``dataset.sample_rate``. Pass
    ``length_fn`` to measure something else, e.g. ``lambda s: s["length"]``
    for processed training data; ``batch_duration``, ``min_length`` and
    ``max_length`` must then be expressed in that same unit.

    When ``processor`` is given, it is applied to every raw sample before the
    length is measured, so the yielded batches contain **processed** samples.
    This allows bucketing by post-processing token length (used in the SDPA
    training path).
    """

    def __init__(
        self,
        dataset: IterableDataReader,
        batch_duration: float,
        min_length: float = 0.5,
        max_length: float = 30.0,
        num_buckets: int = 20,
        audio_key: str = "audio",
        drop_last: bool = False,
        max_sample: Optional[int] = None,
        length_fn: Optional[Any] = None,
        processor: Optional[Any] = None,
    ):
        self.dataset = dataset
        self.processor = processor
        self.length_fn = length_fn
        self.audio_key = audio_key

        self.batch_duration = batch_duration
        self.min_length = min_length
        self.max_length = max_length
        self.num_buckets = num_buckets
        self.drop_last = drop_last
        # No cap on the number of samples per batch unless one is requested.
        self.max_sample = float("inf") if max_sample is None else max_sample

        # Upper edge of each bucket; the lower edge of the first bucket
        # (min_length itself) is dropped.
        edges = np.linspace(min_length, max_length, num_buckets + 1)
        self.boundaries = edges[1:]

    def set_epoch(self, epoch: int):
        """Forward the epoch to the wrapped dataset (used for shuffling)."""
        self.dataset.set_epoch(epoch)

    def _get_bucket_id(self, length: float) -> int:
        """Return the index of the first bucket whose upper edge is >= length."""
        return bisect.bisect_left(self.boundaries, length)

    def __iter__(self) -> Iterator[List[Dict[str, Any]]]:
        pending = [[] for _ in range(self.num_buckets)]
        # Longest sample currently waiting in each bucket.
        longest = [0.0] * self.num_buckets

        for item in self.dataset:
            if self.processor is not None:
                try:
                    item = self.processor(item)
                except Exception as e:
                    logging.warning(f"Error processing sample: {e}")
                    continue

            if self.length_fn is None:
                waveform = item[self.audio_key]
                length = waveform.size(-1) / self.dataset.sample_rate
            else:
                length = self.length_fn(item)

            # Samples outside the configured range are dropped silently.
            if length < self.min_length or length > self.max_length:
                continue

            slot = self._get_bucket_id(length)
            group = pending[slot]
            group.append(item)
            longest[slot] = max(longest[slot], length)

            # The bucket is full when one more sample of the current maximum
            # length would reach the duration budget, or when the sample cap
            # has been hit.
            reached_budget = longest[slot] * (len(group) + 1) >= self.batch_duration
            reached_cap = len(group) >= self.max_sample
            if reached_budget or reached_cap:
                yield group
                pending[slot] = []
                longest[slot] = 0.0

        if self.drop_last:
            return

        # Flush whatever is left in the buckets as partial batches.
        for slot, group in enumerate(pending):
            if group:
                yield group
                pending[slot] = []
+
+
class PackingIterableDataset(WrappedIterableDataset):
    """Processes samples on the fly and packs them into token-budgeted batches.

    Each raw sample is passed through ``processor``; the resulting
    ``"length"`` field is the sample's token count. Samples are accumulated
    in arrival order and a batch (list of processed samples) is emitted
    whenever the next sample would push the running total above
    ``batch_tokens``.

    Args:
        dataset (Iterable): The raw dataset to process.
        processor (Callable): Called on every raw sample; must return a dict
            containing ``"length"``.
        batch_tokens (int): Maximum number of tokens per batch.
    """

    def __init__(
        self,
        dataset: IterableDataReader,
        processor: Any,
        batch_tokens: int,
    ):
        self.dataset = dataset
        self.processor = processor
        self.batch_tokens = batch_tokens
        # Not read anywhere in this class; kept as a public attribute.
        self.skip_batches = 0

    def set_epoch(self, epoch: int):
        """Forward the epoch to the wrapped dataset (used for shuffling)."""
        self.dataset.set_epoch(epoch)

    def __iter__(self) -> Iterator[List[Dict[str, Any]]]:
        pack = []
        pack_tokens = 0

        for raw_sample in self.dataset:
            try:
                item = self.processor(raw_sample)
            except Exception as e:
                # A sample that cannot be processed is logged and skipped.
                logging.warning(f"Error processing sample {raw_sample}: {e}")
                continue

            n_tokens = item["length"]

            # A sample that can never fit in a batch is dropped.
            if n_tokens > self.batch_tokens:
                continue

            # Close the current batch first if this sample would overflow it.
            if pack_tokens + n_tokens > self.batch_tokens:
                yield pack
                pack, pack_tokens = [], 0

            pack.append(item)
            pack_tokens += n_tokens

        # Emit the trailing, partially filled batch.
        if pack:
            yield pack
diff --git a/omnivoice/data/collator.py b/omnivoice/data/collator.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1cac27462f919e44d1d359ed8c722f302903a87
--- /dev/null
+++ b/omnivoice/data/collator.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Data collators for OmniVoice training.
+
+Two strategies are available:
+
+- ``PackingDataCollator``: Concatenates samples into a single long sequence
+ (sequence packing). Used with flex_attention. Batch shape is ``[1, C, L]``.
+- ``PaddingDataCollator``: Pads samples to the same length and stacks them.
+ Used with SDPA/eager attention. Batch shape is ``[B, C, max_len]``.
+"""
+
+from typing import Any, Dict, List
+
+import torch
+
+
class PaddingDataCollator:
    """Right-pads processed samples to a common length and stacks them.

    The result is a ``[B, C, max_len]`` batch for SDPA/eager attention: B is
    the number of samples, C the number of audio codebook layers and max_len
    the longest ``"length"`` in the batch.

    The batch also carries a boolean attention mask of shape
    ``[B, 1, max_len, max_len]`` in which every query position may attend to
    every non-padding key position (bidirectional), matching the
    masked-diffusion training objective. When passed as a 4D tensor,
    HuggingFace models use it directly without adding a causal mask.

    No ``document_ids`` are produced: each sample has its own batch row.
    """

    def __init__(self, processor, batch_tokens: int):
        self.processor = processor
        # Stored but not used when padding.
        self.batch_tokens = batch_tokens

    def __call__(self, processed_samples: List[Dict[str, Any]]) -> Dict[str, Any]:
        pad_token = self.processor.text_tokenizer.pad_token_id
        lengths = [sample["length"] for sample in processed_samples]
        longest = max(lengths)
        batch_size = len(processed_samples)

        def right_pad(tensor, length, fill):
            # Extend the last dimension up to ``longest`` with ``fill``.
            return torch.nn.functional.pad(tensor, (0, longest - length), value=fill)

        rows_input_ids = []
        rows_labels = []
        rows_audio_mask = []
        rows_position_ids = []
        # key_is_real[b, j] is True when position j of sample b is not padding.
        key_is_real = torch.zeros(batch_size, longest, dtype=torch.bool)

        for row, (sample, length) in enumerate(zip(processed_samples, lengths)):
            rows_input_ids.append(right_pad(sample["input_ids"], length, pad_token))
            rows_labels.append(right_pad(sample["labels"], length, -100))
            rows_audio_mask.append(right_pad(sample["audio_mask"], length, False))
            rows_position_ids.append(
                right_pad(torch.arange(length, dtype=torch.long), length, 0)
            )
            key_is_real[row, :length] = True

        # mask[b, 0, i, j] = key_is_real[b, j]: the same key mask for every query.
        attention_mask = (
            key_is_real[:, None, None, :]
            .expand(batch_size, 1, longest, longest)
            .contiguous()
        )

        return {
            "input_ids": torch.stack(rows_input_ids, dim=0),  # [B, C, max_len]
            "labels": torch.stack(rows_labels, dim=0),  # [B, C, max_len]
            "audio_mask": torch.stack(rows_audio_mask, dim=0),  # [B, max_len]
            "position_ids": torch.stack(rows_position_ids, dim=0),  # [B, max_len]
            "attention_mask": attention_mask,  # [B, 1, max_len, max_len]
        }
+
+
class PackingDataCollator:
    """Concatenates processed samples into one fixed-length packed sequence.

    All samples of a batch are joined along the sequence dimension and the
    result is right-padded to exactly ``batch_tokens`` positions, giving
    tensors of shape ``[1, C, L]`` / ``[1, L]`` (C is the number of codebook
    layers of the audio tokenizer, L equals ``batch_tokens``).

    Besides ``input_ids``, ``labels``, ``audio_mask`` and ``position_ids``
    (which restart at 0 for every sample), the batch contains
    ``document_ids``: the index of the sample each position belongs to, with
    ``-1`` marking padding.
    """

    def __init__(self, processor, batch_tokens: int):
        self.batch_tokens = batch_tokens
        self.processor = processor

    def __call__(self, processed_samples: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Pack ``processed_samples`` into a single ``batch_tokens``-long row.

        Raises:
            ValueError: If the samples together are longer than
                ``batch_tokens``.
        """
        target_length = self.batch_tokens
        lengths = [s["length"] for s in processed_samples]

        input_ids = torch.cat(
            [s["input_ids"] for s in processed_samples], dim=1
        )  # [C, Total_Len]
        labels = torch.cat(
            [s["labels"] for s in processed_samples], dim=1
        )  # [C, Total_Len]
        audio_mask = torch.cat(
            [s["audio_mask"] for s in processed_samples], dim=0
        )  # [Total_Len]
        position_ids = torch.cat(
            [torch.arange(n, dtype=torch.long) for n in lengths], dim=0
        )  # [Total_Len]
        document_ids = torch.cat(
            [torch.full((n,), i, dtype=torch.int32) for i, n in enumerate(lengths)],
            dim=0,
        )  # [Total_Len]

        pad_length = target_length - input_ids.shape[1]
        if pad_length < 0:
            # A negative amount would make ``pad`` crop the tensors instead,
            # silently cutting the last sample(s) short.
            raise ValueError(
                f"Packed batch has {input_ids.shape[1]} tokens, which exceeds "
                f"batch_tokens={target_length}."
            )

        def right_pad(tensor, fill):
            return torch.nn.functional.pad(tensor, pad=(0, pad_length), value=fill)

        input_ids = right_pad(input_ids, self.processor.text_tokenizer.pad_token_id)
        labels = right_pad(labels, -100)  # -100 marks padding in the labels
        audio_mask = right_pad(audio_mask, False)
        position_ids = right_pad(position_ids, 0)
        document_ids = right_pad(document_ids, -1)  # -1 marks padding positions

        return {
            "input_ids": input_ids.unsqueeze(0),  # [1, C, L]
            "labels": labels.unsqueeze(0),  # [1, C, L]
            "audio_mask": audio_mask.unsqueeze(0),  # [1, L]
            "position_ids": position_ids.unsqueeze(0),  # [1, L]
            "document_ids": document_ids.unsqueeze(0),  # [1, L]
        }
diff --git a/omnivoice/data/dataset.py b/omnivoice/data/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3811e9b5a7364f2d54f54d3a4af8211266826f9
--- /dev/null
+++ b/omnivoice/data/dataset.py
@@ -0,0 +1,544 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Dataset and data-loading utilities for training and evaluation.
+
+Provides WebDataset-based iterable datasets, manifest parsing, and audio/token
+loading. Used by ``omnivoice.training.builder.build_dataloaders()`` to construct
+train and eval data loaders.
+
+Key functions:
+- ``prepare_data_manifests_from_json()``: Parses a data config JSON into train/dev
+ manifests.
+
+Key classes:
+- ``WebDatasetReader``: Reads audio/text pairs from WebDataset tar shards as an
+ iterable dataset.
+- ``MuxWebDatasetReader``: Multiplexes multiple WebDataset readers for
+ multilingual data.
+- ``JsonlDatasetReader``: Reads audio/text pairs from a JSONL manifest file.
+ Used by data processing scripts (e.g. ``omnivoice/scripts/``).
+- ``SampleDecoder``: Decodes individual samples (audio or tokens + labels).
+"""
+
+import io
+import json
+import logging
+import os
+import random
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+import torch
+import torch.distributed as dist
+import webdataset as wds
+
+from omnivoice.utils.audio import load_audio, load_audio_bytes
+from torch.utils.data import IterableDataset
+
+
def load_audio_webdataset(data, sample_rate: int = 24000, device="cpu"):
    """Decode raw audio bytes into a tensor placed on ``device``.

    Decoding (and resampling to ``sample_rate`` if needed) is delegated to
    ``load_audio_bytes``; its NumPy result is wrapped as a tensor.

    Returns:
        A tensor of shape (1, num_samples).
    """
    waveform = load_audio_bytes(data, sample_rate)
    return torch.from_numpy(waveform).to(device)
+
+
def prepare_data_manifests_from_json(
    data_config: str,
) -> Tuple[List[Tuple[str, str, int, float]], List[Tuple[str, str, int, float]]]:
    """
    Prepare data manifests from a json file.
    A typical multilingual json file is in the following format:
    {
        "train":
        [
            {
                "language_id": "en",
                "manifest_path": [
                    "/Emilia/EN/data.lst"
                ],
                "repeat": 1
            },
            {
                "language_id": "zh",
                "manifest_path": [
                    "/Emilia/ZH/data.lst"
                ],
                "repeat": 1
            }
        ],
        "dev":
        [
            {
                "language_id": "en",
                "manifest_path": [
                    "/Emilia/EN-dev/data.lst"
                ],
                "repeat": 1
            },
            {
                "language_id": "zh",
                "manifest_path": [
                    "/Emilia/ZH-dev/data.lst"
                ],
                "repeat": 1
            }
        ]
    }

    "language_id" is not used, just for better organization of multilingual data.
    "repeat" is an optional field, default to 1, which indicates how many times
    the manifest should be repeated.
    "dev" is optional; without it the returned dev manifest list is empty.

    The simplest format is like:
    {
        "train":
        [
            {
                "manifest_path": [
                    "/Emilia/EN/data.lst",
                    "/Emilia/ZH/data.lst"
                ]
            }
        ],
        "dev":
        [
            {
                "manifest_path": [
                    "/Emilia/EN-dev/data.lst",
                    "/Emilia/ZH-dev/data.lst"
                ]
            }
        ]
    }

    data.lst format (items separated by space):
    /path/to/data.tar /path/to/label.jsonl num_items num_seconds

    Returns:
        ``(train_manifests, dev_manifests)``; each is a list of
        ``(tar_path, label_jsonl_path, num_items, num_seconds)`` tuples.

    Raises:
        FileNotFoundError: If a listed manifest path is not an existing file
            (checked for both the train and the dev split).
        KeyError: If the config has no "train" entry or an item has no
            "manifest_path".
    """
    with open(data_config, "r", encoding="utf-8") as f:
        data = json.load(f)

    def expand(items):
        # Read every manifest of one split, honouring the "repeat" factor.
        manifests = []
        for item in items:
            repeat = item.get("repeat", 1)
            for manifest_path in item["manifest_path"]:
                # Explicit check instead of ``assert`` so it also runs under -O.
                if not os.path.isfile(manifest_path):
                    raise FileNotFoundError(f"{manifest_path} is not a file.")
                manifests.extend(webdataset_manifest_reader(manifest_path) * repeat)
        return manifests

    train_manifests = expand(data["train"])
    dev_manifests = expand(data["dev"]) if "dev" in data else []
    return train_manifests, dev_manifests
+
+
def webdataset_manifest_reader(
    manifest_path: str,
) -> List[Tuple[str, str, int, float]]:
    """
    Read a manifest file containing webdataset tar paths and label jsonl paths.
    Each non-empty line in the manifest file is in the format of:
    /path/to/data.tar /path/to/label.jsonl num_items num_seconds

    Returns:
        One ``(tar_path, label_jsonl_path, num_items, num_seconds)`` tuple per
        non-empty line, in file order, with ``num_items`` converted to ``int``
        and ``num_seconds`` to ``float``.

    Raises:
        ValueError: If a line does not have exactly four whitespace-separated
            fields, or if the numeric fields cannot be converted.
    """
    manifests = []
    with open(manifest_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split()
            if len(parts) != 4:
                raise ValueError(
                    f"Invalid manifest line: {line}. "
                    f"Each line must contain "
                    "tar_path, label_jsonl_path, num_items, num_seconds."
                )
            tar_path, label_jsonl_path, num_items, num_seconds = parts
            manifests.append(
                (tar_path, label_jsonl_path, int(num_items), float(num_seconds))
            )
    return manifests
+
+
+class SampleDecoder:
+ """
+ Decode a sample from webdataset, including loading audio/tokens and fetching label.
+
+ The label file of the shard that was read last is kept in ``label_dataset``
+ and is reloaded only when a sample comes from a shard mapped to a
+ different label file.
+ """
+
+ def __init__(
+ self,
+ tar_to_label: Dict,
+ sample_rate: int = 24000,
+ audio_format: Optional[Tuple[str, ...]] = None,
+ normalize_audio: bool = True,
+ ):
+ """
+ Args:
+ tar_to_label:
+ A dict mapping from audio tar file to the path of its label jsonl
+ file (the value is passed to ``LabelDataset``).
+ sample_rate:
+ Target sample rate for audio. Required if audio is loaded.
+ audio_format:
+ Tuple of audio file extensions to look for in the sample, tried
+ in order. Defaults to ("flac", "wav", "mp3").
+ normalize_audio:
+ If True, loaded audio is divided by its peak absolute value and
+ scaled by 0.9.
+ """
+ self.tar_to_label = tar_to_label
+ self.sample_rate = sample_rate
+ # Loaded lazily on the first sample and swapped when the shard changes.
+ self.label_dataset = None
+ if audio_format is None:
+ self.audio_format = ("flac", "wav", "mp3")
+ else:
+ self.audio_format = audio_format
+ self.normalize_audio = normalize_audio
+
+ def __call__(self, sample):
+ # Decode one webdataset sample dict; the label is looked up by the
+ # sample's "__key__" and a missing key raises KeyError.
+ return_dict = {}
+ src = sample["__url__"]
+ key = sample["__key__"]
+ # (Re)load the labels only when this sample's shard uses another file.
+ if (
+ self.label_dataset is None
+ or self.label_dataset.path != self.tar_to_label[src]
+ ):
+ self.label_dataset = LabelDataset(self.tar_to_label[src])
+
+ # Stays empty when the sample contains none of the audio extensions.
+ audio = torch.empty(0)
+ if "npy" in sample:
+ # Pre-extracted audio tokens stored as a NumPy array.
+ audio_tokens = torch.from_numpy(sample["npy"])
+ return_dict["audio_tokens"] = audio_tokens
+ else:
+ # Use the first extension from ``audio_format`` present in the sample.
+ for ext in self.audio_format:
+ if ext in sample:
+ # load audio (1, num_samples)
+ audio = load_audio_webdataset(
+ sample[ext], sample_rate=self.sample_rate
+ )
+ if self.normalize_audio:
+ # Peak-normalize to 0.9; the epsilon avoids dividing by zero.
+ audio = (audio / (audio.abs().max() + 1e-7)) * 0.9
+ break
+ # NOTE(review): confirm whether "audio"/"audio_duration" are meant to be
+ # set for token ("npy") samples as well, or only for raw-audio samples.
+ return_dict["audio"] = audio
+ return_dict["audio_duration"] = audio.size(-1) / self.sample_rate
+
+ label = self.label_dataset[key]
+
+ return_dict["label"] = label
+ return return_dict
+
+
class LabelDataset:
    """In-memory lookup of label records loaded from one JSONL file.

    Records are indexed by their ``"id"`` field; ``dataset[key]`` returns the
    whole record and raises ``KeyError`` for an unknown id. The source path is
    kept in ``path``.
    """

    def __init__(self, jsonl_path: str):
        """
        Load labels from a jsonl file.
        Args:
            jsonl_path:
                Path to the jsonl file containing labels.
                Each non-empty line is a JSON object in the format of:
                {"id": "sample id", "text": "transcription text"}
                Records without an "id" field cannot be looked up and are
                skipped (reported once in a warning).
        Raises:
            FileNotFoundError: If ``jsonl_path`` does not exist.
        """
        self._labels = {}
        self.path = jsonl_path
        if not os.path.exists(jsonl_path):
            raise FileNotFoundError(f"Label jsonl file {jsonl_path} does not exist.")
        num_without_id = 0
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                item = json.loads(line)
                if "id" in item:
                    self._labels[item["id"]] = item
                else:
                    num_without_id += 1
        if num_without_id:
            logging.warning(
                f"Skipped {num_without_id} label record(s) without an 'id' "
                f"field in {jsonl_path}"
            )

    def __getitem__(self, key):
        return self._labels[key]
+
+
class IterableDataReader:
    """Interface implemented by the classes that read data.

    A reader exposes the audio ``sample_rate`` it produces, accepts the
    current epoch through ``set_epoch``, and iterates over sample dicts.
    Every method here raises ``NotImplementedError`` until overridden.
    """

    # Declared only; concrete readers assign the value.
    sample_rate: int

    def set_epoch(self, epoch: int):
        """Tell the reader which epoch is about to be iterated."""
        raise NotImplementedError

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        """Iterate over samples, one dict per sample."""
        raise NotImplementedError

    def __len__(self) -> int:
        """Return the number of samples."""
        raise NotImplementedError
+
+
class WrappedIterableDataset(IterableDataset):
    """Interface of the ``IterableDataset`` wrappers used in this project.

    A wrapper accepts the current epoch through ``set_epoch`` and iterates
    over batches, each batch being a list of sample dicts. Both methods raise
    ``NotImplementedError`` until overridden.
    """

    def set_epoch(self, epoch: int):
        """Tell the dataset which epoch is about to be iterated."""
        raise NotImplementedError

    def __iter__(self) -> Iterator[List[Dict[str, Any]]]:
        """Iterate over batches (lists of sample dicts)."""
        raise NotImplementedError
+
+
class WebDatasetReader(IterableDataReader):
    """Iterates over decoded samples stored in WebDataset tar shards.

    ``manifests`` is a list of
    ``(tar_path, label_jsonl_path, num_items, num_seconds)`` tuples. Shards
    are split across nodes and DataLoader workers by WebDataset, and every
    sample is decoded by a ``SampleDecoder``. Unless ``evaluation`` is True,
    the shard order is shuffled per epoch and samples go through a shuffle
    buffer of ``shuffle_buffer_size`` items.
    """

    def __init__(
        self,
        manifests: List[Tuple[str, str, int, float]],
        evaluation: bool = False,
        shuffle_buffer_size: int = 20000,
        sample_rate: int = 24000,
    ):
        self.evaluation = evaluation
        self.shuffle_buffer_size = shuffle_buffer_size
        self.sample_rate = sample_rate
        self.epoch = 0

        # Shard list in manifest order, shard -> label file, and the totals
        # reported by the manifests.
        self.orig_urls = []
        self.tar_to_label = {}
        self.num_items = 0
        self.num_seconds = 0.0
        for shard, labels, item_count, seconds in manifests:
            self.orig_urls.append(shard)
            self.tar_to_label[shard] = labels
            self.num_items += item_count
            self.num_seconds += seconds

        # Shard order actually used for iteration; see ``set_epoch``.
        self.urls = list(self.orig_urls)
        self.sample_decoder = SampleDecoder(
            tar_to_label=self.tar_to_label,
            sample_rate=sample_rate,
        )

    def set_epoch(self, epoch: int):
        """Record the epoch and derive the shard order from it.

        The order restarts from the manifest order and, outside evaluation,
        is shuffled with the epoch as seed.
        """
        self.epoch = epoch
        ordered = list(self.orig_urls)
        if not self.evaluation:
            random.Random(epoch).shuffle(ordered)
        self.urls = ordered

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        shards = wds.WebDataset(
            self.urls,
            shardshuffle=False,
            workersplitter=wds.split_by_worker,
            nodesplitter=wds.split_by_node,
        )
        decoded = shards.decode().map(self.sample_decoder)
        if self.evaluation:
            # Evaluation keeps the samples in shard order.
            return iter(decoded)
        shuffled = decoded.shuffle(self.shuffle_buffer_size, seed=self.epoch)
        return iter(shuffled)

    def __len__(self) -> int:
        """Return the number of items summed over the manifests."""
        return self.num_items
+
+
class JsonlDatasetReader(IterableDataReader):
    """Read raw JSONL and load audio files, matching WebDatasetReader output format.

    Each JSONL line should be a JSON object with at least:
        {"id": "...", "audio_path": "/path/to/audio.wav", ...}

    Yields dicts of the form: {"audio": Tensor(1, T), "label": dict}, where
    the label is the JSONL record with an added ``"audio_duration"`` (seconds).
    Entries whose audio file is missing or cannot be loaded are logged and
    skipped.
    """

    def __init__(
        self,
        jsonl_path: str,
        sample_rate: int = 24_000,
        shuffle: bool = True,
        shuffle_seed: int = 42,
        normalize_audio: bool = True,
    ):
        self.jsonl_path = jsonl_path
        self.sample_rate = sample_rate
        self.shuffle = shuffle
        self.shuffle_seed = shuffle_seed
        self.normalize_audio = normalize_audio

    def set_epoch(self, epoch: int):
        """Use the epoch number as the shuffle seed of the next iteration."""
        self.shuffle_seed = epoch

    def _read_lines(self) -> List[Dict[str, Any]]:
        """Load all entries into memory, shuffled when ``shuffle`` is set."""
        entries = list(self._stream_lines())
        if self.shuffle:
            # A private RNG gives the same permutation as seeding the global
            # one, but leaves the process-wide ``random`` state untouched for
            # other code that draws random numbers.
            random.Random(self.shuffle_seed).shuffle(entries)
            logging.info(
                f"Shuffled {len(entries)} JSONL entries (seed={self.shuffle_seed})"
            )
        return entries

    def _stream_lines(self):
        """Yield the parsed JSON object of every non-empty line, lazily."""
        with open(self.jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)

    def __iter__(self):
        source = self._read_lines() if self.shuffle else self._stream_lines()

        # Split data across distributed ranks (multi-GPU / DDP). The
        # availability check keeps this working on builds of torch without
        # distributed support; the generator keeps the non-shuffled path lazy.
        if dist.is_available() and dist.is_initialized():
            rank = dist.get_rank()
            world_size = dist.get_world_size()
            source = (
                item for i, item in enumerate(source) if i % world_size == rank
            )

        # Split data across DataLoader workers to avoid duplication
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is not None:
            source = (
                item
                for i, item in enumerate(source)
                if i % worker_info.num_workers == worker_info.id
            )

        for meta in source:
            audio_path = meta.get("audio_path")
            if not audio_path or not os.path.exists(audio_path):
                logging.warning(
                    f"Skipping {meta.get('id', '?')}: audio_path missing or not found"
                )
                continue
            try:
                waveform = torch.from_numpy(load_audio(audio_path, self.sample_rate))
                if self.normalize_audio:
                    # Peak-normalize to 0.9; the epsilon avoids dividing by zero.
                    waveform = (waveform / (waveform.abs().max() + 1e-7)) * 0.9
                meta["audio_duration"] = waveform.shape[1] / self.sample_rate
                yield {"audio": waveform, "label": meta}
            except Exception as e:
                # Best effort: an unreadable file must not stop the stream.
                logging.warning(f"Skipping {meta.get('id', '?')}: {e}")
+
+
class MuxWebDatasetReader(IterableDataReader):
    """Randomly interleaves the samples of several ``WebDatasetReader``s.

    Iteration is delegated to a ``LazyIteratorMultiplexer`` built over
    ``readers`` with the given ``weights``, ``stop_early`` and ``seed``.
    """

    def __init__(
        self,
        readers: List[WebDatasetReader],
        weights: Optional[List[float]] = None,
        stop_early: bool = False,
        seed: int = 0,
    ):
        self.readers = readers
        self.stop_early = stop_early
        self.mux_iterator = LazyIteratorMultiplexer(
            *readers,
            stop_early=stop_early,
            weights=weights,
            seed=seed,
        )
        # The reader interface declares ``sample_rate`` and wrappers such as
        # StreamLengthGroupDataset read it, so expose the first reader's
        # value. NOTE(review): assumes all readers share one sample rate.
        if hasattr(readers[0], "sample_rate"):
            self.sample_rate = readers[0].sample_rate

    def set_epoch(self, epoch: int):
        """
        Set the epoch for shuffling in every underlying reader.
        """
        for reader in self.readers:
            reader.set_epoch(epoch)

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        return iter(self.mux_iterator)

    def __len__(self) -> int:
        """Return the total number of items over all readers."""
        return len(self.mux_iterator)
+
+
class LazyIteratorMultiplexer:
    """
    A wrapper over multiple iterables that, at each step, randomly selects
    the iterable used to yield the next item (in the style of Lhotse's lazy
    iterator multiplexer).

    Since the iterables might be of different length, we provide
    a ``weights`` parameter to let the user decide which iterables
    should be sampled more frequently than others. Without ``weights``, each
    iterable is weighted by its share of the total length when all of them
    support ``len()``, and uniformly otherwise.
    When an iterable is exhausted, we will keep sampling from the other iterables, until
    we exhaust them all, unless ``stop_early`` is set to ``True``.

    The selection sequence is driven by ``random.Random(seed)`` and restarts
    on every ``iter()`` call.
    """

    def __init__(
        self,
        *iterators: "IterableDataReader",
        stop_early: bool = False,
        weights: Optional[List[float]] = None,
        seed: int = 0,
    ) -> None:
        self.iterators = list(iterators)
        self.stop_early = stop_early
        self.seed = seed

        assert (
            len(self.iterators) > 1
        ), "There have to be at least two iterables to multiplex."

        if weights is None:
            uniform = [1] * len(self.iterators)
            if all(hasattr(it, "__len__") for it in self.iterators):
                lengths = [len(it) for it in self.iterators]
                total_length = sum(lengths)
                # All-empty inputs cannot be weighted by length share.
                if total_length > 0:
                    self.weights = [length / total_length for length in lengths]
                else:
                    self.weights = uniform
            else:
                self.weights = uniform
        else:
            self.weights = weights

        assert len(self.iterators) == len(self.weights)

    def __iter__(self):
        rng = random.Random(self.seed)
        iters = [iter(it) for it in self.iterators]
        exhausted = [False] * len(iters)

        def should_continue():
            if self.stop_early:
                return not any(exhausted)
            return not all(exhausted)

        while should_continue():
            active_indexes = [i for i, done in enumerate(exhausted) if not done]
            active_weights = [self.weights[i] for i in active_indexes]
            if sum(active_weights) <= 0:
                # Only zero-weight iterables are left; ``choices`` rejects an
                # all-zero weight vector, so drain them uniformly instead.
                active_weights = None
            idx = rng.choices(active_indexes, weights=active_weights, k=1)[0]
            try:
                item = next(iters[idx])
            except StopIteration:
                exhausted[idx] = True
                continue
            yield item

    def __len__(self) -> int:
        """Return the summed length of all wrapped iterables."""
        return sum(len(iterator) for iterator in self.iterators)
diff --git a/omnivoice/data/processor.py b/omnivoice/data/processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e3ec1b320a7178f895dd542cdbd9c70b16b3bb2
--- /dev/null
+++ b/omnivoice/data/processor.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Training sample processor for OmniVoice.
+
+Converts raw audio/text samples into model-ready tensors: applies prompt/mask
+tokenization, randomly drops conditioning, and injects language/instruct tokens.
+Used by ``omnivoice.training.builder`` to build the data pipeline.
+
+Contains two processor classes:
+- ``OmniVoiceSampleProcessor``: Full processor used for training.
+- ``OmniVoiceSimpleSampleProcessor``: Simplified processor (not used for training).
+"""
+
+import random
+from typing import Any, Dict
+
+import torch
+
+
class OmniVoiceSampleProcessor:
    """Convert one raw training sample into model-ready tensors.

    On every call the processor randomly decides whether to drop all
    conditioning, how much of the audio is kept as an unmasked prompt, which
    fraction of the remaining audio tokens is masked, and whether the
    language / instruct fields are exposed in the style prefix.
    """

    def __init__(
        self,
        text_tokenizer: Any,
        num_channels: int,
        audio_mask_id: int,
        prompt_ratio_range: tuple,
        mask_ratio_range: tuple,
        drop_cond_ratio: float,
        language_ratio: float,
        use_pinyin_ratio: float,
        instruct_ratio: float,
        only_instruct_ratio: float,
    ):
        """Store the tokenizer and the sampling hyper-parameters.

        Args:
            text_tokenizer: Callable used as
                ``text_tokenizer(text, return_tensors="pt").input_ids``.
            num_channels: How many times each tokenized text row is repeated
                along dim 0 before being concatenated with the audio tokens.
            audio_mask_id: Token id written into masked audio positions.
            prompt_ratio_range: ``(low, high)`` passed to ``random.uniform``
                to draw the prompt fraction of the audio.
            mask_ratio_range: ``(low, high)`` passed to ``random.uniform`` to
                draw the masking probability of non-prompt tokens.
            drop_cond_ratio: Probability of dropping all conditioning.
            language_ratio: Probability of exposing ``language_id``.
            use_pinyin_ratio: Probability of using ``text_pinyin`` (when the
                label has it) instead of ``text``.
            instruct_ratio: Probability of exposing ``instruct``.
            only_instruct_ratio: Given that the instruct is exposed,
                probability of forcing the prompt ratio to zero.
        """
        self.text_tokenizer = text_tokenizer
        self.num_channels = num_channels
        self.audio_mask_id = audio_mask_id
        self.prompt_ratio_range = prompt_ratio_range
        self.mask_ratio_range = mask_ratio_range
        self.drop_cond_ratio = drop_cond_ratio

        self.language_ratio = language_ratio
        self.use_pinyin_ratio = use_pinyin_ratio
        self.instruct_ratio = instruct_ratio
        self.only_instruct_ratio = only_instruct_ratio

    def __call__(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Build ``input_ids``, ``labels``, ``audio_mask`` and ``length``.

        Args:
            sample: Mapping with ``"label"`` (needs ``"text"``; optionally
                ``"text_pinyin"``, ``"language_id"``, ``"instruct"``,
                ``"clean_start_token_idx"``) and ``"audio_tokens"`` (a 2-D
                tensor indexed as ``[channel, time]``).

        Returns:
            Dict with ``input_ids`` and ``labels`` (``[C, L]``),
            ``audio_mask`` (``[L]``, True on audio positions) and ``length``.
        """
        label = sample["label"]

        # clean_start_token_idx is only present for prompt-denoising training,
        # where the prompt region is augmented with noise and the model has to
        # recover the clean prompt. It marks where the clean generated tokens
        # start. Such samples never drop their conditioning.
        is_denoise = "clean_start_token_idx" in label
        drop_cond = (not is_denoise) and (
            random.uniform(0, 1) < self.drop_cond_ratio
        )

        # NOTE: the order of the random draws below is part of the behaviour.
        if drop_cond:
            prompt_ratio = 0.0
            use_language = False
            use_instruct = False
        else:
            prompt_ratio = random.uniform(*self.prompt_ratio_range)
            use_language = random.uniform(0, 1) < self.language_ratio
            use_instruct = random.uniform(0, 1) < self.instruct_ratio
            if use_instruct and random.uniform(0, 1) < self.only_instruct_ratio:
                prompt_ratio = 0.0
        drop_text = drop_cond

        mask_ratio = random.uniform(*self.mask_ratio_range)

        # --- Style ---
        language = label.get("language_id", "None") if use_language else "None"
        instruct = label.get("instruct", "None") if use_instruct else "None"

        style = "<|denoise|>" if is_denoise else ""
        style += f"<|lang_start|>{language}<|lang_end|>"
        style += f"<|instruct_start|>{instruct}<|instruct_end|>"

        style_ids = self.text_tokenizer(style, return_tensors="pt").input_ids
        style_inputs = style_ids.repeat(self.num_channels, 1)
        # Style prompt does not compute loss
        style_labels = torch.full(style_inputs.shape, -100)

        # --- Text ---
        if (
            "text_pinyin" in label
            and random.uniform(0, 1) < self.use_pinyin_ratio
        ):
            text = label["text_pinyin"]
        else:
            text = label["text"]
        text_ids = self.text_tokenizer(
            f"<|text_start|>{text}<|text_end|>", return_tensors="pt"
        ).input_ids
        text_inputs = text_ids.repeat(self.num_channels, 1)
        # Text does not compute loss
        text_labels = torch.full(text_inputs.shape, -100)

        # --- Audio ---
        audio_tokens = sample["audio_tokens"].long()

        if is_denoise:
            prompt_length = label["clean_start_token_idx"]
        else:
            prompt_length = int(audio_tokens.shape[1] * prompt_ratio)

        audio_inputs = audio_tokens.clone()
        audio_labels = audio_tokens.clone()

        # Mask a random subset of the non-prompt tokens.
        maskable_region = audio_tokens[:, prompt_length:]
        token_mask = torch.rand(maskable_region.shape) < mask_ratio
        audio_inputs[:, prompt_length:][token_mask] = self.audio_mask_id
        # Only compute loss on masked tokens
        audio_labels[:, prompt_length:][~token_mask] = -100
        if not drop_cond:
            # No loss on prompt region
            audio_labels[:, :prompt_length] = -100

        # --- Concatenation ---
        if drop_text:
            input_ids = audio_inputs
            labels = audio_labels
            total_length = input_ids.shape[1]
            audio_mask = torch.ones(total_length, dtype=torch.bool)
        else:
            input_ids = torch.cat([style_inputs, text_inputs, audio_inputs], dim=1)
            labels = torch.cat([style_labels, text_labels, audio_labels], dim=1)
            total_length = input_ids.shape[1]
            audio_start_idx = style_inputs.shape[1] + text_inputs.shape[1]
            audio_mask = torch.zeros(total_length, dtype=torch.bool)
            audio_mask[audio_start_idx:] = True

        return {
            "input_ids": input_ids,  # [C, L]
            "labels": labels,  # [C, L]
            "audio_mask": audio_mask,  # [L]
            "length": total_length,
        }
+
+
class OmniVoiceSimpleSampleProcessor:
    """Minimal sample processor: text prefix plus randomly masked audio.

    Compared with ``OmniVoiceSampleProcessor`` it has no language tag, no
    instruct tag and no denoising prompt. It is not used for training (the
    full processor covers this case) and is kept as a reference
    implementation that shows the basic logic.
    """

    def __init__(
        self,
        text_tokenizer: Any,
        num_channels: int,
        audio_mask_id: int,
        prompt_ratio_range: tuple,
        mask_ratio_range: tuple,
        drop_cond_ratio: float,
    ):
        """Store the tokenizer and the sampling hyper-parameters.

        Args:
            text_tokenizer: Callable used as
                ``text_tokenizer(text, return_tensors="pt").input_ids``.
            num_channels: How many times the tokenized text row is repeated
                along dim 0.
            audio_mask_id: Token id written into masked audio positions.
            prompt_ratio_range: ``(low, high)`` for the prompt fraction.
            mask_ratio_range: ``(low, high)`` for the masking probability.
            drop_cond_ratio: Probability of dropping the text and the prompt.
        """
        self.text_tokenizer = text_tokenizer
        self.num_channels = num_channels
        self.audio_mask_id = audio_mask_id
        self.prompt_ratio_range = prompt_ratio_range
        self.mask_ratio_range = mask_ratio_range
        self.drop_cond_ratio = drop_cond_ratio

    def __call__(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Build ``input_ids``, ``labels``, ``audio_mask`` and ``length``.

        Args:
            sample: Mapping with ``sample["label"]["text"]`` and
                ``sample["audio_tokens"]`` (2-D, indexed ``[channel, time]``).

        Returns:
            Dict with ``input_ids`` / ``labels`` (``[C, L]``), ``audio_mask``
            (``[L]``, True on audio positions) and ``length``.
        """
        # The order of these random draws is part of the behaviour.
        drop_cond = random.uniform(0, 1) < self.drop_cond_ratio
        mask_ratio = random.uniform(*self.mask_ratio_range)
        prompt_ratio = (
            0.0 if drop_cond else random.uniform(*self.prompt_ratio_range)
        )

        # --- Text ---
        text = sample["label"]["text"]
        text_ids = self.text_tokenizer(
            f"<|text_start|>{text}<|text_end|>", return_tensors="pt"
        ).input_ids
        text_inputs = text_ids.repeat(self.num_channels, 1)
        # Text does not compute loss
        text_labels = torch.full(text_inputs.shape, -100)

        # --- Audio ---
        audio_tokens = sample["audio_tokens"].long()
        prompt_length = int(audio_tokens.shape[1] * prompt_ratio)
        audio_inputs = audio_tokens.clone()
        audio_labels = audio_tokens.clone()

        # Mask a random subset of the non-prompt tokens.
        maskable_region = audio_tokens[:, prompt_length:]
        token_mask = torch.rand(maskable_region.shape) < mask_ratio
        audio_inputs[:, prompt_length:][token_mask] = self.audio_mask_id
        # Only compute loss on masked tokens
        audio_labels[:, prompt_length:][~token_mask] = -100

        if not drop_cond:
            # No loss on prompt region
            audio_labels[:, :prompt_length] = -100

        # --- Concatenation ---
        if drop_cond:
            input_ids = audio_inputs
            labels = audio_labels
            total_length = input_ids.shape[1]
            audio_mask = torch.ones(total_length, dtype=torch.bool)
        else:
            input_ids = torch.cat([text_inputs, audio_inputs], dim=1)
            labels = torch.cat([text_labels, audio_labels], dim=1)
            total_length = input_ids.shape[1]
            audio_start_idx = text_inputs.shape[1]
            audio_mask = torch.zeros(total_length, dtype=torch.bool)
            audio_mask[audio_start_idx:] = True

        return {
            "input_ids": input_ids,  # [C, L]
            "labels": labels,  # [C, L]
            "audio_mask": audio_mask,  # [L]
            "length": total_length,
        }
diff --git a/omnivoice/eval/__init__.py b/omnivoice/eval/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..88e1e0a04cb2911882c19385fe1670c04064862f
--- /dev/null
+++ b/omnivoice/eval/__init__.py
@@ -0,0 +1,4 @@
import warnings

# Ignore every UserWarning, process-wide, as soon as this package is imported.
# The filter was added to hide zhconv warnings that are irrelevant to WER
# calculation, but it is not restricted to zhconv: UserWarnings raised by any
# other library are silenced as well.
warnings.filterwarnings("ignore", category=UserWarning)
diff --git a/omnivoice/eval/models/ecapa_tdnn_wavlm.py b/omnivoice/eval/models/ecapa_tdnn_wavlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..1219fbcb152fdb390b371ab183b5af6c98589830
--- /dev/null
+++ b/omnivoice/eval/models/ecapa_tdnn_wavlm.py
@@ -0,0 +1,374 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
class ECAPA_TDNN_WAVLM(nn.Module):
    """ECAPA-TDNN embedding network on top of WavLM hidden states.

    The WavLM feature extractor is loaded through ``torch.hub``; its hidden
    states are combined with a learned softmax weighting, instance-normalised
    and passed through the ECAPA-TDNN stack to give an ``emb_dim`` vector.
    """

    def __init__(
        self,
        feat_dim=80,
        channels=512,
        emb_dim=192,
        global_context_att=False,
        sr=16000,
        ssl_model_path=None,
    ):
        """Load the SSL feature extractor and build the ECAPA-TDNN layers.

        Args:
            feat_dim: Channel count given to the instance norm and first conv.
            channels: Width of the four TDNN stages.
            emb_dim: Size of the output embedding.
            global_context_att: Forwarded to ``AttentiveStatsPool``.
            sr: Length of the random probe waveform used by ``get_feat_num``.
            ssl_model_path: Directory containing ``wavlm_large.pt``; its
                parent directory is used as the local ``torch.hub`` repo.
                When None, ``s3prl/s3prl`` ``wavlm_large`` is loaded.
        """
        super().__init__()
        self.sr = sr

        if ssl_model_path is not None:
            self.feature_extract = torch.hub.load(
                os.path.dirname(ssl_model_path),
                "wavlm_local",
                source="local",
                ckpt=os.path.join(ssl_model_path, "wavlm_large.pt"),
            )
        else:
            self.feature_extract = torch.hub.load("s3prl/s3prl", "wavlm_large")

        # On a 24-layer encoder, switch off fp32_attention in layers 23 and
        # 11 wherever that attribute exists.
        encoder_layers = self.feature_extract.model.encoder.layers
        for layer_idx in (23, 11):
            if len(encoder_layers) == 24 and hasattr(
                encoder_layers[layer_idx].self_attn, "fp32_attention"
            ):
                encoder_layers[layer_idx].self_attn.fp32_attention = False

        self.feat_num = self.get_feat_num()
        self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))

        self.instance_norm = nn.InstanceNorm1d(feat_dim)
        # self.channels = [channels] * 4 + [channels * 3]
        self.channels = [channels] * 4 + [1536]

        self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
        # The three SE-Res2 blocks only differ in padding/dilation.
        shared = dict(kernel_size=3, stride=1, scale=8, se_bottleneck_dim=128)
        self.layer2 = SE_Res2Block(
            self.channels[0], self.channels[1], padding=2, dilation=2, **shared
        )
        self.layer3 = SE_Res2Block(
            self.channels[1], self.channels[2], padding=3, dilation=3, **shared
        )
        self.layer4 = SE_Res2Block(
            self.channels[2], self.channels[3], padding=4, dilation=4, **shared
        )

        # self.conv = nn.Conv1d(self.channels[-1], self.channels[-1], kernel_size=1)
        cat_channels = channels * 3
        self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
        self.pooling = AttentiveStatsPool(
            self.channels[-1],
            attention_channels=128,
            global_context_att=global_context_att,
        )
        self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
        self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)

    def get_feat_num(self):
        """Return how many hidden-state tensors the extractor emits.

        Puts the extractor in eval mode and probes it with one random
        waveform of ``self.sr`` samples; returns 1 when ``hidden_states`` is
        not a list/tuple.
        """
        self.feature_extract.eval()
        device = next(self.feature_extract.parameters()).device
        probe = [torch.randn(self.sr).to(device)]
        with torch.no_grad():
            hidden = self.feature_extract(probe)["hidden_states"]
        return len(hidden) if isinstance(hidden, (list, tuple)) else 1

    def get_feat(self, x):
        """Extract SSL features and merge the layers with softmax weights.

        The extractor runs under ``torch.no_grad``; the layer weighting and
        instance norm remain differentiable.
        """
        with torch.no_grad():
            extracted = self.feature_extract(list(x))

        hidden = extracted["hidden_states"]
        if isinstance(hidden, (list, tuple)):
            stacked = torch.stack(hidden, dim=0)
        else:
            stacked = hidden.unsqueeze(0)

        layer_weights = F.softmax(self.feature_weight, dim=-1)[:, None, None, None]
        merged = (layer_weights * stacked).sum(dim=0)
        merged = torch.transpose(merged, 1, 2) + 1e-6
        return self.instance_norm(merged)

    def forward(self, x):
        """Map a batch of waveforms to embeddings of size ``emb_dim``."""
        feats = self.get_feat(x)

        out1 = self.layer1(feats)
        out2 = self.layer2(out1)
        out3 = self.layer3(out2)
        out4 = self.layer4(out3)

        # Aggregate the three SE-Res2 outputs along the channel axis.
        merged = torch.cat([out2, out3, out4], dim=1)
        merged = F.relu(self.conv(merged))
        pooled = self.bn(self.pooling(merged))
        return self.linear(pooled)
+
+
+# part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN
+
+""" Res2Conv1d + BatchNorm1d + ReLU
+"""
+
+
class Res2Conv1dReluBn(nn.Module):
    """Res2Net-style grouped 1-D convolution (conv -> ReLU -> BatchNorm).

    in_channels == out_channels == channels. The channels are split into
    ``scale`` groups of ``channels // scale``; all but the last group are
    convolved, each one receiving the previous group's output as an additive
    input, and the last group is passed through untouched.
    """

    def __init__(
        self,
        channels,
        kernel_size=1,
        stride=1,
        padding=0,
        dilation=1,
        bias=True,
        scale=4,
    ):
        super().__init__()
        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
        self.scale = scale
        self.width = channels // scale
        # With scale == 1 the single group is convolved; otherwise the last
        # group is left as an identity branch.
        self.nums = scale if scale == 1 else scale - 1

        self.convs = nn.ModuleList(
            nn.Conv1d(
                self.width,
                self.width,
                kernel_size,
                stride,
                padding,
                dilation,
                bias=bias,
            )
            for _ in range(self.nums)
        )
        self.bns = nn.ModuleList(
            nn.BatchNorm1d(self.width) for _ in range(self.nums)
        )

    def forward(self, x):
        """Apply the hierarchical group convolutions; output has x's channel count."""
        groups = torch.split(x, self.width, 1)
        pieces = []
        carry = None
        for idx, (conv, bn) in enumerate(zip(self.convs, self.bns)):
            carry = groups[idx] if idx == 0 else carry + groups[idx]
            # Order: conv -> relu -> bn
            carry = bn(F.relu(conv(carry)))
            pieces.append(carry)
        if self.scale != 1:
            pieces.append(groups[self.nums])
        return torch.cat(pieces, dim=1)
+
+
+""" Conv1d + BatchNorm1d + ReLU
+"""
+
+
class Conv1dReluBn(nn.Module):
    """1-D convolution followed by ReLU and then batch normalisation."""

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=1,
        stride=1,
        padding=0,
        dilation=1,
        bias=True,
    ):
        super().__init__()
        conv_args = (in_channels, out_channels, kernel_size, stride, padding, dilation)
        self.conv = nn.Conv1d(*conv_args, bias=bias)
        self.bn = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        """Return ``bn(relu(conv(x)))``."""
        activated = F.relu(self.conv(x))
        return self.bn(activated)
+
+
+""" The SE connection of 1D case.
+"""
+
+
class SE_Connect(nn.Module):
    """Squeeze-and-excitation gate for inputs indexed ``[batch, channel, time]``."""

    def __init__(self, channels, se_bottleneck_dim=128):
        super().__init__()
        self.linear1 = nn.Linear(channels, se_bottleneck_dim)
        self.linear2 = nn.Linear(se_bottleneck_dim, channels)

    def forward(self, x):
        """Scale every channel of ``x`` by a sigmoid gate computed from its time mean."""
        squeezed = x.mean(dim=2)
        hidden = F.relu(self.linear1(squeezed))
        gate = torch.sigmoid(self.linear2(hidden))
        return x * gate.unsqueeze(2)
+
+
+""" SE-Res2Block of the ECAPA-TDNN architecture.
+"""
+
+
+# def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
+# return nn.Sequential(
+# Conv1dReluBn(channels, 512, kernel_size=1, stride=1, padding=0),
+# Res2Conv1dReluBn(512, kernel_size, stride, padding, dilation, scale=scale),
+# Conv1dReluBn(512, channels, kernel_size=1, stride=1, padding=0),
+# SE_Connect(channels)
+# )
+
+
class SE_Res2Block(nn.Module):
    """SE-Res2Block: 1x1 conv -> Res2 conv -> 1x1 conv -> SE gate, plus a residual.

    When ``in_channels != out_channels`` the residual branch is projected
    with a 1x1 convolution so that both branches can be added.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        dilation,
        scale,
        se_bottleneck_dim,
    ):
        super().__init__()
        self.Conv1dReluBn1 = Conv1dReluBn(
            in_channels, out_channels, kernel_size=1, stride=1, padding=0
        )
        self.Res2Conv1dReluBn = Res2Conv1dReluBn(
            out_channels, kernel_size, stride, padding, dilation, scale=scale
        )
        self.Conv1dReluBn2 = Conv1dReluBn(
            out_channels, out_channels, kernel_size=1, stride=1, padding=0
        )
        self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)

        if in_channels == out_channels:
            self.shortcut = None
        else:
            self.shortcut = nn.Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
            )

    def forward(self, x):
        """Return the block output added to the (optionally projected) input."""
        residual = x if self.shortcut is None else self.shortcut(x)

        out = self.Conv1dReluBn1(x)
        out = self.Res2Conv1dReluBn(out)
        out = self.Conv1dReluBn2(out)
        out = self.SE_Connect(out)

        return out + residual
+
+
+""" Attentive weighted mean and standard deviation pooling.
+"""
+
+
class AttentiveStatsPool(nn.Module):
    """Attention-weighted mean and standard deviation over the last axis.

    The output concatenates the weighted mean and weighted std along dim 1,
    so it has ``2 * in_dim`` features per item.
    """

    def __init__(self, in_dim, attention_channels=128, global_context_att=False):
        super().__init__()
        self.global_context_att = global_context_att

        # Use Conv1d with stride == 1 rather than Linear,
        # then we don't need to transpose inputs.
        # With global context the attention input is [x, mean, std], i.e.
        # three times as many channels.
        attention_in = in_dim * 3 if global_context_att else in_dim
        self.linear1 = nn.Conv1d(
            attention_in, attention_channels, kernel_size=1
        )  # equals W and b in the paper
        self.linear2 = nn.Conv1d(
            attention_channels, in_dim, kernel_size=1
        )  # equals V and k in the paper

    def forward(self, x):
        """Return ``cat([weighted_mean, weighted_std], dim=1)``."""
        x_in = x
        if self.global_context_att:
            context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
            context_var = torch.var(x, dim=-1, keepdim=True)
            context_std = torch.sqrt(context_var + 1e-10).expand_as(x)
            x_in = torch.cat((x, context_mean, context_std), dim=1)

        # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
        scores = self.linear2(torch.tanh(self.linear1(x_in)))
        alpha = torch.softmax(scores, dim=2)

        mean = torch.sum(alpha * x, dim=2)
        second_moment = torch.sum(alpha * (x**2), dim=2)
        # Clamp before the square root: the difference can dip below zero
        # through rounding.
        std = torch.sqrt((second_moment - mean**2).clamp(min=1e-9))
        return torch.cat([mean, std], dim=1)
diff --git a/omnivoice/eval/models/utmos.py b/omnivoice/eval/models/utmos.py
new file mode 100644
index 0000000000000000000000000000000000000000..dca1d4ef50e9483b7913a36077b12336171080ba
--- /dev/null
+++ b/omnivoice/eval/models/utmos.py
@@ -0,0 +1,370 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+UTMOS strong model.
+Implementation from https://github.com/tarepan/SpeechMOS
+
+"""
+
+import math
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+
class UTMOS22Strong(nn.Module):
    """Saeki_2022 paper's `UTMOS strong learner` inference model
    (w/o Phoneme encoder)."""

    def __init__(self):
        """Build the SSL encoder, the two fixed embeddings, the BLSTM and the head."""

        super().__init__()  # pyright: ignore [reportUnknownMemberType]

        feat_ssl = 768
        feat_domain_emb = 128
        feat_judge_emb = 128
        feat_rnn_h = 512
        feat_proj_h = 2048
        feat_cat = feat_ssl + feat_domain_emb + feat_judge_emb

        # SSL/DataDomainEmb/JudgeIdEmb/BLSTM/Projection
        self.wav2vec2 = Wav2Vec2Model()
        # Both embeddings are created uninitialised and frozen
        # (requires_grad=False); nothing in this class fills them in.
        self.domain_emb = nn.Parameter(
            data=torch.empty(1, feat_domain_emb), requires_grad=False
        )
        self.judge_emb = nn.Parameter(
            data=torch.empty(1, feat_judge_emb), requires_grad=False
        )
        self.blstm = nn.LSTM(
            input_size=feat_cat,
            hidden_size=feat_rnn_h,
            batch_first=True,
            bidirectional=True,
        )
        self.projection = nn.Sequential(
            nn.Linear(feat_rnn_h * 2, feat_proj_h),
            nn.ReLU(),
            nn.Linear(feat_proj_h, 1),
        )

    def forward(self, wave: Tensor, sr: int) -> Tensor:  # pylint: disable=invalid-name
        """wave-to-score :: (B, T) -> (B,)

        ``sr`` is accepted but not used by this method.
        """

        # Feature extraction :: (B, T) -> (B, Frame, Feat)
        unit_series = self.wav2vec2(wave)
        batch, frames, _ = unit_series.size()

        # Broadcast the (1, Feat) embeddings to (B, Frame, Feat).
        domain_series = self.domain_emb.unsqueeze(1).expand(batch, frames, -1)
        judge_series = self.judge_emb.unsqueeze(1).expand(batch, frames, -1)

        # Concatenate along the feature axis :: -> (B, Frame, f1+f2+f3)
        cat_series = torch.cat([unit_series, domain_series, judge_series], dim=2)

        # Frame-scale score :: (B, Frame, Feat) -> (B, Frame, 1) - BLSTM/Projection
        feat_series, _ = self.blstm(cat_series)
        score_series = self.projection(feat_series)

        # Utterance-scale score :: (B, Frame, 1) -> (B,) - time averaging,
        # then the affine map ``2 * s + 3``.
        frame_mean = score_series.mean(dim=1).squeeze(1)
        return frame_mean * 2 + 3
+
+
class Wav2Vec2Model(nn.Module):
    """Wav2Vec2: convolutional feature encoder followed by a Transformer."""

    def __init__(self):
        super().__init__()  # pyright: ignore [reportUnknownMemberType]

        conv_dim, model_dim = 512, 768
        # Each entry is (channels, kernel, stride).
        layer_specs = [(conv_dim, 10, 5)]
        layer_specs += [(conv_dim, 3, 2)] * 4
        layer_specs += [(conv_dim, 2, 2)] * 2

        self.feature_extractor = ConvFeatureExtractionModel(
            conv_layers=layer_specs
        )  # pyright: ignore [reportGeneralTypeIssues]
        self.layer_norm = nn.LayerNorm(conv_dim)
        self.post_extract_proj = nn.Linear(conv_dim, model_dim)
        # Registered but not applied in ``forward``.
        self.dropout_input = nn.Dropout(0.1)
        self.encoder = TransformerEncoder(model_dim)

        # Remnants
        self.mask_emb = nn.Parameter(torch.FloatTensor(model_dim))

    def forward(self, source: Tensor):
        """FeatureEncoder + ContextTransformer"""

        # Feature encoding: conv stack, move features last, normalise, project.
        frames = self.feature_extractor(source).transpose(1, 2)
        frames = self.post_extract_proj(self.layer_norm(frames))

        # Context transformer
        return self.encoder(frames)
+
+
class ConvFeatureExtractionModel(nn.Module):
    """Feature Encoder: a stack of bias-free 1-D convolutions with GELU.

    Only the first layer carries a ``GroupNorm`` (one group per channel).
    """

    def __init__(self, conv_layers: List[Tuple[int, int, int]]):
        """Build one block per ``(channels, kernel, stride)`` entry.

        Args:
            conv_layers: Layer specs in order; the first block takes one
                input channel, each later block takes the previous block's
                channel count.
        """
        super().__init__()  # pyright: ignore [reportUnknownMemberType]

        def block(
            n_in: int, n_out: int, k: int, stride: int, is_group_norm: bool = False
        ):
            layers = [
                nn.Conv1d(n_in, n_out, k, stride=stride, bias=False),
                nn.Dropout(p=0.0),
            ]
            if is_group_norm:
                # Size the norm from this block's own output width rather
                # than from a variable of the enclosing loop.
                layers.append(nn.GroupNorm(n_out, n_out, affine=True))
            layers.append(nn.GELU())
            return nn.Sequential(*layers)

        in_d = 1
        self.conv_layers = nn.ModuleList()
        for i, (dim, k, stride) in enumerate(conv_layers):
            self.conv_layers.append(block(in_d, dim, k, stride, is_group_norm=i == 0))
            in_d = dim

    def forward(self, series: Tensor) -> Tensor:
        """:: (B, T) -> (B, Feat, Frame)"""

        series = series.unsqueeze(1)
        for conv in self.conv_layers:
            series = conv(series)

        return series
+
+
class TransformerEncoder(nn.Module):
    """Transformer: convolutional positional embedding + 12 post-LN layers."""

    def build_encoder_layer(self, feat: int):
        """Layer builder."""
        return TransformerSentenceEncoderLayer(
            embedding_dim=feat,
            ffn_embedding_dim=3072,
            num_attention_heads=12,
            activation_fn="gelu",
            dropout=0.1,
            attention_dropout=0.1,
            activation_dropout=0.0,
            layer_norm_first=False,
        )

    def __init__(self, feat: int):
        super().__init__()  # pyright: ignore [reportUnknownMemberType]

        self.required_seq_len_multiple = 2

        positional_conv = nn.utils.weight_norm(
            nn.Conv1d(feat, feat, kernel_size=128, padding=128 // 2, groups=16),
            name="weight",
            dim=2,
        )
        self.pos_conv = nn.Sequential(positional_conv, SamePad(128), nn.GELU())
        self.layer_norm = nn.LayerNorm(feat)
        self.layers = nn.ModuleList([self.build_encoder_layer(feat) for _ in range(12)])

    def forward(self, x: Tensor) -> Tensor:
        """Encode ``x`` :: (B, T, Feat) -> (B, T, Feat)."""

        # Add the convolutional positional embedding, then normalise.
        x = x + self.pos_conv(x.transpose(1, 2)).transpose(1, 2)
        x = self.layer_norm(x)

        # Pad the sequence-length dimension up to the required multiple and
        # flag the padded tail so attention ignores it.
        x, pad_length = pad_to_multiple(
            x, self.required_seq_len_multiple, dim=-2, value=0
        )
        padding_mask = None
        if pad_length > 0:
            padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool)
            padding_mask[:, -pad_length:] = True

        # :: (B, T, Feat) -> (T, B, Feat)
        hidden = x.transpose(0, 1)
        for layer in self.layers:
            hidden = layer(hidden, padding_mask)
        # :: (T, B, Feat) -> (B, T, Feat)
        x = hidden.transpose(0, 1)

        # undo padding
        return x[:, :-pad_length] if pad_length > 0 else x
+
+
class SamePad(nn.Module):
    """Tail inverse padding: drop the last element of the final axis.

    Only even kernel sizes are accepted; the size itself is not stored.
    """

    def __init__(self, kernel_size: int):
        super().__init__()  # pyright: ignore [reportUnknownMemberType]
        is_even = kernel_size % 2 == 0
        assert is_even, "`SamePad` now support only even kernel."

    def forward(self, x: Tensor) -> Tensor:
        """Return ``x`` without its last step along dim 2."""
        trimmed = x[:, :, :-1]
        return trimmed
+
+
def pad_to_multiple(
    x: Optional[Tensor], multiple: int, dim: int = -1, value: float = 0
) -> Tuple[Optional[Tensor], int]:
    """Tail-pad ``x`` along ``dim`` until its size is a multiple of ``multiple``.

    Args:
        x: Tensor to pad; ``None`` is returned unchanged.
        multiple: Required divisor of the padded size.
        dim: Negative axis index to pad (``-1`` is the last axis).
        value: Fill value for the padded positions.

    Returns:
        ``(tensor, n_padded)``. When no padding is needed the input object
        itself is returned together with ``0``.
    """
    if x is None:
        return None, 0

    size = x.size(dim)
    # Distance from ``size`` up to the next multiple.
    remainder = -size % multiple
    if remainder == 0:
        return x, 0

    # F.pad takes (left, right) pairs starting from the last axis, so skip
    # the axes that come after ``dim`` with zero padding.
    trailing_axes = -1 - dim
    pad_spec = (0, 0) * trailing_axes + (0, remainder)
    return F.pad(x, pad_spec, value=value), remainder
+
+
class TransformerSentenceEncoderLayer(nn.Module):
    """Transformer Encoder Layer used in BERT/XLM style pre-trained models.

    Post-LayerNorm variant only: self-attention and feed-forward sub-layers,
    each followed by a residual add and a LayerNorm.
    """

    def __init__(
        self,
        embedding_dim: int,
        ffn_embedding_dim: int,
        num_attention_heads: int,
        activation_fn: str,
        dropout: float,
        attention_dropout: float,
        activation_dropout: float,
        layer_norm_first: bool,
    ) -> None:
        """Build the sub-layers.

        ``activation_fn`` must be ``"gelu"`` and ``layer_norm_first`` must be
        ``False``; any other value fails an assertion.
        """
        super().__init__()  # pyright: ignore [reportUnknownMemberType]

        assert layer_norm_first is False, "`layer_norm_first` is fixed to `False`"
        assert activation_fn == "gelu", "`activation_fn` is fixed to `gelu`"

        self.self_attn = MultiheadAttention(
            embedding_dim, num_attention_heads, attention_dropout
        )
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(activation_dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.fc1 = nn.Linear(embedding_dim, ffn_embedding_dim)
        self.fc2 = nn.Linear(ffn_embedding_dim, embedding_dim)
        self.self_attn_layer_norm = nn.LayerNorm(embedding_dim)
        self.final_layer_norm = nn.LayerNorm(embedding_dim)

    def forward(self, x: Tensor, self_attn_padding_mask: Optional[Tensor]):
        """Run attention and feed-forward sub-layers; the output keeps x's shape."""
        # Res[Attn-Do]-LN
        attended = self.self_attn(x, x, x, self_attn_padding_mask)
        x = self.self_attn_layer_norm(x + self.dropout1(attended))

        # Res[SegFC-GELU-Do-SegFC-Do]-LN
        hidden = F.gelu(self.fc1(x))  # pyright: ignore [reportUnknownMemberType]
        hidden = self.fc2(self.dropout2(hidden))
        x = self.final_layer_norm(x + self.dropout3(hidden))

        return x
+
+
class MultiheadAttention(nn.Module):
    """Multi-headed attention with separate q/k/v projection layers.

    ``forward`` always calls the functional attention with
    ``training=False``, so the stored dropout probability is never applied.
    """

    def __init__(self, embed_dim: int, num_heads: int, dropout: float):
        super().__init__()  # pyright: ignore [reportUnknownMemberType]

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.p_dropout = dropout
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=True)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)

    def forward(
        self,
        query: Tensor,
        key: Tensor,
        value: Tensor,
        key_padding_mask: Optional[Tensor],
    ) -> Tensor:
        """
        Args:
            query :: (T, B, Feat)
            key_padding_mask :: (B, src_len) - mask to exclude keys that are pads
                , where padding elements are indicated by 1s.
        """
        # The functional API takes the three projection biases as one packed
        # vector in q, k, v order.
        packed_bias = torch.cat(
            (self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)
        )
        bool_mask = None if key_padding_mask is None else key_padding_mask.bool()

        attn_output, _ = F.multi_head_attention_forward(
            query=query,
            key=key,
            value=value,
            embed_dim_to_check=self.embed_dim,
            num_heads=self.num_heads,
            in_proj_weight=torch.empty([0]),
            in_proj_bias=packed_bias,
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=self.p_dropout,
            out_proj_weight=self.out_proj.weight,
            out_proj_bias=self.out_proj.bias,
            training=False,
            key_padding_mask=bool_mask,
            need_weights=False,
            use_separate_proj_weight=True,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
        )
        return attn_output
diff --git a/omnivoice/eval/mos/utmos.py b/omnivoice/eval/mos/utmos.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69d30ff45093d828c978dfd85a76aa61e7d31ae
--- /dev/null
+++ b/omnivoice/eval/mos/utmos.py
@@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Calculate UTMOS score with automatic Mean Opinion Score (MOS) prediction system
+"""
+import argparse
+import logging
+import multiprocessing as mp
+import os
+import sys
+import traceback
+import warnings
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from omnivoice.eval.models.utmos import UTMOS22Strong
+from omnivoice.eval.utils import load_eval_waveform
+from omnivoice.utils.data_utils import read_test_list
+
+# Suppress every Python warning emitted while this script runs.
+warnings.filterwarnings("ignore")
+
+# Per-process worker state. `worker_init` assigns these once in each pool
+# process; `run_utmos_worker` then reads them for every file it scores.
+worker_model = None
+worker_device = None
+# Sample rate passed to `load_eval_waveform` and to the model call.
+worker_sr = 16000
+
+
+def get_parser() -> argparse.ArgumentParser:
+    """Build the command-line parser for the UTMOS evaluation script.
+
+    Returns:
+        argparse.ArgumentParser: Parser with the required ``--wav-path``,
+        ``--test-list`` and ``--model-dir`` options plus the optional
+        ``--extension``, ``--decode-path`` and ``--nj-per-gpu`` options.
+    """
+    parser = argparse.ArgumentParser(
+        description="Calculate UTMOS score using UTMOS22Strong model."
+    )
+    parser.add_argument(
+        "--wav-path",
+        type=str,
+        required=True,
+        help="Path to the directory containing evaluated speech files.",
+    )
+    parser.add_argument(
+        "--test-list",
+        type=str,
+        required=True,
+        help="Path to the JSONL test list. Each line is a JSON object "
+        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
+    )
+    parser.add_argument(
+        "--model-dir",
+        type=str,
+        required=True,
+        # The fragments are joined by implicit concatenation, so each one needs
+        # its own separating space; otherwise the URL fuses with the next word.
+        help="Local path of our evaluation model repository. "
+        "Download from https://huggingface.co/k2-fsa/TTS_eval_models. "
+        "Will use 'tts_eval_models/mos/utmos22_strong_step7459_v1.pt'"
+        " in this script",
+    )
+    parser.add_argument(
+        "--extension",
+        type=str,
+        default="wav",
+        help="Extension of the speech files. Default: wav",
+    )
+    parser.add_argument(
+        "--decode-path",
+        type=str,
+        default=None,
+        help="Path to the output file where UTMOS information will be saved. "
+        "If not provided, results are only printed to console.",
+    )
+    parser.add_argument(
+        "--nj-per-gpu",
+        type=int,
+        default=1,
+        help="Number of worker processes to spawn per GPU.",
+    )
+    return parser
+
+
+def get_device(rank: int = 0) -> torch.device:
+    """Return the CUDA device with index ``rank`` and make it the current one.
+
+    Raises:
+        AssertionError: If CUDA is not available.
+    """
+    assert torch.cuda.is_available(), "CUDA is required but not available."
+    cuda_device = torch.device(f"cuda:{rank}")
+    # Also make this GPU the process-wide current CUDA device.
+    torch.cuda.set_device(rank)
+    return cuda_device
+
+
+def worker_init(
+    rank_queue,
+    model_path,
+):
+    """Initialize a pool worker: pick a GPU, then load UTMOS22Strong onto it.
+
+    Args:
+        rank_queue: Queue of GPU indices; one index is taken per worker.
+            When it is ``None``, GPU 0 is used.
+        model_path: Path to the UTMOS22Strong state dict.
+
+    Raises:
+        Exception: Whatever loading the checkpoint raised, re-raised after
+            being logged.
+    """
+    global worker_model, worker_device, worker_sr
+
+    # Limit CPU threads per worker
+    torch.set_num_threads(2)
+
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] [Worker %(process)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
+
+    # Without a queue, fall back to GPU 0 (get_device's own default). The
+    # earlier fallback of -1 produced the invalid device string "cuda:-1".
+    rank = rank_queue.get() if rank_queue is not None else 0
+
+    worker_device = get_device(rank)
+    worker_sr = 16000
+
+    logging.debug(f"Initializing UTMOS worker on {worker_device}")
+
+    # Initialize Model
+    worker_model = UTMOS22Strong()
+    try:
+        # Load weights to CPU first, then move to device
+        # NOTE(review): torch.load unpickles the file; only point --model-dir
+        # at checkpoints from a trusted source.
+        state_dict = torch.load(model_path, map_location="cpu")
+        worker_model.load_state_dict(state_dict)
+    except Exception as e:
+        logging.error(f"Failed to load model from {model_path}: {e}")
+        raise
+
+    worker_model.to(worker_device)
+    worker_model.eval()
+
+
+@torch.no_grad()
+def run_utmos_worker(file_idx, wav_path, language_name):
+    """Score one audio file with the worker's model.
+
+    Returns:
+        tuple: ``(file_idx, wav_path, language_name, payload, status)`` where
+        ``status`` is ``"success"`` (payload is the score as a float) or
+        ``"error"`` (payload is a message string). Exceptions are captured and
+        reported through the ``"error"`` form instead of propagating.
+    """
+    try:
+        if os.path.exists(wav_path):
+            # Load and preprocess waveform
+            waveform = load_eval_waveform(wav_path, worker_sr, device=worker_device)
+            # UTMOS expects input shape (Batch, Time)
+            prediction = worker_model(waveform.unsqueeze(0), worker_sr)
+            outcome = (
+                file_idx,
+                wav_path,
+                language_name,
+                prediction.item(),
+                "success",
+            )
+        else:
+            outcome = (
+                file_idx,
+                wav_path,
+                language_name,
+                f"File not found: {wav_path}",
+                "error",
+            )
+        return outcome
+
+    except Exception as e:
+        # Ship the full traceback back to the parent as part of the message.
+        error_detail = (
+            f"Error processing {wav_path}: {str(e)}\n"
+            f"Traceback:\n{traceback.format_exc()}"
+        )
+        return file_idx, wav_path, language_name, error_detail, "error"
+
+
+def main():
+    """Score every wav named in the test list with UTMOS and report averages.
+
+    Parses the command line, validates the wav directory and model file,
+    fans the files out to one process pool worker per (GPU, job) slot, then
+    logs per-language, macro-average and overall scores. When
+    ``--decode-path`` is given, the same information is written to that file.
+
+    Raises:
+        ValueError: If the test list cannot be read.
+        AssertionError: If no GPU is visible.
+        SystemExit: With status 1 on invalid inputs or when the pool fails.
+    """
+    parser = get_parser()
+    args = parser.parse_args()
+
+    # Main process thread setting
+    torch.set_num_threads(2)
+
+    mp.set_start_method("spawn", force=True)
+
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
+
+    # Validate inputs
+    if not os.path.isdir(args.wav_path):
+        logging.error(f"Invalid directory: {args.wav_path}")
+        sys.exit(1)
+
+    model_path = os.path.join(args.model_dir, "mos/utmos22_strong_step7459_v1.pt")
+    if not os.path.exists(model_path):
+        logging.error(f"Model file not found at {model_path}")
+        sys.exit(1)
+
+    logging.info(f"Calculating UTMOS for {args.wav_path}")
+
+    # Build the (wav path, language) work list from the test list.
+    wav_files = []
+    try:
+        samples = read_test_list(args.test_list)
+        for s in samples:
+            language_name = s.get("language_name") or "unknown"
+            eval_wav_path = os.path.join(args.wav_path, f"{s['id']}.{args.extension}")
+            wav_files.append((eval_wav_path, language_name))
+    except Exception as e:
+        # Chain the cause so the original traceback is preserved.
+        raise ValueError(f"Error reading test list {args.test_list}: {e}") from e
+
+    # Setup Parallel Processing
+    num_gpus = torch.cuda.device_count()
+    assert num_gpus > 0, "No GPU found. GPU is required."
+    total_procs = num_gpus * args.nj_per_gpu
+
+    logging.info(
+        f"Starting evaluation with {total_procs} processes on {num_gpus} GPUs."
+    )
+
+    manager = mp.Manager()
+    rank_queue = manager.Queue()
+
+    # One queue entry per worker; each worker takes its GPU index from here.
+    for rank in list(range(num_gpus)) * args.nj_per_gpu:
+        rank_queue.put(rank)
+
+    scores = []
+    lang_stats = {}
+
+    fout = None
+    if args.decode_path:
+        # A bare file name has an empty dirname, and os.makedirs("") raises.
+        out_dir = os.path.dirname(args.decode_path)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        fout = open(args.decode_path, "w", encoding="utf8")
+        logging.info(f"Saving detailed UTMOS results to: {args.decode_path}")
+        fout.write("Name\tUTMOS\n")
+
+    # The outer try/finally closes the output file on every exit path,
+    # including the sys.exit(1) taken when the pool fails.
+    try:
+        try:
+            with ProcessPoolExecutor(
+                max_workers=total_procs,
+                initializer=worker_init,
+                initargs=(
+                    rank_queue,
+                    model_path,
+                ),
+            ) as executor:
+                futures = [
+                    executor.submit(run_utmos_worker, i, wav_path, language_name)
+                    for i, (wav_path, language_name) in enumerate(wav_files)
+                ]
+
+                pbar = tqdm(
+                    as_completed(futures),
+                    total=len(wav_files),
+                    desc="Evaluating UTMOS",
+                )
+                for future in pbar:
+                    idx, path, language_name, result, status = future.result()
+                    if status != "success":
+                        pbar.write(f"!!! FAILED [File {idx}]: {path} | {result}")
+                        continue
+
+                    lang_stats.setdefault(language_name, []).append(result)
+                    scores.append(result)
+                    if fout:
+                        if language_name == "unknown":
+                            fout.write(f"{os.path.basename(path)}\t{result:.2f}\n")
+                        else:
+                            fout.write(
+                                f"{language_name}\t{os.path.basename(path)}\t{result:.2f}\n"
+                            )
+
+        except (Exception, KeyboardInterrupt) as e:
+            logging.critical(
+                f"An unrecoverable error occurred: {e}. Terminating all processes."
+            )
+            detailed_error_info = traceback.format_exc()
+            logging.error(f"--- DETAILED TRACEBACK ---\n{detailed_error_info}")
+            sys.exit(1)
+
+        print("-" * 50)
+
+        # Per-language breakdown, only when more than one language is present.
+        if len(lang_stats) > 1:
+            lang_means = []
+            for lang in sorted(lang_stats):
+                l_scores = lang_stats[lang]
+                l_avg = np.mean(l_scores)
+                lang_means.append(l_avg)
+                l_count = len(l_scores)
+                logging.info(f"[{lang}] UTMOS score: {l_avg:.3f} ({l_count} samples)")
+                if fout:
+                    fout.write(f"[{lang}] UTMOS: {l_avg:.3f} ({l_count} samples)\n")
+
+            # Macro average: mean of the per-language means, computed once.
+            macro_avg = np.mean(lang_means)
+            logging.info(
+                f"Macro-average UTMOS over {len(lang_stats)} languages: "
+                f"{macro_avg:.3f}"
+            )
+            if fout:
+                fout.write(
+                    f"\nMacro-average UTMOS over {len(lang_stats)} languages: "
+                    f"{macro_avg:.3f}\n"
+                )
+
+        if scores:
+            avg_score = np.mean(scores)
+            logging.info(f"Processed {len(scores)}/{len(wav_files)} files.")
+            logging.info(f"UTMOS score: {avg_score:.2f}")
+            if fout:
+                fout.write(f"\nAverage UTMOS: {avg_score:.2f}\n")
+        else:
+            logging.error("No valid scores computed.")
+        print("-" * 50)
+    finally:
+        if fout:
+            fout.close()
+
+if __name__ == "__main__":
+ main()
diff --git a/omnivoice/eval/speaker_similarity/sim.py b/omnivoice/eval/speaker_similarity/sim.py
new file mode 100644
index 0000000000000000000000000000000000000000..429b12807ab490fd281cee20fc2e122e94243ef6
--- /dev/null
+++ b/omnivoice/eval/speaker_similarity/sim.py
@@ -0,0 +1,321 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Computes speaker similarity (SIM-o) using a WavLM-based
+ ECAPA-TDNN speaker verification model.
+"""
+import argparse
+import logging
+import multiprocessing as mp
+import os
+import sys
+import traceback
+import warnings
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from omnivoice.eval.models.ecapa_tdnn_wavlm import ECAPA_TDNN_WAVLM
+from omnivoice.eval.utils import load_eval_waveform
+from omnivoice.utils.data_utils import read_test_list
+
+# Suppress every Python warning emitted while this script runs.
+warnings.filterwarnings("ignore")
+
+# Per-process worker state. `worker_init` assigns these once in each pool
+# process; `get_embedding` then reads them for every file it embeds.
+worker_model = None
+worker_device = None
+# Sample rate passed to `load_eval_waveform` and to the model constructor.
+worker_sr = 16000
+
+
+def get_parser() -> argparse.ArgumentParser:
+    """Build the command-line parser for the SIM-o evaluation script.
+
+    Returns:
+        argparse.ArgumentParser: Parser with the required ``--wav-path``,
+        ``--test-list`` and ``--model-dir`` options plus the optional
+        ``--extension``, ``--decode-path`` and ``--nj-per-gpu`` options.
+    """
+    parser = argparse.ArgumentParser(
+        description="Calculate speaker similarity (SIM-o) score."
+    )
+    parser.add_argument(
+        "--wav-path",
+        type=str,
+        required=True,
+        help="Path to the directory containing evaluated speech files.",
+    )
+    parser.add_argument(
+        "--test-list",
+        type=str,
+        required=True,
+        help="Path to the JSONL test list. Each line is a JSON object "
+        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
+    )
+    parser.add_argument(
+        "--model-dir",
+        type=str,
+        required=True,
+        # The fragments are joined by implicit concatenation, so each one needs
+        # its own separating space; otherwise adjacent words and paths fuse.
+        help="Local path of our evaluation model repository. "
+        "Download from https://huggingface.co/k2-fsa/TTS_eval_models. "
+        "Will use 'tts_eval_models/speaker_similarity/wavlm_large_finetune.pth' "
+        "and 'tts_eval_models/speaker_similarity/wavlm_large/' in this script",
+    )
+    parser.add_argument(
+        "--extension",
+        type=str,
+        default="wav",
+        help="Extension of the speech files.",
+    )
+    parser.add_argument(
+        "--decode-path",
+        type=str,
+        default=None,
+        help="Path to the output file where SIM-o information will be saved. "
+        "If not provided, results are only printed to console.",
+    )
+    parser.add_argument(
+        "--nj-per-gpu",
+        type=int,
+        default=1,
+        help="Number of worker processes to spawn per GPU.",
+    )
+    return parser
+
+
+def get_device(rank: int = 0) -> torch.device:
+    """Return the CUDA device with index ``rank`` and make it the current one.
+
+    Raises:
+        AssertionError: If CUDA is not available.
+    """
+    assert torch.cuda.is_available(), "CUDA is required but not available."
+    cuda_device = torch.device(f"cuda:{rank}")
+    # Also make this GPU the process-wide current CUDA device.
+    torch.cuda.set_device(rank)
+    return cuda_device
+
+
+def worker_init(
+    rank_queue,
+    sv_model_path,
+    ssl_model_path,
+):
+    """Initialize a pool worker: pick a GPU, then load the speaker model onto it.
+
+    Args:
+        rank_queue: Queue of GPU indices; one index is taken per worker.
+            When it is ``None``, GPU 0 is used.
+        sv_model_path: Checkpoint whose ``"model"`` entry holds the
+            speaker-verification weights.
+        ssl_model_path: Path forwarded to ``ECAPA_TDNN_WAVLM`` as
+            ``ssl_model_path``.
+    """
+    global worker_model, worker_device, worker_sr
+
+    torch.set_num_threads(2)
+
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] [Worker %(process)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
+
+    # Without a queue, fall back to GPU 0 (get_device's own default). The
+    # earlier fallback of -1 produced the invalid device string "cuda:-1".
+    rank = rank_queue.get() if rank_queue is not None else 0
+
+    worker_device = get_device(rank)
+    worker_sr = 16000
+
+    logging.debug(f"Initializing SIM-o worker on {worker_device}")
+    # Temporarily suppress INFO logs to hide verbose WavLM config
+    logging.disable(logging.INFO)
+
+    # Initialize Model
+    try:
+        worker_model = ECAPA_TDNN_WAVLM(
+            feat_dim=1024,
+            channels=512,
+            emb_dim=256,
+            sr=worker_sr,
+            ssl_model_path=ssl_model_path,
+        )
+        # The identity map_location keeps tensors on CPU while loading.
+        # NOTE(review): torch.load unpickles the file; only point --model-dir
+        # at checkpoints from a trusted source.
+        state_dict = torch.load(
+            sv_model_path, map_location=lambda storage, loc: storage
+        )
+        # strict=False: mismatched keys between checkpoint and model are tolerated.
+        worker_model.load_state_dict(state_dict["model"], strict=False)
+        worker_model.to(worker_device)
+        worker_model.eval()
+    finally:
+        # Restore normal logging
+        logging.disable(logging.NOTSET)
+
+
+@torch.no_grad()
+def get_embedding(wav_path: str) -> torch.Tensor:
+    """Run the worker's model on one audio file and return its output.
+
+    The waveform is loaded at ``worker_sr`` onto ``worker_device`` and capped
+    at 120 seconds; the model is called with a one-element list.
+    """
+    waveform = load_eval_waveform(
+        wav_path,
+        worker_sr,
+        device=worker_device,
+        max_seconds=120,
+    )
+    embedding = worker_model([waveform])
+    return embedding
+
+
+def run_similarity_worker(line_idx, sample, wav_dir, extension):
+    """Compute the cosine similarity between a prompt and its generated wav.
+
+    Returns:
+        tuple: One of
+        ``(line_idx, (ref_path, eval_path, language), similarity, "success")``,
+        ``(line_idx, "<not found message>", None, "error")`` when either file
+        is missing, or ``(line_idx, str(sample), error_detail, "error")`` when
+        an exception was raised.
+    """
+    try:
+        utt_id = sample["id"]
+        prompt_wav = sample["ref_audio"]
+        lang = sample.get("language_name") or "unknown"
+        generated_wav = os.path.join(wav_dir, f"{utt_id}.{extension}")
+
+        # Report missing inputs without touching the model.
+        if not os.path.exists(prompt_wav):
+            return line_idx, f"Reference not found: {prompt_wav}", None, "error"
+        if not os.path.exists(generated_wav):
+            return line_idx, f"Eval wav not found: {generated_wav}", None, "error"
+
+        # Compute embeddings pair-wise
+        prompt_emb = get_embedding(prompt_wav)
+        generated_emb = get_embedding(generated_wav)
+
+        # Cosine Similarity
+        score = torch.nn.functional.cosine_similarity(
+            prompt_emb, generated_emb, dim=-1
+        )
+        pair_info = (prompt_wav, generated_wav, lang)
+        return line_idx, pair_info, score.item(), "success"
+
+    except Exception as e:
+        # Ship the full traceback back to the parent as part of the message.
+        error_detail = f"Error: {str(e)}\nTraceback:\n{traceback.format_exc()}"
+        return line_idx, str(sample), error_detail, "error"
+
+
+def main():
+    """Compute SIM-o for every pair in the test list and report averages.
+
+    Parses the command line, checks that the model files exist, fans the
+    pairs out to one process pool worker per (GPU, job) slot, then logs
+    per-language, macro-average and overall similarity. When
+    ``--decode-path`` is given, the same information is written to that file.
+
+    Raises:
+        AssertionError: If no GPU is visible.
+        SystemExit: With status 1 when model files are missing or the pool fails.
+    """
+    parser = get_parser()
+    args = parser.parse_args()
+
+    # Main process thread setting
+    torch.set_num_threads(2)
+
+    mp.set_start_method("spawn", force=True)
+
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
+
+    # Prepare paths
+    sv_model_path = os.path.join(
+        args.model_dir, "speaker_similarity/wavlm_large_finetune.pth"
+    )
+    ssl_model_path = os.path.join(args.model_dir, "speaker_similarity/wavlm_large/")
+
+    if not os.path.exists(sv_model_path) or not os.path.exists(ssl_model_path):
+        logging.error("Model files not found. Please check --model-dir.")
+        sys.exit(1)
+
+    logging.info(f"Calculating SIM-o for {args.wav_path}")
+    # Read list
+    samples = read_test_list(args.test_list)
+
+    # Setup Parallel Processing
+    num_gpus = torch.cuda.device_count()
+    assert num_gpus > 0, "No GPU found. GPU is required."
+    total_procs = num_gpus * args.nj_per_gpu
+
+    logging.info(
+        f"Starting evaluation with {total_procs} processes " f"on {num_gpus} GPUs."
+    )
+
+    manager = mp.Manager()
+    rank_queue = manager.Queue()
+
+    # One queue entry per worker; each worker takes its GPU index from here.
+    for rank in list(range(num_gpus)) * args.nj_per_gpu:
+        rank_queue.put(rank)
+
+    scores = []
+    lang_stats = {}
+
+    fout = None
+    if args.decode_path:
+        # A bare file name has an empty dirname, and os.makedirs("") raises.
+        out_dir = os.path.dirname(args.decode_path)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        fout = open(args.decode_path, "w", encoding="utf8")
+        logging.info(f"Saving detailed SIM-o results to: {args.decode_path}")
+        fout.write("Prompt-path\tEval-path\tSIM-o\n")
+
+    # The outer try/finally closes the output file on every exit path,
+    # including the sys.exit(1) taken when the pool fails.
+    try:
+        try:
+            with ProcessPoolExecutor(
+                max_workers=total_procs,
+                initializer=worker_init,
+                initargs=(
+                    rank_queue,
+                    sv_model_path,
+                    ssl_model_path,
+                ),
+            ) as executor:
+                futures = [
+                    executor.submit(
+                        run_similarity_worker, i, sample, args.wav_path, args.extension
+                    )
+                    for i, sample in enumerate(samples)
+                ]
+
+                pbar = tqdm(
+                    as_completed(futures), total=len(samples), desc="Evaluating SIM-o"
+                )
+
+                for future in pbar:
+                    idx, context, result, status = future.result()
+                    if status != "success":
+                        pbar.write(
+                            f"!!! FAILED [Line {idx}]: {context} | Error: {result}"
+                        )
+                        continue
+
+                    prompt_path, eval_path, lang = context
+                    scores.append(result)
+
+                    # Accumulate per-language
+                    lang_stats.setdefault(lang, []).append(result)
+
+                    if fout:
+                        if lang == "unknown":
+                            fout.write(f"{prompt_path}\t{eval_path}\t{result:.2f}\n")
+                        else:
+                            fout.write(
+                                f"{lang}\t{prompt_path}\t{eval_path}\t{result:.2f}\n"
+                            )
+
+        except (Exception, KeyboardInterrupt) as e:
+            logging.critical(
+                f"An unrecoverable error occurred: {e}. " f"Terminating all processes."
+            )
+            detailed_error_info = traceback.format_exc()
+            logging.error(f"--- DETAILED TRACEBACK ---\n{detailed_error_info}")
+            sys.exit(1)
+
+        print("-" * 50)
+        # Per-language breakdown, only when more than one language is present.
+        if len(lang_stats) > 1:
+            lang_means = []
+            for lang in sorted(lang_stats):
+                l_scores = lang_stats[lang]
+                l_avg = np.mean(l_scores)
+                lang_means.append(l_avg)
+                l_count = len(l_scores)
+                logging.info(f"[{lang}] SIM-o score: {l_avg:.3f} ({l_count} pairs)")
+                if fout:
+                    fout.write(f"[{lang}] SIM-o: {l_avg:.3f} ({l_count} pairs)\n")
+
+            # Macro average: mean of the per-language means, computed once.
+            macro_avg = np.mean(lang_means)
+            logging.info(
+                f"Macro-average SIM-o over {len(lang_stats)} languages: "
+                f"{macro_avg:.3f}"
+            )
+            if fout:
+                fout.write(
+                    f"\nMacro-average SIM-o over {len(lang_stats)} languages: "
+                    f"{macro_avg:.3f}\n"
+                )
+
+        if scores:
+            avg_score = np.mean(scores)
+            logging.info(f"Processed {len(scores)}/{len(samples)} pairs.")
+            logging.info(f"SIM-o score: {avg_score:.3f}")
+            if fout:
+                fout.write(f"\nAverage SIM-o: {avg_score:.3f}\n")
+        else:
+            logging.error("No valid scores computed.")
+    finally:
+        if fout:
+            fout.close()
+    print("-" * 50)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/omnivoice/eval/utils.py b/omnivoice/eval/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..375d9636581ba33bcfa33e8b6f54c8dc8bdf11cc
--- /dev/null
+++ b/omnivoice/eval/utils.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Optional
+
+import soundfile as sf
+import torch
+import torchaudio
+
+
+def load_eval_waveform(
+    fname: str,
+    sample_rate: int,
+    dtype: str = "float32",
+    device: torch.device = torch.device("cpu"),
+    return_numpy: bool = False,
+    max_seconds: Optional[float] = None,
+) -> torch.Tensor:
+    """
+    Read an audio file as a mono waveform at the requested sample rate.
+
+    Args:
+        fname (str): Path to the audio file.
+        sample_rate (int): Target sample rate; audio at another rate is resampled.
+        dtype (str, optional): Data type passed to ``soundfile.read``
+            (default: "float32").
+        device (torch.device, optional): Device the returned tensor is moved to
+            (default: CPU). Not used when ``return_numpy`` is True.
+        return_numpy (bool): If True, return the NumPy array instead of a
+            PyTorch tensor.
+        max_seconds (float): Maximum length in seconds. Longer audio is
+            truncated and a warning is logged.
+
+    Returns:
+        torch.Tensor: The waveform on ``device`` (or a NumPy array when
+        ``return_numpy`` is True).
+
+    Notes:
+        - Two-dimensional audio is reduced to mono by averaging over axis 1.
+        - Truncation is applied after resampling, so the limit is counted in
+          target-rate samples.
+    """
+    samples, native_sr = sf.read(fname, dtype=dtype)
+
+    # Multi-channel data arrives as a 2-D array; average the channels.
+    if samples.ndim == 2:
+        samples = samples.mean(1)
+
+    # Resample to target sample rate if needed
+    if native_sr != sample_rate:
+        resampled = torchaudio.functional.resample(
+            torch.from_numpy(samples), orig_freq=native_sr, new_freq=sample_rate
+        )
+        samples = resampled.numpy()
+
+    if max_seconds is not None:
+        # Trim to max length
+        sample_limit = int(sample_rate * max_seconds)
+        if len(samples) > sample_limit:
+            samples = samples[:sample_limit]
+            logging.warning(
+                f"Wav file {fname} is longer than {max_seconds}s, "
+                f"truncated to {max_seconds}s to avoid OOM."
+            )
+
+    if return_numpy:
+        return samples
+    return torch.from_numpy(samples).to(device)
diff --git a/omnivoice/eval/wer/common.py b/omnivoice/eval/wer/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..c081fbd970f73e413bea6f7446a74dad2b45c849
--- /dev/null
+++ b/omnivoice/eval/wer/common.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Shared utilities for WER evaluation scripts.
+"""
+import logging
+
+import numpy as np
+from jiwer import compute_measures
+
+
+def process_one(hypothesis: str, truth: str, post_process, lang: str = None) -> dict:
+    """
+    Normalize one hypothesis/truth pair and compute its WER statistics.
+
+    Args:
+        hypothesis (str): The transcribed text from the ASR model.
+        truth (str): The ground truth transcript.
+        post_process (callable): Text normalization function defined by each script.
+            Called as post_process(text, lang) when ``lang`` is given,
+            otherwise as post_process(text).
+        lang (str): The language code for post_process. Pass None if post_process
+            does not accept a lang argument.
+
+    Returns:
+        dict: A dict containing:
+            - truth (str): Post-processed ground truth text.
+            - hypo (str): Post-processed hypothesis text.
+            - wer (float): Word Error Rate.
+            - substitutions (int): Number of substitutions.
+            - deletions (int): Number of deletions.
+            - insertions (int): Number of insertions.
+            - word_num (int): Number of space-separated pieces in the
+              post-processed ground truth.
+    """
+    # Forward `lang` to the normalizer only when the caller supplied one.
+    extra_args = () if lang is None else (lang,)
+    truth_processed = post_process(truth, *extra_args)
+    hypothesis_processed = post_process(hypothesis, *extra_args)
+
+    measures = compute_measures(truth_processed, hypothesis_processed)
+
+    result = {
+        "truth": truth_processed,
+        "hypo": hypothesis_processed,
+    }
+    for key in ("wer", "substitutions", "deletions", "insertions"):
+        result[key] = measures[key]
+    result["word_num"] = len(truth_processed.split(" "))
+    return result
+
+
+def log_metrics(fout, prefix, i_list, d_list, s_list, w_total, ndigits=2):
+    """Log (and optionally write) the aggregate WER for a set of results.
+
+    Args:
+        fout: Writable file object, or a falsy value to skip file output.
+        prefix: Label placed at the start of each emitted line.
+        i_list, d_list, s_list: Per-sample insertion, deletion and
+            substitution counts.
+        w_total: Total number of reference words (the WER denominator).
+        ndigits: Number of decimals the WER percentage is rounded to.
+
+    Returns:
+        The WER as a percentage, rounded to ``ndigits``.
+    """
+    total_ins = np.sum(i_list)
+    total_del = np.sum(d_list)
+    total_sub = np.sum(s_list)
+    wer_percent = round((total_sub + total_del + total_ins) / w_total * 100, ndigits)
+
+    wer_line = f"{prefix} WER: {wer_percent}%"
+    error_line = (
+        f"{prefix} Errors: {total_ins} ins, {total_del} del, "
+        f"{total_sub} sub / {w_total} words"
+    )
+
+    logging.info(wer_line)
+    logging.info(error_line)
+    if fout:
+        fout.write(f"{wer_line}\n")
+        fout.write(f"{error_line}\n")
+    return wer_percent
diff --git a/omnivoice/eval/wer/fleurs.py b/omnivoice/eval/wer/fleurs.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1899afc9dba7759e1c56746e79198a066854fd1
--- /dev/null
+++ b/omnivoice/eval/wer/fleurs.py
@@ -0,0 +1,517 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Computes word error rate (WER) for FLEURS multilingual evaluation.
+
+Uses omnilingual-asr for ASR transcription across 100+ languages.
+Requires a separate environment with ``omnilingual_asr`` installed.
+
+Usage:
+ python3 omnivoice/eval/wer/fleurs.py \\
+ --wav-path results/fleurs \\
+ --test-list test.jsonl \\
+ --decode-path results/fleurs.wer.log \\
+ --model-card omniASR_LLM_Unlimited_7B_v2 \\
+ --chunk-size 100 --batch-size 50
+"""
+import argparse
+import logging
+import multiprocessing as mp
+import os
+import re
+import sys
+import traceback
+import types
+from collections import defaultdict
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Union
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+try:
+    from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline
+    from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs
+except ImportError:
+    logging.error("Please install omnilingual_asr first.")
+    # Use sys.exit: the bare `exit` name is injected by the `site` module for
+    # interactive use and is not guaranteed to exist (e.g. under `python -S`).
+    sys.exit(1)
+
+# omnilingual-asr may pull a transformers version that lacks
+# HiggsAudioV2TokenizerModel. Pre-register stubs to bypass
+# omnivoice/__init__.py heavy imports.
+#
+# Each stub is an empty module object inserted into sys.modules whose
+# __path__ points at the matching directory on disk, so the
+# `from omnivoice... import ...` statements that follow can still locate
+# their submodules. Nothing is done when "omnivoice" is already imported.
+if "omnivoice" not in sys.modules:
+ # Two directories above this file's directory.
+ _root = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", ".."))
+ for _name in (
+ "omnivoice",
+ "omnivoice.eval",
+ "omnivoice.eval.wer",
+ "omnivoice.utils",
+ ):
+ if _name not in sys.modules:
+ _m = types.ModuleType(_name)
+ # Drop the leading "omnivoice" component; the rest is relative to _root.
+ _m.__path__ = [os.path.join(_root, *_name.split(".")[1:])]
+ _m.__package__ = _name
+ sys.modules[_name] = _m
+
+from omnivoice.eval.wer.common import log_metrics, process_one
+from omnivoice.eval.wer.text_norm_omni import text_normalize
+from omnivoice.utils.data_utils import read_test_list
+
+# --- Global variables for worker processes ---
+# Assigned by `process_init` in each pool process; `run_eval_worker` reads
+# `worker_pipe` to transcribe its chunk.
+worker_pipe = None
+worker_device = None
+
+
+# fix mismatched language codes between OmniVoice and Omnilingual-ASR model
+# NOTE(review): keys look like OmniVoice-side ids and values like the ids the
+# ASR model expects — confirm against the code that consumes this table.
+rename = {
+ "et": "ekk",
+ "ms": "zsm",
+ "sw": "swh",
+ "npi": "nep",
+}
+
+
+def read_language_mapping_from_tsv(
+    mapping_path: Path,
+) -> dict[str, Union[str, List[str]]]:
+    """Load the ISO 639-3 id -> mixed id table from a tab-separated file.
+
+    The first line is treated as a header and skipped. Every other non-blank
+    line must have exactly four tab-separated columns:
+    ``mixed_id, language_name, iso_639_3_id, duration``.
+
+    Args:
+        mapping_path: Path to the TSV file.
+
+    Returns:
+        Dict mapping each ``iso_639_3_id`` to its ``mixed_id``. When an id
+        occurs more than once, the last row wins.
+
+    Raises:
+        ValueError: If a non-blank data line does not have four columns.
+    """
+    language_mapping = {}
+    with open(mapping_path, "r", encoding="utf-8") as f:
+        _ = f.readline()  # Skip header
+        # Data starts on line 2 of the file; the number is only used in errors.
+        for line_no, line in enumerate(f, start=2):
+            row = line.strip()
+            if not row:
+                # Blank lines (e.g. a trailing newline) carry no mapping.
+                continue
+            parts = row.split("\t")
+            if len(parts) != 4:
+                raise ValueError(
+                    f"{mapping_path}:{line_no}: expected 4 tab-separated "
+                    f"columns, got {len(parts)}"
+                )
+            mixed_id, _language_name, iso_639_3_id, _duration = parts
+            language_mapping[iso_639_3_id] = mixed_id
+    return language_mapping
+
+
+# ISO 639-3 id -> mixed id, loaded at import time from
+# docs/lang_id_name_map.tsv, resolved three directories above this file.
+iso_639_3_id_to_mixed_id = read_language_mapping_from_tsv(
+ Path(f"{os.path.dirname(__file__)}/../../../docs/lang_id_name_map.tsv")
+)
+
+# Lookup from a language id to the language tag listed in `supported_langs`.
+mixed_id_to_omnilingual_asr_lang = {}
+
+for lang in supported_langs:
+ # "cmn_Hant" is deliberately left out of the table.
+ if lang in ("cmn_Hant",):
+ continue
+ # The part before the first "_" is used as the ISO 639-3 code.
+ iso_639_3_lang_code = lang.split("_")[0]
+ if iso_639_3_lang_code in iso_639_3_id_to_mixed_id:
+ # Known in the TSV: key the entry by its mixed id.
+ mixed_id = iso_639_3_id_to_mixed_id[iso_639_3_lang_code]
+ mixed_id_to_omnilingual_asr_lang[mixed_id] = lang
+ else:
+ # Not in the TSV: key the entry by the ISO code itself.
+ mixed_id_to_omnilingual_asr_lang[iso_639_3_lang_code] = lang
+
+
+def clean_cjk_spaces(text):
+    """
+    Strip whitespace that touches a Chinese/Japanese character, then collapse
+    every remaining whitespace run to one space and trim the ends.
+
+    Spaces between non-CJK tokens (English, Korean, ...) are preserved.
+    """
+
+    # Character-class body covering:
+    # \u4e00-\u9fff: CJK Unified Ideographs (Chinese)
+    # \u3040-\u309f: Hiragana (Japanese)
+    # \u30a0-\u30ff: Katakana (Japanese)
+    # \u3000-\u303f: CJK Symbols and Punctuation
+    cjk_range = r"\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f"
+
+    # Substitutions are applied strictly in this order.
+    rewrite_rules = (
+        # CJK <space> CJK, e.g. "我 爱 你" -> "我爱你"
+        (f"([{cjk_range}])\\s+([{cjk_range}])", r"\1\2"),
+        # CJK followed by whitespace, e.g. "我 爱 you" -> "我爱you"
+        (f"([{cjk_range}])\\s+", r"\1"),
+        # whitespace followed by CJK
+        (f"\\s+([{cjk_range}])", r"\1"),
+        # whatever whitespace is left becomes a single space
+        (r"\s+", " "),
+    )
+    for pattern, replacement in rewrite_rules:
+        text = re.sub(pattern, replacement, text)
+
+    return text.strip()
+
+
+def get_parser():
+    """Build the command-line parser for the FLEURS WER evaluation script.
+
+    Returns:
+        argparse.ArgumentParser: Parser whose only required option is
+        ``--wav-path``; defaults are shown in ``--help`` through
+        ``ArgumentDefaultsHelpFormatter``.
+    """
+    parser = argparse.ArgumentParser(
+        # This script transcribes with the OmniASR pipeline, not Whisper.
+        description="Computes WER with Omnilingual-ASR.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    parser.add_argument(
+        "--wav-path",
+        type=str,
+        required=True,
+        help="Path to the directory containing speech files.",
+    )
+
+    parser.add_argument(
+        "--extension",
+        type=str,
+        default="wav",
+        help="Extension of the speech files. Default: wav",
+    )
+
+    parser.add_argument(
+        "--decode-path",
+        type=str,
+        default=None,
+        help="Path to the output file where WER information will be saved. "
+        "If not provided, results are only printed to console.",
+    )
+    parser.add_argument(
+        "--model-card",
+        type=str,
+        default="omniASR_LLM_7B",
+        help="Model card name for OmniASR (e.g., omniASR_LLM_7B) or local path.",
+    )
+    parser.add_argument(
+        "--test-list",
+        type=str,
+        default="test.jsonl",
+        help="path of the JSONL test list. Each line is a JSON object "
+        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
+    )
+    parser.add_argument(
+        "--lang",
+        type=str,
+        default=None,
+        help="""Language code to evaluate (e.g., 'en' for English, 'zh' for Chinese).
+        If not provided, the script will evaluate all languages found in the test list.
+        If specified, only samples of the given language will be evaluated.
+        """,
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=8,
+        # The value is forwarded to the OmniASR pipeline's transcribe() call.
+        help="Batch size for decoding with the OmniASR pipeline.",
+    )
+    parser.add_argument(
+        "--nj-per-gpu", type=int, default=1, help="Number of workers per GPU."
+    )
+    parser.add_argument(
+        "--chunk-size",
+        type=int,
+        default=300,
+        help="Number of samples per task chunk sent to workers.",
+    )
+    return parser
+
+
+def load_omni_model(model_card, device):
+    """Create an ``ASRInferencePipeline`` for ``model_card`` on ``device``.
+
+    Returns:
+        The pipeline, or ``None`` when construction raised; the failure is
+        logged rather than propagated.
+    """
+    logging.info(f"Loading OmniASR model ({model_card}) on {device}...")
+    try:
+        # The device is passed to the pipeline as a string.
+        asr_pipeline = ASRInferencePipeline(model_card=model_card, device=str(device))
+    except Exception as e:
+        logging.error(f"Failed to load OmniASR pipeline: {e}")
+        return None
+    return asr_pipeline
+
+
+def process_init(rank_queue, model_card):
+    """
+    Pool initializer: claim a GPU index from ``rank_queue`` and load the ASR
+    pipeline for ``model_card`` onto that GPU.
+
+    Raises:
+        RuntimeError: If no rank could be taken from the queue within 10
+            seconds, or if the pipeline could not be loaded.
+        AssertionError: If CUDA is not available.
+    """
+    global worker_pipe, worker_device
+
+    # Configure threads constraint
+    torch.set_num_threads(2)
+
+    try:
+        gpu_index = rank_queue.get(timeout=10)
+    except Exception:
+        raise RuntimeError("Failed to get GPU rank from queue.")
+
+    assert torch.cuda.is_available(), "CUDA is required but not available."
+    worker_device = torch.device(f"cuda:{gpu_index}")
+    torch.cuda.set_device(gpu_index)
+
+    logging.info(f"Initializing worker on device: {worker_device}")
+
+    try:
+        # load_omni_model signals failure by returning None; turn that into
+        # an exception so the failure is logged and propagated below.
+        worker_pipe = load_omni_model(model_card, worker_device)
+        if worker_pipe is None:
+            raise RuntimeError("Model loading failed.")
+    except Exception as e:
+        logging.critical(f"Failed to load model on {worker_device}: {e}")
+        raise
+
+
+def post_process(text: str, lang: str) -> str:
+    """
+    Normalize text and split it into space-separated characters.
+
+    Args:
+        text (str): The input text to be processed.
+        lang (str): Language tag; its first three characters are passed to
+            ``text_normalize`` as ``iso_code`` (e.g. 'eng' from 'eng_Latn').
+
+    Returns:
+        str: The normalized text with every character separated by a single
+        space, and original spaces represented by "|".
+    """
+    iso_code = lang[:3]
+    normalized = text_normalize(
+        text,
+        iso_code=iso_code,
+        lower_case=True,
+        remove_numbers=False,
+        remove_brackets=False,
+    )
+    compact = clean_cjk_spaces(normalized)
+    # "|" stands in for word boundaries so they remain visible as their own
+    # token once every character is spaced out.
+    return " ".join(compact.replace(" ", "|"))
+
+
+def run_eval_worker(data_chunk, language, batch_size):
+    """Transcribe one chunk with the worker's pipeline and score every item.
+
+    Relies on the module-level ``worker_pipe`` set up by ``process_init``.
+
+    Args:
+        data_chunk: List of dicts with ``wav_path`` and ``truth_text`` and,
+            optionally, ``lang_id`` / ``lang_name``.
+        language: Fallback language for items without ``lang_id``; also shown
+            in the error log.
+        batch_size: Batch size forwarded to ``worker_pipe.transcribe``.
+
+    Returns:
+        One metrics dict per item (the ``process_one`` result plus
+        ``wav_path`` and ``lang_name``), or ``[]`` when the pipeline is missing
+        or anything in the chunk raised.
+    """
+    global worker_pipe
+    if worker_pipe is None:
+        logging.error("Worker pipeline is not initialized!")
+        return []
+
+    scored = []
+    try:
+        wav_files = [entry["wav_path"] for entry in data_chunk]
+        # One language tag per file; fall back to the chunk-level language.
+        languages = [entry.get("lang_id", language) for entry in data_chunk]
+
+        hypotheses = worker_pipe.transcribe(
+            wav_files, lang=languages, batch_size=batch_size
+        )
+
+        # Hypotheses are matched to the inputs by position.
+        for idx, hypo_text in enumerate(hypotheses):
+            entry = data_chunk[idx]
+            metrics = process_one(
+                hypo_text, entry["truth_text"], post_process, entry.get("lang_id")
+            )
+            metrics["wav_path"] = entry["wav_path"]
+            metrics["lang_name"] = entry.get("lang_name")
+            scored.append(metrics)
+    except Exception:
+        logging.error(
+            f"Worker failed on chunk (Lang: {language}):\n{traceback.format_exc()}"
+        )
+        return []
+
+    return scored
+
+
+def main():
+ """Run WER evaluation over a JSONL test list using a pool of GPU workers.
+
+ Reads the test list, groups the samples whose audio file exists by language
+ name, splits each language into chunks, scores the chunks with
+ ``run_eval_worker`` in a ``ProcessPoolExecutor`` and logs per-language,
+ macro-average and overall metrics. When ``--decode-path`` is given, the
+ per-file results and the summaries are also written to that file.
+ """
+ parser = get_parser()
+ args = parser.parse_args()
+
+ logging.basicConfig(
+ format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+ level=logging.INFO,
+ force=True,
+ )
+
+ # 1. Prepare Data
+ logging.info("Reading test list...")
+ data_by_lang = defaultdict(list)
+ total_files = 0
+ wav_root = Path(args.wav_path)
+
+ samples = read_test_list(args.test_list)
+ for s in samples:
+ wav_path = str(wav_root / f"{s['id']}.{args.extension}")
+ if not os.path.exists(wav_path):
+ logging.warning(f"File missing: {wav_path}")
+ continue
+
+ lang_id = s.get("language_id") or "unknown"
+ # NOTE(review): both lookups below index mixed_id_to_omnilingual_asr_lang
+ # directly; if it is a plain dict, an id that is not in the table
+ # (possibly including "unknown") raises KeyError. They also run before the
+ # --lang filter further down, so samples that would be filtered out can
+ # still trigger it — confirm this is intended.
+ if lang_id in rename:
+ lang_id = mixed_id_to_omnilingual_asr_lang[rename[lang_id]]
+ else:
+ lang_id = mixed_id_to_omnilingual_asr_lang[lang_id]
+ item = {
+ "wav_path": wav_path,
+ "truth_text": s["text"],
+ "lang_id": lang_id,
+ "lang_name": s.get("language_name") or "unknown",
+ }
+ # The filter compares the raw language_id from the test list, not the
+ # mapped lang_id stored in the item.
+ if args.lang and s.get("language_id") != args.lang:
+ continue
+
+ data_by_lang[s.get("language_name") or "unknown"].append(item)
+
+ total_files += 1
+
+ logging.info(f"Total files: {total_files} in {len(data_by_lang)} languages.")
+
+ # 2. Worker config
+ num_gpus = torch.cuda.device_count()
+ assert num_gpus > 0, "No GPU found. GPU is required."
+ total_workers = num_gpus * args.nj_per_gpu
+
+ mp.set_start_method("spawn", force=True)
+ manager = mp.Manager()
+ rank_queue = manager.Queue()
+
+ # One queue entry per worker: every GPU index is enqueued nj_per_gpu times
+ # and process_init() takes one entry per worker process.
+ for _ in range(args.nj_per_gpu):
+ for rank in range(num_gpus):
+ rank_queue.put(rank)
+
+ # 3. Scheduling: Split languages into chunks
+ # This prevents one huge language from blocking a worker for too long,
+ # allows better load balancing across the pool.
+ tasks = []
+ chunk_size = args.chunk_size
+
+ for lang_name, items in data_by_lang.items():
+ # Slicing the list into chunks
+ for i in range(0, len(items), chunk_size):
+ chunk = items[i : i + chunk_size]
+ # "lang" carries the language *name*; each item keeps its own lang_id.
+ tasks.append({"chunk": chunk, "lang": lang_name})
+
+ logging.info(
+ f"Split data into {len(tasks)} chunks (size ~{chunk_size}). Spawning {total_workers} workers."
+ )
+
+ # 4. Execution
+ results = []
+
+ with ProcessPoolExecutor(
+ max_workers=total_workers,
+ initializer=process_init,
+ initargs=(rank_queue, args.model_card),
+ ) as executor:
+
+ futures = []
+ for task in tasks:
+ futures.append(
+ executor.submit(
+ run_eval_worker, task["chunk"], task["lang"], args.batch_size
+ )
+ )
+
+ # Unified progress bar
+ with tqdm(total=total_files, desc="Eval Progress", dynamic_ncols=True) as pbar:
+ for future in as_completed(futures):
+ try:
+ chunk_metrics = future.result()
+ results.extend(chunk_metrics)
+ # Advances by the number of items scored, so failed chunks
+ # (which return []) leave the bar short of its total.
+ pbar.update(len(chunk_metrics))
+ except Exception as e:
+ logging.error(f"Task failed: {e}")
+
+ # 5. Metrics Aggregation
+ # NOTE(review): wers is filled below but never read again in this function.
+ wers, inses, deles, subses = [], [], [], []
+ word_nums = 0
+
+ # Store metrics per language
+ lang_stats = {}
+
+ fout = None
+ if args.decode_path:
+ # NOTE(review): os.path.dirname() returns "" for a bare filename such as
+ # "wer.txt", and os.makedirs("") raises FileNotFoundError — verify that
+ # callers always pass a path with a directory component.
+ os.makedirs(os.path.dirname(args.decode_path), exist_ok=True)
+ logging.info(f"Saving detailed WER results to: {args.decode_path}")
+ fout = open(args.decode_path, "w", encoding="utf-8")
+
+ for res in results:
+ wers.append(float(res["wer"]))
+ inses.append(float(res["insertions"]))
+ deles.append(float(res["deletions"]))
+ subses.append(float(res["substitutions"]))
+ word_nums += res["word_num"]
+
+ if fout:
+ # One tab-separated row per file.
+ fout.write(
+ f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
+ f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
+ f"{res['substitutions']}\n"
+ )
+ lang_name = res["lang_name"]
+
+ # Per language stats
+ if lang_name not in lang_stats:
+ lang_stats[lang_name] = {
+ "inses": [],
+ "deles": [],
+ "subses": [],
+ "word_nums": 0,
+ }
+ lang_stats[lang_name]["inses"].append(float(res["insertions"]))
+ lang_stats[lang_name]["deles"].append(float(res["deletions"]))
+ lang_stats[lang_name]["subses"].append(float(res["substitutions"]))
+ lang_stats[lang_name]["word_nums"] += res["word_num"]
+
+ print("-" * 50)
+ # Log per-language stats
+ per_lang_wers = []
+ for lang in sorted(lang_stats.keys()):
+ stats = lang_stats[lang]
+ # Languages without any reference words are left out of the report.
+ if stats["word_nums"] > 0:
+ lang_wer = log_metrics(
+ fout,
+ f"[{lang}]",
+ stats["inses"],
+ stats["deles"],
+ stats["subses"],
+ stats["word_nums"],
+ )
+ per_lang_wers.append(lang_wer)
+ print("-" * 50)
+
+ # Log Macro-average WER
+ if len(per_lang_wers) > 1:
+ macro_wer = np.mean(per_lang_wers)
+ logging.info(
+ f"Macro-average WER over {len(per_lang_wers)} languages: {macro_wer:.2f}%"
+ )
+ if fout:
+ fout.write(
+ f"Macro-average WER over {len(per_lang_wers)} languages: {macro_wer:.2f}%\n"
+ )
+ # Count the languages at or under the 5% / 10% / 20% error thresholds.
+ # NOTE(review): indentation is flattened in this patch view, so it is not
+ # visible here whether these counts sit inside the
+ # `len(per_lang_wers) > 1` branch — confirm against the real file.
+ count_le_5 = sum(1 for w in per_lang_wers if w <= 5.0)
+ count_le_10 = sum(1 for w in per_lang_wers if w <= 10.0)
+ count_le_20 = sum(1 for w in per_lang_wers if w <= 20.0)
+
+ stats_msg = (
+ f"Languages with WER/CER <= 5%: {count_le_5}/{len(per_lang_wers)}\n"
+ f"Languages with WER/CER <= 10%: {count_le_10}/{len(per_lang_wers)}\n"
+ f"Languages with WER/CER <= 20%: {count_le_20}/{len(per_lang_wers)}"
+ )
+
+ logging.info("\n" + stats_msg)
+ if fout:
+ fout.write(stats_msg + "\n")
+
+ # Log overall stats
+ if word_nums > 0:
+ log_metrics(fout, "Overall", inses, deles, subses, word_nums)
+
+ # NOTE(review): the handle is closed only on this path; an exception raised
+ # after open() leaves it to the interpreter to close.
+ if fout:
+ fout.close()
+
+
+# Script entry point.
+if __name__ == "__main__":
+ main()
diff --git a/omnivoice/eval/wer/hubert.py b/omnivoice/eval/wer/hubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..48706bb215641e360b8fd49ffca0e23e1a82134f
--- /dev/null
+++ b/omnivoice/eval/wer/hubert.py
@@ -0,0 +1,318 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Computes word error rate (WER) with Hubert models for LibriSpeech test sets.
+"""
+import argparse
+import logging
+import multiprocessing as mp
+import os
+import re
+import traceback
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from omnivoice.eval.utils import load_eval_waveform
+from omnivoice.eval.wer.common import process_one
+from omnivoice.utils.data_utils import read_test_list
+
+# --- Global variables for worker processes ---
+# Both start as None and are assigned by process_init() in each worker.
+# worker_pipe: the ASR pipeline returned by load_hubert_model().
+worker_pipe = None
+# worker_device: the torch.device ("cuda:<rank>") this worker is bound to.
+worker_device = None
+
+
+def get_parser():
+    """Build the command-line parser for the Hubert WER script.
+
+    Returns:
+        argparse.ArgumentParser: Parser with ``--wav-path`` and ``--model-dir``
+        required, plus ``--extension``, ``--decode-path``, ``--test-list``,
+        ``--batch-size`` and ``--nj-per-gpu`` with defaults.
+    """
+    parser = argparse.ArgumentParser(
+        description="Computes WER with Hubert-based ASR model.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--wav-path",
+        type=str,
+        required=True,
+        help="Path to the directory containing speech files.",
+    )
+    parser.add_argument(
+        "--extension",
+        type=str,
+        default="wav",
+        help="Extension of the speech files. Default: wav",
+    )
+    parser.add_argument(
+        "--decode-path",
+        type=str,
+        default=None,
+        help="Path to the output file where WER information will be saved. "
+        "If not provided, results are only printed to console.",
+    )
+    parser.add_argument(
+        "--model-dir",
+        type=str,
+        required=True,
+        # Adjacent literals are concatenated verbatim, so each sentence needs
+        # its own trailing space; without it the URL runs into the next word.
+        help="Local path of our evaluation model repository. "
+        "Download from https://huggingface.co/k2-fsa/TTS_eval_models. "
+        "Will use 'tts_eval_models/wer/hubert-large-ls960-ft/'"
+        " in this script",
+    )
+    parser.add_argument(
+        "--test-list",
+        type=str,
+        default="transcript.jsonl",
+        help="path of the JSONL test list. Each line is a JSON object "
+        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=16,
+        help="Batch size for decoding with the Hugging Face pipeline.",
+    )
+    parser.add_argument(
+        "--nj-per-gpu", type=int, default=1, help="Number of workers per GPU."
+    )
+    return parser
+
+
+def process_init(rank_queue, model_dir):
+    """Per-process initializer: bind to one GPU and load the Hubert pipeline.
+
+    Args:
+        rank_queue: Queue holding GPU indices; one entry is consumed here.
+        model_dir: Root of the evaluation-model checkout, forwarded to
+            ``load_hubert_model``.
+
+    Raises:
+        RuntimeError: If no rank arrives within 10 seconds, or if the model
+            could not be loaded.
+        AssertionError: If CUDA is not available.
+    """
+    global worker_pipe, worker_device
+
+    torch.set_num_threads(2)
+
+    try:
+        gpu_index = rank_queue.get(timeout=10)
+    except Exception:
+        raise RuntimeError("Failed to get GPU rank from queue.")
+
+    assert torch.cuda.is_available(), "CUDA is required but not available."
+    worker_device = torch.device(f"cuda:{gpu_index}")
+    torch.cuda.set_device(gpu_index)
+
+    logging.info(f"Initializing worker on device: {worker_device}")
+
+    try:
+        worker_pipe = load_hubert_model(model_dir, worker_device)
+        if worker_pipe is None:
+            raise RuntimeError("Model loading failed.")
+    except Exception as load_error:
+        logging.critical(f"Failed to load model on {worker_device}: {load_error}")
+        raise load_error
+
+
+def load_hubert_model(model_dir, device):
+    """Create the Hubert ASR pipeline from a local evaluation-model checkout.
+
+    Args:
+        model_dir: Directory that contains ``wer/hubert-large-ls960-ft/``.
+        device: Device handed to ``transformers.pipeline``.
+
+    Returns:
+        The ``automatic-speech-recognition`` pipeline, or ``None`` (after
+        logging an error) when the model directory does not exist.
+    """
+    model_path = os.path.join(model_dir, "wer/hubert-large-ls960-ft/")
+    if os.path.exists(model_path):
+        logging.debug(f"Loading Hubert-based ASR model on {device}...")
+        # Imported here so the package is only needed once a model is found.
+        import transformers
+
+        # Suppress transformers logging
+        transformers.logging.set_verbosity_error()
+
+        return transformers.pipeline(
+            "automatic-speech-recognition",
+            model=model_path,
+            device=device,
+            tokenizer=model_path,
+        )
+
+    logging.error(
+        f"Hubert model not found at {model_path}. "
+        "Please download from https://huggingface.co/k2-fsa/TTS_eval_models"
+    )
+    return None
+
+
+def post_process(text: str) -> str:
+    """Normalize English text before WER scoring.
+
+    Curly single quotes become ``'``, the text is lower-cased, every character
+    other than ASCII letters, digits and ``'`` is replaced by a space, and runs
+    of whitespace collapse to a single space.
+
+    Args:
+        text (str): The input text to be processed.
+
+    Returns:
+        str: The cleaned and normalized text, without leading or trailing
+            spaces.
+    """
+    # U+2018 / U+2019 (typographic quotes) -> plain apostrophe.
+    straightened = text.translate({0x2018: "'", 0x2019: "'"})
+    only_word_chars = re.sub(r"[^a-zA-Z0-9']", " ", straightened.lower())
+    return re.sub(r"\s+", " ", only_word_chars).strip()
+
+
+def run_eval_worker(data_chunk, batch_size):
+    """Transcribe one chunk with the worker's Hubert pipeline and score it.
+
+    Relies on the module-level ``worker_pipe`` set up by ``process_init``.
+
+    Args:
+        data_chunk: List of dicts with ``wav_path`` and ``truth_text``.
+        batch_size: Batch size forwarded to the pipeline call.
+
+    Returns:
+        One metrics dict per item (the ``process_one`` result plus
+        ``wav_path``), or ``[]`` when the pipeline is missing or anything in
+        the chunk raised.
+    """
+    global worker_pipe
+    if worker_pipe is None:
+        logging.error("Worker pipeline is not initialized!")
+        return []
+
+    scored = []
+    try:
+        # Every waveform of the chunk is loaded up front, requested at 16 kHz.
+        pipeline_inputs = []
+        for entry in data_chunk:
+            waveform = load_eval_waveform(
+                entry["wav_path"], sample_rate=16000, return_numpy=True
+            )
+            pipeline_inputs.append({"array": waveform, "sampling_rate": 16000})
+
+        outputs = worker_pipe(
+            pipeline_inputs,
+            generate_kwargs={"language": "english", "task": "transcribe"},
+            batch_size=batch_size,
+        )
+
+        # Outputs are matched to the inputs by position.
+        for idx, output in enumerate(outputs):
+            hypothesis = output["text"].strip()
+            entry = data_chunk[idx]
+            metrics = process_one(hypothesis, entry["truth_text"], post_process)
+            metrics["wav_path"] = entry["wav_path"]
+            scored.append(metrics)
+    except Exception:
+        logging.error(f"Worker failed on chunk:\n{traceback.format_exc()}")
+        return []
+
+    return scored
+
+
+def main():
+    """Compute corpus-level WER for a directory of audio files with Hubert.
+
+    Reads the JSONL test list, keeps the samples whose audio file exists under
+    ``--wav-path``, transcribes them in chunks on a pool of GPU worker
+    processes and logs the word-count-weighted WER together with the
+    insertion / deletion / substitution totals. With ``--decode-path`` the
+    per-file results and the summary are also written to that file.
+
+    Raises:
+        AssertionError: If no CUDA device is visible.
+    """
+    parser = get_parser()
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+        level=logging.INFO,
+        force=True,
+    )
+
+    logging.info(f"Calculating WER for {args.wav_path}")
+
+    # Collect the samples that actually have an audio file on disk.
+    data_list = []
+    samples = read_test_list(args.test_list)
+    for s in samples:
+        wav_full_path = str(Path(args.wav_path) / (s["id"] + "." + args.extension))
+        if not os.path.exists(wav_full_path):
+            logging.warning(f"File missing: {wav_full_path}")
+            continue
+        data_list.append(
+            {
+                "wav_path": wav_full_path,
+                "truth_text": s["text"],
+            }
+        )
+    total_files = len(data_list)
+
+    num_gpus = torch.cuda.device_count()
+    assert num_gpus > 0, "No GPU found. GPU is required."
+    total_workers = num_gpus * args.nj_per_gpu
+
+    mp.set_start_method("spawn", force=True)
+    manager = mp.Manager()
+    rank_queue = manager.Queue()
+
+    # One queue entry per worker: every GPU index is enqueued nj_per_gpu times
+    # and process_init() takes one entry per worker process.
+    for _ in range(args.nj_per_gpu):
+        for rank in range(num_gpus):
+            rank_queue.put(rank)
+
+    # A chunk is exactly one pipeline batch; guard against a batch size < 1.
+    chunk_size = max(1, args.batch_size)
+    tasks = [data_list[i : i + chunk_size] for i in range(0, total_files, chunk_size)]
+
+    logging.info(
+        f"Split data into {len(tasks)} chunks (size ~{chunk_size}). "
+        f"Spawning {total_workers} workers."
+    )
+
+    results = []
+
+    with ProcessPoolExecutor(
+        max_workers=total_workers,
+        initializer=process_init,
+        initargs=(rank_queue, args.model_dir),
+    ) as executor:
+
+        futures = [
+            executor.submit(run_eval_worker, chunk, args.batch_size)
+            for chunk in tasks
+        ]
+
+        with tqdm(total=total_files, desc="Eval Progress", dynamic_ncols=True) as pbar:
+            for future in as_completed(futures):
+                chunk_metrics = future.result()
+                results.extend(chunk_metrics)
+                pbar.update(len(chunk_metrics))
+
+    inses, deles, subses = [], [], []
+    word_nums = 0
+
+    fout = None
+    if args.decode_path:
+        # dirname() is "" for a bare filename, and os.makedirs("") raises
+        # FileNotFoundError, so only create the directory when there is one.
+        out_dir = os.path.dirname(args.decode_path)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        fout = open(args.decode_path, "w", encoding="utf8")
+
+    try:
+        if fout:
+            logging.info(f"Saving detailed WER results to: {args.decode_path}")
+            fout.write(
+                "Name\tWER\tTruth\tHypothesis\tInsertions\tDeletions\tSubstitutions\n"
+            )
+
+        for res in results:
+            inses.append(float(res["insertions"]))
+            deles.append(float(res["deletions"]))
+            subses.append(float(res["substitutions"]))
+            word_nums += res["word_num"]
+
+            if fout:
+                fout.write(
+                    f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
+                    f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
+                    f"{res['substitutions']}\n"
+                )
+
+        inse_sum = np.sum(inses)
+        dele_sum = np.sum(deles)
+        subs_sum = np.sum(subses)
+
+        # Weighted by reference word count; NaN when nothing was scored.
+        wer_weighted = (
+            round((subs_sum + dele_sum + inse_sum) / word_nums * 100, 2)
+            if word_nums > 0
+            else float("nan")
+        )
+
+        print("-" * 50)
+        logging.info(f"Processed {len(results)}/{total_files} files.")
+        wer_info = f"WER: {wer_weighted}%"
+        detailed_info = (
+            f"Errors: {inse_sum} ins, {dele_sum} del, {subs_sum} sub / {word_nums} words"
+        )
+        logging.info(wer_info)
+        logging.info(detailed_info)
+        print("-" * 50)
+
+        if fout:
+            fout.write(wer_info + "\n" + detailed_info + "\n")
+    finally:
+        # Close the report even when aggregation or writing fails midway.
+        if fout:
+            fout.close()
+
+
+# Script entry point.
+if __name__ == "__main__":
+    main()
diff --git a/omnivoice/eval/wer/minimax.py b/omnivoice/eval/wer/minimax.py
new file mode 100644
index 0000000000000000000000000000000000000000..14793048d0c92fde69ae8bfdcc74dc6ad37b095d
--- /dev/null
+++ b/omnivoice/eval/wer/minimax.py
@@ -0,0 +1,596 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Computes word error rate (WER) with Whisper-large-v3 for English and
+Paraformer for Chinese. Intended to evaluate WERs on Seed-TTS test sets.
+"""
+import argparse
+import logging
+import multiprocessing as mp
+import os
+import traceback
+from collections import defaultdict
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Union
+
+import numpy as np
+import torch
+import zhconv
+from tqdm import tqdm
+
+from omnivoice.eval.utils import load_eval_waveform
+from omnivoice.eval.wer.common import log_metrics, process_one
+from omnivoice.eval.wer.text_norm_omni import text_normalize
+from omnivoice.utils.data_utils import read_test_list
+
+# --- Global variables for worker processes ---
+# All three start as None and are assigned inside worker processes.
+# worker_pipe: Whisper pipeline, set by process_init().
+worker_pipe = None
+# worker_paraformer: Paraformer model, set by process_init_paraformer().
+worker_paraformer = None
+# worker_device: torch.device ("cuda:<rank>"), set by _worker_setup().
+worker_device = None
+
+
+def read_language_mapping_from_tsv(
+    mapping_path: Path,
+) -> dict[str, str]:
+    """Load the mixed-id to ISO 639-3 mapping from a tab-separated file.
+
+    The first line is treated as a header and skipped. Every other non-blank
+    row is expected to hold ``mixed_id``, ``language_name``, ``iso_639_3_id``
+    and ``duration`` in that order; only the first and third columns are used.
+    Blank lines and rows whose trailing column is empty are accepted.
+
+    Args:
+        mapping_path: Path of the TSV file.
+
+    Returns:
+        Dict mapping each ``mixed_id`` to its ``iso_639_3_id``.
+
+    Raises:
+        ValueError: If a non-blank row has fewer than three columns.
+    """
+    language_mapping = {}
+    with open(mapping_path, "r", encoding="utf-8") as f:
+        f.readline()  # Skip header
+        for line_no, line in enumerate(f, start=2):
+            stripped = line.strip()
+            if not stripped:
+                # Tolerate blank lines, e.g. a trailing newline at end of file.
+                continue
+            parts = stripped.split("\t")
+            if len(parts) < 3:
+                raise ValueError(
+                    f"{mapping_path}:{line_no}: expected at least 3 "
+                    f"tab-separated columns, got {len(parts)}"
+                )
+            language_mapping[parts[0]] = parts[2]
+    return language_mapping
+
+
+# Loaded at import time, so importing this module reads the TSV from disk.
+# The path is resolved relative to this file (three directories up, then docs/).
+mixed_id_to_iso_639_3_id = read_language_mapping_from_tsv(
+ Path(f"{os.path.dirname(__file__)}/../../../docs/lang_id_name_map.tsv")
+)
+
+
+def get_parser():
+    """Build the command-line parser for the Whisper / Paraformer WER script.
+
+    Returns:
+        argparse.ArgumentParser: Parser with ``--wav-path`` and ``--model-dir``
+        required, plus ``--extension``, ``--decode-path``, ``--test-list``,
+        ``--lang``, ``--batch-size``, ``--nj-per-gpu`` and ``--chunk-size``
+        with defaults.
+    """
+    cli = argparse.ArgumentParser(
+        description="Computes WER with Whisper.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    add = cli.add_argument
+
+    # Input / output locations.
+    add(
+        "--wav-path",
+        type=str,
+        required=True,
+        help="Path to the directory containing speech files.",
+    )
+    add(
+        "--extension",
+        type=str,
+        default="wav",
+        help="Extension of the speech files. Default: wav",
+    )
+    add(
+        "--decode-path",
+        type=str,
+        default=None,
+        help="Path to the output file where WER information will be saved. "
+        "If not provided, results are only printed to console.",
+    )
+    add(
+        "--model-dir",
+        type=str,
+        required=True,
+        help="Local path of evaluation models repository. "
+        "Download from https://huggingface.co/k2-fsa/TTS_eval_models. ",
+    )
+    add(
+        "--test-list",
+        type=str,
+        default="test.jsonl",
+        help="path of the JSONL test list. Each line is a JSON object "
+        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
+    )
+
+    # Sample selection.
+    add(
+        "--lang",
+        type=str,
+        default=None,
+        help="""Language code to evaluate (e.g., 'en' for English, 'zh' for Chinese).
+ If not provided, the script will evaluate all languages found in the test list.
+ If specified, only samples of the given language will be evaluated.
+ """,
+    )
+
+    # Decoding and scheduling knobs.
+    add(
+        "--batch-size",
+        type=int,
+        default=16,
+        help="Batch size for decoding with the Hugging Face pipeline.",
+    )
+    add("--nj-per-gpu", type=int, default=1, help="Number of workers per GPU.")
+    add(
+        "--chunk-size",
+        type=int,
+        default=10,
+        help="Number of samples per task chunk sent to workers.",
+    )
+    return cli
+
+
+def load_whisper_model(model_dir, device):
+    """Create the Whisper ASR pipeline from a local evaluation-model checkout.
+
+    Args:
+        model_dir: Directory that contains ``wer/whisper-large-v3/``.
+        device: Device handed to ``transformers.pipeline``; float16 is selected
+            when its string form contains ``"cuda"``, float32 otherwise.
+
+    Returns:
+        The ``automatic-speech-recognition`` pipeline, or ``None`` (after
+        logging an error) when the model directory does not exist.
+    """
+    model_path = os.path.join(model_dir, "wer/whisper-large-v3/")
+    if not os.path.exists(model_path):
+        logging.error(f"Whisper model not found at {model_path}.")
+        return None
+
+    import transformers
+
+    # Suppress transformers logging
+    transformers.logging.set_verbosity_error()
+
+    logging.info(f"Loading Whisper model on {device}...")
+    on_gpu = "cuda" in str(device)
+    return transformers.pipeline(
+        "automatic-speech-recognition",
+        model=model_path,
+        chunk_length_s=30,
+        dtype=torch.float16 if on_gpu else torch.float32,
+        device=device,
+    )
+
+
+def load_paraformer_model(model_dir, device):
+    """Create the Paraformer model from a local evaluation-model checkout.
+
+    Logging is disabled up to CRITICAL while ``funasr`` is imported and the
+    model is built, and the previous disable level is restored afterwards,
+    also when construction raises.
+
+    Args:
+        model_dir: Directory that contains ``wer/paraformer-zh/``.
+        device: Target device; it is converted with ``str()`` before use.
+
+    Returns:
+        The ``funasr.AutoModel`` instance, or ``None`` (after logging an
+        error) when the model directory does not exist.
+    """
+    model_path = os.path.join(model_dir, "wer/paraformer-zh/")
+    if not os.path.exists(model_path):
+        logging.error(f"Paraformer model not found at {model_path}.")
+        return None
+
+    logging.info(f"Loading Paraformer model on {device}...")
+
+    # Remember the current global disable level so it can be put back.
+    saved_disable_level = logging.root.manager.disable
+    logging.disable(logging.CRITICAL)
+    try:
+        from funasr import AutoModel
+
+        paraformer = AutoModel(
+            model=model_path,
+            device=str(device),
+            disable_update=True,
+            disable_pbar=True,
+            verbose=False,
+        )
+    finally:
+        logging.disable(saved_disable_level)
+
+    return paraformer
+
+
+def _worker_setup(rank_queue):
+    """Shared worker bootstrap: limit threads, claim a GPU rank, bind to it.
+
+    Stores the selected device in the module-level ``worker_device``.
+
+    Args:
+        rank_queue: Queue holding GPU indices; one entry is consumed here.
+
+    Raises:
+        RuntimeError: If no rank could be fetched within 10 seconds.
+        AssertionError: If CUDA is not available.
+    """
+    global worker_device
+
+    torch.set_num_threads(2)
+
+    try:
+        gpu_index = rank_queue.get(timeout=10)
+    except Exception:
+        raise RuntimeError("Failed to get GPU rank from queue.")
+
+    assert torch.cuda.is_available(), "CUDA is required but not available."
+    worker_device = torch.device(f"cuda:{gpu_index}")
+    torch.cuda.set_device(gpu_index)
+
+    logging.info(f"Initializing worker on device: {worker_device}")
+
+
+def process_init(rank_queue, model_dir):
+    """Initializer for Whisper worker processes.
+
+    Runs the shared device setup, then stores the Whisper pipeline in the
+    module-level ``worker_pipe``.
+
+    Args:
+        rank_queue: Queue of GPU indices, forwarded to ``_worker_setup``.
+        model_dir: Root of the evaluation-model checkout.
+
+    Raises:
+        RuntimeError: If the Whisper model could not be loaded (or the device
+            setup failed).
+    """
+    global worker_pipe
+
+    _worker_setup(rank_queue)
+
+    try:
+        worker_pipe = load_whisper_model(model_dir, worker_device)
+        if worker_pipe is None:
+            raise RuntimeError("Whisper model loading failed.")
+    except Exception as load_error:
+        logging.critical(
+            f"Failed to load Whisper model on {worker_device}: {load_error}"
+        )
+        raise load_error
+
+
+def process_init_paraformer(rank_queue, model_dir):
+    """Initializer for Paraformer worker processes (Chinese evaluation).
+
+    Runs the shared device setup, then stores the Paraformer model in the
+    module-level ``worker_paraformer``.
+
+    Args:
+        rank_queue: Queue of GPU indices, forwarded to ``_worker_setup``.
+        model_dir: Root of the evaluation-model checkout.
+
+    Raises:
+        RuntimeError: If the Paraformer model could not be loaded (or the
+            device setup failed).
+    """
+    global worker_paraformer
+
+    _worker_setup(rank_queue)
+
+    try:
+        worker_paraformer = load_paraformer_model(model_dir, worker_device)
+        if worker_paraformer is None:
+            raise RuntimeError("Paraformer model loading failed.")
+    except Exception as load_error:
+        logging.critical(
+            f"Failed to load Paraformer model on {worker_device}: {load_error}"
+        )
+        raise load_error
+
+
+def post_process(text: str, lang: str) -> str:
+ """
+ Cleans and normalizes text for WER calculation.
+
+ For every language other than "unknown" the text first goes through
+ ``text_normalize`` with the ISO 639-3 code looked up in
+ ``mixed_id_to_iso_639_3_id``. Chinese and Cantonese text is then converted
+ with ``zhconv`` to "zh-cn", and the languages scored by character are split
+ into space-separated characters.
+
+ Args:
+ text (str): The input text to be processed.
+ lang (str): Mixed language id of the input text (the lookup key of
+ ``mixed_id_to_iso_639_3_id``), or "unknown" to skip ``text_normalize``.
+
+ Returns:
+ str: The cleaned and normalized text, stripped of surrounding whitespace.
+ """
+ if lang != "unknown":
+
+ # NOTE(review): a language id that is missing from the mapping table raises
+ # KeyError here — confirm every id in the test lists is covered.
+ iso_639_3_code = mixed_id_to_iso_639_3_id[lang]
+ text = text_normalize(
+ text,
+ iso_code=iso_639_3_code,
+ lower_case=True,
+ remove_numbers=False,
+ remove_brackets=False,
+ )
+
+ if lang in ["zh", "yue"]:
+ text = zhconv.convert(text, "zh-cn")
+
+ # Processing spaces for languages using CER (consistent with the practice
+ # in paper Minimax-Speech), specifically: zh, yue, ja, ko, th, arb, vi, hi, el.
+ if lang in ("zh", "yue", "ja"):
+ # For languages where spaces are not semantically meaningful, remove spaces.
+ text = text.replace(" ", "")
+ # Joining a string iterates over its characters: one token per character.
+ text = " ".join([x for x in text])
+ elif lang in ("ko", "th", "arb", "vi", "hi", "el"):
+ # For languages where spaces are semantically meaningful, replace spaces with |.
+ text = text.replace(" ", "|")
+ text = " ".join([x for x in text])
+ # NOTE(review): indentation is flattened in this patch view, so it is not
+ # visible here whether the lower() call below runs for every language or
+ # only inside one of the branches above — confirm against the real file.
+ text = text.lower()
+ return text.strip()
+
+
+class SpeechEvalDataset(torch.utils.data.Dataset):
+    """Dataset over evaluation items that loads each waveform on access.
+
+    Every element of ``data_list`` is a dict with at least ``wav_path`` and
+    ``truth_text``.
+    """
+
+    def __init__(self, data_list):
+        """Keep a reference to ``data_list``; nothing is loaded here."""
+        self.data_list = data_list
+
+    def __len__(self):
+        """Return the number of evaluation items."""
+        return len(self.data_list)
+
+    def __getitem__(self, index):
+        """Load item ``index`` and return it in the pipeline's input format.
+
+        Returns:
+            dict: ``array`` (waveform requested at ``sample_rate=16000`` as a
+            numpy array), ``sampling_rate`` (16000) and ``truth_text``.
+        """
+        entry = self.data_list[index]
+        return {
+            "array": load_eval_waveform(
+                entry["wav_path"], sample_rate=16000, return_numpy=True
+            ),
+            "sampling_rate": 16000,
+            "truth_text": entry["truth_text"],
+        }
+
+
+def run_eval_worker(data_chunk, language, batch_size):
+    """Transcribe one chunk with the worker's Whisper pipeline and score it.
+
+    Relies on the module-level ``worker_pipe`` set up by ``process_init``.
+
+    Args:
+        data_chunk: List of dicts with ``wav_path`` and ``truth_text`` and,
+            optionally, ``lang_id`` / ``lang_name``.
+        language: Language passed to generation for the whole chunk; the
+            value "unknown" leaves the language unset.
+        batch_size: Batch size forwarded to the pipeline call.
+
+    Returns:
+        One metrics dict per item (the ``process_one`` result plus
+        ``wav_path`` and ``lang_name``), or ``[]`` when the pipeline is missing
+        or anything in the chunk raised.
+    """
+    global worker_pipe
+    if worker_pipe is None:
+        logging.error("Worker pipeline is not initialized!")
+        return []
+
+    scored = []
+    try:
+        lazy_inputs = SpeechEvalDataset(data_chunk)
+        generate_kwargs = (
+            {"language": language, "task": "transcribe"}
+            if language != "unknown"
+            else {"task": "transcribe"}
+        )
+
+        # The pipeline call yields one output per dataset item, in order.
+        outputs = worker_pipe(
+            lazy_inputs, generate_kwargs=generate_kwargs, batch_size=batch_size
+        )
+
+        for idx, output in enumerate(outputs):
+            hypothesis = output["text"].strip()
+            entry = data_chunk[idx]
+            metrics = process_one(
+                hypothesis, entry["truth_text"], post_process, entry.get("lang_id")
+            )
+            metrics["wav_path"] = entry["wav_path"]
+            metrics["lang_name"] = entry.get("lang_name")
+            scored.append(metrics)
+    except Exception:
+        logging.error(
+            f"Worker failed on chunk (Lang: {language}):\n{traceback.format_exc()}"
+        )
+        return []
+
+    return scored
+
+
+def run_eval_worker_paraformer(data_chunk, batch_size):
+    """Transcribe one Chinese chunk with Paraformer and score every item.
+
+    Relies on the module-level ``worker_paraformer`` set up by
+    ``process_init_paraformer``. Scoring always uses the language id "zh".
+
+    Args:
+        data_chunk: List of dicts with ``wav_path`` and ``truth_text`` and,
+            optionally, ``lang_name``.
+        batch_size: Number of files sent per ``generate`` call; also forwarded
+            as its ``batch_size`` argument.
+
+    Returns:
+        One metrics dict per item (the ``process_one`` result plus
+        ``wav_path`` and ``lang_name``), or ``[]`` when the model is missing
+        or anything in the chunk raised.
+    """
+    global worker_paraformer
+    if worker_paraformer is None:
+        logging.error("Paraformer worker pipeline is not initialized!")
+        return []
+
+    scored = []
+    try:
+        all_paths = [entry["wav_path"] for entry in data_chunk]
+
+        for start in range(0, len(all_paths), batch_size):
+            outputs = worker_paraformer.generate(
+                input=all_paths[start : start + batch_size],
+                batch_size=batch_size,
+                disable_pbar=True,
+            )
+
+            # Outputs are matched to inputs by position within the batch.
+            for offset, output in enumerate(outputs):
+                hypothesis = output["text"]
+                entry = data_chunk[start + offset]
+                metrics = process_one(
+                    hypothesis, entry["truth_text"], post_process, "zh"
+                )
+                metrics["wav_path"] = entry["wav_path"]
+                metrics["lang_name"] = entry.get("lang_name")
+                scored.append(metrics)
+    except Exception:
+        logging.error(f"Paraformer worker failed on chunk:\n{traceback.format_exc()}")
+        return []
+
+    return scored
+
+
+def main():
+    """Evaluate WER/CER with Whisper (non-Chinese) and Paraformer (Chinese).
+
+    Workflow:
+      1. Read the JSONL test list, keep the samples whose audio exists (and
+         that match ``--lang`` when given) and group them by language name.
+      2. Route Chinese groups to Paraformer and every other group to Whisper.
+         Whisper chunks are cut per language, so a chunk never mixes
+         languages and its ``lang`` is valid for each of its items.
+      3. Run the Whisper pool and then the Paraformer pool.
+      4. Aggregate per-language, macro-average and overall metrics, logging
+         them and optionally writing them to ``--decode-path``.
+
+    Raises:
+        AssertionError: If no CUDA device is visible.
+    """
+    parser = get_parser()
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+        level=logging.INFO,
+        force=True,
+    )
+
+    # 1. Prepare Data
+    logging.info("Reading test list...")
+    data_by_lang = defaultdict(list)
+    total_files = 0
+    wav_root = Path(args.wav_path)
+
+    samples = read_test_list(args.test_list)
+    for s in samples:
+        wav_path = str(wav_root / f"{s['id']}.{args.extension}")
+        if not os.path.exists(wav_path):
+            logging.warning(f"File missing: {wav_path}")
+            continue
+
+        if args.lang and s.get("language_id") != args.lang:
+            continue
+
+        lang_id = s.get("language_id") or "unknown"
+        lang_name = s.get("language_name") or "unknown"
+
+        data_by_lang[lang_name].append(
+            {
+                "wav_path": wav_path,
+                "truth_text": s["text"],
+                "lang_id": lang_id,
+                "lang_name": lang_name,
+            }
+        )
+        total_files += 1
+
+    logging.info(f"Total files: {total_files} in {len(data_by_lang)} languages.")
+
+    # 2. Worker config
+    num_gpus = torch.cuda.device_count()
+    assert num_gpus > 0, "No GPU found. GPU is required."
+    total_workers = num_gpus * args.nj_per_gpu
+
+    mp.set_start_method("spawn", force=True)
+    manager = mp.Manager()
+
+    # 3. Scheduling: Split data into Chinese (Paraformer) and non-Chinese (Whisper)
+    # A non-positive --chunk-size would break range(); fall back to 1.
+    chunk_size = max(1, args.chunk_size)
+
+    zh_items = []
+    whisper_tasks = []
+    num_non_zh = 0
+    for lang_name, items in data_by_lang.items():
+        lang_id = items[0].get("lang_id", "") if items else ""
+        if lang_name == "Chinese" or (lang_id and lang_id.startswith("zh")):
+            zh_items.extend(items)
+            continue
+
+        # Chunk inside one language only: the worker applies the chunk-level
+        # language to every item, so chunks must not span languages.
+        num_non_zh += len(items)
+        for i in range(0, len(items), chunk_size):
+            whisper_tasks.append(
+                {"chunk": items[i : i + chunk_size], "lang": lang_name}
+            )
+
+    paraformer_tasks = [
+        zh_items[i : i + chunk_size] for i in range(0, len(zh_items), chunk_size)
+    ]
+
+    logging.info(
+        f"Whisper tasks: {len(whisper_tasks)} chunks ({num_non_zh} files). "
+        f"Paraformer tasks: {len(paraformer_tasks)} chunks ({len(zh_items)} files). "
+        f"Spawning {total_workers} workers per pool."
+    )
+
+    # 4. Execution — run Whisper and Paraformer pools sequentially
+    results = []
+
+    # 4a. Whisper pool for non-Chinese languages
+    if whisper_tasks:
+        # One queue entry per worker: each GPU index appears nj_per_gpu times.
+        whisper_rank_queue = manager.Queue()
+        for _ in range(args.nj_per_gpu):
+            for rank in range(num_gpus):
+                whisper_rank_queue.put(rank)
+
+        with ProcessPoolExecutor(
+            max_workers=total_workers,
+            initializer=process_init,
+            initargs=(whisper_rank_queue, args.model_dir),
+        ) as executor:
+
+            futures = [
+                executor.submit(
+                    run_eval_worker, task["chunk"], task["lang"], args.batch_size
+                )
+                for task in whisper_tasks
+            ]
+
+            with tqdm(
+                total=num_non_zh,
+                desc="Whisper Eval",
+                dynamic_ncols=True,
+            ) as pbar:
+                for future in as_completed(futures):
+                    try:
+                        chunk_metrics = future.result()
+                        results.extend(chunk_metrics)
+                        pbar.update(len(chunk_metrics))
+                    except Exception as e:
+                        logging.error(f"Whisper task failed: {e}")
+
+    # 4b. Paraformer pool for Chinese
+    if paraformer_tasks:
+        para_rank_queue = manager.Queue()
+        for _ in range(args.nj_per_gpu):
+            for rank in range(num_gpus):
+                para_rank_queue.put(rank)
+
+        with ProcessPoolExecutor(
+            max_workers=total_workers,
+            initializer=process_init_paraformer,
+            initargs=(para_rank_queue, args.model_dir),
+        ) as executor:
+
+            futures = [
+                executor.submit(run_eval_worker_paraformer, chunk, args.batch_size)
+                for chunk in paraformer_tasks
+            ]
+
+            with tqdm(
+                total=len(zh_items),
+                desc="Paraformer Eval",
+                dynamic_ncols=True,
+            ) as pbar:
+                for future in as_completed(futures):
+                    try:
+                        chunk_metrics = future.result()
+                        results.extend(chunk_metrics)
+                        pbar.update(len(chunk_metrics))
+                    except Exception as e:
+                        logging.error(f"Paraformer task failed: {e}")
+
+    # 5. Metrics Aggregation
+    inses, deles, subses = [], [], []
+    word_nums = 0
+
+    # Store metrics per language
+    lang_stats = {}
+
+    fout = None
+    if args.decode_path:
+        # dirname() is "" for a bare filename, and os.makedirs("") raises
+        # FileNotFoundError, so only create the directory when there is one.
+        out_dir = os.path.dirname(args.decode_path)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        logging.info(f"Saving detailed WER results to: {args.decode_path}")
+        fout = open(args.decode_path, "w", encoding="utf-8")
+
+    try:
+        for res in results:
+            inses.append(float(res["insertions"]))
+            deles.append(float(res["deletions"]))
+            subses.append(float(res["substitutions"]))
+            word_nums += res["word_num"]
+
+            if fout:
+                fout.write(
+                    f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
+                    f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
+                    f"{res['substitutions']}\n"
+                )
+            lang_name = res["lang_name"]
+
+            # Per language stats
+            if lang_name not in lang_stats:
+                lang_stats[lang_name] = {
+                    "inses": [],
+                    "deles": [],
+                    "subses": [],
+                    "word_nums": 0,
+                }
+            lang_stats[lang_name]["inses"].append(float(res["insertions"]))
+            lang_stats[lang_name]["deles"].append(float(res["deletions"]))
+            lang_stats[lang_name]["subses"].append(float(res["substitutions"]))
+            lang_stats[lang_name]["word_nums"] += res["word_num"]
+
+        print("-" * 50)
+        # Log per-language stats
+        per_lang_wers = []
+        for lang in sorted(lang_stats.keys()):
+            stats = lang_stats[lang]
+            # Languages without any reference words are left out of the report.
+            if stats["word_nums"] > 0:
+                lang_wer = log_metrics(
+                    fout,
+                    f"[{lang}]",
+                    stats["inses"],
+                    stats["deles"],
+                    stats["subses"],
+                    stats["word_nums"],
+                    ndigits=3,
+                )
+                per_lang_wers.append(lang_wer)
+        print("-" * 50)
+
+        # Log Macro-average WER
+        if len(per_lang_wers) > 1:
+            macro_wer = np.mean(per_lang_wers)
+            logging.info(
+                f"Macro-average WER over {len(per_lang_wers)} languages: {macro_wer:.2f}%"
+            )
+            if fout:
+                fout.write(
+                    f"Macro-average WER over {len(per_lang_wers)} languages: {macro_wer:.2f}%\n"
+                )
+
+        # Log overall stats
+        if word_nums > 0:
+            log_metrics(fout, "Overall", inses, deles, subses, word_nums)
+    finally:
+        # Close the report even when aggregation or writing fails midway.
+        if fout:
+            fout.close()
+
+
+# Script entry point.
+if __name__ == "__main__":
+    main()
diff --git a/omnivoice/eval/wer/norm_config_module.py b/omnivoice/eval/wer/norm_config_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2df2e5267bcc6d953c68e151550ff031d374530
--- /dev/null
+++ b/omnivoice/eval/wer/norm_config_module.py
@@ -0,0 +1,291 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+This module defines the normalization configuration for WER evaluation.
+Copied from https://github.com/facebookresearch/omnilingual-asr/blob/81f51e224ce9e74b02cc2a3eaf21b2d91d743455/workflows/dataprep/norm_config_module.py
+"""
+
+# type: ignore
+import os
+import re
+
+# Building blocks for regex character classes. Each constant is a fragment
+# that is concatenated into a larger set string (basic_punc here, the
+# shared_* sets further down); the consumer wraps the set in "[...]".
+colon = ":"
+comma = ","
+exclamation_mark = "!"
+# "." and "?" are passed through re.escape before being added to the set.
+period = re.escape(".")
+question_mark = re.escape("?")
+semicolon = ";"
+
+left_curly_bracket = "{"
+right_curly_bracket = "}"
+quotation_mark = '"'
+
+# Note: semicolon and quotation_mark are not part of basic_punc; they are
+# appended separately when shared_punc_list is assembled.
+basic_punc = (
+    period
+    + question_mark
+    + comma
+    + colon
+    + exclamation_mark
+    + left_curly_bracket
+    + right_curly_bracket
+)
+
+# General punc unicode block (0x2000-0x206F)
+# These are raw strings: the backslash-u escapes are kept literally and are
+# resolved by the regex engine when the set is compiled.
+zero_width_space = r"\u200B"
+zero_width_nonjoiner = r"\u200C"
+left_to_right_mark = r"\u200E"
+right_to_left_mark = r"\u200F"
+left_to_right_embedding = r"\u202A"
+pop_directional_formatting = r"\u202C"
+
+# Here are some commonly ill-typed versions of apostrophe
+right_single_quotation_mark = r"\u2019"
+left_single_quotation_mark = r"\u2018"
+
+# Language specific definitions
+# Spanish
+inverted_exclamation_mark = r"\u00A1"
+inverted_question_mark = r"\u00BF"
+
+
+# Hindi
+# Unlike its neighbours this is a normal (non-raw) string, so it holds the
+# danda character itself rather than an escape sequence.
+hindi_danda = "\u0964"
+
+# Egyptian Arabic
+# arabic_percent = r"\u066A"
+arabic_comma = r"\u060C"
+arabic_question_mark = r"\u061F"
+arabic_semicolon = r"\u061B"
+# "A-B" fragments are character-class ranges once placed inside "[...]".
+arabic_diacritics = r"\u064B-\u0652"
+
+
+arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"
+
+
+# Chinese
+# Fragments for CJK punctuation. Entries written as "A-B" are ranges once the
+# set is wrapped in a regex character class.
+full_stop = r"\u3002"
+full_comma = r"\uFF0C"
+full_exclamation_mark = r"\uFF01"
+full_question_mark = r"\uFF1F"
+full_semicolon = r"\uFF1B"
+full_colon = r"\uFF1A"
+full_parentheses = r"\uFF08\uFF09"
+quotation_mark_horizontal = r"\u300C-\u300F"
+# Vertical corner brackets live at U+FE41..U+FE44. The previous value
+# (U+FF41..U+FF44) is the fullwidth Latin letters a-d, which are not
+# punctuation and must not be stripped.
+quotation_mark_vertical = r"\uFE41-\uFE44"
+title_marks = r"\u3008-\u300B"
+wavy_low_line = r"\uFE4F"
+ellipsis = r"\u22EF"
+enumeration_comma = r"\u3001"
+hyphenation_point = r"\u2027"
+forward_slash = r"\uFF0F"
+wavy_dash = r"\uFF5E"
+box_drawings_light_horizontal = r"\u2500"
+fullwidth_low_line = r"\uFF3F"
+chinese_punc = (
+    full_stop
+    + full_comma
+    + full_exclamation_mark
+    + full_question_mark
+    + full_semicolon
+    + full_colon
+    + full_parentheses
+    + quotation_mark_horizontal
+    + quotation_mark_vertical
+    + title_marks
+    + wavy_low_line
+    + ellipsis
+    + enumeration_comma
+    + hyphenation_point
+    + forward_slash
+    + wavy_dash
+    + box_drawings_light_horizontal
+    + fullwidth_low_line
+)
+
+# Armenian
+armenian_apostrophe = r"\u055A"
+emphasis_mark = r"\u055B"
+# NOTE(review): this rebinds the module-level `exclamation_mark` that was set
+# to "!" near the top of the file. basic_punc was already built from the
+# ASCII value, so only code that reads `exclamation_mark` after this point
+# sees the Armenian mark. Consider an armenian_-prefixed name.
+exclamation_mark = r"\u055C"
+armenian_comma = r"\u055D"
+armenian_question_mark = r"\u055E"
+abbreviation_mark = r"\u055F"
+armenian_full_stop = r"\u0589"
+# Concatenation of all Armenian marks above, for use inside a character class.
+armenian_punc = (
+    armenian_apostrophe
+    + emphasis_mark
+    + exclamation_mark
+    + armenian_comma
+    + armenian_question_mark
+    + abbreviation_mark
+    + armenian_full_stop
+)
+
+lesser_than_symbol = r"<"
+greater_than_symbol = r">"
+
+lesser_than_sign = r"\u003c"
+greater_than_sign = r"\u003e"
+
+nbsp_written_form = r" "
+
+# Quotation marks
+# Typographic quote characters, kept as regex escape fragments.
+left_double_quotes = r"\u201c"
+right_double_quotes = r"\u201d"
+left_double_angle = r"\u00ab"
+right_double_angle = r"\u00bb"
+left_single_angle = r"\u2039"
+right_single_angle = r"\u203a"
+low_double_quotes = r"\u201e"
+low_single_quotes = r"\u201a"
+high_double_quotes = r"\u201f"
+high_single_quotes = r"\u201b"
+
+# Every quote variant above plus the two single quotation marks defined
+# earlier; this fragment becomes part of shared_punc_list.
+all_punct_quotes = (
+    left_double_quotes
+    + right_double_quotes
+    + left_double_angle
+    + right_double_angle
+    + left_single_angle
+    + right_single_angle
+    + low_double_quotes
+    + low_single_quotes
+    + high_double_quotes
+    + high_single_quotes
+    + right_single_quotation_mark
+    + left_single_quotation_mark
+)
+# A complete character class (brackets included) of the apostrophe
+# look-alikes. It is embedded in a key of shared_mappping, whose replacement
+# puts an ASCII apostrophe between the two captured groups.
+mapping_quotes = (
+    "["
+    + high_single_quotes
+    + right_single_quotation_mark
+    + left_single_quotation_mark
+    + "]"
+)
+
+
+# Digits
+# Each constant is an "A-B" range fragment (or a single code point) meant to
+# be concatenated into shared_digits and used inside a character class.
+
+english_digits = r"\u0030-\u0039"
+bengali_digits = r"\u09e6-\u09ef"
+khmer_digits = r"\u17e0-\u17e9"
+devanagari_digits = r"\u0966-\u096f"
+oriya_digits = r"\u0b66-\u0b6f"
+extended_arabic_indic_digits = r"\u06f0-\u06f9"
+kayah_li_digits = r"\ua900-\ua909"
+fullwidth_digits = r"\uff10-\uff19"
+malayam_digits = r"\u0d66-\u0d6f"
+myanmar_digits = r"\u1040-\u1049"
+roman_numeral = r"\u2170-\u2179"
+# Single code point, not a range.
+nominal_digit_shapes = r"\u206f"
+
+# Load punctuations
+# punctuations.lst sits next to this module. It is full of non-ASCII
+# characters, so the encoding is given explicitly instead of relying on the
+# platform default (which is not UTF-8 everywhere).
+with open(
+    os.path.join(os.path.dirname(__file__), "punctuations.lst"),
+    "r",
+    encoding="utf-8",
+) as punc_f:
+    # Keep non-blank lines that do not start with "#".
+    # NOTE(review): the NUMBER SIGN entry itself starts with "#", so this
+    # comment filter drops it and "#" never reaches punct_pattern. Behaviour
+    # is kept as-is here; confirm whether that is intended.
+    punc_list = [
+        line
+        for line in punc_f
+        if line.strip() and not line.strip().startswith("#")
+    ]
+
+punct_pattern = r""
+for punc in punc_list:
+    # the first character in the tab separated line is the punc to be removed
+    punct_pattern += re.escape(punc.split("\t")[0])
+
+# All digit ranges defined above, concatenated into one set string.
+# NOTE(review): the consumer of "digit_set" is not visible in this module;
+# presumably it is wrapped in "[...]" like punc_set - confirm in text_normalize.
+shared_digits = (
+    english_digits
+    + bengali_digits
+    + khmer_digits
+    + devanagari_digits
+    + oriya_digits
+    + extended_arabic_indic_digits
+    + kayah_li_digits
+    + fullwidth_digits
+    + malayam_digits
+    + myanmar_digits
+    + roman_numeral
+    + nominal_digit_shapes
+)
+
+# Punctuation set shared by all languages. text_normalize wraps it in "[...]"
+# and substitutes every match with a space. Some fragments (full_stop,
+# enumeration_comma) appear both directly and via chinese_punc; duplicates
+# are harmless inside a character class.
+shared_punc_list = (
+    basic_punc
+    + all_punct_quotes
+    + greater_than_sign
+    + lesser_than_sign
+    + inverted_question_mark
+    + full_stop
+    + semicolon
+    + armenian_punc
+    + inverted_exclamation_mark
+    + arabic_comma
+    + enumeration_comma
+    + hindi_danda
+    + quotation_mark
+    + arabic_semicolon
+    + arabic_question_mark
+    + chinese_punc
+    + punct_pattern
+)
+
+# Regex pattern -> replacement pairs; text_normalize applies each one with
+# re.sub. The last entry rewrites an apostrophe look-alike that sits between
+# two runs of non-space characters into an ASCII apostrophe.
+shared_mappping = {
+    lesser_than_symbol: "",
+    greater_than_symbol: "",
+    nbsp_written_form: "",
+    r"(\S+)" + mapping_quotes + r"(\S+)": r"\1'\2",
+}
+
+# Invisible formatting characters and Arabic diacritic ranges.
+# NOTE(review): the consumer of "del_set" is not visible here; the name
+# suggests these are deleted outright rather than replaced by a space.
+shared_deletion_list = (
+    left_to_right_mark
+    + zero_width_nonjoiner
+    + arabic_subscript_alef_and_inverted_damma
+    + zero_width_space
+    + arabic_diacritics
+    + pop_directional_formatting
+    + right_to_left_mark
+    + left_to_right_embedding
+)
+
+# Per-language normalization settings keyed by language code. "*" is the
+# default entry: text_normalize falls back to it when a code has no entry of
+# its own, and the language-specific entries below start as copies of it.
+norm_config = {
+    "*": {
+        "lower_case": True,
+        "punc_set": shared_punc_list,
+        "del_set": shared_deletion_list,
+        "mapping": shared_mappping,
+        "digit_set": shared_digits,
+        "unicode_norm": "NFKC",
+        "rm_diacritics": False,
+    }
+}
+
+# Language-specific overrides. Each entry starts as a copy of the "*" default.
+# dict.copy() is shallow: rebinding a string value with += only changes the
+# copy, but the "mapping" dict object is shared by every copy, so it must be
+# replaced (not mutated in place) when a language needs extra mappings.
+
+# =============== Mongolian ===============#
+
+norm_config["mon"] = norm_config["*"].copy()
+# add soft hyphen to the deletion set to match with fleurs
+norm_config["mon"]["del_set"] += r"\u00AD"
+
+norm_config["khk"] = norm_config["mon"].copy()
+
+# =============== Hebrew ===============#
+
+norm_config["heb"] = norm_config["*"].copy()
+# add "HEBREW POINT" symbols to match with fleurs
+norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"
+
+# =============== Thai ===============#
+
+norm_config["tha"] = norm_config["*"].copy()
+# add "Zero width joiner" symbols to match with fleurs
+norm_config["tha"]["punc_set"] += r"\u200D"
+
+# =============== Arabic ===============#
+norm_config["ara"] = norm_config["*"].copy()
+# Build a fresh mapping for Arabic. Assigning into the inherited dict would
+# add this rule to the mapping shared by "*" and every other language.
+norm_config["ara"]["mapping"] = {**norm_config["ara"]["mapping"], "ٱ": "ا"}
+norm_config["arb"] = norm_config["ara"].copy()
+
+# =============== Javanese ===============#
+norm_config["jav"] = norm_config["*"].copy()
+norm_config["jav"]["rm_diacritics"] = True
diff --git a/omnivoice/eval/wer/punctuations.lst b/omnivoice/eval/wer/punctuations.lst
new file mode 100644
index 0000000000000000000000000000000000000000..f002b3553cb1344290950824d7d9f7c9a000357d
--- /dev/null
+++ b/omnivoice/eval/wer/punctuations.lst
@@ -0,0 +1,188 @@
+ 7355 INVALID UNICODE 0x81
+ 5265 INVALID UNICODE 0x90
+ 75 INVALID UNICODE 0x8
+ 31 INVALID UNICODE 0x8d
+ 3 INVALID UNICODE 0x94
+ 2 INVALID UNICODE 0x8f
+ 2 INVALID UNICODE 0x1a
+ 1 INVALID UNICODE 0x9d
+ 1 INVALID UNICODE 0x93
+ 1 INVALID UNICODE 0x92
+ 8647 INVALID UNICODE 0xe295
+ 6650 INVALID UNICODE 0xf21d
+ 6234 INVALID UNICODE 0xf62d
+ 4815 INVALID UNICODE 0xf173
+ 4789 INVALID UNICODE 0xe514
+ 4409 INVALID UNICODE 0xe293
+ 3881 INVALID UNICODE 0xf523
+ 3788 INVALID UNICODE 0xe233
+ 2448 INVALID UNICODE 0xf50f
+ 2177 INVALID UNICODE 0xe232
+ 1955 INVALID UNICODE 0xea7b
+ 1926 INVALID UNICODE 0xf172
+ 973 INVALID UNICODE 0xe290
+ 972 INVALID UNICODE 0xf519
+ 661 INVALID UNICODE 0xe292
+ 591 INVALID UNICODE 0xe328
+ 509 INVALID UNICODE 0xe2fa
+ 458 INVALID UNICODE 0xe234
+ 446 INVALID UNICODE 0xe043
+ 419 INVALID UNICODE 0xe040
+ 399 INVALID UNICODE 0xe2fb
+ 387 INVALID UNICODE 0xe32b
+ 381 INVALID UNICODE 0xe236
+ 374 INVALID UNICODE 0xf511
+ 314 INVALID UNICODE 0xe517
+ 296 INVALID UNICODE 0xe2fe
+ 293 INVALID UNICODE 0xe492
+ 291 INVALID UNICODE 0xf52d
+ 289 INVALID UNICODE 0xe2fc
+ 195 INVALID UNICODE 0xf521
+ 190 INVALID UNICODE 0xe516
+ 182 INVALID UNICODE 0xe041
+ 178 INVALID UNICODE 0xf529
+ 113 INVALID UNICODE 0xe2f9
+ 87 INVALID UNICODE 0xe2d9
+ 78 INVALID UNICODE 0xe32a
+ 76 INVALID UNICODE 0xe291
+ 74 INVALID UNICODE 0xe296
+ 66 INVALID UNICODE 0xe518
+ 52 INVALID UNICODE 0xe32c
+ 46 INVALID UNICODE 0xe2db
+ 41 INVALID UNICODE 0xe231
+ 34 INVALID UNICODE 0xf522
+ 33 INVALID UNICODE 0xf518
+ 32 INVALID UNICODE 0xf513
+ 27 INVALID UNICODE 0xe32d
+ 25 INVALID UNICODE 0xe32e
+ 23 INVALID UNICODE 0xe06b
+ 15 INVALID UNICODE 0xea01
+ 12 INVALID UNICODE 0xe294
+ 11 INVALID UNICODE 0xe203
+ 8 INVALID UNICODE 0xf218
+ 7 INVALID UNICODE 0xe070
+ 7 INVALID UNICODE 0xe013
+ 5 INVALID UNICODE 0xe2de
+ 4 INVALID UNICODE 0xe493
+ 3 INVALID UNICODE 0xf7e8
+ 3 INVALID UNICODE 0xf7d0
+ 3 INVALID UNICODE 0xe313
+ 2 INVALID UNICODE 0xe329
+ 2 INVALID UNICODE 0xe06d
+ 2 INVALID UNICODE 0xe003
+ 1 INVALID UNICODE 0xf50e
+ 1 INVALID UNICODE 0xf171
+ 1 INVALID UNICODE 0xe01d
+ 71 NOMINAL DIGIT SHAPES 0x206f
+ 3 WORD JOINER 0x2060
+― 126545 HORIZONTAL BAR 0x2015
+־ 1028 HEBREW PUNCTUATION MAQAF 0x5be
+) 98429 RIGHT PARENTHESIS 0x29
+] 27108 RIGHT SQUARE BRACKET 0x5d
+⌋ 1567 RIGHT FLOOR 0x230b
+〕 97 RIGHT TORTOISE SHELL BRACKET 0x3015
+】 36 RIGHT BLACK LENTICULAR BRACKET 0x3011
+﴾ 14 ORNATE LEFT PARENTHESIS 0xfd3e
+& 170517 AMPERSAND 0x26
+། 106330 TIBETAN MARK SHAD 0xf0d
+። 90203 ETHIOPIC FULL STOP 0x1362
+፥ 60484 ETHIOPIC COLON 0x1365
+༌ 60464 TIBETAN MARK DELIMITER TSHEG BSTAR 0xf0c
+။ 51567 MYANMAR SIGN SECTION 0x104b
+/ 46929 SOLIDUS 0x2f
+၊ 38042 MYANMAR SIGN LITTLE SECTION 0x104a
+· 37985 MIDDLE DOT 0xb7
+‸ 36310 CARET 0x2038
+* 34793 ASTERISK 0x2a
+۔ 32432 ARABIC FULL STOP 0x6d4
+፤ 31906 ETHIOPIC SEMICOLON 0x1364
+၏ 21519 MYANMAR SYMBOL GENITIVE 0x104f
+។ 20834 KHMER SIGN KHAN 0x17d4
+꓾ 15773 LISU PUNCTUATION COMMA 0xa4fe
+᙮ 13473 CANADIAN SYLLABICS FULL STOP 0x166e
+꤯ 12892 KAYAH LI SIGN SHYA 0xa92f
+⵰ 11478 TIFINAGH SEPARATOR MARK 0x2d70
+꓿ 11118 LISU PUNCTUATION FULL STOP 0xa4ff
+॥ 10763 DEVANAGARI DOUBLE DANDA 0x965
+؞ 10403 ARABIC TRIPLE DOT PUNCTUATION MARK 0x61e
+၍ 8936 MYANMAR SYMBOL COMPLETED 0x104d
+· 8431 GREEK ANO TELEIA 0x387
+† 7477 DAGGER 0x2020
+၌ 6632 MYANMAR SYMBOL LOCATIVE 0x104c
+፣ 5719 ETHIOPIC COMMA 0x1363
+៖ 5528 KHMER SIGN CAMNUC PII KUUH 0x17d6
+꤮ 4791 KAYAH LI SIGN CWI 0xa92e
+※ 3439 REFERENCE MARK 0x203b
+፦ 2727 ETHIOPIC PREFACE COLON 0x1366
+• 1749 BULLET 0x2022
+¶ 1507 PILCROW SIGN 0xb6
+၎ 1386 MYANMAR SYMBOL AFOREMENTIONED 0x104e
+﹖ 1224 SMALL QUESTION MARK 0xfe56
+; 975 GREEK QUESTION MARK 0x37e
+… 827 HORIZONTAL ELLIPSIS 0x2026
+% 617 PERCENT SIGN 0x25
+・ 468 KATAKANA MIDDLE DOT 0x30fb
+༎ 306 TIBETAN MARK NYIS SHAD 0xf0e
+‡ 140 DOUBLE DAGGER 0x2021
+# 137 NUMBER SIGN 0x23
+@ 125 COMMERCIAL AT 0x40
+፡ 121 ETHIOPIC WORDSPACE 0x1361
+៚ 55 KHMER SIGN KOOMUUT 0x17da
+៕ 49 KHMER SIGN BARIYOOSAN 0x17d5
+﹐ 10 SMALL COMMA 0xfe50
+༅ 6 TIBETAN MARK CLOSING YIG MGO SGAB MA 0xf05
+༄ 6 TIBETAN MARK INITIAL YIG MGO MDUN MA 0xf04
+. 2 FULLWIDTH FULL STOP 0xff0e
+﹗ 2 SMALL EXCLAMATION MARK 0xfe57
+﹕ 2 SMALL COLON 0xfe55
+‰ 2 PER MILLE SIGN 0x2030
+・ 1 HALFWIDTH KATAKANA MIDDLE DOT 0xff65
+( 98504 LEFT PARENTHESIS 0x28
+[ 27245 LEFT SQUARE BRACKET 0x5b
+⌊ 1567 LEFT FLOOR 0x230a
+〔 95 LEFT TORTOISE SHELL BRACKET 0x3014
+【 36 LEFT BLACK LENTICULAR BRACKET 0x3010
+﴿ 14 ORNATE RIGHT PARENTHESIS 0xfd3f
+_ 4851 LOW LINE 0x5f
+$ 72 DOLLAR SIGN 0x24
+€ 14 EURO SIGN 0x20ac
+£ 2 POUND SIGN 0xa3
+~ 27462 TILDE 0x7e
+= 11450 EQUALS SIGN 0x3d
+| 8430 VERTICAL LINE 0x7c
+− 3971 MINUS SIGN 0x2212
+≫ 1904 MUCH GREATER-THAN 0x226b
+≪ 1903 MUCH LESS-THAN 0x226a
++ 1450 PLUS SIGN 0x2b
+< 345 FULLWIDTH LESS-THAN SIGN 0xff1c
+> 344 FULLWIDTH GREATER-THAN SIGN 0xff1e
+¬ 5 NOT SIGN 0xac
+× 4 MULTIPLICATION SIGN 0xd7
+→ 2 RIGHTWARDS ARROW 0x2192
+᙭ 537 CANADIAN SYLLABICS CHI SIGN 0x166d
+° 499 DEGREE SIGN 0xb0
+႟ 421 MYANMAR SYMBOL SHAN EXCLAMATION 0x109f
+� 192 REPLACEMENT CHARACTER 0xfffd
+⌟ 54 BOTTOM RIGHT CORNER 0x231f
+⌞ 54 BOTTOM LEFT CORNER 0x231e
+© 2 COPYRIGHT SIGN 0xa9
+ 40 NARROW NO-BREAK SPACE 0x202f
+ 1 SIX-PER-EM SPACE 0x2006
+˜ 40261 SMALL TILDE 0x2dc
+^ 6469 CIRCUMFLEX ACCENT 0x5e
+¯ 20 MACRON 0xaf
+ˇ 191442 CARON 0x2c7
+ⁿ 38144 SUPERSCRIPT LATIN SMALL LETTER N 0x207f
+ـ 9440 ARABIC TATWEEL 0x640
+ๆ 6766 THAI CHARACTER MAIYAMOK 0xe46
+ៗ 3310 KHMER SIGN LEK TOO 0x17d7
+々 678 IDEOGRAPHIC ITERATION MARK 0x3005
+ໆ 430 LAO KO LA 0xec6
+ー 319 KATAKANA-HIRAGANA PROLONGED SOUND MARK 0x30fc
+ⁱ 137 SUPERSCRIPT LATIN SMALL LETTER I 0x2071
+৷ 11056 BENGALI CURRENCY NUMERATOR FOUR 0x9f7
+⅓ 26 VULGAR FRACTION ONE THIRD 0x2153
+½ 26 VULGAR FRACTION ONE HALF 0xbd
+¼ 4 VULGAR FRACTION ONE QUARTER 0xbc
+⅟ 1 FRACTION NUMERATOR ONE 0x215f
+⁄ 57 FRACTION SLASH 0x2044
diff --git a/omnivoice/eval/wer/seedtts.py b/omnivoice/eval/wer/seedtts.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d1ba98f186f8b36a4ab658e4814553c484e684f
--- /dev/null
+++ b/omnivoice/eval/wer/seedtts.py
@@ -0,0 +1,413 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Computes word error rate (WER) with Whisper-large-v3 for English and
+Paraformer for Chinese. Intended to evaluate WERs on Seed-TTS test sets.
+"""
+import argparse
+import logging
+import multiprocessing as mp
+import os
+import string
+import traceback
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
+
+import numpy as np
+import torch
+import zhconv
+from tqdm import tqdm
+from zhon.hanzi import punctuation
+
+from omnivoice.eval.utils import load_eval_waveform
+from omnivoice.eval.wer.common import process_one
+from omnivoice.utils.data_utils import read_test_list
+
+# --- Global variables for worker processes ---
+worker_pipe = None
+worker_device = None
+
+
+def get_parser():
+    """Build the command-line parser for the Seed-TTS WER evaluation script.
+
+    Returns:
+        argparse.ArgumentParser: parser with the options --wav-path,
+        --extension, --decode-path, --model-dir, --test-list, --lang,
+        --batch-size and --nj-per-gpu. --wav-path, --model-dir and --lang
+        are required.
+    """
+    # Options are declared as (flag, keyword-arguments) pairs and registered
+    # in order, so the --help output lists them in this sequence.
+    option_specs = [
+        (
+            "--wav-path",
+            dict(
+                type=str,
+                required=True,
+                help="Path to the directory containing speech files.",
+            ),
+        ),
+        (
+            "--extension",
+            dict(
+                type=str,
+                default="wav",
+                help="Extension of the speech files. Default: wav",
+            ),
+        ),
+        (
+            "--decode-path",
+            dict(
+                type=str,
+                default=None,
+                help="Path to the output file where WER information will be saved. "
+                "If not provided, results are only printed to console.",
+            ),
+        ),
+        (
+            "--model-dir",
+            dict(
+                type=str,
+                required=True,
+                help="Local path of evaluation models repository. "
+                "Download from https://huggingface.co/k2-fsa/TTS_eval_models. "
+                "This script expects 'tts_eval_models/wer/whisper-large-v3/' for English "
+                "and 'tts_eval_models/wer/paraformer-zh/' for Chinese within this directory.",
+            ),
+        ),
+        (
+            "--test-list",
+            dict(
+                type=str,
+                default="test.jsonl",
+                help="path of the JSONL test list. Each line is a JSON object "
+                "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
+            ),
+        ),
+        (
+            "--lang",
+            dict(
+                type=str,
+                choices=["zh", "en"],
+                required=True,
+                help="Language of the audio and transcripts for "
+                "decoding ('zh' for Chinese or 'en' for English).",
+            ),
+        ),
+        (
+            "--batch-size",
+            dict(
+                type=int,
+                default=16,
+                help="Batch size for decoding with the Hugging Face pipeline.",
+            ),
+        ),
+        (
+            "--nj-per-gpu",
+            dict(type=int, default=1, help="Number of workers per GPU."),
+        ),
+    ]
+
+    cli = argparse.ArgumentParser(
+        description="Computes WER with Whisper/Paraformer.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    for flag, options in option_specs:
+        cli.add_argument(flag, **options)
+    return cli
+
+
+def load_whisper_model(model_dir, device):
+    """Create the Whisper ASR pipeline used for English transcription.
+
+    Args:
+        model_dir: Root of the evaluation-models directory; the checkpoint is
+            looked up at ``<model_dir>/wer/whisper-large-v3/``.
+        device: Device handed to the pipeline. float16 is selected when its
+            string form contains "cuda", float32 otherwise.
+
+    Returns:
+        The transformers "automatic-speech-recognition" pipeline, or None
+        (after logging an error) when the checkpoint directory is missing.
+    """
+    whisper_dir = os.path.join(model_dir, "wer/whisper-large-v3/")
+    if os.path.exists(whisper_dir):
+        logging.debug(f"Loading Whisper model on {device}...")
+
+        # Imported lazily so the dependency is only needed on this path.
+        import transformers
+
+        # Suppress transformers logging
+        transformers.logging.set_verbosity_error()
+
+        on_cuda = "cuda" in str(device)
+        return transformers.pipeline(
+            "automatic-speech-recognition",
+            model=whisper_dir,
+            dtype=torch.float16 if on_cuda else torch.float32,
+            device=device,
+        )
+
+    logging.error(f"Whisper model not found at {whisper_dir}.")
+    return None
+
+
+def load_paraformer_model(model_dir, device):
+    """Create the Paraformer model used for Chinese transcription.
+
+    Args:
+        model_dir: Root of the evaluation-models directory; the checkpoint is
+            looked up at ``<model_dir>/wer/paraformer-zh/``.
+        device: Target device; passed to FunASR in its string form.
+
+    Returns:
+        The FunASR ``AutoModel`` instance, or None (after logging an error)
+        when the checkpoint directory is missing.
+    """
+    paraformer_dir = os.path.join(model_dir, "wer/paraformer-zh/")
+    if os.path.exists(paraformer_dir):
+        logging.debug(f"Loading Paraformer model on {device}...")
+
+        # Silence all logging while the model loads, then restore whatever
+        # disable level was active before, even if loading fails.
+        saved_disable_level = logging.root.manager.disable
+        logging.disable(logging.CRITICAL)
+        try:
+            from funasr import AutoModel
+
+            # FunASR AutoModel accepts "cuda:0" string or torch.device
+            return AutoModel(
+                model=paraformer_dir,
+                device=str(device),
+                disable_update=True,
+                disable_pbar=True,
+                verbose=False,
+            )
+        finally:
+            logging.disable(saved_disable_level)
+
+    logging.error(f"Paraformer model not found at {paraformer_dir}.")
+    return None
+
+
+def post_process(text: str, lang: str) -> str:
+    """
+    Cleans and normalizes text for WER calculation.
+
+    All Chinese and ASCII punctuation except the apostrophe is deleted and
+    double spaces are collapsed. Chinese text is then split into
+    space-separated characters; English text is lower-cased.
+
+    Args:
+        text (str): The input text to be processed.
+        lang (str): The language of the input text ("zh" or "en").
+
+    Returns:
+        str: The cleaned and normalized text.
+
+    Raises:
+        NotImplementedError: If ``lang`` is neither "zh" nor "en".
+    """
+    # Delete every punctuation character except "'" in a single pass instead
+    # of one str.replace call per character.
+    strip_table = {
+        ord(ch): None for ch in punctuation + string.punctuation if ch != "'"
+    }
+    text = text.translate(strip_table)
+
+    # Collapse double spaces (typically left behind by removed punctuation).
+    # The two-space literal is spelled as `" " * 2` so it cannot silently
+    # degrade into the no-op replace(" ", " ").
+    text = text.replace(" " * 2, " ")
+
+    if lang == "zh":
+        # One token per character so the WER is computed at character level.
+        return " ".join(text)
+    if lang == "en":
+        return text.lower()
+    raise NotImplementedError
+
+
+def process_init(rank_queue, model_dir, lang):
+    """
+    Per-process initializer for the worker pool.
+
+    Takes one GPU rank from ``rank_queue``, binds the process to that CUDA
+    device and loads the ASR model for ``lang`` into the module-level
+    ``worker_pipe`` ("en" -> Whisper, "zh" -> Paraformer).
+
+    Raises:
+        RuntimeError: If no rank can be fetched from the queue, or if no
+            model was loaded.
+        AssertionError: If CUDA is not available.
+    """
+    global worker_pipe, worker_device
+
+    torch.set_num_threads(2)
+
+    try:
+        gpu_rank = rank_queue.get(timeout=10)
+    except Exception:
+        raise RuntimeError("Failed to get GPU rank from queue.")
+
+    assert torch.cuda.is_available(), "CUDA is required but not available."
+    worker_device = torch.device(f"cuda:{gpu_rank}")
+    torch.cuda.set_device(gpu_rank)
+
+    logging.info(f"Initializing worker on device: {worker_device}")
+
+    try:
+        loaders = {"en": load_whisper_model, "zh": load_paraformer_model}
+        load_fn = loaders.get(lang)
+        # An unknown language leaves worker_pipe untouched, so the None
+        # check below reports it as a loading failure.
+        if load_fn is not None:
+            worker_pipe = load_fn(model_dir, worker_device)
+        if worker_pipe is None:
+            raise RuntimeError("Model loading failed.")
+    except Exception as err:
+        logging.critical(f"Failed to load model on {worker_device}: {err}")
+        raise
+
+
+def run_eval_worker(data_chunk, lang, batch_size):
+    """
+    Transcribe one chunk of items and score each against its reference.
+
+    Relies on the module-level ``worker_pipe`` set up by ``process_init``.
+
+    Args:
+        data_chunk: List of dicts with "wav_path" and "truth_text" keys.
+        lang: "en" (pipeline call on in-memory waveforms) or "zh"
+            (``generate`` call on file paths).
+        batch_size: Batch size forwarded to the recognizer.
+
+    Returns:
+        A list with one metrics dict per item (as returned by
+        ``process_one``, plus a "wav_path" key). An empty list is returned
+        when the worker is uninitialized or when anything in the chunk fails.
+    """
+    global worker_pipe
+    if worker_pipe is None:
+        logging.error("Worker pipeline is not initialized!")
+        return []
+
+    scored = []
+
+    def _score(hypothesis, ref_item):
+        # Compare one hypothesis with its reference and remember the result.
+        truth = ref_item["truth_text"]
+        wav_path = ref_item["wav_path"]
+        metrics = process_one(hypothesis, truth, post_process, lang)
+        metrics["wav_path"] = wav_path
+        scored.append(metrics)
+
+    try:
+        if lang == "en":
+            # Load waveforms as arrays, truncating to 30s
+            max_samples = 16000 * 30
+            dataset = []
+            for item in data_chunk:
+                waveform = load_eval_waveform(
+                    item["wav_path"], sample_rate=16000, return_numpy=True
+                )
+                dataset.append(
+                    {"array": waveform[:max_samples], "sampling_rate": 16000}
+                )
+
+            outputs = worker_pipe(
+                dataset,
+                generate_kwargs={"language": "english", "task": "transcribe"},
+                batch_size=batch_size,
+            )
+            # Outputs are matched to their inputs by position.
+            for idx, out in enumerate(outputs):
+                _score(out["text"].strip(), data_chunk[idx])
+
+        elif lang == "zh":
+            all_paths = [item["wav_path"] for item in data_chunk]
+
+            for start in range(0, len(all_paths), batch_size):
+                res_batch = worker_pipe.generate(
+                    input=all_paths[start : start + batch_size],
+                    batch_size=batch_size,
+                    disable_pbar=True,
+                )
+                for offset, res in enumerate(res_batch):
+                    # Convert the hypothesis to Simplified Chinese first.
+                    hypothesis = zhconv.convert(res["text"], "zh-cn")
+                    _score(hypothesis, data_chunk[start + offset])
+
+    except Exception:
+        logging.error(
+            f"Worker failed on chunk (Lang: {lang}):\n{traceback.format_exc()}"
+        )
+        return []
+
+    return scored
+
+
+def main():
+    """Run the Seed-TTS WER evaluation end to end.
+
+    Reads the test list, keeps the items whose audio file exists, spreads
+    them over one worker pool per GPU slot, then aggregates the per-file
+    metrics into an average-of-WERs and a word-weighted WER. Results are
+    logged and, when --decode-path is given, also written to that file.
+    """
+    parser = get_parser()
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+        level=logging.INFO,
+        force=True,
+    )
+
+    logging.info(f"Calculating WER for {args.wav_path}")
+
+    # 1. Prepare Data
+    logging.info("Reading test list...")
+    data_list = []
+    samples = read_test_list(args.test_list)
+    for s in samples:
+        wav_path = str(Path(args.wav_path) / f"{s['id']}.{args.extension}")
+        if not os.path.exists(wav_path):
+            logging.warning(f"File missing: {wav_path}")
+            continue
+        data_list.append({"wav_path": wav_path, "truth_text": s["text"]})
+    total_files = len(data_list)
+    logging.info(f"Total files: {total_files}.")
+
+    # 2. Worker config
+    num_gpus = torch.cuda.device_count()
+    assert num_gpus > 0, "No GPU found. GPU is required."
+    total_workers = num_gpus * args.nj_per_gpu
+
+    mp.set_start_method("spawn", force=True)
+    manager = mp.Manager()
+    rank_queue = manager.Queue()
+
+    # One queue entry per worker; each worker initializer pops one rank.
+    for _ in range(args.nj_per_gpu):
+        for rank in range(num_gpus):
+            rank_queue.put(rank)
+
+    # 3. Scheduling: Split data into chunks for better load balancing
+    chunk_size = max(1, args.batch_size)
+    tasks = [
+        data_list[i : i + chunk_size] for i in range(0, total_files, chunk_size)
+    ]
+
+    logging.info(
+        f"Split data into {len(tasks)} chunks (size ~{chunk_size}). "
+        f"Spawning {total_workers} workers."
+    )
+
+    # 4. Execution
+    results = []
+
+    with ProcessPoolExecutor(
+        max_workers=total_workers,
+        initializer=process_init,
+        initargs=(rank_queue, args.model_dir, args.lang),
+    ) as executor:
+
+        futures = [
+            executor.submit(run_eval_worker, chunk, args.lang, args.batch_size)
+            for chunk in tasks
+        ]
+
+        # Unified progress bar
+        with tqdm(total=total_files, desc="Eval Progress", dynamic_ncols=True) as pbar:
+            for future in as_completed(futures):
+                try:
+                    chunk_metrics = future.result()
+                    results.extend(chunk_metrics)
+                    pbar.update(len(chunk_metrics))
+                except Exception as e:
+                    logging.error(f"Task failed: {e}")
+
+    # 5. Aggregation and reporting
+    wers, inses, deles, subses = [], [], [], []
+    word_nums = 0
+
+    fout = None
+    if args.decode_path:
+        # dirname is "" for a bare filename and os.makedirs("") raises
+        # FileNotFoundError, so only create a directory when one is named.
+        out_dir = os.path.dirname(args.decode_path)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        fout = open(args.decode_path, "w", encoding="utf8")
+
+    # try/finally guarantees the output file is closed even if a result
+    # record is malformed and aggregation raises.
+    try:
+        if fout:
+            logging.info(f"Saving detailed WER results to: {args.decode_path}")
+            fout.write(
+                "Name\tWER\tTruth\tHypothesis\tInsertions\tDeletions\tSubstitutions\n"
+            )
+
+        for res in results:
+            wers.append(float(res["wer"]))
+            inses.append(float(res["insertions"]))
+            deles.append(float(res["deletions"]))
+            subses.append(float(res["substitutions"]))
+            word_nums += res["word_num"]
+
+            if fout:
+                fout.write(
+                    f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
+                    f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
+                    f"{res['substitutions']}\n"
+                )
+
+        # Unweighted mean of the per-file WERs (the Seed-TTS convention).
+        wer_avg = round(np.mean(wers) * 100, 2) if wers else float("nan")
+        # Total errors divided by total reference words.
+        wer_weighted = (
+            round(
+                (np.sum(subses) + np.sum(deles) + np.sum(inses)) / word_nums * 100, 2
+            )
+            if word_nums > 0
+            else float("nan")
+        )
+
+        inse_sum = np.sum(inses)
+        dele_sum = np.sum(deles)
+        subs_sum = np.sum(subses)
+
+        print("-" * 50)
+        logging.info(f"Processed {len(results)}/{total_files} files.")
+        seedtts_wer_info = f"Seed-TTS WER (Avg of WERs): {wer_avg}%"
+        wer_info = f"WER (Weighted): {wer_weighted}%"
+        detailed_info = (
+            f"Errors: {inse_sum} ins, {dele_sum} del, {subs_sum} sub / {word_nums} words"
+        )
+        logging.info(seedtts_wer_info)
+        logging.info(wer_info)
+        logging.info(detailed_info)
+        print("-" * 50)
+
+        if fout:
+            fout.write(seedtts_wer_info + "\n" + wer_info + "\n" + detailed_info + "\n")
+    finally:
+        if fout:
+            fout.close()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/omnivoice/eval/wer/sensevoice.py b/omnivoice/eval/wer/sensevoice.py
new file mode 100644
index 0000000000000000000000000000000000000000..8490c22e6809e2a3e0d4723312d9481d0bbf545d
--- /dev/null
+++ b/omnivoice/eval/wer/sensevoice.py
@@ -0,0 +1,344 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Computes Character Error Rate (CER) for Cantonese (yue) using SenseVoiceSmall.
+"""
+
+import argparse
+import logging
+import multiprocessing as mp
+import os
+import re
+import traceback
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from pathlib import Path
+
+import cn2an
+import torch
+import zhconv
+from tqdm import tqdm
+
+from omnivoice.eval.wer.common import log_metrics, process_one
+from omnivoice.eval.wer.text_norm_omni import text_normalize
+from omnivoice.utils.data_utils import read_test_list
+
+# --- Global variables for worker processes ---
+worker_sensevoice = None
+worker_device = None
+
+
+def get_parser():
+    """Build the command-line parser for the Cantonese CER evaluation script.
+
+    Returns:
+        argparse.ArgumentParser: parser with the options --wav-path,
+        --extension, --decode-path, --model-dir, --test-list, --batch-size,
+        --nj-per-gpu and --chunk-size. --wav-path and --model-dir are
+        required.
+    """
+    # Options are declared as (flag, keyword-arguments) pairs and registered
+    # in order, so the --help output lists them in this sequence.
+    option_specs = [
+        (
+            "--wav-path",
+            dict(
+                type=str,
+                required=True,
+                help="Path to the directory containing speech files.",
+            ),
+        ),
+        (
+            "--extension",
+            dict(
+                type=str,
+                default="wav",
+                help="Extension of the speech files. Default: wav",
+            ),
+        ),
+        (
+            "--decode-path",
+            dict(
+                type=str,
+                default=None,
+                help="Path to the output file where CER information will be saved. ",
+            ),
+        ),
+        (
+            "--model-dir",
+            dict(
+                type=str,
+                required=True,
+                help="Local path of evaluation models repository. ",
+            ),
+        ),
+        (
+            "--test-list",
+            dict(
+                type=str,
+                default="test.jsonl",
+                help="path of the JSONL test list.",
+            ),
+        ),
+        (
+            "--batch-size",
+            dict(type=int, default=16, help="Batch size for decoding."),
+        ),
+        (
+            "--nj-per-gpu",
+            dict(type=int, default=1, help="Number of workers per GPU."),
+        ),
+        (
+            "--chunk-size",
+            dict(
+                type=int,
+                default=10,
+                help="Number of samples per task chunk sent to workers.",
+            ),
+        ),
+    ]
+
+    cli = argparse.ArgumentParser(
+        description="Computes CER for Cantonese using SenseVoiceSmall.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    for flag, options in option_specs:
+        cli.add_argument(flag, **options)
+    return cli
+
+
+def load_sensevoice_model(model_dir, device):
+    """Create the SenseVoiceSmall model used for Cantonese transcription.
+
+    The local checkpoint at ``<model_dir>/wer/SenseVoiceSmall`` is used when
+    it exists. Otherwise a warning is logged and the model id
+    "iic/SenseVoiceSmall" is passed to FunASR instead.
+
+    Args:
+        model_dir: Root of the evaluation-models directory.
+        device: Target device; passed to FunASR in its string form.
+
+    Returns:
+        The FunASR ``AutoModel`` instance.
+    """
+    model_path = os.path.join(model_dir, "wer/SenseVoiceSmall")
+    if os.path.exists(model_path):
+        # Previously the local path was checked but never used, so the hub
+        # id was loaded even when --model-dir contained the checkpoint.
+        model_ref = model_path
+    else:
+        # Fallback if specific sensevoice spelling isn't found
+        logging.warning(
+            f"SenseVoiceSmall not found at {model_path}. "
+            f"Please ensure it is present in eval models."
+        )
+        model_ref = "iic/SenseVoiceSmall"
+
+    logging.info(f"Loading SenseVoice model on {device}...")
+
+    # Silence all logging while the model loads, then restore whatever
+    # disable level was active before, even if loading fails.
+    previous_level = logging.root.manager.disable
+    logging.disable(logging.CRITICAL)
+
+    try:
+        from funasr import AutoModel
+
+        model = AutoModel(
+            model=model_ref,
+            device=str(device),
+            disable_update=True,
+            disable_pbar=True,
+            verbose=False,
+        )
+    finally:
+        logging.disable(previous_level)
+
+    return model
+
+
+def _worker_setup(rank_queue):
+    """Bind the current worker process to one GPU.
+
+    Pops a rank from ``rank_queue`` (waiting up to 10 seconds), stores the
+    matching CUDA device in the module-level ``worker_device`` and makes it
+    the current CUDA device.
+
+    Raises:
+        RuntimeError: If no rank can be fetched from the queue.
+        AssertionError: If CUDA is not available.
+    """
+    global worker_device
+
+    # Limit torch's intra-op threads for this worker process.
+    torch.set_num_threads(2)
+
+    try:
+        gpu_rank = rank_queue.get(timeout=10)
+    except Exception:
+        raise RuntimeError("Failed to get GPU rank from queue.")
+
+    assert torch.cuda.is_available(), "CUDA is required but not available."
+    device = torch.device(f"cuda:{gpu_rank}")
+    worker_device = device
+    torch.cuda.set_device(gpu_rank)
+
+    logging.info(f"Initializing worker on device: {worker_device}")
+
+
+def process_init_sensevoice(rank_queue, model_dir):
+    """Per-process initializer: pick a GPU, then load SenseVoice onto it.
+
+    The loaded model is stored in the module-level ``worker_sensevoice``.
+    Any failure while loading is logged at CRITICAL level and re-raised.
+
+    Raises:
+        RuntimeError: If the loader returns None.
+    """
+    global worker_sensevoice
+
+    _worker_setup(rank_queue)
+
+    try:
+        loaded = load_sensevoice_model(model_dir, worker_device)
+        worker_sensevoice = loaded
+        if loaded is None:
+            raise RuntimeError("SenseVoice model loading failed.")
+    except Exception as err:
+        logging.critical(f"Failed to load SenseVoice model on {worker_device}: {err}")
+        raise
+
+
+def post_process(text: str, lang: str) -> str:
+    """
+    Normalize Cantonese text into space-separated characters for CER scoring.
+
+    Steps, in order: ``text_normalize`` with iso_code "yue", conversion with
+    zhconv to "zh-cn", ``cn2an.transform`` in "an2cn" mode, removal of
+    spaces, splitting into single characters joined by spaces, lower-casing.
+
+    Args:
+        text (str): The text to normalize.
+        lang (str): Must be "yue"; anything else fails the assertion.
+
+    Returns:
+        str: The normalized text, stripped of surrounding whitespace.
+    """
+    assert lang == "yue", "this script is designed for Cantonese (yue) evaluation only."
+
+    normalized = text_normalize(
+        text,
+        iso_code="yue",
+        lower_case=True,
+        remove_numbers=False,
+        remove_brackets=False,
+    )
+    simplified = zhconv.convert(normalized, "zh-cn")
+    spelled_out = cn2an.transform(simplified, "an2cn")
+
+    # Drop spaces first so that every remaining character becomes one token.
+    compact = spelled_out.replace(" ", "")
+    spaced = " ".join(compact)
+    return spaced.lower().strip()
+
+
+def run_eval_worker_sensevoice(data_chunk, batch_size):
+    """Transcribe one chunk with SenseVoice and score it against references.
+
+    Relies on the module-level ``worker_sensevoice`` set up by
+    ``process_init_sensevoice``.
+
+    Args:
+        data_chunk: List of dicts with "wav_path" and "truth_text" keys and
+            an optional "lang_name" key.
+        batch_size: Number of files sent to the model per ``generate`` call.
+
+    Returns:
+        A list with one metrics dict per item (as returned by
+        ``process_one``, plus "wav_path" and "lang_name"). An empty list is
+        returned when the worker is uninitialized or when anything in the
+        chunk fails.
+    """
+    global worker_sensevoice
+    if worker_sensevoice is None:
+        logging.error("SenseVoice worker pipeline is not initialized!")
+        return []
+
+    collected = []
+    try:
+        all_paths = [entry["wav_path"] for entry in data_chunk]
+
+        for start in range(0, len(all_paths), batch_size):
+            # SenseVoice generate call, target lang mapped to yue
+            outputs = worker_sensevoice.generate(
+                input=all_paths[start : start + batch_size],
+                batch_size=batch_size,
+                language="yue",
+                use_itn=False,
+                disable_pbar=True,
+            )
+
+            # Outputs are matched to their inputs by position in the batch.
+            for offset, out in enumerate(outputs):
+                # SenseVoice may format output with language tags,
+                # cleaning basic tags if any
+                hypothesis = re.sub(r"<\|[^|]*\|>", "", out["text"]).strip()
+
+                ref_item = data_chunk[start + offset]
+                truth = ref_item["truth_text"]
+                wav_path = ref_item["wav_path"]
+                lang_name = ref_item.get("lang_name")
+
+                metrics = process_one(hypothesis, truth, post_process, "yue")
+                metrics["wav_path"] = wav_path
+                metrics["lang_name"] = lang_name
+                collected.append(metrics)
+
+    except Exception:
+        logging.error(f"SenseVoice worker failed on chunk:\n{traceback.format_exc()}")
+        return []
+
+    return collected
+
+
+def main():
+    """Run the Cantonese CER evaluation end to end.
+
+    Reads the test list, keeps the "yue" items whose audio file exists,
+    spreads them over a pool of GPU workers and aggregates the per-file
+    error counts. Results are logged and, when --decode-path is given, also
+    written to that file. Exits early when there is nothing to evaluate.
+    """
+    parser = get_parser()
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+        level=logging.INFO,
+        force=True,
+    )
+
+    logging.info("Reading test list and filtering for Cantonese (yue)...")
+    yue_items = []
+    wav_root = Path(args.wav_path)
+
+    samples = read_test_list(args.test_list)
+    for s in samples:
+        lang_id = s.get("language_id", "")
+        if lang_id != "yue":
+            continue
+
+        wav_path = str(wav_root / f"{s['id']}.{args.extension}")
+        if not os.path.exists(wav_path):
+            logging.warning(f"File missing: {wav_path}")
+            continue
+
+        yue_items.append(
+            {
+                "wav_path": wav_path,
+                "truth_text": s["text"],
+                "lang_id": "yue",
+                "lang_name": s.get("language_name", "Cantonese"),
+            }
+        )
+
+    logging.info(f"Total Cantonese files found: {len(yue_items)}.")
+    if len(yue_items) == 0:
+        logging.warning("No files to evaluate. Exiting.")
+        return
+
+    num_gpus = torch.cuda.device_count()
+    assert num_gpus > 0, "No GPU found. GPU is required."
+    total_workers = num_gpus * args.nj_per_gpu
+
+    mp.set_start_method("spawn", force=True)
+    manager = mp.Manager()
+
+    chunk_size = args.chunk_size
+    tasks = [
+        yue_items[i : i + chunk_size] for i in range(0, len(yue_items), chunk_size)
+    ]
+
+    results = []
+    # One queue entry per worker; each worker initializer pops one rank.
+    rank_queue = manager.Queue()
+    for _ in range(args.nj_per_gpu):
+        for rank in range(num_gpus):
+            rank_queue.put(rank)
+
+    with ProcessPoolExecutor(
+        max_workers=total_workers,
+        initializer=process_init_sensevoice,
+        initargs=(rank_queue, args.model_dir),
+    ) as executor:
+
+        futures = [
+            executor.submit(run_eval_worker_sensevoice, chunk, args.batch_size)
+            for chunk in tasks
+        ]
+
+        with tqdm(
+            total=len(yue_items),
+            desc="SenseVoice Eval (Cantonese)",
+            dynamic_ncols=True,
+        ) as pbar:
+            for future in as_completed(futures):
+                try:
+                    chunk_metrics = future.result()
+                    results.extend(chunk_metrics)
+                    pbar.update(len(chunk_metrics))
+                except Exception as e:
+                    logging.error(f"Task failed: {e}")
+
+    # Metrics Aggregation
+    inses, deles, subses = [], [], []
+    word_nums = 0
+
+    fout = None
+    if args.decode_path:
+        # dirname is "" for a bare filename and os.makedirs("") raises
+        # FileNotFoundError, so only create a directory when one is named.
+        out_dir = os.path.dirname(args.decode_path)
+        if out_dir:
+            os.makedirs(out_dir, exist_ok=True)
+        logging.info(f"Saving detailed CER results to: {args.decode_path}")
+        fout = open(args.decode_path, "w", encoding="utf-8")
+
+    # try/finally guarantees the output file is closed even if a result
+    # record is malformed and aggregation raises.
+    try:
+        for res in results:
+            inses.append(float(res["insertions"]))
+            deles.append(float(res["deletions"]))
+            subses.append(float(res["substitutions"]))
+            word_nums += res["word_num"]
+
+            if fout:
+                fout.write(
+                    f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
+                    f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
+                    f"{res['substitutions']}\n"
+                )
+
+        print("-" * 50)
+        if word_nums > 0:
+            log_metrics(fout, "[yue] Cantonese", inses, deles, subses, word_nums)
+    finally:
+        if fout:
+            fout.close()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/omnivoice/eval/wer/text_norm_omni.py b/omnivoice/eval/wer/text_norm_omni.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc435e8e93812793e7d5d249438f8cf23114b801
--- /dev/null
+++ b/omnivoice/eval/wer/text_norm_omni.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+This module contains the text normalization function for WER evaluation.
+Copied from https://github.com/facebookresearch/omnilingual-asr/blob/81f51e224ce9e74b02cc2a3eaf21b2d91d743455/workflows/dataprep/text_tools.py
+"""
+
+import re
+import unicodedata
+
+from unidecode import unidecode
+
+import omnivoice.eval.wer.norm_config_module as norm_config_module
+
+norm_config = norm_config_module.norm_config # type: ignore
+
+
+def text_normalize(
+    text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False
+):
+    """Normalize ``text`` for WER evaluation.
+
+    The steps are applied in this order: Unicode normalization, optional
+    lower-casing, removal of bracketed spans, regex mappings, punctuation
+    replacement, character deletion, optional removal of digit-only words,
+    optional diacritics stripping and whitespace collapsing.
+
+    Args:
+        text: The string to be normalized.
+        iso_code: Key into ``norm_config`` selecting the language-specific
+            settings. Unknown codes use the ``"*"`` entry; settings missing
+            from a language entry are taken from ``"*"`` as well.
+        lower_case: Lower-case the text. Only has an effect when the selected
+            config also enables ``lower_case``.
+        remove_numbers: Remove whitespace-delimited words made only of
+            characters from the config's ``digit_set``.
+        remove_brackets: Remove every ``(...)`` span. Spans that contain a
+            digit are removed regardless of this flag.
+
+    Returns:
+        The normalized string.
+
+    Raises:
+        KeyError: If a required setting is present neither in the selected
+            entry nor in the ``"*"`` entry.
+    """
+    defaults = norm_config["*"]
+    # Overlay the language entry on the defaults in a fresh dict. This fills in
+    # every missing setting (including ``rm_diacritics``, which is read below)
+    # without writing back into the shared module-level ``norm_config``.
+    config = {**defaults, **norm_config.get(iso_code, defaults)}
+
+    text = unicodedata.normalize(config["unicode_norm"], text)
+
+    # Convert to lower case (both the config and the caller must allow it).
+    if config["lower_case"] and lower_case:
+        text = text.lower()
+
+    # Always remove text inside brackets that contains a number; it usually
+    # corresponds to references such as "(Sam 23:17)".
+    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
+    if remove_brackets:
+        text = re.sub(r"\([^\)]*\)", " ", text)
+
+    # Apply the configured mappings (keys are used as regex patterns).
+    for old, new in config["mapping"].items():
+        text = re.sub(old, new, text)
+
+    # Replace punctuation with spaces.
+    punct_pattern = r"[" + config["punc_set"] + "]"
+    normalized_text = re.sub(punct_pattern, " ", text)
+
+    # Remove characters in the delete list.
+    delete_pattern = r"[" + config["del_set"] + "]"
+    normalized_text = re.sub(delete_pattern, "", normalized_text)
+
+    if remove_numbers:
+        # Remove words containing only digits. Three cases are covered:
+        # a) the text starts with a number, b) a number appears in the middle,
+        # c) the text ends with a number. Lookarounds require whitespace next
+        # to the digits and allow overlapping matches to be replaced.
+        digits_pattern = "[" + config["digit_set"] + "]+"
+        complete_digit_pattern = (
+            r"^"
+            + digits_pattern
+            + r"(?=\s)|(?<=\s)"
+            + digits_pattern
+            + r"(?=\s)|(?<=\s)"
+            + digits_pattern
+            + "$"
+        )
+        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)
+
+    if config["rm_diacritics"]:
+        normalized_text = unidecode(normalized_text)
+
+    # Collapse runs of whitespace and trim the ends.
+    normalized_text = re.sub(r"\s+", " ", normalized_text).strip()
+
+    return normalized_text
diff --git a/omnivoice/models/__init__.py b/omnivoice/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/omnivoice/models/omnivoice.py b/omnivoice/models/omnivoice.py
new file mode 100644
index 0000000000000000000000000000000000000000..96673eb84cbe084cbb8eb3127b34d721365c1584
--- /dev/null
+++ b/omnivoice/models/omnivoice.py
@@ -0,0 +1,1598 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Core OmniVoice model implementation.
+
+Defines the ``OmniVoice`` model class, generation config, and inference pipeline.
+This is the main entry point for both inference and training:
+
+- **Inference**: ``OmniVoice.from_pretrained()`` loads the model, then
+ ``model.generate()`` supports voice cloning, voice design, and auto voice.
+- **Training**: ``model.forward()`` computes the training loss; the model is
+ built and used by ``omnivoice.training.builder`` and ``omnivoice.training.trainer``.
+
+"""
+
+import difflib
+import logging
+import math
+import os
+import re
+from dataclasses import dataclass, fields
+from functools import partial
+from typing import Any, List, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio
+
+try:
+ from torch.nn.attention.flex_attention import create_block_mask
+
+ _flex_attention_available = True
+except ImportError:
+ _flex_attention_available = False
+from transformers import (
+ AutoFeatureExtractor,
+ AutoModel,
+ AutoTokenizer,
+ HiggsAudioV2TokenizerModel,
+ PretrainedConfig,
+ PreTrainedModel,
+)
+from transformers.modeling_outputs import ModelOutput
+from transformers.models.auto import CONFIG_MAPPING, AutoConfig
+
+from omnivoice.utils.audio import (
+ cross_fade_chunks,
+ fade_and_pad_audio,
+ load_audio,
+ remove_silence,
+ trim_long_audio,
+)
+from omnivoice.utils.duration import RuleDurationEstimator
+from omnivoice.utils.lang_map import LANG_IDS, LANG_NAMES
+from omnivoice.utils.text import add_punctuation, chunk_text_punctuation
+from omnivoice.utils.voice_design import (
+ _INSTRUCT_ALL_VALID,
+ _INSTRUCT_EN_TO_ZH,
+ _INSTRUCT_MUTUALLY_EXCLUSIVE,
+ _INSTRUCT_VALID_EN,
+ _INSTRUCT_VALID_ZH,
+ _INSTRUCT_ZH_TO_EN,
+ _ZH_RE,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Dataclasses
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class VoiceClonePrompt:
+ """Reusable voice-cloning prompt built by ``OmniVoice.create_voice_clone_prompt``."""
+
+ # Reference audio encoded by the audio tokenizer.
+ ref_audio_tokens: torch.Tensor # (C, T)
+ # Transcript of the reference audio (user-provided or auto-transcribed).
+ ref_text: str
+ # RMS of the reference waveform, measured before any volume boost.
+ ref_rms: float
+
+
+@dataclass
+class OmniVoiceGenerationConfig:
+    """Tunable options for ``OmniVoice.generate``."""
+
+    # Number of iterative decoding steps.
+    num_step: int = 32
+    # Classifier-free guidance scale.
+    guidance_scale: float = 2.0
+    # Time-step shift.
+    t_shift: float = 0.1
+    # Penalty encouraging earlier codebook layers to unmask first.
+    layer_penalty_factor: float = 5.0
+    # Temperature for position selection.
+    position_temperature: float = 5.0
+    # Temperature for token sampling (0 = greedy).
+    class_temperature: float = 0.0
+    # Whether to prepend the denoise token.
+    denoise: bool = True
+    # Forwarded to create_voice_clone_prompt as ``preprocess_prompt``.
+    preprocess_prompt: bool = True
+    # Whether the generated audio is post-processed.
+    postprocess_output: bool = True
+    # Target duration (seconds) of each chunk when long text is split.
+    audio_chunk_duration: float = 15.0
+    # Items whose estimated duration (seconds) exceeds this are chunked.
+    audio_chunk_threshold: float = 30.0
+
+    @classmethod
+    def from_dict(cls, kwargs_dict):
+        """Build a config from ``kwargs_dict``, ignoring keys that are not fields."""
+        known_names = set()
+        for spec in fields(cls):
+            known_names.add(spec.name)
+
+        accepted = {}
+        for key, value in kwargs_dict.items():
+            if key in known_names:
+                accepted[key] = value
+        return cls(**accepted)
+
+
+@dataclass
+class GenerationTask:
+    """A batch of generation requests stored as parallel per-item lists."""
+
+    batch_size: int
+    texts: List[str]
+    target_lens: List[int]
+    langs: List[Optional[str]]
+    instructs: List[Optional[str]]
+    ref_texts: List[Optional[str]]
+    ref_audio_tokens: List[Optional[torch.Tensor]]
+    ref_rms: List[Optional[float]]
+    speed: Optional[List[float]] = None
+
+    def get_indices(self, config: OmniVoiceGenerationConfig, frame_rate: int):
+        """Partition item indices by target length.
+
+        The cut-off is ``int(config.audio_chunk_threshold * frame_rate)``.
+
+        Returns:
+            ``(short_idx, long_idx)``: indices whose target length is at most
+            the cut-off, and indices whose target length exceeds it.
+        """
+        limit = int(config.audio_chunk_threshold * frame_rate)
+        short_idx, long_idx = [], []
+        for pos, length in enumerate(self.target_lens):
+            bucket = long_idx if length > limit else short_idx
+            bucket.append(pos)
+        return short_idx, long_idx
+
+    def slice_task(self, indices: List[int]):
+        """Return a new task holding only the items at ``indices``.
+
+        Returns ``None`` when ``indices`` is empty.
+        """
+        if not indices:
+            return None
+
+        def pick(values):
+            # Gather the selected entries of one per-item list, in order.
+            return [values[i] for i in indices]
+
+        return GenerationTask(
+            batch_size=len(indices),
+            texts=pick(self.texts),
+            target_lens=pick(self.target_lens),
+            langs=pick(self.langs),
+            instructs=pick(self.instructs),
+            ref_texts=pick(self.ref_texts),
+            ref_audio_tokens=pick(self.ref_audio_tokens),
+            ref_rms=pick(self.ref_rms),
+            speed=pick(self.speed) if self.speed else None,
+        )
+
+
+@dataclass
+class OmniVoiceModelOutput(ModelOutput):
+ """Output container returned by ``OmniVoice.forward``."""
+
+ # Weighted per-codebook loss; stays None when forward() receives no labels.
+ loss: Optional[torch.Tensor] = None
+ # Audio logits, arranged by forward() as [Batch, Codebook, Seq, Vocab].
+ logits: Optional[torch.Tensor] = None
+
+
+# ---------------------------------------------------------------------------
+# Config & Model
+# ---------------------------------------------------------------------------
+
+
+class OmniVoiceConfig(PretrainedConfig):
+ """Configuration for :class:`OmniVoice`: audio codebook layout plus the LLM config."""
+
+ model_type = "omnivoice"
+ sub_configs = {"llm_config": AutoConfig}
+
+ def __init__(
+ self,
+ audio_vocab_size: int = 1025,
+ audio_mask_id: int = 1024,
+ num_audio_codebook: int = 8,
+ audio_codebook_weights: Optional[list[float]] = None,
+ llm_config: Optional[Union[dict, PretrainedConfig]] = None,
+ **kwargs,
+ ):
+ """Store the audio settings and the (possibly dict-encoded) LLM config.
+
+ Args:
+ audio_vocab_size: Number of entries per audio codebook.
+ audio_mask_id: Token ID stored as the audio mask ID.
+ num_audio_codebook: Number of audio codebook layers.
+ audio_codebook_weights: Per-codebook loss weights; defaults to
+ ``[8, 8, 6, 6, 4, 4, 2, 2]`` when ``None``.
+ llm_config: LLM config object, or a dict that is converted through
+ ``CONFIG_MAPPING`` using its ``"model_type"`` entry.
+ **kwargs: Forwarded to ``PretrainedConfig.__init__``.
+ """
+
+ if isinstance(llm_config, dict):
+ llm_config = CONFIG_MAPPING[llm_config["model_type"]](**llm_config)
+
+ # NOTE(review): llm_config is assigned before super().__init__() —
+ # presumably required by the base class's sub_configs handling; confirm
+ # before reordering.
+ self.llm_config = llm_config
+
+ super().__init__(**kwargs)
+ self.audio_vocab_size = audio_vocab_size
+ self.audio_mask_id = audio_mask_id
+ self.num_audio_codebook = num_audio_codebook
+ if audio_codebook_weights is None:
+ audio_codebook_weights = [8, 8, 6, 6, 4, 4, 2, 2]
+ self.audio_codebook_weights = audio_codebook_weights
+
+
+def _resolve_model_path(name_or_path: str) -> str:
+    """Return a local path for ``name_or_path``.
+
+    An existing directory is returned unchanged. Anything else is passed to
+    ``huggingface_hub.snapshot_download`` and its return value is used.
+    """
+    if not os.path.isdir(name_or_path):
+        # Imported lazily: only needed when the argument is not a local dir.
+        from huggingface_hub import snapshot_download
+
+        return snapshot_download(name_or_path)
+    return name_or_path
+
+
+class OmniVoice(PreTrainedModel):
+ _supports_flex_attn = True
+ _supports_flash_attn_2 = True
+ _supports_sdpa = True
+ config_class = OmniVoiceConfig
+
+    def __init__(self, config: OmniVoiceConfig, llm: Optional[PreTrainedModel] = None):
+        """Build the LLM backbone, the audio embedding table and the audio heads.
+
+        Args:
+            config: Model configuration.
+            llm: Optional pre-built LLM. When given it is used as-is and no
+                LLM is created from ``config.llm_config``.
+        """
+        super().__init__(config)
+
+        # Reuse the supplied backbone, or create one from the config.
+        if llm is None:
+            self.llm = AutoModel.from_config(self.config.llm_config)
+        else:
+            self.llm = llm
+
+        hidden_size = self.config.llm_config.hidden_size
+        # All codebooks share one flat table: num_codebooks * vocab rows.
+        flat_vocab = config.num_audio_codebook * config.audio_vocab_size
+
+        self.audio_embeddings = nn.Embedding(flat_vocab, hidden_size)
+        # Row offset of each codebook layer inside the flat embedding table.
+        self.register_buffer(
+            "codebook_layer_offsets",
+            torch.arange(config.num_audio_codebook) * config.audio_vocab_size,
+        )
+
+        self.audio_heads = nn.Linear(hidden_size, flat_vocab, bias=False)
+
+        # Loss weights per codebook, scaled to sum to 1.
+        weight_total = sum(config.audio_codebook_weights)
+        self.normalized_audio_codebook_weights = [
+            weight / weight_total for weight in config.audio_codebook_weights
+        ]
+
+        self.post_init()
+
+        # Inference-only attributes (set by from_pretrained when not in train mode)
+        self.text_tokenizer = None
+        self.audio_tokenizer = None
+        self.duration_estimator = None
+        self.sampling_rate = None
+        self._asr_pipe = None
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+ """Load the model and, outside train mode, its tokenizers and helpers.
+
+ Keyword arguments consumed here (popped before delegating to the
+ parent ``from_pretrained``):
+ train: When truthy, the inference-only helpers are not loaded.
+ load_asr: When truthy, ``load_asr_model`` is called.
+ NOTE(review): confirm whether this is meant to apply in train
+ mode as well.
+ asr_model_name: Model passed to ``load_asr_model``.
+
+ All remaining ``*args`` / ``**kwargs`` go to the parent loader.
+ """
+ train_mode = kwargs.pop("train", False)
+ load_asr = kwargs.pop("load_asr", False)
+ asr_model_name = kwargs.pop("asr_model_name", "openai/whisper-large-v3-turbo")
+
+ # Suppress noisy INFO logs from transformers/huggingface_hub during loading
+ # (the previous global disable level is saved and restored in ``finally``).
+ _prev_disable = logging.root.manager.disable
+ logging.disable(logging.INFO)
+
+ try:
+ # Resolve to local path first; download only if not already cached
+ resolved_path = _resolve_model_path(pretrained_model_name_or_path)
+
+ model = super().from_pretrained(resolved_path, *args, **kwargs)
+
+ if not train_mode:
+ model.text_tokenizer = AutoTokenizer.from_pretrained(resolved_path)
+
+ # Prefer an "audio_tokenizer" sub-directory of the checkpoint; otherwise
+ # resolve the hard-coded tokenizer repository below.
+ audio_tokenizer_path = os.path.join(resolved_path, "audio_tokenizer")
+
+ if not os.path.isdir(audio_tokenizer_path):
+ audio_tokenizer_path = _resolve_model_path(
+ "eustlb/higgs-audio-v2-tokenizer"
+ )
+
+ # higgs-audio-v2-tokenizer does not support MPS
+ # (output channels > 65536)
+ tokenizer_device = (
+ "cpu" if str(model.device).startswith("mps") else model.device
+ )
+ model.audio_tokenizer = HiggsAudioV2TokenizerModel.from_pretrained(
+ audio_tokenizer_path, device_map=tokenizer_device
+ )
+ model.feature_extractor = AutoFeatureExtractor.from_pretrained(
+ audio_tokenizer_path
+ )
+
+ # The model's sampling rate is taken from the feature extractor.
+ model.sampling_rate = model.feature_extractor.sampling_rate
+
+ model.duration_estimator = RuleDurationEstimator()
+
+ if load_asr:
+ model.load_asr_model(model_name=asr_model_name)
+ finally:
+ # Restore the logging disable level even if loading failed.
+ logging.disable(_prev_disable)
+
+ return model
+
+ # -------------------------------------------------------------------
+ # ASR support (optional, for auto-transcription)
+ # -------------------------------------------------------------------
+
+    def load_asr_model(self, model_name: str = "openai/whisper-large-v3-turbo"):
+        """Create the ASR pipeline used to transcribe reference audio.
+
+        The pipeline is stored on ``self._asr_pipe`` and placed on the
+        model's device; float16 is used on CUDA devices, float32 otherwise.
+
+        Args:
+            model_name: HuggingFace model name or local path for the Whisper model.
+        """
+        from transformers import pipeline as hf_pipeline
+
+        logger.info("Loading ASR model %s ...", model_name)
+
+        if str(self.device).startswith("cuda"):
+            asr_dtype = torch.float16
+        else:
+            asr_dtype = torch.float32
+
+        # Resolve to a local directory (downloading if necessary).
+        local_model = _resolve_model_path(model_name)
+
+        self._asr_pipe = hf_pipeline(
+            "automatic-speech-recognition",
+            model=local_model,
+            dtype=asr_dtype,
+            device_map=self.device,
+        )
+        logger.info("ASR model loaded on %s.", self.device)
+
+    @torch.inference_mode()
+    def transcribe(
+        self,
+        audio: Union[str, tuple],
+    ) -> str:
+        """Run the loaded ASR pipeline on ``audio`` and return the stripped text.
+
+        Args:
+            audio: File path or ``(waveform, sample_rate)`` tuple.
+                Waveform can be a numpy array or torch.Tensor of shape
+                ``(1, T)`` or ``(T,)``.
+
+        Returns:
+            Transcribed text.
+
+        Raises:
+            RuntimeError: If no ASR pipeline has been loaded.
+        """
+        if self._asr_pipe is None:
+            raise RuntimeError(
+                "ASR model is not loaded. Call model.load_asr_model() first."
+            )
+
+        if isinstance(audio, str):
+            # A path is handed to the pipeline unchanged.
+            asr_input = audio
+        else:
+            samples, sr = audio
+            if isinstance(samples, torch.Tensor):
+                samples = samples.cpu().numpy()
+            # (1, T) or (T,) → (T,)
+            asr_input = {
+                "array": np.squeeze(samples),
+                "sampling_rate": sr,
+            }
+
+        return self._asr_pipe(asr_input)["text"].strip()
+
+ def get_input_embeddings(self):
+ """Return the input embedding module of the wrapped LLM."""
+ return self.llm.get_input_embeddings()
+
+ def set_input_embeddings(self, value):
+ """Set the input embedding module of the wrapped LLM to ``value``."""
+ self.llm.set_input_embeddings(value)
+
+    def _prepare_embed_inputs(
+        self, input_ids: torch.Tensor, audio_mask: torch.Tensor
+    ) -> torch.Tensor:
+        """Build input embeddings that mix text and audio positions.
+
+        ``input_ids`` has shape (batch_size, layers, seq_length); the result
+        has shape (batch_size, seq_length, hidden_size). Positions where
+        ``audio_mask`` is set use the sum of the per-layer audio embeddings;
+        all other positions use the LLM text embedding of layer 0's ID.
+        """
+        text_embeds = self.get_input_embeddings()(input_ids[:, 0, :])
+
+        # Each codebook layer owns a contiguous slice of the flat audio
+        # embedding table, so layer k's IDs are shifted by
+        # k * audio_vocab_size (the registered ``codebook_layer_offsets``).
+        # IDs at non-audio positions are zeroed by the mask first; their
+        # audio embedding is discarded by the ``where`` below.
+        layer_offsets = self.codebook_layer_offsets.view(1, -1, 1)
+        masked_ids = input_ids * audio_mask.unsqueeze(1)
+        shifted_ids = masked_ids + layer_offsets
+
+        # [Batch, Layers, Seq] -> [Batch, Layers, Seq, Hidden] -> sum over layers
+        audio_embeds = self.audio_embeddings(shifted_ids).sum(dim=1)
+
+        is_audio = audio_mask.unsqueeze(-1)
+        return torch.where(is_audio, audio_embeds, text_embeds)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        audio_mask: torch.Tensor,
+        labels: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        document_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+    ):
+        """Run the LLM on mixed text/audio embeddings and score audio tokens.
+
+        Args:
+            input_ids: Token IDs, (batch, layers, seq).
+            audio_mask: Marks the positions that hold audio tokens.
+            labels: Optional targets; entries equal to ``-100`` are ignored.
+            attention_mask: Optional mask passed to the LLM.
+            document_ids: Optional per-position document IDs. When given
+                without ``attention_mask``, a flex-attention block mask is
+                built from ``document_ids[0]``.
+            position_ids: Optional position IDs passed to the LLM.
+
+        Returns:
+            ``OmniVoiceModelOutput`` with ``logits`` of shape
+            [Batch, Layer, Seq, Vocab] and ``loss`` (``None`` without labels).
+
+        Raises:
+            RuntimeError: If a block mask is needed but flex_attention could
+                not be imported.
+        """
+        inputs_embeds = self._prepare_embed_inputs(input_ids, audio_mask)
+
+        needs_block_mask = attention_mask is None and document_ids is not None
+        if needs_block_mask:
+            if not _flex_attention_available:
+                raise RuntimeError(
+                    "flex_attention is not available in the current environment. "
+                    "If you do not need flex_attention, set "
+                    '"attn_implementation": "sdpa" in your training config.'
+                )
+            total_len = input_ids.size(-1)
+            mask_fn = _get_packed_mask(
+                document_ids[0].to(inputs_embeds.device),
+            )
+            attention_mask = create_block_mask(
+                mask_fn,
+                B=None,
+                H=None,
+                Q_LEN=total_len,
+                KV_LEN=total_len,
+                _compile=True,
+                device=inputs_embeds.device,
+            )
+
+        llm_outputs = self.llm(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            return_dict=True,
+            position_ids=position_ids,
+        )
+        hidden_states = llm_outputs[0]
+
+        batch_size, seq_len, _ = hidden_states.shape
+        # [B, S, C * Vocab] -> [B, S, C, Vocab] -> [B, C, S, Vocab]
+        audio_logits = (
+            self.audio_heads(hidden_states)
+            .view(
+                batch_size,
+                seq_len,
+                self.config.num_audio_codebook,
+                self.config.audio_vocab_size,
+            )
+            .permute(0, 2, 1, 3)
+        )
+
+        if labels is None:
+            return OmniVoiceModelOutput(
+                loss=None,
+                logits=audio_logits,
+            )
+
+        # cross_entropy expects the class axis second:
+        # [Batch, Layer, Seq, Vocab] -> [Batch, Vocab, Layer, Seq].
+        # The unreduced loss has shape [Batch, Layer, Seq].
+        per_token_loss = torch.nn.functional.cross_entropy(
+            audio_logits.permute(0, 3, 1, 2),
+            labels,
+            reduction="none",
+            ignore_index=-100,
+        )
+        valid_mask = (labels != -100).float()
+
+        # Mean loss per codebook layer over batch and sequence; the clamp
+        # avoids dividing by zero for a layer without valid labels.
+        loss_sums = (per_token_loss * valid_mask).sum(dim=(0, 2))
+        valid_counts = valid_mask.sum(dim=(0, 2)).clamp(min=1.0)
+        layer_means = loss_sums / valid_counts
+
+        weights = torch.tensor(
+            self.normalized_audio_codebook_weights, device=audio_logits.device
+        )
+        weighted_loss = (layer_means * weights).sum()
+
+        return OmniVoiceModelOutput(
+            loss=weighted_loss,
+            logits=audio_logits,
+        )
+
+ def supported_language_ids(self) -> set[str]:
+ """Return the supported language IDs (the imported ``LANG_IDS`` collection)."""
+ return LANG_IDS
+
+ def supported_language_names(self) -> set[str]:
+ """Return the supported language names (the imported ``LANG_NAMES`` collection)."""
+ return LANG_NAMES
+
+ # -------------------------------------------------------------------
+ # Inference API
+ # -------------------------------------------------------------------
+
+    @torch.inference_mode()
+    def generate(
+        self,
+        text: Union[str, list[str]],
+        language: Union[str, list[str], None] = None,
+        ref_text: Union[str, list[str], None] = None,
+        ref_audio: Union[
+            str,
+            list[str],
+            tuple[torch.Tensor, int],
+            list[tuple[torch.Tensor, int]],
+            None,
+        ] = None,
+        voice_clone_prompt: Union[
+            VoiceClonePrompt, list[VoiceClonePrompt], None
+        ] = None,
+        instruct: Union[str, list[str], None] = None,
+        duration: Union[float, list[Optional[float]], None] = None,
+        speed: Union[float, list[Optional[float]], None] = None,
+        generation_config: Optional[OmniVoiceGenerationConfig] = None,
+        **kwargs,
+    ) -> list[np.ndarray]:
+        """Synthesize speech for one text or a batch of texts.
+
+        Three modes are available:
+
+        1. **Voice clone** — pass ``voice_clone_prompt`` (see
+           :meth:`create_voice_clone_prompt`) or ``ref_audio`` with an
+           optional ``ref_text``.
+        2. **Voice design** — pass ``instruct`` describing the desired
+           voice; no reference audio is needed.
+        3. **Auto** — pass neither and let the model choose a voice.
+
+        Args:
+            text: Target text (single string or list for batch).
+            language: Language name (e.g. ``"English"``) or code
+                (e.g. ``"en"``); ``None`` selects language-agnostic mode.
+            ref_text: Optional transcript of the reference audio.
+            ref_audio: Optional reference audio, as a file path or a
+                ``(waveform, sample_rate)`` tuple.
+            voice_clone_prompt: Reusable prompt from
+                :meth:`create_voice_clone_prompt`; when given, ``ref_text``
+                and ``ref_audio`` are ignored.
+            instruct: Style instruction for voice design mode.
+            duration: Fixed output duration in seconds, as one value for all
+                items or one value per item. ``None`` lets the model
+                estimate it. Takes priority over ``speed``.
+            speed: Speaking speed factor (``> 1.0`` faster, ``< 1.0``
+                slower), as one value or one per item. ``None`` uses the
+                default estimation.
+            generation_config: Explicit config object. When given,
+                ``**kwargs`` are not used to build the config.
+            **kwargs: Fields of :class:`OmniVoiceGenerationConfig`
+                (``num_step``, ``guidance_scale``, ``t_shift``,
+                ``layer_penalty_factor``, ``position_temperature``,
+                ``class_temperature``, ``denoise``, ``preprocess_prompt``,
+                ``postprocess_output``, ``audio_chunk_duration``,
+                ``audio_chunk_threshold``). Unknown keys are ignored.
+
+        Returns:
+            One 1-D ``np.ndarray`` of shape ``(T,)`` per input item, at the
+            sampling rate of the model's audio tokenizer
+            (``model.sampling_rate``).
+
+        Raises:
+            RuntimeError: If the audio or text tokenizer is not loaded.
+        """
+        if self.audio_tokenizer is None or self.text_tokenizer is None:
+            raise RuntimeError(
+                "Model is not loaded with audio/text tokenizers. Make sure you "
+                "loaded the model with OmniVoice.from_pretrained()."
+            )
+
+        if generation_config is None:
+            gen_config = OmniVoiceGenerationConfig.from_dict(kwargs)
+        else:
+            gen_config = generation_config
+
+        self.eval()
+
+        full_task = self._preprocess_all(
+            text=text,
+            language=language,
+            ref_text=ref_text,
+            ref_audio=ref_audio,
+            voice_clone_prompt=voice_clone_prompt,
+            instruct=instruct,
+            preprocess_prompt=gen_config.preprocess_prompt,
+            speed=speed,
+            duration=duration,
+        )
+
+        short_idx, long_idx = full_task.get_indices(
+            gen_config, self.audio_tokenizer.config.frame_rate
+        )
+
+        results = [None] * full_task.batch_size
+
+        # Short items are generated in one pass, long items chunk by chunk.
+        # Each group's outputs are written back to their original positions.
+        groups = (
+            (short_idx, self._generate_iterative),
+            (long_idx, self._generate_chunked),
+        )
+        for item_ids, run_group in groups:
+            if not item_ids:
+                continue
+            group_outputs = run_group(full_task.slice_task(item_ids), gen_config)
+            for item_id, tokens in zip(item_ids, group_outputs):
+                results[item_id] = tokens
+
+        generated_audios = []
+        for i, tokens in enumerate(results):
+            assert tokens is not None, f"Result {i} was not generated"
+            generated_audios.append(
+                self._decode_and_post_process(
+                    tokens, full_task.ref_rms[i], gen_config  # type: ignore[arg-type]
+                )
+            )
+
+        return generated_audios
+
+ def create_voice_clone_prompt(
+ self,
+ ref_audio: Union[str, tuple[torch.Tensor, int]],
+ ref_text: Optional[str] = None,
+ preprocess_prompt: bool = True,
+ ) -> VoiceClonePrompt:
+ """Create a reusable voice clone prompt from reference audio.
+
+ Args:
+ ref_audio: File path (str) or ``(waveform, sample_rate)`` tuple.
+ A torch.Tensor waveform is converted to numpy; a 1-D waveform
+ gets a leading channel axis and multi-channel audio is averaged
+ to mono.
+ ref_text: Transcript of the reference audio. If ``None``, the
+ reference audio is transcribed with the ASR model, which is
+ loaded on the fly via :meth:`load_asr_model` when it has not
+ been loaded yet.
+ preprocess_prompt: If ``True`` (default), apply silence removal and
+ trimming to the reference audio, and add punctuation at the end
+ of the reference text (if not already present).
+
+ Returns:
+ A :class:`VoiceClonePrompt` that can be passed to :meth:`generate`.
+
+ Raises:
+ RuntimeError: If the audio tokenizer is not loaded.
+ ValueError: If the reference audio is empty after silence removal.
+ """
+ if self.audio_tokenizer is None:
+ raise RuntimeError(
+ "Audio tokenizer is not loaded. Make sure you loaded the model "
+ "with OmniVoice.from_pretrained()."
+ )
+
+ if isinstance(ref_audio, str):
+ ref_wav = load_audio(ref_audio, self.sampling_rate)
+ else:
+ waveform, sr = ref_audio
+ if isinstance(waveform, torch.Tensor):
+ waveform = waveform.cpu().numpy()
+ if waveform.ndim == 1:
+ waveform = waveform[np.newaxis, :]
+ # Down-mix multi-channel audio to mono.
+ if waveform.shape[0] > 1:
+ waveform = np.mean(waveform, axis=0, keepdims=True)
+ if sr != self.sampling_rate:
+ waveform = torchaudio.functional.resample(
+ torch.from_numpy(waveform),
+ orig_freq=sr,
+ new_freq=self.sampling_rate,
+ ).numpy()
+ ref_wav = waveform
+
+ # The RMS is measured before the boost below; quiet references
+ # (0 < RMS < 0.1) are scaled up to an RMS of 0.1 and the original RMS is
+ # stored on the prompt.
+ ref_rms = float(np.sqrt(np.mean(ref_wav**2)))
+ if 0 < ref_rms < 0.1:
+ ref_wav = ref_wav * 0.1 / ref_rms
+
+ if preprocess_prompt:
+ # Trim long reference audio (>20s) by splitting at the largest silence gap.
+ # Skip trimming when ref_text is user-provided, otherwise the
+ # trimmed audio will no longer match the full transcript.
+ if ref_text is None:
+ ref_wav = trim_long_audio(
+ ref_wav, self.sampling_rate, trim_threshold=20.0
+ )
+ ref_wav = remove_silence(
+ ref_wav,
+ self.sampling_rate,
+ mid_sil=200,
+ lead_sil=100,
+ trail_sil=200,
+ )
+ if ref_wav.shape[-1] == 0:
+ raise ValueError(
+ "Reference audio is empty after silence removal. "
+ "Try setting preprocess_prompt=False."
+ )
+
+ ref_duration = ref_wav.shape[-1] / self.sampling_rate
+ if ref_duration > 20.0:
+ logger.warning(
+ "Reference audio is %.1fs long (>20s). This may cause slower "
+ "generation, higher memory usage, and degraded voice cloning "
+ "quality. We recommend trimming it to 3-10s.",
+ ref_duration,
+ )
+
+ # Auto-transcribe if ref_text not provided
+ if ref_text is None:
+ if self._asr_pipe is None:
+ logger.info("ASR model not loaded yet, loading on-the-fly ...")
+ self.load_asr_model()
+ ref_text = self.transcribe((ref_wav, self.sampling_rate))
+ logger.debug("Auto-transcribed ref_text: %s", ref_text)
+
+ # Drop trailing samples so the length is a multiple of the tokenizer's
+ # hop length.
+ chunk_size = self.audio_tokenizer.config.hop_length
+ clip_size = int(ref_wav.shape[-1] % chunk_size)
+ ref_wav = ref_wav[:, :-clip_size] if clip_size > 0 else ref_wav
+ # numpy → torch at tokenizer boundary
+ ref_wav_tensor = torch.from_numpy(ref_wav).to(self.audio_tokenizer.device)
+ ref_audio_tokens = self.audio_tokenizer.encode(
+ ref_wav_tensor.unsqueeze(0),
+ ).audio_codes.squeeze(
+ 0
+ ) # (C, T)
+
+ if preprocess_prompt:
+ ref_text = add_punctuation(ref_text)
+
+ return VoiceClonePrompt(
+ ref_audio_tokens=ref_audio_tokens,
+ ref_text=ref_text,
+ ref_rms=ref_rms,
+ )
+
+    def _decode_and_post_process(
+        self,
+        tokens: Union[torch.Tensor, List[torch.Tensor]],
+        rms: Union[float, None],
+        gen_config: OmniVoiceGenerationConfig,
+    ) -> np.ndarray:
+        """Decode audio tokens to a waveform and post-process it.
+
+        Args:
+            tokens: Audio tokens — either a single tensor of shape
+                (num_codebooks, seq_len) or a list of chunk tensors. Chunks
+                are decoded one by one and joined with a cross-fade.
+            rms: RMS of the reference audio for volume adjustment.
+            gen_config: Generation config for post-processing options.
+        Returns:
+            Decoded and post-processed audio array of shape (T,).
+        """
+        tokenizer_device = self.audio_tokenizer.device
+
+        def decode_one(codes):
+            # Move to the tokenizer's device, add a batch axis, decode, and
+            # return the first batch entry as a numpy array.
+            batched = codes.to(tokenizer_device).unsqueeze(0)
+            decoded = self.audio_tokenizer.decode(batched)
+            return decoded.audio_values[0].cpu().numpy()
+
+        if isinstance(tokens, list):
+            chunk_audios = [decode_one(chunk) for chunk in tokens]
+            audio_waveform = cross_fade_chunks(chunk_audios, self.sampling_rate)
+        else:
+            audio_waveform = decode_one(tokens)
+
+        audio_waveform = self._post_process_audio(
+            audio_waveform,
+            postprocess_output=gen_config.postprocess_output,
+            ref_rms=rms,
+        )
+        return audio_waveform.squeeze(0)
+
+ def _post_process_audio(
+ self,
+ generated_audio: np.ndarray,
+ postprocess_output: bool,
+ ref_rms: Union[float, None],
+ ) -> np.ndarray:
+ """Optionally remove long silences, adjust volume, and add edge padding.
+
+ Args:
+ generated_audio: Numpy array of shape (1, T).
+ postprocess_output: If True, remove long silences and apply fade/pad.
+ NOTE(review): confirm whether the volume adjustment and the
+ fade/pad step are also meant to depend on this flag.
+ ref_rms: RMS of the reference audio for volume normalisation.
+ ``None`` means no reference; the output is then peak-normalised.
+ Returns:
+ Processed numpy array of shape (1, T).
+ """
+ if postprocess_output:
+ generated_audio = remove_silence(
+ generated_audio,
+ self.sampling_rate,
+ mid_sil=500,
+ lead_sil=100,
+ trail_sil=100,
+ )
+
+ # Quiet references are boosted to RMS 0.1 in create_voice_clone_prompt;
+ # scaling by ref_rms / 0.1 applies the inverse factor to the output.
+ # NOTE(review): ref_rms == 0 also takes this branch and zeroes the output,
+ # while the boost only applies for 0 < ref_rms — confirm this is intended.
+ if ref_rms is not None and ref_rms < 0.1:
+ generated_audio = generated_audio * ref_rms / 0.1
+ elif ref_rms is None:
+ # No reference: scale so the peak is 0.5, unless the audio is
+ # (near-)silent.
+ peak = np.abs(generated_audio).max()
+ if peak > 1e-6:
+ generated_audio = generated_audio / peak * 0.5
+
+ generated_audio = fade_and_pad_audio(
+ generated_audio,
+ sample_rate=self.sampling_rate,
+ )
+ return generated_audio
+
+ def _generate_chunked(
+ self, task: GenerationTask, gen_config: OmniVoiceGenerationConfig
+ ) -> List[List[torch.Tensor]]:
+ """Generate long audio by splitting text into chunks and batching.
+
+ Each item in the returned list corresponds to one input and contains
+ a list of audio token tensors — one per text chunk.
+
+ Args:
+ task: A :class:`GenerationTask` with one or more items whose
+ estimated audio exceeds ``audio_chunk_threshold``.
+ gen_config: Generation config (``audio_chunk_duration`` controls
+ chunk size).
+ Returns:
+ Per-item list of chunk token-tensor lists.
+ """
+ # Chunk each item's text
+ all_chunks = []
+ for i in range(task.batch_size):
+ # Convert the chunk duration into a character budget using the item's
+ # average number of audio tokens per text character.
+ avg_tokens_per_char = task.target_lens[i] / len(task.texts[i])
+ text_chunk_len = int(
+ gen_config.audio_chunk_duration
+ * self.audio_tokenizer.config.frame_rate
+ / avg_tokens_per_char
+ )
+ chunks = chunk_text_punctuation(
+ text=task.texts[i],
+ chunk_len=text_chunk_len,
+ min_chunk_len=3,
+ )
+ logger.debug(f"Item {i} chunked into {len(chunks)} pieces: {chunks}")
+ all_chunks.append(chunks)
+
+ has_ref = [t is not None for t in task.ref_audio_tokens]
+ assert all(has_ref) or not any(has_ref), (
+ "Chunked inference requires all items to either have or not have "
+ "ref_audio. Mixed ref/non-ref is not supported."
+ )
+
+ max_num_chunks = max(len(c) for c in all_chunks)
+
+ # chunk_results[item_idx] = list of generated token tensors per chunk
+ chunk_results = [[] for _ in range(task.batch_size)]
+
+ # Generate one chunk for each item in ``indices`` (positions in ``task``).
+ # ``texts``, ``ref_audios`` and ``ref_texts`` are aligned with ``indices``;
+ # outputs are appended to ``chunk_results`` at the original positions.
+ def _run_batch(indices, texts, ref_audios, ref_texts):
+ speed_list = task.speed
+ # Target lengths are re-estimated per chunk (j indexes the batch,
+ # i the original item).
+ target_lens = [
+ self._estimate_target_tokens(
+ texts[j],
+ ref_texts[j],
+ ref_audios[j].size(-1) if ref_audios[j] is not None else None,
+ speed=speed_list[i] if speed_list else 1.0,
+ )
+ for j, i in enumerate(indices)
+ ]
+ sub_task = GenerationTask(
+ batch_size=len(indices),
+ texts=texts,
+ target_lens=target_lens,
+ langs=[task.langs[i] for i in indices],
+ instructs=[task.instructs[i] for i in indices],
+ ref_texts=ref_texts,
+ ref_audio_tokens=ref_audios,
+ ref_rms=[task.ref_rms[i] for i in indices],
+ speed=[task.speed[i] for i in indices] if task.speed else None,
+ )
+ gen_tokens = self._generate_iterative(sub_task, gen_config)
+ for j, idx in enumerate(indices):
+ chunk_results[idx].append(gen_tokens[j])
+
+ if all(has_ref):
+ # All items have reference audio.
+ # We still sequentially generate chunks within each item, but we
+ # batch across items for the same chunk index. This allows to keep
+ # the VRAM usage manageable while still benefiting from batching.
+ for ci in range(max_num_chunks):
+ # Only items that still have a chunk at position ci take part.
+ indices = [i for i in range(task.batch_size) if ci < len(all_chunks[i])]
+ if not indices:
+ continue
+ _run_batch(
+ indices,
+ texts=[all_chunks[i][ci] for i in indices],
+ ref_audios=[task.ref_audio_tokens[i] for i in indices],
+ ref_texts=[task.ref_texts[i] for i in indices],
+ )
+ else:
+ # No reference audio — generate chunk 0 for all items first,
+ # then use chunk 0 output as reference for all subsequent chunks.
+ indices_0 = [i for i in range(task.batch_size) if len(all_chunks[i]) > 0]
+ _run_batch(
+ indices_0,
+ texts=[all_chunks[i][0] for i in indices_0],
+ ref_audios=[None] * len(indices_0),
+ ref_texts=[None] * len(indices_0),
+ )
+ first_chunk_map = {idx: chunk_results[idx][0] for idx in indices_0}
+
+ # Batch all remaining chunks, using chunk 0 as fixed reference
+ # (chunk 0's tokens as reference audio, chunk 0's text as its transcript).
+ for ci in range(1, max_num_chunks):
+ indices = [i for i in range(task.batch_size) if ci < len(all_chunks[i])]
+ if not indices:
+ continue
+ _run_batch(
+ indices,
+ texts=[all_chunks[i][ci] for i in indices],
+ ref_audios=[first_chunk_map[i] for i in indices],
+ ref_texts=[all_chunks[i][0] for i in indices],
+ )
+
+ return chunk_results
+
+    def _preprocess_all(
+        self,
+        text: Union[str, list[str]],
+        language: Union[str, list[str], None] = None,
+        ref_text: Union[str, list[str], None] = None,
+        ref_audio: Union[
+            str,
+            list[str],
+            tuple[torch.Tensor, int],
+            list[tuple[torch.Tensor, int]],
+            None,
+        ] = None,
+        voice_clone_prompt: Union[
+            VoiceClonePrompt, list[VoiceClonePrompt], None
+        ] = None,
+        instruct: Union[str, list[str], None] = None,
+        preprocess_prompt: bool = True,
+        speed: Union[float, list[Optional[float]], None] = None,
+        duration: Union[float, list[Optional[float]], None] = None,
+    ) -> GenerationTask:
+        """Normalise raw generation arguments into a batched ``GenerationTask``.
+
+        The batch size is the number of texts. Every other argument may be a
+        single value (shared by all items) or a list with one entry per item.
+
+        Args:
+            text: Target text, or a list of target texts.
+            language: Language ID/name per item; resolved with
+                ``_resolve_language``.
+            ref_text: Reference transcript(s) used to build voice-clone
+                prompts when ``voice_clone_prompt`` is not given.
+            ref_audio: Reference audio (path or ``(tensor, sample_rate)``
+                tuple), single or per item.
+            voice_clone_prompt: Pre-built prompt(s). When given,
+                ``ref_text`` / ``ref_audio`` are ignored (a warning is logged).
+            instruct: Voice-design instruct string(s); validated with
+                ``_resolve_instruct``.
+            preprocess_prompt: Forwarded to ``create_voice_clone_prompt``.
+            speed: Speed factor, single or per item. ``None`` entries mean 1.0.
+            duration: Target duration in seconds, single or per item. A
+                non-``None`` duration overrides ``speed`` for that item.
+
+        Returns:
+            A ``GenerationTask`` holding per-item lists of length
+            ``batch_size``.
+
+        Raises:
+            ValueError: From ``_ensure_list`` when a list argument has a
+                length other than 1 or the batch size, or from
+                ``_resolve_instruct`` for invalid instruct items.
+        """
+        if isinstance(text, str):
+            text_list = [text]
+        else:
+            assert isinstance(
+                text, list
+            ), "text should be a string or a list of strings"
+            text_list = text
+        batch_size = len(text_list)
+
+        language_list = self._ensure_list(language, batch_size)
+        language_list = [_resolve_language(lang) for lang in language_list]
+        # Copy before normalising so a list passed by the caller is never
+        # modified in place.
+        instruct_list = list(self._ensure_list(instruct, batch_size))
+        for i, s in enumerate(instruct_list):
+            if s is None:
+                continue
+            use_zh = bool(text_list[i] and _ZH_RE.search(text_list[i]))
+            instruct_list[i] = _resolve_instruct(s, use_zh=use_zh)
+
+        if voice_clone_prompt is not None and (
+            ref_text is not None or ref_audio is not None
+        ):
+            logger.warning(
+                "Both voice_clone_prompt and ref_text/ref_audio are provided. "
+                "ref_text/ref_audio will be ignored."
+            )
+        if voice_clone_prompt is None and ref_audio is not None:
+            # If voice_clone_prompt is not provided, create it from
+            # ref_audio (ref_text will be auto-transcribed if not given).
+            # auto_repeat=False keeps a shared reference as a single entry so
+            # the prompt is only built once for the whole batch.
+            ref_text_in = self._ensure_list(ref_text, batch_size, auto_repeat=False)
+            ref_audio_in = self._ensure_list(
+                ref_audio, batch_size, auto_repeat=False
+            )
+
+            # The two lists may have different lengths (1 vs batch_size).
+            # Build one prompt per item of the longer list and broadcast the
+            # shorter one, instead of indexing both with the length of
+            # ref_text only.
+            num_prompts = max(len(ref_text_in), len(ref_audio_in))
+            voice_clone_prompt = [
+                self.create_voice_clone_prompt(
+                    ref_audio=ref_audio_in[i if len(ref_audio_in) > 1 else 0],
+                    ref_text=ref_text_in[i if len(ref_text_in) > 1 else 0],
+                    preprocess_prompt=preprocess_prompt,
+                )
+                for i in range(num_prompts)
+            ]
+
+        voice_clone_prompt_list = self._ensure_list(voice_clone_prompt, batch_size)
+        if voice_clone_prompt_list[0] is not None:
+            ref_text_list = [vc.ref_text for vc in voice_clone_prompt_list]
+            ref_audio_tokens_list = [
+                vc.ref_audio_tokens for vc in voice_clone_prompt_list
+            ]
+            ref_rms_list = [vc.ref_rms for vc in voice_clone_prompt_list]
+        else:
+            ref_text_list = [None] * batch_size
+            ref_audio_tokens_list = [None] * batch_size
+            ref_rms_list = [None] * batch_size
+
+        # Normalize speed/duration to per-item lists (may contain None).
+        if speed is not None:
+            if isinstance(speed, (int, float)):
+                user_speed = [float(speed)] * batch_size
+            else:
+                user_speed = list(speed)
+        else:
+            user_speed = None
+
+        if duration is not None:
+            if isinstance(duration, (int, float)):
+                durations = [float(duration)] * batch_size
+            else:
+                durations = list(duration)
+        else:
+            durations = None
+
+        num_target_tokens_list = []
+        for i in range(batch_size):
+            # duration[i] overrides speed for estimation: use speed=1.0
+            # to get the raw estimate, then override target_lens below.
+            has_dur = durations is not None and durations[i] is not None
+            if has_dur:
+                item_speed = 1.0
+            else:
+                # A None entry in a per-item speed list means "default speed".
+                s = user_speed[i] if user_speed else None
+                item_speed = s if s is not None else 1.0
+            ref_tokens = ref_audio_tokens_list[i]
+            est = self._estimate_target_tokens(
+                text_list[i],
+                ref_text_list[i],
+                ref_tokens.size(-1) if ref_tokens is not None else None,
+                speed=item_speed,
+            )
+            num_target_tokens_list.append(est)
+
+        # Per-item duration overrides: set target_lens to exact frame count
+        # and compute speed ratio so chunked generation scales proportionally.
+        speed_list: Optional[List[float]] = None
+        if durations is not None:
+            frame_rate = self.audio_tokenizer.config.frame_rate
+            speed_list = []
+            for i in range(batch_size):
+                if durations[i] is not None:
+                    # target_tokens is clamped to >= 1, so the ratio is
+                    # always well defined.
+                    target_tokens = max(1, int(durations[i] * frame_rate))
+                    est = num_target_tokens_list[i]
+                    speed_list.append(est / target_tokens)
+                    num_target_tokens_list[i] = target_tokens
+                else:
+                    s = user_speed[i] if user_speed else None
+                    speed_list.append(s if s is not None else 1.0)
+        elif user_speed is not None:
+            speed_list = [s if s is not None else 1.0 for s in user_speed]
+
+        return GenerationTask(
+            batch_size=batch_size,
+            texts=text_list,
+            target_lens=num_target_tokens_list,
+            langs=language_list,
+            instructs=instruct_list,
+            ref_texts=ref_text_list,
+            ref_audio_tokens=ref_audio_tokens_list,
+            ref_rms=ref_rms_list,
+            speed=speed_list,
+        )
+
+    def _estimate_target_tokens(self, text, ref_text, num_ref_audio_tokens, speed=1.0):
+        """Estimate the number of target audio tokens for ``text``.
+
+        Args:
+            text: Target text to synthesise.
+            ref_text: Reference transcript, or ``None`` / empty to use the
+                built-in fallback pair.
+            num_ref_audio_tokens: Number of audio tokens of the reference, or
+                ``None`` to use the built-in fallback pair.
+            speed: Speed factor; the estimate is divided by it. ``None``,
+                non-positive values and 1.0 leave the estimate unchanged.
+
+        Returns:
+            The estimated token count as an ``int``, never less than 1.
+        """
+        if num_ref_audio_tokens is None or not ref_text:
+            # Fall back to a simple heuristic
+            ref_text = "Nice to meet you."
+            num_ref_audio_tokens = 25
+
+        est = self.duration_estimator.estimate_duration(
+            text, ref_text, num_ref_audio_tokens
+        )
+        # Callers may forward a per-item speed of None ("use the default");
+        # comparing None with 0 would raise TypeError, so check it first.
+        if speed is not None and speed > 0 and speed != 1.0:
+            est = est / speed
+        return max(1, int(est))
+
+    def _ensure_list(
+        self, x: Union[Any, List[Any]], batch_size: int, auto_repeat: bool = True
+    ) -> List[Any]:
+        """Wrap ``x`` in a list and optionally broadcast it to ``batch_size``.
+
+        Args:
+            x: A single value or a list of values.
+            batch_size: Expected number of items.
+            auto_repeat: When True, a one-element list is repeated
+                ``batch_size`` times.
+
+        Returns:
+            The list form of ``x``, repeated if requested.
+
+        Raises:
+            ValueError: If the list length is neither 1 nor ``batch_size``.
+        """
+        items = [x] if not isinstance(x, list) else x
+        count = len(items)
+        if count != 1 and count != batch_size:
+            raise ValueError(
+                f"should be either the number of the text or 1, but got {count}"
+            )
+        if count == 1 and auto_repeat and batch_size is not None:
+            return items * batch_size
+        return items
+
+    def _prepare_inference_inputs(
+        self,
+        text: str,
+        num_target_tokens: int,
+        ref_text: Optional[str] = None,
+        ref_audio_tokens: Optional[torch.Tensor] = None,
+        lang: Optional[str] = None,
+        instruct: Optional[str] = None,
+        denoise: bool = True,
+    ):
+        """Build the conditional ``input_ids`` and audio mask for one item.
+
+        The sequence is laid out as: style tokens, text tokens, optional
+        reference audio tokens, then ``num_target_tokens`` mask tokens.
+
+        Args:
+            text: Target text to generate.
+            num_target_tokens: Number of audio tokens to generate.
+            ref_text: Optional reference text for voice cloning.
+            ref_audio_tokens: Optional reference audio tokens for voice
+                cloning, with shape (C, T).
+            lang: Optional language ID.
+            instruct: Optional style instruction for voice design.
+            denoise: Whether to include the <|denoise|> token (only added
+                when reference audio is present).
+
+        Returns:
+            A dict with ``"input_ids"`` (the concatenated sequence) and
+            ``"audio_mask"`` (boolean mask that is True on the reference and
+            target audio positions).
+        """
+        device = self.device
+        num_codebooks = self.config.num_audio_codebook
+        has_ref_audio = ref_audio_tokens is not None
+
+        def _per_codebook(ids):
+            # Repeat the text ids once per codebook and add a batch axis.
+            return ids.repeat(num_codebooks, 1).unsqueeze(0).to(device)
+
+        # Style prefix: <|denoise|> + <|lang_start|>...<|lang_end|>
+        # + <|instruct_start|>...<|instruct_end|>
+        style_pieces = []
+        if denoise and has_ref_audio:
+            style_pieces.append("<|denoise|>")
+        lang_str = lang or "None"
+        instruct_str = instruct or "None"
+        style_pieces.append(f"<|lang_start|>{lang_str}<|lang_end|>")
+        style_pieces.append(f"<|instruct_start|>{instruct_str}<|instruct_end|>")
+        style_ids = self.text_tokenizer(
+            "".join(style_pieces), return_tensors="pt"
+        ).input_ids
+        style_tokens = _per_codebook(style_ids)  # [1, C, N1]
+
+        # Text segment (reference text, if any, is prepended by _combine_text)
+        combined = _combine_text(ref_text=ref_text, text=text)
+        text_ids = _tokenize_with_nonverbal_tags(
+            f"<|text_start|>{combined}<|text_end|>", self.text_tokenizer
+        )
+        text_tokens = _per_codebook(text_ids)  # [1, C, N2]
+
+        # Target slots start out fully masked
+        target_audio_tokens = torch.full(
+            (1, num_codebooks, num_target_tokens),
+            self.config.audio_mask_id,
+            dtype=torch.long,
+            device=device,
+        )
+
+        segments = [style_tokens, text_tokens]
+        if has_ref_audio:
+            segments.append(ref_audio_tokens.unsqueeze(0).to(device))
+        segments.append(target_audio_tokens)
+        cond_input_ids = torch.cat(segments, dim=2)
+
+        # Audio positions are the trailing reference + target tokens.
+        total_len = cond_input_ids.shape[2]
+        audio_len = num_target_tokens
+        if has_ref_audio:
+            audio_len += ref_audio_tokens.size(-1)
+
+        cond_audio_mask = torch.zeros(
+            1, total_len, dtype=torch.bool, device=device
+        )
+        cond_audio_mask[0, total_len - audio_len :] = True
+
+        return {
+            "input_ids": cond_input_ids,
+            "audio_mask": cond_audio_mask,
+        }
+
+ def _generate_iterative(
+ self, task: GenerationTask, gen_config: OmniVoiceGenerationConfig
+ ) -> List[torch.Tensor]:
+ """N-step iterative unmasked decoding.
+
+ Args:
+ task: A :class:`GenerationTask` containing batch texts, target
+ lengths, languages, instructions, and optional reference data.
+ gen_config: A :class:`OmniVoiceGenerationConfig` controlling
+ decoding steps, guidance, temperatures, etc.
+ Returns:
+ List of generated audio token tensors of shape (C, T) (one per
+ input text).
+ """
+
+ B = task.batch_size
+
+ for i in range(B):
+ logger.debug(
+ "Item %d — text: %s | ref_text: %s | instruct: %s | lang: %s | target_tokens: %d",
+ i,
+ task.texts[i],
+ task.ref_texts[i],
+ task.instructs[i],
+ task.langs[i],
+ task.target_lens[i],
+ )
+
+ inputs_list = [
+ self._prepare_inference_inputs(
+ task.texts[i],
+ task.target_lens[i],
+ task.ref_texts[i],
+ task.ref_audio_tokens[i],
+ task.langs[i],
+ task.instructs[i],
+ gen_config.denoise,
+ )
+ for i in range(B)
+ ]
+
+ # The batched tensors have 2 * B rows: rows [0, B) hold the conditional
+ # inputs, rows [B, 2B) hold the unconditional inputs, which contain only
+ # the trailing target slots of each item.
+ c_lens = [inp["input_ids"].size(2) for inp in inputs_list]
+ max_c_len = max(c_lens)
+ pad_id = self.config.audio_mask_id # Or any other tokens
+
+ batch_input_ids = torch.full(
+ (2 * B, self.config.num_audio_codebook, max_c_len),
+ pad_id,
+ dtype=torch.long,
+ device=self.device,
+ )
+ batch_audio_mask = torch.zeros(
+ (2 * B, max_c_len), dtype=torch.bool, device=self.device
+ )
+ batch_attention_mask = torch.zeros(
+ (2 * B, 1, max_c_len, max_c_len), dtype=torch.bool, device=self.device
+ )
+
+ for i, inp in enumerate(inputs_list):
+ c_len, u_len = c_lens[i], task.target_lens[i]
+
+ # Cond (0 ~ B-1)
+ batch_input_ids[i, :, :c_len] = inp["input_ids"]
+ batch_audio_mask[i, :c_len] = inp["audio_mask"]
+ batch_attention_mask[i, :, :c_len, :c_len] = True
+
+ # Uncond (B ~ 2B-1)
+ batch_input_ids[B + i, :, :u_len] = inp["input_ids"][..., -u_len:]
+ batch_audio_mask[B + i, :u_len] = inp["audio_mask"][..., -u_len:]
+ batch_attention_mask[B + i, :, :u_len, :u_len] = True
+ # Padded positions of the unconditional row attend only to themselves
+ # (diagonal entries). NOTE(review): presumably this avoids attention
+ # rows with no allowed key — confirm.
+ if max_c_len > u_len:
+ pad_diag = torch.arange(u_len, max_c_len, device=self.device)
+ batch_attention_mask[B + i, :, pad_diag, pad_diag] = True
+
+ # Working buffer of decoded target tokens; every slot starts as the mask id.
+ tokens = torch.full(
+ (B, self.config.num_audio_codebook, max(task.target_lens)),
+ self.config.audio_mask_id,
+ dtype=torch.long,
+ device=self.device,
+ )
+
+ timesteps = _get_time_steps(
+ t_start=0.0,
+ t_end=1.0,
+ num_step=gen_config.num_step,
+ t_shift=gen_config.t_shift,
+ ).tolist()
+ # Per-item schedule: how many (codebook, frame) slots to fill at each
+ # step. The last step takes whatever is still remaining.
+ schedules = []
+ for t_len in task.target_lens:
+ total_mask = t_len * self.config.num_audio_codebook
+ rem = total_mask
+ sched = []
+ for step in range(gen_config.num_step):
+ num = (
+ rem
+ if step == gen_config.num_step - 1
+ else min(
+ math.ceil(total_mask * (timesteps[step + 1] - timesteps[step])),
+ rem,
+ )
+ )
+ sched.append(int(num))
+ rem -= int(num)
+ schedules.append(sched)
+
+ # Codebook indices shaped for broadcasting against the per-slot scores.
+ layer_ids = torch.arange(
+ self.config.num_audio_codebook, device=self.device
+ ).view(1, -1, 1)
+
+ for step in range(gen_config.num_step):
+ # One forward pass per step over the conditional and unconditional rows.
+ batch_logits = self(
+ input_ids=batch_input_ids,
+ audio_mask=batch_audio_mask,
+ attention_mask=batch_attention_mask,
+ ).logits.to(torch.float32)
+
+ for i in range(B):
+ k = schedules[i][step]
+ if k <= 0:
+ continue
+
+ c_len, t_len = c_lens[i], task.target_lens[i]
+
+ # Extract real target Logits
+ # [1, C, T, V]
+ c_logits = batch_logits[i : i + 1, :, c_len - t_len : c_len, :]
+ u_logits = batch_logits[B + i : B + i + 1, :, :t_len, :]
+
+ pred_tokens, scores = self._predict_tokens_with_scoring(
+ c_logits, u_logits, gen_config
+ )
+
+ # Lower the score of higher codebook indices by
+ # layer_penalty_factor per index.
+ scores = scores - (layer_ids * gen_config.layer_penalty_factor)
+
+ if gen_config.position_temperature > 0.0:
+ scores = _gumbel_sample(scores, gen_config.position_temperature)
+
+ # Slots that are already decoded get -inf so that top-k can only
+ # select slots that still hold the mask id.
+ sample_tokens = tokens[i : i + 1, :, :t_len]
+ scores.masked_fill_(
+ sample_tokens != self.config.audio_mask_id, -float("inf")
+ )
+
+ # Commit the k highest-scoring predictions of this step.
+ _, topk_idx = torch.topk(scores.flatten(), k)
+ flat_tokens = sample_tokens.flatten()
+ flat_tokens[topk_idx] = pred_tokens.flatten()[topk_idx]
+ sample_tokens.copy_(flat_tokens.view_as(sample_tokens))
+
+ # Update individual slices into batched structure
+ tokens[i : i + 1, :, :t_len] = sample_tokens
+ batch_input_ids[i : i + 1, :, c_len - t_len : c_len] = sample_tokens
+ batch_input_ids[B + i : B + i + 1, :, :t_len] = sample_tokens
+
+ return [tokens[i, :, : task.target_lens[i]] for i in range(B)]
+
+    def _predict_tokens_with_scoring(self, c_logits, u_logits, gen_config):
+        """Pick a token per slot and return it with a confidence score.
+
+        Args:
+            c_logits: Logits from the conditional pass.
+            u_logits: Logits from the unconditional pass (only used when
+                ``gen_config.guidance_scale`` is non-zero).
+            gen_config: Generation config; ``guidance_scale`` and
+                ``class_temperature`` are read here.
+
+        Returns:
+            Tuple ``(pred_tokens, confidence_scores)``. ``pred_tokens`` is the
+            argmax over the last axis (Gumbel-perturbed when
+            ``class_temperature > 0``). ``confidence_scores`` is the maximum
+            log-probability over the last axis.
+        """
+        scale = gen_config.guidance_scale
+        cond = F.log_softmax(c_logits, dim=-1)
+        if scale != 0:
+            uncond = F.log_softmax(u_logits, dim=-1)
+            # Guided distribution, re-normalised in log space.
+            log_probs = torch.log_softmax(cond + scale * (cond - uncond), dim=-1)
+        else:
+            log_probs = cond
+
+        # The mask id itself must never be predicted.
+        log_probs[..., self.config.audio_mask_id] = -float("inf")
+
+        temperature = gen_config.class_temperature
+        if temperature > 0.0:
+            candidates = _filter_top_k(log_probs, ratio=0.1)
+            pred_tokens = _gumbel_sample(candidates, temperature).argmax(dim=-1)
+        else:
+            pred_tokens = log_probs.argmax(dim=-1)
+
+        return pred_tokens, log_probs.max(dim=-1)[0]
+
+
+# ---------------------------------------------------------------------------
+# Standalone helpers
+# ---------------------------------------------------------------------------
+
+
+def _get_packed_mask(document_ids):
+    """Return ``_mask_mod_packed`` with ``document_ids`` bound as first argument."""
+    packed_mask_mod = partial(_mask_mod_packed, document_ids)
+    return packed_mask_mod
+
+
+def _mask_mod_packed(document_ids, b, h, q_idx, kv_idx):
+    """Allow attention only between positions of the same packed document.
+
+    ``b`` and ``h`` are accepted but not used. The result is the equality of
+    the two document ids, so positions that share an id compare equal,
+    including padding positions if they all carry the same id (e.g. -1).
+    """
+    return document_ids[q_idx] == document_ids[kv_idx]
+
+
+def _resolve_language(language: Optional[str]) -> Union[str, None]:
+    """Map a user-supplied language ID or name to a known language ID.
+
+    Args:
+        language: A language ID, a language name (matched in lower case),
+            ``None``, or the string ``"none"`` in any letter case.
+
+    Returns:
+        The language ID, or ``None`` when no language is given or the value
+        is not recognised (a warning is logged in the latter case).
+    """
+    from omnivoice.utils.lang_map import LANG_IDS, LANG_NAME_TO_ID
+
+    if language is None:
+        return None
+    lowered = language.lower()
+    if lowered == "none":
+        return None
+
+    # Exact ID match first, then a lower-cased name lookup.
+    if language in LANG_IDS:
+        return language
+    if lowered in LANG_NAME_TO_ID:
+        return LANG_NAME_TO_ID[lowered]
+
+    message = (
+        f"Language '{language}' is not recognized. "
+        f"Please use a valid language ID (e.g., 'en', 'zh', 'ja', 'de') "
+        f"or a full language name (e.g., 'English', 'Chinese', 'Japanese'). "
+        f"See supported_language_ids() or supported_language_names() for details. "
+        f"Falling back to None (language-agnostic mode)."
+    )
+    logger.warning(message)
+    return None
+
+
+def _resolve_instruct(
+    instruct: Optional[str], use_zh: bool = False
+) -> Union[str, None]:
+    """Validate and normalise a voice-design instruct string.
+
+    Supported instruct items (case-insensitive for English):
+
+    English (comma + space separated):
+        gender: male, female
+        age: child, teenager, young adult, middle-aged, elderly
+        pitch: very low pitch, low pitch, moderate pitch,
+            high pitch, very high pitch
+        style: whisper
+        accent: american accent, british accent, australian accent, ...
+
+    Chinese (separated by the full-width comma, U+FF0C):
+        gender: 男, 女
+        age: 儿童, 少年, 青年, 中年, 老年
+        pitch: 极低音调, 低音调, 中音调, 高音调, 极高音调
+        style: 耳语
+        dialect: 河南话, 陕西话, 四川话, 贵州话, 云南话,
+            桂林话, 济南话, 石家庄话, 甘肃话, 宁夏话,
+            青岛话, 东北话
+
+    Minor issues (auto-fixed):
+        - Wrong separator (half-width comma in Chinese instruct or
+          full-width comma in English instruct)
+        - Leading / trailing commas
+
+    Major issues (raise ``ValueError``):
+        - Unsupported or misspelled instruct items
+        - Suggestions are offered for close matches
+
+    Args:
+        instruct: Raw instruct string, or ``None``.
+        use_zh: If True, normalise all items to Chinese (used when the
+            synthesis text contains Chinese and no accent is specified).
+            A dialect item forces Chinese and an accent item forces English,
+            regardless of this flag.
+
+    Returns:
+        Normalised instruct string, or ``None`` for ``None`` / blank input.
+
+    Raises:
+        ValueError: if any instruct item is unsupported or misspelled, if a
+            dialect and an accent are mixed, or if two items of the same
+            category are given.
+    """
+    if instruct is None:
+        return None
+
+    instruct_str = instruct.strip()
+    if not instruct_str:
+        return None
+
+    # The full-width comma is written as an escape so that it cannot be
+    # silently turned into an ASCII comma by editors or text normalisation.
+    fullwidth_comma = "\uff0c"
+
+    # Split on both half-width and full-width commas
+    raw_items = re.split(r"\s*[,\uff0c]\s*", instruct_str)
+    raw_items = [x for x in raw_items if x]
+
+    # Validate each item
+    unknown = []
+    normalised = []
+    for raw in raw_items:
+        n = raw.strip().lower()
+        if n in _INSTRUCT_ALL_VALID:
+            normalised.append(n)
+        else:
+            sug = difflib.get_close_matches(n, _INSTRUCT_ALL_VALID, n=1, cutoff=0.6)
+            unknown.append((raw, n, sug[0] if sug else None))
+
+    if unknown:
+        lines = []
+        for raw, n, sug in unknown:
+            if sug:
+                lines.append(f" '{raw}' -> '{n}' (unsupported; did you mean '{sug}'?)")
+            else:
+                lines.append(f" '{raw}' -> '{n}' (unsupported)")
+        err = (
+            f"Unsupported instruct items found in {instruct_str}:\n"
+            + "\n".join(lines)
+            + "\n\nValid English items: "
+            + ", ".join(sorted(_INSTRUCT_VALID_EN))
+            + "\nValid Chinese items: "
+            + fullwidth_comma.join(sorted(_INSTRUCT_VALID_ZH))
+            + "\n\nTip: Use only English or only Chinese instructs. "
+            "English instructs should use comma + space (e.g. "
+            "'male, indian accent'),\nChinese instructs should use full-width "
+            "comma (e.g. '男\uff0c河南话')."
+        )
+        raise ValueError(err)
+
+    # --- Language consistency: dialect forces Chinese, accent forces English ---
+    has_dialect = any(n.endswith("话") for n in normalised)
+    has_accent = any(" accent" in n for n in normalised)
+
+    if has_dialect and has_accent:
+        raise ValueError(
+            "Cannot mix Chinese dialect and English accent in a single instruct. "
+            "Dialects are for Chinese speech, accents for English speech."
+        )
+
+    if has_dialect:
+        use_zh = True
+    elif has_accent:
+        use_zh = False
+
+    # --- Unify to single language ---
+    if use_zh:
+        normalised = [_INSTRUCT_EN_TO_ZH.get(n, n) for n in normalised]
+    else:
+        normalised = [_INSTRUCT_ZH_TO_EN.get(n, n) for n in normalised]
+
+    # --- Category conflict check ---
+    conflicts = []
+    for cat in _INSTRUCT_MUTUALLY_EXCLUSIVE:
+        hits = [n for n in normalised if n in cat]
+        if len(hits) > 1:
+            conflicts.append(hits)
+    if conflicts:
+        parts = []
+        for group in conflicts:
+            parts.append(" vs ".join(f"'{x}'" for x in group))
+        raise ValueError(
+            "Conflicting instruct items within the same category: "
+            + "; ".join(parts)
+            + ". Each category (gender, age, pitch, style, accent, dialect) "
+            "allows at most one item."
+        )
+
+    # Determine separator based on language: full-width comma when any item
+    # contains a CJK character, comma + space otherwise.
+    has_zh = any(any("\u4e00" <= c <= "\u9fff" for c in n) for n in normalised)
+    separator = fullwidth_comma if has_zh else ", "
+
+    return separator.join(normalised)
+
+
+def _filter_top_k(logits: torch.Tensor, ratio: float = 0.1) -> torch.Tensor:
+    """Keep the top ``ceil(ratio * V)`` entries of the last axis; rest become -inf.
+
+    Args:
+        logits: Input tensor; filtering is applied along its last axis of
+            size ``V``.
+        ratio: Fraction of entries to keep along the last axis.
+
+    Returns:
+        A new tensor shaped like ``logits``. The input is not modified.
+    """
+    keep = math.ceil(ratio * logits.shape[-1])
+    top_values, top_indices = torch.topk(logits, keep, dim=-1)
+    filtered = torch.full_like(logits, float("-inf"))
+    return filtered.scatter_(-1, top_indices, top_values)
+
+
+def _gumbel_sample(logits: torch.Tensor, temperature: float) -> torch.Tensor:
+    """Return ``logits / temperature`` perturbed with Gumbel noise.
+
+    The noise is ``-log(-log(u + 1e-10) + 1e-10)`` with ``u`` drawn from
+    ``torch.rand_like``; the small constants keep the logarithms finite.
+    """
+    tempered = logits / temperature
+    uniform = torch.rand_like(tempered)
+    inner = -torch.log(uniform + 1e-10)
+    noise = -torch.log(inner + 1e-10)
+    return tempered + noise
+
+
+def _get_time_steps(
+    t_start: float = 0.0,
+    t_end: float = 1.0,
+    num_step: int = 10,
+    t_shift: float = 1.0,
+    device: torch.device = torch.device("cpu"),
+) -> torch.Tensor:
+    """Return ``num_step + 1`` time steps from ``t_start`` to ``t_end``.
+
+    A uniform grid ``t`` is warped with
+    ``t_shift * t / (1 + (t_shift - 1) * t)``; ``t_shift = 1.0`` leaves the
+    grid unchanged.
+    """
+    grid = torch.linspace(t_start, t_end, num_step + 1).to(device)
+    denominator = 1 + (t_shift - 1) * grid
+    return t_shift * grid / denominator
+
+
+# Matches one bracketed non-verbal tag such as "[laughter]" or "[sigh]".
+# Only the tag names listed in the alternation are recognised; the tag name
+# (without brackets) is captured as group 1.
+_NONVERBAL_PATTERN = re.compile(
+ r"\[(laughter|sigh|confirmation-en|question-en|question-ah|question-oh|"
+ r"question-ei|question-yi|surprise-ah|surprise-oh|surprise-wa|"
+ r"surprise-yo|dissatisfaction-hnn)\]"
+)
+
+
+def _tokenize_with_nonverbal_tags(text: str, tokenizer) -> torch.Tensor:
+    """Tokenize ``text``, encoding every non-verbal tag on its own.
+
+    The text is cut at each match of ``_NONVERBAL_PATTERN``. Plain segments
+    and tags are tokenized separately (without special tokens) and the ids
+    are concatenated in order, so a tag always yields the same ids no matter
+    what surrounds it.
+
+    Args:
+        text: Full text string potentially containing non-verbal tags.
+        tokenizer: HuggingFace text tokenizer instance.
+
+    Returns:
+        Token IDs tensor of shape (1, seq_len). If no piece produced any
+        ids, the whole text is tokenized in one call with
+        ``return_tensors="pt"`` instead.
+    """
+    token_ids = []
+
+    def _append_ids(piece):
+        ids = tokenizer(piece, add_special_tokens=False).input_ids
+        if ids:
+            token_ids.extend(ids)
+
+    cursor = 0
+    for match in _NONVERBAL_PATTERN.finditer(text):
+        tag_start, tag_end = match.span()
+        if tag_start > cursor:
+            _append_ids(text[cursor:tag_start])
+        _append_ids(match.group())
+        cursor = tag_end
+    if cursor < len(text):
+        _append_ids(text[cursor:])
+
+    if not token_ids:
+        return tokenizer(text, return_tensors="pt").input_ids
+    return torch.tensor([token_ids], dtype=torch.long)
+
+
+def _combine_text(text, ref_text: Optional[str] = None) -> str:
+    """Join reference and target text and clean up whitespace.
+
+    Steps, in order: prepend the stripped ``ref_text`` (if non-empty) with a
+    single space, drop newline / carriage-return characters, replace
+    full-width parentheses with ASCII ones, collapse runs of spaces / tabs,
+    and remove whitespace adjacent to Chinese characters.
+    """
+    pieces = [text.strip()]
+    if ref_text:
+        pieces.insert(0, ref_text.strip())
+    merged = " ".join(pieces)
+
+    # Newlines and carriage returns are removed, not replaced by a space.
+    merged = re.sub(r"[\r\n]+", "", merged)
+
+    # Full-width parentheses become their ASCII counterparts.
+    merged = merged.replace("\uff08", "(").replace("\uff09", ")")
+
+    # Runs of spaces / tabs shrink to one space.
+    merged = re.sub(r"[ \t]+", " ", merged)
+
+    # No whitespace directly before or after a Chinese character.
+    chinese_range = r"[\u4e00-\u9fff]"
+    around_chinese = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})"
+    return re.sub(around_chinese, "", merged)
+
+
+# ---------------------------------------------------------------------------
+# Register with HuggingFace Auto classes
+# ---------------------------------------------------------------------------
+
+# Register the config class under the "omnivoice" key and map that config
+# class to the OmniVoice model class.
+# NOTE(review): presumably this lets the Auto* factories resolve checkpoints
+# with this model type — confirm.
+AutoConfig.register("omnivoice", OmniVoiceConfig)
+AutoModel.register(OmniVoiceConfig, OmniVoice)
diff --git a/omnivoice/scripts/__init__.py b/omnivoice/scripts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/omnivoice/scripts/denoise_audio.py b/omnivoice/scripts/denoise_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..27fce910d6028ef3e0acebca8eeb1864ec4456fc
--- /dev/null
+++ b/omnivoice/scripts/denoise_audio.py
@@ -0,0 +1,1049 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Denoise audio with Sidon and pack results into WebDataset shards.
+
+Supports two input modes:
+
+1. WebDataset manifest (data.lst):
+ python denoise_audio.py \
+ --input_manifest data.lst \
+ --tar_output_pattern output/audios/shard-%06d.tar \
+ --jsonl_output_pattern output/txts/shard-%06d.jsonl \
+ --feature_extractor_path sidon-v0.1/feature_extractor_cuda.pt \
+ --decoder_path sidon-v0.1/decoder_cuda.pt
+
+2. Raw JSONL (each line: {"id": "...", "audio_path": "...", ...}):
+ python denoise_audio.py \
+ --input_jsonl data.jsonl \
+ --tar_output_pattern output/audios/shard-%06d.tar \
+ --jsonl_output_pattern output/txts/shard-%06d.jsonl \
+ --feature_extractor_path sidon-v0.1/feature_extractor_cuda.pt \
+ --decoder_path sidon-v0.1/decoder_cuda.pt
+
+Output structure:
+ output_dir/
+ ├── audios/ # WebDataset tar shards (.flac audio + .json metadata)
+ │ ├── shard_000000.tar
+ │ └── ...
+ ├── txts/ # Per-shard JSONL metadata
+ │ ├── shard_000000.jsonl
+ │ └── ...
+ ├── data.lst # Manifest of the output shards
+ └── errors.jsonl # Failed samples with error details
+"""
+
+from __future__ import annotations
+
+import argparse
+import io
+import json
+import logging
+import os
+import pickle
+import struct
+import subprocess
+import sys
+import threading
+from concurrent.futures import FIRST_COMPLETED, Future, wait
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence, Union
+
+import numpy as np
+import torch
+import torchaudio
+import webdataset as wds
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+
+from omnivoice.data.batching import StreamLengthGroupDataset
+from omnivoice.data.dataset import JsonlDatasetReader, WebDatasetReader
+import soundfile as sf
+from omnivoice.utils.common import str2bool
+
+# Sample rates (Hz) associated with the Sidon denoiser.
+# SIDON_OUTPUT_SAMPLE_RATE is used to compute the expected output length of
+# each waveform in SpeechDenoisingProcessor.process_batch.
+# NOTE(review): SIDON_INPUT_SAMPLE_RATE is presumably the rate audio must
+# have before feature extraction — confirm against the loader.
+SIDON_INPUT_SAMPLE_RATE = 16_000
+SIDON_OUTPUT_SAMPLE_RATE = 48_000
+
+
+def build_parser() -> argparse.ArgumentParser:
+ """Build the command-line parser for the denoising script.
+
+ The module docstring is used as the parser description. All options are
+ optional at parser level; no argument is marked as required here.
+
+ Returns:
+ The configured ``argparse.ArgumentParser``.
+ """
+ parser = argparse.ArgumentParser(description=__doc__)
+
+ # ── Input (mutually exclusive) ──
+ # NOTE(review): exclusivity is not enforced by this parser (no
+ # mutually-exclusive group is used); presumably the caller checks it.
+ # NOTE(review): the second half of the --input_manifest help string is
+ # blank; the description of the manifest line format looks lost.
+ parser.add_argument(
+ "--input_manifest",
+ default=None,
+ help="WebDataset manifest (data.lst). Each line: "
+ " ",
+ )
+ parser.add_argument(
+ "--input_jsonl",
+ default=None,
+ help="Raw JSONL file. Each line: " '{"id": "...", "audio_path": "...", ...}',
+ )
+
+ # ── Output ──
+ # "%%" in the help strings below is argparse's escape for a literal "%".
+ parser.add_argument(
+ "--tar_output_pattern",
+ default=None,
+ help="Tar shard pattern, e.g. output/audios/shard_%%06d.tar",
+ )
+ parser.add_argument(
+ "--jsonl_output_pattern",
+ default=None,
+ help="JSONL shard pattern, e.g. output/txts/shard_%%06d.jsonl",
+ )
+ parser.add_argument(
+ "--samples_per_shard",
+ type=int,
+ default=1_000,
+ help="Maximum records per output shard",
+ )
+
+ # ── Model ──
+ parser.add_argument(
+ "--feature_extractor_path",
+ default=None,
+ help="Path to feature_extractor_cuda.pt",
+ )
+ parser.add_argument(
+ "--decoder_path",
+ default=None,
+ help="Path to decoder_cuda.pt",
+ )
+ parser.add_argument(
+ "--target_sample_rate",
+ type=int,
+ default=24_000,
+ help="Sample rate of the denoised output audio",
+ )
+
+ # ── Filtering ──
+ parser.add_argument(
+ "--min_length",
+ type=float,
+ default=0.0,
+ help="Minimum audio duration in seconds",
+ )
+ parser.add_argument(
+ "--max_length",
+ type=float,
+ default=80.0,
+ help="Maximum audio duration in seconds",
+ )
+
+ # ── Batching ──
+ parser.add_argument(
+ "--batch_duration",
+ type=float,
+ default=200.0,
+ help="Target batch duration in seconds for dynamic batching",
+ )
+ parser.add_argument(
+ "--max_sample",
+ type=int,
+ default=32,
+ help="Maximum samples per batch for dynamic batching",
+ )
+
+ # ── Distributed ──
+ parser.add_argument(
+ "--num_machines",
+ type=int,
+ default=1,
+ help="Total number of machines for distributed runs",
+ )
+ parser.add_argument(
+ "--machine_index",
+ type=int,
+ default=0,
+ help="Zero-based machine index when distributing across multiple "
+ "machines (e.g. 0, 1, ... num_machines-1)",
+ )
+
+ # ── Parallelism ──
+ parser.add_argument(
+ "--nj_per_gpu",
+ type=int,
+ default=1,
+ help="Worker processes per GPU (default 1)",
+ )
+ parser.add_argument(
+ "--loader_workers",
+ type=int,
+ default=16,
+ help="PyTorch DataLoader worker threads",
+ )
+
+ # ── Data order (JSONL mode) ──
+ parser.add_argument(
+ "--shuffle",
+ type=str2bool,
+ default=True,
+ help="Shuffle JSONL entries",
+ )
+ parser.add_argument(
+ "--shuffle_seed",
+ type=int,
+ default=42,
+ help="Seed for JSONL shuffle",
+ )
+
+ # ── Error handling ──
+ parser.add_argument(
+ "--skip_errors",
+ action="store_true",
+ help="Skip items that fail to denoise instead of aborting",
+ )
+ # Hidden from --help (argparse.SUPPRESS).
+ parser.add_argument(
+ "--_subprocess_worker",
+ action="store_true",
+ help=argparse.SUPPRESS,
+ )
+ return parser
+
+
+# ---------------------------------------------------------------------------
+# Utilities
+# ---------------------------------------------------------------------------
+
+
+def count_lines(path: str) -> int:
+    """Return the number of newline bytes in the file at ``path``.
+
+    The file is read in binary mode in 1 MiB blocks, so a final line without
+    a trailing newline is not counted.
+    """
+    block_size = 1 << 20
+    total = 0
+    with open(path, "rb") as stream:
+        while True:
+            block = stream.read(block_size)
+            if not block:
+                break
+            total += block.count(b"\n")
+    return total
+
+
+# Type of the ``padding`` argument of extract_seamless_m4t_features.
+PaddingStrategy = Union[bool, str]
+# Type of the values in the dict returned by extract_seamless_m4t_features.
+ReturnType = Union[torch.Tensor, np.ndarray]
+
+
+def extract_seamless_m4t_features(
+ raw_speech: Union[torch.Tensor, List[float], List[torch.Tensor], List[List[float]]],
+ sampling_rate: int = 16000,
+ num_mel_bins: int = 80,
+ frame_length: int = 25,
+ frame_shift: int = 10,
+ preemphasis_coefficient: float = 0.97,
+ dither: float = 0.0,
+ window_type: str = "povey",
+ do_normalize_per_mel_bins: bool = True,
+ stride: int = 2,
+ padding: PaddingStrategy = "longest",
+ max_length: Optional[int] = None,
+ pad_to_multiple_of: Optional[int] = 2,
+ return_tensors: Optional[str] = "pt",
+ return_attention_mask: bool = True,
+ padding_value: float = 0.0,
+ device: torch.device = torch.device("cpu"),
+) -> Dict[str, ReturnType]:
+ """Extract SeamlessM4T features using Torch-only operators."""
+ # Pipeline: fbank per waveform -> optional per-bin normalisation ->
+ # padding to a common length -> stacking of `stride` consecutive frames.
+ # Returns a dict with "input_features" and, when return_attention_mask is
+ # True, "attention_mask". Values are converted to NumPy arrays when
+ # return_tensors == "np".
+ # Raises ValueError when padding is not "longest" and max_length is None.
+ # Any non-list input is treated as one sample.
+ if not isinstance(raw_speech, list):
+ raw_speech = [raw_speech]
+
+ processed_speech = [
+ torch.as_tensor(sample, dtype=torch.float32, device=device)
+ for sample in raw_speech
+ ]
+
+ features: List[torch.Tensor] = []
+ for waveform in processed_speech:
+ # For input with more than one dimension only waveform[0] is used.
+ if waveform.ndim > 1:
+ waveform = waveform[0]
+ waveform_tensor = waveform.unsqueeze(0)
+ feature = torchaudio.compliance.kaldi.fbank(
+ waveform=waveform_tensor,
+ sample_frequency=sampling_rate,
+ num_mel_bins=num_mel_bins,
+ frame_length=frame_length,
+ frame_shift=frame_shift,
+ dither=dither,
+ preemphasis_coefficient=preemphasis_coefficient,
+ remove_dc_offset=True,
+ window_type=window_type,
+ use_energy=False,
+ energy_floor=1.192092955078125e-07,
+ )
+ features.append(feature.squeeze(0))
+
+ if do_normalize_per_mel_bins:
+ # Normalise each feature with the mean and variance taken along
+ # dim 0; 1e-5 keeps the square root away from zero.
+ normalised: List[torch.Tensor] = []
+ for feature in features:
+ mean = feature.mean(0, keepdim=True)
+ var = feature.var(0, keepdim=True)
+ normalised.append((feature - mean) / torch.sqrt(var + 1e-5))
+ features = normalised
+
+ def _pad_batch(
+ features: List[torch.Tensor],
+ padding_strategy: PaddingStrategy = "longest",
+ max_length: Optional[int] = None,
+ pad_to_multiple_of: Optional[int] = None,
+ padding_value: float = 0.0,
+ ) -> tuple[torch.Tensor, torch.Tensor]:
+ # Pad all features along dim 0 to one target length and build a 0/1
+ # mask marking the real (non-padded) frames.
+ if padding_strategy == "longest":
+ target_length = max(f.shape[0] for f in features)
+ elif max_length is not None:
+ target_length = max_length
+ else:
+ raise ValueError(
+ "max_length must be provided when padding_strategy is not 'longest'"
+ )
+
+ # Round the target length up to the next multiple.
+ if pad_to_multiple_of is not None:
+ target_length = (
+ (target_length + pad_to_multiple_of - 1)
+ // pad_to_multiple_of
+ * pad_to_multiple_of
+ )
+
+ batch_size = len(features)
+ feature_dim = features[0].shape[1]
+ device = features[0].device
+
+ padded_features = torch.full(
+ (batch_size, target_length, feature_dim),
+ padding_value,
+ dtype=torch.float32,
+ device=device,
+ )
+ attention_mask = torch.zeros(
+ (batch_size, target_length),
+ dtype=torch.int64,
+ device=device,
+ )
+
+ for index, feature_tensor in enumerate(features):
+ seq_len = feature_tensor.shape[0]
+ padded_features[index, :seq_len] = feature_tensor
+ attention_mask[index, :seq_len] = 1
+
+ return padded_features, attention_mask
+
+ input_features, attention_mask = _pad_batch(
+ features,
+ padding_strategy=padding,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ padding_value=padding_value,
+ )
+
+ # Drop trailing frames so the frame count is divisible by `stride`, then
+ # fold each group of `stride` consecutive frames into one feature vector.
+ batch_size, num_frames, num_channels = input_features.shape
+ new_num_frames = (num_frames // stride) * stride
+ input_features = input_features[:, :new_num_frames, :]
+ if return_attention_mask:
+ attention_mask = attention_mask[:, :new_num_frames]
+
+ input_features = input_features.reshape(
+ batch_size, new_num_frames // stride, num_channels * stride
+ )
+
+ output: Dict[str, ReturnType] = {"input_features": input_features}
+ if return_attention_mask:
+ # The mask of a folded frame is the mask value at offset 1 inside its
+ # group of `stride` frames.
+ output["attention_mask"] = attention_mask[:, 1::stride]
+
+ if return_tensors == "np":
+ for key, value in output.items():
+ output[key] = value.cpu().numpy() # type: ignore[assignment]
+
+ return output
+
+
+def serialise_flac(key: str, waveform: torch.Tensor, sample_rate: int) -> dict:
+    """Encode ``waveform`` as FLAC bytes and wrap them in a sample dict.
+
+    Args:
+        key: Value stored under ``"__key__"``.
+        waveform: Audio tensor; a 2-D tensor is transposed before writing.
+        sample_rate: Sample rate passed to ``soundfile.write``.
+
+    Returns:
+        ``{"__key__": key, "flac": <encoded bytes>}``.
+    """
+    samples = waveform.to(dtype=torch.float32).cpu().numpy()
+    if samples.ndim == 2:
+        # (C, T) → (T, C) for soundfile
+        samples = samples.T
+    sink = io.BytesIO()
+    sf.write(sink, samples, sample_rate, format="FLAC")
+    encoded = sink.getvalue()
+    return {"__key__": key, "flac": encoded}
+
+
+def _normalise_value(value: Any) -> Any:
+    """Turn tensors and NumPy values into plain Python objects.
+
+    Zero-dimensional tensors and NumPy scalars become Python scalars, other
+    tensors and NumPy arrays become (nested) lists, and every other value is
+    returned unchanged.
+    """
+    if isinstance(value, torch.Tensor):
+        return value.item() if value.ndim == 0 else value.cpu().tolist()
+    if isinstance(value, np.generic):
+        return value.item()
+    if isinstance(value, np.ndarray):
+        return value.tolist()
+    return value
+
+
def _encode_metadata(metadata: dict[str, Any]) -> bytes:
    """Serialise ``metadata`` to UTF-8 JSON, dropping entries whose value is None."""
    payload: dict[str, Any] = {
        name: _normalise_value(item)
        for name, item in metadata.items()
        if item is not None
    }
    return json.dumps(payload, ensure_ascii=False).encode("utf-8")
+
+
+# ---------------------------------------------------------------------------
+# Denoising model
+# ---------------------------------------------------------------------------
+
+
+class SpeechDenoisingProcessor:
+ """Run the TorchScripted feature extractor and decoder."""
+
+ def __init__(
+ self,
+ feature_extractor_path: str,
+ decoder_path: str,
+ device: str,
+ ) -> None:
+ self.device = torch.device(device)
+ self.feature_extractor = torch.jit.load(
+ feature_extractor_path, map_location=self.device
+ )
+ self.decoder = torch.jit.load(decoder_path, map_location=self.device)
+ self.feature_extractor.eval()
+ self.decoder.eval()
+
+ @torch.inference_mode()
+ def process(self, waveform: torch.Tensor, sample_rate: int) -> torch.Tensor:
+ return self.process_batch([waveform], [sample_rate])[0]
+
+ @torch.inference_mode()
+ def process_batch(
+ self,
+ waveforms: Sequence[torch.Tensor] | torch.Tensor,
+ sample_rates: Optional[Sequence[int]] = None,
+ expected_lengths: Optional[Sequence[int]] = None,
+ ) -> List[torch.Tensor]:
+ if expected_lengths is None:
+ expected_lengths: list[int] = []
+ for waveform, sample_rate in zip(waveforms, sample_rates):
+ duration_seconds = waveform.shape[-1] / float(sample_rate)
+ expected_lengths.append(
+ int(round(duration_seconds * SIDON_OUTPUT_SAMPLE_RATE))
+ )
+ waveforms = torch.nn.functional.pad(waveforms, (0, 24000))
+
+ features = extract_seamless_m4t_features(
+ [x for x in waveforms],
+ return_tensors="pt",
+ padding_value=1.0,
+ device=self.device,
+ )
+ feature_tensor = self.feature_extractor(
+ features["input_features"].to(self.device)
+ )["last_hidden_state"]
+ restored_waveforms = self.decoder(feature_tensor.transpose(1, 2)).cpu()
+
+ results: List[torch.Tensor] = []
+ for sample_idx, sample in enumerate(restored_waveforms):
+ restored_waveform = sample.view(-1)
+ target_length = expected_lengths[sample_idx]
+ current_length = restored_waveform.shape[-1]
+ if target_length > 0 and current_length != target_length:
+ diff = target_length - current_length
+ if diff > 0:
+ restored_waveform = torch.nn.functional.pad(
+ restored_waveform, (0, diff)
+ )
+ elif diff < 0:
+ restored_waveform = restored_waveform[:target_length]
+ results.append(restored_waveform.contiguous())
+
+ return results
+
+
+# ---------------------------------------------------------------------------
+# Batch collation
+# ---------------------------------------------------------------------------
+
+
@dataclass
class CollatedBatch:
    """Batch payload returned by the DataLoader collate function."""

    # Sample ids, taken from each sample's label["id"].
    keys: list[str]
    # Right-padded waveforms stacked into a single tensor by CollateFunction.
    waveforms: torch.Tensor
    # Per-sample duration in seconds, before padding.
    durations: list[float]
    # The label dict of every sample, in batch order.
    metadata: list[dict[str, Any]]

    @property
    def size(self) -> int:
        """Number of samples in the batch."""
        return len(self.keys)


class CollateFunction:
    """Collate a list of samples into a padded batch.

    ``CollatedBatch`` is defined before this class so that the return
    annotation of ``__call__`` resolves even when annotations are evaluated
    eagerly.
    """

    def __init__(
        self,
        sample_rate: int,
        skip_errors: bool,
    ) -> None:
        """Store the sample rate used for durations and the skip-errors flag."""
        self.sample_rate = sample_rate
        self.skip_errors = skip_errors

    def __call__(self, samples: Sequence[dict[str, Any]]) -> CollatedBatch:
        """Build a ``CollatedBatch`` from ``samples``.

        Each sample must provide ``sample["audio"]`` (a tensor whose last
        dimension is time; presumably shaped (1, T) — confirm against the
        dataset reader) and ``sample["label"]`` with an ``"id"`` entry.
        An empty ``samples`` yields an empty batch instead of an error.
        """
        if not samples:
            # pad_sequence rejects an empty list; callers skip size-0 batches.
            return CollatedBatch(
                keys=[], waveforms=torch.empty((0, 0)), durations=[], metadata=[]
            )

        keys: list[str] = []
        waveforms: list[torch.Tensor] = []
        durations: list[float] = []
        metadata: list[dict[str, Any]] = []

        for sample in samples:
            keys.append(sample["label"]["id"])
            waveforms.append(sample["audio"].squeeze(0))
            durations.append(sample["audio"].size(-1) / self.sample_rate)
            metadata.append(sample["label"])
        padded = torch.nn.utils.rnn.pad_sequence(waveforms, batch_first=True)

        return CollatedBatch(
            keys=keys, waveforms=padded, durations=durations, metadata=metadata
        )
+
+
+# ---------------------------------------------------------------------------
+# Subprocess-based GPU worker pool
+# ---------------------------------------------------------------------------
+#
+# Problem: PyTorch ≥2.8 caches CUDA device state at import time. Neither
+# forkserver nor spawn lets us change CUDA_VISIBLE_DEVICES *before* the CUDA
+# runtime captures the device list. The only reliable approach is to launch
+# each worker as a **subprocess** with CUDA_VISIBLE_DEVICES set in the
+# subprocess environment, guaranteeing it takes effect before `import torch`.
+#
+# Protocol (parent ↔ child, length-prefixed pickle over stdin/stdout):
+# Parent → child: 4-byte LE uint32 length + pickle(CollatedBatch)
+# Child → parent: 4-byte LE uint32 length + pickle(result dict)
+# Shutdown signal: 4 zero bytes (length == 0)
+
+
+def _subprocess_recv():
+ """Read a length-prefixed pickled object from stdin. Returns None on shutdown."""
+ raw = sys.stdin.buffer.read(4)
+ if len(raw) < 4:
+ return None
+ (length,) = struct.unpack(" Future:
+ worker = self.workers[self._rr % len(self.workers)]
+ self._rr += 1
+ with self._futures_lock:
+ req_id = self._next_id
+ self._next_id += 1
+ fut = Future()
+ self._futures[req_id] = fut
+ batch_dict = {
+ "_req_id": req_id,
+ "_batch": batch,
+ }
+ worker.submit(batch_dict)
+ return fut
+
+ def shutdown(self):
+ for worker in self.workers:
+ worker.shutdown()
+ for t in self._reader_threads:
+ t.join(timeout=5)
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+
def main() -> None:
    """Denoise a dataset and repack the result into WebDataset shards.

    Reads samples from either a raw JSONL file or a WebDataset manifest,
    groups them into batches, submits the batches to a pool of subprocess
    workers, and writes every restored waveform as FLAC into tar shards with
    per-shard JSONL metadata. ``data.lst`` and ``errors.jsonl`` are written
    two directory levels above the tar shard files.

    When the parsed arguments carry ``_subprocess_worker``, the function
    only runs ``subprocess_worker_main()`` and returns.

    Raises:
        RuntimeError: If a batch fails, or any sample fails to be written,
            while ``--skip_errors`` is not set.
    """
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
    parser = build_parser()
    args = parser.parse_args()

    # ── Subprocess worker mode ──
    if args._subprocess_worker:
        subprocess_worker_main()
        return

    # Validate input arguments
    assert args.tar_output_pattern is not None, "--tar_output_pattern is required."
    assert args.jsonl_output_pattern is not None, "--jsonl_output_pattern is required."
    assert bool(args.input_manifest) != bool(
        args.input_jsonl
    ), "Exactly one of --input_manifest or --input_jsonl must be provided."

    if args.num_machines > 1:
        assert (
            0 <= args.machine_index < args.num_machines
        ), f"machine_index {args.machine_index} must be in [0, {args.num_machines})"

    # ── Build base dataset and count total samples ──
    if args.input_jsonl:
        logging.info(f"Input mode: raw JSONL ({args.input_jsonl})")
        total_samples = count_lines(args.input_jsonl)
        base_dataset = JsonlDatasetReader(
            args.input_jsonl,
            sample_rate=SIDON_INPUT_SAMPLE_RATE,
            shuffle=args.shuffle,
            shuffle_seed=args.shuffle_seed,
        )
        loader_workers = args.loader_workers
    else:
        logging.info(f"Input mode: WebDataset manifest ({args.input_manifest})")
        manifest_num_lines = count_lines(args.input_manifest)
        loader_workers = min(args.loader_workers, manifest_num_lines)
        total_samples = 0
        manifests = []
        with open(args.input_manifest, "r", encoding="utf-8") as f:
            for line_id, line in tqdm(
                enumerate(f),
                total=manifest_num_lines,
                desc="Calculating dataset length",
            ):
                items = line.strip().split(" ")
                tar_path, jsonl_path, num_items, duration = (
                    items[0],
                    items[1],
                    int(items[2]),
                    float(items[3]),
                )
                assert os.path.exists(tar_path), f"File {tar_path} does not exist."
                assert os.path.exists(jsonl_path), f"File {jsonl_path} does not exist."
                assert jsonl_path.endswith(
                    ".jsonl"
                ), f"File {jsonl_path} is not a .jsonl file."
                # Shards are distributed round-robin across machines.
                if (
                    args.num_machines > 1
                    and line_id % args.num_machines != args.machine_index
                ):
                    continue
                total_samples += num_items
                manifests.append((tar_path, jsonl_path, num_items, duration))
        logging.info(
            f"Total shards: {manifest_num_lines}, "
            f"Shards for current index: {len(manifests)}"
        )
        base_dataset = WebDatasetReader(
            manifests=manifests,
            sample_rate=SIDON_INPUT_SAMPLE_RATE,
            evaluation=True,
        )

    # ── Dynamic batching + DataLoader ──
    batched_dataset = StreamLengthGroupDataset(
        dataset=base_dataset,
        batch_duration=args.batch_duration,
        max_sample=args.max_sample,
        min_length=args.min_length,
        max_length=args.max_length,
    )

    collate_fn = CollateFunction(
        skip_errors=args.skip_errors,
        sample_rate=SIDON_INPUT_SAMPLE_RATE,
    )

    dataloader = DataLoader(
        dataset=batched_dataset,
        batch_size=None,
        collate_fn=collate_fn,
        num_workers=loader_workers,
        prefetch_factor=10 if loader_workers > 0 else None,
        pin_memory=True,
        persistent_workers=loader_workers > 0,
    )

    # ── Multi-GPU process pool ──
    num_devices = torch.cuda.device_count()
    if num_devices == 0:
        logging.warning("No GPUs detected - using CPU for processing")
        num_processes = args.nj_per_gpu
    else:
        num_processes = num_devices * args.nj_per_gpu
        logging.info(
            f"GPU count: {num_devices}, Processes per GPU: {args.nj_per_gpu}, "
            f"Total processes: {num_processes}"
        )

    # Build a list of (physical_gpu_id, num_workers) for each pool.
    # When num_devices == 0 we use a single CPU pool.
    if num_devices == 0:
        pool_specs = [(None, num_processes)]
    else:
        pool_specs = [(gpu_id, args.nj_per_gpu) for gpu_id in range(num_devices)]

    # ── Output paths ──
    tar_output_pattern = str(Path(args.tar_output_pattern).expanduser())
    jsonl_output_pattern = str(Path(args.jsonl_output_pattern).expanduser())
    Path(tar_output_pattern).parent.mkdir(parents=True, exist_ok=True)
    Path(jsonl_output_pattern).parent.mkdir(parents=True, exist_ok=True)

    output_dir = Path(tar_output_pattern).parent.parent
    error_log_path = str(output_dir / "errors.jsonl")
    manifest_path = str(output_dir / "data.lst")

    error_logger = logging.getLogger("error_log")
    error_logger.setLevel(logging.ERROR)
    error_logger.handlers.clear()
    error_fh = logging.FileHandler(error_log_path, mode="w", encoding="utf-8")
    error_fh.setFormatter(logging.Formatter("%(message)s"))
    error_logger.addHandler(error_fh)

    # ── Progress and shard tracking ──
    processed_count = 0
    error_count = 0
    write_error_count = 0
    failed_ids = []
    shard_idx = 0
    shard_sample_count = 0
    shard_duration = 0.0
    samples_per_shard = args.samples_per_shard
    shard_manifest = {}  # shard_idx -> (tar_path, jsonl_path, count, duration)
    target_sample_rate = args.target_sample_rate

    tar_writer = None
    jsonl_file = None

    def open_new_shard():
        """Close the current shard (recording it if non-empty) and open the next."""
        nonlocal tar_writer, jsonl_file, shard_idx, shard_sample_count, shard_duration
        if tar_writer is not None:
            tar_writer.close()
        if jsonl_file is not None:
            jsonl_file.close()
        if shard_idx > 0 and shard_sample_count > 0:
            prev_idx = shard_idx - 1
            shard_manifest[prev_idx] = (
                os.path.abspath(tar_output_pattern % prev_idx),
                os.path.abspath(jsonl_output_pattern % prev_idx),
                shard_sample_count,
                shard_duration,
            )
        tar_fname = tar_output_pattern % shard_idx
        jsonl_fname = jsonl_output_pattern % shard_idx
        tar_writer = wds.TarWriter(tar_fname)
        jsonl_file = open(jsonl_fname, "w", encoding="utf-8")
        shard_idx += 1
        shard_sample_count = 0
        shard_duration = 0.0

    def write_sample(key, waveform, metadata):
        """Write one sample to the current shard; return True on success.

        Failures are logged and counted in ``write_error_count`` instead of
        being raised, so the caller must use the return value to decide
        whether the sample counts as processed.
        """
        nonlocal shard_sample_count, write_error_count, shard_duration
        assert tar_writer is not None and jsonl_file is not None
        try:
            if target_sample_rate != SIDON_OUTPUT_SAMPLE_RATE:
                waveform = torchaudio.functional.resample(
                    waveform,
                    orig_freq=SIDON_OUTPUT_SAMPLE_RATE,
                    new_freq=target_sample_rate,
                )
            # Peak-normalise to 0.6; the epsilon avoids division by zero.
            waveform = (waveform / (waveform.abs().max() + 1e-7)) * 0.6

            # Serialise both records before writing either of them.
            record = serialise_flac(key, waveform, target_sample_rate)
            jsonl_record = _encode_metadata(metadata)
            tar_writer.write(record)
            jsonl_file.write(jsonl_record.decode("utf-8") + "\n")
            shard_sample_count += 1
            shard_duration += metadata.get("audio_duration", 0.0)
        except Exception as exc:
            write_error_count += 1
            failed_ids.append(key)
            error_logger.error(
                json.dumps({"id": key, "reason": str(exc)}, ensure_ascii=False)
            )
            logging.error(f"Write failed for sample {key}: {exc}")
            return False
        return True

    def handle_result(result):
        """Write the samples of a successful batch or record a failed batch."""
        nonlocal processed_count, error_count
        if result["status"] == "success":
            for key, cleaned, metadata in zip(
                result["keys"], result["results"], result["metadata"]
            ):
                if tar_writer is None or shard_sample_count >= samples_per_shard:
                    open_new_shard()
                # Only samples that were actually written count as successful;
                # write failures are already tracked in write_error_count.
                if write_sample(key, cleaned, metadata):
                    processed_count += 1
        else:
            error_count += result["size"]
            failed_ids.extend(result["keys"])
            for key in result["keys"]:
                error_logger.error(
                    json.dumps(
                        {"id": key, "reason": result["error"]},
                        ensure_ascii=False,
                    )
                )
            if not args.skip_errors:
                raise RuntimeError(
                    f"Batch starting with {result['keys'][0]} failed - terminating"
                )
            logging.warning(
                f"Skipping failed batch starting with {result['keys'][0]}: "
                f"{result['error']}"
            )

    # ── Main processing loop ──
    main_progress = tqdm(total=total_samples, desc="Denoising Audio")

    # Launch subprocess-based GPU workers. CUDA_VISIBLE_DEVICES is set in the
    # subprocess Popen environment so it takes effect before import torch.
    pool = GPUWorkerPool(pool_specs, args.feature_extractor_path, args.decoder_path)
    logging.info(f"Submitting tasks... ({num_processes} subprocess workers)")
    try:
        futures = set()
        # Bound the number of in-flight batches.
        max_pending = num_processes * 2

        def drain_completed():
            """Wait for at least one future to complete, process all done."""
            done, _ = wait(futures, return_when=FIRST_COMPLETED)
            for f in done:
                futures.discard(f)
                result = f.result()
                main_progress.update(result["size"])
                handle_result(result)
                main_progress.set_postfix(
                    OK=processed_count,
                    Err=error_count,
                )

        for batch in dataloader:
            if batch.size == 0:
                continue
            if len(futures) >= max_pending:
                drain_completed()
            futures.add(pool.submit(batch))

        logging.info("Processing remaining pending batches...")
        while futures:
            drain_completed()

    except Exception:
        logging.error("Critical error during processing", exc_info=True)
        raise
    finally:
        pool.shutdown()
        main_progress.close()
        if tar_writer is not None:
            tar_writer.close()
        if jsonl_file is not None:
            jsonl_file.close()
        if shard_idx > 0 and shard_sample_count > 0:
            last_idx = shard_idx - 1
            shard_manifest[last_idx] = (
                os.path.abspath(tar_output_pattern % last_idx),
                os.path.abspath(jsonl_output_pattern % last_idx),
                shard_sample_count,
                shard_duration,
            )
        # Flush and release errors.jsonl; nothing is logged to it after this.
        error_logger.removeHandler(error_fh)
        error_fh.close()

    # ── Write manifest (data.lst) ──
    with open(manifest_path, "w", encoding="utf-8") as mf:
        for idx in sorted(shard_manifest.keys()):
            tar_path, jsonl_path, count, duration = shard_manifest[idx]
            mf.write(f"{tar_path} {jsonl_path} {count} {duration:.3f}\n")

    # ── Summary ──
    total_failed = error_count + write_error_count
    filtered_and_skipped = total_samples - processed_count - total_failed
    logging.info(
        f"Processing Complete - Successful: {processed_count}, Failed: {total_failed}, "
        f"Filtered/Skipped: {filtered_and_skipped}, Shards written: {shard_idx}"
    )
    logging.info(f"Manifest written to: {manifest_path} ({len(shard_manifest)} shards)")
    if total_failed > 0:
        logging.info(f"Error details: {error_log_path}")
    if failed_ids and args.skip_errors:
        logging.warning(
            f"Failed sample IDs (count: {len(failed_ids)}): {failed_ids[:100]}..."
        )
    if write_error_count > 0 and not args.skip_errors:
        raise RuntimeError(
            f"{write_error_count} samples failed to write - check logs for details"
        )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/omnivoice/scripts/extract_audio_tokens.py b/omnivoice/scripts/extract_audio_tokens.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f03a4fc712c789bd9d743a70461e0436497822b
--- /dev/null
+++ b/omnivoice/scripts/extract_audio_tokens.py
@@ -0,0 +1,625 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Extract audio tokens from audio data and pack them into WebDataset shards.
+
+Supports two input modes:
+
+1. WebDataset manifest (data.lst):
+ python extract_audio_tokens.py \
+ --input_manifest data.lst \
+ --tar_output_pattern output/audios/shard-%06d.tar \
+ --jsonl_output_pattern output/txts/shard-%06d.jsonl
+
+2. Raw JSONL (each line: {"id": "...", "audio_path": "...", "text": "...", ...}):
+ python extract_audio_tokens.py \
+ --input_jsonl data.jsonl \
+ --tar_output_pattern output/audios/shard-%06d.tar \
+ --jsonl_output_pattern output/txts/shard-%06d.jsonl
+
+Output structure:
+ output_dir/
+ ├── audios/ # WebDataset tar shards (.npy audio tokens + .json metadata)
+ │ ├── shard_000000.tar
+ │ └── ...
+ ├── txts/ # Per-shard JSONL metadata
+ │ ├── shard_000000.jsonl
+ │ └── ...
+ ├── data.lst # Manifest, one line per shard: tar_path jsonl_path num_samples duration
+ └── errors.jsonl # Failed samples with error details
+"""
+
+import argparse
+import io
+import json
+import logging
+import multiprocessing as mp
+import os
+import warnings
+from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+import webdataset as wds
+from torch.utils.data import DataLoader, IterableDataset
+from tqdm.auto import tqdm
+from transformers import AutoFeatureExtractor, HiggsAudioV2TokenizerModel
+
+from omnivoice.data.dataset import JsonlDatasetReader, WebDatasetReader
+from omnivoice.utils.common import str2bool
+
+warnings.filterwarnings(
+ "ignore", category=FutureWarning, module="torch.nn.utils.weight_norm"
+)
+
+HIGGS_INPUT_SAMPLE_RATE = 24_000
+
+
+# Global variables: per-worker tokenizer and feature extractor, set by process_init()
+worker_tokenizer = None
+worker_feature_extractor = None
+
+
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the token extraction script.

    The module docstring is used as the parser description. The shuffle seed
    is accepted both as ``--shuffle-seed`` (original spelling) and as
    ``--shuffle_seed`` so that it matches the underscore style of every other
    option; both store into ``args.shuffle_seed``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--input_manifest",
        default=None,
        help="Path to input dataset manifest (data.lst).",
    )
    parser.add_argument(
        "--input_jsonl",
        default=None,
        help="Path to raw JSONL file (alternative to --input_manifest).",
    )
    parser.add_argument(
        "--tar_output_pattern",
        required=True,
        help="Tar shard pattern passed to WebDataset",
    )
    parser.add_argument(
        "--jsonl_output_pattern",
        required=True,
        help="Jsonl shard pattern passed to WebDataset",
    )
    parser.add_argument(
        "--samples_per_shard",
        type=int,
        default=1000,
        help="Maximum records per shard",
    )
    parser.add_argument(
        "--min_num_shards",
        type=int,
        default=32,
        help="Minimum number of output shards (use to ensure "
        "shard count >= num_gpu * num_workers)",
    )
    parser.add_argument(
        "--tokenizer_path",
        type=str,
        default="eustlb/higgs-audio-v2-tokenizer",
        help="Path to audio tokenizer.",
    )
    parser.add_argument(
        "--skip_errors", action="store_true", help="Skip items that fail to process"
    )
    parser.add_argument(
        "--min_length",
        type=float,
        default=0.0,
        help="Minimum audio duration in seconds (e.g. 2.0)",
    )
    parser.add_argument(
        "--max_length",
        type=float,
        default=float("inf"),
        help="Maximum audio duration in seconds (e.g. 15.0)",
    )
    parser.add_argument(
        "--num_machines",
        type=int,
        default=1,
        help="Total number of machines for distributed runs",
    )
    parser.add_argument(
        "--machine_index",
        type=int,
        default=0,
        help="Zero-based machine index when distributing across multiple "
        "machines (e.g. 0, 1, ... num_machines-1)",
    )
    parser.add_argument(
        "--nj_per_gpu",
        type=int,
        default=3,
        help="Number of worker processes to spawn per GPU.",
    )
    parser.add_argument(
        "--loader_workers",
        type=int,
        default=24,
        help="Number of DataLoader workers for streaming IterableDataset.",
    )
    parser.add_argument(
        "--shuffle",
        type=str2bool,
        default=True,
        help="Shuffle data by default.",
    )
    parser.add_argument(
        # The hyphenated form stays first so existing invocations and the
        # derived destination are unchanged; the underscore form is an alias.
        "--shuffle-seed",
        "--shuffle_seed",
        dest="shuffle_seed",
        type=int,
        default=42,
        help="Random seed for shuffle (default: 42).",
    )
    return parser
+
+
def count_lines(path):
    """Return the number of lines in the file at ``path``.

    The file is read in 1 MiB binary chunks and newline bytes are counted.
    A final line that is not terminated by a newline is counted as well, so
    a one-line file without a trailing newline yields 1 rather than 0.
    """
    newline_count = 0
    last_byte = b""
    with open(path, "rb") as f:
        for buf in iter(lambda: f.read(1 << 20), b""):
            newline_count += buf.count(b"\n")
            last_byte = buf[-1:]
    # A non-empty file whose last byte is not "\n" ends in an unterminated line.
    if last_byte and last_byte != b"\n":
        newline_count += 1
    return newline_count
+
+
def serialise_numpy(key: str, tokens: np.ndarray) -> dict:
    """Pack ``tokens`` in ``.npy`` format into a tar-writer record.

    Returns a dict with the sample key under ``"__key__"`` and the bytes
    produced by ``np.save`` under ``"npy"``.
    """
    with io.BytesIO() as stream:
        np.save(stream, tokens)
        payload = stream.getvalue()
    return {"__key__": key, "npy": payload}
+
+
def process_init(rank_queue, tokenizer_path):
    """
    Per-process initializer for the worker pool.

    Takes one rank from ``rank_queue`` to choose the device (a rank of -1, or
    CUDA being unavailable, selects the CPU), then loads the feature
    extractor and the tokenizer into the module-level worker globals.
    """
    global worker_tokenizer, worker_feature_extractor

    # Worker processes get their own logging configuration.
    logging.basicConfig(
        format=(
            "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d]"
            " [Worker %(process)d] %(message)s"
        ),
        level=logging.INFO,
        force=True,
    )

    # Each worker consumes exactly one entry of the shared rank queue.
    assigned_rank = rank_queue.get()
    use_cuda = assigned_rank != -1 and torch.cuda.is_available()
    worker_device = torch.device(f"cuda:{assigned_rank}" if use_cuda else "cpu")
    logging.debug(f"Worker process initialized with device: {worker_device}")

    # Only the tokenizer model is placed on the selected device.
    worker_feature_extractor = AutoFeatureExtractor.from_pretrained(tokenizer_path)
    worker_tokenizer = HiggsAudioV2TokenizerModel.from_pretrained(
        tokenizer_path, device_map=worker_device
    )
    logging.debug(f"Tokenizer loaded successfully on device {worker_device}")
+
+
def process_single_sample(sample: dict[str, Any]) -> dict[str, Any]:
    """
    Single-sample processing function executed in worker processes.

    Encodes ``sample["audio"]`` into audio tokens with the per-worker feature
    extractor and tokenizer. Every failure is caught and reported through the
    returned dict, so this function does not raise for a bad sample.

    Returns:
        A dict with keys ``status`` ("success" or "error"), ``key`` (sample
        id), ``audio_tokens`` (int16 NumPy array, or None on error),
        ``metadata`` (the sample label with ``num_tokens`` added, or None on
        error) and ``error_msg`` (None on success).
    """
    # Number of codebook rows the token tensor is required to have.
    expected_codebooks = 8
    try:
        audio_tensor = sample.get("audio", None)  # shape (1, T)
        if audio_tensor is None:
            raise ValueError("Sample missing 'audio' field")

        with torch.inference_mode():
            key = sample["label"]["id"]
            inputs = worker_feature_extractor(
                raw_audio=audio_tensor.squeeze(0).numpy(),
                sampling_rate=HIGGS_INPUT_SAMPLE_RATE,
                return_tensors="pt",
            ).to(worker_tokenizer.device)
            audio_tokens = worker_tokenizer.encode(
                inputs["input_values"],
            ).audio_codes.squeeze(0)

        # Explicit check instead of ``assert``: it still runs under ``-O`` and
        # gives errors.jsonl a non-empty reason.
        if audio_tokens.dim() != 2 or audio_tokens.size(0) != expected_codebooks:
            raise ValueError(
                f"Unexpected audio token shape {tuple(audio_tokens.shape)}; "
                f"expected ({expected_codebooks}, T)"
            )

        num_tokens = audio_tokens.size(1)
        metadata = sample["label"]
        metadata["num_tokens"] = num_tokens

        # Convert to numpy format for subsequent serialization (int16 to save space)
        audio_tokens_np = audio_tokens.to(torch.int16).cpu().numpy()

        return {
            "status": "success",
            "key": key,
            "audio_tokens": audio_tokens_np,
            "metadata": metadata,
            "error_msg": None,
        }
    except Exception as e:
        # The label itself may be missing or malformed, so look the id up
        # defensively rather than failing inside the error handler.
        label = sample.get("label") if isinstance(sample, dict) else None
        sample_id = label.get("id", "unknown") if isinstance(label, dict) else "unknown"
        logging.error(f"Failed to process sample {sample_id}: {e}")
        return {
            "status": "error",
            "key": sample_id,
            "audio_tokens": None,
            "metadata": None,
            "error_msg": str(e),
        }
+
+
+def _normalise_value(value: Any) -> Any:
+ """Convert tensors and NumPy scalars to serialisable Python objects."""
+ if isinstance(value, torch.Tensor):
+ if value.ndim == 0:
+ return value.item()
+ return value.cpu().tolist()
+ if isinstance(value, np.generic):
+ return value.item()
+ if isinstance(value, np.ndarray):
+ return value.tolist()
+ return value
+
+
def _encode_metadata(metadata: dict[str, Any]) -> bytes:
    """Serialise ``metadata`` to UTF-8 JSON, dropping entries whose value is None."""
    payload: dict[str, Any] = {
        name: _normalise_value(item)
        for name, item in metadata.items()
        if item is not None
    }
    return json.dumps(payload, ensure_ascii=False).encode("utf-8")
+
+
class StreamingLengthFilteredDataset(IterableDataset):
    """Iterable dataset that yields only samples whose duration is in range.

    The duration is the last dimension of ``sample["audio"]`` divided by
    ``sr``. Out-of-range samples are counted in ``filtered_count`` and
    logged; samples that raise while being inspected are logged and skipped.
    """

    def __init__(
        self,
        base_iterable,
        min_len: float,
        max_len: float,
        sr: int,
    ):
        self.base_iterable = base_iterable
        self.sr = sr
        self.min_len, self.max_len = min_len, max_len
        self.filtered_count = 0

    def __iter__(self):
        """Stream samples one by one and filter on the fly."""
        for item in self.base_iterable:
            try:
                seconds = item["audio"].size(-1) / self.sr
                if not (self.min_len <= seconds <= self.max_len):
                    self.filtered_count += 1
                    logging.warning(
                        f"Filtered sample (duration out of range): "
                        f"{item['label']['id']} ({seconds:.2f}s)"
                    )
                    continue
                yield item
            except Exception as err:
                logging.warning(f"Skipped invalid sample during streaming: {err}")
+
+
def main() -> None:
    """Extract audio tokens from a dataset and pack them into WebDataset shards.

    Reads samples from either a raw JSONL file or a WebDataset manifest,
    filters them by duration, tokenizes each one in a pool of worker
    processes, and writes the tokens as ``.npy`` records into tar shards with
    per-shard JSONL metadata. ``data.lst`` and ``errors.jsonl`` are written
    two directory levels above the tar shard files.

    Raises:
        RuntimeError: If a sample fails to process, or any sample fails to be
            written, while ``--skip_errors`` is not set.
    """
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
    parser = build_parser()
    args = parser.parse_args()
    mp.set_start_method("spawn", force=True)

    # Validate input arguments
    assert bool(args.input_manifest) != bool(
        args.input_jsonl
    ), "Exactly one of --input_manifest or --input_jsonl must be provided."

    if args.num_machines > 1:
        assert (
            0 <= args.machine_index < args.num_machines
        ), f"machine_index {args.machine_index} must be in [0, {args.num_machines})"

    # Build base dataset and count total samples based on input mode
    if args.input_jsonl:
        logging.info(f"Input mode: raw JSONL ({args.input_jsonl})")
        total_samples = count_lines(args.input_jsonl)
        base_dataset = JsonlDatasetReader(
            args.input_jsonl,
            sample_rate=HIGGS_INPUT_SAMPLE_RATE,
            shuffle=args.shuffle,
            shuffle_seed=args.shuffle_seed,
        )
        loader_workers = args.loader_workers
    else:
        logging.info(f"Input mode: WebDataset manifest ({args.input_manifest})")
        manifest_num_lines = count_lines(args.input_manifest)
        loader_workers = min(args.loader_workers, manifest_num_lines)
        total_samples = 0
        manifests = []
        with open(args.input_manifest, "r", encoding="utf-8") as f:
            for line_id, line in tqdm(
                enumerate(f),
                total=manifest_num_lines,
                desc="Calculating dataset length",
            ):
                items = line.strip().split(" ")
                tar_path, jsonl_path, num_items, duration = (
                    items[0],
                    items[1],
                    int(items[2]),
                    float(items[3]),
                )
                assert os.path.exists(tar_path), f"File {tar_path} does not exist."
                assert os.path.exists(jsonl_path), f"File {jsonl_path} does not exist."
                assert jsonl_path.endswith(
                    ".jsonl"
                ), f"File {jsonl_path} is not a .jsonl file."
                # Shards are distributed round-robin across machines.
                if (
                    args.num_machines > 1
                    and line_id % args.num_machines != args.machine_index
                ):
                    continue
                total_samples += num_items
                manifests.append((tar_path, jsonl_path, num_items, duration))
        logging.info(
            f"Total shards: {manifest_num_lines}, "
            f"Shards for current index: {len(manifests)}"
        )
        base_dataset = WebDatasetReader(
            manifests=manifests,
            sample_rate=HIGGS_INPUT_SAMPLE_RATE,
            evaluation=True,
        )

    # Adjust samples_per_shard if min_num_shards would be violated
    samples_per_shard = args.samples_per_shard
    if total_samples > 0:
        estimated_shards = max(
            1, (total_samples + samples_per_shard - 1) // samples_per_shard
        )
        if estimated_shards < args.min_num_shards:
            samples_per_shard = max(1, total_samples // args.min_num_shards)
            logging.info(
                f"Adjusted samples_per_shard from {args.samples_per_shard} to "
                f"{samples_per_shard} to meet min_num_shards={args.min_num_shards} "
                f"(total_samples={total_samples})"
            )

    # Apply length filter and create DataLoader
    filtered_dataset = StreamingLengthFilteredDataset(
        base_iterable=base_dataset,
        min_len=args.min_length,
        max_len=args.max_length,
        sr=HIGGS_INPUT_SAMPLE_RATE,
    )
    dataloader = DataLoader(
        dataset=filtered_dataset,
        batch_size=None,
        num_workers=loader_workers,
        persistent_workers=loader_workers > 0,
        pin_memory=False,
    )

    # Configure multi-GPU multi-process setup
    num_devices = torch.cuda.device_count()
    if num_devices == 0:
        logging.warning("No GPUs detected - using CPU for processing")
        num_processes = args.nj_per_gpu
    else:
        num_processes = num_devices * args.nj_per_gpu
        logging.info(
            f"GPU count: {num_devices}, Processes per GPU: {args.nj_per_gpu}, "
            f"Total processes: {num_processes}"
        )

    # Shared GPU rank queue for process assignment
    manager = mp.Manager()
    rank_queue = manager.Queue()
    for rank in list(range(num_devices)) * args.nj_per_gpu:
        rank_queue.put(rank)
    if num_devices == 0:
        # -1 tells process_init to use the CPU.
        for _ in range(num_processes):
            rank_queue.put(-1)

    # Prepare output paths
    tar_output_pattern = str(Path(args.tar_output_pattern).expanduser())
    jsonl_output_pattern = str(Path(args.jsonl_output_pattern).expanduser())
    Path(tar_output_pattern).parent.mkdir(parents=True, exist_ok=True)
    Path(jsonl_output_pattern).parent.mkdir(parents=True, exist_ok=True)

    # Determine output directory from tar_output_pattern
    output_dir = Path(tar_output_pattern).parent.parent
    error_log_path = str(output_dir / "errors.jsonl")
    manifest_path = str(output_dir / "data.lst")

    # Setup error logger (writes to errors.jsonl)
    error_logger = logging.getLogger("error_log")
    error_logger.setLevel(logging.ERROR)
    error_logger.handlers.clear()
    error_fh = logging.FileHandler(error_log_path, mode="w", encoding="utf-8")
    error_fh.setFormatter(logging.Formatter("%(message)s"))
    error_logger.addHandler(error_fh)

    # Progress and error tracking
    processed_count = 0
    error_count = 0
    write_error_count = 0
    failed_ids = []
    shard_idx = 0
    shard_sample_count = 0
    shard_duration = 0.0
    shard_manifest = {}  # shard_idx -> (tar_path, jsonl_path, count, duration)

    tar_writer = None
    jsonl_file = None

    def open_new_shard():
        """Close the current shard (recording it if non-empty) and open the next."""
        nonlocal tar_writer, jsonl_file, shard_idx, shard_sample_count, shard_duration
        if tar_writer is not None:
            tar_writer.close()
        if jsonl_file is not None:
            jsonl_file.close()
        # Record manifest for the previous shard
        if shard_idx > 0 and shard_sample_count > 0:
            prev_idx = shard_idx - 1
            shard_manifest[prev_idx] = (
                os.path.abspath(tar_output_pattern % prev_idx),
                os.path.abspath(jsonl_output_pattern % prev_idx),
                shard_sample_count,
                shard_duration,
            )
        tar_fname = tar_output_pattern % shard_idx
        jsonl_fname = jsonl_output_pattern % shard_idx
        tar_writer = wds.TarWriter(tar_fname)
        jsonl_file = open(jsonl_fname, "w", encoding="utf-8")
        shard_idx += 1
        shard_sample_count = 0
        shard_duration = 0.0

    def write_sample(key, audio_tokens_np, metadata):
        """Write one sample to the current shard; return True on success.

        Failures are logged and counted in ``write_error_count`` instead of
        being raised, so the caller must use the return value to decide
        whether the sample counts as processed.
        """
        nonlocal shard_sample_count, write_error_count, shard_duration
        assert tar_writer is not None and jsonl_file is not None
        try:
            # Serialise both records before writing either of them.
            token_record = serialise_numpy(key, audio_tokens_np)
            json_record = _encode_metadata(metadata)
            tar_writer.write(token_record)
            jsonl_file.write(json_record.decode("utf-8") + "\n")
            shard_sample_count += 1
            shard_duration += metadata.get("audio_duration", 0.0)
        except Exception as exc:
            write_error_count += 1
            failed_ids.append(key)
            error_logger.error(
                json.dumps({"id": key, "reason": str(exc)}, ensure_ascii=False)
            )
            logging.error(f"Write failed for sample {key}: {exc}")
            return False
        return True

    def handle_result(result):
        """Write a successfully tokenized sample or record a failed one."""
        nonlocal processed_count, error_count
        if result["status"] == "success":
            # Rotate shard if needed
            if tar_writer is None or shard_sample_count >= samples_per_shard:
                open_new_shard()
            # Only samples that were actually written count as successful;
            # write failures are already tracked in write_error_count.
            if write_sample(result["key"], result["audio_tokens"], result["metadata"]):
                processed_count += 1
        else:
            error_count += 1
            failed_ids.append(result["key"])
            error_logger.error(
                json.dumps(
                    {"id": result["key"], "reason": result["error_msg"]},
                    ensure_ascii=False,
                )
            )
            if not args.skip_errors:
                raise RuntimeError(
                    f"Sample {result['key']} processing failed due "
                    f"to {result['error_msg']} - terminating"
                )
            logging.warning(
                f"Skipping failed sample {result['key']}: {result['error_msg']}"
            )

    main_progress = tqdm(total=total_samples, desc="Extracting Audio Tokens")

    try:
        with ProcessPoolExecutor(
            max_workers=num_processes,
            initializer=process_init,
            initargs=(rank_queue, args.tokenizer_path),
        ) as executor:
            logging.info(f"Submitting tasks... ({num_processes} workers)")
            futures = set()
            # Bound the number of in-flight samples.
            max_pending = num_processes * 10

            def drain_completed():
                """Wait for at least one future to complete, process all done."""
                done, _ = wait(futures, return_when=FIRST_COMPLETED)
                for f in done:
                    futures.discard(f)
                    result = f.result()
                    main_progress.update(1)
                    handle_result(result)
                    main_progress.set_postfix(
                        Samples=processed_count,
                        Errors=error_count,
                    )

            # Stream samples from DataLoader
            for sample in dataloader:
                if len(futures) >= max_pending:
                    drain_completed()

                future = executor.submit(process_single_sample, sample)
                futures.add(future)

            # Process remaining futures
            logging.info("Processing remaining pending samples...")
            while futures:
                drain_completed()

    except Exception:
        logging.error("Critical error during processing", exc_info=True)
        raise
    finally:
        main_progress.close()
        if tar_writer is not None:
            tar_writer.close()
        if jsonl_file is not None:
            jsonl_file.close()
        # Record the last shard in the manifest
        if shard_idx > 0 and shard_sample_count > 0:
            last_idx = shard_idx - 1
            shard_manifest[last_idx] = (
                os.path.abspath(tar_output_pattern % last_idx),
                os.path.abspath(jsonl_output_pattern % last_idx),
                shard_sample_count,
                shard_duration,
            )
        # Flush and release errors.jsonl; nothing is logged to it after this.
        error_logger.removeHandler(error_fh)
        error_fh.close()

    # Write manifest file (data.lst)
    with open(manifest_path, "w", encoding="utf-8") as mf:
        for idx in sorted(shard_manifest.keys()):
            tar_path, jsonl_path, count, duration = shard_manifest[idx]
            mf.write(f"{tar_path} {jsonl_path} {count} {duration:.3f}\n")

    # Output final statistics
    total_failed = error_count + write_error_count
    filtered_and_skipped = total_samples - processed_count - total_failed
    logging.info(
        f"Processing Complete - Successful: {processed_count}, Failed: {total_failed}, "
        f"Filtered/Skipped: {filtered_and_skipped}, Shards written: {shard_idx}"
    )
    logging.info(f"Manifest written to: {manifest_path} ({len(shard_manifest)} shards)")
    if total_failed > 0:
        logging.info(f"Error details: {error_log_path}")
    if failed_ids and args.skip_errors:
        logging.warning(
            f"Failed sample IDs (count: {len(failed_ids)}): {failed_ids[:100]}..."
        )
    if write_error_count > 0 and not args.skip_errors:
        raise RuntimeError(
            f"{write_error_count} samples failed to write - check logs for details"
        )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/omnivoice/scripts/extract_audio_tokens_add_noise.py b/omnivoice/scripts/extract_audio_tokens_add_noise.py
new file mode 100644
index 0000000000000000000000000000000000000000..37b72ac0561c71cbd034f655f05b5159aa6c7955
--- /dev/null
+++ b/omnivoice/scripts/extract_audio_tokens_add_noise.py
@@ -0,0 +1,819 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Extract audio tokens from audio data and pack them into WebDataset shards.
+
+Extends ``extract_audio_tokens.py`` with optional noise and reverberation
+augmentation on the prompt (reference) portion of the audio. Requires a
+noise manifest and/or RIR manifest.
+
+Supports two input modes:
+
+1. WebDataset manifest (data.lst):
+ python extract_audio_tokens_add_noise.py \\
+ --input_manifest data.lst \\
+ --noise_manifest noise.lst \\
+ --tar_output_pattern output/audios/shard-%06d.tar \\
+ --jsonl_output_pattern output/txts/shard-%06d.jsonl
+
+2. Raw JSONL (each line: {"id": "...", "audio_path": "...", "text": "...", ...}):
+ python extract_audio_tokens_add_noise.py \\
+ --input_jsonl data.jsonl \\
+ --noise_manifest noise.lst \\
+ --tar_output_pattern output/audios/shard-%06d.tar \\
+ --jsonl_output_pattern output/txts/shard-%06d.jsonl
+
+Output structure:
+    output_dir/
+    ├── audios/          # WebDataset tar shards (.npy audio tokens)
+    │   ├── shard-000000.tar
+    │   └── ...
+    ├── txts/            # Per-shard JSONL metadata
+    │   ├── shard-000000.jsonl
+    │   └── ...
+    ├── data.lst         # Manifest, one line per shard: tar_path jsonl_path num_samples duration
+    └── errors.jsonl     # Failed samples with error details
+"""
+
+import argparse
+import io
+import json
+import logging
+import math
+import multiprocessing as mp
+import os
+import random
+import warnings
+from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import webdataset as wds
+from torch.utils.data import DataLoader, IterableDataset
+from tqdm.auto import tqdm
+from transformers import AutoFeatureExtractor, HiggsAudioV2TokenizerModel
+
+from omnivoice.data.dataset import JsonlDatasetReader, WebDatasetReader
+from omnivoice.utils.audio import load_audio_bytes
+from omnivoice.utils.common import str2bool
+
+# Silence FutureWarning messages originating from torch.nn.utils.weight_norm.
+warnings.filterwarnings(
+    "ignore", category=FutureWarning, module="torch.nn.utils.weight_norm"
+)
+
+# Sample rate handed to the dataset readers, the feature extractor and the
+# noise/RIR samplers in this script.
+HIGGS_INPUT_SAMPLE_RATE = 24_000
+
+# Per-worker-process state, populated by process_init(): the audio tokenizer,
+# its feature extractor, and the optional noise / RIR augmentation samplers.
+worker_tokenizer = None
+worker_feature_extractor = None
+worker_noise_sampler = None
+worker_rir_sampler = None
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser for the token extraction script.
+
+    The module docstring is used as the parser description. Options are
+    registered in the order listed below, which is also the order in which
+    they appear in ``--help``.
+
+    Returns:
+        The configured ``argparse.ArgumentParser``.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+
+    # (flag, add_argument keyword arguments), in registration order.
+    option_table = [
+        # Input source
+        (
+            "--input_manifest",
+            dict(default=None, help="Path to input dataset manifest (data.lst)."),
+        ),
+        (
+            "--input_jsonl",
+            dict(
+                default=None,
+                help="Path to raw JSONL file (alternative to --input_manifest).",
+            ),
+        ),
+        # Output layout
+        (
+            "--tar_output_pattern",
+            dict(required=True, help="Tar shard pattern passed to WebDataset"),
+        ),
+        (
+            "--jsonl_output_pattern",
+            dict(required=True, help="Jsonl shard pattern passed to WebDataset"),
+        ),
+        (
+            "--samples_per_shard",
+            dict(type=int, default=1000, help="Maximum records per shard"),
+        ),
+        (
+            "--min_num_shards",
+            dict(
+                type=int,
+                default=32,
+                help="Minimum number of output shards (use to ensure "
+                "shard count >= num_gpu * num_workers)",
+            ),
+        ),
+        # Tokenizer and error policy
+        (
+            "--tokenizer_path",
+            dict(
+                type=str,
+                default="eustlb/higgs-audio-v2-tokenizer",
+                help="Path to audio tokenizer.",
+            ),
+        ),
+        (
+            "--skip_errors",
+            dict(action="store_true", help="Skip items that fail to process"),
+        ),
+        # Duration filter
+        (
+            "--min_length",
+            dict(
+                type=float,
+                default=0.0,
+                help="Minimum audio duration in seconds (e.g. 2.0)",
+            ),
+        ),
+        (
+            "--max_length",
+            dict(
+                type=float,
+                default=float("inf"),
+                help="Maximum audio duration in seconds (e.g. 15.0)",
+            ),
+        ),
+        # Parallelism
+        (
+            "--num_machines",
+            dict(
+                type=int,
+                default=1,
+                help="Total number of machines for distributed runs",
+            ),
+        ),
+        (
+            "--machine_index",
+            dict(
+                type=int,
+                default=0,
+                help="Zero-based machine index when distributing across multiple "
+                "machines (e.g. 0, 1, ... num_machines-1)",
+            ),
+        ),
+        (
+            "--nj_per_gpu",
+            dict(
+                type=int,
+                default=3,
+                help="Number of worker processes to spawn per GPU.",
+            ),
+        ),
+        (
+            "--loader_workers",
+            dict(
+                type=int,
+                default=24,
+                help="Number of DataLoader workers for streaming IterableDataset.",
+            ),
+        ),
+        # Shuffling
+        (
+            "--shuffle",
+            dict(type=str2bool, default=True, help="Shuffle data by default."),
+        ),
+        (
+            "--shuffle-seed",
+            dict(
+                type=int,
+                default=42,
+                help="Random seed for shuffle (default: 42).",
+            ),
+        ),
+        # Prompt augmentation
+        (
+            "--noise_manifest",
+            dict(
+                default=None,
+                help="Path to noise manifest (list of tar files). Enables prompt noise augmentation.",
+            ),
+        ),
+        (
+            "--rir_manifest",
+            dict(
+                default=None,
+                help="Path to RIR manifest (list of tar files). Enables prompt reverb augmentation.",
+            ),
+        ),
+    ]
+    for flag, spec in option_table:
+        cli.add_argument(flag, **spec)
+    return cli
+
+
+def count_lines(path):
+    """Count the lines of a file by streaming it in 1 MiB binary chunks.
+
+    A final line that is not terminated by a newline is counted too, so the
+    result equals the number of lines obtained when iterating over the file.
+
+    Args:
+        path: Path of the file to inspect.
+
+    Returns:
+        The number of lines in the file (0 for an empty file).
+    """
+    total = 0
+    last_byte = b""
+    with open(path, "rb") as f:
+        while chunk := f.read(1 << 20):
+            total += chunk.count(b"\n")
+            last_byte = chunk[-1:]
+    # Counting only b"\n" misses a trailing line without a newline terminator.
+    if last_byte and last_byte != b"\n":
+        total += 1
+    return total
+
+
+def serialise_numpy(key: str, tokens: np.ndarray) -> dict:
+    """Build a tar record holding ``tokens`` in ``.npy`` format.
+
+    Args:
+        key: Value stored under ``"__key__"``.
+        tokens: Array written with ``np.save``.
+
+    Returns:
+        ``{"__key__": key, "npy": <bytes produced by np.save>}``.
+    """
+    with io.BytesIO() as stream:
+        np.save(stream, tokens)
+        payload = stream.getvalue()
+    return {"__key__": key, "npy": payload}
+
+
+def _load_aug_audio(data, sample_rate=24000):
+    """Decode an augmentation clip and return it as a torch tensor.
+
+    ``data`` and ``sample_rate`` are forwarded unchanged to
+    ``load_audio_bytes``; its result is wrapped with ``torch.from_numpy``.
+    """
+    decoded = load_audio_bytes(data, sample_rate)
+    return torch.from_numpy(decoded)
+
+
+class SimpleWorkerSampler:
+    """Endless stream of noise/RIR clips for use inside one worker process.
+
+    The clips are read from WebDataset tar shards, decoded from the first of
+    the ``wav`` / ``flac`` / ``mp3`` entries found in a sample, shuffled with
+    a buffer of 100 and repeated.
+    """
+
+    def __init__(self, tar_paths, sample_rate=24000):
+        """Build the repeating pipeline over ``tar_paths`` and open an iterator."""
+        shards = wds.WebDataset(
+            tar_paths, shardshuffle=True, nodesplitter=None, workersplitter=None
+        )
+        clips = shards.decode().map(lambda item: self._decode(item, sample_rate))
+        # Samples without a supported audio entry decode to None; drop them.
+        usable = clips.select(lambda clip: clip is not None)
+        self.dataset = usable.shuffle(100).repeat()
+        self.iterator = iter(self.dataset)
+
+    def _decode(self, sample, sample_rate):
+        """Return the decoded audio of ``sample`` or None if no known entry."""
+        found = next((e for e in ("wav", "flac", "mp3") if e in sample), None)
+        if found is None:
+            return None
+        return _load_aug_audio(sample[found], sample_rate)
+
+    def sample_segment(self, target_len, allow_repeat=True):
+        """Fetch the next clip and fit it to ``target_len`` samples.
+
+        A clip shorter than ``target_len`` is tiled along its last axis
+        (zero-padded when it is empty) if ``allow_repeat`` is true; otherwise
+        it is returned as is. A clip longer than ``target_len`` is cropped at
+        a random offset.
+        """
+        try:
+            clip = next(self.iterator)
+        except StopIteration:
+            # Restart the pipeline if the iterator ever runs dry.
+            self.iterator = iter(self.dataset)
+            clip = next(self.iterator)
+
+        length = clip.size(-1)
+        if allow_repeat and length < target_len:
+            if length > 0:
+                clip = clip.repeat(1, math.ceil(target_len / length))
+            else:
+                clip = F.pad(clip, (0, target_len), mode="constant")
+            length = clip.size(-1)
+
+        if length <= target_len:
+            return clip
+
+        offset = random.randint(0, length - target_len)
+        return clip[..., offset : offset + target_len]
+
+
+def _convolve1d(signal: torch.Tensor, kernel: torch.Tensor) -> torch.Tensor:
+    """Full linear convolution of ``signal`` with ``kernel`` via the FFT.
+
+    The convolution runs along the last axis; leading axes follow the usual
+    broadcasting of the element-wise product of the two spectra.
+
+    Args:
+        signal: Tensor whose last axis has length ``m``.
+        kernel: Tensor whose last axis has length ``n``.
+
+    Returns:
+        Tensor whose last axis has length ``m + n - 1``.
+    """
+    padded_size = signal.size(-1) + kernel.size(-1) - 1
+    f_signal = torch.fft.rfft(signal, n=padded_size)
+    f_kernel = torch.fft.rfft(kernel, n=padded_size)
+    # irfft(n=padded_size) already yields exactly padded_size samples on the
+    # last axis, so no trimming is needed. The previous result[:padded_size]
+    # sliced axis 0 and would truncate the batch axis of batched input.
+    return torch.fft.irfft(f_signal * f_kernel, n=padded_size)
+
+
+def _apply_rir(audio, rir, mix_ratio=0.5):
+    """Reverberate the first channel of ``audio`` with ``rir`` and mix it back.
+
+    The first row of ``rir`` is scaled by ``0.5**15`` and convolved with
+    ``audio[0]``. The result is aligned on the impulse-response peak, cut to
+    the input length, rescaled to the energy of the dry signal and blended
+    as ``(1 - mix_ratio) * dry + mix_ratio * wet``.
+
+    Returns:
+        The blended signal with a leading axis of size 1 added.
+    """
+    dry = audio[0]
+    num_samples = audio.shape[-1]
+
+    impulse = rir[0, :] * 0.5**15
+    wet_full = _convolve1d(dry, impulse)
+
+    # Start at the strongest tap so the wet signal stays aligned with the dry.
+    peak = torch.argmax(torch.abs(impulse))
+    stop = peak + num_samples
+    if stop > wet_full.shape[0]:
+        wet = F.pad(wet_full[peak:], (0, stop - wet_full.shape[0]))
+    else:
+        wet = wet_full[peak:stop]
+
+    # Match the wet energy to the dry energy (skipped for a silent result).
+    dry_energy = torch.sum(dry**2)
+    wet_energy = torch.sum(wet**2)
+    if wet_energy > 0:
+        wet *= torch.sqrt(dry_energy / wet_energy)
+
+    blended = (1 - mix_ratio) * dry + mix_ratio * wet
+    return blended.unsqueeze(0)
+
+
+def _load_aug_sampler(manifest_path, label):
+    """Build a SimpleWorkerSampler from a manifest; return None on any failure.
+
+    Each non-blank manifest line contributes its first whitespace-separated
+    field as a tar path. ``label`` ("noise" / "RIR") is only used in logs.
+    """
+    try:
+        with open(manifest_path, "r", encoding="utf-8") as f:
+            tars = [line.split()[0] for line in f if line.strip()]
+        if not tars:
+            # A repeating sampler over zero shards has nothing to yield, so
+            # disable this augmentation instead of building it.
+            logging.warning(
+                f"Failed to load {label} manifest: "
+                f"no tar files listed in {manifest_path}"
+            )
+            return None
+        sampler = SimpleWorkerSampler(tars, sample_rate=HIGGS_INPUT_SAMPLE_RATE)
+        logging.debug(f"{label[:1].upper()}{label[1:]} sampler initialized.")
+        return sampler
+    except Exception as e:
+        # Best effort: augmentation is optional, so the worker keeps running.
+        logging.warning(f"Failed to load {label} manifest: {e}")
+        return None
+
+
+def process_init(rank_queue, tokenizer_path, noise_manifest=None, rir_manifest=None):
+    """Initialise one worker process of the extraction pool.
+
+    Configures logging, takes a device rank from ``rank_queue`` (``-1`` or no
+    CUDA selects the CPU), loads the feature extractor and tokenizer, and
+    optionally builds the noise / RIR samplers. Results are stored in the
+    module-level ``worker_*`` globals.
+
+    Args:
+        rank_queue: Queue holding one device rank per worker.
+        tokenizer_path: Name or path passed to ``from_pretrained``.
+        noise_manifest: Optional path to a noise manifest.
+        rir_manifest: Optional path to a RIR manifest.
+    """
+    global worker_tokenizer, worker_feature_extractor, worker_noise_sampler, worker_rir_sampler
+
+    # Configure worker process logging
+    formatter = (
+        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d]"
+        " [Worker %(process)d] %(message)s"
+    )
+    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
+
+    # Get assigned GPU rank
+    rank = rank_queue.get()
+    # Determine device
+    if rank != -1 and torch.cuda.is_available():
+        worker_device = torch.device(f"cuda:{rank}")
+    else:
+        worker_device = torch.device("cpu")
+
+    logging.debug(f"Worker process initialized with device: {worker_device}")
+    # Load tokenizer onto the specified device
+    worker_feature_extractor = AutoFeatureExtractor.from_pretrained(tokenizer_path)
+    worker_tokenizer = HiggsAudioV2TokenizerModel.from_pretrained(
+        tokenizer_path, device_map=worker_device
+    )
+    logging.debug(f"Tokenizer loaded successfully on device {worker_device}")
+
+    # Initialize augmentation samplers (optional)
+    if noise_manifest:
+        worker_noise_sampler = _load_aug_sampler(noise_manifest, "noise")
+    if rir_manifest:
+        worker_rir_sampler = _load_aug_sampler(rir_manifest, "RIR")
+
+
+def _augment_prompt(audio_tensor: torch.Tensor) -> tuple[torch.Tensor, int]:
+    """Corrupt the leading part of an utterance with noise and/or reverb.
+
+    The audio is peak-normalised to 0.6. A random 10-30% prefix is mixed
+    with sampled noise at a random SNR of 5-15 dB (when a noise sampler
+    exists) and, with 30% probability, reverberated (when a RIR sampler
+    exists). The prefix is written back and the whole signal is
+    peak-normalised to 0.9.
+
+    Returns:
+        ``(augmented_audio, split_idx)``, where ``split_idx`` is the sample
+        index at which the untouched audio starts.
+    """
+    # Pre-normalization
+    peak = audio_tensor.abs().max() + 1e-7
+    audio_tensor = (audio_tensor / peak) * 0.6
+
+    num_samples = audio_tensor.size(-1)
+    split_idx = int(num_samples * random.uniform(0.1, 0.3))
+    prompt = audio_tensor[:, :split_idx].clone()
+
+    # Additive noise at a random signal-to-noise ratio
+    if worker_noise_sampler is not None:
+        noise = worker_noise_sampler.sample_segment(split_idx)
+        snr_db = random.uniform(5, 15)
+        root_len = split_idx**0.5
+        prompt_rms = prompt.norm(p=2) / root_len
+        noise_rms = noise.norm(p=2) / root_len
+        if noise_rms > 1e-9:
+            amplitude_ratio = 10 ** (snr_db / 20)
+            gain = prompt_rms / (amplitude_ratio * noise_rms + 1e-8)
+            prompt = prompt + noise * gain
+
+    # Reverberation (30% probability)
+    if worker_rir_sampler is not None and random.random() < 0.3:
+        impulse = worker_rir_sampler.sample_segment(split_idx, allow_repeat=False)
+        wet_mix = random.uniform(0.3, 1.0)
+        try:
+            prompt = _apply_rir(prompt, impulse, wet_mix)
+        except Exception as e:
+            # Reverb is best effort; keep the (possibly noisy) prompt.
+            logging.warning(f"RIR failed: {e}")
+
+    # Write the augmented prefix back into the utterance
+    if prompt.device != audio_tensor.device:
+        prompt = prompt.to(audio_tensor.device)
+    audio_tensor[:, :split_idx] = prompt
+
+    # Post-normalization
+    peak = audio_tensor.abs().max() + 1e-7
+    audio_tensor = (audio_tensor / peak) * 0.9
+
+    return audio_tensor, split_idx
+
+
+def process_single_sample(sample: dict[str, Any]) -> dict[str, Any]:
+    """Tokenize one sample inside a worker process.
+
+    Optionally augments the prompt portion (when a noise or RIR sampler was
+    initialised), runs the feature extractor and tokenizer, and returns the
+    tokens as an int16 NumPy array together with the sample metadata.
+
+    Args:
+        sample: Dict with an ``"audio"`` tensor and a ``"label"`` dict that
+            contains at least ``"id"``. ``sample["label"]`` is updated in
+            place with ``"num_tokens"`` and, when augmentation is enabled,
+            ``"clean_start_token_idx"``.
+
+    Returns:
+        A dict with keys ``status`` (``"success"`` or ``"error"``), ``key``,
+        ``audio_tokens``, ``metadata`` and ``error_msg``. No exception is
+        propagated; failures are reported through ``status`` / ``error_msg``.
+    """
+    try:
+        audio_tensor = sample.get("audio", None)  # shape (1, T)
+        if audio_tensor is None:
+            raise ValueError("Sample missing 'audio' field")
+
+        # Apply prompt augmentation if noise/rir samplers are available
+        enable_aug = worker_noise_sampler is not None or worker_rir_sampler is not None
+        clean_sample_idx = 0
+        if enable_aug:
+            audio_tensor, clean_sample_idx = _augment_prompt(audio_tensor)
+
+        with torch.inference_mode():
+            key = sample["label"]["id"]
+
+            inputs = worker_feature_extractor(
+                raw_audio=audio_tensor.squeeze(0).numpy(),
+                sampling_rate=HIGGS_INPUT_SAMPLE_RATE,
+                return_tensors="pt",
+            ).to(worker_tokenizer.device)
+            audio_tokens = worker_tokenizer.encode(
+                inputs["input_values"],
+            ).audio_codes.squeeze(0)
+
+            # Explicit check instead of ``assert``: an AssertionError has an
+            # empty message (useless in errors.jsonl) and asserts are
+            # stripped under ``python -O``.
+            if audio_tokens.dim() != 2 or audio_tokens.size(0) != 8:
+                raise ValueError(
+                    f"Unexpected audio token shape {tuple(audio_tokens.shape)}; "
+                    f"expected (8, num_tokens)"
+                )
+
+            num_tokens = audio_tokens.size(1)
+            metadata = sample["label"]
+            metadata["num_tokens"] = num_tokens
+
+            if enable_aug:
+                # First token index that lies fully in the clean region.
+                clean_token_idx = math.ceil(
+                    clean_sample_idx / worker_tokenizer.config.hop_length
+                )
+                metadata["clean_start_token_idx"] = clean_token_idx
+
+            # Convert to numpy format for subsequent serialization (int16 to save space)
+            audio_tokens_np = audio_tokens.to(torch.int16).cpu().numpy()
+
+        return {
+            "status": "success",
+            "key": key,
+            "audio_tokens": audio_tokens_np,
+            "metadata": metadata,
+            "error_msg": None,
+        }
+    except Exception as e:
+        sample_id = sample.get("label", {}).get("id", "unknown")
+        logging.error(f"Failed to process sample {sample_id}: {e}")
+        return {
+            "status": "error",
+            "key": sample_id,
+            "audio_tokens": None,
+            "metadata": None,
+            "error_msg": str(e),
+        }
+
+
+def _normalise_value(value: Any) -> Any:
+    """Turn tensors, NumPy scalars and NumPy arrays into plain Python values.
+
+    Zero-dimensional tensors and NumPy scalars become Python scalars, other
+    tensors and NumPy arrays become (nested) lists, and anything else is
+    returned unchanged.
+    """
+    if isinstance(value, torch.Tensor):
+        return value.item() if value.ndim == 0 else value.cpu().tolist()
+    if isinstance(value, np.generic):
+        return value.item()
+    return value.tolist() if isinstance(value, np.ndarray) else value
+
+
+def _encode_metadata(metadata: dict[str, Any]) -> bytes:
+    """Serialise ``metadata`` to UTF-8 encoded JSON.
+
+    Entries whose value is ``None`` are dropped; the remaining values go
+    through ``_normalise_value`` first. Non-ASCII text is written verbatim.
+    """
+    payload: dict[str, Any] = {
+        name: _normalise_value(item)
+        for name, item in metadata.items()
+        if item is not None
+    }
+    text = json.dumps(payload, ensure_ascii=False)
+    return text.encode("utf-8")
+
+
+class StreamingLengthFilteredDataset(IterableDataset):
+    """Iterable wrapper that keeps only samples within a duration range.
+
+    The duration of a sample is ``sample["audio"].size(-1) / sr``; samples
+    with ``min_len <= duration <= max_len`` are passed through.
+    """
+
+    def __init__(
+        self,
+        base_iterable,
+        min_len: float,
+        max_len: float,
+        sr: int,
+    ):
+        self.base_iterable = base_iterable
+        self.min_len, self.max_len = min_len, max_len
+        self.sr = sr
+        # Number of samples dropped by this instance for being out of range.
+        self.filtered_count = 0
+
+    def __iter__(self):
+        """Yield in-range samples lazily; log and skip everything else."""
+        for sample in self.base_iterable:
+            try:
+                duration = sample["audio"].size(-1) / self.sr
+                in_range = self.min_len <= duration <= self.max_len
+                if not in_range:
+                    self.filtered_count += 1
+                    logging.warning(
+                        f"Filtered sample (duration out of range): "
+                        f"{sample['label']['id']} ({duration:.2f}s)"
+                    )
+                    continue
+                yield sample
+            except Exception as e:
+                # A malformed sample must not stop the stream.
+                logging.warning(f"Skipped invalid sample during streaming: {e}")
+
+
+def main() -> None:
+    """Entry point: tokenize a dataset and pack the tokens into shards.
+
+    Steps:
+      1. Parse and validate the command-line arguments.
+      2. Build the input reader (raw JSONL or WebDataset manifest), wrap it
+         in a duration filter and a ``DataLoader``.
+      3. Start a process pool whose workers are set up by ``process_init``
+         and stream samples through ``process_single_sample`` with a bounded
+         number of pending futures.
+      4. Write successful results into rotating tar/JSONL shards, log
+         failures to ``errors.jsonl`` and finally write ``data.lst``.
+
+    Raises:
+        SystemExit: On invalid command-line arguments.
+        FileNotFoundError, ValueError: On an invalid input manifest.
+        RuntimeError: If a sample fails and ``--skip_errors`` is not set.
+    """
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
+    parser = build_parser()
+    args = parser.parse_args()
+    mp.set_start_method("spawn", force=True)
+
+    # Validate input arguments (explicit errors rather than ``assert`` so the
+    # checks are not stripped under ``python -O``).
+    if bool(args.input_manifest) == bool(args.input_jsonl):
+        parser.error(
+            "Exactly one of --input_manifest or --input_jsonl must be provided."
+        )
+
+    if args.num_machines > 1 and not (
+        0 <= args.machine_index < args.num_machines
+    ):
+        parser.error(
+            f"machine_index {args.machine_index} must be in [0, {args.num_machines})"
+        )
+
+    # Build base dataset and count total samples based on input mode
+    if args.input_jsonl:
+        logging.info(f"Input mode: raw JSONL ({args.input_jsonl})")
+        total_samples = count_lines(args.input_jsonl)
+        base_dataset = JsonlDatasetReader(
+            args.input_jsonl,
+            sample_rate=HIGGS_INPUT_SAMPLE_RATE,
+            shuffle=args.shuffle,
+            shuffle_seed=args.shuffle_seed,
+        )
+        loader_workers = args.loader_workers
+    else:
+        logging.info(f"Input mode: WebDataset manifest ({args.input_manifest})")
+        manifest_num_lines = count_lines(args.input_manifest)
+        loader_workers = min(args.loader_workers, manifest_num_lines)
+        total_samples = 0
+        manifests = []
+        with open(args.input_manifest, "r", encoding="utf-8") as f:
+            for line_id, line in tqdm(
+                enumerate(f),
+                total=manifest_num_lines,
+                desc="Calculating dataset length",
+            ):
+                items = line.split()
+                if not items:
+                    # Tolerate blank lines (e.g. an empty line at the end).
+                    continue
+                if len(items) < 4:
+                    raise ValueError(
+                        f"Malformed line {line_id + 1} in {args.input_manifest}: "
+                        f"expected 'tar_path jsonl_path num_items duration', "
+                        f"got {line!r}"
+                    )
+                tar_path, jsonl_path, num_items, duration = (
+                    items[0],
+                    items[1],
+                    int(items[2]),
+                    float(items[3]),
+                )
+                if not os.path.exists(tar_path):
+                    raise FileNotFoundError(f"File {tar_path} does not exist.")
+                if not os.path.exists(jsonl_path):
+                    raise FileNotFoundError(f"File {jsonl_path} does not exist.")
+                if not jsonl_path.endswith(".jsonl"):
+                    raise ValueError(f"File {jsonl_path} is not a .jsonl file.")
+                # Round-robin assignment of manifest lines to machines.
+                if (
+                    args.num_machines > 1
+                    and line_id % args.num_machines != args.machine_index
+                ):
+                    continue
+                total_samples += num_items
+                manifests.append((tar_path, jsonl_path, num_items, duration))
+        logging.info(
+            f"Total shards: {manifest_num_lines}, "
+            f"Shards for current index: {len(manifests)}"
+        )
+        base_dataset = WebDatasetReader(
+            manifests=manifests,
+            sample_rate=HIGGS_INPUT_SAMPLE_RATE,
+            evaluation=True,
+        )
+
+    # Apply length filter and create DataLoader
+    filtered_dataset = StreamingLengthFilteredDataset(
+        base_iterable=base_dataset,
+        min_len=args.min_length,
+        max_len=args.max_length,
+        sr=HIGGS_INPUT_SAMPLE_RATE,
+    )
+    dataloader = DataLoader(
+        dataset=filtered_dataset,
+        batch_size=None,
+        num_workers=loader_workers,
+        persistent_workers=loader_workers > 0,
+        pin_memory=False,
+    )
+
+    # Adjust samples_per_shard if min_num_shards would be violated
+    samples_per_shard = args.samples_per_shard
+    if total_samples > 0:
+        estimated_shards = max(
+            1, (total_samples + samples_per_shard - 1) // samples_per_shard
+        )
+        if estimated_shards < args.min_num_shards:
+            samples_per_shard = max(1, total_samples // args.min_num_shards)
+            logging.info(
+                f"Adjusted samples_per_shard from {args.samples_per_shard} to "
+                f"{samples_per_shard} to meet min_num_shards={args.min_num_shards} "
+                f"(total_samples={total_samples})"
+            )
+
+    # Configure multi-GPU multi-process setup
+    num_devices = torch.cuda.device_count()
+    if num_devices == 0:
+        logging.warning("No GPUs detected - using CPU for processing")
+        num_processes = args.nj_per_gpu
+    else:
+        num_processes = num_devices * args.nj_per_gpu
+        logging.info(
+            f"GPU count: {num_devices}, Processes per GPU: {args.nj_per_gpu}, "
+            f"Total processes: {num_processes}"
+        )
+    if args.noise_manifest or args.rir_manifest:
+        logging.info(
+            f"Prompt augmentation enabled - "
+            f"noise: {args.noise_manifest or 'off'}, rir: {args.rir_manifest or 'off'}"
+        )
+
+    # Shared GPU rank queue for process assignment (-1 means CPU)
+    manager = mp.Manager()
+    rank_queue = manager.Queue()
+    for rank in list(range(num_devices)) * args.nj_per_gpu:
+        rank_queue.put(rank)
+    if num_devices == 0:
+        for _ in range(num_processes):
+            rank_queue.put(-1)
+
+    # Prepare output paths
+    tar_output_pattern = str(Path(args.tar_output_pattern).expanduser())
+    jsonl_output_pattern = str(Path(args.jsonl_output_pattern).expanduser())
+    Path(tar_output_pattern).parent.mkdir(parents=True, exist_ok=True)
+    Path(jsonl_output_pattern).parent.mkdir(parents=True, exist_ok=True)
+
+    # Determine output directory from tar_output_pattern
+    output_dir = Path(tar_output_pattern).parent.parent
+    error_log_path = str(output_dir / "errors.jsonl")
+    manifest_path = str(output_dir / "data.lst")
+
+    # Setup error logger (writes to errors.jsonl)
+    error_logger = logging.getLogger("error_log")
+    error_logger.setLevel(logging.ERROR)
+    error_logger.handlers.clear()
+    error_fh = logging.FileHandler(error_log_path, mode="w", encoding="utf-8")
+    error_fh.setFormatter(logging.Formatter("%(message)s"))
+    error_logger.addHandler(error_fh)
+
+    # Progress and error tracking
+    processed_count = 0
+    error_count = 0
+    write_error_count = 0
+    failed_ids = []
+    shard_idx = 0  # index of the NEXT shard to open
+    shard_sample_count = 0
+    shard_duration = 0.0
+    shard_manifest = {}  # shard_idx -> (tar_path, jsonl_path, count, duration)
+
+    tar_writer = None
+    jsonl_file = None
+
+    def open_new_shard():
+        """Close the current shard (recording it) and open the next one."""
+        nonlocal tar_writer, jsonl_file, shard_idx, shard_sample_count, shard_duration
+        if tar_writer is not None:
+            tar_writer.close()
+        if jsonl_file is not None:
+            jsonl_file.close()
+        # Record manifest for the previous shard
+        if shard_idx > 0 and shard_sample_count > 0:
+            prev_idx = shard_idx - 1
+            shard_manifest[prev_idx] = (
+                os.path.abspath(tar_output_pattern % prev_idx),
+                os.path.abspath(jsonl_output_pattern % prev_idx),
+                shard_sample_count,
+                shard_duration,
+            )
+        tar_fname = tar_output_pattern % shard_idx
+        jsonl_fname = jsonl_output_pattern % shard_idx
+        tar_writer = wds.TarWriter(tar_fname)
+        jsonl_file = open(jsonl_fname, "w", encoding="utf-8")
+        shard_idx += 1
+        shard_sample_count = 0
+        shard_duration = 0.0
+
+    def write_sample(key, audio_tokens_np, metadata):
+        """Write one sample to the open shard; return True on success."""
+        nonlocal shard_sample_count, write_error_count, shard_duration
+        assert tar_writer is not None and jsonl_file is not None
+        try:
+            token_record = serialise_numpy(key, audio_tokens_np)
+            json_record = _encode_metadata(metadata)
+            tar_writer.write(token_record)
+            jsonl_file.write(json_record.decode("utf-8") + "\n")
+            shard_sample_count += 1
+            shard_duration += metadata.get("audio_duration", 0.0)
+            return True
+        except Exception as exc:
+            write_error_count += 1
+            failed_ids.append(key)
+            error_logger.error(
+                json.dumps({"id": key, "reason": str(exc)}, ensure_ascii=False)
+            )
+            logging.error(f"Write failed for sample {key}: {exc}")
+            return False
+
+    def handle_result(result):
+        """Route a worker result to the shard writer or the error log."""
+        nonlocal processed_count, error_count
+        if result["status"] == "success":
+            # Rotate shard if needed
+            if tar_writer is None or shard_sample_count >= samples_per_shard:
+                open_new_shard()
+            # Count the sample as successful only if it was written; a failed
+            # write is already tracked in write_error_count.
+            if write_sample(result["key"], result["audio_tokens"], result["metadata"]):
+                processed_count += 1
+        else:
+            error_count += 1
+            failed_ids.append(result["key"])
+            error_logger.error(
+                json.dumps(
+                    {"id": result["key"], "reason": result["error_msg"]},
+                    ensure_ascii=False,
+                )
+            )
+            if not args.skip_errors:
+                raise RuntimeError(
+                    f"Sample {result['key']} processing failed due "
+                    f"to {result['error_msg']} - terminating"
+                )
+            logging.warning(
+                f"Skipping failed sample {result['key']}: {result['error_msg']}"
+            )
+
+    main_progress = tqdm(total=total_samples, desc="Extracting Audio Tokens")
+
+    try:
+        with ProcessPoolExecutor(
+            max_workers=num_processes,
+            initializer=process_init,
+            initargs=(
+                rank_queue,
+                args.tokenizer_path,
+                args.noise_manifest,
+                args.rir_manifest,
+            ),
+        ) as executor:
+            logging.info(f"Submitting tasks... ({num_processes} workers)")
+            futures = set()
+            # Bound the number of in-flight samples to limit memory use.
+            max_pending = num_processes * 10
+
+            def drain_completed():
+                """Wait for at least one future to complete, process all done."""
+                nonlocal futures
+                done, _ = wait(futures, return_when=FIRST_COMPLETED)
+                for f in done:
+                    futures.discard(f)
+                    result = f.result()
+                    main_progress.update(1)
+                    handle_result(result)
+                    main_progress.set_postfix(
+                        Samples=processed_count,
+                        Errors=error_count,
+                    )
+
+            # Stream samples from DataLoader
+            for sample in dataloader:
+                if len(futures) >= max_pending:
+                    drain_completed()
+
+                future = executor.submit(process_single_sample, sample)
+                futures.add(future)
+
+            # Process remaining futures
+            logging.info("Processing remaining pending samples...")
+            while futures:
+                drain_completed()
+
+    except Exception:
+        logging.error("Critical error during processing", exc_info=True)
+        raise
+    finally:
+        main_progress.close()
+        if tar_writer is not None:
+            tar_writer.close()
+        if jsonl_file is not None:
+            jsonl_file.close()
+        # Flush and release errors.jsonl
+        error_logger.removeHandler(error_fh)
+        error_fh.close()
+        # Record the last shard in the manifest
+        if shard_idx > 0 and shard_sample_count > 0:
+            last_idx = shard_idx - 1
+            shard_manifest[last_idx] = (
+                os.path.abspath(tar_output_pattern % last_idx),
+                os.path.abspath(jsonl_output_pattern % last_idx),
+                shard_sample_count,
+                shard_duration,
+            )
+
+    # Write manifest file (data.lst)
+    with open(manifest_path, "w", encoding="utf-8") as mf:
+        for idx in sorted(shard_manifest.keys()):
+            tar_path, jsonl_path, count, duration = shard_manifest[idx]
+            mf.write(f"{tar_path} {jsonl_path} {count} {duration:.3f}\n")
+
+    # Output final statistics
+    total_failed = error_count + write_error_count
+    filtered_and_skipped = total_samples - processed_count - total_failed
+    logging.info(
+        f"Processing Complete - Successful: {processed_count}, Failed: {total_failed}, "
+        f"Filtered/Skipped: {filtered_and_skipped}, Shards written: {shard_idx}"
+    )
+    logging.info(f"Manifest written to: {manifest_path} ({len(shard_manifest)} shards)")
+    if total_failed > 0:
+        logging.info(f"Error details: {error_log_path}")
+        if failed_ids and args.skip_errors:
+            logging.warning(
+                f"Failed sample IDs (count: {len(failed_ids)}): {failed_ids[:100]}..."
+            )
+    if write_error_count > 0 and not args.skip_errors:
+        raise RuntimeError(
+            f"{write_error_count} samples failed to write - check logs for details"
+        )
+
+
+# Call main() only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/omnivoice/scripts/jsonl_to_webdataset.py b/omnivoice/scripts/jsonl_to_webdataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..81442b4bc0f61b0bfebb0f2a757deb0051f242b3
--- /dev/null
+++ b/omnivoice/scripts/jsonl_to_webdataset.py
@@ -0,0 +1,445 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Pack a JSONL audio dataset into custom WebDataset shards
+(paired .tar and .jsonl files).
+
+Usage:
+    python jsonl_to_webdataset.py \\
+        --input data.jsonl \\
+        --output output_dir/ \\
+        --workers 16 \\
+        --threads 4 \\
+        --shard-size 1000 \\
+        --sr 24000
+
+Input JSONL format (one JSON object per line):
+ {"id": "utt_001", "audio_path": "/data/wavs/001.wav", "text": "hello world", ...}
+
+ Required fields: "id", "audio_path", "text"
+ All other fields are preserved in the output metadata.
+
+Output structure:
+    output_dir/
+    ├── audios/          # WebDataset tar shards
+    │   ├── shard-000000.tar
+    │   ├── shard-000001.tar
+    │   └── ...
+    ├── txts/            # Per-shard JSONL metadata (with audio_duration added)
+    │   ├── shard-000000.jsonl
+    │   ├── shard-000001.jsonl
+    │   └── ...
+    ├── data.lst         # Shard manifest
+    └── errors.jsonl     # Failed samples with error details
+"""
+
+import argparse
+import io
+import json
+import logging
+import multiprocessing as mp
+import os
+import random
+from concurrent.futures import (
+ FIRST_COMPLETED,
+ ProcessPoolExecutor,
+ ThreadPoolExecutor,
+ as_completed,
+ wait,
+)
+from itertools import islice
+from pathlib import Path
+
+import torch
+import torchaudio
+import webdataset as wds
+from tqdm import tqdm
+
+import soundfile as sf
+
+from omnivoice.utils.audio import load_waveform
+from omnivoice.utils.common import str2bool
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Create the command-line parser for the JSONL-to-WebDataset packer.
+
+    Options are registered in the order listed below, which is also the
+    order in which they appear in ``--help``.
+
+    Returns:
+        The configured ``argparse.ArgumentParser``.
+    """
+    cli = argparse.ArgumentParser(
+        description="Pack JSONL audio dataset into WebDataset shards."
+    )
+
+    # (flag, add_argument keyword arguments), in registration order.
+    option_table = [
+        (
+            "--input",
+            dict(type=str, default="data.jsonl", help="Path to input JSONL file"),
+        ),
+        (
+            "--output",
+            dict(type=str, default="emilia", help="Path to output directory"),
+        ),
+        (
+            "--workers",
+            dict(
+                type=int,
+                default=16,
+                help="Number of worker processes (default: 16)",
+            ),
+        ),
+        (
+            "--threads",
+            dict(type=int, default=4, help="Number of threads per worker process."),
+        ),
+        (
+            "--shard-size",
+            dict(
+                type=int,
+                default=1000,
+                help="Number of samples per shard (default: 1000)",
+            ),
+        ),
+        (
+            "--sr",
+            dict(type=int, default=24000, help="Target sample rate (default: 24000)"),
+        ),
+        (
+            "--shuffle",
+            dict(type=str2bool, default=True, help="Shuffle data by default."),
+        ),
+        (
+            "--shuffle-seed",
+            dict(
+                type=int,
+                default=42,
+                help="Random seed for shuffle (default: 42)",
+            ),
+        ),
+        (
+            "--min-duration",
+            dict(
+                type=float,
+                default=None,
+                help="Filter out samples shorter than this (seconds).",
+            ),
+        ),
+        (
+            "--max-duration",
+            dict(
+                type=float,
+                default=None,
+                help="Filter out samples >= this duration (seconds).",
+            ),
+        ),
+    ]
+    for flag, spec in option_table:
+        cli.add_argument(flag, **spec)
+    return cli
+
+
+def read_jsonl(file_path):
+    """Lazily yield one parsed JSON value per non-blank line of a UTF-8 file."""
+    with open(file_path, "r", encoding="utf-8") as handle:
+        for raw in handle:
+            text = raw.strip()
+            if not text:
+                # Skip empty / whitespace-only lines.
+                continue
+            yield json.loads(text)
+
+
+def chunked_reader(iterator, chunk_size):
+    """Yield successive lists of up to ``chunk_size`` items from ``iterator``.
+
+    Iteration stops at the first empty chunk, i.e. when the source is
+    exhausted.
+    """
+    source = iter(iterator)
+    while True:
+        batch = list(islice(source, chunk_size))
+        if not batch:
+            return
+        yield batch
+
+
+def process_audio_item(meta, target_sr):
+    """Load one audio file and encode it as a FLAC sample record.
+
+    Args:
+        meta: Metadata dict with at least ``"id"`` and ``"audio_path"``.
+            On success ``"audio_duration"`` (computed from the waveform
+            before resampling) is added to it in place.
+        target_sr: Target sample rate; the audio is resampled when this is
+            truthy and differs from the file's rate.
+
+    Returns:
+        ``{"ok": (sample, meta)}`` where ``sample`` holds ``"__key__"`` and
+        ``"flac"`` bytes, or ``{"error": {"id", "audio_path", "reason"}}``.
+        Exceptions are reported through the error dict, never raised.
+    """
+    key = meta.get("id")
+    audio_path = meta.get("audio_path")
+
+    def failure(reason):
+        return {"error": {"id": key, "audio_path": audio_path, "reason": reason}}
+
+    if not (key and audio_path):
+        return failure("missing id or audio_path")
+
+    try:
+        if not os.path.exists(audio_path):
+            raise FileNotFoundError(f"{audio_path} not found")
+
+        waveform, sr = load_waveform(audio_path)
+        meta["audio_duration"] = waveform.shape[1] / sr
+
+        if target_sr and sr != target_sr:
+            resampled = torchaudio.functional.resample(
+                torch.from_numpy(waveform), orig_freq=sr, new_freq=target_sr
+            )
+            waveform, sr = resampled.numpy(), target_sr
+
+        with io.BytesIO() as flac_buffer:
+            sf.write(flac_buffer, waveform.T, sr, format="FLAC")
+            audio_bytes = flac_buffer.getvalue()
+
+        return {"ok": ({"__key__": key, "flac": audio_bytes}, meta)}
+
+    except Exception as e:
+        return failure(str(e))
+
+
+def process_single_shard(
+    shard_idx,
+    records,
+    output_tar_pattern,
+    output_jsonl_pattern,
+    target_sr,
+    num_threads=4,
+    min_duration=None,
+    max_duration=None,
+):
+    """Write one shard (tar + JSONL) from a list of metadata records.
+
+    Every record is processed by ``process_audio_item`` on a thread pool and
+    results are written in completion order. Samples shorter than
+    ``min_duration`` or with a duration ``>= max_duration`` are dropped.
+    If nothing was written, both shard files are removed again.
+
+    Returns:
+        ``(shard_idx, processed_count, error_count, filtered_count,
+        total_duration, errors)``.
+    """
+    tar_fname = output_tar_pattern % shard_idx
+    jsonl_fname = output_jsonl_pattern % shard_idx
+
+    errors = []
+    processed_count = error_count = filtered_count = 0
+    total_duration = 0.0
+
+    with wds.TarWriter(tar_fname) as sink, open(
+        jsonl_fname, "w", encoding="utf-8"
+    ) as jsonl_f:
+
+        with ThreadPoolExecutor(max_workers=num_threads) as thread_pool:
+            pending = [
+                thread_pool.submit(process_audio_item, meta, target_sr)
+                for meta in records
+            ]
+
+            for finished in as_completed(pending):
+                outcome = finished.result()
+
+                if "error" in outcome:
+                    error_count += 1
+                    errors.append(outcome["error"])
+                    continue
+
+                sample, meta = outcome["ok"]
+                dur = meta.get("audio_duration", 0.0)
+
+                # Duration filtering (based on the audio_duration in meta)
+                too_short = min_duration is not None and dur < min_duration
+                too_long = max_duration is not None and dur >= max_duration
+                if too_short or too_long:
+                    filtered_count += 1
+                    continue
+
+                sink.write(sample)
+                jsonl_f.write(json.dumps(meta, ensure_ascii=False) + "\n")
+
+                total_duration += dur
+                processed_count += 1
+
+    # Clean up empty shard files
+    if processed_count == 0:
+        for leftover in (tar_fname, jsonl_fname):
+            if os.path.exists(leftover):
+                os.remove(leftover)
+
+    return (
+        shard_idx,
+        processed_count,
+        error_count,
+        filtered_count,
+        total_duration,
+        errors,
+    )
+
+
+def count_lines(path):
+    """Count the lines of a file by streaming it in 1 MiB binary chunks.
+
+    A final line that is not terminated by a newline is counted too, so the
+    result equals the number of lines obtained when iterating over the file.
+
+    Args:
+        path: Path of the file to inspect.
+
+    Returns:
+        The number of lines in the file (0 for an empty file).
+    """
+    total = 0
+    last_byte = b""
+    with open(path, "rb") as f:
+        while chunk := f.read(1 << 20):
+            total += chunk.count(b"\n")
+            last_byte = chunk[-1:]
+    # Counting only b"\n" misses a trailing line without a newline terminator.
+    if last_byte and last_byte != b"\n":
+        total += 1
+    return total
+
+
def pack_dataset(
    input_jsonl,
    output_dir,
    samples_per_shard=5000,
    num_workers=16,
    target_sr=24000,
    threads_per_worker=4,
    shuffle=False,
    shuffle_seed=None,
    min_duration=None,
    max_duration=None,
):
    """Pack the entries of a JSONL manifest into sharded tar/JSONL pairs.

    The input is split into chunks of ``samples_per_shard`` entries; each
    chunk is handed to ``process_single_shard`` in a process pool. Outputs
    written under ``output_dir``:

    - ``audios/shard-%06d.tar`` and ``txts/shard-%06d.jsonl`` per shard,
    - ``errors.jsonl`` with one JSON object per failed item,
    - ``data.lst`` with one ``tar_path jsonl_path count duration`` line per
      shard that contains at least one sample.

    Args:
        input_jsonl: Path of the input JSONL manifest.
        output_dir: Directory that receives all outputs (created if missing).
        samples_per_shard: Number of input entries per shard.
        num_workers: Number of worker processes.
        target_sr: Target sample rate forwarded to the shard worker.
        threads_per_worker: Thread count forwarded to the shard worker.
        shuffle: If true, load all entries into memory and shuffle them
            before chunking.
        shuffle_seed: Seed for the shuffle (``None`` means unseeded).
        min_duration: Optional lower duration bound forwarded to the worker.
        max_duration: Optional upper duration bound forwarded to the worker.
    """
    input_path = Path(input_jsonl)
    output_dir = Path(output_dir)
    output_tar_dir = output_dir / "audios"
    output_tar_dir.mkdir(parents=True, exist_ok=True)
    output_jsonl_dir = output_dir / "txts"
    output_jsonl_dir.mkdir(parents=True, exist_ok=True)

    output_tar_pattern = str(output_tar_dir / "shard-%06d.tar")
    output_jsonl_pattern = str(output_jsonl_dir / "shard-%06d.jsonl")

    error_log_path = str(output_dir / "errors.jsonl")

    # Setup error logger. Handlers left over from a previous call are closed
    # as well as detached so their file descriptors are released.
    error_logger = logging.getLogger("error_log")
    error_logger.setLevel(logging.ERROR)
    for stale_handler in list(error_logger.handlers):
        error_logger.removeHandler(stale_handler)
        stale_handler.close()
    fh = logging.FileHandler(error_log_path, mode="w", encoding="utf-8")
    fh.setFormatter(logging.Formatter("%(message)s"))
    error_logger.addHandler(fh)

    try:
        shard_manifest = {}

        print(f"Reading input: {input_path}")
        print(f"Output dir: {output_dir}")
        print(f"Strategy: {num_workers} Processes x {threads_per_worker} Threads")

        if shuffle:
            print("Load input dataset...")
            entries = list(read_jsonl(input_path))
            # A private Random instance yields the same permutation as
            # seeding the module-level generator, without touching the
            # global RNG state.
            random.Random(shuffle_seed).shuffle(entries)
            print(f"Shuffled {len(entries)} entries (seed={shuffle_seed})")
            total_lines = len(entries)
            chunk_gen = chunked_reader(iter(entries), samples_per_shard)
        else:
            print("Calculating total lines...")
            total_lines = count_lines(input_path)
            chunk_gen = chunked_reader(read_jsonl(input_path), samples_per_shard)

        if min_duration is not None or max_duration is not None:
            print(
                f"Duration filter: [{min_duration or 0:.2f}s"
                f", {max_duration or float('inf'):.1f}s) (applied after audio decoding)"
            )

        total_shards_est = (total_lines + samples_per_shard - 1) // samples_per_shard
        print(f"Total samples: {total_lines}, Estimated shards: {total_shards_est}")

        with ProcessPoolExecutor(max_workers=num_workers) as executor:

            futures = set()

            shard_idx = 0
            total_processed = 0
            total_errors = 0
            total_filtered = 0
            failed_shards = 0

            pbar = tqdm(
                total=total_shards_est,
                desc="Shards Processed",
                unit="shard",
            )

            def submit_next_chunks(limit):
                """Pull up to `limit` chunks from generator, submit them."""
                nonlocal shard_idx
                submitted = 0
                for chunk in chunk_gen:
                    f = executor.submit(
                        process_single_shard,
                        shard_idx,
                        chunk,
                        output_tar_pattern,
                        output_jsonl_pattern,
                        target_sr,
                        threads_per_worker,
                        min_duration,
                        max_duration,
                    )
                    futures.add(f)
                    shard_idx += 1
                    submitted += 1
                    if submitted >= limit:
                        break

            try:
                # Keep a bounded backlog: prime the pool with two tasks per
                # worker, then submit one new chunk per finished task.
                submit_next_chunks(num_workers * 2)

                while futures:
                    done, _ = wait(futures, return_when=FIRST_COMPLETED)

                    for f in done:
                        futures.remove(f)

                        try:
                            (
                                s_idx,
                                p_count,
                                e_count,
                                f_count,
                                s_duration,
                                errors,
                            ) = f.result()
                            total_processed += p_count
                            total_errors += e_count
                            total_filtered += f_count

                            # Write error log
                            for err in errors:
                                err["shard_idx"] = s_idx
                                error_logger.error(
                                    json.dumps(err, ensure_ascii=False)
                                )

                            if p_count > 0:
                                tar_abs = os.path.abspath(output_tar_pattern % s_idx)
                                jsonl_abs = os.path.abspath(
                                    output_jsonl_pattern % s_idx
                                )
                                shard_manifest[s_idx] = (
                                    tar_abs,
                                    jsonl_abs,
                                    p_count,
                                    s_duration,
                                )

                            pbar.set_postfix(
                                {
                                    "Samples": total_processed,
                                    "Filtered": total_filtered,
                                    "Errors": total_errors,
                                }
                            )
                        except Exception as e:
                            # Best effort: one broken shard must not abort
                            # the whole run, but it is counted and reported.
                            failed_shards += 1
                            print(f"Shard task failed: {e}")

                        # Advance for failed shards too so the bar can reach
                        # its total.
                        pbar.update(1)

                        submit_next_chunks(1)
            finally:
                pbar.close()

        # Write final manifest file (data.lst)
        manifest_path = str(output_dir / "data.lst")
        with open(manifest_path, "w", encoding="utf-8") as mf:
            for idx in sorted(shard_manifest.keys()):
                tar_path, jsonl_path, count, duration = shard_manifest[idx]
                mf.write(f"{tar_path} {jsonl_path} {count} {duration:.3f}\n")

        print(f"\nDone! Output saved to {output_dir}")
        print(f"Successfully packed: {total_processed}")
        print(f"Filtered by duration: {total_filtered}")
        print(f"Failed: {total_errors}")
        if failed_shards > 0:
            print(f"Shard tasks failed: {failed_shards}")
        print(f"Manifest written to: {manifest_path} ({len(shard_manifest)} shards)")
        if total_errors > 0:
            print(f"Error details: {error_log_path}")
    finally:
        # Flush and release the error log file.
        error_logger.removeHandler(fh)
        fh.close()
+
+
if __name__ == "__main__":
    # Force the "spawn" start method for all worker processes.
    mp.set_start_method("spawn", force=True)

    cli_args = build_parser().parse_args()

    # Map command-line options onto pack_dataset's keyword arguments.
    pack_kwargs = {
        "input_jsonl": cli_args.input,
        "output_dir": cli_args.output,
        "samples_per_shard": cli_args.shard_size,
        "num_workers": cli_args.workers,
        "target_sr": cli_args.sr,
        "threads_per_worker": cli_args.threads,
        "shuffle": cli_args.shuffle,
        "shuffle_seed": cli_args.shuffle_seed,
        "min_duration": cli_args.min_duration,
        "max_duration": cli_args.max_duration,
    }
    pack_dataset(**pack_kwargs)
diff --git a/omnivoice/training/__init__.py b/omnivoice/training/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/omnivoice/training/builder.py b/omnivoice/training/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..4af2fc72b34dbdbd5de7c0d8f3386e8188386d41
--- /dev/null
+++ b/omnivoice/training/builder.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Builders for constructing training components.
+
+Provides factory functions to assemble the model, tokenizer, and data loaders
+from a ``TrainingConfig``. Called by ``omnivoice.cli.train`` to set up training.
+
+Key functions:
+- ``build_model_and_tokenizer()``: Loads the model and text tokenizer.
+- ``build_dataloaders()``: Builds train/eval data loaders from a data config JSON.
+ The batching strategy is chosen based on ``TrainingConfig.attn_implementation``:
+
+ - ``"flex_attention"``: sequence packing via ``PackingIterableDataset`` +
+ ``PackingDataCollator``. Batch shape is ``[1, C, batch_tokens]``.
+ - other (e.g. ``"sdpa"``): length-grouped padding via
+ ``StreamLengthGroupDataset`` + ``PaddingDataCollator``. Batch shape
+ is ``[B, C, max_len]`` where B ≥ 1 and max_len ≤ batch_tokens.
+"""
+
+import logging
+from functools import partial
+from typing import Tuple
+
+import torch
+from torch.utils.data import DataLoader
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+from transformers import logging as hf_logging
+from transformers.trainer_utils import seed_worker
+
+from omnivoice.data.batching import PackingIterableDataset, StreamLengthGroupDataset
+from omnivoice.data.collator import PackingDataCollator, PaddingDataCollator
+from omnivoice.data.dataset import WebDatasetReader, prepare_data_manifests_from_json
+from omnivoice.data.processor import OmniVoiceSampleProcessor
+from omnivoice.models.omnivoice import OmniVoice, OmniVoiceConfig, _resolve_model_path
+from omnivoice.training.config import TrainingConfig
+
+logger = logging.getLogger(__name__)
+
+
def build_model_and_tokenizer(
    config: TrainingConfig,
) -> Tuple[OmniVoice, AutoTokenizer]:
    """Load the text tokenizer and the OmniVoice model described by ``config``.

    Steps:
        1. Load the tokenizer from ``config.init_from_checkpoint`` when set,
           otherwise from ``config.llm_name_or_path``; use the EOS token as
           pad token if none is defined; add the OmniVoice control tokens
           that are not yet in the vocabulary.
        2. Build the model: either load it from ``init_from_checkpoint`` or
           wrap a freshly loaded LLM in a new ``OmniVoice`` instance.
        3. Resize the LLM token embeddings when the tokenizer length differs
           from the configured vocabulary size.
        4. Copy the pad/bos/eos token ids onto the model config.

    Args:
        config: Training configuration.

    Returns:
        ``(model, tokenizer)``.
    """
    logger.info("Initializing Model & Tokenizer...")

    # 1. Tokenizer
    tokenizer_path = (
        config.init_from_checkpoint
        if config.init_from_checkpoint
        else config.llm_name_or_path
    )
    tokenizer_path = _resolve_model_path(tokenizer_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    new_tokens = [
        "<|denoise|>",
        "<|lang_start|>",
        "<|lang_end|>",
        "<|instruct_start|>",
        "<|instruct_end|>",
        "<|text_start|>",
        "<|text_end|>",
    ]

    # Look the vocabulary up once instead of once per candidate token.
    existing_vocab = tokenizer.get_vocab()
    tokens_to_add = [t for t in new_tokens if t not in existing_vocab]
    if tokens_to_add:
        tokenizer.add_special_tokens({"additional_special_tokens": tokens_to_add})

    # 2. Model
    if config.init_from_checkpoint:
        logger.info(f"Loading weights from {config.init_from_checkpoint}")
        model = OmniVoice.from_pretrained(
            config.init_from_checkpoint,
            attn_implementation=config.attn_implementation,
            dtype=torch.float32,
            train=True,
        )
    else:
        resolved_llm = _resolve_model_path(config.llm_name_or_path)
        llm_config = AutoConfig.from_pretrained(resolved_llm)

        ov_config = OmniVoiceConfig(
            audio_vocab_size=config.audio_vocab_size,
            audio_mask_id=config.audio_mask_id,
            num_audio_codebook=config.num_audio_codebook,
            audio_codebook_weights=config.audio_codebook_weights,
            llm_config=llm_config,
        )

        original_level = hf_logging.get_verbosity()
        hf_logging.set_verbosity_error()  # suppress expected lm_head.weight warnings
        try:
            llm = AutoModel.from_pretrained(
                resolved_llm,
                attn_implementation=config.attn_implementation,
                dtype=torch.float32,
            )
        finally:
            # Restore the previous verbosity even when loading fails, so a
            # load error does not leave HF logging silenced.
            hf_logging.set_verbosity(original_level)
        model = OmniVoice(config=ov_config, llm=llm)

    # 3. Resize Embeddings
    if len(tokenizer) != model.config.llm_config.vocab_size:
        model.llm.resize_token_embeddings(len(tokenizer))
        model.config.llm_config.vocab_size = len(tokenizer)

    # 4. Config IDs
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.eos_token_id = tokenizer.eos_token_id

    return model, tokenizer
+
+
def build_dataloaders(
    config: TrainingConfig, tokenizer: AutoTokenizer
) -> Tuple[DataLoader, DataLoader]:
    """Setup Data Pipeline: Manifests -> WDS -> Batching -> Loaders.

    Batching strategy depends on ``config.attn_implementation``:
    - ``"flex_attention"``: sequence packing (``PackingIterableDataset`` +
      ``PackingDataCollator``).
    - other (e.g. ``"sdpa"``): length-grouped padding
      (``StreamLengthGroupDataset`` + ``PaddingDataCollator``).

    Args:
        config: Training configuration.
        tokenizer: Text tokenizer handed to the sample processor.

    Returns:
        ``(train_loader, eval_loader)``. ``eval_loader`` is ``None`` when the
        data config yields no dev manifests.
    """
    # Local import: only this function needs it.
    from operator import itemgetter

    logger.info("Initializing Data Readers...")

    processor = OmniVoiceSampleProcessor(
        text_tokenizer=tokenizer,
        num_channels=config.num_audio_codebook,
        audio_mask_id=config.audio_mask_id,
        prompt_ratio_range=config.prompt_ratio_range,
        mask_ratio_range=config.mask_ratio_range,
        drop_cond_ratio=config.drop_cond_ratio,
        language_ratio=config.language_ratio,
        use_pinyin_ratio=config.use_pinyin_ratio,
        instruct_ratio=config.instruct_ratio,
        only_instruct_ratio=config.only_instruct_ratio,
    )

    train_manifests, dev_manifests = prepare_data_manifests_from_json(
        config.data_config
    )
    raw_train_ds = WebDatasetReader(manifests=train_manifests, evaluation=False)

    use_packing = config.attn_implementation == "flex_attention"

    def _batched(raw_ds):
        """Wrap a raw reader in the batching dataset for the active strategy."""
        if use_packing:
            return PackingIterableDataset(raw_ds, processor, config.batch_tokens)
        return StreamLengthGroupDataset(
            raw_ds,
            batch_duration=config.batch_tokens,
            min_length=config.min_sample_tokens,
            max_length=config.max_sample_tokens,
            max_sample=config.max_batch_size,
            processor=processor,
            # itemgetter is equivalent to ``lambda s: s["length"]`` but can
            # be pickled, which matters when DataLoader workers are started
            # with the spawn/forkserver methods.
            length_fn=itemgetter("length"),
        )

    train_dataset = _batched(raw_train_ds)
    if use_packing:
        collate_fn = PackingDataCollator(processor, config.batch_tokens)
    else:
        collate_fn = PaddingDataCollator(processor, config.batch_tokens)

    logger.info(
        "Using %s (attn_implementation=%s)",
        "sequence packing" if use_packing else "length-grouped padding",
        config.attn_implementation,
    )

    init_fn = partial(
        seed_worker,
        num_workers=config.num_workers,
        rank=(
            torch.distributed.get_rank()
            if torch.distributed.is_initialized()
            else 0
        ),
    )

    train_loader_kwargs = {
        "batch_size": None,
        "num_workers": config.num_workers,
        "collate_fn": collate_fn,
        "worker_init_fn": init_fn,
        "pin_memory": True,
    }
    # DataLoader rejects an explicit prefetch_factor when it runs without
    # worker processes, so only pass it when workers are used.
    if config.num_workers > 0:
        train_loader_kwargs["prefetch_factor"] = 4

    train_loader = DataLoader(train_dataset, **train_loader_kwargs)

    eval_loader = None
    if dev_manifests:
        raw_dev_ds = WebDatasetReader(manifests=dev_manifests, evaluation=True)
        dev_dataset = _batched(raw_dev_ds)
        eval_loader = DataLoader(
            dev_dataset,
            batch_size=None,  # Each item is already a collated batch
            num_workers=1,
            collate_fn=collate_fn,
            pin_memory=True,
            prefetch_factor=2,
        )

    return train_loader, eval_loader
diff --git a/omnivoice/training/checkpoint.py b/omnivoice/training/checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bbf969d727708225a8edbd0d05611302db418b9
--- /dev/null
+++ b/omnivoice/training/checkpoint.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Checkpoint saving, resuming, and training logging.
+
+Provides utilities for saving/loading training checkpoints and logging metrics
+to console and trackers (TensorBoard/WandB). Used by ``OmniTrainer``.
+
+Key components:
+- ``TrainLogger``: Logs training metrics to console and Accelerate trackers.
+- ``save_checkpoint()``: Saves model, optimizer, and scheduler state.
+- ``load_checkpoint()``: Restores training state from a checkpoint directory.
+"""
+
+import logging
+import os
+import shutil
+import time
+from typing import Any, Dict, Optional
+
+import torch
+from accelerate import Accelerator
+from tqdm.auto import tqdm
+
+logger = logging.getLogger(__name__)
+
+
class TrainLogger:
    """Report training progress to a tqdm bar, the console and trackers.

    A progress bar is created only on the main process (see ``start``);
    on other processes the bar-related methods do nothing.
    """

    def __init__(self, accelerator: Accelerator, total_steps: int, logging_steps: int):
        """Store the accelerator and step settings; no bar is created yet."""
        self.accelerator = accelerator
        self.total_steps = total_steps
        self.logging_steps = logging_steps
        self.start_time = None
        self.progress_bar = None

    def start(self, start_step: int = 0):
        """Record the start time and, on the main process, open the bar.

        Args:
            start_step: Initial position of the progress bar.
        """
        self.start_time = time.time()

        if self.accelerator.is_main_process:
            self.progress_bar = tqdm(
                total=self.total_steps,
                initial=start_step,
                desc="Training",
                dynamic_ncols=True,
                disable=not self.accelerator.is_local_main_process,
            )

    def update(
        self, step: int, loss: Optional[float] = None, lr: Optional[float] = None
    ):
        """
        Called every step to update the progress bar UI.

        Args:
            step: Current step (accepted for interface symmetry; unused here).
            loss: Optional loss value shown in the bar postfix.
            lr: Optional learning rate shown in the bar postfix.
        """
        # Compare against None explicitly: the truth value of a tqdm bar
        # depends on its ``total`` and is not a "bar exists" check.
        if self.progress_bar is None:
            return

        self.progress_bar.update(1)

        # Update real-time metrics on the progress bar itself
        postfix = {}
        if loss is not None:
            postfix["loss"] = f"{loss:.4f}"
        if lr is not None:
            postfix["lr"] = f"{lr:.2e}"

        if postfix:
            self.progress_bar.set_postfix(postfix)

    def log_metrics(self, step: int, metrics: Dict[str, Any]):
        """
        Called periodically to log to the trackers and to the console.

        Args:
            step: Step the metrics belong to.
            metrics: Mapping from metric name to value.
        """
        # Log to trackers (TensorBoard, etc.)
        self.accelerator.log(metrics, step=step)

        if self.accelerator.is_main_process:
            # Format for console log (separate from tqdm)
            formatted_metrics = []
            for k, v in metrics.items():
                if isinstance(v, float):
                    val_str = f"{v:.4f}"
                    # Tiny non-zero values would print as 0.0000; switch to
                    # scientific notation for those.
                    if val_str == "0.0000" and v != 0:
                        formatted_metrics.append(f"{k}: {v:.2e}")
                    else:
                        formatted_metrics.append(f"{k}: {val_str}")
                else:
                    formatted_metrics.append(f"{k}: {v}")

            # Write through the bar (tqdm.write) so the bar is not broken;
            # fall back to the module logger when no bar exists.
            msg = f"Step {step} | " + " | ".join(formatted_metrics)
            if self.progress_bar is not None:
                self.progress_bar.write(msg)
            else:
                logger.info(msg)

    def close(self):
        """Close the progress bar if one was created."""
        if self.progress_bar is not None:
            self.progress_bar.close()
+
+
def save_checkpoint(
    accelerator: Accelerator,
    model: torch.nn.Module,
    tokenizer: Any,
    output_dir: str,
    step: int,
    keep_last_n: int = 3,
):
    """
    Saves model, tokenizer, and accelerator states (optimizer/scheduler).
    Manages rotation of checkpoints.

    Args:
        accelerator: Accelerator used to save state and unwrap the model.
        model: The (possibly wrapped) model to save.
        tokenizer: Object exposing ``save_pretrained``.
        output_dir: Directory that holds the ``checkpoint-<step>`` folders.
        step: Step number used in the checkpoint folder name.
        keep_last_n: Number of most recent step checkpoints to keep; values
            ``<= 0`` disable rotation.
    """
    checkpoint_dir = os.path.join(output_dir, f"checkpoint-{step}")

    # 1. Save Accelerator State (Optimizer, Scheduler, RNG, Scaler)
    accelerator.save_state(checkpoint_dir)

    # 2. Save Model in HF format (config.json + pytorch_model.bin/safetensors)
    unwrap_model = accelerator.unwrap_model(model)
    unwrap_model.save_pretrained(
        checkpoint_dir,
        is_main_process=accelerator.is_main_process,
        save_function=accelerator.save,
    )

    # 3. Save Tokenizer
    if accelerator.is_main_process:
        tokenizer.save_pretrained(checkpoint_dir)

    logger.info(f"Saved checkpoint to {checkpoint_dir}")

    # 4. Rotate checkpoints (Keep last N)
    if accelerator.is_main_process and keep_last_n > 0:
        numbered = []
        for d in os.listdir(output_dir):
            if not d.startswith("checkpoint-"):
                continue
            suffix = d.split("-")[-1]
            # Only folders with a numeric step suffix take part in rotation;
            # something like "checkpoint-best" is left alone instead of
            # crashing the step parsing.
            if not suffix.isdecimal():
                continue
            if os.path.isdir(os.path.join(output_dir, d)):
                numbered.append((int(suffix), d))

        # Sort by step number
        numbered.sort(key=lambda item: item[0])

        if len(numbered) > keep_last_n:
            for _, d in numbered[:-keep_last_n]:
                shutil.rmtree(os.path.join(output_dir, d))
                logger.info(f"Removed old checkpoint {d}")
+
+
def load_checkpoint(accelerator: Accelerator, checkpoint_path: str):
    """
    Resumes training state.

    Restores the accelerator state from ``checkpoint_path`` and infers the
    step from the text after the last ``-`` in the folder name.

    Args:
        accelerator: Accelerator whose state is restored.
        checkpoint_path: Path of the checkpoint directory.

    Returns:
        The inferred step, or 0 when the folder name has no integer suffix.
    """
    logger.info(f"Resuming from {checkpoint_path}")
    accelerator.load_state(checkpoint_path)

    # Try to infer step
    clean_path = os.path.normpath(checkpoint_path)
    suffix = os.path.basename(clean_path).split("-")[-1]
    try:
        return int(suffix)
    except ValueError:
        # State is restored but the step counter is unknown; say so instead
        # of silently restarting the count.
        logger.warning(
            "Could not infer step from checkpoint path %s; starting from step 0",
            checkpoint_path,
        )
        return 0
diff --git a/omnivoice/training/config.py b/omnivoice/training/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..49cd8809a84b109aeccbac5d12884092d8d95f38
--- /dev/null
+++ b/omnivoice/training/config.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Training configuration dataclass.
+
+Defines ``TrainingConfig``, a dataclass that holds all hyperparameters and paths
+for training. Loaded from a JSON config file via ``TrainingConfig.from_json()``
+in ``omnivoice.cli.train``.
+"""
+
+import json
+from dataclasses import asdict, dataclass, field
+from typing import List, Optional, Tuple
+
+
+@dataclass
+class TrainingConfig:
+ # Key Paths
+ output_dir: Optional[str] = None
+ data_config: Optional[str] = None
+
+ # Model Specific
+ llm_name_or_path: str = "Qwen/Qwen3-0.6B"
+ audio_vocab_size: int = 1025 # valid vocab size + 1 (mask token)
+ audio_mask_id: int = 1024 # 1024 is the 1025-th token
+ num_audio_codebook: int = 8
+
+ # Model Training Specific
+ audio_codebook_weights: List[float | int] = field(
+ default_factory=lambda: [8, 8, 6, 6, 4, 4, 2, 2]
+ )
+ drop_cond_ratio: float = 0.1
+ prompt_ratio_range: Tuple[float, float] = field(default_factory=lambda: (0.0, 0.3))
+ mask_ratio_range: Tuple[float, float] = field(default_factory=lambda: (0.0, 1.0))
+ language_ratio: float = 0.8
+ use_pinyin_ratio: float = 0.3
+ instruct_ratio: float = 1.0
+ only_instruct_ratio: float = 0.5
+
+ # Init settings
+ resume_from_checkpoint: Optional[str] = None
+ init_from_checkpoint: Optional[str] = None
+
+ # Training Hyperparams
+ learning_rate: float = 1e-4
+ weight_decay: float = 0.01
+ max_grad_norm: float = 1.0
+ steps: int = 300000
+ seed: int = 42
+ lr_scheduler_type: str = "cosine"
+ warmup_type: str = "ratio"
+ warmup_ratio: float = 0.03
+ warmup_steps: int = 2000
+
+ # Data
+ batch_tokens: int = 8192
+ gradient_accumulation_steps: int = 1
+ num_workers: int = 8
+
+ # System
+ mixed_precision: str = "bf16"
+ allow_tf32: bool = True
+ use_deepspeed: bool = False
+ deepspeed_config: Optional[str] = None
+ attn_implementation: str = "flex_attention"
+
+ # Length-grouped batching (only used when attn_implementation != "flex_attention")
+ max_sample_tokens: int = 2000
+ min_sample_tokens: int = 50
+ max_batch_size: int = 64
+
+ # Logging
+ logging_steps: int = 100
+ eval_steps: int = 1000
+ save_steps: int = 10000
+ keep_last_n_checkpoints: int = -1
+
+ @classmethod
+ def from_json(cls, json_path: str):
+ with open(json_path, "r") as f:
+ cfg_dict = json.load(f)
+ valid_keys = cls.__annotations__.keys()
+ filtered_dict = {k: v for k, v in cfg_dict.items() if k in valid_keys}
+ instance = cls(**filtered_dict)
+ return instance
+
+ def save_to_json(self, json_path: str):
+ data = asdict(self)
+ with open(json_path, "w") as f:
+ json.dump(data, f, indent=4)
diff --git a/omnivoice/training/trainer.py b/omnivoice/training/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..081955ea8ec5a0f7a23bdcffeedc6bdde751330a
--- /dev/null
+++ b/omnivoice/training/trainer.py
@@ -0,0 +1,353 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Training loop for OmniVoice.
+
+Wraps the HuggingFace Accelerate training loop with checkpoint saving/resuming,
+evaluation, gradient accumulation, and learning rate scheduling.
+Launched via ``omnivoice.cli.train``.
+"""
+
+import logging
+import math
+import os
+import sys
+import time
+from datetime import timedelta
+from typing import Any, Optional
+
+import torch
+from accelerate import Accelerator, DistributedDataParallelKwargs
+from accelerate.utils import DeepSpeedPlugin, InitProcessGroupKwargs, set_seed
+from torch.utils.data import DataLoader
+from transformers import (
+ get_cosine_schedule_with_warmup,
+ get_constant_schedule_with_warmup,
+)
+
+from omnivoice.training.checkpoint import TrainLogger, load_checkpoint
+from omnivoice.training.checkpoint import save_checkpoint as engine_save_checkpoint
+
+logger = logging.getLogger(__name__)
+
+
def _to_device(batch, device):
    """Return a new dict with every tensor value of ``batch`` moved to ``device``.

    Tensors are transferred with ``non_blocking=True``; values that are not
    tensors are passed through unchanged.
    """
    moved = {}
    for key, value in batch.items():
        if isinstance(value, torch.Tensor):
            value = value.to(device, non_blocking=True)
        moved[key] = value
    return moved
+
+
class OmniTrainer:
    """Step-based training loop built on HuggingFace Accelerate.

    Sets up the accelerator, optimizer and LR scheduler, and provides the
    training loop with periodic logging, evaluation and checkpointing.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        config: Any,  # TrainingConfig
        train_dataloader: DataLoader,
        eval_dataloader: Optional[DataLoader] = None,
        tokenizer: Optional[Any] = None,
        optimizer: Optional[torch.optim.Optimizer] = None,
        lr_scheduler: Optional[Any] = None,
    ):
        """Create the accelerator and prepare model, optimizer and scheduler.

        Args:
            model: Model to train.
            config: Training configuration (``TrainingConfig``).
            train_dataloader: Loader yielding training batches (dicts).
            eval_dataloader: Optional loader for evaluation.
            tokenizer: Optional tokenizer saved alongside checkpoints.
            optimizer: Optional optimizer; when ``None`` a default optimizer
                and scheduler are created from ``config``.
            lr_scheduler: Scheduler used together with a provided optimizer.
        """
        self.config = config
        self.model = model
        self.tokenizer = tokenizer
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader

        # 1. Initialize Accelerator
        self.accelerator = self._init_accelerator()

        # 2. Setup Optimizer & Scheduler if not provided
        if optimizer is None:
            self.optimizer, self.lr_scheduler = self.create_optimizer_and_scheduler()
        else:
            self.optimizer = optimizer
            self.lr_scheduler = lr_scheduler

        # 3. DeepSpeed Hack (Batch Size fix)
        if self.accelerator.distributed_type == "DEEPSPEED":
            self.accelerator.state.deepspeed_plugin.deepspeed_config[
                "train_micro_batch_size_per_gpu"
            ] = 1

        # 4. Prepare with Accelerator
        (self.model, self.optimizer, self.lr_scheduler,) = self.accelerator.prepare(
            self.model,
            self.optimizer,
            self.lr_scheduler,
        )

        self.global_step = 0
        self.epoch = 0

    def _init_accelerator(self) -> Accelerator:
        """Initialize Accelerator, DeepSpeed, and Logging."""
        # TF32 setup
        if getattr(self.config, "allow_tf32", False):
            torch.set_float32_matmul_precision("high")

        # Init handlers
        ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False)
        init_kwargs = InitProcessGroupKwargs(timeout=timedelta(minutes=60))

        # DeepSpeed setup
        deepspeed_plugin = None
        if self.config.use_deepspeed and self.config.deepspeed_config:
            if not os.path.exists(self.config.deepspeed_config):
                raise FileNotFoundError(
                    f"DeepSpeed config not found: {self.config.deepspeed_config}"
                )
            deepspeed_plugin = DeepSpeedPlugin(
                hf_ds_config=self.config.deepspeed_config,
                gradient_accumulation_steps=self.config.gradient_accumulation_steps,
                gradient_clipping=self.config.max_grad_norm,
            )

        accelerator = Accelerator(
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            mixed_precision=self.config.mixed_precision,
            log_with="tensorboard",
            project_dir=self.config.output_dir,
            step_scheduler_with_optimizer=False,
            kwargs_handlers=[ddp_kwargs, init_kwargs],
            deepspeed_plugin=deepspeed_plugin,
            split_batches=False,
        )

        # Logging setup: the main process logs at INFO to stdout and to
        # train.log; every other process only reports errors.
        if accelerator.is_main_process:
            os.makedirs(self.config.output_dir, exist_ok=True)
            # Try to save config if it has the method
            if hasattr(self.config, "save_to_json"):
                self.config.save_to_json(
                    os.path.join(self.config.output_dir, "initial_config.json")
                )

            logging.basicConfig(
                format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                datefmt="%m/%d/%Y %H:%M:%S",
                level=logging.INFO,
                handlers=[
                    logging.StreamHandler(sys.stdout),
                    logging.FileHandler(
                        os.path.join(self.config.output_dir, "train.log")
                    ),
                ],
            )
        else:
            logging.basicConfig(level=logging.ERROR)

        logger.info(f"Loaded Config: {self.config}")
        set_seed(self.config.seed)
        accelerator.init_trackers("tensorboard")
        return accelerator

    def create_optimizer_and_scheduler(self):
        """Default AdamW + configurable LR Scheduler.

        Warmup length is ``ceil(steps * warmup_ratio)`` when
        ``warmup_type == "ratio"``, otherwise ``warmup_steps``. The scheduler
        is constant-with-warmup for ``lr_scheduler_type == "constant"`` and
        cosine-with-warmup for any other value.

        Returns:
            ``(optimizer, lr_scheduler)``.
        """
        optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=self.config.learning_rate,
            weight_decay=self.config.weight_decay,
        )

        if self.config.warmup_type == "ratio":
            final_warmup_steps = math.ceil(self.config.steps * self.config.warmup_ratio)
        else:
            final_warmup_steps = self.config.warmup_steps

        if self.config.lr_scheduler_type == "constant":
            lr_scheduler = get_constant_schedule_with_warmup(
                optimizer=optimizer,
                num_warmup_steps=final_warmup_steps,
            )
        else:
            lr_scheduler = get_cosine_schedule_with_warmup(
                optimizer=optimizer,
                num_warmup_steps=final_warmup_steps,
                num_training_steps=self.config.steps,
            )
        return optimizer, lr_scheduler

    def save_checkpoint(self, step):
        """Wrapper for engine save_checkpoint; also stores the config JSON."""
        engine_save_checkpoint(
            self.accelerator,
            self.model,
            self.tokenizer,
            self.config.output_dir,
            step,
            self.config.keep_last_n_checkpoints,
        )
        # Save config copy for convenience
        if self.accelerator.is_main_process and hasattr(self.config, "save_to_json"):
            checkpoint_dir = os.path.join(self.config.output_dir, f"checkpoint-{step}")
            self.config.save_to_json(os.path.join(checkpoint_dir, "train_config.json"))

    def load_checkpoint(self, checkpoint_path):
        """Restore training state and set ``global_step`` to the loaded step."""
        step = load_checkpoint(self.accelerator, checkpoint_path)
        self.global_step = step
        logger.info(f"Resumed from step {self.global_step}")
        return step

    def evaluate(self):
        """Evaluation loop.

        Returns:
            ``{}`` when there is no eval dataloader, otherwise
            ``{"eval/loss": value}`` where the value is the mean over
            processes of each process's mean batch loss.
        """
        if self.eval_dataloader is None:
            return {}

        self.model.eval()
        logger.info(f"Running evaluation at step {self.global_step}...")

        local_loss_sum = torch.tensor(0.0, device=self.accelerator.device)
        eval_count = 0

        with torch.no_grad():
            for eval_batch in self.eval_dataloader:
                eval_batch = _to_device(eval_batch, self.accelerator.device)
                outputs = self.model(**eval_batch)
                local_loss_sum += outputs.loss.detach()
                eval_count += 1

        if eval_count > 0:
            local_mean = local_loss_sum / eval_count
        else:
            local_mean = torch.tensor(0.0, device=self.accelerator.device)

        all_means = self.accelerator.gather(local_mean)
        final_eval_loss = all_means.mean().item()

        eval_metrics = {"eval/loss": final_eval_loss}
        self.accelerator.log(eval_metrics, step=self.global_step)
        logger.info(f"Eval Loss: {final_eval_loss:.4f}")

        self.accelerator.wait_for_everyone()
        self.model.train()
        return eval_metrics

    def train(self):
        """Main training loop.

        Runs until ``global_step`` reaches ``config.steps``, restarting the
        dataloader whenever it is exhausted. Logging, evaluation and
        checkpointing happen on optimizer steps, at the intervals given by
        the config. A final checkpoint is written at the end.

        Raises:
            RuntimeError: If a freshly restarted training dataloader yields
                no batch at all.
        """
        logger.info("Starting Training Loop...")

        # Resume if configured
        if self.config.resume_from_checkpoint:
            self.load_checkpoint(self.config.resume_from_checkpoint)

        # Handle IterableDataset Epochs
        if hasattr(self.train_dataloader.dataset, "set_epoch"):
            self.train_dataloader.dataset.set_epoch(self.epoch)

        # Logger
        train_logger = TrainLogger(
            self.accelerator, self.config.steps, self.config.logging_steps
        )
        train_logger.start(self.global_step)

        self.model.train()
        train_iterator = iter(self.train_dataloader)

        logging_start_time = time.time()
        logging_start_step = self.global_step
        # Sum of micro-batch losses since the last log; reset after each log
        # so the float32 accumulator stays small.
        tr_loss = torch.tensor(0.0).to(self.accelerator.device)
        last_saved_step = None

        while self.global_step < self.config.steps:
            try:
                batch = next(train_iterator)
            except StopIteration:
                self.epoch += 1
                logger.info(f"Epoch {self.epoch} starting. Resetting dataloader...")
                if hasattr(self.train_dataloader.dataset, "set_epoch"):
                    self.train_dataloader.dataset.set_epoch(self.epoch)

                train_iterator = iter(self.train_dataloader)
                try:
                    batch = next(train_iterator)
                except StopIteration:
                    # A fresh iterator that yields nothing means there is no
                    # training data; fail with a clear message.
                    raise RuntimeError(
                        "Training dataloader yielded no batches"
                    ) from None

            batch = _to_device(batch, self.accelerator.device)

            with self.accelerator.accumulate(self.model):
                outputs = self.model(**batch)
                loss = outputs.loss
                tr_loss += loss.detach()
                self.accelerator.backward(loss)

                if self.accelerator.sync_gradients:
                    # Clipping
                    grad_norm = 0.0
                    if self.config.max_grad_norm > 0:
                        grad_norm = self.accelerator.clip_grad_norm_(
                            self.model.parameters(), self.config.max_grad_norm
                        )
                        grad_norm = (
                            grad_norm.item() if grad_norm is not None else 0.0
                        )

                    self.optimizer.step()
                    self.lr_scheduler.step()
                    self.optimizer.zero_grad()
                    self.global_step += 1

                    # Logging
                    current_lr = self.lr_scheduler.get_last_lr()[0]
                    train_logger.update(
                        step=self.global_step, loss=loss.item(), lr=current_lr
                    )

                    if self.global_step % self.config.logging_steps == 0:
                        elapsed = time.time() - logging_start_time
                        # Actual number of optimizer steps in this interval;
                        # it is shorter than logging_steps right after
                        # resuming from an unaligned step.
                        interval_steps = self.global_step - logging_start_step
                        steps_per_sec = (
                            interval_steps / elapsed if elapsed > 0 else 0
                        )

                        interval_loss = (
                            self.accelerator.gather(tr_loss).mean().item()
                        )
                        tr_loss.zero_()
                        avg_loss = interval_loss / (
                            max(interval_steps, 1)
                            * self.config.gradient_accumulation_steps
                        )

                        logs = {
                            "train/loss": avg_loss,
                            "train/learning_rate": current_lr,
                            "train/grad_norm": grad_norm,
                            "train/epoch": self.epoch,
                            "train/steps_per_sec": steps_per_sec,
                        }
                        train_logger.log_metrics(step=self.global_step, metrics=logs)

                        logging_start_time = time.time()
                        logging_start_step = self.global_step

                    # Evaluate
                    if (
                        self.eval_dataloader is not None
                        and self.global_step % self.config.eval_steps == 0
                    ):
                        self.evaluate()

                    # Save
                    if self.global_step % self.config.save_steps == 0:
                        self.save_checkpoint(self.global_step)
                        last_saved_step = self.global_step

        # Final Save (skipped when this exact step was just checkpointed)
        if last_saved_step != self.global_step:
            self.save_checkpoint(self.global_step)
        train_logger.close()
        self.accelerator.end_training()
diff --git a/omnivoice/utils/__init__.py b/omnivoice/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/omnivoice/utils/audio.py b/omnivoice/utils/audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..02c40100727df676e5545d748c3f0fb4c1bf37f1
--- /dev/null
+++ b/omnivoice/utils/audio.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Audio I/O and processing utilities.
+
+Provides functions for loading, resampling, silence removal,
+chunking, cross-fading, and format conversion.
+
+All public functions in this module operate on **numpy float32 arrays**
+with shape ``(C, T)`` (channels-first).
+"""
+
+import io
+import logging
+
+import numpy as np
+import soundfile as sf
+import torch
+import torchaudio
+from pydub import AudioSegment
+from pydub.silence import detect_leading_silence, detect_nonsilent, split_on_silence
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Loading
+# ---------------------------------------------------------------------------
+
+
def load_waveform(audio_path: str):
    """Read an audio file from disk at its native sampling rate.

    Decoding is attempted with two backends, in this order:
        1. soundfile — covers WAV/FLAC/OGG etc., no ffmpeg needed.
        2. librosa — covers MP3/M4A etc. via audioread + ffmpeg.

    Parameters:
        audio_path: path of the audio file.

    Returns:
        (data, sample_rate) where data is a numpy float32 array of
        shape (C, T).
    """
    try:
        frames, rate = sf.read(audio_path, dtype="float32", always_2d=True)
    except Exception:
        # Any soundfile failure (typically an unsupported container such as
        # MP3/M4A) is retried with librosa, imported lazily so it is only
        # needed when the fallback is actually taken.
        import librosa

        decoded, rate = librosa.load(audio_path, sr=None, mono=False)
        # librosa yields a 1-D array for mono input; add the channel axis.
        channels_first = decoded[np.newaxis, :] if decoded.ndim == 1 else decoded
        return channels_first, rate

    # soundfile yields (T, C); transpose to channels-first (C, T).
    return frames.T, rate
+
+
def load_audio(audio_path: str, sampling_rate: int) -> np.ndarray:
    """Load an audio file as mono at the requested sampling rate.

    Parameters:
        audio_path: path of the audio.
        sampling_rate: target sampling rate.

    Returns:
        Numpy float32 array of shape (1, T).
    """
    waveform, source_rate = load_waveform(audio_path)

    # Down-mix multi-channel audio to mono by averaging the channels.
    if waveform.shape[0] > 1:
        waveform = np.mean(waveform, axis=0, keepdims=True)

    # Nothing more to do when the file is already at the target rate.
    if source_rate == sampling_rate:
        return waveform

    resampled = torchaudio.functional.resample(
        torch.from_numpy(waveform),
        orig_freq=source_rate,
        new_freq=sampling_rate,
    )
    return resampled.numpy()
+
+
def load_audio_bytes(raw: bytes, sampling_rate: int) -> np.ndarray:
    """Decode an in-memory audio file as mono at the requested rate.

    Parameters:
        raw: raw audio file bytes (e.g. from WebDataset).
        sampling_rate: target sampling rate.

    Returns:
        Numpy float32 array of shape (1, T).
    """
    stream = io.BytesIO(raw)

    try:
        frames, source_rate = sf.read(stream, dtype="float32", always_2d=True)
    except Exception:
        # soundfile could not decode the payload; retry with librosa.
        import librosa

        # soundfile may have consumed part of the stream before failing.
        stream.seek(0)
        waveform, source_rate = librosa.load(stream, sr=None, mono=False)
        if waveform.ndim == 1:
            waveform = waveform[np.newaxis, :]
    else:
        # soundfile yields (T, C); transpose to channels-first (C, T).
        waveform = frames.T

    # Down-mix multi-channel audio to mono by averaging the channels.
    if waveform.shape[0] > 1:
        waveform = np.mean(waveform, axis=0, keepdims=True)

    if source_rate == sampling_rate:
        return waveform

    resampled = torchaudio.functional.resample(
        torch.from_numpy(waveform),
        orig_freq=source_rate,
        new_freq=sampling_rate,
    )
    return resampled.numpy()
+
+
+# ---------------------------------------------------------------------------
+# Audio processing (all numpy in / numpy out)
+# ---------------------------------------------------------------------------
+
+
def numpy_to_audiosegment(audio: np.ndarray, sample_rate: int) -> AudioSegment:
    """Build a 16-bit PCM pydub AudioSegment from a (C, T) float array.

    Samples are scaled by 32768 and clipped to the int16 range before being
    packed, so values outside [-1, 1) saturate instead of wrapping around.
    """
    n_channels = audio.shape[0]
    pcm = np.clip(audio * 32768.0, -32768, 32767).astype(np.int16)

    if n_channels > 1:
        # (C, T) -> (T, C) -> flat, i.e. frame-interleaved channel samples.
        pcm = pcm.T.flatten()

    payload = pcm.tobytes()
    return AudioSegment(
        data=payload,
        sample_width=2,  # bytes per sample (int16)
        frame_rate=sample_rate,
        channels=n_channels,
    )
+
+
def audiosegment_to_numpy(aseg: AudioSegment) -> np.ndarray:
    """Turn a pydub AudioSegment into a float32 numpy array of shape (C, T).

    Samples are divided by 32768.
    NOTE(review): that scale matches 16-bit segments such as those produced
    by ``numpy_to_audiosegment``; confirm before passing other sample widths.
    """
    scaled = np.array(aseg.get_array_of_samples()).astype(np.float32) / 32768.0
    n_channels = aseg.channels

    if n_channels != 1:
        # Samples are interleaved per frame: (T * C,) -> (T, C) -> (C, T).
        return scaled.reshape(-1, n_channels).T
    return scaled[np.newaxis, :]
+
+
def remove_silence(
    audio: np.ndarray,
    sampling_rate: int,
    mid_sil: int = 300,
    lead_sil: int = 100,
    trail_sil: int = 300,
) -> np.ndarray:
    """Remove middle silences longer than *mid_sil* ms and trim edge silences.

    Silence is detected with a fixed threshold of -50 dBFS.

    Parameters:
        audio: numpy array with shape (C, T).
        sampling_rate: sampling rate of the audio.
        mid_sil: middle-silence threshold in ms (0 to skip).
        lead_sil: kept leading silence in ms.
        trail_sil: kept trailing silence in ms.

    Returns:
        Numpy array with shape (C, T').
    """
    wave = numpy_to_audiosegment(audio, sampling_rate)

    if mid_sil > 0:
        non_silent_segs = split_on_silence(
            wave,
            min_silence_len=mid_sil,
            silence_thresh=-50,
            keep_silence=mid_sil,
            seek_step=10,
        )
        # Start from an empty segment at the *input* rate. pydub's default
        # for ``silent()`` is 11025 Hz, and concatenation syncs both operands
        # to the higher frame rate, so the default would silently resample
        # any audio recorded below 11025 Hz.
        wave = AudioSegment.silent(duration=0, frame_rate=sampling_rate)
        for seg in non_silent_segs:
            wave += seg

    wave = remove_silence_edges(wave, lead_sil, trail_sil, -50)

    return audiosegment_to_numpy(wave)
+
+
def remove_silence_edges(
    audio: AudioSegment,
    lead_sil: int = 100,
    trail_sil: int = 300,
    silence_threshold: float = -50,
) -> AudioSegment:
    """Trim silence from both ends, keeping *lead_sil* / *trail_sil* ms of it.

    The trailing edge is handled by reversing the segment, so the same
    leading-silence detector serves both ends.
    """
    lead_ms = detect_leading_silence(audio, silence_threshold=silence_threshold)
    front_trimmed = audio[max(0, lead_ms - lead_sil):]

    flipped = front_trimmed.reverse()
    trail_ms = detect_leading_silence(flipped, silence_threshold=silence_threshold)
    back_trimmed = flipped[max(0, trail_ms - trail_sil):]

    # Flip back to the original time direction.
    return back_trimmed.reverse()
+
+
def fade_and_pad_audio(
    audio: np.ndarray,
    pad_duration: float = 0.1,
    fade_duration: float = 0.1,
    sample_rate: int = 24000,
) -> np.ndarray:
    """Fade both ends of the signal and surround it with silence.

    The input is never modified; an empty input is returned unchanged.

    Args:
        audio: numpy array of shape (C, T).
        pad_duration: silence padding duration per side (seconds).
        fade_duration: fade curve duration (seconds).
        sample_rate: audio sampling rate.

    Returns:
        Processed numpy array of shape (C, T_new).
    """
    n_total = audio.shape[-1]
    if n_total == 0:
        return audio

    n_fade = int(fade_duration * sample_rate)
    n_pad = int(pad_duration * sample_rate)

    out = audio.copy()

    # Each ramp covers at most half the signal so the two never overlap.
    ramp_len = min(n_fade, n_total // 2) if n_fade > 0 else 0
    if ramp_len > 0:
        ramp_up = np.linspace(0, 1, ramp_len, dtype=np.float32)[np.newaxis, :]
        ramp_down = np.linspace(1, 0, ramp_len, dtype=np.float32)[np.newaxis, :]
        out[..., :ramp_len] *= ramp_up
        out[..., -ramp_len:] *= ramp_down

    if n_pad <= 0:
        return out

    pad_block = np.zeros((out.shape[0], n_pad), dtype=out.dtype)
    return np.concatenate([pad_block, out, pad_block], axis=-1)
+
+
def trim_long_audio(
    audio: np.ndarray,
    sampling_rate: int,
    max_duration: float = 15.0,
    min_duration: float = 3.0,
    trim_threshold: float = 20.0,
) -> np.ndarray:
    """Shorten over-long audio to at most *max_duration* seconds.

    Audio no longer than *trim_threshold* seconds is returned untouched.
    Otherwise the cut is placed at the onset of the last non-silent region
    that starts at or before *max_duration*. If that cut would leave less
    than *min_duration*, the audio is cut at *max_duration* instead.

    Args:
        audio: numpy array of shape (C, T).
        sampling_rate: audio sampling rate.
        max_duration: maximum duration in seconds.
        min_duration: minimum duration in seconds.
        trim_threshold: only trim if audio is longer than this (seconds).

    Returns:
        Trimmed numpy array.
    """
    if audio.shape[-1] / sampling_rate <= trim_threshold:
        return audio

    segment = numpy_to_audiosegment(audio, sampling_rate)
    voiced = detect_nonsilent(
        segment, min_silence_len=100, silence_thresh=-40, seek_step=10
    )
    # No non-silent region detected: there is nothing to anchor a cut on.
    if not voiced:
        return audio

    limit_ms = int(max_duration * 1000)
    floor_ms = int(min_duration * 1000)

    cut_ms = 0
    for onset, offset in voiced:
        if cut_ms < onset <= limit_ms:
            cut_ms = onset
        # Regions are visited in order; stop once one extends past the limit.
        if offset > limit_ms:
            break

    if cut_ms < floor_ms:
        # The candidate cut is too early; fall back to cutting at the limit.
        cut_ms = min(limit_ms, len(segment))

    return audiosegment_to_numpy(segment[:cut_ms])
+
+
def cross_fade_chunks(
    chunks: list[np.ndarray],
    sample_rate: int,
    silence_duration: float = 0.3,
) -> np.ndarray:
    """Join audio chunks, separating neighbours by a fade-out / gap / fade-in.

    The *silence_duration* budget is split into three equal parts: a
    fade-out on the audio merged so far, a block of zeros, and a fade-in on
    the next chunk. The input chunks are not modified; a single-chunk list
    returns that chunk as-is.

    Args:
        chunks: list of numpy arrays, each (C, T).
        sample_rate: audio sample rate.
        silence_duration: total silence gap duration in seconds.

    Returns:
        Merged numpy array (C, T_total).
    """
    if len(chunks) == 1:
        return chunks[0]

    # One third of the budget each for fade-out, silence and fade-in.
    ramp_len = int(silence_duration * sample_rate) // 3

    result = chunks[0].copy()
    for upcoming in chunks[1:]:
        # Fade out the tail of everything merged so far.
        tail_len = min(ramp_len, result.shape[-1])
        if tail_len > 0:
            down = np.linspace(1, 0, tail_len, dtype=np.float32)[np.newaxis, :]
            result[..., -tail_len:] *= down

        gap = np.zeros((chunks[0].shape[0], ramp_len), dtype=np.float32)

        # Fade in a copy of the next chunk so the caller's array is untouched.
        faded_next = upcoming.copy()
        head_len = min(ramp_len, faded_next.shape[-1])
        if head_len > 0:
            up = np.linspace(0, 1, head_len, dtype=np.float32)[np.newaxis, :]
            faded_next[..., :head_len] *= up

        result = np.concatenate([result, gap, faded_next], axis=-1)

    return result
diff --git a/omnivoice/utils/common.py b/omnivoice/utils/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bdd342673ff6b8dd2fe658ec7dceb66e03839f0
--- /dev/null
+++ b/omnivoice/utils/common.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Shared utility functions."""
+
+import argparse
+import random
+
+import numpy as np
+import torch
+
+
def str2bool(v):
    """Parse a command-line string into a bool.

    Intended as the ``type=`` callable of
    ``argparse.ArgumentParser.add_argument``. Matching is case-insensitive:

      - yes, true, t, y, 1 represent True
      - no, false, f, n, 0 represent False

    A value that is already a bool is returned unchanged; anything else
    raises ``argparse.ArgumentTypeError``.

    See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse  # noqa
    """
    if isinstance(v, bool):
        return v

    token = v.lower()
    if token in ("yes", "true", "t", "y", "1"):
        return True
    if token in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")
+
+
def fix_random_seed(random_seed: int):
    """Seed the global random number generators for reproducibility.

    Seeds the ``random`` module, numpy's legacy global generator, and torch.

    Args:
        random_seed: seed value applied to all three libraries.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.random.manual_seed(random_seed)
    # A throwaway ``random.Random()`` used to be seeded here and then
    # discarded; it influenced nothing, so it has been removed.
diff --git a/omnivoice/utils/data_utils.py b/omnivoice/utils/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b59fee4b0d7d98c18146c5577c192a263cabf1a2
--- /dev/null
+++ b/omnivoice/utils/data_utils.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Data utilities for batch inference and evaluation.
+
+Provides ``read_test_list()`` to parse JSONL test list files used by
+``omnivoice.cli.infer_batch`` and evaluation scripts.
+"""
+
+import json
+import logging
+from pathlib import Path
+
+
def read_test_list(path):
    """Read a JSONL test list file.

    Each non-blank line should be a JSON object. Only ``id`` and ``text``
    are expected to be present; every field is read with ``dict.get``, so a
    missing field (including ``id`` / ``text``) becomes ``None`` rather than
    raising:
        id, text, ref_audio, ref_text, instruct,
        language_id, language_name, duration, speed

    Lines that are not valid JSON, or that are valid JSON but not an object
    (e.g. a list or a bare string), are skipped with a warning.

    Note: ``language_name`` is only used by evaluation scripts (under
    ``omnivoice/eval/``) for grouping and reporting results. The model
    itself only consumes ``language_id``.

    Args:
        path: path to the JSONL file (``str`` or ``Path``).

    Returns:
        A list of dicts, one per accepted line, in file order.
    """
    # Output keys, in the order they appear in each returned dict.
    fields = (
        "id",
        "text",
        "ref_audio",
        "ref_text",
        "language_id",
        "language_name",
        "duration",
        "speed",
        "instruct",
    )

    samples = []
    with Path(path).open("r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                logging.warning(f"Skipping malformed JSON at line {line_no}: {line}")
                continue

            # ``json.loads`` also accepts lists, strings and numbers, none of
            # which have ``.get``; treat them like malformed lines.
            if not isinstance(obj, dict):
                logging.warning(f"Skipping non-object JSON at line {line_no}: {line}")
                continue

            samples.append({key: obj.get(key) for key in fields})
    return samples
diff --git a/omnivoice/utils/duration.py b/omnivoice/utils/duration.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3bb9d7641e5facdedb9ac8ea1ed59f704d4487c
--- /dev/null
+++ b/omnivoice/utils/duration.py
@@ -0,0 +1,282 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Text duration estimation for TTS generation.
+
+Provides ``RuleDurationEstimator``, which estimates audio duration from text
+using character phonetic weights across 600+ languages. Used by
+``OmniVoice.generate()`` to determine output length when no duration is specified.
+"""
+
+import bisect
+import unicodedata
+from functools import lru_cache
+from typing import Optional
+
+
class RuleDurationEstimator:
    """Estimate speaking duration of text from per-character script weights.

    Every character is assigned a weight: its relative speaking time
    compared to a Latin letter. A target duration is obtained by scaling a
    reference (text, duration) pair by the ratio of total weights.
    """

    def __init__(self):
        # ==========================================
        # 1. Phonetic Weights Table
        # ==========================================
        # The weight represents the relative speaking time compared to
        # a standard Latin letter.
        # Benchmark: 1.0 = One Latin Character (~40-50ms)
        self.weights = {
            # --- Logographic (1 char = full syllable/word) ---
            "cjk": 3.0,  # Chinese, Japanese Kanji, etc.
            # --- Syllabic / Blocks ---
            "hangul": 2.5,  # Korean Hangul
            "kana": 2.2,  # Japanese Hiragana/Katakana
            "ethiopic": 3.0,  # Amharic/Ge'ez
            "yi": 3.0,  # Yi script
            # --- Abugida (Consonant-Vowel complexes) ---
            "indic": 1.8,  # Hindi, Bengali, Tamil, etc.
            "thai_lao": 1.5,  # Thai, Lao
            "khmer_myanmar": 1.8,  # Khmer, Myanmar
            # --- Abjad (Consonant-heavy) ---
            "arabic": 1.5,  # Arabic, Persian, Urdu
            "hebrew": 1.5,  # Hebrew
            # --- Alphabet (Segmental) ---
            "latin": 1.0,  # English, Spanish, French, Vietnamese, etc. (Baseline)
            "cyrillic": 1.0,  # Russian, Ukrainian
            "greek": 1.0,  # Greek
            "armenian": 1.0,  # Armenian
            "georgian": 1.0,  # Georgian
            # --- Symbols & Misc ---
            "punctuation": 0.5,  # Pause capability
            "space": 0.2,  # Word boundary/Breath (0.05 / 0.22)
            "digit": 3.5,  # Numbers
            "mark": 0.0,  # Diacritics/Accents (Silent modifiers)
            "default": 1.0,  # Fallback for unknown scripts
        }

        # ==========================================
        # 2. Unicode Range Mapping
        # ==========================================
        # Format: (End_Codepoint, Type_Key)
        # Used for fast binary search (bisect): a code point maps to the
        # first entry whose end is >= the code point, so entries MUST stay
        # sorted by end code point and any gap between two entries is
        # absorbed by the later one.
        self.ranges = [
            (0x02AF, "latin"),  # Latin (Basic, Supplement, Ext, IPA)
            (0x03FF, "greek"),  # Greek & Coptic
            (0x052F, "cyrillic"),  # Cyrillic
            (0x058F, "armenian"),  # Armenian
            (0x05FF, "hebrew"),  # Hebrew
            (0x077F, "arabic"),  # Arabic, Syriac, Arabic Supplement
            (0x089F, "arabic"),  # Arabic Extended-B (+ Syriac Supp)
            (0x08FF, "arabic"),  # Arabic Extended-A
            (0x097F, "indic"),  # Devanagari
            (0x09FF, "indic"),  # Bengali
            (0x0A7F, "indic"),  # Gurmukhi
            (0x0AFF, "indic"),  # Gujarati
            (0x0B7F, "indic"),  # Oriya
            (0x0BFF, "indic"),  # Tamil
            (0x0C7F, "indic"),  # Telugu
            (0x0CFF, "indic"),  # Kannada
            (0x0D7F, "indic"),  # Malayalam
            (0x0DFF, "indic"),  # Sinhala
            (0x0EFF, "thai_lao"),  # Thai & Lao
            (0x0FFF, "indic"),  # Tibetan (Abugida)
            (0x109F, "khmer_myanmar"),  # Myanmar
            (0x10FF, "georgian"),  # Georgian
            (0x11FF, "hangul"),  # Hangul Jamo
            (0x137F, "ethiopic"),  # Ethiopic
            (0x139F, "ethiopic"),  # Ethiopic Supplement
            (0x13FF, "default"),  # Cherokee
            (0x167F, "default"),  # Canadian Aboriginal Syllabics
            (0x169F, "default"),  # Ogham
            (0x16FF, "default"),  # Runic
            (0x171F, "default"),  # Tagalog (Baybayin)
            (0x173F, "default"),  # Hanunoo
            (0x175F, "default"),  # Buhid
            (0x177F, "default"),  # Tagbanwa
            (0x17FF, "khmer_myanmar"),  # Khmer
            (0x18AF, "default"),  # Mongolian
            (0x18FF, "default"),  # Canadian Aboriginal Syllabics Ext
            (0x194F, "indic"),  # Limbu
            (0x19DF, "indic"),  # Tai Le & New Tai Lue
            (0x19FF, "khmer_myanmar"),  # Khmer Symbols
            (0x1A1F, "indic"),  # Buginese
            (0x1AAF, "indic"),  # Tai Tham
            (0x1B7F, "indic"),  # Balinese
            (0x1BBF, "indic"),  # Sundanese
            (0x1BFF, "indic"),  # Batak
            (0x1C4F, "indic"),  # Lepcha
            (0x1C7F, "indic"),  # Ol Chiki (Santali)
            (0x1C8F, "cyrillic"),  # Cyrillic Extended-C
            (0x1CBF, "georgian"),  # Georgian Extended
            (0x1CCF, "indic"),  # Sundanese Supplement
            (0x1CFF, "indic"),  # Vedic Extensions
            (0x1D7F, "latin"),  # Phonetic Extensions
            (0x1DBF, "latin"),  # Phonetic Extensions Supplement
            (0x1DFF, "default"),  # Combining Diacritical Marks Supplement
            (0x1EFF, "latin"),  # Latin Extended Additional (Vietnamese)
            # The entries from here to Cyrillic Extended-A close a gap that
            # previously made letters in these blocks (e.g. polytonic Greek)
            # fall through to the "kana" entry below.
            (0x1FFF, "greek"),  # Greek Extended (polytonic)
            (0x2BFF, "default"),  # General Punctuation .. Misc Symbols & Arrows
            (0x2C5F, "default"),  # Glagolitic
            (0x2C7F, "latin"),  # Latin Extended-C
            (0x2CFF, "default"),  # Coptic
            (0x2D2F, "georgian"),  # Georgian Supplement
            (0x2D7F, "default"),  # Tifinagh
            (0x2DDF, "ethiopic"),  # Ethiopic Extended
            (0x2DFF, "cyrillic"),  # Cyrillic Extended-A
            (0x309F, "kana"),  # Hiragana
            (0x30FF, "kana"),  # Katakana
            (0x312F, "cjk"),  # Bopomofo (Pinyin)
            (0x318F, "hangul"),  # Hangul Compatibility Jamo
            (0x9FFF, "cjk"),  # CJK Unified Ideographs (Main)
            (0xA4CF, "yi"),  # Yi Syllables
            (0xA4FF, "default"),  # Lisu
            (0xA63F, "default"),  # Vai
            (0xA69F, "cyrillic"),  # Cyrillic Extended-B
            (0xA6FF, "default"),  # Bamum
            (0xA7FF, "latin"),  # Latin Extended-D
            (0xA82F, "indic"),  # Syloti Nagri
            (0xA87F, "default"),  # Phags-pa
            (0xA8DF, "indic"),  # Saurashtra
            (0xA8FF, "indic"),  # Devanagari Extended
            (0xA92F, "indic"),  # Kayah Li
            (0xA95F, "indic"),  # Rejang
            (0xA97F, "hangul"),  # Hangul Jamo Extended-A
            (0xA9DF, "indic"),  # Javanese
            (0xA9FF, "khmer_myanmar"),  # Myanmar Extended-B
            (0xAA5F, "indic"),  # Cham
            (0xAA7F, "khmer_myanmar"),  # Myanmar Extended-A
            (0xAADF, "indic"),  # Tai Viet
            (0xAAFF, "indic"),  # Meetei Mayek Extensions
            (0xAB2F, "ethiopic"),  # Ethiopic Extended-A
            (0xAB6F, "latin"),  # Latin Extended-E
            (0xABBF, "default"),  # Cherokee Supplement
            (0xABFF, "indic"),  # Meetei Mayek
            (0xD7AF, "hangul"),  # Hangul Syllables
            (0xFAFF, "cjk"),  # CJK Compatibility
            (0xFDFF, "arabic"),  # Arabic Presentation Forms-A
            (0xFE6F, "default"),  # Variation Selectors
            (0xFEFF, "arabic"),  # Arabic Presentation Forms-B
            (0xFFEF, "latin"),  # Fullwidth Latin
        ]
        self.breakpoints = [r[0] for r in self.ranges]

    # NOTE: lru_cache on a method keys entries on (self, char) and the cache
    # lives on the class, so it is shared by all instances.
    @lru_cache(maxsize=4096)
    def _get_char_weight(self, char):
        """Return the weight of a single character.

        Checks run in this order: ASCII letters and the ASCII space (fast
        path), Arabic Tatweel, Unicode general category (marks,
        punctuation/symbols, separators, numbers), then the script range
        table, and finally the supplementary planes.
        """
        code = ord(char)
        if (65 <= code <= 90) or (97 <= code <= 122):
            return self.weights["latin"]
        if code == 32:
            return self.weights["space"]

        # Ignore Arabic Tatweel (a purely typographic elongation character).
        if code == 0x0640:
            return self.weights["mark"]

        category = unicodedata.category(char)

        if category.startswith("M"):
            return self.weights["mark"]

        if category.startswith("P") or category.startswith("S"):
            return self.weights["punctuation"]

        if category.startswith("Z"):
            return self.weights["space"]

        if category.startswith("N"):
            return self.weights["digit"]

        # Binary search for the Unicode block. Marks, punctuation, symbols,
        # separators and numbers were all handled above, so none of them
        # can reach the script ranges here.
        idx = bisect.bisect_left(self.breakpoints, code)
        if idx < len(self.ranges):
            script_type = self.ranges[idx][1]
            return self.weights.get(script_type, self.weights["default"])

        # Handle upper planes (CJK Ext B/C/D, Historic scripts).
        # U+20000 is itself the first CJK Extension B code point, hence >=.
        if code >= 0x20000:
            return self.weights["cjk"]

        return self.weights["default"]

    def calculate_total_weight(self, text):
        """Return the sum of the per-character weights of *text*."""
        return sum(self._get_char_weight(c) for c in text)

    def estimate_duration(
        self,
        target_text: str,
        ref_text: str,
        ref_duration: float,
        low_threshold: Optional[float] = 50,
        boost_strength: float = 3,
    ) -> float:
        """Estimate how long *target_text* takes to speak.

        The speaking rate is inferred from the reference pair
        (weight of ``ref_text`` per unit of ``ref_duration``) and applied to
        the weight of ``target_text``. The result is expressed in the same
        unit as ``ref_duration``.

        Args:
            target_text (str): The text for which we want to estimate the duration.
            ref_text (str): The reference text that was used to measure
                the ref_duration.
            ref_duration (float): The actual duration it took
                to speak the ref_text.
            low_threshold (float): Estimates below this value are considered
                unreliable and are boosted towards it along a power curve:
                ``low_threshold * (estimate / low_threshold) ** (1 / boost_strength)``.
                Pass ``None`` to disable the boost.
            boost_strength (float): Controls the power-curve boost for short durations.
                Higher values boost small durations more aggressively.
                1 = no boost (linear), 2 = sqrt-like

        Returns:
            float: The estimated duration for the target_text based
                on the ref_text and ref_duration. ``0.0`` when the reference
                is unusable (non-positive duration, empty text, or text whose
                total weight is zero).
        """
        if ref_duration <= 0 or not ref_text:
            return 0.0

        ref_weight = self.calculate_total_weight(ref_text)
        if ref_weight == 0:
            return 0.0

        speed_factor = ref_weight / ref_duration
        target_weight = self.calculate_total_weight(target_text)

        estimated_duration = target_weight / speed_factor
        if low_threshold is not None and estimated_duration < low_threshold:
            alpha = 1.0 / boost_strength
            return low_threshold * (estimated_duration / low_threshold) ** alpha
        else:
            return estimated_duration
+
+
+# ==========================================
+# Example Usage
+# ==========================================
if __name__ == "__main__":
    # Demo: estimate durations for a few scripts against one English
    # reference utterance and print the per-text weight and estimate.
    demo_estimator = RuleDurationEstimator()

    reference_text = "Hello, world."
    reference_seconds = 1.5

    samples = [
        ("Hindi (With complex marks)", "नमस्ते दुनिया"),
        ("Arabic (With vowels)", "مَرْحَبًا بِالْعَالَم"),
        ("Vietnamese (Lots of diacritics)", "Chào thế giới"),
        ("Chinese", "你好,世界!"),
        ("Mixed Emoji", "Hello 🌍! This is fun 🎉"),
    ]

    divider = "-" * 30

    print("--- Reference ---")
    print(f"Reference Text: '{reference_text}'")
    print(f"Reference Duration: {reference_seconds}s")
    print(divider)

    for label, sentence in samples:
        estimate = demo_estimator.estimate_duration(
            sentence, reference_text, reference_seconds
        )
        total_weight = demo_estimator.calculate_total_weight(sentence)

        print(f"[{label}]")
        print(f"Text: {sentence}")
        print(f"Total Weight: {total_weight:.2f}")
        print(f"Estimated Duration: {estimate:.2f} s")
        print(divider)
diff --git a/omnivoice/utils/lang_map.py b/omnivoice/utils/lang_map.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffcda10ae5ab93ac45ca12de1147a752585a5ed1
--- /dev/null
+++ b/omnivoice/utils/lang_map.py
@@ -0,0 +1,698 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Language name to ISO 639-3 code mapping.
+
+Auto-generated from ``docs/lang_id_name_map.tsv``. Provides ``LANG_NAME_TO_ID``
+(for resolving language names to codes) and ``LANG_IDS`` (the set of supported
+ISO 639-3 codes). Used by ``OmniVoice.generate()`` to resolve user-provided
+language names.
+"""
+
+# Auto-generated from docs/lang_id_name_map.tsv
+# Maps lowercase language name -> language ID code
+
+LANG_NAME_TO_ID = {
+ "abadi": "kbt",
+ "abkhazian": "ab",
+ "abron": "abr",
+ "abua": "abn",
+ "adamawa fulfulde": "fub",
+ "adyghe": "ady",
+ "afade": "aal",
+ "afrikaans": "af",
+ "agwagwune": "yay",
+ "aja (benin)": "ajg",
+ "akebu": "keu",
+ "alago": "ala",
+ "albanian": "sq",
+ "algerian arabic": "arq",
+ "algerian saharan arabic": "aao",
+ "ambo-pasco quechua": "qva",
+ "ambonese malay": "abs",
+ "amdo tibetan": "adx",
+ "amharic": "am",
+ "anaang": "anw",
+ "angika": "anp",
+ "antankarana malagasy": "xmv",
+ "aragonese": "an",
+ "arbëreshë albanian": "aae",
+ "arequipa-la unión quechua": "qxu",
+ "armenian": "hy",
+ "ashe": "ahs",
+ "ashéninka perené": "prq",
+ "askopan": "eiv",
+ "assamese": "as",
+ "asturian": "ast",
+ "atayal": "tay",
+ "awak": "awo",
+ "ayacucho quechua": "quy",
+ "azerbaijani": "az",
+ "baatonum": "bba",
+ "bacama": "bcy",
+ "bade": "bde",
+ "bafia": "ksf",
+ "bafut": "bfd",
+ "bagirmi fulfulde": "fui",
+ "bago-kusuntu": "bqg",
+ "baharna arabic": "abv",
+ "bakoko": "bkh",
+ "balanta-ganja": "bjt",
+ "balti": "bft",
+ "bamenyam": "bce",
+ "bamun": "bax",
+ "bangwinji": "bsj",
+ "banjar": "bjn",
+ "bankon": "abb",
+ "baoulé": "bci",
+ "bara malagasy": "bhr",
+ "barok": "bjk",
+ "basa (cameroon)": "bas",
+ "basa (nigeria)": "bzw",
+ "bashkir": "ba",
+ "basque": "eu",
+ "batak mandailing": "btm",
+ "batanga": "bnm",
+ "bateri": "btv",
+ "bats": "bbl",
+ "bayot": "bda",
+ "bebele": "beb",
+ "belarusian": "be",
+ "bengali": "bn",
+ "betawi": "bew",
+ "bhili": "bhb",
+ "bhojpuri": "bho",
+ "bilur": "bxf",
+ "bima": "bhp",
+ "bodo": "brx",
+ "boghom": "bux",
+ "bokyi": "bky",
+ "bomu": "bmq",
+ "bondei": "bou",
+ "borgu fulfulde": "fue",
+ "bosnian": "bs",
+ "brahui": "brh",
+ "braj": "bra",
+ "breton": "br",
+ "buduma": "bdm",
+ "buginese": "bug",
+ "bukharic": "bhh",
+ "bulgarian": "bg",
+ "bulu (cameroon)": "bum",
+ "bundeli": "bns",
+ "bunun": "bnn",
+ "bura-pabir": "bwr",
+ "burak": "bys",
+ "burmese": "my",
+ "burushaski": "bsk",
+ "cacaloxtepec mixtec": "miu",
+ "cajatambo north lima quechua": "qvl",
+ "cakfem-mushere": "cky",
+ "cameroon pidgin": "wes",
+ "campidanese sardinian": "sro",
+ "cantonese": "yue",
+ "catalan": "ca",
+ "cebuano": "ceb",
+ "cen": "cen",
+ "central kurdish": "ckb",
+ "central nahuatl": "nhn",
+ "central pame": "pbs",
+ "central pashto": "pst",
+ "central puebla nahuatl": "ncx",
+ "central tarahumara": "tar",
+ "central yupik": "esu",
+ "central-eastern niger fulfulde": "fuq",
+ "chadian arabic": "shu",
+ "chichewa": "ny",
+ "chichicapan zapotec": "zpv",
+ "chiga": "cgg",
+ "chimalapa zoque": "zoh",
+ "chimborazo highland quichua": "qug",
+ "chinese": "zh",
+ "chiquián ancash quechua": "qxa",
+ "chitwania tharu": "the",
+ "chokwe": "cjk",
+ "chuvash": "cv",
+ "cibak": "ckl",
+ "coastal konjo": "kjc",
+ "copainalá zoque": "zoc",
+ "cornish": "kw",
+ "corongo ancash quechua": "qwa",
+ "croatian": "hr",
+ "cross river mbembe": "mfn",
+ "cuyamecalco mixtec": "xtu",
+ "czech": "cs",
+ "dadiya": "dbd",
+ "dagbani": "dag",
+ "dameli": "dml",
+ "danish": "da",
+ "dargwa": "dar",
+ "dazaga": "dzg",
+ "deccan": "dcc",
+ "degema": "deg",
+ "dera (nigeria)": "kna",
+ "dghwede": "dgh",
+ "dhatki": "mki",
+ "dhivehi": "dv",
+ "dhofari arabic": "adf",
+ "dijim-bwilim": "cfa",
+ "dogri": "dgo",
+ "domaaki": "dmk",
+ "dotyali": "dty",
+ "duala": "dua",
+ "dutch": "nl",
+ "dũya": "ldb",
+ "dyula": "dyu",
+ "eastern balochi": "bgp",
+ "eastern bolivian guaraní": "gui",
+ "eastern egyptian bedawi arabic": "avl",
+ "eastern krahn": "kqo",
+ "eastern mari": "mhr",
+ "eastern yiddish": "ydd",
+ "ebrié": "ebr",
+ "eggon": "ego",
+ "egyptian arabic": "arz",
+ "ejagham": "etu",
+ "eleme": "elm",
+ "eloyi": "afo",
+ "embu": "ebu",
+ "english": "en",
+ "erzya": "myv",
+ "esan": "ish",
+ "esperanto": "eo",
+ "estonian": "et",
+ "eton (cameroon)": "eto",
+ "ewondo": "ewo",
+ "extremaduran": "ext",
+ "fang (equatorial guinea)": "fan",
+ "fanti": "fat",
+ "farefare": "gur",
+ "fe'fe'": "fmp",
+ "filipino": "fil",
+ "filomena mata-coahuitlán totonac": "tlp",
+ "finnish": "fi",
+ "fipa": "fip",
+ "french": "fr",
+ "fulah": "ff",
+ "galician": "gl",
+ "gambian wolof": "wof",
+ "ganda": "lg",
+ "garhwali": "gbm",
+ "gawar-bati": "gwt",
+ "gawri": "gwc",
+ "gbagyi": "gbr",
+ "gbari": "gby",
+ "geji": "gyz",
+ "gen": "gej",
+ "georgian": "ka",
+ "german": "de",
+ "geser-gorom": "ges",
+ "gheg albanian": "aln",
+ "ghomálá'": "bbj",
+ "gidar": "gid",
+ "glavda": "glw",
+ "goan konkani": "gom",
+ "goaria": "gig",
+ "goemai": "ank",
+ "gola": "gol",
+ "greek": "el",
+ "guarani": "gn",
+ "guduf-gava": "gdf",
+ "guerrero amuzgo": "amu",
+ "gujarati": "gu",
+ "gujari": "gju",
+ "gulf arabic": "afb",
+ "gurgula": "ggg",
+ "gusii": "guz",
+ "gusilay": "gsl",
+ "gweno": "gwe",
+ "güilá zapotec": "ztu",
+ "hadothi": "hoj",
+ "hahon": "hah",
+ "haitian": "ht",
+ "hakha chin": "cnh",
+ "hakö": "hao",
+ "halia": "hla",
+ "hausa": "ha",
+ "hawaiian": "haw",
+ "hazaragi": "haz",
+ "hebrew": "he",
+ "hemba": "hem",
+ "herero": "hz",
+ "highland konjo": "kjk",
+ "hijazi arabic": "acw",
+ "hindi": "hi",
+ "huarijio": "var",
+ "huautla mazatec": "mau",
+ "huaxcaleca nahuatl": "nhq",
+ "huba": "hbb",
+ "huitepec mixtec": "mxs",
+ "hula": "hul",
+ "hungarian": "hu",
+ "hunjara-kaina ke": "hkk",
+ "hwana": "hwo",
+ "ibibio": "ibb",
+ "icelandic": "is",
+ "idakho-isukha-tiriki": "ida",
+ "idoma": "idu",
+ "igbo": "ig",
+ "igo": "ahl",
+ "ikposo": "kpo",
+ "ikwere": "ikw",
+ "imbabura highland quichua": "qvi",
+ "indonesian": "id",
+ "indus kohistani": "mvy",
+ "interlingua (international auxiliary language association)": "ia",
+ "inupiaq": "ik",
+ "irish": "ga",
+ "iron ossetic": "os",
+ "isekiri": "its",
+ "isoko": "iso",
+ "italian": "it",
+ "ito": "itw",
+ "itzá": "itz",
+ "ixtayutla mixtec": "vmj",
+ "izon": "ijc",
+ "jambi malay": "jax",
+ "japanese": "ja",
+ "jaqaru": "jqr",
+ "jauja wanca quechua": "qxw",
+ "jaunsari": "jns",
+ "javanese": "jv",
+ "jiba": "juo",
+ "jju": "kaj",
+ "judeo-moroccan arabic": "aju",
+ "juxtlahuaca mixtec": "vmc",
+ "kabardian": "kbd",
+ "kabras": "lkb",
+ "kabuverdianu": "kea",
+ "kabyle": "kab",
+ "kachi koli": "gjk",
+ "kairak": "ckr",
+ "kalabari": "ijn",
+ "kalasha": "kls",
+ "kalenjin": "kln",
+ "kalkoti": "xka",
+ "kamba": "kam",
+ "kamo": "kcq",
+ "kanauji": "bjj",
+ "kanembu": "kbl",
+ "kannada": "kn",
+ "karekare": "kai",
+ "kashmiri": "ks",
+ "kathoriya tharu": "tkt",
+ "kati": "bsh",
+ "kazakh": "kk",
+ "keiyo": "eyo",
+ "khams tibetan": "khg",
+ "khana": "ogo",
+ "khetrani": "xhe",
+ "khmer": "km",
+ "khowar": "khw",
+ "kinga": "zga",
+ "kinnauri": "kfk",
+ "kinyarwanda": "rw",
+ "kirghiz": "ky",
+ "kirya-konzəl": "fkk",
+ "kochila tharu": "thq",
+ "kohistani shina": "plk",
+ "kohumono": "bcs",
+ "kok borok": "trp",
+ "kol (papua new guinea)": "kol",
+ "kom (cameroon)": "bkm",
+ "koma": "kmy",
+ "konkani": "knn",
+ "konzo": "koo",
+ "korean": "ko",
+ "korwa": "kfp",
+ "kota (india)": "kfe",
+ "koti": "eko",
+ "kuanua": "ksd",
+ "kuanyama": "kj",
+ "kui (india)": "uki",
+ "kulung (nigeria)": "bbu",
+ "kuot": "kto",
+ "kushi": "kuh",
+ "kwambi": "kwm",
+ "kwasio": "nmg",
+ "lala-roba": "lla",
+ "lamang": "hia",
+ "lao": "lo",
+ "larike-wakasihu": "alo",
+ "lasi": "lss",
+ "latgalian": "ltg",
+ "latvian": "lv",
+ "levantine arabic": "apc",
+ "liana-seti": "ste",
+ "liberia kpelle": "xpe",
+ "liberian english": "lir",
+ "libyan arabic": "ayl",
+ "ligurian": "lij",
+ "lijili": "mgi",
+ "lingala": "ln",
+ "lithuanian": "lt",
+ "loarki": "lrk",
+ "logooli": "rag",
+ "logudorese sardinian": "src",
+ "loja highland quichua": "qvj",
+ "loloda": "loa",
+ "longuda": "lnu",
+ "loxicha zapotec": "ztp",
+ "luba-lulua": "lua",
+ "luo": "luo",
+ "lushai": "lus",
+ "luxembourgish": "lb",
+ "maasina fulfulde": "ffm",
+ "maba (chad)": "mde",
+ "macedo-romanian": "rup",
+ "macedonian": "mk",
+ "mada (cameroon)": "mxu",
+ "mafa": "maf",
+ "maithili": "mai",
+ "malay": "ms",
+ "malayalam": "ml",
+ "mali": "gcc",
+ "malinaltepec me'phaa": "tcf",
+ "maltese": "mt",
+ "mandara": "tbf",
+ "mandjak": "mfv",
+ "manggarai": "mqy",
+ "manipuri": "mni",
+ "mansoanka": "msw",
+ "manx": "gv",
+ "maori": "mi",
+ "marathi": "mr",
+ "marghi central": "mrt",
+ "marghi south": "mfm",
+ "maria (india)": "mrr",
+ "marwari (pakistan)": "mve",
+ "masana": "mcn",
+ "masikoro malagasy": "msh",
+ "matsés": "mcf",
+ "mazaltepec zapotec": "zpy",
+ "mazatlán mazatec": "vmz",
+ "mazatlán mixe": "mzl",
+ "mbe": "mfo",
+ "mbo (cameroon)": "mbo",
+ "mbum": "mdd",
+ "medumba": "byv",
+ "mekeo": "mek",
+ "meru": "mer",
+ "mesopotamian arabic": "acm",
+ "mewari": "mtr",
+ "min nan chinese": "nan",
+ "mingrelian": "xmf",
+ "mitlatongo mixtec": "vmm",
+ "miya": "mkf",
+ "mokpwe": "bri",
+ "moksha": "mdf",
+ "mom jango": "ver",
+ "mongolian": "mn",
+ "moroccan arabic": "ary",
+ "motu": "meu",
+ "mpiemo": "mcx",
+ "mpumpong": "mgg",
+ "mundang": "mua",
+ "mungaka": "mhk",
+ "musey": "mse",
+ "musgu": "mug",
+ "musi": "mui",
+ "naba": "mne",
+ "najdi arabic": "ars",
+ "nalik": "nal",
+ "nawdm": "nmz",
+ "ndonga": "ng",
+ "neapolitan": "nap",
+ "nepali": "npi",
+ "ngamo": "nbh",
+ "ngas": "anc",
+ "ngiemboon": "nnh",
+ "ngizim": "ngi",
+ "ngomba": "jgo",
+ "ngombale": "nla",
+ "nigerian fulfulde": "fuv",
+ "nigerian pidgin": "pcm",
+ "nimadi": "noe",
+ "nobiin": "fia",
+ "north mesopotamian arabic": "ayp",
+ "north moluccan malay": "max",
+ "northern betsimisaraka malagasy": "bmm",
+ "northern hindko": "hno",
+ "northern kurdish": "kmr",
+ "northern pame": "pmq",
+ "northern pashto": "pbu",
+ "northern uzbek": "uzn",
+ "northwest gbaya": "gya",
+ "norwegian": "no",
+ "norwegian bokmål": "nb",
+ "norwegian nynorsk": "nn",
+ "notsi": "ncf",
+ "nyankpa": "yes",
+ "nyungwe": "nyu",
+ "nzanyi": "nja",
+ "nüpode huitoto": "hux",
+ "occitan": "oc",
+ "od": "odk",
+ "odia": "ory",
+ "odual": "odu",
+ "omani arabic": "acx",
+ "orizaba nahuatl": "nlv",
+ "orma": "orc",
+ "ormuri": "oru",
+ "oromo": "om",
+ "pahari-potwari": "phr",
+ "paiwan": "pwn",
+ "panjabi": "pa",
+ "papuan malay": "pmy",
+ "parkari koli": "kvx",
+ "pedi": "nso",
+ "pero": "pip",
+ "persian": "fa",
+ "petats": "pex",
+ "phalura": "phl",
+ "piemontese": "pms",
+ "piya-kwonci": "piy",
+ "plateau malagasy": "plt",
+ "polish": "pl",
+ "poqomam": "poc",
+ "portuguese": "pt",
+ "pulaar": "fuc",
+ "pular": "fuf",
+ "puno quechua": "qxp",
+ "pushto": "ps",
+ "pökoot": "pko",
+ "qaqet": "byx",
+ "quiotepec chinantec": "chq",
+ "rana tharu": "thr",
+ "rangi": "lag",
+ "rapoisi": "kyx",
+ "ratahan": "rth",
+ "rayón zoque": "zor",
+ "romanian": "ro",
+ "romansh": "rm",
+ "rombo": "rof",
+ "rotokas": "roo",
+ "rukai": "dru",
+ "russian": "ru",
+ "sacapulteco": "quv",
+ "saidi arabic": "aec",
+ "sakalava malagasy": "skg",
+ "sakizaya": "szy",
+ "saleman": "sau",
+ "samba daka": "ccg",
+ "samba leko": "ndi",
+ "san felipe otlaltepec popoloca": "pow",
+ "san francisco del mar huave": "hue",
+ "san juan atzingo popoloca": "poe",
+ "san martín itunyoso triqui": "trq",
+ "san miguel el grande mixtec": "mig",
+ "sansi": "ssi",
+ "sanskrit": "sa",
+ "santa ana de tusi pasco quechua": "qxt",
+ "santa catarina albarradas zapotec": "ztn",
+ "santali": "sat",
+ "santiago del estero quichua": "qus",
+ "saposa": "sps",
+ "saraiki": "skr",
+ "sardinian": "sc",
+ "saya": "say",
+ "sediq": "trv",
+ "serbian": "sr",
+ "seri": "sei",
+ "shina": "scl",
+ "shona": "sn",
+ "siar-lak": "sjr",
+ "sibe": "nco",
+ "sicilian": "scn",
+ "sihuas ancash quechua": "qws",
+ "sikkimese": "sip",
+ "sinaugoro": "snc",
+ "sindhi": "sd",
+ "sindhi bhil": "sbn",
+ "sinhala": "si",
+ "sinicahua mixtec": "xti",
+ "sipacapense": "qum",
+ "siwai": "siw",
+ "slovak": "sk",
+ "slovenian": "sl",
+ "solos": "sol",
+ "somali": "so",
+ "soninke": "snk",
+ "south giziga": "giz",
+ "south ucayali ashéninka": "cpy",
+ "southeastern nochixtlán mixtec": "mxy",
+ "southern betsimisaraka malagasy": "bzc",
+ "southern pashto": "pbt",
+ "southern pastaza quechua": "qup",
+ "soyaltepec mazatec": "vmp",
+ "spanish": "es",
+ "standard arabic": "arb",
+ "standard moroccan tamazight": "zgh",
+ "sudanese arabic": "apd",
+ "sulka": "sua",
+ "svan": "sva",
+ "swahili": "sw",
+ "swedish": "sv",
+ "tae'": "rob",
+ "tahaggart tamahaq": "thv",
+ "taita": "dav",
+ "tajik": "tg",
+ "tamil": "ta",
+ "tandroy-mahafaly malagasy": "tdx",
+ "tangale": "tan",
+ "tanosy malagasy": "txy",
+ "tarok": "yer",
+ "tatar": "tt",
+ "tedaga": "tuq",
+ "telugu": "te",
+ "tem": "kdh",
+ "teop": "tio",
+ "tepeuxila cuicatec": "cux",
+ "tepinapa chinantec": "cte",
+ "tera": "ttr",
+ "terei": "buo",
+ "termanu": "twu",
+ "tesaka malagasy": "tkg",
+ "tetelcingo nahuatl": "nhg",
+ "teutila cuicatec": "cut",
+ "thai": "th",
+ "tibetan": "bo",
+ "tidaá mixtec": "mtx",
+ "tidore": "tvo",
+ "tigak": "tgc",
+ "tigre": "tig",
+ "tigrinya": "ti",
+ "tilquiapan zapotec": "zts",
+ "tinputz": "tpz",
+ "tlacoapa me'phaa": "tpl",
+ "tlacoatzintepec chinantec": "ctl",
+ "tlingit": "tli",
+ "toki pona": "tok",
+ "tomoip": "tqp",
+ "tondano": "tdn",
+ "tonsea": "txs",
+ "tooro": "ttj",
+ "torau": "ttu",
+ "torwali": "trw",
+ "tsimihety malagasy": "xmw",
+ "tsotso": "lto",
+ "tswana": "tn",
+ "tugen": "tuy",
+ "tuki": "bag",
+ "tula": "tul",
+ "tulu": "tcy",
+ "tunen": "tvu",
+ "tungag": "lcm",
+ "tunisian arabic": "aeb",
+ "tupuri": "tui",
+ "turkana": "tuv",
+ "turkish": "tr",
+ "turkmen": "tk",
+ "tututepec mixtec": "mtu",
+ "twi": "tw",
+ "ubaghara": "byc",
+ "uighur": "ug",
+ "ukrainian": "uk",
+ "umbundu": "umb",
+ "upper sorbian": "hsb",
+ "urdu": "ur",
+ "ushojo": "ush",
+ "uzbek": "uz",
+ "vai": "vai",
+ "vietnamese": "vi",
+ "votic": "vot",
+ "võro": "vro",
+ "waci gbe": "wci",
+ "wadiyara koli": "kxp",
+ "waja": "wja",
+ "wakhi": "wbl",
+ "wanga": "lwg",
+ "wapan": "juk",
+ "warji": "wji",
+ "welsh": "cy",
+ "wemale": "weo",
+ "western frisian": "fy",
+ "western highland purepecha": "pua",
+ "western juxtlahuaca mixtec": "jmx",
+ "western maninkakan": "mlq",
+ "western mari": "mrj",
+ "western niger fulfulde": "fuh",
+ "western panjabi": "pnb",
+ "wolof": "wo",
+ "wuzlam": "udl",
+ "xanaguía zapotec": "ztg",
+ "xhosa": "xh",
+ "yace": "ekr",
+ "yakut": "sah",
+ "yalahatan": "jal",
+ "yanahuanca pasco quechua": "qur",
+ "yangben": "yav",
+ "yaqui": "yaq",
+ "yauyos quechua": "qux",
+ "yekhee": "ets",
+ "yiddish": "yi",
+ "yidgha": "ydg",
+ "yoruba": "yo",
+ "yutanduchi mixtec": "mab",
+ "zacatlán-ahuacatlán-tepetzintla nahuatl": "nhi",
+ "zarma": "dje",
+ "zaza": "zza",
+ "zulu": "zu",
+ "ömie": "aom",
+}
+
+LANG_NAMES = set(LANG_NAME_TO_ID)  # every language name (the table's keys)
+LANG_IDS = set(LANG_NAME_TO_ID.values())  # every language ID (the table's values)
+
+# Overrides for names whose canonical TSV casing differs from str.title().
+_TITLE_EXCEPTIONS = {
+    "fe'fe'": "Fe'fe'",
+    "dũya": "Dũya",
+    "santiago del estero quichua": "Santiago del Estero Quichua",
+    "santa ana de tusi pasco quechua": "Santa Ana de Tusi Pasco Quechua",
+    "malinaltepec me'phaa": "Malinaltepec Me'phaa",
+    "tlacoapa me'phaa": "Tlacoapa Me'phaa",
+}
+
+
+def lang_display_name(name: str) -> str:
+    """Return the display casing for a lowercase language name.
+
+    Entries in ``_TITLE_EXCEPTIONS`` win; any other name is title-cased.
+    """
+    override = _TITLE_EXCEPTIONS.get(name)
+    return name.title() if override is None else override
diff --git a/omnivoice/utils/text.py b/omnivoice/utils/text.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fc9adb04202eab422f7988cccb494edf17cdfad
--- /dev/null
+++ b/omnivoice/utils/text.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Text processing utilities for TTS inference.
+
+Provides:
+- ``chunk_text_punctuation()``: Splits long text into model-friendly chunks at
+ sentence boundaries, with abbreviation-aware punctuation splitting.
+- ``add_punctuation()``: Appends missing end punctuation (Chinese or English).
+"""
+
+from typing import List, Optional
+
+
+SPLIT_PUNCTUATION = set(".,;:!?。!?\uff0c\uff1b\uff1a")  # escapes: fullwidth , ; :
+CLOSING_MARKS = set("\"')]》>」】\u201d\u2019")  # escapes: right curly quotes
+
+END_PUNCTUATION = {
+    # Final characters that already count as punctuation. ASCII / Western:
+    ";",
+    ":",
+    ",",
+    ".",
+    "!",
+    "?",
+    "…",
+    ")",
+    "]",
+    "}",
+    '"',
+    "'",
+    "\u201d",  # right double quotation mark (a bare """ would open a string)
+    "\u2019",  # right single quotation mark
+    # CJK / fullwidth (escaped where the glyph looks like its ASCII twin):
+    "\uff1b",  # fullwidth semicolon
+    "\uff1a",  # fullwidth colon
+    "\uff0c",  # fullwidth comma
+    "。",
+    "!",
+    "?",
+    "、",
+    "……",
+    "\uff09",  # fullwidth right parenthesis
+    "】",
+}
+
+
+ABBREVIATIONS = {
+    "Mr.",  # titles and honorifics
+    "Mrs.",
+    "Ms.",
+    "Dr.",
+    "Prof.",
+    "Sr.",
+    "Jr.",
+    "Rev.",
+    "Fr.",
+    "Hon.",
+    "Pres.",  # offices and ranks
+    "Gov.",
+    "Capt.",
+    "Gen.",
+    "Sen.",
+    "Rep.",
+    "Col.",
+    "Maj.",
+    "Lt.",
+    "Cmdr.",
+    "Sgt.",
+    "Cpl.",
+    "Co.",  # companies and organisations
+    "Corp.",
+    "Inc.",
+    "Ltd.",
+    "Est.",
+    "Dept.",
+    "St.",  # streets and place names
+    "Ave.",
+    "Blvd.",
+    "Rd.",
+    "Mt.",
+    "Ft.",
+    "No.",
+    "Jan.",  # months
+    "Feb.",
+    "Mar.",
+    "Apr.",
+    "Aug.",
+    "Sep.",
+    "Sept.",
+    "Oct.",
+    "Nov.",
+    "Dec.",
+    "i.e.",  # general abbreviations (matching is case-sensitive)
+    "e.g.",
+    "vs.",
+    "Vs.",
+    "Etc.",
+    "approx.",
+    "fig.",
+    "def.",
+}
+
+
+def chunk_text_punctuation(
+    text: str,
+    chunk_len: int,
+    min_chunk_len: Optional[int] = None,
+) -> List[str]:
+    """Split ``text`` into chunks of about ``chunk_len`` characters.
+
+    The text is cut after every character in ``SPLIT_PUNCTUATION``, except a
+    "." that completes a word listed in ``ABBREVIATIONS`` (e.g. "Mr."). The
+    resulting sentences are then packed greedily into chunks.
+
+    Args:
+        text: Text to split; all lengths are counted in characters.
+        chunk_len: Length limit used while packing sentences. A sentence
+            longer than the limit is never cut, so a chunk can exceed it.
+        min_chunk_len: If given, a chunk shorter than this is merged into a
+            neighbour: a short first chunk absorbs the second chunk, any
+            other short chunk is appended to the chunk before it. Merged
+            chunks can exceed ``chunk_len``.
+
+    Returns:
+        The non-empty chunks in order, stripped of surrounding whitespace.
+    """
+    # 1. Split the text into sentences (lists of characters) at punctuation.
+    sentences: List[List[str]] = []
+    current_sentence: List[str] = []
+
+    for token in text:
+        # Punctuation or a closing mark directly after a split belongs to the
+        # sentence that was just closed. Attach it and move on: opening a new
+        # (empty) sentence here would detach the rest of a run such as "..."
+        # or "?!", which could then end up at the start of the next chunk.
+        if (
+            not current_sentence
+            and sentences
+            and (token in SPLIT_PUNCTUATION or token in CLOSING_MARKS)
+        ):
+            sentences[-1].append(token)
+            continue
+
+        current_sentence.append(token)
+        if token not in SPLIT_PUNCTUATION:
+            continue
+
+        # A "." that completes a known abbreviation does not end the sentence.
+        if token == ".":
+            words = "".join(current_sentence).split()
+            if words and words[-1] in ABBREVIATIONS:
+                continue
+
+        sentences.append(current_sentence)
+        current_sentence = []
+
+    # Text after the last punctuation mark forms a sentence as well.
+    if current_sentence:
+        sentences.append(current_sentence)
+
+    # 2. Greedily pack consecutive sentences into chunks (limit: chunk_len).
+    merged_chunks: List[List[str]] = []
+    current_chunk: List[str] = []
+    for sentence in sentences:
+        if len(current_chunk) + len(sentence) <= chunk_len:
+            current_chunk.extend(sentence)
+        else:
+            if current_chunk:
+                merged_chunks.append(current_chunk)
+            current_chunk = sentence
+    if current_chunk:
+        merged_chunks.append(current_chunk)
+
+    # 3. Merge undersized chunks into a neighbour (see ``min_chunk_len``).
+    final_chunks: List[List[str]] = []
+    if min_chunk_len is None:
+        final_chunks = merged_chunks
+    else:
+        short_first = bool(merged_chunks) and len(merged_chunks[0]) < min_chunk_len
+        for i, chunk in enumerate(merged_chunks):
+            if i == 0:
+                final_chunks.append(chunk)
+            elif len(chunk) < min_chunk_len or (i == 1 and short_first):
+                final_chunks[-1].extend(chunk)
+            else:
+                final_chunks.append(chunk)
+
+    stripped = ("".join(chunk).strip() for chunk in final_chunks)
+    return [chunk for chunk in stripped if chunk]
+
+
+def add_punctuation(text: str):
+    """Ensure ``text`` ends with punctuation, appending a full stop if needed.
+
+    The text is stripped first; empty text is returned as is. When the last
+    character is not in ``END_PUNCTUATION``, "。" is appended if the text
+    contains any character in U+4E00..U+9FFF, otherwise ".".
+    """
+    stripped = text.strip()
+    if not stripped or stripped[-1] in END_PUNCTUATION:
+        return stripped
+
+    has_cjk = any("\u4e00" <= ch <= "\u9fff" for ch in stripped)
+    return stripped + ("。" if has_cjk else ".")
diff --git a/omnivoice/utils/voice_design.py b/omnivoice/utils/voice_design.py
new file mode 100644
index 0000000000000000000000000000000000000000..802321d5060b5a93db40ffef2f42a1dc947dea19
--- /dev/null
+++ b/omnivoice/utils/voice_design.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+# Copyright 2026 Xiaomi Corp. (authors: Han Zhu)
+#
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Voice-design instruct constants for TTS inference.
+
+Defines speaker attribute tags (gender, age, pitch, accent, dialect) and
+translation/validation utilities between English and Chinese. Used by
+``OmniVoice.generate()`` for voice design mode.
+"""
+
+import re
+
+_ZH_RE = re.compile(r'[\u4e00-\u9fff]')
+
+# Each entry is one group of mutually exclusive tags. A dict entry maps an
+# English tag to its Chinese counterpart; a set entry (accents, dialects)
+# holds tags that exist in one language only.
+_INSTRUCT_CATEGORIES = [
+    {"male": "男", "female": "女"},
+    {"child": "儿童", "teenager": "少年", "young adult": "青年",
+     "middle-aged": "中年", "elderly": "老年"},
+    {"very low pitch": "极低音调", "low pitch": "低音调",
+     "moderate pitch": "中音调", "high pitch": "高音调",
+     "very high pitch": "极高音调"},
+    {"whisper": "耳语"},
+    # Accent (English-only, no Chinese counterpart)
+    {"american accent", "british accent", "australian accent",
+     "chinese accent", "canadian accent", "indian accent",
+     "korean accent", "portuguese accent", "russian accent",
+     "japanese accent"},
+    # Dialect (Chinese-only, no English counterpart)
+    {"河南话", "陕西话", "四川话", "贵州话", "云南话", "桂林话",
+     "济南话", "石家庄话", "甘肃话", "宁夏话", "青岛话", "东北话"},
+]
+
+_INSTRUCT_EN_TO_ZH = {}
+_INSTRUCT_ZH_TO_EN = {}
+_INSTRUCT_MUTUALLY_EXCLUSIVE = []
+for _cat in _INSTRUCT_CATEGORIES:
+    _INSTRUCT_MUTUALLY_EXCLUSIVE.append(set(_cat))
+    if isinstance(_cat, dict):
+        # Bilingual category: record both directions and both tag spellings.
+        _INSTRUCT_EN_TO_ZH.update(_cat)
+        _INSTRUCT_ZH_TO_EN.update(zip(_cat.values(), _cat.keys()))
+        _INSTRUCT_MUTUALLY_EXCLUSIVE[-1].update(_cat.values())
+
+# Every tag of every category, in either language.
+_INSTRUCT_ALL_VALID = set().union(*_INSTRUCT_MUTUALLY_EXCLUSIVE)
+
+# A tag counts as Chinese when it contains a character in U+4E00-U+9FFF.
+_INSTRUCT_VALID_ZH = frozenset(filter(_ZH_RE.search, _INSTRUCT_ALL_VALID))
+_INSTRUCT_VALID_EN = frozenset(_INSTRUCT_ALL_VALID - _INSTRUCT_VALID_ZH)
diff --git a/sync_data/configs/config.json b/sync_data/configs/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..dfefd5e0bdf24a9e91a72f064fb455c78fbed547
--- /dev/null
+++ b/sync_data/configs/config.json
@@ -0,0 +1,40 @@
+{
+ "llm_name_or_path": "Qwen/Qwen3-0.6B",
+ "audio_vocab_size": 1025,
+ "audio_mask_id": 1024,
+ "num_audio_codebook": 8,
+
+ "audio_codebook_weights": [8, 8, 6, 6, 4, 4, 2, 2],
+ "drop_cond_ratio": 0.1,
+ "prompt_ratio_range": [0.0, 0.3],
+ "mask_ratio_range": [0.0, 1.0],
+ "language_ratio": 0.8,
+ "use_pinyin_ratio": 0.0,
+ "instruct_ratio": 0.0,
+ "only_instruct_ratio": 0.0,
+
+ "resume_from_checkpoint": null,
+ "init_from_checkpoint": "oddadmix/lahgtna-omnivoice-v2",
+
+ "learning_rate": 1e-5,
+ "weight_decay": 0.01,
+ "max_grad_norm": 1.0,
+ "steps": 5000,
+ "seed": 42,
+ "warmup_type": "ratio",
+ "warmup_ratio": 0.01,
+ "warmup_steps": 0,
+
+ "batch_tokens": 4096,
+ "gradient_accumulation_steps": 2,
+ "num_workers": 3,
+
+ "mixed_precision": "bf16",
+ "allow_tf32": true,
+ "attn_implementation": "sdpa",
+
+ "logging_steps": 50,
+ "eval_steps": 500,
+ "save_steps": 500,
+ "keep_last_n_checkpoints": -1
+}
diff --git a/sync_data/configs/data_saudi.json b/sync_data/configs/data_saudi.json
new file mode 100644
index 0000000000000000000000000000000000000000..62db32ff7ede8e2787aac0c3b91157127ca1df26
--- /dev/null
+++ b/sync_data/configs/data_saudi.json
@@ -0,0 +1,16 @@
+{
+ "train": [
+ {
+ "language_id": "ar",
+ "manifest_path": ["/home/riftuser/OmniVoice/sync_data/tokens/train/data.lst"],
+ "repeat": 1
+ }
+ ],
+ "dev": [
+ {
+ "language_id": "ar",
+ "manifest_path": ["/home/riftuser/OmniVoice/sync_data/tokens/dev/data.lst"],
+ "repeat": 1
+ }
+ ]
+}
diff --git a/sync_data/data/dev_raw.jsonl b/sync_data/data/dev_raw.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..44c9ae666d7bb98f654b4f1ebac3dea0228b0a92
--- /dev/null
+++ b/sync_data/data/dev_raw.jsonl
@@ -0,0 +1,15 @@
+{"id": "sample_000194", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000194.wav", "text": "تحت نور القمر، الشوارع لنا.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000186", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000186.wav", "text": "قبل السباق، نعطيهم تمر للطاقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000122", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000122.wav", "text": "الحراس جايين! بسرعة، اختبي في الظلام!", "language_id": "ar", "instruct": "saudi, conversational, tense"}
+{"id": "sample_000089", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000089.wav", "text": "شوف! لقيت عملة من زمان!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000090", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000090.wav", "text": "نجهز للحج! وش لازم نجيب معنا؟", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000307", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000307.wav", "text": "يلا نبدا الدرس! تأكد من HP مالك وتحضّر للمستوى الجديد على صفحة ستة وخمسين.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000119", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000119.wav", "text": "يا هيوستن، طرنا! مهمتنا للمريخ تبدأ الحين!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000452", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000452.wav", "text": "شغل عدل! ادزلبتس استراتيجية الدفاع عن الواحة. خلنا نحتفل بكوب من القهوة مع GameMaster. رقم الصفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000127", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000127.wav", "text": "الحراس يغيرون النوبة. هذي فرصتنا نتسلل!", "language_id": "ar", "instruct": "saudi, conversational, tense"}
+{"id": "sample_000146", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000146.wav", "text": "الواحة تطلع مثل السراب، بس صدق.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000189", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000189.wav", "text": "غير الكفر بسرعة، ولا ما نطلع من هنا!", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000200", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000200.wav", "text": "غلي المويه، بعدين حط الهيل عشان الطعم يصير مضبوط.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000215", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000215.wav", "text": "افرك الورق على الجرح، العشب هذا مستخدم من زمان.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000016", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000016.wav", "text": "لازم نعبي موية زيادة للسفرة!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000132", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000132.wav", "text": "النجوم ترشدنا بليل البر.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
diff --git a/sync_data/data/train_raw.jsonl b/sync_data/data/train_raw.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..33c3b92aaec619d50667738a58d4ae43548126b9
--- /dev/null
+++ b/sync_data/data/train_raw.jsonl
@@ -0,0 +1,280 @@
+{"id": "sample_000364", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000364.wav", "text": "مبروك! انت حليت آخر لغز وربحت جائزة مميزة. تْشوف التفاصيل في صفحة مئة واثنين وأربعين!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000257", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000257.wav", "text": "شوف! هذا نجم الشمال. المسافرين أول كانوا يستخدمونه عشان يلقون طريقهم في الصحرا.", "language_id": "ar", "instruct": "saudi, conversational, awe-inspired"}
+{"id": "sample_000099", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000099.wav", "text": "يالله بسرعة! عدل حلاوة العيد قبل لا يجون الضيوف!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000093", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000093.wav", "text": "ياسلام! هالفلس العتيق يضبط بالضبط في يد التمثال!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000427", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000427.wav", "text": "يلا نخلي اللغز ذا سوا! شوفي صفحة مئة واثنين وأربعين للhint.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000294", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000294.wav", "text": "استعدوا يا رجيل، العدو جاي من الشرق عقب خمس عشرة دقيقة. صفر واحد تسعة صفر اثنان اثنان واحد اثنان ثلاثة ثلاثة اثنان", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000426", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000426.wav", "text": "شفتي الجمل يرقص جنب الواحَهْ البارحة؟ كان العرض عند ستة وخمسين شارع النجدي. It was super!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000130", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000130.wav", "text": "الاستعداد لهالرحلة شرف ومسؤولية.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000229", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000229.wav", "text": "القنبلة جاية! خبّ نفسك!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000166", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000166.wav", "text": "يتقدمون علينا! اثبتوا!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000113", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000113.wav", "text": "من البيوت الطينية للأبراج الزجاجية، سماء الرياض تحكي قصة تطورنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000475", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000475.wav", "text": "حياك الله في مجلس الصحارى. مهمتك تبتدي عند الغروب بْتمامَه في ستة وخمسين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000330", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000330.wav", "text": "أبغى منك تنظم تجمع العائلة في ستة وخمسين شارع النجدي. لا تنسى ترسل الدعوة بواتساب.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000318", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000318.wav", "text": "شفت شي غريب حول الواحة القديمه ليلة أمس؟ هذا مكتوب في سجل رقم خمسة ستة.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000396", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000396.wav", "text": "حياكم الله في المجلس الاستراتيجي. فريقنا مِستعد يواجه تحديات جديدة في صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000161", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000161.wav", "text": "كل خطوة قدام تقربنا من النصر.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000005", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000005.wav", "text": "الرمل يشبه كثبان الفضا من فوق!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000447", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000447.wav", "text": "لو سمحت اشرح لي كيف تعاملت مع موقف صعب في شغلك الأخير في Office خمسة وستين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000180", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000180.wav", "text": "يالله نلعب! أراهن إني أفوز!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000213", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000213.wav", "text": "تدرب على الدعوات، بتساعدك في الرحلة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000148", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000148.wav", "text": "أسمع شي بالظلام... يمكن الهوى.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000375", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000375.wav", "text": "يا شريك، صار الوقت نقول وداع! لا تنسى puzzle party يوم الجمعة في خمسة ستة شارع النجدي!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000214", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000214.wav", "text": "رم الشبكة لما تكون الموية راكدة، الصبر هو المفتاح.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000163", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000163.wav", "text": "لا تطلقون النار لين يقربون!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000009", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000009.wav", "text": "طور شريحة السرعة في جملك الروبوت للسباق!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000438", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000438.wav", "text": "قم اتبع الصقر إلى الواحَهْ باستخدام الmap في صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000256", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000256.wav", "text": "ياخي، اللعبة ذي بالنظارات قوية! حسيت إني أطير فوق الرياض صدق!", "language_id": "ar", "instruct": "saudi, conversational, thrilled"}
+{"id": "sample_000188", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000188.wav", "text": "دوس بنزين! قاعدين نفقدهم!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000343", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000343.wav", "text": "شوف ذا النقش القديم! كنه لقى كنز مخفي في صفحة مئة واثنين وأربعين من دليل اللعبة.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000123", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000123.wav", "text": "الأكسجين عندنا في خطر. لازم نوسع البيت الأخضر على طول.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000136", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000136.wav", "text": "كل خطوة تقربك من الراحة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000370", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000370.wav", "text": "قال الملك، 'وشلون الجمل صار في ستة وخمسين شارع Royal؟' يمكن يتفرج على Netflix!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000403", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000403.wav", "text": "مرحبًا بك في المقابلة! اجلس واستمتع بـ qahwa المشهورة في صفحة ستة وخمسين من دليلنا.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000287", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000287.wav", "text": "يا ولد العم! شفت الجمل الجديد للجد؟ اسمه 'Speedster ثلاثة آلاف'!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000160", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000160.wav", "text": "استعدوا، المعركة هذي بتكون صعبة.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000112", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000112.wav", "text": "الهبوب جاي! بسرعة، اضرب الخيمة وربط البعارين!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000460", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000460.wav", "text": "العدو قْرب من الكْثبان. جهز حرس القَصر للمعركة على صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000440", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000440.wav", "text": "يا هلا بالمسافر، مرحب فيك عند Checkpoint خمسة ستة! استانس بسباقات الجمال اللي عندنا.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000260", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000260.wav", "text": "الرموز القديمة على صخرة الفيل تطلع بس تحت ضوء القمر. يلا، فك رموزها قبل يطلع الفجر!", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000157", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000157.wav", "text": "إذا سجلنا الحين، المشروبات علي!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000350", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000350.wav", "text": "جهّز نفسَك للحج! تأكد من Google Maps وقابلنا عند الوّاحات في ستة وخمسين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000404", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000404.wav", "text": "مرحبتس في Strategy واحد صفر واحد: كيف تدزلب الجمال وتغلب الكثبان! صفحة مئة وثلاثة وعشرون", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000305", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000305.wav", "text": "تذكر كيف كان الهوا الصحراوي على الكْثُبان عند الأوسيَس صفحة مئة واثنان وأربعون؟", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000419", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000419.wav", "text": "تعلمتوا الاستراتيجيات. الحين طبقوها علشان تغزون أراضي الصحرة. شوفوا صفحة مئة واثنين وأربعين في كتاب Game Manual.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000209", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000209.wav", "text": "قل القصيدة الحربية، تذكرنا بشجاعة أجدادنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000010", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000010.wav", "text": "يلّا نلحق القطار قبل لا يفوتنا!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000236", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000236.wav", "text": "لازم نسيطر على الأهرامات، اندفع الحين!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000101", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000101.wav", "text": "تجهيز شنطة الحج مثل حل بازل مقدس!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000367", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000367.wav", "text": "لا تْقَلِق، الجمل عنده GPS. بس اتبع الإحداثيات على صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000342", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000342.wav", "text": "عرضك ظليل. ما أقدر أبيع بأقل من مئتي coins.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000477", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000477.wav", "text": "حياكم في المهرجان! تعالوا عند بوث خمسة ستة لتجربة ما تنساها.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000423", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000423.wav", "text": "هلا بك يا قايد! حضّر عساكرك للمهمة الأولى في واحة خمسة وستين! شوف الخريطة في جهازك الـTablet.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000253", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000253.wav", "text": "الشاورما خلصت! دق على المورد واطلب دجاج زيادة، بسرعة!", "language_id": "ar", "instruct": "saudi, conversational, busy"}
+{"id": "sample_000446", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000446.wav", "text": "تْذكّر أيام الوَاحَه، وين وُلِدَت Legends في صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000140", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000140.wav", "text": "الدلة جاهزة! اسكبها للمعازيم.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000109", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000109.wav", "text": "بالعيد، نبدا بزيارة الكبار، وبعدين الصغار. كذا نكرم الحكمة والبراءة مع بعض.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000450", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000450.wav", "text": "ليه الجمل ما رضى يتفاهم؟ خايف يقسم له الoasis برقم ستة وخمسين!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000164", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000164.wav", "text": "نكسر خط دفاعهم هنا، نفوز بالمعركة.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000273", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000273.wav", "text": "يلا، لازم نهرب قبل ما الجمل يزهق! ليفل خمسة يستنانا.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000366", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000366.wav", "text": "في الصحرَهْ، لقيت رسالة قديمة من جدّي على صفحة اثنين وأربعين في كتاب Desert Wisdom.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000153", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000153.wav", "text": "إذا ما سرعت، بنخسر قدام سكوتر!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000107", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000107.wav", "text": "في العيد، طلبت بساط طاير، بس جاني مكنسة كهرب بداله!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000359", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000359.wav", "text": "حذر! ذا الجمل يحب ياكل guidebooks. تلقاه عند ستة وخمسين شارع النجدي بعد العصر.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000117", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000117.wav", "text": "امش في الجسر اللي بين أبراج مركز المملكة. شوف الرياض من فوق، يا سلام!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000008", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000008.wav", "text": "كسّر الجدار عشان توقف هجمة الفيروس!", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000223", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000223.wav", "text": "اللي ينسى أصله، ما له مستقبل.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000339", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000339.wav", "text": "هلا والله! أنا دليلك اللي بيخذك في مغامرة وسط النْجود. يلا نبدأ من ستة وخمسين شارع النجدي!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000182", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000182.wav", "text": "الجمل ذا سلالته صافية، يستاهل كل ريال.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000013", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000013.wav", "text": "اقبض الطارات على السيف وقت العرضة!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000196", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000196.wav", "text": "الأساطير تنولد هنا، في شوارعنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000226", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000226.wav", "text": "عبي الشوزن، قاعدين يهجمون علينا!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000221", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000221.wav", "text": "الصحراء تعلم الصبر، مثل ما يقولون البدو.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000243", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000243.wav", "text": "خلك منتبه! إذا أطلقت الصقر بدري، بنخسر السباق!", "language_id": "ar", "instruct": "saudi, conversational, competitive"}
+{"id": "sample_000474", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000474.wav", "text": "خلنا نخطط الstrategy لوحتنا! اتصل على صفر واحد تسعة صفر اثنان اثنان واحد اثنان ثلاثة ثلاثة اثنان لعقد مجلس العائلة.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000293", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000293.wav", "text": "تذكر سوالف جدودنا واحنا واقفين عند بوابة المدينة رقم خمسة ستة جنب برج Kingdom.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000175", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000175.wav", "text": "طلقة مضبوطه تغير كل شي.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000275", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000275.wav", "text": "ليش الجمل دخل قبيلة Puzzle؟ عشان يحل ألغاز القفر في level ثلاثة!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000106", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000106.wav", "text": "من جسر السما في برج المملكة بالرياض، تقدر تلمس الغيوم!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000456", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000456.wav", "text": "جهز الإمدادات لرحلتنا. لازم نوصل ل Oasis أربعة اثنان قبل غروب الشمس.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000480", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000480.wav", "text": "علمتني وش شفت في ستة وخمسين شارع النجدي. كان فيه أحد مشبوه؟", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000141", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000141.wav", "text": "كل خطوة بهالرحلة تقربك للإيمان.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000241", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000241.wav", "text": "الهوى قوي الليلة، ثبت الخيمة زين في الأرض.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000421", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000421.wav", "text": "اقوموا يا محاربين! حتى الـcamel تخاف من خطتنا! الصفحة مئة واثنان وأربعون في الدليل يشرح الخطة.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000126", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000126.wav", "text": "عشان نوصل للمرحلة الجاية، لازم نعيد ترتيب هالرموز العتيقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000245", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000245.wav", "text": "احمِ المها من الصيادين، هي مهددة بالانقراض وتحتاج حمايتنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000444", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000444.wav", "text": "اسرع، حل اللغز! المفتاح مخفي في صفحة مئة واثنين وأربعين من الmanual.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000199", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000199.wav", "text": "افرد العجينة وزيد الصلصة بترتيب.", "language_id": "ar", "instruct": "saudi, conversational, creative"}
+{"id": "sample_000207", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000207.wav", "text": "جدل السعف زين، السلة بتشيل كثير تمر.", "language_id": "ar", "instruct": "saudi, conversational, creative"}
+{"id": "sample_000259", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000259.wav", "text": "الشركة حقتنا في البلوكتشين محتاجة استثمار أكثر. يلا نعرض على شركات التمويل الجريء السعودية!", "language_id": "ar", "instruct": "saudi, conversational, ambitious"}
+{"id": "sample_000152", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000152.wav", "text": "الموضوع مو بس عن الفوز، هو عن الدقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000195", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000195.wav", "text": "آخر جولة، ما بقى إلا المحترفين!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000108", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000108.wav", "text": "اللفافة العتيقة تقول: 'الكنز في المكان اللي ظل أطول منارة يبوس أقدم بير وقت العصر.'", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000095", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000095.wav", "text": "ياهوه! جا وقت نعلق فوانيس رمضان!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000145", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000145.wav", "text": "الخريطة تقول الكنز مدفون تحت الرمال اللي تتحرك.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000135", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000135.wav", "text": "إذا ما قتلنا الزحمة، الحر بيقتلنا!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000159", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000159.wav", "text": "نتقدم، لا توقفون الحين!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000198", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000198.wav", "text": "حرك الجريش لين يثقل، وبعدين زيد البهارات.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000387", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000387.wav", "text": "مبروك! الحين وصلت لآخر level، يلا نحتفل في ستة وخمسين شارع.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000219", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000219.wav", "text": "امش بالليل عشان تتفادى حرارة الصحرا.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000277", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000277.wav", "text": "ليه الجمل قدّم للوظيفَه؟ يبي stable position!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000268", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000268.wav", "text": "الفخ النبطي القديم يشتغل! بسرعة، حل لغز خريطة النجوم عشان توقفه!", "language_id": "ar", "instruct": "saudi, conversational, thrilling"}
+{"id": "sample_000190", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000190.wav", "text": "حط القذائف مضبوط، لو غلطنا نضيع!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000328", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000328.wav", "text": "وش عذرتس يا بنت عند ستة وخمسين شارع الواحة الساعه سبعة؟ شفتي سباق الجمل؟", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000179", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000179.wav", "text": "إذا تثق بصقرك، بيجيب الفريسة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000014", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000014.wav", "text": "الغرفة السرية تكشف الأسرار القديمة وقت الغروب.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000147", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000147.wav", "text": "القافلة تريح، بس البر ما ينام.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000232", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000232.wav", "text": "حط الحاجز هنا، بيكون وقفتنا الأخيرة.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000011", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000011.wav", "text": "ساعدني ألقى حرامي الضايع في خربطة الشنط!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000105", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000105.wav", "text": "تجهيز شنط الحج مثل حل لعبة المكعبات بحبات السبحة!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000225", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000225.wav", "text": "عبي السلاح بسرعة! إحنا تحت ضرب النار!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000178", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000178.wav", "text": "عجل بالحصان! قربنا نوصل للنهاية!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000181", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000181.wav", "text": "لازم نلقى لنا ملجأ قبل ما تجي العاصفة.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000121", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000121.wav", "text": "البطولة لنا! تعب فريقنا ما راح على الفاضي!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000270", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000270.wav", "text": "العملاق النبطي قاعد يصحى! بسرعة، استخدم العود السحري حقك عشان تصعقه، وبعدين اضربه بسيفك المعقوف!", "language_id": "ar", "instruct": "saudi, conversational, epic"}
+{"id": "sample_000373", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000373.wav", "text": "دور على المفتاح المخفي عشان تفتح الباب، وإلا بتظل هنا للأبد! ترى أقرب phone في الدور ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000285", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000285.wav", "text": "دور على الأثر القديم في manual صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000283", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000283.wav", "text": "بسرعة! لازم نلقى الscroll المخفي في المكتبه قبل يرجعون الحراس! المكتبه في دور ثلاثة.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000206", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000206.wav", "text": "حكم الخيوط زين، هالسجادة بتحمل قصص أهلنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000262", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000262.wav", "text": "لازم نوازن بين الاستدامة والتقنية. زيد المزارع العمودية في المنطقة خمسة.", "language_id": "ar", "instruct": "saudi, conversational, innovative"}
+{"id": "sample_000435", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000435.wav", "text": "بسرعة، علمني، وش هي عاصمة أستراليا؟ تلميحة: شوفي صفحة مئة واثنين وأربعين في الڤايد!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000291", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000291.wav", "text": "اتفقت القبيلة! بنلتقي في واحة ستة وخمسون. جبو أحسن strategies عندكم!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000280", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000280.wav", "text": "عدّينا كثبان واجد سوا؛ وداعًا يا رفيقي لين نرجع نلتقي في ليفل عشرة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000174", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000174.wav", "text": "محاصرين! لازم نطلعهم بقوة!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000197", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000197.wav", "text": "سر الكبسة يجي مع ظبط البهارات.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000278", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000278.wav", "text": "دير بالك! الجمل سرق قْهَوَتِس في المهرجان! الظاهر إنه عنده مئة HP مثل Boss في اللعبة!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000346", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000346.wav", "text": "هلا بك في الفريق، يا القايد. قاعدتك على ستة وخمسين شارع النجدي. جهّز استراتيجيتك.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000228", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000228.wav", "text": "عجل! عبي المدفع قبل ما يضربونا!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000170", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000170.wav", "text": "ما عندنا مؤونة كفاية، لازم ننسحب!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000234", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000234.wav", "text": "تراجع الحين! تجمع عند نقطة التفتيش الجاية!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000125", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000125.wav", "text": "ثلاثة، اثنان، واحد، انطلق! دعس البنزين وكون الأول!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000208", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000208.wav", "text": "طبخ الرز واللحم على نار هادية، الضيوف بيجون قريب.", "language_id": "ar", "instruct": "saudi, conversational, creative"}
+{"id": "sample_000151", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000151.wav", "text": "العدو قريب، لازم نهرب الحين!", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000102", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000102.wav", "text": "من فوق برج المملكة، تقدر تشوف باكر من اليوم!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000204", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000204.wav", "text": "الصبار فيه موية تكفينا أيام.", "language_id": "ar", "instruct": "saudi, conversational, creative"}
+{"id": "sample_000120", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000120.wav", "text": "يا ربعي! علامة الضرب تبين المكان على خريطة الكنز ذي!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000203", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000203.wav", "text": "خيط الشماغ للعريس، خلي الأطراف زينة.", "language_id": "ar", "instruct": "saudi, conversational, creative"}
+{"id": "sample_000424", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000424.wav", "text": "يا بطل! وش اسمك الملحمي قبل نخلّص العالم؟ حط اسمك هنا وشوف صفحة مئة وواحد.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000003", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000003.wav", "text": "حطينا في الدرعية القديمة! يالله بسرعة، عدّل طريقة كلامك لا يحسبونا جنّ الناس هنا!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000379", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000379.wav", "text": "ذي الأحجار القديمة في Ruins خمسة ستة تحكي سَالْفة ضايعة مع الزمان.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000381", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000381.wav", "text": "العدو جاي من ورى الرِّمال. جهّز الدفاعات في سيكتور خمسة ستة.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000454", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000454.wav", "text": "أذكر الأيام الزينات في الصحرا تحت ضو الگمر. تشبه Level خمسة باللعبة في صفحة مئة وثلاثة وعشرون.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000400", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000400.wav", "text": "النصر لنا! خلونا نركب الجمال ونحتفل، بس لا تنسى تسجل رقم خمسة ستة شارع النجدي في GPS.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000384", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000384.wav", "text": "لازم تسوي سعر أحسن لهالبضاعة، ولا بتخاطر تخسر الصفقة. شوف الشروط بصفحة ستة وخمسين من كتيب Steam.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000162", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000162.wav", "text": "اليد اللي ثابتة تفوز، مو اليد السريعة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000265", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000265.wav", "text": "الآلة القديمة تشتغل! بسرعة، حل لغز الكتابة القديمة عشان تدخل الغرفة المخبية!", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000248", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000248.wav", "text": "يبه، ليه صخرة الفيل وكل الصخور ذي أشكالها غريبة كذا؟", "language_id": "ar", "instruct": "saudi, conversational, curious"}
+{"id": "sample_000104", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000104.wav", "text": "شد حيلك! هالبقي على الكثبان أخشن من بعير فيه الزغطة!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000006", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000006.wav", "text": "البس نظارة الواقع عشان تشوف خريطة الكنز المخبية!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000393", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000393.wav", "text": "مرحبا بكم في بداية مغامرتنا! خلونا نشوف وش فيه بصفحة ستة وخمسون سوى!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000139", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000139.wav", "text": "الرموز باهتة، بس معناها قوي.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000463", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000463.wav", "text": "بسرعة، قولي لي وش شفتي ب ستة وخمسين شارع النجدي! كان مجلس Falcon؟", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000453", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000453.wav", "text": "شوف ذا النقوش القديمة! يمكن تكشف secret strategy عن خطة سرية. العنوان: ستة وخمسون شارع النجدي", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000231", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000231.wav", "text": "تراجع للغطاء، الوضع هنا يضيع!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000279", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000279.wav", "text": "جَمِّع السرعة عشان تفوز بكأس الصحراء. Level ثلاثة ينتظرك.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000173", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000173.wav", "text": "طلقة مضبوطة يعني فوز مضبوط.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000264", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000264.wav", "text": "المنطقة الآمنة تضيق! روح برج المملكة عشان تاخذ ميزة المكان العالي!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000227", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000227.wav", "text": "سوق الدبابة للمرتفع، نحتاج الأفضلية!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000183", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000183.wav", "text": "الحصان باسمه القوي يشيل فخر القبيلة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000115", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000115.wav", "text": "النقوش في الحجر تتكلم عن طرق التجارة القديمة. تقدر تفك شفرة كلامها؟", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000114", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000114.wav", "text": "جا وقت نعلق فانوس رمضان! يلا ننور بيتنا للشهر الفضيل.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000103", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000103.wav", "text": "في العيد، تمنيت بعير... بس جاني لعبة محشية بداله!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000478", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000478.wav", "text": "مرحبا بكم يا رْجال الصغار! خلونا نبدا الدرس بقوة وشجاعة. شوفوا صفحة مئة واثنان وأربعون في كتاب game.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000097", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000097.wav", "text": "الجليب العتيق يوسوس، 'حط ريال وتمن أمنية!'", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000201", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000201.wav", "text": "هشّك الرز بالشوكة عشان يطلع خفيف.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000205", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000205.wav", "text": "دربه يرجع بالصيد، الثقة تبني مع الوقت.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000276", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000276.wav", "text": "سويتِها! الحين الصحراء صارت بأمان. تعالوا نلتقي عند الواحة لاحتفال كبير مع الشلة في ليفل واحد صفر!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000274", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000274.wav", "text": "ذا الجمل اللي في حوشتس عميل سري ولا بس يعشق قهوة وPlayStation؟", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000374", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000374.wav", "text": "حيّاكم الله في المهرجان! انبسطوا بسباق الجمال وخذوا تمرات تس مجانًا من booth خمسة ستة.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000266", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000266.wav", "text": "انتبه! درونات الشركات تفحص المكان. استخدم جهاز الإخفاء السايبر حقك عشان ما ينكشف وجودك.", "language_id": "ar", "instruct": "saudi, conversational, futuristic"}
+{"id": "sample_000344", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000344.wav", "text": "حياك الله في هالمستوى من Puzzle! لا تضيع جملَك في متاهة النفود عند سبعة وستين شارع النبطي.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000184", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000184.wav", "text": "كلّم حصانك بهدوء، وبيثق فيك وقت المعركة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000224", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000224.wav", "text": "النار تدفي قلوبنا، والقصص تدفي الأرواح.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000252", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000252.wav", "text": "يالله، وزع الرماة على طرف طويق! ما نبي يخترقون القلعة!", "language_id": "ar", "instruct": "saudi, conversational, competitive"}
+{"id": "sample_000212", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000212.wav", "text": "الغزلان سريعة، امش على آثارها بحذر.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000218", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000218.wav", "text": "النقوش هذي تحكي قصة سقوط المدينة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000401", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000401.wav", "text": "السلام عليكم يا زائرٍ نبيل. أنا رفيقتس اللي بيدزلك في هذي الألغاز الغامضة في Level خمسة من اللعبة.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000110", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000110.wav", "text": "هالنقوش على الحجر تبين ساعة الماء القديمة في مدائن صالح. تقدر تشغلها من جديد؟", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000142", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000142.wav", "text": "الرمل يخبّي أكثر من العظام.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000111", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000111.wav", "text": "ترتيب النقل الجماعي لحجاج حملتنا مثل حل لغز صعب. كل واحد له طلبات مختلفة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000192", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000192.wav", "text": "دوّر السيارة! خلك ملك الشوارع!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000239", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000239.wav", "text": "خل القهوة تغلي ببطء، الطعم يصير مضبوط.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000448", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000448.wav", "text": "هلا، لقيت الجمل الضايع؟ دور في صفحة مئة واثنين وأربعين من الmanual!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000202", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000202.wav", "text": "خيط الثوب زين، كل شي مهم للعريس.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000242", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000242.wav", "text": "ابني الطاحونة في وجه الهوى، لازم تدور بدون مشاكل.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000263", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000263.wav", "text": "دبابات العدو جاية من الشرق! حمّل الطلقات اللي تخترق الدروع وصوّب!", "language_id": "ar", "instruct": "saudi, conversational, intense"}
+{"id": "sample_000267", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000267.wav", "text": "السباق يبدأ بعد خمسة دقايق! خذ لك سيارة سريعة وتعال لموقف استاد الملك فهد.", "language_id": "ar", "instruct": "saudi, conversational, satirical"}
+{"id": "sample_000414", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000414.wav", "text": "بسرعة! حل اللغز عشان تفتح الجمل قبل توصلنا القبيلة المنافسة. لا تنسى تسجل الرقم في صفحة ستة وخمسون بالدليل!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000220", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000220.wav", "text": "خلنا ندور أفضل الصفقات في السوق قبل ما نكمل.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000129", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000129.wav", "text": "لازم نلقى المفتاح المخفي علشان ندخل القبر.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000457", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000457.wav", "text": "شوف ذا الباب القديم! يحسسك إنه Portal مخفي لعالم ثاني، في خريطة ليفل ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000308", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000308.wav", "text": "تسذكر يوم اجتمعنا تحت النجوم بالصحراء؟ كنا نسولف عن PlayStation وشارع خمسة ستة بالرياض.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000177", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000177.wav", "text": "النجوم بتدلنا بالليل.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000155", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000155.wav", "text": "ما نخليهم ياخذون التل. اثبتوا!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000012", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000012.wav", "text": "بسرعة، طلّق الصقور على الطريدة اللي تفر!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000399", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000399.wav", "text": "تأكد من إرسال تفاصيل المعاهدة إلى المجلس عند الساعة العاشرة صباحًا عبر email.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000018", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000018.wav", "text": "النجوم تدلنا في الرمل اللي ما له نهاية.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000271", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000271.wav", "text": "انتبِه لخطواتك. الطريق هنا في ليفل خمسة خطير جدًا وsteep.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000158", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000158.wav", "text": "ثبت في مكانك، لا تفقد تركيزك!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000250", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000250.wav", "text": "المترو زحمة! لازم نلقى طريق أسرع للمول.", "language_id": "ar", "instruct": "saudi, conversational, stressed"}
+{"id": "sample_000131", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000131.wav", "text": "لازم نزين الخيمة للاحتفال الكبير!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000430", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000430.wav", "text": "هلا، خلونا نحفر هنا ونلقي treasure قديم! يمكن نلقى fossil الجمل؟ صفحة ثلاثمئة واثنا عشر في الدليل.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000412", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000412.wav", "text": "شفت سباق الجمال عند واحة ستة وخمسين؟ كان رهيب!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000238", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000238.wav", "text": "الهواء مثالي، خلنا نسبقهم لخط النهاية!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000144", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000144.wav", "text": "شلون كل محل هنا يبيع نفس الشي؟", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000340", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000340.wav", "text": "مبروك يا بطل! قِمتَ بقود قبيْلتك للمجد. لين نلتقي مرة ثانية في الويْحَه على صفحة مئة واثنين وأربعين من دليل اللعبة.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000193", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000193.wav", "text": "دوس بنزين! نتسابق على ملك الشوارع!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000326", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000326.wav", "text": "تذكر يوم اللي جالك تحدي وغلبته في مقابلة مع شركة RiyadhTech. رقم الصفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000137", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000137.wav", "text": "الرمل يتحرك مع كل خطوة، يخفي الطريق قدامنا.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000261", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000261.wav", "text": "الخط الأزرق زحمة! حوّل الركاب للخط الأخضر عشان يوصلون المول بالوقت!", "language_id": "ar", "instruct": "saudi, conversational, urgent"}
+{"id": "sample_000211", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000211.wav", "text": "شوف رجوله، قوية. بيكون ممتاز للقافلة.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000118", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000118.wav", "text": "من بيوت الدرعية الطينية لأبراج الرياض، عمارتنا تحكي قصتنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000337", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000337.wav", "text": "سمعتْ؟ خُطتْ المَلِگ الجديدة سرية مثل رقم الهاتف صفر خمسة صفر واحد اثنان ثلاثة أربعة خمسة ستة سبعة!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000230", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000230.wav", "text": "حط المتفجرات وارجع، بنفجر الجسر!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000092", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000092.wav", "text": "قلت 'قزازة موية'، مو 'بعير موية'!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000240", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000240.wav", "text": "ارفع السيف فوق، العرضة بتبدأ قريب!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000433", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000433.wav", "text": "تتذكر حكايات Al-Majlis القديمة؟ دايمًا تلهم الشجاعة. تذكر صفحة مئة واثنان وأربعون بالكتاب.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000254", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000254.wav", "text": "عشان نفتح الدور الجاي، لازم نحل اللغز الاقتصادي ذا. فكر مثل المستثمر!", "language_id": "ar", "instruct": "saudi, conversational, challenging"}
+{"id": "sample_000143", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000143.wav", "text": "الصقر حلق، ودانا وجهتنا.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000246", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000246.wav", "text": "يا سلام، قطعنا البلاد بساعة وحدة! الهايبرلوب ذا شي ثاني!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000128", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000128.wav", "text": "طيارات العدو وراك! سو حركات المراوغة!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000165", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000165.wav", "text": "قريبين نفوز! لا تخففون الضغط!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000169", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000169.wav", "text": "تحركوا! ما نقدر نجلس هنا أكثر!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000116", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000116.wav", "text": "المشوار لمكة مو بس للجسم. أنت جاهز روحياً للحج؟", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000168", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000168.wav", "text": "لو تسجل، العشا علي!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000297", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000297.wav", "text": "يا زين! جملنا جاب VIP pass للْحَجّ. دق على صفر خمسة صفر واحد اثنان ثلاثة أربعة خمسة ستة سبعة للتفاصيل.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000237", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000237.wav", "text": "نحتاج تعزيزات نفك الحصار عن بغداد!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000191", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000191.wav", "text": "انتبه من الحفر! ما نبي نطيح!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000468", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000468.wav", "text": "قوّموا المكينة! نْسابق صوب الويحات بسرعة صفر واحد تسعة صفر اثنان اثنان واحد اثنان ثلاثة ثلاثة اثنان!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000172", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000172.wav", "text": "بهالسرعة بنفوت العشا!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000100", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000100.wav", "text": "رمول الزمان خشت المفتاح؛ ما يشوفه إلا عين الصقر.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"}
+{"id": "sample_000323", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000323.wav", "text": "مرحبا بك في عالم الصحراء. استعد للمعركة، يا بطل. رحلتك تبدأ الآن في صفحة ستة وخمسون من Game Guide.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000134", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000134.wav", "text": "لقينا الخريطة! يالله نبدأ المغامرة!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000314", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000314.wav", "text": "حياك الله في مجلس التخطيط. من فضلك اذكر اسمك ودورك. حنا في غرفة رقم اثني عشر.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000394", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000394.wav", "text": "مرحبًا بك يا محارب. قول لنا عن رحلتك قُدّام المجلس على صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000149", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000149.wav", "text": "أحسن لك تكون أسرع من الهوى، وإلا راحت عليك!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000272", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000272.wav", "text": "صدى الحكمة القديمة باقي يتردد في هالأطلال المْنسية، كنك تلقى شي زي ليفل خمسة في لعبة strategy.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000133", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000133.wav", "text": "ضرب عازف العود على الأوتار، وبدأوا الناس يصفقون.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000244", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000244.wav", "text": "تأكد أن كل التوصيلات على الوقت، زحمة المدينة ممكن تسبب تأخير.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000015", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000015.wav", "text": "شوف! لقيت كتابة قديمة على الجدار ذا!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000091", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000091.wav", "text": "يبه، ليه نلبس ثياب يديدة للعيد؟", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000255", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000255.wav", "text": "وناسة عالم الشتاء! نجرب اللعبة الدوارة ولا صالة الجليد أول؟", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000418", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000418.wav", "text": "مرحبًا يا محاربين! خلونا نغزو الديرة ونسجل مئة نقطة!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000222", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000222.wav", "text": "القبيلة واقفة مع بعض مثل الجدار المتين.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000167", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000167.wav", "text": "يدك ثابتة وعقلك صافٍ، كذا تفوز.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000320", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000320.wav", "text": "مبروك! دزيت هالمستوى. لا تنسى تشيك على وقود الجمل بصفحة مئة واثنين وأربعين من الـ User Manual.", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000338", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000338.wav", "text": "تتذگر طعم القهوة مع التمر بالمجلس؟ أظن إنه كان في صالة رقم اثنان ثلاثة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000233", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000233.wav", "text": "حط الفخ هنا، بننصب لهم كمين مع الفجر.", "language_id": "ar", "instruct": "saudi, conversational, critical"}
+{"id": "sample_000235", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000235.wav", "text": "دور القناص قبل ما يضرب مرة ثانية!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000295", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000295.wav", "text": "بسرعة، دوّر على المفتاح المخفي في القاعة majestic قبل ما يرجعون الحراس! شوف صفحة مئة واثنين وأربعين للخطوات.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000150", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000150.wav", "text": "لازم نرجع نرتب الصفوف قبل ما يهجمون مرة ثانية.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000216", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000216.wav", "text": "اطلع بحذر، أفضل التمر فوق.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000094", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000094.wav", "text": "شوف! الكعبة قدامنا على طول!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000171", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000171.wav", "text": "شكله الكفر انفجر، لازم نصلحه بسرعة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000356", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000356.wav", "text": "ارسل الكشافة لجهة الكثبان الشرقية يدورون عن إشارات كمين. لا تنسى تحدث الخريطة في جهاز GPS اللي معك للإصدار اثنان ثلاثة.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000392", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000392.wav", "text": "تشوف الصقور يطرن فوق الجبل؟ خلنا ننضم معهم في الصفحة مئة واثنان وأربعون من Mountain Quest!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000138", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000138.wav", "text": "الغنم جاهزة! يالله نبدأ العزيمة!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000351", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000351.wav", "text": "حياكم الله في سوق التجارة! شوفوا السرج الخاص بالجمل عندنا بسعر مئة وتسعة وتسعين ريال!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000096", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000096.wav", "text": "التجهيز للحج مثل لعبة الطناخة بس حقيقية!", "language_id": "ar", "instruct": "saudi, conversational, humorous"}
+{"id": "sample_000210", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000210.wav", "text": "السعر غالي، خلنا نتفق على شي ينفعنا اثنيننا.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000251", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000251.wav", "text": "متأكد إن البالون ذا يشيلنا كلنا؟ أحس إنه يترنح شوي فوق!", "language_id": "ar", "instruct": "saudi, conversational, nervous"}
+{"id": "sample_000258", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000258.wav", "text": "ياخي، هذا مرشد سياحي تصويري؟ كأنك تمشي بالماضي والمستقبل مع بعض!", "language_id": "ar", "instruct": "saudi, conversational, curious"}
+{"id": "sample_000124", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000124.wav", "text": "العاصفة تقرب! لازم نوصل المنطقة الآمنة بسرعة!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000269", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000269.wav", "text": "فوت الكورة بين رجلين المدافع وسو تمريرة حايط مع ربيعك عشان تسجل في زقاق السوق!", "language_id": "ar", "instruct": "saudi, conversational, energetic"}
+{"id": "sample_000455", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000455.wav", "text": "اذكر يوم تعلمنا عن قبائل Bedouin وتقاليدهم في صفحة أربعة وثلاثين من كتاب History.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000467", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000467.wav", "text": "لقَ ذا الواحَهْ المخفّية قبل غروب الشمس وقَم المخيم قريب من الأنقاض القديمة في شارع الملك عبد الله رقم سبعة تسعة.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000247", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000247.wav", "text": "شوف الحفلة التصويرية ذي! التقنية غيرت جو الترفيه عندنا بشكل!", "language_id": "ar", "instruct": "saudi, conversational, amazed"}
+{"id": "sample_000185", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000185.wav", "text": "حصان الفارس أعز صديق له وقت القتال.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000154", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000154.wav", "text": "باقي لك هدف واحد. خلها تضبط.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000442", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000442.wav", "text": "الاجتماع العائلي في الواحة مهم جداً. لازم نسوي الخطة في صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000249", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000249.wav", "text": "السيارات الكهربا ساكتة مرة! كأننا نسابق في المستقبل!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000459", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000459.wav", "text": "تذكر يوم تسابقنا فوق الكْثبان الرملية في Desert Racer، ندزلب غروب الشمس؟ قابلني عند ستة وخمسين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000156", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000156.wav", "text": "قريب توصل، باقي لك لفة!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000007", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000007.wav", "text": "برمج الدرونات تسقي المزارع اللي فوق!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000217", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000217.wav", "text": "الحجارة هذي من قرون واقفة، اسمع قصصها.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000341", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000341.wav", "text": "تْفَكَّر فِي الرِّحْلَة عبر الصّحْرَا لِـ Mecca. تذَكَّر سَوالِف أَجْدَادْنَا اللي قالوها في صفحة مئة واثنين وأربعين.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000187", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000187.wav", "text": "الدبابة جايه من الشرق! جهز المدفع!", "language_id": "ar", "instruct": "saudi, conversational, serious"}
+{"id": "sample_000302", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000302.wav", "text": "شفت أحد مشبوه قريب من الوَحَه حول الساعه صفر تسعة: صفر صفر؟ يمكن كان لابس قميص مكتوب عليه Desert Eagle.", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000176", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000176.wav", "text": "فاضي! مرر الكورة!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
+{"id": "sample_000017", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000017.wav", "text": "نقوش الحنا حق جدتي مثل القصيد اللي يسيل.", "language_id": "ar", "instruct": "saudi, conversational, reflective"}
+{"id": "sample_000407", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000407.wav", "text": "شوف! الكثبان تلمع مثل fireflies! يلا نحل هال puzzle بسرعه!", "language_id": "ar", "instruct": "saudi, conversational, excited"}
diff --git a/sync_data/tokens/dev/data.lst b/sync_data/tokens/dev/data.lst
new file mode 100644
index 0000000000000000000000000000000000000000..ec6c1f750231f415e86915481bba0bbd683b8889
--- /dev/null
+++ b/sync_data/tokens/dev/data.lst
@@ -0,0 +1,15 @@
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000000.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000000.jsonl 1 4.200
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000001.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000001.jsonl 1 2.840
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000002.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000002.jsonl 1 4.840
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000003.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000003.jsonl 1 4.840
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000004.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000004.jsonl 1 2.720
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000005.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000005.jsonl 1 7.280
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000006.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000006.jsonl 1 3.480
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000007.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000007.jsonl 1 2.840
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000008.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000008.jsonl 1 9.720
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000009.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000009.jsonl 1 3.800
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000010.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000010.jsonl 1 3.360
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000011.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000011.jsonl 1 4.880
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000012.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000012.jsonl 1 3.840
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000013.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000013.jsonl 1 3.160
+/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000014.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000014.jsonl 1 3.360
diff --git a/sync_data/tokens/dev/errors.jsonl b/sync_data/tokens/dev/errors.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sync_data/tokens/dev/txts/shard-000000.jsonl b/sync_data/tokens/dev/txts/shard-000000.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..77facbe9da1adbeea0c2e54840db31915a59ff99
--- /dev/null
+++ b/sync_data/tokens/dev/txts/shard-000000.jsonl
@@ -0,0 +1 @@
+{"id": "sample_000127", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000127.wav", "text": "الحراس يغيرون النوبة. هذي فرصتنا نتسلل!", "language_id": "ar", "instruct": "saudi, conversational, tense", "audio_duration": 4.2, "num_tokens": 105}
diff --git a/sync_data/tokens/dev/txts/shard-000001.jsonl b/sync_data/tokens/dev/txts/shard-000001.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..ebc3da34ac4249d958409d971a5dc73ffdbb8c67
--- /dev/null
+++ b/sync_data/tokens/dev/txts/shard-000001.jsonl
@@ -0,0 +1 @@
+{"id": "sample_000016", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000016.wav", "text": "لازم نعبي موية زيادة للسفرة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 2.84, "num_tokens": 71}
diff --git a/sync_data/tokens/dev/txts/shard-000002.jsonl b/sync_data/tokens/dev/txts/shard-000002.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..c39504e0df05dc4fb38434fba140f64a44f34a19
--- /dev/null
+++ b/sync_data/tokens/dev/txts/shard-000002.jsonl
@@ -0,0 +1 @@
+{"id": "sample_000119", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000119.wav", "text": "يا هيوستن، طرنا! مهمتنا للمريخ تبدأ الحين!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.84, "num_tokens": 121}
diff --git a/sync_data/tokens/dev/txts/shard-000003.jsonl b/sync_data/tokens/dev/txts/shard-000003.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..f5b91fdd6d124f2d4e5880c9172a4c16f2c03802
--- /dev/null
+++ b/sync_data/tokens/dev/txts/shard-000003.jsonl
@@ -0,0 +1 @@
+{"id": "sample_000215", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000215.wav", "text": "افرك الورق على الجرح، العشب هذا مستخدم من زمان.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.84, "num_tokens": 121}
diff --git a/sync_data/tokens/dev/txts/shard-000004.jsonl b/sync_data/tokens/dev/txts/shard-000004.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..f4449cfbf53bbbde7546c823403cdb8ca1946577
--- /dev/null
+++ b/sync_data/tokens/dev/txts/shard-000004.jsonl
@@ -0,0 +1 @@
+{"id": "sample_000132", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000132.wav", "text": "النجوم ترشدنا بليل البر.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 2.72, "num_tokens": 68}
diff --git a/sync_data/tokens/dev/txts/shard-000007.jsonl b/sync_data/tokens/dev/txts/shard-000007.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..0a7b73c9d3ac6b7d65c8448c22161b273c381d83
--- /dev/null
+++ b/sync_data/tokens/dev/txts/shard-000007.jsonl
@@ -0,0 +1 @@
+{"id": "sample_000089", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000089.wav", "text": "شوف! لقيت عملة من زمان!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 2.84, "num_tokens": 71}
diff --git a/sync_data/tokens/dev/txts/shard-000008.jsonl b/sync_data/tokens/dev/txts/shard-000008.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..bbf8a7d77e89f6365826b23d827a91d62ced464e
--- /dev/null
+++ b/sync_data/tokens/dev/txts/shard-000008.jsonl
@@ -0,0 +1 @@
+{"id": "sample_000452", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000452.wav", "text": "شغل عدل! ادزلبتس استراتيجية الدفاع عن الواحة. خلنا نحتفل بكوب من القهوة مع GameMaster. رقم الصفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 9.72, "num_tokens": 243}
diff --git a/sync_data/tokens/dev/txts/shard-000010.jsonl b/sync_data/tokens/dev/txts/shard-000010.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..74ecadc871e8ef90121f0acfce58334c58a57146
--- /dev/null
+++ b/sync_data/tokens/dev/txts/shard-000010.jsonl
@@ -0,0 +1 @@
+{"id": "sample_000090", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000090.wav", "text": "نجهز للحج! وش لازم نجيب معنا؟", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.36, "num_tokens": 84}
diff --git a/sync_data/tokens/dev/txts/shard-000011.jsonl b/sync_data/tokens/dev/txts/shard-000011.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..96e7fe923f5b1c4c389143fe9d6be0d5e07e49e1
--- /dev/null
+++ b/sync_data/tokens/dev/txts/shard-000011.jsonl
@@ -0,0 +1 @@
+{"id": "sample_000200", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000200.wav", "text": "غلي المويه، بعدين حط الهيل عشان الطعم يصير مضبوط.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.88, "num_tokens": 122}
diff --git a/sync_data/tokens/dev/txts/shard-000013.jsonl b/sync_data/tokens/dev/txts/shard-000013.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..e186e53bb82e7f180a3f7dff9e2efebb4b0acbd7
--- /dev/null
+++ b/sync_data/tokens/dev/txts/shard-000013.jsonl
@@ -0,0 +1 @@
+{"id": "sample_000186", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000186.wav", "text": "قبل السباق، نعطيهم تمر للطاقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.16, "num_tokens": 79}
diff --git a/sync_data/tokens/dev/txts/shard-000014.jsonl b/sync_data/tokens/dev/txts/shard-000014.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..5a2fc147d3014e4b9faf44d87cda601068b1263d
--- /dev/null
+++ b/sync_data/tokens/dev/txts/shard-000014.jsonl
@@ -0,0 +1 @@
+{"id": "sample_000189", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000189.wav", "text": "غير الكفر بسرعة، ولا ما نطلع من هنا!", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.36, "num_tokens": 84}
diff --git a/sync_data/tokens/train/data.lst b/sync_data/tokens/train/data.lst
new file mode 100644
index 0000000000000000000000000000000000000000..610df5b33763993ec28fc04b5e0c2e062b28baa7
--- /dev/null
+++ b/sync_data/tokens/train/data.lst
@@ -0,0 +1,35 @@
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000000.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000000.jsonl 8 42.280
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000001.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000001.jsonl 8 44.000
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000002.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000002.jsonl 8 48.000
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000003.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000003.jsonl 8 35.560
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000004.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000004.jsonl 8 45.360
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000005.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000005.jsonl 8 41.680
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000006.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000006.jsonl 8 49.120
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000007.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000007.jsonl 8 47.360
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000008.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000008.jsonl 8 41.840
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000009.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000009.jsonl 8 35.160
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000010.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000010.jsonl 8 41.360
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000011.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000011.jsonl 8 33.080
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000012.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000012.jsonl 8 41.360
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000013.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000013.jsonl 8 46.160
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000014.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000014.jsonl 8 43.760
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000015.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000015.jsonl 8 47.520
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000016.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000016.jsonl 8 42.800
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000017.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000017.jsonl 8 30.440
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000018.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000018.jsonl 8 42.040
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000019.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000019.jsonl 8 44.720
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000020.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000020.jsonl 8 37.600
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000021.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000021.jsonl 8 43.320
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000022.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000022.jsonl 8 44.960
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000023.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000023.jsonl 8 43.880
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000024.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000024.jsonl 8 51.960
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000025.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000025.jsonl 8 38.240
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000026.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000026.jsonl 8 48.400
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000027.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000027.jsonl 8 40.320
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000028.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000028.jsonl 8 33.960
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000029.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000029.jsonl 8 56.080
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000030.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000030.jsonl 8 44.080
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000031.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000031.jsonl 8 37.160
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000032.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000032.jsonl 8 44.600
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000033.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000033.jsonl 8 35.480
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000034.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000034.jsonl 8 50.640
diff --git a/sync_data/tokens/train/errors.jsonl b/sync_data/tokens/train/errors.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sync_data/tokens/train/txts/shard-000000.jsonl b/sync_data/tokens/train/txts/shard-000000.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..824a974d463b8266fdef90892b107674e770a844
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000000.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000144", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000144.wav", "text": "شلون كل محل هنا يبيع نفس الشي؟", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 2.96, "num_tokens": 74}
+{"id": "sample_000171", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000171.wav", "text": "شكله الكفر انفجر، لازم نصلحه بسرعة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.64, "num_tokens": 91}
+{"id": "sample_000294", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000294.wav", "text": "استعدوا يا رجيل، العدو جاي من الشرق عقب خمس عشرة دقيقة. صفر واحد تسعة صفر اثنان اثنان واحد اثنان ثلاثة ثلاثة اثنان", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.44, "num_tokens": 111}
+{"id": "sample_000167", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000167.wav", "text": "يدك ثابتة وعقلك صافٍ، كذا تفوز.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.52, "num_tokens": 113}
+{"id": "sample_000399", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000399.wav", "text": "تأكد من إرسال تفاصيل المعاهدة إلى المجلس عند الساعة العاشرة صباحًا عبر email.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 6.48, "num_tokens": 162}
+{"id": "sample_000421", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000421.wav", "text": "اقوموا يا محاربين! حتى الـcamel تخاف من خطتنا! الصفحة مئة واثنان وأربعون في الدليل يشرح الخطة.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 9.6, "num_tokens": 240}
+{"id": "sample_000166", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000166.wav", "text": "يتقدمون علينا! اثبتوا!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 2.68, "num_tokens": 67}
+{"id": "sample_000426", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000426.wav", "text": "شفتي الجمل يرقص جنب الواحَهْ البارحة؟ كان العرض عند ستة وخمسين شارع النجدي. It was super!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 7.96, "num_tokens": 199}
diff --git a/sync_data/tokens/train/txts/shard-000001.jsonl b/sync_data/tokens/train/txts/shard-000001.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..ca8bbf77cb397c134773fcfa1c0fc8bbb863d6af
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000001.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000367", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000367.wav", "text": "لا تْقَلِق، الجمل عنده GPS. بس اتبع الإحداثيات على صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 6.32, "num_tokens": 158}
+{"id": "sample_000193", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000193.wav", "text": "دوس بنزين! نتسابق على ملك الشوارع!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.72, "num_tokens": 93}
+{"id": "sample_000320", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000320.wav", "text": "مبروك! دزيت هالمستوى. لا تنسى تشيك على وقود الجمل بصفحة مئة واثنين وأربعين من الـ User Manual.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 8.68, "num_tokens": 217}
+{"id": "sample_000270", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000270.wav", "text": "العملاق النبطي قاعد يصحى! بسرعة، استخدم العود السحري حقك عشان تصعقه، وبعدين اضربه بسيفك المعقوف!", "language_id": "ar", "instruct": "saudi, conversational, epic", "audio_duration": 8.56, "num_tokens": 214}
+{"id": "sample_000393", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000393.wav", "text": "مرحبا بكم في بداية مغامرتنا! خلونا نشوف وش فيه بصفحة ستة وخمسون سوى!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.72, "num_tokens": 168}
+{"id": "sample_000094", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000094.wav", "text": "شوف! الكعبة قدامنا على طول!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.0, "num_tokens": 75}
+{"id": "sample_000152", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000152.wav", "text": "الموضوع مو بس عن الفوز، هو عن الدقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.84, "num_tokens": 96}
+{"id": "sample_000204", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000204.wav", "text": "الصبار فيه موية تكفينا أيام.", "language_id": "ar", "instruct": "saudi, conversational, creative", "audio_duration": 3.16, "num_tokens": 79}
diff --git a/sync_data/tokens/train/txts/shard-000002.jsonl b/sync_data/tokens/train/txts/shard-000002.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..bcc07be20a53c26f3e1d10c199e0bbc8df790869
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000002.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000134", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000134.wav", "text": "لقينا الخريطة! يالله نبدأ المغامرة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.6, "num_tokens": 90}
+{"id": "sample_000381", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000381.wav", "text": "العدو جاي من ورى الرِّمال. جهّز الدفاعات في سيكتور خمسة ستة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.0, "num_tokens": 125}
+{"id": "sample_000379", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000379.wav", "text": "ذي الأحجار القديمة في Ruins خمسة ستة تحكي سَالْفة ضايعة مع الزمان.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 7.92, "num_tokens": 198}
+{"id": "sample_000014", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000014.wav", "text": "الغرفة السرية تكشف الأسرار القديمة وقت الغروب.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 4.84, "num_tokens": 121}
+{"id": "sample_000278", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000278.wav", "text": "دير بالك! الجمل سرق قْهَوَتِس في المهرجان! الظاهر إنه عنده مئة HP مثل Boss في اللعبة!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 8.2, "num_tokens": 205}
+{"id": "sample_000107", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000107.wav", "text": "في العيد، طلبت بساط طاير، بس جاني مكنسة كهرب بداله!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.28, "num_tokens": 132}
+{"id": "sample_000178", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000178.wav", "text": "عجل بالحصان! قربنا نوصل للنهاية!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.56, "num_tokens": 89}
+{"id": "sample_000474", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000474.wav", "text": "خلنا نخطط الstrategy لوحتنا! اتصل على صفر واحد تسعة صفر اثنان اثنان واحد اثنان ثلاثة ثلاثة اثنان لعقد مجلس العائلة.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 9.6, "num_tokens": 240}
diff --git a/sync_data/tokens/train/txts/shard-000003.jsonl b/sync_data/tokens/train/txts/shard-000003.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..1261997f398a8637e39d7ac3a5e36201dd543b6d
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000003.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000477", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000477.wav", "text": "حياكم في المهرجان! تعالوا عند بوث خمسة ستة لتجربة ما تنساها.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.72, "num_tokens": 143}
+{"id": "sample_000128", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000128.wav", "text": "طيارات العدو وراك! سو حركات المراوغة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.72, "num_tokens": 93}
+{"id": "sample_000217", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000217.wav", "text": "الحجارة هذي من قرون واقفة، اسمع قصصها.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.12, "num_tokens": 103}
+{"id": "sample_000187", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000187.wav", "text": "الدبابة جايه من الشرق! جهز المدفع!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.48, "num_tokens": 87}
+{"id": "sample_000151", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000151.wav", "text": "العدو قريب، لازم نهرب الحين!", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 2.84, "num_tokens": 71}
+{"id": "sample_000456", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000456.wav", "text": "جهز الإمدادات لرحلتنا. لازم نوصل ل Oasis أربعة اثنان قبل غروب الشمس.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 6.56, "num_tokens": 164}
+{"id": "sample_000168", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000168.wav", "text": "لو تسجل، العشا علي!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 2.72, "num_tokens": 68}
+{"id": "sample_000118", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000118.wav", "text": "من بيوت الدرعية الطينية لأبراج الرياض، عمارتنا تحكي قصتنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 6.4, "num_tokens": 160}
diff --git a/sync_data/tokens/train/txts/shard-000004.jsonl b/sync_data/tokens/train/txts/shard-000004.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..d18a5416fdd9fc52b691dee3abf628b90340f5d7
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000004.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000295", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000295.wav", "text": "بسرعة، دوّر على المفتاح المخفي في القاعة majestic قبل ما يرجعون الحراس! شوف صفحة مئة واثنين وأربعين للخطوات.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.8, "num_tokens": 170}
+{"id": "sample_000092", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000092.wav", "text": "قلت 'قزازة موية'، مو 'بعير موية'!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 3.72, "num_tokens": 93}
+{"id": "sample_000007", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000007.wav", "text": "برمج الدرونات تسقي المزارع اللي فوق!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.2, "num_tokens": 80}
+{"id": "sample_000340", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000340.wav", "text": "مبروك يا بطل! قِمتَ بقود قبيْلتك للمجد. لين نلتقي مرة ثانية في الويْحَه على صفحة مئة واثنين وأربعين من دليل اللعبة.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 9.92, "num_tokens": 248}
+{"id": "sample_000227", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000227.wav", "text": "سوق الدبابة للمرتفع، نحتاج الأفضلية!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.64, "num_tokens": 91}
+{"id": "sample_000274", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000274.wav", "text": "ذا الجمل اللي في حوشتس عميل سري ولا بس يعشق قهوة وPlayStation؟", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.92, "num_tokens": 148}
+{"id": "sample_000478", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000478.wav", "text": "مرحبا بكم يا رْجال الصغار! خلونا نبدا الدرس بقوة وشجاعة. شوفوا صفحة مئة واثنان وأربعون في كتاب game.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 8.92, "num_tokens": 223}
+{"id": "sample_000160", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000160.wav", "text": "استعدوا، المعركة هذي بتكون صعبة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.24, "num_tokens": 81}
diff --git a/sync_data/tokens/train/txts/shard-000005.jsonl b/sync_data/tokens/train/txts/shard-000005.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..952c7b62277bc4d917902b33dea843029746d819
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000005.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000305", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000305.wav", "text": "تذكر كيف كان الهوا الصحراوي على الكْثُبان عند الأوسيَس صفحة مئة واثنان وأربعون؟", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 7.08, "num_tokens": 177}
+{"id": "sample_000250", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000250.wav", "text": "المترو زحمة! لازم نلقى طريق أسرع للمول.", "language_id": "ar", "instruct": "saudi, conversational, stressed", "audio_duration": 4.0, "num_tokens": 100}
+{"id": "sample_000223", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000223.wav", "text": "اللي ينسى أصله، ما له مستقبل.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.2, "num_tokens": 80}
+{"id": "sample_000237", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000237.wav", "text": "نحتاج تعزيزات نفك الحصار عن بغداد!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.48, "num_tokens": 87}
+{"id": "sample_000245", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000245.wav", "text": "احمِ المها من الصيادين، هي مهددة بالانقراض وتحتاج حمايتنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 5.2, "num_tokens": 130}
+{"id": "sample_000403", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000403.wav", "text": "مرحبًا بك في المقابلة! اجلس واستمتع بـ qahwa المشهورة في صفحة ستة وخمسين من دليلنا.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 7.04, "num_tokens": 176}
+{"id": "sample_000116", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000116.wav", "text": "المشوار لمكة مو بس للجسم. أنت جاهز روحياً للحج؟", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 5.36, "num_tokens": 134}
+{"id": "sample_000394", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000394.wav", "text": "مرحبًا بك يا محارب. قول لنا عن رحلتك قُدّام المجلس على صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 6.32, "num_tokens": 158}
diff --git a/sync_data/tokens/train/txts/shard-000006.jsonl b/sync_data/tokens/train/txts/shard-000006.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..d353c62bba197b9472cb6975cae984e8b079a5c9
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000006.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000150", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000150.wav", "text": "لازم نرجع نرتب الصفوف قبل ما يهجمون مرة ثانية.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.04, "num_tokens": 101}
+{"id": "sample_000433", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000433.wav", "text": "تتذكر حكايات Al-Majlis القديمة؟ دايمًا تلهم الشجاعة. تذكر صفحة مئة واثنان وأربعون بالكتاب.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 9.28, "num_tokens": 232}
+{"id": "sample_000418", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000418.wav", "text": "مرحبًا يا محاربين! خلونا نغزو الديرة ونسجل مئة نقطة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.88, "num_tokens": 147}
+{"id": "sample_000125", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000125.wav", "text": "ثلاثة، اثنان، واحد، انطلق! دعس البنزين وكون الأول!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 7.72, "num_tokens": 193}
+{"id": "sample_000247", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000247.wav", "text": "شوف الحفلة التصويرية ذي! التقنية غيرت جو الترفيه عندنا بشكل!", "language_id": "ar", "instruct": "saudi, conversational, amazed", "audio_duration": 6.08, "num_tokens": 152}
+{"id": "sample_000010", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000010.wav", "text": "يلّا نلحق القطار قبل لا يفوتنا!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 2.52, "num_tokens": 63}
+{"id": "sample_000427", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000427.wav", "text": "يلا نخلي اللغز ذا سوا! شوفي صفحة مئة واثنين وأربعين للhint.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.04, "num_tokens": 126}
+{"id": "sample_000430", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000430.wav", "text": "هلا، خلونا نحفر هنا ونلقي treasure قديم! يمكن نلقى fossil الجمل؟ صفحة ثلاثمئة واثنا عشر في الدليل.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 8.56, "num_tokens": 214}
diff --git a/sync_data/tokens/train/txts/shard-000007.jsonl b/sync_data/tokens/train/txts/shard-000007.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..636b66b6321ff31ead332d6a0fcd3b1984acb1e1
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000007.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000285", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000285.wav", "text": "دور على الأثر القديم في manual صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.52, "num_tokens": 113}
+{"id": "sample_000268", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000268.wav", "text": "الفخ النبطي القديم يشتغل! بسرعة، حل لغز خريطة النجوم عشان توقفه!", "language_id": "ar", "instruct": "saudi, conversational, thrilling", "audio_duration": 6.16, "num_tokens": 154}
+{"id": "sample_000159", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000159.wav", "text": "نتقدم، لا توقفون الحين!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 2.64, "num_tokens": 66}
+{"id": "sample_000453", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000453.wav", "text": "شوف ذا النقوش القديمة! يمكن تكشف secret strategy عن خطة سرية. العنوان: ستة وخمسون شارع النجدي", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 9.04, "num_tokens": 226}
+{"id": "sample_000257", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000257.wav", "text": "شوف! هذا نجم الشمال. المسافرين أول كانوا يستخدمونه عشان يلقون طريقهم في الصحرا.", "language_id": "ar", "instruct": "saudi, conversational, awe-inspired", "audio_duration": 6.92, "num_tokens": 173}
+{"id": "sample_000480", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000480.wav", "text": "علمتني وش شفت في ستة وخمسين شارع النجدي. كان فيه أحد مشبوه؟", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.28, "num_tokens": 132}
+{"id": "sample_000440", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000440.wav", "text": "يا هلا بالمسافر، مرحب فيك عند Checkpoint خمسة ستة! استانس بسباقات الجمال اللي عندنا.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 7.48, "num_tokens": 187}
+{"id": "sample_000444", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000444.wav", "text": "اسرع، حل اللغز! المفتاح مخفي في صفحة مئة واثنين وأربعين من الmanual.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.32, "num_tokens": 133}
diff --git a/sync_data/tokens/train/txts/shard-000008.jsonl b/sync_data/tokens/train/txts/shard-000008.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..1af0dc657a8d0f4566ae90b7a4a1dcb3968f31ed
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000008.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000102", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000102.wav", "text": "من فوق برج المملكة، تقدر تشوف باكر من اليوم!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.4, "num_tokens": 110}
+{"id": "sample_000149", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000149.wav", "text": "أحسن لك تكون أسرع من الهوى، وإلا راحت عليك!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 4.36, "num_tokens": 109}
+{"id": "sample_000407", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000407.wav", "text": "شوف! الكثبان تلمع مثل fireflies! يلا نحل هال puzzle بسرعه!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.12, "num_tokens": 153}
+{"id": "sample_000249", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000249.wav", "text": "السيارات الكهربا ساكتة مرة! كأننا نسابق في المستقبل!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.84, "num_tokens": 121}
+{"id": "sample_000231", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000231.wav", "text": "تراجع للغطاء، الوضع هنا يضيع!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.4, "num_tokens": 85}
+{"id": "sample_000232", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000232.wav", "text": "حط الحاجز هنا، بيكون وقفتنا الأخيرة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.16, "num_tokens": 79}
+{"id": "sample_000339", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000339.wav", "text": "هلا والله! أنا دليلك اللي بيخذك في مغامرة وسط النْجود. يلا نبدأ من ستة وخمسين شارع النجدي!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 8.36, "num_tokens": 209}
+{"id": "sample_000423", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000423.wav", "text": "هلا بك يا قايد! حضّر عساكرك للمهمة الأولى في واحة خمسة وستين! شوف الخريطة في جهازك الـTablet.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 7.2, "num_tokens": 180}
diff --git a/sync_data/tokens/train/txts/shard-000009.jsonl b/sync_data/tokens/train/txts/shard-000009.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..b6f57f02e2a3472dea2e6887bb7d30ae1761a412
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000009.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000179", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000179.wav", "text": "إذا تثق بصقرك، بيجيب الفريسة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.56, "num_tokens": 89}
+{"id": "sample_000172", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000172.wav", "text": "بهالسرعة بنفوت العشا!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 2.12, "num_tokens": 53}
+{"id": "sample_000188", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000188.wav", "text": "دوس بنزين! قاعدين نفقدهم!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.4, "num_tokens": 85}
+{"id": "sample_000137", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000137.wav", "text": "الرمل يتحرك مع كل خطوة، يخفي الطريق قدامنا.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 4.52, "num_tokens": 113}
+{"id": "sample_000184", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000184.wav", "text": "كلّم حصانك بهدوء، وبيثق فيك وقت المعركة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.52, "num_tokens": 113}
+{"id": "sample_000246", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000246.wav", "text": "يا سلام، قطعنا البلاد بساعة وحدة! الهايبرلوب ذا شي ثاني!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.96, "num_tokens": 149}
+{"id": "sample_000375", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000375.wav", "text": "يا شريك، صار الوقت نقول وداع! لا تنسى puzzle party يوم الجمعة في خمسة ستة شارع النجدي!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 7.88, "num_tokens": 197}
+{"id": "sample_000153", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000153.wav", "text": "إذا ما سرعت، بنخسر قدام سكوتر!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 3.2, "num_tokens": 80}
diff --git a/sync_data/tokens/train/txts/shard-000010.jsonl b/sync_data/tokens/train/txts/shard-000010.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..6bb76bb51479d5182bea7a33dc6b3924aa6037f6
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000010.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000226", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000226.wav", "text": "عبي الشوزن، قاعدين يهجمون علينا!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 2.96, "num_tokens": 74}
+{"id": "sample_000448", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000448.wav", "text": "هلا، لقيت الجمل الضايع؟ دور في صفحة مئة واثنين وأربعين من الmanual!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 6.28, "num_tokens": 157}
+{"id": "sample_000271", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000271.wav", "text": "انتبِه لخطواتك. الطريق هنا في ليفل خمسة خطير جدًا وsteep.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.56, "num_tokens": 139}
+{"id": "sample_000263", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000263.wav", "text": "دبابات العدو جاية من الشرق! حمّل الطلقات اللي تخترق الدروع وصوّب!", "language_id": "ar", "instruct": "saudi, conversational, intense", "audio_duration": 6.2, "num_tokens": 155}
+{"id": "sample_000120", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000120.wav", "text": "يا ربعي! علامة الضرب تبين المكان على خريطة الكنز ذي!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 4.76, "num_tokens": 119}
+{"id": "sample_000147", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000147.wav", "text": "القافلة تريح، بس البر ما ينام.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.64, "num_tokens": 91}
+{"id": "sample_000222", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000222.wav", "text": "القبيلة واقفة مع بعض مثل الجدار المتين.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.64, "num_tokens": 116}
+{"id": "sample_000111", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000111.wav", "text": "ترتيب النقل الجماعي لحجاج حملتنا مثل حل لغز صعب. كل واحد له طلبات مختلفة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 7.32, "num_tokens": 183}
diff --git a/sync_data/tokens/train/txts/shard-000011.jsonl b/sync_data/tokens/train/txts/shard-000011.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..eda9e738ea11ed679f5a661d08d7fe82635f79d7
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000011.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000113", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000113.wav", "text": "من البيوت الطينية للأبراج الزجاجية، سماء الرياض تحكي قصة تطورنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 6.92, "num_tokens": 173}
+{"id": "sample_000197", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000197.wav", "text": "سر الكبسة يجي مع ظبط البهارات.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.72, "num_tokens": 93}
+{"id": "sample_000243", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000243.wav", "text": "خلك منتبه! إذا أطلقت الصقر بدري، بنخسر السباق!", "language_id": "ar", "instruct": "saudi, conversational, competitive", "audio_duration": 4.48, "num_tokens": 112}
+{"id": "sample_000136", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000136.wav", "text": "كل خطوة تقربك من الراحة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 2.84, "num_tokens": 71}
+{"id": "sample_000130", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000130.wav", "text": "الاستعداد لهالرحلة شرف ومسؤولية.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.36, "num_tokens": 84}
+{"id": "sample_000252", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000252.wav", "text": "يالله، وزع الرماة على طرف طويق! ما نبي يخترقون القلعة!", "language_id": "ar", "instruct": "saudi, conversational, competitive", "audio_duration": 5.32, "num_tokens": 133}
+{"id": "sample_000169", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000169.wav", "text": "تحركوا! ما نقدر نجلس هنا أكثر!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.04, "num_tokens": 76}
+{"id": "sample_000236", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000236.wav", "text": "لازم نسيطر على الأهرامات، اندفع الحين!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.4, "num_tokens": 85}
diff --git a/sync_data/tokens/train/txts/shard-000012.jsonl b/sync_data/tokens/train/txts/shard-000012.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..53b339edecb158d5b7704540ad1ff837c8bd6e3e
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000012.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000012", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000012.wav", "text": "بسرعة، طلّق الصقور على الطريدة اللي تفر!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.68, "num_tokens": 92}
+{"id": "sample_000275", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000275.wav", "text": "ليش الجمل دخل قبيلة Puzzle؟ عشان يحل ألغاز القفر في level ثلاثة!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 6.08, "num_tokens": 152}
+{"id": "sample_000177", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000177.wav", "text": "النجوم بتدلنا بالليل.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 2.44, "num_tokens": 61}
+{"id": "sample_000266", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000266.wav", "text": "انتبه! درونات الشركات تفحص المكان. استخدم جهاز الإخفاء السايبر حقك عشان ما ينكشف وجودك.", "language_id": "ar", "instruct": "saudi, conversational, futuristic", "audio_duration": 7.8, "num_tokens": 195}
+{"id": "sample_000468", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000468.wav", "text": "قوّموا المكينة! نْسابق صوب الويحات بسرعة صفر واحد تسعة صفر اثنان اثنان واحد اثنان ثلاثة ثلاثة اثنان!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.84, "num_tokens": 96}
+{"id": "sample_000308", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000308.wav", "text": "تسذكر يوم اجتمعنا تحت النجوم بالصحراء؟ كنا نسولف عن PlayStation وشارع خمسة ستة بالرياض.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 8.44, "num_tokens": 211}
+{"id": "sample_000277", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000277.wav", "text": "ليه الجمل قدّم للوظيفَه؟ يبي stable position!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.28, "num_tokens": 132}
+{"id": "sample_000220", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000220.wav", "text": "خلنا ندور أفضل الصفقات في السوق قبل ما نكمل.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.8, "num_tokens": 95}
diff --git a/sync_data/tokens/train/txts/shard-000013.jsonl b/sync_data/tokens/train/txts/shard-000013.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..66284a14be7ad236137b58d8542b475829762965
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000013.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000291", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000291.wav", "text": "اتفقت القبيلة! بنلتقي في واحة ستة وخمسون. جبو أحسن strategies عندكم!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.56, "num_tokens": 164}
+{"id": "sample_000269", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000269.wav", "text": "فوت الكورة بين رجلين المدافع وسو تمريرة حايط مع ربيعك عشان تسجل في زقاق السوق!", "language_id": "ar", "instruct": "saudi, conversational, energetic", "audio_duration": 7.4, "num_tokens": 185}
+{"id": "sample_000442", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000442.wav", "text": "الاجتماع العائلي في الواحة مهم جداً. لازم نسوي الخطة في صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 6.64, "num_tokens": 166}
+{"id": "sample_000213", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000213.wav", "text": "تدرب على الدعوات، بتساعدك في الرحلة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.4, "num_tokens": 85}
+{"id": "sample_000106", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000106.wav", "text": "من جسر السما في برج المملكة بالرياض، تقدر تلمس الغيوم!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.12, "num_tokens": 128}
+{"id": "sample_000258", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000258.wav", "text": "ياخي، هذا مرشد سياحي تصويري؟ كأنك تمشي بالماضي والمستقبل مع بعض!", "language_id": "ar", "instruct": "saudi, conversational, curious", "audio_duration": 6.64, "num_tokens": 166}
+{"id": "sample_000457", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000457.wav", "text": "شوف ذا الباب القديم! يحسسك إنه Portal مخفي لعالم ثاني، في خريطة ليفل ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.92, "num_tokens": 173}
+{"id": "sample_000011", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000011.wav", "text": "ساعدني ألقى حرامي الضايع في خربطة الشنط!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.48, "num_tokens": 87}
diff --git a/sync_data/tokens/train/txts/shard-000014.jsonl b/sync_data/tokens/train/txts/shard-000014.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..02c83cfd4487b274760750042b634ee204e0575d
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000014.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000096", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000096.wav", "text": "التجهيز للحج مثل لعبة الطناخة بس حقيقية!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.24, "num_tokens": 131}
+{"id": "sample_000260", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000260.wav", "text": "الرموز القديمة على صخرة الفيل تطلع بس تحت ضوء القمر. يلا، فك رموزها قبل يطلع الفجر!", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 7.64, "num_tokens": 191}
+{"id": "sample_000463", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000463.wav", "text": "بسرعة، قولي لي وش شفتي ب ستة وخمسين شارع النجدي! كان مجلس Falcon؟", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.36, "num_tokens": 134}
+{"id": "sample_000238", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000238.wav", "text": "الهواء مثالي، خلنا نسبقهم لخط النهاية!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.76, "num_tokens": 94}
+{"id": "sample_000234", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000234.wav", "text": "تراجع الحين! تجمع عند نقطة التفتيش الجاية!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.12, "num_tokens": 103}
+{"id": "sample_000467", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000467.wav", "text": "لقَ ذا الواحَهْ المخفّية قبل غروب الشمس وقَم المخيم قريب من الأنقاض القديمة في شارع الملك عبد الله رقم سبعة تسعة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 9.56, "num_tokens": 239}
+{"id": "sample_000214", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000214.wav", "text": "رم الشبكة لما تكون الموية راكدة، الصبر هو المفتاح.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 5.24, "num_tokens": 131}
+{"id": "sample_000176", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000176.wav", "text": "فاضي! مرر الكورة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 2.84, "num_tokens": 71}
diff --git a/sync_data/tokens/train/txts/shard-000015.jsonl b/sync_data/tokens/train/txts/shard-000015.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..b71a12b848bfe7b19abe21389c8dfcd23aa94877
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000015.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000280", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000280.wav", "text": "عدّينا كثبان واجد سوا؛ وداعًا يا رفيقي لين نرجع نلتقي في ليفل عشرة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 8.8, "num_tokens": 220}
+{"id": "sample_000287", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000287.wav", "text": "يا ولد العم! شفت الجمل الجديد للجد؟ اسمه 'Speedster ثلاثة آلاف'!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 6.16, "num_tokens": 154}
+{"id": "sample_000114", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000114.wav", "text": "جا وقت نعلق فانوس رمضان! يلا ننور بيتنا للشهر الفضيل.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.0, "num_tokens": 125}
+{"id": "sample_000279", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000279.wav", "text": "جَمِّع السرعة عشان تفوز بكأس الصحراء. Level ثلاثة ينتظرك.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.36, "num_tokens": 134}
+{"id": "sample_000124", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000124.wav", "text": "العاصفة تقرب! لازم نوصل المنطقة الآمنة بسرعة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.76, "num_tokens": 94}
+{"id": "sample_000302", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000302.wav", "text": "شفت أحد مشبوه قريب من الوَحَه حول الساعه صفر تسعة: صفر صفر؟ يمكن كان لابس قميص مكتوب عليه Desert Eagle.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 7.12, "num_tokens": 178}
+{"id": "sample_000174", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000174.wav", "text": "محاصرين! لازم نطلعهم بقوة!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.28, "num_tokens": 82}
+{"id": "sample_000364", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000364.wav", "text": "مبروك! انت حليت آخر لغز وربحت جائزة مميزة. تْشوف التفاصيل في صفحة مئة واثنين وأربعين!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 8.04, "num_tokens": 201}
diff --git a/sync_data/tokens/train/txts/shard-000016.jsonl b/sync_data/tokens/train/txts/shard-000016.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..c44dd9c96b26f03c7d5a93aee6044cea1b32d42a
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000016.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000154", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000154.wav", "text": "باقي لك هدف واحد. خلها تضبط.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.4, "num_tokens": 85}
+{"id": "sample_000265", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000265.wav", "text": "الآلة القديمة تشتغل! بسرعة، حل لغز الكتابة القديمة عشان تدخل الغرفة المخبية!", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 6.84, "num_tokens": 171}
+{"id": "sample_000211", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000211.wav", "text": "شوف رجوله، قوية. بيكون ممتاز للقافلة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.64, "num_tokens": 116}
+{"id": "sample_000104", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000104.wav", "text": "شد حيلك! هالبقي على الكثبان أخشن من بعير فيه الزغطة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.28, "num_tokens": 132}
+{"id": "sample_000323", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000323.wav", "text": "مرحبا بك في عالم الصحراء. استعد للمعركة، يا بطل. رحلتك تبدأ الآن في صفحة ستة وخمسون من Game Guide.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 9.92, "num_tokens": 248}
+{"id": "sample_000202", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000202.wav", "text": "خيط الثوب زين، كل شي مهم للعريس.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.76, "num_tokens": 94}
+{"id": "sample_000123", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000123.wav", "text": "الأكسجين عندنا في خطر. لازم نوسع البيت الأخضر على طول.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.8, "num_tokens": 120}
+{"id": "sample_000207", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000207.wav", "text": "جدل السعف زين، السلة بتشيل كثير تمر.", "language_id": "ar", "instruct": "saudi, conversational, creative", "audio_duration": 4.16, "num_tokens": 104}
diff --git a/sync_data/tokens/train/txts/shard-000017.jsonl b/sync_data/tokens/train/txts/shard-000017.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..675a3f5b300cb07c140c75e49a7f6a2a932c3330
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000017.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000099", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000099.wav", "text": "يالله بسرعة! عدل حلاوة العيد قبل لا يجون الضيوف!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.68, "num_tokens": 92}
+{"id": "sample_000008", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000008.wav", "text": "كسّر الجدار عشان توقف هجمة الفيروس!", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 3.56, "num_tokens": 89}
+{"id": "sample_000185", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000185.wav", "text": "حصان الفارس أعز صديق له وقت القتال.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.8, "num_tokens": 95}
+{"id": "sample_000129", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000129.wav", "text": "لازم نلقى المفتاح المخفي علشان ندخل القبر.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 3.64, "num_tokens": 91}
+{"id": "sample_000203", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000203.wav", "text": "خيط الشماغ للعريس، خلي الأطراف زينة.", "language_id": "ar", "instruct": "saudi, conversational, creative", "audio_duration": 3.8, "num_tokens": 95}
+{"id": "sample_000219", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000219.wav", "text": "امش بالليل عشان تتفادى حرارة الصحرا.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.16, "num_tokens": 79}
+{"id": "sample_000105", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000105.wav", "text": "تجهيز شنط الحج مثل حل لعبة المكعبات بحبات السبحة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.56, "num_tokens": 139}
+{"id": "sample_000131", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000131.wav", "text": "لازم نزين الخيمة للاحتفال الكبير!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.24, "num_tokens": 81}
diff --git a/sync_data/tokens/train/txts/shard-000018.jsonl b/sync_data/tokens/train/txts/shard-000018.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..1fb755b18e787e6a98dc4c22478699328f5f5330
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000018.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000350", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000350.wav", "text": "جهّز نفسَك للحج! تأكد من Google Maps وقابلنا عند الوّاحات في ستة وخمسين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.56, "num_tokens": 164}
+{"id": "sample_000343", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000343.wav", "text": "شوف ذا النقش القديم! كنه لقى كنز مخفي في صفحة مئة واثنين وأربعين من دليل اللعبة.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.16, "num_tokens": 154}
+{"id": "sample_000017", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000017.wav", "text": "نقوش الحنا حق جدتي مثل القصيد اللي يسيل.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.4, "num_tokens": 110}
+{"id": "sample_000356", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000356.wav", "text": "ارسل الكشافة لجهة الكثبان الشرقية يدورون عن إشارات كمين. لا تنسى تحدث الخريطة في جهاز GPS اللي معك للإصدار اثنان ثلاثة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 9.88, "num_tokens": 247}
+{"id": "sample_000235", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000235.wav", "text": "دور القناص قبل ما يضرب مرة ثانية!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 2.88, "num_tokens": 72}
+{"id": "sample_000228", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000228.wav", "text": "عجل! عبي المدفع قبل ما يضربونا!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.32, "num_tokens": 83}
+{"id": "sample_000216", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000216.wav", "text": "اطلع بحذر، أفضل التمر فوق.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.48, "num_tokens": 87}
+{"id": "sample_000242", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000242.wav", "text": "ابني الطاحونة في وجه الهوى، لازم تدور بدون مشاكل.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.36, "num_tokens": 134}
diff --git a/sync_data/tokens/train/txts/shard-000019.jsonl b/sync_data/tokens/train/txts/shard-000019.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..2efe0a6331df6640e503d372e12297f539c2eec9
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000019.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000208", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000208.wav", "text": "طبخ الرز واللحم على نار هادية، الضيوف بيجون قريب.", "language_id": "ar", "instruct": "saudi, conversational, creative", "audio_duration": 4.36, "num_tokens": 109}
+{"id": "sample_000297", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000297.wav", "text": "يا زين! جملنا جاب VIP pass للْحَجّ. دق على صفر خمسة صفر واحد اثنان ثلاثة أربعة خمسة ستة سبعة للتفاصيل.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 9.56, "num_tokens": 239}
+{"id": "sample_000267", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000267.wav", "text": "السباق يبدأ بعد خمسة دقايق! خذ لك سيارة سريعة وتعال لموقف استاد الملك فهد.", "language_id": "ar", "instruct": "saudi, conversational, satirical", "audio_duration": 6.56, "num_tokens": 164}
+{"id": "sample_000183", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000183.wav", "text": "الحصان باسمه القوي يشيل فخر القبيلة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.16, "num_tokens": 104}
+{"id": "sample_000225", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000225.wav", "text": "عبي السلاح بسرعة! إحنا تحت ضرب النار!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.4, "num_tokens": 85}
+{"id": "sample_000139", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000139.wav", "text": "الرموز باهتة، بس معناها قوي.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.36, "num_tokens": 84}
+{"id": "sample_000162", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000162.wav", "text": "اليد اللي ثابتة تفوز، مو اليد السريعة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.0, "num_tokens": 100}
+{"id": "sample_000272", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000272.wav", "text": "صدى الحكمة القديمة باقي يتردد في هالأطلال المْنسية، كنك تلقى شي زي ليفل خمسة في لعبة strategy.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 9.32, "num_tokens": 233}
diff --git a/sync_data/tokens/train/txts/shard-000020.jsonl b/sync_data/tokens/train/txts/shard-000020.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..b9e79b8af285bd6e95aeffa52f2edef51df7bc3c
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000020.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000112", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000112.wav", "text": "الهبوب جاي! بسرعة، اضرب الخيمة وربط البعارين!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.8, "num_tokens": 120}
+{"id": "sample_000155", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000155.wav", "text": "ما نخليهم ياخذون التل. اثبتوا!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.2, "num_tokens": 80}
+{"id": "sample_000117", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000117.wav", "text": "امش في الجسر اللي بين أبراج مركز المملكة. شوف الرياض من فوق، يا سلام!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.68, "num_tokens": 142}
+{"id": "sample_000255", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000255.wav", "text": "وناسة عالم الشتاء! نجرب اللعبة الدوارة ولا صالة الجليد أول؟", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.68, "num_tokens": 142}
+{"id": "sample_000205", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000205.wav", "text": "دربه يرجع بالصيد، الثقة تبني مع الوقت.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.0, "num_tokens": 100}
+{"id": "sample_000133", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000133.wav", "text": "ضرب عازف العود على الأوتار، وبدأوا الناس يصفقون.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.04, "num_tokens": 101}
+{"id": "sample_000455", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000455.wav", "text": "اذكر يوم تعلمنا عن قبائل Bedouin وتقاليدهم في صفحة أربعة وثلاثين من كتاب History.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 6.64, "num_tokens": 166}
+{"id": "sample_000170", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000170.wav", "text": "ما عندنا مؤونة كفاية، لازم ننسحب!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.56, "num_tokens": 89}
diff --git a/sync_data/tokens/train/txts/shard-000021.jsonl b/sync_data/tokens/train/txts/shard-000021.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..39e0734309b8205fbd4f3ee9414be6788c7635e4
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000021.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000460", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000460.wav", "text": "العدو قْرب من الكْثبان. جهز حرس القَصر للمعركة على صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 6.52, "num_tokens": 163}
+{"id": "sample_000142", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000142.wav", "text": "الرمل يخبّي أكثر من العظام.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 3.24, "num_tokens": 81}
+{"id": "sample_000212", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000212.wav", "text": "الغزلان سريعة، امش على آثارها بحذر.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.72, "num_tokens": 93}
+{"id": "sample_000374", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000374.wav", "text": "حيّاكم الله في المهرجان! انبسطوا بسباق الجمال وخذوا تمرات تس مجانًا من booth خمسة ستة.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 7.6, "num_tokens": 190}
+{"id": "sample_000459", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000459.wav", "text": "تذكر يوم تسابقنا فوق الكْثبان الرملية في Desert Racer، ندزلب غروب الشمس؟ قابلني عند ستة وخمسين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 8.88, "num_tokens": 222}
+{"id": "sample_000140", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000140.wav", "text": "الدلة جاهزة! اسكبها للمعازيم.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.48, "num_tokens": 87}
+{"id": "sample_000248", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000248.wav", "text": "يبه، ليه صخرة الفيل وكل الصخور ذي أشكالها غريبة كذا؟", "language_id": "ar", "instruct": "saudi, conversational, curious", "audio_duration": 5.24, "num_tokens": 131}
+{"id": "sample_000412", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000412.wav", "text": "شفت سباق الجمال عند واحة ستة وخمسين؟ كان رهيب!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.64, "num_tokens": 116}
diff --git a/sync_data/tokens/train/txts/shard-000022.jsonl b/sync_data/tokens/train/txts/shard-000022.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..5b790d74a2ad426f99d0db48f180d2a329ffcbca
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000022.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000141", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000141.wav", "text": "كل خطوة بهالرحلة تقربك للإيمان.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.64, "num_tokens": 91}
+{"id": "sample_000143", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000143.wav", "text": "الصقر حلق، ودانا وجهتنا.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.28, "num_tokens": 82}
+{"id": "sample_000328", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000328.wav", "text": "وش عذرتس يا بنت عند ستة وخمسين شارع الواحة الساعه سبعة؟ شفتي سباق الجمل؟", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.8, "num_tokens": 145}
+{"id": "sample_000229", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000229.wav", "text": "القنبلة جاية! خبّ نفسك!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 2.68, "num_tokens": 67}
+{"id": "sample_000396", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000396.wav", "text": "حياكم الله في المجلس الاستراتيجي. فريقنا مِستعد يواجه تحديات جديدة في صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 8.64, "num_tokens": 216}
+{"id": "sample_000401", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000401.wav", "text": "السلام عليكم يا زائرٍ نبيل. أنا رفيقتس اللي بيدزلك في هذي الألغاز الغامضة في Level خمسة من اللعبة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 8.68, "num_tokens": 217}
+{"id": "sample_000126", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000126.wav", "text": "عشان نوصل للمرحلة الجاية، لازم نعيد ترتيب هالرموز العتيقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 5.64, "num_tokens": 141}
+{"id": "sample_000450", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000450.wav", "text": "ليه الجمل ما رضى يتفاهم؟ خايف يقسم له الoasis برقم ستة وخمسين!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 6.6, "num_tokens": 165}
diff --git a/sync_data/tokens/train/txts/shard-000023.jsonl b/sync_data/tokens/train/txts/shard-000023.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..f8f9b1dc5e60071654d36535fe7fe4b8c69ad8b5
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000023.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000251", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000251.wav", "text": "متأكد إن البالون ذا يشيلنا كلنا؟ أحس إنه يترنح شوي فوق!", "language_id": "ar", "instruct": "saudi, conversational, nervous", "audio_duration": 5.72, "num_tokens": 143}
+{"id": "sample_000201", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000201.wav", "text": "هشّك الرز بالشوكة عشان يطلع خفيف.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.16, "num_tokens": 79}
+{"id": "sample_000110", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000110.wav", "text": "هالنقوش على الحجر تبين ساعة الماء القديمة في مدائن صالح. تقدر تشغلها من جديد؟", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 6.88, "num_tokens": 172}
+{"id": "sample_000196", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000196.wav", "text": "الأساطير تنولد هنا، في شوارعنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.12, "num_tokens": 78}
+{"id": "sample_000091", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000091.wav", "text": "يبه، ليه نلبس ثياب يديدة للعيد؟", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.2, "num_tokens": 80}
+{"id": "sample_000337", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000337.wav", "text": "سمعتْ؟ خُطتْ المَلِگ الجديدة سرية مثل رقم الهاتف صفر خمسة صفر واحد اثنان ثلاثة أربعة خمسة ستة سبعة!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 9.68, "num_tokens": 242}
+{"id": "sample_000262", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000262.wav", "text": "لازم نوازن بين الاستدامة والتقنية. زيد المزارع العمودية في المنطقة خمسة.", "language_id": "ar", "instruct": "saudi, conversational, innovative", "audio_duration": 6.44, "num_tokens": 161}
+{"id": "sample_000244", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000244.wav", "text": "تأكد أن كل التوصيلات على الوقت، زحمة المدينة ممكن تسبب تأخير.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.68, "num_tokens": 142}
diff --git a/sync_data/tokens/train/txts/shard-000024.jsonl b/sync_data/tokens/train/txts/shard-000024.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..1a0f7b3e1f2ac82c99bd01f9f13ab4069a4914de
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000024.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000359", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000359.wav", "text": "حذر! ذا الجمل يحب ياكل guidebooks. تلقاه عند ستة وخمسين شارع النجدي بعد العصر.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 8.0, "num_tokens": 200}
+{"id": "sample_000261", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000261.wav", "text": "الخط الأزرق زحمة! حوّل الركاب للخط الأخضر عشان يوصلون المول بالوقت!", "language_id": "ar", "instruct": "saudi, conversational, urgent", "audio_duration": 6.32, "num_tokens": 158}
+{"id": "sample_000424", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000424.wav", "text": "يا بطل! وش اسمك الملحمي قبل نخلّص العالم؟ حط اسمك هنا وشوف صفحة مئة وواحد.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 7.6, "num_tokens": 190}
+{"id": "sample_000264", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000264.wav", "text": "المنطقة الآمنة تضيق! روح برج المملكة عشان تاخذ ميزة المكان العالي!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.36, "num_tokens": 134}
+{"id": "sample_000419", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000419.wav", "text": "تعلمتوا الاستراتيجيات. الحين طبقوها علشان تغزون أراضي الصحرة. شوفوا صفحة مئة واثنين وأربعين في كتاب Game Manual.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 8.88, "num_tokens": 222}
+{"id": "sample_000180", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000180.wav", "text": "يالله نلعب! أراهن إني أفوز!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 2.88, "num_tokens": 72}
+{"id": "sample_000276", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000276.wav", "text": "سويتِها! الحين الصحراء صارت بأمان. تعالوا نلتقي عند الواحة لاحتفال كبير مع الشلة في ليفل واحد صفر!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 9.08, "num_tokens": 227}
+{"id": "sample_000192", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000192.wav", "text": "دوّر السيارة! خلك ملك الشوارع!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.84, "num_tokens": 96}
diff --git a/sync_data/tokens/train/txts/shard-000025.jsonl b/sync_data/tokens/train/txts/shard-000025.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..a7fbf26e9c0a44945b224eceaeaffbdc146b3b53
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000025.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000138", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000138.wav", "text": "الغنم جاهزة! يالله نبدأ العزيمة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.12, "num_tokens": 78}
+{"id": "sample_000013", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000013.wav", "text": "اقبض الطارات على السيف وقت العرضة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.08, "num_tokens": 77}
+{"id": "sample_000195", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000195.wav", "text": "آخر جولة، ما بقى إلا المحترفين!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.48, "num_tokens": 87}
+{"id": "sample_000224", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000224.wav", "text": "النار تدفي قلوبنا، والقصص تدفي الأرواح.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.36, "num_tokens": 109}
+{"id": "sample_000135", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000135.wav", "text": "إذا ما قتلنا الزحمة، الحر بيقتلنا!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 3.52, "num_tokens": 88}
+{"id": "sample_000446", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000446.wav", "text": "تْذكّر أيام الوَاحَه، وين وُلِدَت Legends في صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 7.08, "num_tokens": 177}
+{"id": "sample_000404", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000404.wav", "text": "مرحبتس في Strategy واحد صفر واحد: كيف تدزلب الجمال وتغلب الكثبان! صفحة مئة وثلاثة وعشرون", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 8.88, "num_tokens": 222}
+{"id": "sample_000145", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000145.wav", "text": "الخريطة تقول الكنز مدفون تحت الرمال اللي تتحرك.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 4.72, "num_tokens": 118}
diff --git a/sync_data/tokens/train/txts/shard-000026.jsonl b/sync_data/tokens/train/txts/shard-000026.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..da051815cbaf7b8045c803177df7a367532860eb
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000026.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000283", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000283.wav", "text": "بسرعة! لازم نلقى الscroll المخفي في المكتبه قبل يرجعون الحراس! المكتبه في دور ثلاثة.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.48, "num_tokens": 162}
+{"id": "sample_000097", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000097.wav", "text": "الجليب العتيق يوسوس، 'حط ريال وتمن أمنية!'", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 4.6, "num_tokens": 115}
+{"id": "sample_000392", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000392.wav", "text": "تشوف الصقور يطرن فوق الجبل؟ خلنا ننضم معهم في الصفحة مئة واثنان وأربعون من Mountain Quest!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.64, "num_tokens": 166}
+{"id": "sample_000233", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000233.wav", "text": "حط الفخ هنا، بننصب لهم كمين مع الفجر.", "language_id": "ar", "instruct": "saudi, conversational, critical", "audio_duration": 3.76, "num_tokens": 94}
+{"id": "sample_000366", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000366.wav", "text": "في الصحرَهْ، لقيت رسالة قديمة من جدّي على صفحة اثنين وأربعين في كتاب Desert Wisdom.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 8.8, "num_tokens": 220}
+{"id": "sample_000438", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000438.wav", "text": "قم اتبع الصقر إلى الواحَهْ باستخدام الmap في صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.72, "num_tokens": 118}
+{"id": "sample_000314", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000314.wav", "text": "حياك الله في مجلس التخطيط. من فضلك اذكر اسمك ودورك. حنا في غرفة رقم اثني عشر.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 6.8, "num_tokens": 170}
+{"id": "sample_000293", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000293.wav", "text": "تذكر سوالف جدودنا واحنا واقفين عند بوابة المدينة رقم خمسة ستة جنب برج Kingdom.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 6.6, "num_tokens": 165}
diff --git a/sync_data/tokens/train/txts/shard-000027.jsonl b/sync_data/tokens/train/txts/shard-000027.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..037eaf50532e928945c8e42b77bb857606dc31fb
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000027.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000344", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000344.wav", "text": "حياك الله في هالمستوى من Puzzle! لا تضيع جملَك في متاهة النفود عند سبعة وستين شارع النبطي.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 7.56, "num_tokens": 189}
+{"id": "sample_000158", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000158.wav", "text": "ثبت في مكانك، لا تفقد تركيزك!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.88, "num_tokens": 97}
+{"id": "sample_000273", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000273.wav", "text": "يلا، لازم نهرب قبل ما الجمل يزهق! ليفل خمسة يستنانا.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.44, "num_tokens": 136}
+{"id": "sample_000239", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000239.wav", "text": "خل القهوة تغلي ببطء، الطعم يصير مضبوط.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.8, "num_tokens": 95}
+{"id": "sample_000015", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000015.wav", "text": "شوف! لقيت كتابة قديمة على الجدار ذا!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.32, "num_tokens": 83}
+{"id": "sample_000447", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000447.wav", "text": "لو سمحت اشرح لي كيف تعاملت مع موقف صعب في شغلك الأخير في Office خمسة وستين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 7.24, "num_tokens": 181}
+{"id": "sample_000006", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000006.wav", "text": "البس نظارة الواقع عشان تشوف خريطة الكنز المخبية!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.6, "num_tokens": 115}
+{"id": "sample_000342", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000342.wav", "text": "عرضك ظليل. ما أقدر أبيع بأقل من مئتي coins.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.48, "num_tokens": 112}
diff --git a/sync_data/tokens/train/txts/shard-000028.jsonl b/sync_data/tokens/train/txts/shard-000028.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..d89eae66f751df8acfb0a5fbbb36834b2984a67a
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000028.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000165", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000165.wav", "text": "قريبين نفوز! لا تخففون الضغط!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.44, "num_tokens": 86}
+{"id": "sample_000095", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000095.wav", "text": "ياهوه! جا وقت نعلق فوانيس رمضان!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.32, "num_tokens": 108}
+{"id": "sample_000191", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000191.wav", "text": "انتبه من الحفر! ما نبي نطيح!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 3.36, "num_tokens": 84}
+{"id": "sample_000115", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000115.wav", "text": "النقوش في الحجر تتكلم عن طرق التجارة القديمة. تقدر تفك شفرة كلامها؟", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 5.88, "num_tokens": 147}
+{"id": "sample_000103", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000103.wav", "text": "في العيد، تمنيت بعير... بس جاني لعبة محشية بداله!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.2, "num_tokens": 130}
+{"id": "sample_000175", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000175.wav", "text": "طلقة مضبوطه تغير كل شي.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.4, "num_tokens": 85}
+{"id": "sample_000230", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000230.wav", "text": "حط المتفجرات وارجع، بنفجر الجسر!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.52, "num_tokens": 88}
+{"id": "sample_000198", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000198.wav", "text": "حرك الجريش لين يثقل، وبعدين زيد البهارات.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.84, "num_tokens": 121}
diff --git a/sync_data/tokens/train/txts/shard-000029.jsonl b/sync_data/tokens/train/txts/shard-000029.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..7cf277b6c9f4e5f51e4af1e745e77d27273c4596
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000029.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000148", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000148.wav", "text": "أسمع شي بالظلام... يمكن الهوى.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 3.64, "num_tokens": 91}
+{"id": "sample_000400", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000400.wav", "text": "النصر لنا! خلونا نركب الجمال ونحتفل، بس لا تنسى تسجل رقم خمسة ستة شارع النجدي في GPS.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 9.24, "num_tokens": 231}
+{"id": "sample_000370", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000370.wav", "text": "قال الملك، 'وشلون الجمل صار في ستة وخمسين شارع Royal؟' يمكن يتفرج على Netflix!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 6.96, "num_tokens": 174}
+{"id": "sample_000003", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000003.wav", "text": "حطينا في الدرعية القديمة! يالله بسرعة، عدّل طريقة كلامك لا يحسبونا جنّ الناس هنا!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.48, "num_tokens": 162}
+{"id": "sample_000206", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000206.wav", "text": "حكم الخيوط زين، هالسجادة بتحمل قصص أهلنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.64, "num_tokens": 116}
+{"id": "sample_000414", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000414.wav", "text": "بسرعة! حل اللغز عشان تفتح الجمل قبل توصلنا القبيلة المنافسة. لا تنسى تسجل الرقم في صفحة ستة وخمسون بالدليل!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 8.24, "num_tokens": 206}
+{"id": "sample_000475", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000475.wav", "text": "حياك الله في مجلس الصحارى. مهمتك تبتدي عند الغروب بْتمامَه في ستة وخمسين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 8.56, "num_tokens": 214}
+{"id": "sample_000326", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000326.wav", "text": "تذكر يوم اللي جالك تحدي وغلبته في مقابلة مع شركة RiyadhTech. رقم الصفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 8.32, "num_tokens": 208}
diff --git a/sync_data/tokens/train/txts/shard-000030.jsonl b/sync_data/tokens/train/txts/shard-000030.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..07e30946093766b89e88a0914614d2150dcfc6fd
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000030.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000173", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000173.wav", "text": "طلقة مضبوطة يعني فوز مضبوط.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.52, "num_tokens": 88}
+{"id": "sample_000351", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000351.wav", "text": "حياكم الله في سوق التجارة! شوفوا السرج الخاص بالجمل عندنا بسعر مئة وتسعة وتسعين ريال!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 7.56, "num_tokens": 189}
+{"id": "sample_000182", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000182.wav", "text": "الجمل ذا سلالته صافية، يستاهل كل ريال.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.12, "num_tokens": 103}
+{"id": "sample_000259", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000259.wav", "text": "الشركة حقتنا في البلوكتشين محتاجة استثمار أكثر. يلا نعرض على شركات التمويل الجريء السعودية!", "language_id": "ar", "instruct": "saudi, conversational, ambitious", "audio_duration": 7.12, "num_tokens": 178}
+{"id": "sample_000108", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000108.wav", "text": "اللفافة العتيقة تقول: 'الكنز في المكان اللي ظل أطول منارة يبوس أقدم بير وقت العصر.'", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 8.6, "num_tokens": 215}
+{"id": "sample_000009", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000009.wav", "text": "طور شريحة السرعة في جملك الروبوت للسباق!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 4.24, "num_tokens": 106}
+{"id": "sample_000163", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000163.wav", "text": "لا تطلقون النار لين يقربون!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 2.68, "num_tokens": 67}
+{"id": "sample_000387", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000387.wav", "text": "مبروك! الحين وصلت لآخر level، يلا نحتفل في ستة وخمسين شارع.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.24, "num_tokens": 156}
diff --git a/sync_data/tokens/train/txts/shard-000031.jsonl b/sync_data/tokens/train/txts/shard-000031.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..423dcc1ce7a4d3433167f96968955b6911593e11
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000031.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000256", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000256.wav", "text": "ياخي، اللعبة ذي بالنظارات قوية! حسيت إني أطير فوق الرياض صدق!", "language_id": "ar", "instruct": "saudi, conversational, thrilled", "audio_duration": 6.52, "num_tokens": 163}
+{"id": "sample_000199", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000199.wav", "text": "افرد العجينة وزيد الصلصة بترتيب.", "language_id": "ar", "instruct": "saudi, conversational, creative", "audio_duration": 3.48, "num_tokens": 87}
+{"id": "sample_000018", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000018.wav", "text": "النجوم تدلنا في الرمل اللي ما له نهاية.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.84, "num_tokens": 96}
+{"id": "sample_000157", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000157.wav", "text": "إذا سجلنا الحين، المشروبات علي!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 3.0, "num_tokens": 75}
+{"id": "sample_000338", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000338.wav", "text": "تتذگر طعم القهوة مع التمر بالمجلس؟ أظن إنه كان في صالة رقم اثنان ثلاثة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 7.04, "num_tokens": 176}
+{"id": "sample_000109", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000109.wav", "text": "بالعيد، نبدا بزيارة الكبار، وبعدين الصغار. كذا نكرم الحكمة والبراءة مع بعض.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 7.04, "num_tokens": 176}
+{"id": "sample_000218", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000218.wav", "text": "النقوش هذي تحكي قصة سقوط المدينة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.4, "num_tokens": 85}
+{"id": "sample_000156", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000156.wav", "text": "قريب توصل، باقي لك لفة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 2.84, "num_tokens": 71}
diff --git a/sync_data/tokens/train/txts/shard-000032.jsonl b/sync_data/tokens/train/txts/shard-000032.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..46abe43b5831596dd75a36870c751c5a52449c12
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000032.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000241", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000241.wav", "text": "الهوى قوي الليلة، ثبت الخيمة زين في الأرض.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.24, "num_tokens": 106}
+{"id": "sample_000341", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000341.wav", "text": "تْفَكَّر فِي الرِّحْلَة عبر الصّحْرَا لِـ Mecca. تذَكَّر سَوالِف أَجْدَادْنَا اللي قالوها في صفحة مئة واثنين وأربعين.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 8.28, "num_tokens": 207}
+{"id": "sample_000093", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000093.wav", "text": "ياسلام! هالفلس العتيق يضبط بالضبط في يد التمثال!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.84, "num_tokens": 121}
+{"id": "sample_000384", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000384.wav", "text": "لازم تسوي سعر أحسن لهالبضاعة، ولا بتخاطر تخسر الصفقة. شوف الشروط بصفحة ستة وخمسين من كتيب Steam.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 8.48, "num_tokens": 212}
+{"id": "sample_000100", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000100.wav", "text": "رمول الزمان خشت المفتاح؛ ما يشوفه إلا عين الصقر.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 4.84, "num_tokens": 121}
+{"id": "sample_000121", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000121.wav", "text": "البطولة لنا! تعب فريقنا ما راح على الفاضي!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.12, "num_tokens": 103}
+{"id": "sample_000190", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000190.wav", "text": "حط القذائف مضبوط، لو غلطنا نضيع!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.72, "num_tokens": 93}
+{"id": "sample_000318", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000318.wav", "text": "شفت شي غريب حول الواحة القديمه ليلة أمس؟ هذا مكتوب في سجل رقم خمسة ستة.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.08, "num_tokens": 152}
diff --git a/sync_data/tokens/train/txts/shard-000033.jsonl b/sync_data/tokens/train/txts/shard-000033.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..3150ae595ccf1439a33fadbc81cc51110a37a000
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000033.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000240", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000240.wav", "text": "ارفع السيف فوق، العرضة بتبدأ قريب!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.6, "num_tokens": 90}
+{"id": "sample_000181", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000181.wav", "text": "لازم نلقى لنا ملجأ قبل ما تجي العاصفة.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 3.28, "num_tokens": 82}
+{"id": "sample_000210", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000210.wav", "text": "السعر غالي، خلنا نتفق على شي ينفعنا اثنيننا.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.2, "num_tokens": 105}
+{"id": "sample_000435", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000435.wav", "text": "بسرعة، علمني، وش هي عاصمة أستراليا؟ تلميحة: شوفي صفحة مئة واثنين وأربعين في الڤايد!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 7.32, "num_tokens": 183}
+{"id": "sample_000005", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000005.wav", "text": "الرمل يشبه كثبان الفضا من فوق!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.12, "num_tokens": 78}
+{"id": "sample_000101", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000101.wav", "text": "تجهيز شنطة الحج مثل حل بازل مقدس!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.96, "num_tokens": 99}
+{"id": "sample_000161", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000161.wav", "text": "كل خطوة قدام تقربنا من النصر.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.32, "num_tokens": 83}
+{"id": "sample_000254", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000254.wav", "text": "عشان نفتح الدور الجاي، لازم نحل اللغز الاقتصادي ذا. فكر مثل المستثمر!", "language_id": "ar", "instruct": "saudi, conversational, challenging", "audio_duration": 6.68, "num_tokens": 167}
diff --git a/sync_data/tokens/train/txts/shard-000034.jsonl b/sync_data/tokens/train/txts/shard-000034.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..7dab3246cfc9edd10b0f895d49a0d37e1fcfd5c5
--- /dev/null
+++ b/sync_data/tokens/train/txts/shard-000034.jsonl
@@ -0,0 +1,8 @@
+{"id": "sample_000209", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000209.wav", "text": "قل القصيدة الحربية، تذكرنا بشجاعة أجدادنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.44, "num_tokens": 111}
+{"id": "sample_000253", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000253.wav", "text": "الشاورما خلصت! دق على المورد واطلب دجاج زيادة، بسرعة!", "language_id": "ar", "instruct": "saudi, conversational, busy", "audio_duration": 4.68, "num_tokens": 117}
+{"id": "sample_000373", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000373.wav", "text": "دور على المفتاح المخفي عشان تفتح الباب، وإلا بتظل هنا للأبد! ترى أقرب phone في الدور ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 9.04, "num_tokens": 226}
+{"id": "sample_000221", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000221.wav", "text": "الصحراء تعلم الصبر، مثل ما يقولون البدو.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.8, "num_tokens": 95}
+{"id": "sample_000346", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000346.wav", "text": "هلا بك في الفريق، يا القايد. قاعدتك على ستة وخمسين شارع النجدي. جهّز استراتيجيتك.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 7.52, "num_tokens": 188}
+{"id": "sample_000164", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000164.wav", "text": "نكسر خط دفاعهم هنا، نفوز بالمعركة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.72, "num_tokens": 93}
+{"id": "sample_000454", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000454.wav", "text": "أذكر الأيام الزينات في الصحرا تحت ضو الگمر. تشبه Level خمسة باللعبة في صفحة مئة وثلاثة وعشرون.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 10.0, "num_tokens": 250}
+{"id": "sample_000330", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000330.wav", "text": "أبغى منك تنظم تجمع العائلة في ستة وخمسين شارع النجدي. لا تنسى ترسل الدعوة بواتساب.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 7.44, "num_tokens": 186}
diff --git a/training_data/sync_data/tokens/dev/txts/shard-000012.jsonl b/training_data/sync_data/tokens/dev/txts/shard-000012.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..b26ca14951bd5ea2c214b1fee206edcbaec24d28
--- /dev/null
+++ b/training_data/sync_data/tokens/dev/txts/shard-000012.jsonl
@@ -0,0 +1 @@
+{"id": "sample_000186", "audio_path": "/home/riftuser/OmniVoice/sync_data/wavs/sample_000186.wav", "text": "قبل السباق، نعطيهم تمر للطاقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.16, "num_tokens": 79}