diff --git a/omnivoice/cli/__init__.py b/omnivoice/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/omnivoice/cli/demo.py b/omnivoice/cli/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..f45ac775ed4a984282d64e72334a8e6d6504e520 --- /dev/null +++ b/omnivoice/cli/demo.py @@ -0,0 +1,548 @@ +#!/usr/bin/env python3 +# Copyright 2026 Xiaomi Corp. (authors: Han Zhu) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Gradio demo for OmniVoice. + +Supports voice cloning and voice design. 
def get_best_device():
    """Return the preferred torch device name for this machine.

    The probe order is CUDA first, then Apple MPS, then plain CPU.

    Returns:
        ``"cuda"``, ``"mps"`` or ``"cpu"``.
    """
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    return device
def build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the ``omnivoice-demo`` command.

    The parser covers model selection (``--model``, ``--device``), Gradio
    server options (``--ip``, ``--port``, ``--root-path``, ``--share``) and
    ASR options (``--no-asr``, ``--asr-model``).

    Returns:
        A configured ``argparse.ArgumentParser``; nothing is parsed here.
    """
    # One (flags, keyword-arguments) entry per option, in help-listing order.
    option_table = (
        (
            ("--model",),
            {
                "default": "k2-fsa/OmniVoice",
                "help": "Model checkpoint path or HuggingFace repo id.",
            },
        ),
        (
            ("--device",),
            {
                "default": None,
                "help": "Device to use. Auto-detected if not specified.",
            },
        ),
        (
            ("--ip",),
            {"default": "0.0.0.0", "help": "Server IP (default: 0.0.0.0)."},
        ),
        (
            ("--port",),
            {"type": int, "default": 7860, "help": "Server port (default: 7860)."},
        ),
        (
            ("--root-path",),
            {"default": None, "help": "Root path for reverse proxy."},
        ),
        (
            ("--share",),
            {
                "action": "store_true",
                "default": False,
                "help": "Create public link.",
            },
        ),
        (
            ("--no-asr",),
            {
                "action": "store_true",
                "default": False,
                "help": "Skip loading Whisper ASR model. Reference text"
                " auto-transcription will be unavailable.",
            },
        ),
        (
            ("--asr-model",),
            {
                "default": "openai/whisper-large-v3-turbo",
                "help": "ASR model path or HuggingFace repo id"
                " (default: openai/whisper-large-v3-turbo).",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        prog="omnivoice-demo",
        description="Launch a Gradio demo for OmniVoice.",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    for flags, options in option_table:
        cli.add_argument(*flags, **options)
    return cli
+ + gen_config = OmniVoiceGenerationConfig( + num_step=int(num_step or 32), + guidance_scale=float(guidance_scale) if guidance_scale is not None else 2.0, + denoise=bool(denoise) if denoise is not None else True, + preprocess_prompt=bool(preprocess_prompt), + postprocess_output=bool(postprocess_output), + ) + + lang = language if (language and language != "Auto") else None + + kw: Dict[str, Any] = dict( + text=text.strip(), language=lang, generation_config=gen_config + ) + + if speed is not None and float(speed) != 1.0: + kw["speed"] = float(speed) + if duration is not None and float(duration) > 0: + kw["duration"] = float(duration) + + if mode == "clone": + if not ref_audio: + return None, "Please upload a reference audio." + kw["voice_clone_prompt"] = model.create_voice_clone_prompt( + ref_audio=ref_audio, + ref_text=ref_text, + ) + + if instruct and instruct.strip(): + kw["instruct"] = instruct.strip() + + try: + audio = model.generate(**kw) + except Exception as e: + return None, f"Error: {type(e).__name__}: {e}" + + waveform = (audio[0] * 32767).astype(np.int16) + return (sampling_rate, waveform), "Done." + + # Allow external wrappers (e.g. 
spaces.GPU for ZeroGPU Spaces) + _gen = generate_fn if generate_fn is not None else _gen_core + + # ===================================================================== + # UI + # ===================================================================== + theme = gr.themes.Soft( + font=["Inter", "Arial", "sans-serif"], + ) + css = """ + .gradio-container {max-width: 100% !important; font-size: 16px !important;} + .gradio-container h1 {font-size: 1.5em !important;} + .gradio-container .prose {font-size: 1.1em !important;} + .compact-audio audio {height: 60px !important;} + .compact-audio .waveform {min-height: 80px !important;} + """ + + # Reusable: language dropdown component + def _lang_dropdown(label="Language (optional) / 语种 (可选)", value="Auto"): + return gr.Dropdown( + label=label, + choices=_ALL_LANGUAGES, + value=value, + allow_custom_value=False, + interactive=True, + info="Keep as Auto to auto-detect the language.", + ) + + # Reusable: optional generation settings accordion + def _gen_settings(): + with gr.Accordion("Generation Settings (optional)", open=False): + sp = gr.Slider( + 0.5, + 1.5, + value=1.0, + step=0.05, + label="Speed", + info="1.0 = normal. >1 faster, <1 slower. Ignored if Duration is set.", + ) + du = gr.Number( + value=None, + label="Duration (seconds)", + info=( + "Leave empty to use speed." + " Set a fixed duration to override speed." + ), + ) + ns = gr.Slider( + 4, + 64, + value=32, + step=1, + label="Inference Steps", + info="Default: 32. Lower = faster, higher = better quality.", + ) + dn = gr.Checkbox( + label="Denoise", + value=True, + info="Default: enabled. 
Uncheck to disable denoising.", + ) + gs = gr.Slider( + 0.0, + 4.0, + value=2.0, + step=0.1, + label="Guidance Scale (CFG)", + info="Default: 2.0.", + ) + pp = gr.Checkbox( + label="Preprocess Prompt", + value=True, + info="apply silence removal and trimming to the reference " + "audio, add punctuation in the end of reference text (if not already)", + ) + po = gr.Checkbox( + label="Postprocess Output", + value=True, + info="Remove long silences from generated audio.", + ) + return ns, gs, dn, sp, du, pp, po + + with gr.Blocks(theme=theme, css=css, title="OmniVoice Demo") as demo: + gr.Markdown( + """ +# OmniVoice Demo + +State-of-the-art text-to-speech model for **600+ languages**, supporting: + +- **Voice Clone** — Clone any voice from a reference audio +- **Voice Design** — Create custom voices with speaker attributes + +Built with [OmniVoice](https://github.com/k2-fsa/OmniVoice) +by Xiaomi AI Lab Next-gen Kaldi team. +""" + ) + + with gr.Tabs(): + # ============================================================== + # Voice Clone + # ============================================================== + with gr.TabItem("Voice Clone"): + with gr.Row(): + with gr.Column(scale=1): + vc_text = gr.Textbox( + label="Text to Synthesize / 待合成文本", + lines=4, + placeholder="Enter the text you want to synthesize...", + ) + vc_ref_audio = gr.Audio( + label="Reference Audio / 参考音频", + type="filepath", + elem_classes="compact-audio", + ) + gr.Markdown( + "" + "Recommended: 3–10 seconds audio. " + "" + ) + vc_ref_text = gr.Textbox( + label=("Reference Text (optional)" " / 参考音频文本(可选)"), + lines=2, + placeholder="Transcript of the reference audio. 
Leave empty" + " to auto-transcribe via ASR models.", + ) + vc_lang = _lang_dropdown("Language (optional) / 语种 (可选)") + with gr.Accordion("Instruct (optional)", open=False): + vc_instruct = gr.Textbox(label="Instruct", lines=2) + ( + vc_ns, + vc_gs, + vc_dn, + vc_sp, + vc_du, + vc_pp, + vc_po, + ) = _gen_settings() + vc_btn = gr.Button("Generate / 生成", variant="primary") + with gr.Column(scale=1): + vc_audio = gr.Audio( + label="Output Audio / 合成结果", + type="numpy", + ) + vc_status = gr.Textbox(label="Status / 状态", lines=2) + + def _clone_fn( + text, lang, ref_aud, ref_text, instruct, ns, gs, dn, sp, du, pp, po + ): + return _gen( + text, + lang, + ref_aud, + instruct, + ns, + gs, + dn, + sp, + du, + pp, + po, + mode="clone", + ref_text=ref_text or None, + ) + + vc_btn.click( + _clone_fn, + inputs=[ + vc_text, + vc_lang, + vc_ref_audio, + vc_ref_text, + vc_instruct, + vc_ns, + vc_gs, + vc_dn, + vc_sp, + vc_du, + vc_pp, + vc_po, + ], + outputs=[vc_audio, vc_status], + ) + + # ============================================================== + # Voice Design + # ============================================================== + with gr.TabItem("Voice Design"): + with gr.Row(): + with gr.Column(scale=1): + vd_text = gr.Textbox( + label="Text to Synthesize / 待合成文本", + lines=4, + placeholder="Enter the text you want to synthesize...", + ) + vd_lang = _lang_dropdown() + + _AUTO = "Auto" + vd_groups = [] + for _cat, _choices in _CATEGORIES.items(): + vd_groups.append( + gr.Dropdown( + label=_cat, + choices=[_AUTO] + _choices, + value=_AUTO, + info=_ATTR_INFO.get(_cat), + ) + ) + + ( + vd_ns, + vd_gs, + vd_dn, + vd_sp, + vd_du, + vd_pp, + vd_po, + ) = _gen_settings() + vd_btn = gr.Button("Generate / 生成", variant="primary") + with gr.Column(scale=1): + vd_audio = gr.Audio( + label="Output Audio / 合成结果", + type="numpy", + ) + vd_status = gr.Textbox(label="Status / 状态", lines=2) + + def _build_instruct(groups): + """Extract instruct text from UI dropdowns. 
def main(argv=None) -> int:
    """Parse CLI options, load OmniVoice and serve the Gradio demo.

    Args:
        argv: Optional argument list. ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0``, both when help is printed for an empty ``--model`` and after
        ``launch()`` returns.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
    )
    parser = build_parser()
    args = parser.parse_args(argv)

    device = args.device or get_best_device()

    checkpoint = args.model
    if not checkpoint:
        parser.print_help()
        return 0

    # float16 is only a safe default on accelerators. On CPU many kernels are
    # missing or very slow in half precision, so load in float32 there.
    dtype = torch.float32 if str(device).startswith("cpu") else torch.float16

    logging.info(f"Loading model from {checkpoint}, device={device} ...")
    model = OmniVoice.from_pretrained(
        checkpoint,
        device_map=device,
        dtype=dtype,
        load_asr=not args.no_asr,
        asr_model_name=args.asr_model,
    )
    print("Model loaded.")

    demo = build_demo(model, checkpoint)

    demo.queue().launch(
        server_name=args.ip,
        server_port=args.port,
        share=args.share,
        root_path=args.root_path,
    )
    return 0
def get_best_device():
    """Choose the torch device string to run inference on.

    CUDA is preferred; MPS is used when CUDA is unavailable; CPU is the
    last resort.

    Returns:
        ``"cuda"``, ``"mps"`` or ``"cpu"``.
    """
    if torch.cuda.is_available():
        return "cuda"
    return "mps" if torch.backends.mps.is_available() else "cpu"
design + parser.add_argument( + "--instruct", + type=str, + default=None, + help="Style instruction for voice design mode.", + ) + parser.add_argument( + "--language", + type=str, + default=None, + help="Language name (e.g. 'English') or code (e.g. 'en').", + ) + # Generation parameters + parser.add_argument("--num_step", type=int, default=32) + parser.add_argument("--guidance_scale", type=float, default=2.0) + parser.add_argument("--speed", type=float, default=1.0) + parser.add_argument( + "--duration", + type=float, + default=None, + help="Fixed output duration in seconds. If set, overrides the " + "model's duration estimation. The speed factor is automatically " + "adjusted to match while preserving language-aware pacing.", + ) + parser.add_argument("--t_shift", type=float, default=0.1) + parser.add_argument("--denoise", type=str2bool, default=True) + parser.add_argument( + "--postprocess_output", + type=str2bool, + default=True, + ) + parser.add_argument("--layer_penalty_factor", type=float, default=5.0) + parser.add_argument("--position_temperature", type=float, default=5.0) + parser.add_argument("--class_temperature", type=float, default=0.0) + parser.add_argument( + "--device", + type=str, + default=None, + help="Device to use for inference. 
def main():
    """Synthesize one utterance from CLI arguments and write it to a WAV file.

    Loads the model on ``--device`` (auto-detected when omitted), forwards the
    text, optional reference audio/text, instruct string and generation
    parameters to ``model.generate()``, and writes the first returned waveform
    to ``--output`` at ``model.sampling_rate``.
    """
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    args = get_parser().parse_args()

    device = args.device or get_best_device()
    # float16 is only a safe default on accelerators. On CPU many kernels are
    # missing or very slow in half precision, so load in float32 there.
    dtype = torch.float32 if str(device).startswith("cpu") else torch.float16

    logging.info(f"Loading model from {args.model} on {device} ...")
    model = OmniVoice.from_pretrained(args.model, device_map=device, dtype=dtype)

    logging.info(f"Generating audio for: {args.text[:80]}...")
    audios = model.generate(
        text=args.text,
        language=args.language,
        ref_audio=args.ref_audio,
        ref_text=args.ref_text,
        instruct=args.instruct,
        duration=args.duration,
        num_step=args.num_step,
        guidance_scale=args.guidance_scale,
        speed=args.speed,
        t_shift=args.t_shift,
        denoise=args.denoise,
        postprocess_output=args.postprocess_output,
        layer_penalty_factor=args.layer_penalty_factor,
        position_temperature=args.position_temperature,
        class_temperature=args.class_temperature,
    )

    # generate() returns a sequence of waveforms; a single text yields one.
    sf.write(args.output, audios[0], model.sampling_rate)
    logging.info(f"Saved to {args.output}")
def get_best_device():
    """Detect the best device type and how many such devices are usable.

    CUDA is preferred, then MPS, then CPU.

    Returns:
        A ``(device_type, device_count)`` pair: ``("cuda", N)`` with ``N``
        from ``torch.cuda.device_count()``, or ``("mps", 1)``, or
        ``("cpu", 1)``.
    """
    if torch.cuda.is_available():
        device_type, num_devices = "cuda", torch.cuda.device_count()
    elif torch.backends.mps.is_available():
        device_type, num_devices = "mps", 1
    else:
        device_type, num_devices = "cpu", 1
    return device_type, num_devices
" + "Audio tokenizer is expected at /audio_tokenizer/.", + ) + parser.add_argument( + "--test_list", + type=str, + required=True, + help="Path to the JSONL file containing test samples. " + "Each line is a JSON object with the following fields: " + '"id" (str, required): unique name for the output file; ' + '"text" (str, required): text to synthesize; ' + '"ref_audio" (str): path to reference audio for voice cloning; ' + '"ref_text" (str): transcript of the reference audio; ' + '"instruct" (str): instruction for voice design (used when ref_audio is absent); ' + '"language_id" (str): language code, e.g. "en"; ' + '"duration" (float): target duration in seconds; ' + '"speed" (float): speaking speed multiplier. ' + "Only id and text are required; all other fields are optional.", + ) + parser.add_argument( + "--res_dir", + type=str, + required=True, + help="Directory to save the generated audio files.", + ) + parser.add_argument( + "--num_step", + type=int, + default=32, + help="Number of steps for iterative decoding.", + ) + parser.add_argument( + "--guidance_scale", + type=float, + default=2.0, + help="Scale for Classifier-Free Guidance.", + ) + parser.add_argument( + "--t_shift", + type=float, + default=0.1, + help="Shift t to smaller ones if t_shift < 1.0", + ) + parser.add_argument( + "--nj_per_gpu", + type=int, + default=1, + help="Number of worker processes to spawn per GPU.", + ) + parser.add_argument( + "--audio_chunk_duration", + type=float, + default=15.0, + help="Maximum duration of audio chunk (in seconds) for splitting. " + '"Not split" if <= 0.', + ) + parser.add_argument( + "--audio_chunk_threshold", + type=float, + default=30.0, + help=( + "The duration threshold (in seconds) to decide" + " whether to split audio into chunks." 
def process_init(rank_queue, model_checkpoint, warmup=0):
    """Initialize one worker process and load its model.

    Takes one ``(device_type, device_id)`` pair from ``rank_queue``, loads the
    model on that device via ``OmniVoice.from_pretrained()`` and stores it in
    the module-level ``worker_model``. Optionally runs warmup generations.

    Args:
        rank_queue: Queue of ``(device_type, device_id)`` tuples; exactly one
            is consumed per worker.
        model_checkpoint: Checkpoint path or repo id passed to
            ``from_pretrained``.
        warmup: Number of dummy ``generate()`` calls to run before real work.
    """
    global worker_model

    torch.set_num_threads(2)
    torch.set_num_interop_threads(2)

    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] "
        "[Worker %(process)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    device_type, device_id = rank_queue.get()
    if device_type in ("cpu", "mps"):
        # Single logical device: the id is irrelevant.
        worker_device = device_type
    else:
        worker_device = f"cuda:{device_id}"

    logging.info(f"Initializing worker on device: {worker_device}")

    # float16 is only a safe default on accelerators. On CPU many kernels are
    # missing or very slow in half precision, so load in float32 there.
    dtype = torch.float32 if worker_device == "cpu" else torch.float16

    worker_model = OmniVoice.from_pretrained(
        model_checkpoint,
        device_map=worker_device,
        dtype=dtype,
    )

    if warmup > 0:
        logging.info(f"Running {warmup} warmup iterations on {worker_device}")
        dummy_ref_audio = (
            torch.randn(1, SAMPLING_RATE),
            SAMPLING_RATE,
        )  # 1s dummy audio
        for _ in range(warmup):
            worker_model.generate(
                text=["hello"],
                language=["en"],
                ref_audio=[dummy_ref_audio],
                ref_text=["hello"],
            )
        logging.info(f"Warmup complete on {worker_device}")

    logging.info(f"Worker on {worker_device} initialized successfully.")
def cluster_samples_by_duration(
    samples: List[Tuple],
    duration_estimator: RuleDurationEstimator,
    batch_duration: float,
) -> List[List[Tuple]]:
    """Pack samples into batches whose summed duration stays within a budget.

    Samples are visited longest-first (order from
    ``_sort_samples_by_duration``). A sample that alone exceeds
    ``batch_duration`` becomes a batch of its own; the others are appended to
    the batch being filled until the next one would push it over the budget.

    Args:
        samples: Sample tuples to batch.
        duration_estimator: Estimator handed to ``_sort_samples_by_duration``.
        batch_duration: Maximum summed duration per batch, in seconds.

    Returns:
        The list of batches, each a list of sample tuples.
    """
    ranked = _sort_samples_by_duration(samples, duration_estimator)

    batches = []
    pending = []
    pending_duration = 0.0

    for item, item_duration in ranked:
        # Over-budget items go out alone and leave the pending batch untouched.
        if item_duration > batch_duration:
            batches.append([item])
            continue

        fits = pending_duration + item_duration <= batch_duration
        if not fits:
            # Close the pending batch and start a new one with this item.
            batches.append(pending)
            pending, pending_duration = [item], item_duration
        else:
            pending.append(item)
            pending_duration += item_duration

    if pending:
        batches.append(pending)

    logging.info(f"Clustered {len(samples)} samples into {len(batches)} batches")
    return batches
List[Tuple], + duration_estimator: RuleDurationEstimator, + batch_size: int, +) -> List[List[Tuple]]: + """Split samples into fixed-size batches, sorted by duration to minimize padding.""" + sample_with_duration = _sort_samples_by_duration(samples, duration_estimator) + sorted_samples = [s for s, _ in sample_with_duration] + + batches = [ + sorted_samples[i : i + batch_size] + for i in range(0, len(sorted_samples), batch_size) + ] + logging.info( + f"Split {len(samples)} samples into {len(batches)} batches " + f"(fixed batch_size={batch_size}, sorted by duration)" + ) + return batches + + +def run_inference_batch( + batch_samples: List[Tuple], + res_dir: str, + **gen_kwargs, +) -> List[Tuple]: + global worker_model + + save_names = [] + ref_texts = [] + ref_audio_paths = [] + texts = [] + langs = [] + durations = [] + speeds = [] + instructs = [] + + for sample in batch_samples: + save_name, ref_text, ref_audio_path, text, lang_id, dur, spd, instruct = sample + save_names.append(save_name) + ref_texts.append(ref_text) + ref_audio_paths.append(ref_audio_path) + texts.append(text) + langs.append(lang_id) + durations.append(dur) + speeds.append(spd) + instructs.append(instruct) + + start_time = time.time() + audios = worker_model.generate( + text=texts, + language=langs, + ref_audio=ref_audio_paths if any(p is not None for p in ref_audio_paths) else None, + ref_text=ref_texts if any(t is not None for t in ref_texts) else None, + duration=durations if any(d is not None for d in durations) else None, + speed=speeds if any(s is not None for s in speeds) else None, + instruct=instructs if any(i is not None for i in instructs) else None, + **gen_kwargs, + ) + batch_synth_time = time.time() - start_time + + results = [] + for save_name, audio in zip(save_names, audios): + save_path = os.path.join(res_dir, save_name + ".wav") + sf.write(save_path, audio, worker_model.sampling_rate) + audio_duration = audio.shape[-1] / worker_model.sampling_rate + results.append( + ( + 
def main():
    """Run batch TTS inference over a JSONL test list with a worker pool.

    Spawns ``num_devices * nj_per_gpu`` worker processes (each initialized by
    ``process_init``), groups the samples into batches, submits every batch to
    ``run_inference_batch`` and logs total audio duration, total synthesis
    time and the average real-time factor.
    """
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
    mp.set_start_method("spawn", force=True)

    args = get_parser().parse_args()
    os.makedirs(args.res_dir, exist_ok=True)

    device_type, num_devices = get_best_device()
    if device_type == "cpu":
        logging.warning(
            "No GPU found. Falling back to CPU inference. This might be slow."
        )

    num_processes = num_devices * args.nj_per_gpu
    logging.info(
        f"Using {device_type} ({num_devices} device(s))."
        f" Spawning {num_processes} worker processes."
    )

    # One (device_type, device_id) entry per worker; each worker pops one.
    manager = mp.Manager()
    rank_queue = manager.Queue()
    for rank in list(range(num_devices)) * args.nj_per_gpu:
        rank_queue.put((device_type, rank))

    samples_raw = read_test_list(args.test_list)
    samples = []
    for s in samples_raw:
        # --lang_id is documented as the fallback for entries that carry no
        # language_id, so the per-entry value takes precedence.
        lang_id = s.get("language_id")
        if lang_id is None:
            lang_id = args.lang_id
        # Tuple layout:
        # (id, ref_text, ref_audio, text, lang_id, duration, speed, instruct)
        samples.append(
            (
                s["id"],
                s.get("ref_text"),
                s.get("ref_audio"),
                s["text"],
                lang_id,
                s.get("duration"),
                s.get("speed"),
                s.get("instruct"),
            )
        )

    synthesis_times = []
    audio_durations = []

    try:
        with ProcessPoolExecutor(
            max_workers=num_processes,
            initializer=process_init,
            initargs=(rank_queue, args.model, args.warmup),
        ) as executor:
            futures = []

            logging.info("Running batch inference")

            # Split samples by mode (voice-clone vs non-voice-clone) before
            # clustering so that each batch is homogeneous. Mixing ref_audio
            # and non-ref_audio samples in the same batch would crash in
            # generate() -> create_voice_clone_prompt().
            clone_samples = [s for s in samples if s[2] is not None]
            other_samples = [s for s in samples if s[2] is None]

            duration_estimator = RuleDurationEstimator()
            batches = []
            for subset in (clone_samples, other_samples):
                if not subset:
                    continue
                if args.batch_size > 0:
                    batches.extend(
                        cluster_samples_by_batch_size(
                            subset, duration_estimator, args.batch_size
                        )
                    )
                else:
                    batches.extend(
                        cluster_samples_by_duration(
                            subset, duration_estimator, args.batch_duration
                        )
                    )

            # NOTE(review): every parsed CLI option is forwarded here, so
            # run_inference_batch receives res_dir plus all remaining options
            # as generation kwargs. Presumably generate() tolerates the
            # non-generation ones (model, test_list, ...); confirm.
            args_dict = vars(args)

            for batch in batches:
                futures.append(
                    executor.submit(
                        run_inference_batch, batch_samples=batch, **args_dict
                    )
                )

            for future in tqdm(
                as_completed(futures), total=len(futures), desc="Processing samples"
            ):
                try:
                    result = future.result()
                    for s_name, synth_time, audio_dur, status in result:
                        synthesis_times.append(synth_time)
                        audio_durations.append(audio_dur)
                        rtf = synth_time / audio_dur if audio_dur > 0 else float("inf")
                        logging.debug(
                            f"Processed {s_name}: Audio Duration={audio_dur:.2f}s, "
                            f"Synthesis Time={synth_time:.2f}s, RTF={rtf:.4f}"
                        )
                except Exception as e:
                    # A failed batch is logged and skipped; the rest continue.
                    logging.error(f"Failed to process sample: {e}")
                    detailed_error = traceback.format_exc()
                    logging.error(f"Detailed error: {detailed_error}")

    except (Exception, KeyboardInterrupt) as e:
        logging.critical(
            f"An unrecoverable error occurred: {e}. Terminating all processes."
        )
        detailed_error_info = traceback.format_exc()
        logging.error(f"--- DETAILED TRACEBACK ---\n{detailed_error_info}")
        # SIGKILL the whole process group, this process included, so no
        # worker is left behind. Nothing after this line runs on that path.
        os.killpg(os.getpgid(os.getpid()), signal.SIGKILL)

    total_synthesis_time = sum(synthesis_times)
    total_audio_duration = sum(audio_durations)
    logging.info("--- Summary ---")
    logging.info(f"Total audio duration: {total_audio_duration:.2f}s")
    logging.info(f"Total synthesis time: {total_synthesis_time:.2f}s")
    if total_audio_duration > 0:
        average_rtf = total_synthesis_time / total_audio_duration
        logging.info(f"Average RTF: {average_rtf:.4f}")
    else:
        logging.warning("No speech was generated. RTF cannot be computed.")

    logging.info("Done!")
def main():
    """Parse the CLI arguments, build the training components and train.

    Three required string options are read: ``--train_config``,
    ``--output_dir`` and ``--data_config``. The training config is loaded
    from JSON, the two other paths are stored on it, and the model,
    tokenizer and data loaders built from it are handed to ``OmniTrainer``.
    """
    parser = argparse.ArgumentParser(description="OmniVoice Training Entry Point")
    # Registration order is kept so that ``--help`` lists them unchanged.
    required_options = (
        ("--train_config", "Path to config JSON"),
        ("--output_dir", "Where to save checkpoints"),
        ("--data_config", "Path to data config JSON"),
    )
    for flag, description in required_options:
        parser.add_argument(flag, type=str, required=True, help=description)
    cli = parser.parse_args()

    # 1. Load the configuration and attach the CLI-provided paths.
    config = TrainingConfig.from_json(cli.train_config)
    config.output_dir = cli.output_dir
    config.data_config = cli.data_config

    # 2. Build the model/tokenizer pair, then the data loaders.
    model, tokenizer = build_model_and_tokenizer(config)
    train_loader, eval_loader = build_dataloaders(config, tokenizer)

    # 3. Create the trainer and start training.
    OmniTrainer(
        model=model,
        config=config,
        train_dataloader=train_loader,
        eval_dataloader=eval_loader,
        tokenizer=tokenizer,
    ).train()


if __name__ == "__main__":
    main()
class StreamLengthGroupDataset(WrappedIterableDataset):
    """Streaming dataset that buckets samples by length and yields batches.

    Length defaults to ``sample[audio_key].size(-1) / dataset.sample_rate``.
    Supply ``length_fn`` to measure something else (for example
    ``lambda s: s["length"]``); ``batch_duration`` and ``min/max_length``
    are then compared against whatever that function returns.

    When ``processor`` is given it is applied to every sample *before* the
    length is measured, and the yielded batches hold the processed samples.
    Samples for which the processor raises are logged and skipped.
    """

    def __init__(
        self,
        dataset: IterableDataReader,
        batch_duration: float,
        min_length: float = 0.5,
        max_length: float = 30.0,
        num_buckets: int = 20,
        audio_key: str = "audio",
        drop_last: bool = False,
        max_sample: Optional[int] = None,
        length_fn: Optional[Any] = None,
        processor: Optional[Any] = None,
    ):
        """Store the configuration and pre-compute the bucket boundaries.

        Args:
            dataset: Source of samples; must expose ``set_epoch`` and, when
                ``length_fn`` is not given, ``sample_rate``.
            batch_duration: A bucket is emitted once its longest sample times
                (its size + 1) reaches this value.
            min_length: Samples shorter than this are skipped.
            max_length: Samples longer than this are skipped.
            num_buckets: Number of equal-width length buckets.
            audio_key: Key of the waveform used by the default length measure.
            drop_last: If True, partially filled buckets are discarded at the
                end of the stream instead of being yielded.
            max_sample: Optional cap on the number of samples per batch.
            length_fn: Optional callable returning the length of a sample.
            processor: Optional callable applied to every sample first.
        """
        self.dataset = dataset
        self.batch_duration = batch_duration
        self.min_length = min_length
        self.max_length = max_length
        self.num_buckets = num_buckets
        self.audio_key = audio_key
        self.drop_last = drop_last
        # "No cap" is represented as +inf so the size check never triggers.
        self.max_sample = float("inf") if max_sample is None else max_sample
        self.length_fn = length_fn
        self.processor = processor

        # Upper edge of each bucket; the lower edge of the first is dropped.
        edges = np.linspace(min_length, max_length, num_buckets + 1)
        self.boundaries = edges[1:]

    def set_epoch(self, epoch: int):
        """Forward the epoch number to the wrapped dataset."""
        self.dataset.set_epoch(epoch)

    def _get_bucket_id(self, length: float) -> int:
        """Return the index of the first bucket whose upper edge is >= length."""
        return bisect.bisect_left(self.boundaries, length)

    def __iter__(self) -> Iterator[List[Dict[str, Any]]]:
        """Yield lists of samples, one list per completed bucket."""
        pending = [[] for _ in range(self.num_buckets)]
        longest = [0.0] * self.num_buckets

        for item in self.dataset:
            if self.processor is not None:
                try:
                    item = self.processor(item)
                except Exception as e:
                    logging.warning(f"Error processing sample: {e}")
                    continue

            if self.length_fn is None:
                waveform = item[self.audio_key]
                length = waveform.size(-1) / self.dataset.sample_rate
            else:
                length = self.length_fn(item)

            # Out-of-range samples are dropped silently.
            if length < self.min_length or length > self.max_length:
                continue

            slot = self._get_bucket_id(length)
            pending[slot].append(item)

            if length > longest[slot]:
                longest[slot] = length

            # Emit when one more sample as long as the current maximum would
            # reach the duration budget, or when the sample cap is hit.
            if (
                longest[slot] * (len(pending[slot]) + 1) >= self.batch_duration
                or len(pending[slot]) >= self.max_sample
            ):
                yield pending[slot]
                pending[slot] = []
                longest[slot] = 0.0

        if self.drop_last:
            return

        # Flush whatever is left, in bucket order.
        for slot, leftover in enumerate(pending):
            if leftover:
                yield leftover
                pending[slot] = []


class PackingIterableDataset(WrappedIterableDataset):
    """Process raw samples and pack them into token-bounded batches.

    Every raw sample is passed through ``processor``; the ``"length"`` entry
    of the result is its token count. Consecutive processed samples are
    grouped so that the summed length of a batch never exceeds
    ``batch_tokens``.

    Args:
        dataset (Iterable): The raw dataset to process.
        processor (Callable): Turns a raw sample into a processed one.
        batch_tokens (int): Maximum number of tokens per batch.
    """

    def __init__(
        self,
        dataset: IterableDataReader,
        processor: Any,
        batch_tokens: int,
    ):
        self.dataset = dataset
        self.processor = processor
        self.batch_tokens = batch_tokens
        # Not read anywhere in this class.
        self.skip_batches = 0

    def set_epoch(self, epoch: int):
        """Forward the epoch number to the wrapped dataset."""
        self.dataset.set_epoch(epoch)

    def __iter__(self) -> Iterator[List[Dict[str, Any]]]:
        """Yield lists of processed samples whose lengths fit the budget."""
        packed = []
        used_tokens = 0

        for raw_sample in self.dataset:
            try:
                item = self.processor(raw_sample)
            except Exception as e:
                logging.warning(f"Error processing sample {raw_sample}: {e}")
                continue

            n_tokens = item["length"]

            # A sample that cannot fit even an empty batch is dropped.
            if n_tokens > self.batch_tokens:
                continue

            # Close the running batch if this sample would overflow it.
            if used_tokens + n_tokens > self.batch_tokens:
                yield packed
                packed, used_tokens = [], 0

            packed.append(item)
            used_tokens += n_tokens

        # Emit the trailing, partially filled batch.
        if packed:
            yield packed
class PaddingDataCollator:
    """Right-pad processed samples to a common length and stack them.

    The result is a ``[B, C, max_len]`` batch for SDPA/eager attention: B is
    the number of samples, C the leading dimension of each sample's
    ``input_ids`` (the audio codebook layers) and max_len the largest
    ``"length"`` in the batch.

    A boolean attention mask of shape ``[B, 1, max_len, max_len]`` is also
    returned. Every query position may attend to every non-padding key
    position (bidirectional), which matches the masked-diffusion objective.
    When passed as a 4D tensor, HuggingFace models use it directly without
    adding a causal mask.

    No ``document_ids`` are produced; each sample has its own batch row.
    """

    def __init__(self, processor, batch_tokens: int):
        # ``batch_tokens`` is stored but not used by this collator.
        self.batch_tokens = batch_tokens
        self.processor = processor

    def __call__(self, processed_samples: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Collate ``processed_samples`` into one padded batch dictionary."""
        pad_id = self.processor.text_tokenizer.pad_token_id
        lengths = [s["length"] for s in processed_samples]
        max_len = max(lengths)
        B = len(processed_samples)
        pad_right = torch.nn.functional.pad
        paired = list(zip(processed_samples, lengths))

        # Token ids are padded with the tokenizer pad id, labels with the
        # ignore index -100, the audio mask with False.
        input_ids = torch.stack(
            [pad_right(s["input_ids"], (0, max_len - n), value=pad_id) for s, n in paired],
            dim=0,
        )  # [B, C, max_len]
        labels = torch.stack(
            [pad_right(s["labels"], (0, max_len - n), value=-100) for s, n in paired],
            dim=0,
        )  # [B, C, max_len]
        audio_mask = torch.stack(
            [pad_right(s["audio_mask"], (0, max_len - n), value=False) for s, n in paired],
            dim=0,
        )  # [B, max_len]
        # Positions restart at 0 for every sample; padding positions get 0.
        position_ids = torch.stack(
            [
                pad_right(torch.arange(n, dtype=torch.long), (0, max_len - n), value=0)
                for n in lengths
            ],
            dim=0,
        )  # [B, max_len]

        # valid[b, j] is True when position j of sample b is a real token.
        valid = torch.zeros(B, max_len, dtype=torch.bool)
        for row, n in enumerate(lengths):
            valid[row, :n] = True

        # mask[b, 0, i, j] = valid[b, j]: keys are masked, queries are not.
        key_mask = valid[:, None, None, :]
        attention_mask = key_mask.expand(B, 1, max_len, max_len).contiguous()

        return {
            "input_ids": input_ids,  # [B, C, max_len]
            "labels": labels,  # [B, C, max_len]
            "audio_mask": audio_mask,  # [B, max_len]
            "position_ids": position_ids,  # [B, max_len]
            "attention_mask": attention_mask,  # [B, 1, max_len, max_len]
        }


class PackingDataCollator:
    """Concatenate processed samples into one sequence of fixed length.

    All samples are joined along the time axis and right-padded to
    ``batch_tokens``; a leading batch dimension of 1 is added. The returned
    ``document_ids`` hold the index of the sample each position came from
    and -1 for padding.
    """

    def __init__(self, processor, batch_tokens: int):
        self.batch_tokens = batch_tokens
        self.processor = processor

    def __call__(self, processed_samples: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Collate ``processed_samples`` into one packed batch dictionary."""
        # [C, Total_Len]; C is the number of codebook layers.
        joined_ids = torch.cat([s["input_ids"] for s in processed_samples], dim=1)
        joined_labels = torch.cat([s["labels"] for s in processed_samples], dim=1)
        # [Total_Len]
        joined_audio_mask = torch.cat(
            [s["audio_mask"] for s in processed_samples], dim=0
        )
        # Position ids restart from 0 at every sample boundary.
        joined_positions = torch.cat(
            [torch.arange(s["length"], dtype=torch.long) for s in processed_samples],
            dim=0,
        )

        pad_length = self.batch_tokens - joined_ids.shape[1]

        def pad_tail(tensor, fill):
            # Pads only the last dimension, on the right.
            return torch.nn.functional.pad(tensor, pad=(0, pad_length), value=fill)

        batch = {
            "input_ids": pad_tail(
                joined_ids, self.processor.text_tokenizer.pad_token_id
            ).unsqueeze(0),  # [1, C, L]
            "labels": pad_tail(joined_labels, -100).unsqueeze(0),  # [1, C, L]
            "audio_mask": pad_tail(joined_audio_mask, False).unsqueeze(0),  # [1, L]
            "position_ids": pad_tail(joined_positions, 0).unsqueeze(0),  # [1, L]
        }

        # One id per position naming the source sample; padding is -1.
        document_ids = torch.cat(
            [
                torch.full((s["length"],), doc, dtype=torch.int32)
                for doc, s in enumerate(processed_samples)
            ],
            dim=0,
        )
        batch["document_ids"] = pad_tail(document_ids, -1).unsqueeze(0)  # [1, L]

        return batch
def load_audio_webdataset(data, sample_rate: int = 24000, device="cpu"):
    """Decode audio bytes and return them as a tensor on ``device``.

    ``data`` and ``sample_rate`` are forwarded to ``load_audio_bytes``; its
    NumPy result is wrapped in a tensor and moved to ``device``.
    NOTE(review): the original comment states the shape is (1, num_samples)
    after resampling to ``sample_rate``; that is decided by
    ``load_audio_bytes`` - confirm there.
    """
    audio = torch.from_numpy(load_audio_bytes(data, sample_rate))
    audio = audio.to(device)
    return audio


def prepare_data_manifests_from_json(
    data_config: str,
) -> Tuple[List[Tuple[str, str, int, float]], List[Tuple[str, str, int, float]]]:
    """Build the train and dev manifests described by a JSON data config.

    A typical multilingual config looks like::

        {
            "train": [
                {
                    "language_id": "en",
                    "manifest_path": ["/Emilia/EN/data.lst"],
                    "repeat": 1
                },
                {
                    "language_id": "zh",
                    "manifest_path": ["/Emilia/ZH/data.lst"],
                    "repeat": 1
                }
            ],
            "dev": [
                {
                    "language_id": "en",
                    "manifest_path": ["/Emilia/EN-dev/data.lst"],
                    "repeat": 1
                }
            ]
        }

    ``language_id`` is not read here; it only helps to organise multilingual
    data. ``repeat`` is optional (default 1) and says how many times the
    entries of a manifest are repeated. ``dev`` is optional.

    The simplest config is::

        {
            "train": [
                {"manifest_path": ["/Emilia/EN/data.lst", "/Emilia/ZH/data.lst"]}
            ],
            "dev": [
                {"manifest_path": ["/Emilia/EN-dev/data.lst"]}
            ]
        }

    Each ``data.lst`` line holds four whitespace-separated items::

        /path/to/data.tar /path/to/label.jsonl num_items num_seconds

    Args:
        data_config: Path to the JSON config file.

    Returns:
        ``(train_manifests, dev_manifests)``, each a list of
        ``(tar_path, label_jsonl_path, num_items, num_seconds)`` tuples.

    Raises:
        AssertionError: If a train manifest path is not an existing file.
        ValueError: If a manifest line does not have exactly four fields.
    """
    train_manifests = []
    dev_manifests = []
    with open(data_config, "r", encoding="utf-8") as f:
        data = json.load(f)
        for item in data["train"]:
            manifest_paths = item["manifest_path"]
            repeat = item.get("repeat", 1)
            for manifest_path in manifest_paths:
                # Kept as an assert so callers see the same exception type.
                assert os.path.isfile(manifest_path), f"{manifest_path} is not a file."
                train_manifests.extend(
                    webdataset_manifest_reader(manifest_path) * repeat
                )
        if "dev" in data:
            for item in data["dev"]:
                manifest_paths = item["manifest_path"]
                repeat = item.get("repeat", 1)
                for manifest_path in manifest_paths:
                    dev_manifests.extend(
                        webdataset_manifest_reader(manifest_path) * repeat
                    )
    return train_manifests, dev_manifests


def webdataset_manifest_reader(
    manifest_path: str,
) -> List[Tuple[str, str, int, float]]:
    """Parse a manifest file listing WebDataset shards and their labels.

    Every non-empty line must contain exactly four whitespace-separated
    fields::

        /path/to/data.tar /path/to/label.jsonl num_items num_seconds

    Returns:
        A list of ``(tar_path, label_jsonl_path, num_items, num_seconds)``
        tuples with ``num_items`` as int and ``num_seconds`` as float.

    Raises:
        ValueError: If a line does not have four fields, or if the numeric
            fields cannot be converted.
    """
    manifests = []
    with open(manifest_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split()
            if len(parts) != 4:
                raise ValueError(
                    f"Invalid manifest line: {line}. "
                    f"Each line must contain "
                    "tar_path, label_jsonl_path, num_items, num_seconds."
                )
            tar_path, label_jsonl_path, num_items, num_seconds = (
                parts[0],
                parts[1],
                int(parts[2]),
                float(parts[3]),
            )
            manifests.append((tar_path, label_jsonl_path, num_items, num_seconds))
    return manifests


class SampleDecoder:
    """Decode one WebDataset sample: load audio or tokens and attach its label."""

    def __init__(
        self,
        tar_to_label: Dict,
        sample_rate: int = 24000,
        audio_format: Optional[Tuple[str, ...]] = None,
        normalize_audio: bool = True,
    ):
        """
        Args:
            tar_to_label:
                A dict mapping from a data tar path to the path of the label
                JSONL file that belongs to it.
            sample_rate:
                Target sample rate for audio. Required if audio is loaded.
            audio_format:
                Tuple of audio file extensions to look for in the sample,
                tried in order. Defaults to ``("flac", "wav", "mp3")``.
            normalize_audio:
                If True, loaded audio is peak-normalised to 0.9.
        """
        self.tar_to_label = tar_to_label
        self.sample_rate = sample_rate
        # Labels of the shard currently being read; loaded lazily.
        self.label_dataset = None
        if audio_format is None:
            self.audio_format = ("flac", "wav", "mp3")
        else:
            self.audio_format = audio_format
        self.normalize_audio = normalize_audio

    def __call__(self, sample):
        """Return a dict with the decoded payload and the label of ``sample``."""
        return_dict = {}
        src = sample["__url__"]
        key = sample["__key__"]
        # Reload the labels only when the sample comes from another shard.
        if (
            self.label_dataset is None
            or self.label_dataset.path != self.tar_to_label[src]
        ):
            self.label_dataset = LabelDataset(self.tar_to_label[src])

        audio = torch.empty(0)
        if "npy" in sample:
            audio_tokens = torch.from_numpy(sample["npy"])
            return_dict["audio_tokens"] = audio_tokens
        else:
            for ext in self.audio_format:
                if ext in sample:
                    # load audio (1, num_samples)
                    audio = load_audio_webdataset(
                        sample[ext], sample_rate=self.sample_rate
                    )
                    if self.normalize_audio:
                        # Peak-normalise; the epsilon avoids division by zero.
                        audio = (audio / (audio.abs().max() + 1e-7)) * 0.9
                    break
        # NOTE(review): indentation was lost in the reviewed source. These two
        # assignments are placed at method level because ``audio`` is
        # initialised before the branch - confirm against the original file.
        return_dict["audio"] = audio
        return_dict["audio_duration"] = audio.size(-1) / self.sample_rate

        label = self.label_dataset[key]

        return_dict["label"] = label
        return return_dict


class LabelDataset:
    """In-memory lookup of label records loaded from a JSONL file."""

    def __init__(self, jsonl_path: str):
        """Load labels from a JSONL file.

        Args:
            jsonl_path:
                Path to the JSONL file containing labels. Each line is a JSON
                object keyed by its ``"id"`` field, e.g.
                {"id": "utt_0001", "text": "transcription text"}
                Lines without an ``"id"`` field are ignored.

        Raises:
            FileNotFoundError: If ``jsonl_path`` does not exist.
        """
        self._labels = {}
        self.path = jsonl_path
        if not os.path.exists(jsonl_path):
            raise FileNotFoundError(f"Label jsonl file {jsonl_path} does not exist.")
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                item = json.loads(line)
                if "id" in item:
                    self._labels[item["id"]] = item

    def __getitem__(self, key):
        """Return the full label record whose ``"id"`` equals ``key``."""
        return self._labels[key]


class IterableDataReader:
    "Interfaces for classes reading data."

    sample_rate: int

    def set_epoch(self, epoch: int):
        raise NotImplementedError

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        raise NotImplementedError

    def __len__(self) -> int:
        raise NotImplementedError


class WrappedIterableDataset(IterableDataset):
    "IterableDataset interfaces in this project."

    def set_epoch(self, epoch: int):
        raise NotImplementedError

    def __iter__(self) -> Iterator[List[Dict[str, Any]]]:
        raise NotImplementedError


class WebDatasetReader(IterableDataReader):
    """Iterate decoded samples from a list of WebDataset tar shards."""

    def __init__(
        self,
        manifests: List[Tuple[str, str, int, float]],
        evaluation: bool = False,
        shuffle_buffer_size: int = 20000,
        sample_rate: int = 24000,
    ):
        """
        Args:
            manifests: ``(tar_path, label_jsonl_path, num_items, num_seconds)``
                tuples, one per shard.
            evaluation: If True, neither shards nor samples are shuffled.
            shuffle_buffer_size: Size of the sample shuffle buffer.
            sample_rate: Target sample rate handed to the sample decoder.
        """
        self.shuffle_buffer_size = shuffle_buffer_size
        self.evaluation = evaluation
        self.epoch = 0

        self.orig_urls = []
        self.tar_to_label = {}
        self.num_items = 0
        self.num_seconds = 0.0
        for tar_path, label_jsonl_path, num_items, num_seconds in manifests:
            self.orig_urls.append(tar_path)
            self.tar_to_label[tar_path] = label_jsonl_path
            self.num_items += num_items
            self.num_seconds += num_seconds
        self.urls = self.orig_urls.copy()
        self.sample_decoder = SampleDecoder(
            tar_to_label=self.tar_to_label,
            sample_rate=sample_rate,
        )
        self.sample_rate = sample_rate

    def set_epoch(self, epoch: int):
        """Set the epoch and reshuffle the shard order from it.

        The shard list is always rebuilt from the original order, so the
        result depends only on ``epoch``. Nothing is shuffled in evaluation
        mode.
        """
        self.epoch = epoch
        self.urls = self.orig_urls.copy()
        if not self.evaluation:
            random.Random(epoch).shuffle(self.urls)

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        """Return an iterator over decoded samples of the current shard order."""
        dataset = wds.WebDataset(
            self.urls,
            shardshuffle=False,
            workersplitter=wds.split_by_worker,
            nodesplitter=wds.split_by_node,
        )

        pipeline = dataset.decode().map(self.sample_decoder)
        if not self.evaluation:
            pipeline = pipeline.shuffle(self.shuffle_buffer_size, seed=self.epoch)
        return iter(pipeline)

    def __len__(self) -> int:
        """Return the total ``num_items`` declared by the manifests."""
        return self.num_items


class JsonlDatasetReader(IterableDataReader):
    """Read raw JSONL and load audio files, matching WebDatasetReader output format.

    Each JSONL line should be a JSON object with at least:
        {"id": "...", "audio_path": "/path/to/audio.wav", ...}

    Yields dicts of the form: {"audio": Tensor(1, T), "label": dict}
    """

    def __init__(
        self,
        jsonl_path: str,
        sample_rate: int = 24_000,
        shuffle: bool = True,
        shuffle_seed: int = 42,
        normalize_audio: bool = True,
    ):
        self.jsonl_path = jsonl_path
        self.sample_rate = sample_rate
        self.shuffle = shuffle
        self.shuffle_seed = shuffle_seed
        self.normalize_audio = normalize_audio

    def set_epoch(self, epoch: int):
        """Use the epoch number as the shuffle seed of the next iteration."""
        self.shuffle_seed = epoch

    def _read_lines(self) -> list[dict]:
        """Read every JSONL entry into memory, shuffled when enabled.

        The shuffle uses a private ``random.Random`` seeded with
        ``shuffle_seed``. This yields the same order as seeding the global
        generator would, but leaves the process-wide ``random`` state (used
        by other components) untouched.
        """
        entries = []
        with open(self.jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    entries.append(json.loads(line))
        if self.shuffle:
            random.Random(self.shuffle_seed).shuffle(entries)
            logging.info(
                f"Shuffled {len(entries)} JSONL entries (seed={self.shuffle_seed})"
            )
        return entries

    def _stream_lines(self):
        """Lazily yield the JSONL entries in file order."""
        with open(self.jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)

    def __iter__(self):
        """Yield ``{"audio": waveform, "label": meta}`` for each usable entry.

        Entries are split round-robin across distributed ranks and then
        across DataLoader workers. Entries whose ``audio_path`` is missing,
        does not exist, or fails to load are logged and skipped.
        """
        source = self._read_lines() if self.shuffle else self._stream_lines()

        # Split data across distributed ranks (multi-GPU / DDP). The
        # availability check comes first because builds without distributed
        # support do not provide ``is_initialized``.
        if dist.is_available() and dist.is_initialized():
            rank = dist.get_rank()
            world_size = dist.get_world_size()
            source = [item for i, item in enumerate(source) if i % world_size == rank]

        # Split data across DataLoader workers to avoid duplication
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is not None:
            source = (
                item
                for i, item in enumerate(source)
                if i % worker_info.num_workers == worker_info.id
            )

        for meta in source:
            audio_path = meta.get("audio_path")
            if not audio_path or not os.path.exists(audio_path):
                logging.warning(
                    f"Skipping {meta.get('id', '?')}: audio_path missing or not found"
                )
                continue
            try:
                waveform = torch.from_numpy(
                    load_audio(audio_path, self.sample_rate)
                )
                if self.normalize_audio:
                    # Peak-normalise; the epsilon avoids division by zero.
                    waveform = (waveform / (waveform.abs().max() + 1e-7)) * 0.9
                meta["audio_duration"] = waveform.shape[1] / self.sample_rate
                yield {"audio": waveform, "label": meta}
            except Exception as e:
                logging.warning(f"Skipping {meta.get('id', '?')}: {e}")


class MuxWebDatasetReader(IterableDataReader):
    """Randomly interleave samples from several ``WebDatasetReader`` objects."""

    def __init__(
        self,
        readers: List[WebDatasetReader],
        weights: Optional[List[float]] = None,
        stop_early: bool = False,
        seed: int = 0,
    ):
        """
        Args:
            readers: The readers to multiplex (at least two).
            weights: Optional sampling weight per reader; see
                ``LazyIteratorMultiplexer`` for the default.
            stop_early: Stop as soon as any reader is exhausted.
            seed: Seed of the generator that picks the next reader.
        """
        self.readers = readers
        self.stop_early = stop_early
        self.mux_iterator = LazyIteratorMultiplexer(
            *readers,
            stop_early=stop_early,
            weights=weights,
            seed=seed,
        )

    def set_epoch(self, epoch: int):
        """Forward the epoch number to every wrapped reader."""
        for reader in self.readers:
            reader.set_epoch(epoch)

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        return iter(self.mux_iterator)


class LazyIteratorMultiplexer:
    """
    A wrapper over multiple iterables that, at each step, randomly selects
    the iterable used to yield the next item.

    Since the iterables might be of different length, a ``weights`` parameter
    lets the user decide which iterables should be sampled more frequently
    than others. Without explicit weights, each iterable is weighted by its
    share of the total length; if a length is unavailable or all lengths are
    zero, uniform weights are used.

    When an iterable is exhausted, sampling continues from the others until
    all are exhausted, unless ``stop_early`` is set to ``True``.
    """

    def __init__(
        self,
        *iterators: IterableDataReader,
        stop_early: bool = False,
        weights: Optional[List[float]] = None,
        seed: int = 0,
    ) -> None:
        self.iterators = list(iterators)
        self.stop_early = stop_early
        self.seed = seed

        assert (
            len(self.iterators) > 1
        ), "There have to be at least two iterables to multiplex."

        if weights is None:
            self.weights = self._length_weights(self.iterators)
        else:
            self.weights = weights

        assert len(self.iterators) == len(self.weights)

    @staticmethod
    def _length_weights(iterators) -> List[float]:
        """Return length-proportional weights, or uniform ones as a fallback.

        ``hasattr(it, "__len__")`` is not a usable test here: the
        ``IterableDataReader`` base class defines ``__len__`` but raises
        ``NotImplementedError``, so the length is probed by calling it.
        """
        uniform = [1] * len(iterators)
        try:
            lengths = [len(it) for it in iterators]
        except (TypeError, NotImplementedError):
            return uniform
        total_length = sum(lengths)
        if total_length <= 0:
            # Avoid dividing by zero when every source reports no items.
            return uniform
        return [length / total_length for length in lengths]

    def __iter__(self):
        """Yield items, drawing the source of each one at random by weight."""
        # A fresh generator per iteration makes every pass reproducible.
        rng = random.Random(self.seed)
        iters = [iter(it) for it in self.iterators]
        exhausted = [False for _ in range(len(iters))]

        def should_continue():
            if self.stop_early:
                return not any(exhausted)
            else:
                return not all(exhausted)

        while should_continue():
            active_indexes, active_weights = zip(
                *[
                    (i, w)
                    for i, (is_exhausted, w) in enumerate(zip(exhausted, self.weights))
                    if not is_exhausted
                ]
            )
            if any(active_weights):
                idx = rng.choices(active_indexes, weights=active_weights, k=1)[0]
            else:
                # Only zero-weight sources are left; ``choices`` would raise,
                # so drain them with a uniform pick instead.
                idx = rng.choice(active_indexes)
            selected = iters[idx]
            try:
                item = next(selected)
                yield item
            except StopIteration:
                exhausted[idx] = True
                continue

    def __len__(self) -> int:
        """Return the summed length of all wrapped iterables."""
        return sum(len(iterator) for iterator in self.iterators)
class OmniVoiceSampleProcessor:
    """Convert one raw training sample into model-ready tensors.

    A sample is a dict holding a ``"label"`` dict (``text`` and optionally
    ``text_pinyin``, ``language_id``, ``instruct``, ``clean_start_token_idx``)
    and an ``"audio_tokens"`` tensor indexed as ``[channel, time]``.
    The processor builds a style prompt and a text prompt, masks part of the
    audio tokens and concatenates everything along the time axis.
    """

    def __init__(
        self,
        text_tokenizer: Any,
        num_channels: int,
        audio_mask_id: int,
        prompt_ratio_range: tuple,
        mask_ratio_range: tuple,
        drop_cond_ratio: float,
        language_ratio: float,
        use_pinyin_ratio: float,
        instruct_ratio: float,
        only_instruct_ratio: float,
    ):
        # Tokenization / masking configuration.
        self.text_tokenizer = text_tokenizer
        self.num_channels = num_channels
        self.audio_mask_id = audio_mask_id
        self.prompt_ratio_range = prompt_ratio_range
        self.mask_ratio_range = mask_ratio_range
        self.drop_cond_ratio = drop_cond_ratio

        # Probabilities of the optional conditioning signals.
        self.language_ratio = language_ratio
        self.use_pinyin_ratio = use_pinyin_ratio
        self.instruct_ratio = instruct_ratio
        self.only_instruct_ratio = only_instruct_ratio

    def _tokenize(self, text: str) -> torch.Tensor:
        """Tokenize ``text`` and repeat the ids once per audio channel."""
        token_ids = self.text_tokenizer(text, return_tensors="pt").input_ids
        return token_ids.repeat(self.num_channels, 1)

    def __call__(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Build ``input_ids``, ``labels``, ``audio_mask`` and ``length``.

        Labels are ``-100`` everywhere except on masked audio positions, so
        the loss is only computed on tokens the model has to reconstruct.
        """
        label = sample["label"]

        # clean_start_token_idx is only present for prompt-denoising training,
        # where the prompt region is noisy and the model must recover the
        # clean tokens; it marks where the clean generated tokens start.
        # Conditioning is never dropped for such samples.
        denoise = "clean_start_token_idx" in label
        drop_cond = (not denoise) and random.uniform(0, 1) < self.drop_cond_ratio

        # Defaults correspond to the "conditioning dropped" case.
        prompt_ratio = 0.0
        use_language = False
        use_instruct = False
        if not drop_cond:
            prompt_ratio = random.uniform(*self.prompt_ratio_range)
            use_language = random.uniform(0, 1) < self.language_ratio
            use_instruct = random.uniform(0, 1) < self.instruct_ratio
            # Instruct-only samples get no audio prompt at all.
            if use_instruct and random.uniform(0, 1) < self.only_instruct_ratio:
                prompt_ratio = 0.0
        drop_text = drop_cond

        mask_ratio = random.uniform(*self.mask_ratio_range)

        # --- Style ---
        language = label.get("language_id", "None") if use_language else "None"
        instruct = label.get("instruct", "None") if use_instruct else "None"
        style_parts = []
        if denoise:
            style_parts.append("<|denoise|>")
        style_parts.append(f"<|lang_start|>{language}<|lang_end|>")
        style_parts.append(f"<|instruct_start|>{instruct}<|instruct_end|>")
        style_inputs = self._tokenize("".join(style_parts))
        # The style prompt never contributes to the loss.
        style_labels = torch.full(style_inputs.shape, -100)

        # --- Text ---
        use_pinyin = (
            "text_pinyin" in label
            and random.uniform(0, 1) < self.use_pinyin_ratio
        )
        text = label["text_pinyin"] if use_pinyin else label["text"]
        text_inputs = self._tokenize(f"<|text_start|>{text}<|text_end|>")
        # The text prompt never contributes to the loss either.
        text_labels = torch.full(text_inputs.shape, -100)

        # --- Audio ---
        audio_tokens = sample["audio_tokens"].long()
        if denoise:
            prompt_length = label["clean_start_token_idx"]
        else:
            prompt_length = int(audio_tokens.shape[1] * prompt_ratio)

        audio_inputs = audio_tokens.clone()
        audio_labels = audio_tokens.clone()

        # Randomly mask the region after the prompt; only masked positions
        # keep their label, every other position is ignored by the loss.
        token_mask = torch.rand(audio_tokens[:, prompt_length:].shape) < mask_ratio
        audio_inputs[:, prompt_length:].masked_fill_(token_mask, self.audio_mask_id)
        audio_labels[:, prompt_length:].masked_fill_(~token_mask, -100)
        if not drop_cond:
            audio_labels[:, :prompt_length] = -100  # No loss on prompt region

        # --- Concatenation ---
        if drop_text:
            input_ids, labels = audio_inputs, audio_labels
            total_length = input_ids.shape[1]
            audio_mask = torch.ones(total_length, dtype=torch.bool)
        else:
            input_ids = torch.cat([style_inputs, text_inputs, audio_inputs], dim=1)
            labels = torch.cat([style_labels, text_labels, audio_labels], dim=1)
            total_length = input_ids.shape[1]
            audio_start_idx = style_inputs.shape[1] + text_inputs.shape[1]
            audio_mask = torch.zeros(total_length, dtype=torch.bool)
            audio_mask[audio_start_idx:] = True

        return {
            "input_ids": input_ids,  # [C, L]
            "labels": labels,  # [C, L]
            "audio_mask": audio_mask,  # [L]
            "length": total_length,
        }
class OmniVoiceSimpleSampleProcessor:
    """Minimal sample processor: text prompt + masked audio tokens only.

    Unlike ``OmniVoiceSampleProcessor`` it has no language tag, no
    instruction and no denoising prompt. It is not used for training (the
    full processor covers this case) and is kept as a reference
    implementation that shows the basic logic.
    """

    def __init__(
        self,
        text_tokenizer: Any,
        num_channels: int,
        audio_mask_id: int,
        prompt_ratio_range: tuple,
        mask_ratio_range: tuple,
        drop_cond_ratio: float,
    ):
        self.text_tokenizer = text_tokenizer
        self.num_channels = num_channels
        self.audio_mask_id = audio_mask_id
        self.prompt_ratio_range = prompt_ratio_range
        self.mask_ratio_range = mask_ratio_range
        self.drop_cond_ratio = drop_cond_ratio

    def __call__(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Build ``input_ids``, ``labels``, ``audio_mask`` and ``length``."""
        # Random decisions, drawn in a fixed order.
        drop_cond = random.uniform(0, 1) < self.drop_cond_ratio
        mask_ratio = random.uniform(*self.mask_ratio_range)
        prompt_ratio = (
            0.0 if drop_cond else random.uniform(*self.prompt_ratio_range)
        )

        # --- Text ---
        wrapped_text = f"<|text_start|>{sample['label']['text']}<|text_end|>"
        text_ids = self.text_tokenizer(wrapped_text, return_tensors="pt").input_ids
        text_inputs = text_ids.repeat(self.num_channels, 1)
        text_labels = torch.full(text_inputs.shape, -100)  # no loss on text

        # --- Audio ---
        audio_tokens = sample["audio_tokens"].long()
        prompt_length = int(audio_tokens.shape[1] * prompt_ratio)
        audio_inputs = audio_tokens.clone()
        audio_labels = audio_tokens.clone()

        # Mask a random subset of the post-prompt region; the loss is only
        # computed on the masked positions.
        token_mask = torch.rand(audio_tokens[:, prompt_length:].shape) < mask_ratio
        audio_inputs[:, prompt_length:].masked_fill_(token_mask, self.audio_mask_id)
        audio_labels[:, prompt_length:].masked_fill_(~token_mask, -100)
        if not drop_cond:
            # No loss on prompt region
            audio_labels[:, :prompt_length] = -100

        # --- Concatenation ---
        if drop_cond:
            # Unconditional sample: audio only, every position is audio.
            input_ids, labels = audio_inputs, audio_labels
            total_length = input_ids.shape[1]
            audio_mask = torch.ones(total_length, dtype=torch.bool)
        else:
            input_ids = torch.cat([text_inputs, audio_inputs], dim=1)
            labels = torch.cat([text_labels, audio_labels], dim=1)
            total_length = input_ids.shape[1]
            audio_mask = torch.zeros(total_length, dtype=torch.bool)
            audio_mask[text_inputs.shape[1]:] = True

        return {
            "input_ids": input_ids,  # [C, L]
            "labels": labels,  # [C, L]
            "audio_mask": audio_mask,  # [L]
            "length": total_length,
        }
class ECAPA_TDNN_WAVLM(nn.Module):
    """ECAPA-TDNN embedding network fed by a WavLM feature extractor.

    The hidden states returned by the feature extractor are combined with a
    learned softmax weighting, instance-normalized, passed through one
    Conv1d block and three SE-Res2 blocks, pooled with attentive statistics
    pooling and projected to an ``emb_dim``-dimensional vector.
    """

    def __init__(
        self,
        feat_dim=80,
        channels=512,
        emb_dim=192,
        global_context_att=False,
        sr=16000,
        ssl_model_path=None,
    ):
        super().__init__()
        self.sr = sr

        # Load the WavLM extractor either from a local checkout/checkpoint
        # or from the s3prl hub entry.
        if ssl_model_path is not None:
            self.feature_extract = torch.hub.load(
                os.path.dirname(ssl_model_path),
                "wavlm_local",
                source="local",
                ckpt=os.path.join(ssl_model_path, "wavlm_large.pt"),
            )
        else:
            self.feature_extract = torch.hub.load("s3prl/s3prl", "wavlm_large")

        # For a 24-layer encoder, switch off fp32 attention on layers 23 and
        # 11 when the attention module exposes that flag.
        encoder_layers = self.feature_extract.model.encoder.layers
        for layer_idx in (23, 11):
            if len(encoder_layers) == 24 and hasattr(
                encoder_layers[layer_idx].self_attn, "fp32_attention"
            ):
                encoder_layers[layer_idx].self_attn.fp32_attention = False

        # One learnable weight per hidden-state layer of the extractor.
        self.feat_num = self.get_feat_num()
        self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))

        self.instance_norm = nn.InstanceNorm1d(feat_dim)
        self.channels = [channels] * 4 + [1536]

        def se_block(in_idx, dilation):
            # Padding equals dilation so the time length is preserved
            # with kernel_size=3.
            return SE_Res2Block(
                self.channels[in_idx],
                self.channels[in_idx + 1],
                kernel_size=3,
                stride=1,
                padding=dilation,
                dilation=dilation,
                scale=8,
                se_bottleneck_dim=128,
            )

        self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
        self.layer2 = se_block(0, 2)
        self.layer3 = se_block(1, 3)
        self.layer4 = se_block(2, 4)

        # The outputs of layer2..layer4 are concatenated before this conv.
        cat_channels = channels * 3
        self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
        self.pooling = AttentiveStatsPool(
            self.channels[-1],
            attention_channels=128,
            global_context_att=global_context_att,
        )
        # Pooling returns mean and std, hence twice the channel count.
        self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
        self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)

    def get_feat_num(self):
        """Return how many hidden-state tensors the extractor produces.

        Runs the extractor once on ``self.sr`` random samples and counts the
        entries of ``features["hidden_states"]`` (1 if it is a single tensor).
        """
        self.feature_extract.eval()
        device = next(self.feature_extract.parameters()).device
        probe = [torch.randn(self.sr).to(device)]
        with torch.no_grad():
            features = self.feature_extract(probe)
        hidden_states = features["hidden_states"]
        if isinstance(hidden_states, (list, tuple)):
            return len(hidden_states)
        return 1

    def get_feat(self, x):
        """Extract, layer-weight and normalize features for a batch ``x``."""
        # The extractor itself is run without gradient tracking.
        with torch.no_grad():
            x = self.feature_extract(list(x))

        hidden_states = x["hidden_states"]
        if isinstance(hidden_states, (list, tuple)):
            stacked = torch.stack(hidden_states, dim=0)
        else:
            stacked = hidden_states.unsqueeze(0)

        # Softmax over layers, broadcast over the three remaining axes.
        layer_weights = F.softmax(self.feature_weight, dim=-1)[:, None, None, None]
        feats = (layer_weights * stacked).sum(dim=0)
        feats = feats.transpose(1, 2) + 1e-6

        return self.instance_norm(feats)

    def forward(self, x):
        """Map a batch of waveforms to embeddings."""
        feats = self.get_feat(x)

        out1 = self.layer1(feats)
        out2 = self.layer2(out1)
        out3 = self.layer3(out2)
        out4 = self.layer4(out3)

        # Aggregate the three SE-Res2 outputs along the channel axis.
        merged = torch.cat([out2, out3, out4], dim=1)
        merged = F.relu(self.conv(merged))
        pooled = self.bn(self.pooling(merged))
        return self.linear(pooled)
# part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN


class Res2Conv1dReluBn(nn.Module):
    """Res2-style Conv1d + ReLU + BatchNorm1d over channel groups.

    The input channels are split into ``scale`` groups of ``width`` channels.
    Each of the first ``nums`` groups is convolved after adding the previous
    group's output; when ``scale != 1`` the last group is passed through
    unchanged. ``in_channels == out_channels == channels``.
    """

    def __init__(
        self,
        channels,
        kernel_size=1,
        stride=1,
        padding=0,
        dilation=1,
        bias=True,
        scale=4,
    ):
        super().__init__()
        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
        self.scale = scale
        self.width = channels // scale
        self.nums = scale if scale == 1 else scale - 1

        conv_layers, bn_layers = [], []
        for _ in range(self.nums):
            conv_layers.append(
                nn.Conv1d(
                    self.width,
                    self.width,
                    kernel_size,
                    stride,
                    padding,
                    dilation,
                    bias=bias,
                )
            )
            bn_layers.append(nn.BatchNorm1d(self.width))
        self.convs = nn.ModuleList(conv_layers)
        self.bns = nn.ModuleList(bn_layers)

    def forward(self, x):
        groups = torch.split(x, self.width, 1)
        outputs = []
        running = None
        for idx, (conv, bn) in enumerate(zip(self.convs, self.bns)):
            # Hierarchical residual: feed the previous group's output forward.
            running = groups[idx] if idx == 0 else running + groups[idx]
            # Order: conv -> relu -> bn
            running = bn(F.relu(conv(running)))
            outputs.append(running)
        if self.scale != 1:
            # The last group bypasses the convolutions.
            outputs.append(groups[self.nums])
        return torch.cat(outputs, dim=1)


class Conv1dReluBn(nn.Module):
    """Conv1d followed by ReLU and then BatchNorm1d (in that order)."""

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size=1,
        stride=1,
        padding=0,
        dilation=1,
        bias=True,
    ):
        super().__init__()
        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding,
            dilation,
            bias=bias,
        )
        self.bn = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        activated = F.relu(self.conv(x))
        return self.bn(activated)


class SE_Connect(nn.Module):
    """Squeeze-and-excitation gate for the 1D case."""

    def __init__(self, channels, se_bottleneck_dim=128):
        super().__init__()
        self.linear1 = nn.Linear(channels, se_bottleneck_dim)
        self.linear2 = nn.Linear(se_bottleneck_dim, channels)

    def forward(self, x):
        # Squeeze: average over the last (time) axis.
        squeezed = x.mean(dim=2)
        # Excite: bottleneck MLP ending in a per-channel sigmoid gate.
        gate = torch.sigmoid(self.linear2(F.relu(self.linear1(squeezed))))
        return x * gate.unsqueeze(2)
class SE_Res2Block(nn.Module):
    """SE-Res2Block of the ECAPA-TDNN architecture.

    Applies Conv1dReluBn -> Res2Conv1dReluBn -> Conv1dReluBn -> SE_Connect
    and adds a residual connection. When ``in_channels != out_channels`` the
    residual goes through a 1x1 convolution so the shapes match.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride,
        padding,
        dilation,
        scale,
        se_bottleneck_dim,
    ):
        super().__init__()
        # Attribute names are kept as-is: they define the state_dict keys.
        self.Conv1dReluBn1 = Conv1dReluBn(
            in_channels, out_channels, kernel_size=1, stride=1, padding=0
        )
        self.Res2Conv1dReluBn = Res2Conv1dReluBn(
            out_channels, kernel_size, stride, padding, dilation, scale=scale
        )
        self.Conv1dReluBn2 = Conv1dReluBn(
            out_channels, out_channels, kernel_size=1, stride=1, padding=0
        )
        self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)

        # Projection for the residual path; None means identity.
        self.shortcut = None
        if in_channels != out_channels:
            self.shortcut = nn.Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
            )

    def forward(self, x):
        # Test against None explicitly rather than relying on the truth
        # value of an nn.Module.
        residual = x if self.shortcut is None else self.shortcut(x)

        x = self.Conv1dReluBn1(x)
        x = self.Res2Conv1dReluBn(x)
        x = self.Conv1dReluBn2(x)
        x = self.SE_Connect(x)

        return x + residual
class AttentiveStatsPool(nn.Module):
    """Attentive weighted mean and standard deviation pooling.

    Reduces the last axis of the input with per-channel attention weights and
    returns the weighted mean and weighted standard deviation concatenated
    along the channel axis (so the output has ``2 * in_dim`` channels).
    """

    def __init__(self, in_dim, attention_channels=128, global_context_att=False):
        super().__init__()
        self.global_context_att = global_context_att

        # Use Conv1d with stride == 1 rather than Linear,
        # then we don't need to transpose inputs.
        # With global context the attention input is [x, mean, std].
        attention_in_dim = in_dim * 3 if global_context_att else in_dim
        self.linear1 = nn.Conv1d(
            attention_in_dim, attention_channels, kernel_size=1
        )  # equals W and b in the paper
        self.linear2 = nn.Conv1d(
            attention_channels, in_dim, kernel_size=1
        )  # equals V and k in the paper

    def forward(self, x):
        attention_input = x
        if self.global_context_att:
            # Broadcast utterance-level statistics to every frame.
            context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
            context_std = torch.sqrt(
                torch.var(x, dim=-1, keepdim=True) + 1e-10
            ).expand_as(x)
            attention_input = torch.cat((x, context_mean, context_std), dim=1)

        # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
        hidden = torch.tanh(self.linear1(attention_input))
        alpha = torch.softmax(self.linear2(hidden), dim=2)

        mean = torch.sum(alpha * x, dim=2)
        # Weighted variance as E[x^2] - E[x]^2, clamped before the sqrt.
        variance = torch.sum(alpha * (x**2), dim=2) - mean**2
        std = torch.sqrt(variance.clamp(min=1e-9))
        return torch.cat([mean, std], dim=1)
class UTMOS22Strong(nn.Module):
    """Saeki_2022 paper's `UTMOS strong learner` inference model
    (w/o Phoneme encoder)."""

    def __init__(self):
        """Build the SSL encoder, the two embeddings, the BLSTM and the projection."""
        super().__init__()

        ssl_dim, domain_dim, judge_dim = 768, 128, 128
        rnn_hidden, proj_hidden = 512, 2048

        # SSL/DataDomainEmb/JudgeIdEmb/BLSTM/Projection
        self.wav2vec2 = Wav2Vec2Model()
        # The embeddings are created uninitialized and frozen.
        self.domain_emb = nn.Parameter(
            data=torch.empty(1, domain_dim), requires_grad=False
        )
        self.judge_emb = nn.Parameter(
            data=torch.empty(1, judge_dim), requires_grad=False
        )
        self.blstm = nn.LSTM(
            input_size=ssl_dim + domain_dim + judge_dim,
            hidden_size=rnn_hidden,
            batch_first=True,
            bidirectional=True,
        )
        self.projection = nn.Sequential(
            nn.Linear(rnn_hidden * 2, proj_hidden),
            nn.ReLU(),
            nn.Linear(proj_hidden, 1),
        )

    def forward(self, wave: Tensor, sr: int) -> Tensor:  # pylint: disable=invalid-name
        """wave-to-score :: (B, T) -> (B,)

        ``sr`` is accepted for interface compatibility but is not used here.
        """
        # Feature extraction :: (B, T) -> (B, Frame, Feat)
        unit_series = self.wav2vec2(wave)
        batch_size, num_frames, _ = unit_series.size()

        # Expand the single-row embeddings over batch and time ::
        # (B=1, Feat) -> (B=bsz, Frame=frm, Feat)
        domain_series = self.domain_emb.unsqueeze(1).expand(batch_size, num_frames, -1)
        judge_series = self.judge_emb.unsqueeze(1).expand(batch_size, num_frames, -1)

        # Concatenate along the feature axis ::
        # (B, Frame, f1) + (B, Frame, f2) + (B, Frame, f3) -> (B, Frame, f1+f2+f3)
        cat_series = torch.cat([unit_series, domain_series, judge_series], dim=2)

        # Frame-scale score :: (B, Frame, Feat) -> (B, Frame, 1)
        feat_series, _ = self.blstm(cat_series)
        score_series = self.projection(feat_series)

        # Utterance-scale score :: time average, then affine map (x * 2 + 3)
        return score_series.mean(dim=1).squeeze(1) * 2 + 3
class Wav2Vec2Model(nn.Module):
    """Wav2Vec2: convolutional feature encoder followed by a Transformer."""

    def __init__(self):
        super().__init__()

        feat_h1, feat_h2 = 512, 768
        # (out_channels, kernel_size, stride) of each encoder convolution.
        feature_enc_layers = (
            [(feat_h1, 10, 5)] + [(feat_h1, 3, 2)] * 4 + [(feat_h1, 2, 2)] * 2
        )

        self.feature_extractor = ConvFeatureExtractionModel(
            conv_layers=feature_enc_layers
        )
        self.layer_norm = nn.LayerNorm(feat_h1)
        self.post_extract_proj = nn.Linear(feat_h1, feat_h2)
        # Registered but not applied in forward().
        self.dropout_input = nn.Dropout(0.1)
        self.encoder = TransformerEncoder(feat_h2)

        # Remnants: not used in forward(), kept as a registered parameter.
        self.mask_emb = nn.Parameter(torch.FloatTensor(feat_h2))

    def forward(self, source: Tensor):
        """FeatureEncoder + ContextTransformer"""

        # Feature encoding :: (B, T) -> (B, Feat, Frame) -> (B, Frame, Feat)
        features = self.feature_extractor(source)
        features = features.transpose(1, 2)
        features = self.layer_norm(features)
        features = self.post_extract_proj(features)

        # Context transformer
        return self.encoder(features)


class ConvFeatureExtractionModel(nn.Module):
    """Feature Encoder: a stack of strided 1D convolutions.

    Args:
        conv_layers: one ``(out_channels, kernel_size, stride)`` tuple per
            layer. Only the first layer is followed by a GroupNorm.
    """

    def __init__(self, conv_layers: List[Tuple[int, int, int]]):
        super().__init__()

        def block(
            n_in: int, n_out: int, k: int, stride: int, is_group_norm: bool = False
        ):
            layers = [
                nn.Conv1d(n_in, n_out, k, stride=stride, bias=False),
                nn.Dropout(p=0.0),
            ]
            if is_group_norm:
                # Size the norm from this block's own output width rather
                # than from a variable of the enclosing scope.
                layers.append(nn.GroupNorm(n_out, n_out, affine=True))
            layers.append(nn.GELU())
            return nn.Sequential(*layers)

        in_d = 1  # the raw waveform has a single channel
        self.conv_layers = nn.ModuleList()
        for i, (dim, k, stride) in enumerate(conv_layers):
            self.conv_layers.append(block(in_d, dim, k, stride, is_group_norm=i == 0))
            in_d = dim

    def forward(self, series: Tensor) -> Tensor:
        """:: (B, T) -> (B, Feat, Frame)"""

        series = series.unsqueeze(1)
        for conv in self.conv_layers:
            series = conv(series)

        return series
class TransformerEncoder(nn.Module):
    """Transformer: convolutional positional embedding + 12 encoder layers."""

    def build_encoder_layer(self, feat: int):
        """Layer builder."""
        return TransformerSentenceEncoderLayer(
            embedding_dim=feat,
            ffn_embedding_dim=3072,
            num_attention_heads=12,
            activation_fn="gelu",
            dropout=0.1,
            attention_dropout=0.1,
            activation_dropout=0.0,
            layer_norm_first=False,
        )

    def __init__(self, feat: int):
        super().__init__()

        # The time axis is padded to a multiple of this value in forward().
        self.required_seq_len_multiple = 2

        positional_conv = nn.utils.weight_norm(
            nn.Conv1d(feat, feat, kernel_size=128, padding=128 // 2, groups=16),
            name="weight",
            dim=2,
        )
        # SamePad drops the extra frame produced by the even kernel.
        self.pos_conv = nn.Sequential(positional_conv, SamePad(128), nn.GELU())
        self.layer_norm = nn.LayerNorm(feat)
        self.layers = nn.ModuleList([self.build_encoder_layer(feat) for _ in range(12)])

    def forward(self, x: Tensor) -> Tensor:
        """:: (B, T, Feat) -> (B, T, Feat)"""
        # Add the convolutional positional embedding, then normalize.
        positional = self.pos_conv(x.transpose(1, 2)).transpose(1, 2)
        x = self.layer_norm(x + positional)

        # pad to the sequence length dimension
        x, pad_length = pad_to_multiple(
            x, self.required_seq_len_multiple, dim=-2, value=0
        )
        padding_mask = None
        if pad_length > 0:
            # Mark the appended frames so attention ignores them as keys.
            padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool)
            padding_mask[:, -pad_length:] = True

        # :: (B, T, Feat) -> (T, B, Feat)
        x = x.transpose(0, 1)
        for layer in self.layers:
            x = layer(x, padding_mask)
        # :: (T, B, Feat) -> (B, T, Feat)
        x = x.transpose(0, 1)

        # undo padding
        if pad_length > 0:
            x = x[:, :-pad_length]

        return x


class SamePad(nn.Module):
    """Tail inverse padding: drop the last frame of the last axis."""

    def __init__(self, kernel_size: int):
        super().__init__()
        assert kernel_size % 2 == 0, "`SamePad` now support only even kernel."

    def forward(self, x: Tensor) -> Tensor:
        trimmed = x[:, :, :-1]
        return trimmed
def pad_to_multiple(
    x: Optional[Tensor], multiple: int, dim: int = -1, value: float = 0
) -> Tuple[Optional[Tensor], int]:
    """Tail-pad ``x`` along ``dim`` so its size becomes a multiple of ``multiple``.

    Args:
        x: Tensor to pad, or ``None``.
        multiple: The size along ``dim`` is rounded up to a multiple of this.
        dim: Axis to pad; negative and non-negative indices are both accepted.
        value: Fill value for the padded tail.

    Returns:
        ``(padded, pad_length)``. ``(None, 0)`` when ``x`` is ``None``; the
        unchanged ``x`` and ``0`` when no padding is needed.
    """
    if x is None:
        return None, 0

    # Integer arithmetic avoids float rounding for very long sequences.
    remainder = -x.size(dim) % multiple
    if remainder == 0:
        return x, 0

    # F.pad lists (left, right) pairs starting from the LAST axis, so the
    # number of leading zero pairs is derived from the negative axis index.
    # Normalizing here keeps a non-negative ``dim`` from padding the wrong axis.
    neg_dim = dim if dim < 0 else dim - x.dim()
    pad_offset = (0,) * (-1 - neg_dim) * 2

    return F.pad(x, (*pad_offset, 0, remainder), value=value), remainder


class TransformerSentenceEncoderLayer(nn.Module):
    """Transformer Encoder Layer used in BERT/XLM style pre-trained models.

    Post-norm layout only: residual self-attention followed by LayerNorm,
    then a residual feed-forward block followed by LayerNorm.
    """

    def __init__(
        self,
        embedding_dim: int,
        ffn_embedding_dim: int,
        num_attention_heads: int,
        activation_fn: str,
        dropout: float,
        attention_dropout: float,
        activation_dropout: float,
        layer_norm_first: bool,
    ) -> None:
        super().__init__()

        assert layer_norm_first is False, "`layer_norm_first` is fixed to `False`"
        assert activation_fn == "gelu", "`activation_fn` is fixed to `gelu`"

        feat = embedding_dim

        self.self_attn = MultiheadAttention(
            feat, num_attention_heads, attention_dropout
        )
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(activation_dropout)
        self.dropout3 = nn.Dropout(dropout)
        self.fc1 = nn.Linear(feat, ffn_embedding_dim)
        self.fc2 = nn.Linear(ffn_embedding_dim, feat)
        self.self_attn_layer_norm = nn.LayerNorm(feat)
        self.final_layer_norm = nn.LayerNorm(feat)

    def forward(self, x: Tensor, self_attn_padding_mask: Optional[Tensor]):
        # Res[Attn-Do]-LN
        attended = self.self_attn(x, x, x, self_attn_padding_mask)
        x = self.self_attn_layer_norm(x + self.dropout1(attended))

        # Res[SegFC-GELU-Do-SegFC-Do]-LN
        hidden = self.dropout2(F.gelu(self.fc1(x)))
        hidden = self.dropout3(self.fc2(hidden))
        x = self.final_layer_norm(x + hidden)

        return x
class MultiheadAttention(nn.Module):
    """Multi-headed attention with separate q/k/v projection layers."""

    def __init__(self, embed_dim: int, num_heads: int, dropout: float):
        super().__init__()

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.p_dropout = dropout

        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=True)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True)

    def forward(
        self,
        query: Tensor,
        key: Tensor,
        value: Tensor,
        key_padding_mask: Optional[Tensor],
    ) -> Tensor:
        """
        Args:
            query :: (T, B, Feat)
            key_padding_mask :: (B, src_len) - mask to exclude keys that are pads
                , where padding elements are indicated by 1s.

        Returns:
            The attention output; attention weights are not computed.
        """
        # The functional API takes the three projection biases as one vector.
        packed_bias = torch.cat(
            (self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)
        )
        bool_mask = None if key_padding_mask is None else key_padding_mask.bool()

        attn_output, _ = F.multi_head_attention_forward(
            query=query,
            key=key,
            value=value,
            embed_dim_to_check=self.embed_dim,
            num_heads=self.num_heads,
            # Placeholder: the separate q/k/v weights below are used instead.
            in_proj_weight=torch.empty([0]),
            in_proj_bias=packed_bias,
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=self.p_dropout,
            out_proj_weight=self.out_proj.weight,
            out_proj_bias=self.out_proj.bias,
            # Always evaluated in inference mode, so dropout is never applied.
            training=False,
            key_padding_mask=bool_mask,
            need_weights=False,
            use_separate_proj_weight=True,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
        )
        return attn_output
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser of the UTMOS evaluation script.

    Returns:
        A parser with the required ``--wav-path``, ``--test-list`` and
        ``--model-dir`` options and the optional ``--extension``,
        ``--decode-path`` and ``--nj-per-gpu`` options.
    """
    parser = argparse.ArgumentParser(
        description="Calculate UTMOS score using UTMOS22Strong model."
    )
    parser.add_argument(
        "--wav-path",
        type=str,
        required=True,
        help="Path to the directory containing evaluated speech files.",
    )
    parser.add_argument(
        "--test-list",
        type=str,
        required=True,
        help="Path to the JSONL test list. Each line is a JSON object "
        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
    )
    parser.add_argument(
        "--model-dir",
        type=str,
        required=True,
        # Each fragment ends with a space so that the sentences (and the
        # URL) are not glued together in the rendered help text.
        help="Local path of our evaluation model repository. "
        "Download from https://huggingface.co/k2-fsa/TTS_eval_models. "
        "Will use 'tts_eval_models/mos/utmos22_strong_step7459_v1.pt'"
        " in this script",
    )
    parser.add_argument(
        "--extension",
        type=str,
        default="wav",
        help="Extension of the speech files. Default: wav",
    )
    parser.add_argument(
        "--decode-path",
        type=str,
        default=None,
        help="Path to the output file where UTMOS information will be saved. "
        "If not provided, results are only printed to console.",
    )
    parser.add_argument(
        "--nj-per-gpu",
        type=int,
        default=1,
        help="Number of worker processes to spawn per GPU.",
    )
    return parser
def get_device(rank: int = 0) -> torch.device:
    """Return ``cuda:<rank>`` and make it the current CUDA device."""
    assert torch.cuda.is_available(), "CUDA is required but not available."
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(rank)
    return device


def worker_init(
    rank_queue,
    model_path,
):
    """Initialize worker process with model and device.

    Takes a GPU rank from ``rank_queue``, loads the UTMOS22Strong weights
    from ``model_path`` and stores model, device and sample rate in the
    module-level ``worker_*`` globals read by ``run_utmos_worker``.
    """
    global worker_model, worker_device, worker_sr

    # Limit CPU threads per worker
    torch.set_num_threads(2)

    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] [Worker %(process)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    if rank_queue:
        rank = rank_queue.get()
    else:
        rank = -1

    worker_device = get_device(rank)
    worker_sr = 16000

    logging.debug(f"Initializing UTMOS worker on {worker_device}")

    # Initialize Model
    worker_model = UTMOS22Strong()
    try:
        # Load weights to CPU first, then move to device
        # NOTE(review): torch.load unpickles the file; consider
        # weights_only=True if the checkpoint is a plain state dict.
        weights = torch.load(model_path, map_location="cpu")
        worker_model.load_state_dict(weights)
    except Exception as e:
        logging.error(f"Failed to load model from {model_path}: {e}")
        raise

    worker_model.to(worker_device)
    worker_model.eval()


@torch.no_grad()
def run_utmos_worker(file_idx, wav_path, language_name):
    """Worker function to process a single audio file.

    Returns:
        ``(file_idx, wav_path, language_name, result, status)`` where
        ``status`` is ``"success"`` (``result`` is the float score) or
        ``"error"`` (``result`` is an error message). Exceptions are
        reported through the tuple instead of being raised.
    """
    try:
        if not os.path.exists(wav_path):
            message = f"File not found: {wav_path}"
            return file_idx, wav_path, language_name, message, "error"

        # Load and preprocess waveform
        speech = load_eval_waveform(wav_path, worker_sr, device=worker_device)

        # Compute score
        # UTMOS expects input shape (Batch, Time)
        batch = speech.unsqueeze(0)
        score = worker_model(batch, worker_sr)

        return file_idx, wav_path, language_name, score.item(), "success"

    except Exception as e:
        error_detail = (
            f"Error processing {wav_path}: {str(e)}\n"
            f"Traceback:\n{traceback.format_exc()}"
        )
        return file_idx, wav_path, language_name, error_detail, "error"
def main():
    """Score every wav of the test list with UTMOS and report averages.

    Steps:
      1. Parse the command line and check that the wav directory and the
         UTMOS checkpoint exist (exit status 1 otherwise).
      2. Build the list of ``(wav_path, language_name)`` pairs from the
         test list.
      3. Start ``num_gpus * nj_per_gpu`` worker processes, each taking one
         GPU index from a shared queue, and score every file.
      4. Log per-language, macro-average and overall scores, and mirror
         them to ``--decode-path`` when it is given.

    Raises:
        ValueError: If the test list cannot be read.
        AssertionError: If no GPU is visible.
        SystemExit: With status 1 on invalid inputs or when the worker pool
            fails.
    """
    parser = get_parser()
    args = parser.parse_args()

    # Main process thread setting
    torch.set_num_threads(2)

    mp.set_start_method("spawn", force=True)

    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    # Validate inputs
    if not os.path.isdir(args.wav_path):
        logging.error(f"Invalid directory: {args.wav_path}")
        sys.exit(1)

    model_path = os.path.join(args.model_dir, "mos/utmos22_strong_step7459_v1.pt")
    if not os.path.exists(model_path):
        logging.error(f"Model file not found at {model_path}")
        sys.exit(1)

    # Collect the files to score from the test list
    logging.info(f"Calculating UTMOS for {args.wav_path}")

    wav_files = []
    try:
        samples = read_test_list(args.test_list)
        for s in samples:
            language_name = s.get("language_name") or "unknown"
            eval_wav_path = os.path.join(args.wav_path, f"{s['id']}.{args.extension}")
            wav_files.append((eval_wav_path, language_name))
    except Exception as e:
        # Keep the original exception as the cause for easier debugging.
        raise ValueError(f"Error reading test list {args.test_list}: {e}") from e

    # Setup Parallel Processing
    num_gpus = torch.cuda.device_count()
    assert num_gpus > 0, "No GPU found. GPU is required."
    total_procs = num_gpus * args.nj_per_gpu

    logging.info(
        f"Starting evaluation with {total_procs} processes on {num_gpus} GPUs."
    )

    manager = mp.Manager()
    rank_queue = manager.Queue()

    # One queue entry per worker; each worker pops its GPU index on start-up.
    for rank in list(range(num_gpus)) * args.nj_per_gpu:
        rank_queue.put(rank)

    scores = []
    lang_stats = {}

    fout = None
    if args.decode_path:
        # dirname() is "" for a bare file name, and os.makedirs("") raises,
        # so only create the directory when there is one.
        out_dir = os.path.dirname(args.decode_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        fout = open(args.decode_path, "w", encoding="utf8")

    try:
        if fout:
            logging.info(f"Saving detailed UTMOS results to: {args.decode_path}")
            fout.write("Name\tUTMOS\n")

        try:
            with ProcessPoolExecutor(
                max_workers=total_procs,
                initializer=worker_init,
                initargs=(
                    rank_queue,
                    model_path,
                ),
            ) as executor:
                futures = [
                    executor.submit(run_utmos_worker, i, wav_path, language_name)
                    for i, (wav_path, language_name) in enumerate(wav_files)
                ]

                pbar = tqdm(
                    as_completed(futures), total=len(wav_files), desc="Evaluating UTMOS"
                )
                for future in pbar:
                    idx, path, language_name, result, status = future.result()
                    if status != "success":
                        pbar.write(f"!!! FAILED [File {idx}]: {path} | {result}")
                        continue

                    lang_stats.setdefault(language_name, []).append(result)
                    scores.append(result)
                    if fout:
                        if language_name == "unknown":
                            fout.write(f"{os.path.basename(path)}\t{result:.2f}\n")
                        else:
                            fout.write(
                                f"{language_name}\t{os.path.basename(path)}\t{result:.2f}\n"
                            )

        except (Exception, KeyboardInterrupt) as e:
            logging.critical(
                f"An unrecoverable error occurred: {e}. Terminating all processes."
            )
            detailed_error_info = traceback.format_exc()
            logging.error(f"--- DETAILED TRACEBACK ---\n{detailed_error_info}")
            sys.exit(1)

        print("-" * 50)

        if len(lang_stats) > 1:
            lang_means = []
            for lang in sorted(lang_stats.keys()):
                l_scores = lang_stats[lang]
                l_avg = np.mean(l_scores)
                lang_means.append(l_avg)
                l_count = len(l_scores)
                logging.info(f"[{lang}] UTMOS score: {l_avg:.3f} ({l_count} samples)")
                if fout:
                    fout.write(f"[{lang}] UTMOS: {l_avg:.3f} ({l_count} samples)\n")

            # Macro average: every language counts equally, whatever its size.
            macro_avg = np.mean(lang_means)
            logging.info(
                f"Macro-average UTMOS over {len(lang_stats)} languages: "
                f"{macro_avg:.3f}"
            )
            if fout:
                fout.write(
                    f"\nMacro-average UTMOS over {len(lang_stats)} languages: "
                    f"{macro_avg:.3f}\n"
                )

        if scores:
            avg_score = np.mean(scores)
            logging.info(f"Processed {len(scores)}/{len(wav_files)} files.")
            logging.info(f"UTMOS score: {avg_score:.2f}")
            if fout:
                fout.write(f"\nAverage UTMOS: {avg_score:.2f}\n")
        else:
            logging.error("No valid scores computed.")
        print("-" * 50)
    finally:
        # Close the report on every exit path, including sys.exit(1) above.
        if fout:
            fout.close()
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser of the SIM-o evaluation script.

    Returns:
        argparse.ArgumentParser: Parser with the required ``--wav-path``,
        ``--test-list`` and ``--model-dir`` options and the optional
        ``--extension``, ``--decode-path`` and ``--nj-per-gpu`` options.
    """
    parser = argparse.ArgumentParser(
        description="Calculate speaker similarity (SIM-o) score."
    )
    parser.add_argument(
        "--wav-path",
        type=str,
        required=True,
        help="Path to the directory containing evaluated speech files.",
    )
    parser.add_argument(
        "--test-list",
        type=str,
        required=True,
        help="Path to the JSONL test list. Each line is a JSON object "
        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
    )
    parser.add_argument(
        "--model-dir",
        type=str,
        required=True,
        # Adjacent literals are concatenated verbatim, so each piece must
        # carry its own trailing space (otherwise the URL fuses with "Will").
        help="Local path of our evaluation model repository. "
        "Download from https://huggingface.co/k2-fsa/TTS_eval_models. "
        "Will use 'tts_eval_models/speaker_similarity/wavlm_large_finetune.pth' "
        "and 'tts_eval_models/speaker_similarity/wavlm_large/' in this script",
    )
    parser.add_argument(
        "--extension",
        type=str,
        default="wav",
        help="Extension of the speech files.",
    )
    parser.add_argument(
        "--decode-path",
        type=str,
        default=None,
        help="Path to the output file where SIM-o information will be saved. "
        "If not provided, results are only printed to console.",
    )
    parser.add_argument(
        "--nj-per-gpu",
        type=int,
        default=1,
        help="Number of worker processes to spawn per GPU.",
    )
    return parser
def run_similarity_worker(line_idx, sample, wav_dir, extension):
    """Compute the speaker similarity of one reference/evaluation pair.

    Args:
        line_idx: Index of the sample in the test list; echoed back so the
            caller can identify the result.
        sample: Test-list entry. ``id`` and ``ref_audio`` are required,
            ``language_name`` is optional.
        wav_dir: Directory containing the evaluated wav files.
        extension: File extension of the evaluated wav files.

    Returns:
        tuple: ``(line_idx, context, result, status)``.
        On success ``status`` is ``"success"``, ``context`` is
        ``(ref_wav_path, eval_wav_path, language_name)`` and ``result`` is
        the cosine similarity as a float.
        On any failure ``status`` is ``"error"``, ``context`` is
        ``str(sample)`` and ``result`` is the error message.
    """
    try:
        wav_name = sample["id"]
        ref_wav_path = sample["ref_audio"]
        language_name = sample.get("language_name") or "unknown"
        eval_wav_path = os.path.join(wav_dir, f"{wav_name}.{extension}")

        # Missing files use the same (context, message) layout as the
        # exception path below, so the caller always finds the error text
        # in the third slot instead of ``None``.
        if not os.path.exists(ref_wav_path):
            return (
                line_idx,
                str(sample),
                f"Reference not found: {ref_wav_path}",
                "error",
            )
        if not os.path.exists(eval_wav_path):
            return (
                line_idx,
                str(sample),
                f"Eval wav not found: {eval_wav_path}",
                "error",
            )

        # Compute embeddings pair-wise
        ref_emb = get_embedding(ref_wav_path)
        eval_emb = get_embedding(eval_wav_path)

        # Cosine Similarity
        similarity = torch.nn.functional.cosine_similarity(ref_emb, eval_emb, dim=-1)

        return (
            line_idx,
            (ref_wav_path, eval_wav_path, language_name),
            similarity.item(),
            "success",
        )

    except Exception as e:
        error_detail = f"Error: {str(e)}\nTraceback:\n{traceback.format_exc()}"
        return line_idx, str(sample), error_detail, "error"
Please check --model-dir.") + sys.exit(1) + + logging.info(f"Calculating SIM-o for {args.wav_path}") + # Read list + samples = read_test_list(args.test_list) + + # Setup Parallel Processing + num_gpus = torch.cuda.device_count() + assert num_gpus > 0, "No GPU found. GPU is required." + total_procs = num_gpus * args.nj_per_gpu + + logging.info( + f"Starting evaluation with {total_procs} processes " f"on {num_gpus} GPUs." + ) + + manager = mp.Manager() + rank_queue = manager.Queue() + + for rank in list(range(num_gpus)) * args.nj_per_gpu: + rank_queue.put(rank) + + scores = [] + + fout = None + if args.decode_path: + os.makedirs(os.path.dirname(args.decode_path), exist_ok=True) + fout = open(args.decode_path, "w", encoding="utf8") + logging.info(f"Saving detailed SIM-o results to: {args.decode_path}") + fout.write("Prompt-path\tEval-path\tSIM-o\n") + + try: + with ProcessPoolExecutor( + max_workers=total_procs, + initializer=worker_init, + initargs=( + rank_queue, + sv_model_path, + ssl_model_path, + ), + ) as executor: + futures = [] + for i, sample in enumerate(samples): + futures.append( + executor.submit( + run_similarity_worker, i, sample, args.wav_path, args.extension + ) + ) + + pbar = tqdm( + as_completed(futures), total=len(samples), desc="Evaluating SIM-o" + ) + + lang_stats = {} + + for future in pbar: + idx, context, result, status = future.result() + if status == "success": + prompt_path, eval_path, lang = context + scores.append(result) + + # Accumulate per-language + if lang not in lang_stats: + lang_stats[lang] = [] + lang_stats[lang].append(result) + + if fout: + if lang == "unknown": + fout.write(f"{prompt_path}\t{eval_path}\t{result:.2f}\n") + else: + fout.write( + f"{lang}\t{context[0]}\t{context[1]}\t{result:.2f}\n" + ) + else: + pbar.write(f"!!! FAILED [Line {idx}]: {context} | Error: {result}") + + except (Exception, KeyboardInterrupt) as e: + logging.critical( + f"An unrecoverable error occurred: {e}. " f"Terminating all processes." 
+ ) + detailed_error_info = traceback.format_exc() + logging.error(f"--- DETAILED TRACEBACK ---\n{detailed_error_info}") + sys.exit(1) + + print("-" * 50) + if len(lang_stats) > 1: + lang_scores = [] + for lang in sorted(lang_stats.keys()): + l_scores = lang_stats[lang] + l_avg = np.mean(l_scores) + lang_scores.append(l_scores) + l_count = len(l_scores) + logging.info(f"[{lang}] SIM-o score: {l_avg:.3f} ({l_count} pairs)") + if fout: + fout.write(f"[{lang}] SIM-o: {l_avg:.3f} ({l_count} pairs)\n") + logging.info( + f"Macro-average SIM-o over {len(lang_stats)} languages: " + f"{np.mean([np.mean(ls) for ls in lang_scores]):.3f}" + ) + if fout: + fout.write( + f"\nMacro-average SIM-o over {len(lang_stats)} languages: " + f"{np.mean([np.mean(ls) for ls in lang_scores]):.3f}\n" + ) + + if scores: + avg_score = np.mean(scores) + logging.info(f"Processed {len(scores)}/{len(samples)} pairs.") + logging.info(f"SIM-o score: {avg_score:.3f}") + if fout: + fout.write(f"\nAverage SIM-o: {avg_score:.3f}\n") + else: + logging.error("No valid scores computed.") + if fout: + fout.close() + print("-" * 50) + + +if __name__ == "__main__": + main() diff --git a/omnivoice/eval/utils.py b/omnivoice/eval/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..375d9636581ba33bcfa33e8b6f54c8dc8bdf11cc --- /dev/null +++ b/omnivoice/eval/utils.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +# Copyright 2026 Xiaomi Corp. (authors: Han Zhu) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
def load_eval_waveform(
    fname: str,
    sample_rate: int,
    dtype: str = "float32",
    device: torch.device = torch.device("cpu"),
    return_numpy: bool = False,
    max_seconds: Optional[float] = None,
) -> torch.Tensor:
    """
    Read an audio file as a mono waveform at the requested sample rate.

    Args:
        fname (str): Path to the audio file.
        sample_rate (int): Sample rate the waveform is resampled to.
        dtype (str, optional): Data type used when reading the file
            (default: "float32").
        device (torch.device, optional): Device the returned tensor is moved
            to (default: CPU). Ignored when ``return_numpy`` is True.
        return_numpy (bool): If True, return the NumPy array instead of a
            PyTorch tensor.
        max_seconds (float): Optional upper bound on the duration in
            seconds. Longer audio is cut to this length and a warning is
            logged.

    Returns:
        torch.Tensor: The waveform, or the NumPy array when ``return_numpy``
        is True.

    Notes:
        - Two-dimensional input is averaged over its second axis to get mono.
        - Resampling happens before the ``max_seconds`` cut.
    """
    samples, native_sr = sf.read(fname, dtype=dtype)

    # Down-mix multi-channel audio by averaging over the channel axis.
    if samples.ndim == 2:
        samples = samples.mean(axis=1)

    # Bring the audio to the target rate only when the rates differ.
    if native_sr != sample_rate:
        resampled = torchaudio.functional.resample(
            torch.from_numpy(samples), orig_freq=native_sr, new_freq=sample_rate
        )
        samples = resampled.numpy()

    # Cut overly long audio.
    if max_seconds is not None:
        limit = int(sample_rate * max_seconds)
        if len(samples) > limit:
            samples = samples[:limit]
            logging.warning(
                f"Wav file {fname} is longer than {max_seconds}s, "
                f"truncated to {max_seconds}s to avoid OOM."
            )

    if return_numpy:
        return samples
    return torch.from_numpy(samples).to(device)
def log_metrics(fout, prefix, i_list, d_list, s_list, w_total, ndigits=2):
    """Report the pooled WER of a group of results and return it.

    The WER is the sum of all substitutions, deletions and insertions
    divided by ``w_total``, expressed in percent and rounded to ``ndigits``.
    Two lines (the WER and the error breakdown) are logged and, when
    ``fout`` is truthy, also written to it.
    """
    total_ins = np.sum(i_list)
    total_del = np.sum(d_list)
    total_sub = np.sum(s_list)

    error_rate = (total_sub + total_del + total_ins) / w_total * 100
    metrics_wer = round(error_rate, ndigits)

    report_lines = (
        f"{prefix} WER: {metrics_wer}%",
        f"{prefix} Errors: {total_ins} ins, {total_del} del, "
        f"{total_sub} sub / {w_total} words",
    )

    for text in report_lines:
        logging.info(text)
    if fout:
        for text in report_lines:
            fout.write(text + "\n")

    return metrics_wer
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Computes word error rate (WER) for FLEURS multilingual evaluation. + +Uses omnilingual-asr for ASR transcription across 100+ languages. +Requires a separate environment with ``omnilingual_asr`` installed. + +Usage: + python3 omnivoice/eval/wer/fleurs.py \\ + --wav-path results/fleurs \\ + --test-list test.jsonl \\ + --decode-path results/fleurs.wer.log \\ + --model-card omniASR_LLM_Unlimited_7B_v2 \\ + --chunk-size 100 --batch-size 50 +""" +import argparse +import logging +import multiprocessing as mp +import os +import re +import sys +import traceback +import types +from collections import defaultdict +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path +from typing import List, Union + +import numpy as np +import torch +from tqdm import tqdm + +try: + from omnilingual_asr.models.inference.pipeline import ASRInferencePipeline + from omnilingual_asr.models.wav2vec2_llama.lang_ids import supported_langs +except ImportError: + logging.error("Please install omnilingual_asr first.") + exit(1) + +# omnilingual-asr may pull a transformers version that lacks +# HiggsAudioV2TokenizerModel. Pre-register stubs to bypass +# omnivoice/__init__.py heavy imports. 
def read_language_mapping_from_tsv(
    mapping_path: Path,
) -> dict[str, Union[str, List[str]]]:
    """Read the ISO 639-3 -> mixed language id mapping from a TSV file.

    The first line is treated as a header and skipped. Every other
    non-blank line must hold four tab-separated columns, in this order:
    mixed id, language name, ISO 639-3 id, duration. Only the mixed id and
    the ISO 639-3 id are used. If an ISO 639-3 id occurs more than once,
    the last row wins.

    Args:
        mapping_path: Path of the TSV file.

    Returns:
        dict: Mapping from ISO 639-3 id to mixed id.

    Raises:
        ValueError: If a non-blank data line does not have four columns.
    """
    language_mapping = {}
    with open(mapping_path, "r", encoding="utf-8") as f:
        f.readline()  # Skip header
        for line_no, line in enumerate(f, start=2):
            # Strip only the line terminator: stripping all whitespace would
            # also drop a trailing tab and lose an empty last column.
            row = line.rstrip("\r\n")
            if not row.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            parts = [field.strip() for field in row.split("\t")]
            if len(parts) != 4:
                raise ValueError(
                    f"{mapping_path}:{line_no}: expected 4 tab-separated "
                    f"columns, got {len(parts)}"
                )
            mixed_id, _language_name, iso_639_3_id, _duration = parts
            language_mapping[iso_639_3_id] = mixed_id
    return language_mapping
def get_parser():
    """Build the command-line parser of the FLEURS WER evaluation script.

    Returns:
        argparse.ArgumentParser: Parser whose only required option is
        ``--wav-path``. Defaults are shown in ``--help`` through
        ``ArgumentDefaultsHelpFormatter``.
    """
    parser = argparse.ArgumentParser(
        # This script decodes with the OmniASR pipeline, not Whisper.
        description="Computes WER with OmniASR.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "--wav-path",
        type=str,
        required=True,
        help="Path to the directory containing speech files.",
    )

    parser.add_argument(
        "--extension",
        type=str,
        default="wav",
        # The formatter already appends "(default: wav)".
        help="Extension of the speech files.",
    )

    parser.add_argument(
        "--decode-path",
        type=str,
        default=None,
        help="Path to the output file where WER information will be saved. "
        "If not provided, results are only printed to console.",
    )
    parser.add_argument(
        "--model-card",
        type=str,
        default="omniASR_LLM_7B",
        help="Model card name for OmniASR (e.g., omniASR_LLM_7B) or local path.",
    )
    parser.add_argument(
        "--test-list",
        type=str,
        default="test.jsonl",
        help="Path of the JSONL test list. Each line is a JSON object "
        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
    )
    parser.add_argument(
        "--lang",
        type=str,
        default=None,
        help="Language code to evaluate (e.g., 'en' for English, 'zh' for "
        "Chinese). If not provided, the script will evaluate all languages "
        "found in the test list. If specified, only samples of the given "
        "language will be evaluated.",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=8,
        help="Batch size for decoding with the OmniASR pipeline.",
    )
    parser.add_argument(
        "--nj-per-gpu", type=int, default=1, help="Number of workers per GPU."
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=300,
        help="Number of samples per task chunk sent to workers.",
    )
    return parser
def run_eval_worker(data_chunk, language, batch_size):
    """
    Transcribe one chunk of samples and compute their WER metrics.

    Reads the module-level ``worker_pipe`` set up by ``process_init``.
    Returns one metrics dict per transcription (extended with ``wav_path``
    and ``lang_name``), or an empty list when the pipeline is missing or
    anything in the chunk fails.
    """
    if worker_pipe is None:
        logging.error("Worker pipeline is not initialized!")
        return []

    chunk_results = []
    try:
        # Collect the batch inputs for the ASR pipeline. Each sample uses its
        # own ``lang_id`` when present, else the ``language`` argument.
        # NOTE(review): the original comment assumes ``lang_id`` is already a
        # code the model accepts - confirm against the caller.
        wav_paths = []
        langs = []
        for sample in data_chunk:
            wav_paths.append(sample["wav_path"])
            langs.append(sample.get("lang_id", language))

        # One batched call; per the original comment it returns a list of
        # strings.
        hypotheses = worker_pipe.transcribe(
            wav_paths, lang=langs, batch_size=batch_size
        )

        for pos, hypothesis in enumerate(hypotheses):
            sample = data_chunk[pos]
            metrics = process_one(
                hypothesis,
                sample["truth_text"],
                post_process,
                sample.get("lang_id"),
            )
            metrics["wav_path"] = sample["wav_path"]
            metrics["lang_name"] = sample.get("lang_name")
            chunk_results.append(metrics)

    except Exception:
        logging.error(
            f"Worker failed on chunk (Lang: {language}):\n{traceback.format_exc()}"
        )
        return []

    return chunk_results
Worker config + num_gpus = torch.cuda.device_count() + assert num_gpus > 0, "No GPU found. GPU is required." + total_workers = num_gpus * args.nj_per_gpu + + mp.set_start_method("spawn", force=True) + manager = mp.Manager() + rank_queue = manager.Queue() + + for _ in range(args.nj_per_gpu): + for rank in range(num_gpus): + rank_queue.put(rank) + + # 3. Scheduling: Split languages into chunks + # This prevents one huge language from blocking a worker for too long, + # allows better load balancing across the pool. + tasks = [] + chunk_size = args.chunk_size + + for lang_name, items in data_by_lang.items(): + # Slicing the list into chunks + for i in range(0, len(items), chunk_size): + chunk = items[i : i + chunk_size] + tasks.append({"chunk": chunk, "lang": lang_name}) + + logging.info( + f"Split data into {len(tasks)} chunks (size ~{chunk_size}). Spawning {total_workers} workers." + ) + + # 4. Execution + results = [] + + with ProcessPoolExecutor( + max_workers=total_workers, + initializer=process_init, + initargs=(rank_queue, args.model_card), + ) as executor: + + futures = [] + for task in tasks: + futures.append( + executor.submit( + run_eval_worker, task["chunk"], task["lang"], args.batch_size + ) + ) + + # Unified progress bar + with tqdm(total=total_files, desc="Eval Progress", dynamic_ncols=True) as pbar: + for future in as_completed(futures): + try: + chunk_metrics = future.result() + results.extend(chunk_metrics) + pbar.update(len(chunk_metrics)) + except Exception as e: + logging.error(f"Task failed: {e}") + + # 5. 
Metrics Aggregation + wers, inses, deles, subses = [], [], [], [] + word_nums = 0 + + # Store metrics per language + lang_stats = {} + + fout = None + if args.decode_path: + os.makedirs(os.path.dirname(args.decode_path), exist_ok=True) + logging.info(f"Saving detailed WER results to: {args.decode_path}") + fout = open(args.decode_path, "w", encoding="utf-8") + + for res in results: + wers.append(float(res["wer"])) + inses.append(float(res["insertions"])) + deles.append(float(res["deletions"])) + subses.append(float(res["substitutions"])) + word_nums += res["word_num"] + + if fout: + fout.write( + f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t" + f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t" + f"{res['substitutions']}\n" + ) + lang_name = res["lang_name"] + + # Per language stats + if lang_name not in lang_stats: + lang_stats[lang_name] = { + "inses": [], + "deles": [], + "subses": [], + "word_nums": 0, + } + lang_stats[lang_name]["inses"].append(float(res["insertions"])) + lang_stats[lang_name]["deles"].append(float(res["deletions"])) + lang_stats[lang_name]["subses"].append(float(res["substitutions"])) + lang_stats[lang_name]["word_nums"] += res["word_num"] + + print("-" * 50) + # Log per-language stats + per_lang_wers = [] + for lang in sorted(lang_stats.keys()): + stats = lang_stats[lang] + if stats["word_nums"] > 0: + lang_wer = log_metrics( + fout, + f"[{lang}]", + stats["inses"], + stats["deles"], + stats["subses"], + stats["word_nums"], + ) + per_lang_wers.append(lang_wer) + print("-" * 50) + + # Log Macro-average WER + if len(per_lang_wers) > 1: + macro_wer = np.mean(per_lang_wers) + logging.info( + f"Macro-average WER over {len(per_lang_wers)} languages: {macro_wer:.2f}%" + ) + if fout: + fout.write( + f"Macro-average WER over {len(per_lang_wers)} languages: {macro_wer:.2f}%\n" + ) + count_le_5 = sum(1 for w in per_lang_wers if w <= 5.0) + count_le_10 = sum(1 for w in per_lang_wers if w <= 10.0) + count_le_20 = sum(1 for w in 
per_lang_wers if w <= 20.0) + + stats_msg = ( + f"Languages with WER/CER <= 5%: {count_le_5}/{len(per_lang_wers)}\n" + f"Languages with WER/CER <= 10%: {count_le_10}/{len(per_lang_wers)}\n" + f"Languages with WER/CER <= 20%: {count_le_20}/{len(per_lang_wers)}" + ) + + logging.info("\n" + stats_msg) + if fout: + fout.write(stats_msg + "\n") + + # Log overall stats + if word_nums > 0: + log_metrics(fout, "Overall", inses, deles, subses, word_nums) + + if fout: + fout.close() + + +if __name__ == "__main__": + main() diff --git a/omnivoice/eval/wer/hubert.py b/omnivoice/eval/wer/hubert.py new file mode 100644 index 0000000000000000000000000000000000000000..48706bb215641e360b8fd49ffca0e23e1a82134f --- /dev/null +++ b/omnivoice/eval/wer/hubert.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 +# Copyright 2026 Xiaomi Corp. (authors: Han Zhu) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Computes word error rate (WER) with Hubert models for LibriSpeech test sets. 
def get_parser():
    """Build the command-line parser for the Hubert WER evaluation script.

    Returns:
        argparse.ArgumentParser: Parser exposing ``--wav-path``,
        ``--extension``, ``--decode-path``, ``--model-dir``, ``--test-list``,
        ``--batch-size`` and ``--nj-per-gpu``. ``--wav-path`` and
        ``--model-dir`` are required; the others have defaults.
    """
    parser = argparse.ArgumentParser(
        description="Computes WER with Hubert-based ASR model.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--wav-path",
        type=str,
        required=True,
        help="Path to the directory containing speech files.",
    )
    parser.add_argument(
        "--extension",
        type=str,
        default="wav",
        help="Extension of the speech files. Default: wav",
    )
    parser.add_argument(
        "--decode-path",
        type=str,
        default=None,
        help="Path to the output file where WER information will be saved. "
        "If not provided, results are only printed to console.",
    )
    parser.add_argument(
        "--model-dir",
        type=str,
        required=True,
        # Adjacent literals are concatenated verbatim, so every fragment needs
        # its own trailing space; otherwise the URL runs into the next word.
        help="Local path of our evaluation model repository. "
        "Download from https://huggingface.co/k2-fsa/TTS_eval_models. "
        "Will use 'tts_eval_models/wer/hubert-large-ls960-ft/' "
        "in this script",
    )
    parser.add_argument(
        "--test-list",
        type=str,
        default="transcript.jsonl",
        help="path of the JSONL test list. Each line is a JSON object "
        "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=16,
        help="Batch size for decoding with the Hugging Face pipeline.",
    )
    parser.add_argument(
        "--nj-per-gpu", type=int, default=1, help="Number of workers per GPU."
    )
    return parser
def main():
    """Compute the corpus-level WER of generated speech with the Hubert model.

    Steps:
      1. Read the test list and resolve each sample ``id`` to
         ``<wav-path>/<id>.<extension>``; samples whose audio file is missing
         are skipped with a warning.
      2. Split the remaining files into chunks of ``--batch-size`` items and
         decode them in a pool of ``nj_per_gpu`` worker processes per visible
         GPU (each worker takes its GPU index from a shared queue).
      3. Sum insertions, deletions and substitutions over all files into one
         word-weighted WER, log it, and, when ``--decode-path`` is given,
         write the per-file details plus the summary to that file.

    Raises:
        AssertionError: If no CUDA device is visible.
    """
    parser = get_parser()
    args = parser.parse_args()

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
        force=True,
    )

    logging.info(f"Calculating WER for {args.wav_path}")

    data_list = []
    for s in read_test_list(args.test_list):
        wav_full_path = str(Path(args.wav_path) / (s["id"] + "." + args.extension))
        if not os.path.exists(wav_full_path):
            logging.warning(f"File missing: {wav_full_path}")
            continue
        data_list.append(
            {
                "wav_path": wav_full_path,
                "truth_text": s["text"],
            }
        )
    total_files = len(data_list)

    num_gpus = torch.cuda.device_count()
    assert num_gpus > 0, "No GPU found. GPU is required."
    total_workers = num_gpus * args.nj_per_gpu

    mp.set_start_method("spawn", force=True)
    manager = mp.Manager()
    rank_queue = manager.Queue()

    # One queue entry per worker: GPU indices are repeated nj_per_gpu times so
    # that every worker initializer can pop exactly one device.
    for _ in range(args.nj_per_gpu):
        for rank in range(num_gpus):
            rank_queue.put(rank)

    chunk_size = max(1, args.batch_size)
    tasks = [data_list[i : i + chunk_size] for i in range(0, total_files, chunk_size)]

    logging.info(
        f"Split data into {len(tasks)} chunks (size ~{chunk_size}). "
        f"Spawning {total_workers} workers."
    )

    results = []

    with ProcessPoolExecutor(
        max_workers=total_workers,
        initializer=process_init,
        initargs=(rank_queue, args.model_dir),
    ) as executor:

        futures = [
            executor.submit(run_eval_worker, chunk, args.batch_size)
            for chunk in tasks
        ]

        with tqdm(total=total_files, desc="Eval Progress", dynamic_ncols=True) as pbar:
            for future in as_completed(futures):
                # A failed task (e.g. a crashed worker process) must not
                # discard the metrics already collected from other chunks.
                try:
                    chunk_metrics = future.result()
                except Exception as e:
                    logging.error(f"Task failed: {e}")
                    continue
                results.extend(chunk_metrics)
                pbar.update(len(chunk_metrics))

    inses, deles, subses = [], [], []
    word_nums = 0

    fout = None
    if args.decode_path:
        # dirname is "" for a bare file name, and os.makedirs("") raises.
        out_dir = os.path.dirname(args.decode_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        fout = open(args.decode_path, "w", encoding="utf8")

    try:
        if fout:
            logging.info(f"Saving detailed WER results to: {args.decode_path}")
            fout.write(
                "Name\tWER\tTruth\tHypothesis\tInsertions\tDeletions\tSubstitutions\n"
            )

        for res in results:
            inses.append(float(res["insertions"]))
            deles.append(float(res["deletions"]))
            subses.append(float(res["substitutions"]))
            word_nums += res["word_num"]

            if fout:
                fout.write(
                    f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
                    f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
                    f"{res['substitutions']}\n"
                )

        inse_sum = np.sum(inses)
        dele_sum = np.sum(deles)
        subs_sum = np.sum(subses)

        # Word-weighted WER: total errors over total reference words, so long
        # utterances count proportionally more than short ones.
        wer_weighted = (
            round((subs_sum + dele_sum + inse_sum) / word_nums * 100, 2)
            if word_nums > 0
            else float("nan")
        )

        print("-" * 50)
        logging.info(f"Processed {len(results)}/{total_files} files.")
        wer_info = f"WER: {wer_weighted}%"
        detailed_info = (
            f"Errors: {inse_sum} ins, {dele_sum} del, {subs_sum} sub / {word_nums} words"
        )
        logging.info(wer_info)
        logging.info(detailed_info)
        print("-" * 50)

        if fout:
            fout.write(wer_info + "\n" + detailed_info + "\n")
    finally:
        # Close the report even if aggregation fails half-way.
        if fout:
            fout.close()
def read_language_mapping_from_tsv(
    mapping_path: Path,
) -> dict[str, Union[str, List[str]]]:
    """Read the language-id mapping table from a tab-separated file.

    The first line is treated as a header and skipped. Every following
    non-blank line must have at least three tab-separated columns; the first
    column (the mixed language id) is mapped to the third column (the
    ISO 639-3 id). Any further columns are ignored.

    Args:
        mapping_path: Path of the TSV file.

    Returns:
        Dictionary mapping each mixed id to its ISO 639-3 id.

    Raises:
        ValueError: If a non-blank data line has fewer than three columns.
    """
    language_mapping = {}
    with open(mapping_path, "r", encoding="utf-8") as f:
        f.readline()  # Skip header
        for line_no, line in enumerate(f, start=2):
            # Strip only the line terminator: a full strip() would also eat
            # trailing tabs and shift the columns when the last one is empty.
            row = line.rstrip("\r\n")
            if not row.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            parts = [field.strip() for field in row.split("\t")]
            if len(parts) < 3:
                raise ValueError(
                    f"{mapping_path}:{line_no}: expected at least 3 "
                    f"tab-separated columns, got {len(parts)}"
                )
            language_mapping[parts[0]] = parts[2]
    return language_mapping
def load_whisper_model(model_dir, device):
    """Create the Whisper speech-recognition pipeline used for evaluation.

    Args:
        model_dir: Root of the evaluation-model repository; the checkpoint is
            looked up in its ``wer/whisper-large-v3/`` sub-directory.
        device: Device handed to the Hugging Face pipeline. Half precision is
            selected when its string form contains ``"cuda"``, single
            precision otherwise.

    Returns:
        The ``transformers`` ASR pipeline, or ``None`` (after logging an
        error) when the checkpoint directory does not exist.
    """
    whisper_dir = os.path.join(model_dir, "wer/whisper-large-v3/")
    if os.path.exists(whisper_dir):
        # Imported lazily so the dependency is only needed when a model is
        # actually loaded.
        import transformers

        # Keep the library quiet: only errors are reported.
        transformers.logging.set_verbosity_error()

        logging.info(f"Loading Whisper model on {device}...")
        if "cuda" in str(device):
            precision = torch.float16
        else:
            precision = torch.float32
        return transformers.pipeline(
            "automatic-speech-recognition",
            model=whisper_dir,
            chunk_length_s=30,
            dtype=precision,
            device=device,
        )

    logging.error(f"Whisper model not found at {whisper_dir}.")
    return None
model = AutoModel( + model=model_path, + device=str(device), + disable_update=True, + disable_pbar=True, + verbose=False, + ) + finally: + logging.disable(previous_level) + + return model + + +def _worker_setup(rank_queue): + """Common worker setup: get rank, configure device and threads.""" + global worker_device + + torch.set_num_threads(2) + + try: + rank = rank_queue.get(timeout=10) + except Exception: + raise RuntimeError("Failed to get GPU rank from queue.") + + assert torch.cuda.is_available(), "CUDA is required but not available." + worker_device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(rank) + + logging.info(f"Initializing worker on device: {worker_device}") + + +def process_init(rank_queue, model_dir): + """Initializer for Whisper worker processes.""" + global worker_pipe + + _worker_setup(rank_queue) + + try: + worker_pipe = load_whisper_model(model_dir, worker_device) + if worker_pipe is None: + raise RuntimeError("Whisper model loading failed.") + except Exception as e: + logging.critical(f"Failed to load Whisper model on {worker_device}: {e}") + raise e + + +def process_init_paraformer(rank_queue, model_dir): + """Initializer for Paraformer worker processes (Chinese evaluation).""" + global worker_paraformer + + _worker_setup(rank_queue) + + try: + worker_paraformer = load_paraformer_model(model_dir, worker_device) + if worker_paraformer is None: + raise RuntimeError("Paraformer model loading failed.") + except Exception as e: + logging.critical(f"Failed to load Paraformer model on {worker_device}: {e}") + raise e + + +def post_process(text: str, lang: str) -> str: + """ + Cleans and normalizes text for WER calculation. + Args: + text (str): The input text to be processed. + lang (str): The language of the input text. + + Returns: + str: The cleaned and normalized text. 
class SpeechEvalDataset(torch.utils.data.Dataset):
    """Map-style dataset that loads evaluation audio on access.

    Wraps a list of dictionaries that each provide ``"wav_path"`` and
    ``"truth_text"``. Indexing loads the waveform via ``load_eval_waveform``
    at 16 kHz and returns it together with the reference text.
    """

    def __init__(self, data_list):
        # The list is stored by reference, not copied.
        self.data_list = data_list

    def __len__(self):
        """Return the number of wrapped items."""
        entries = self.data_list
        return len(entries)

    def __getitem__(self, index):
        """Load one item and return its audio, sampling rate and reference."""
        entry = self.data_list[index]
        sample = {}
        sample["array"] = load_eval_waveform(
            entry["wav_path"], sample_rate=16000, return_numpy=True
        )
        sample["sampling_rate"] = 16000
        sample["truth_text"] = entry["truth_text"]
        return sample
+ """ + global worker_pipe + if worker_pipe is None: + logging.error("Worker pipeline is not initialized!") + return [] + + metrics_buffer = [] + try: + dataset = SpeechEvalDataset(data_chunk) + if language != "unknown": + generate_kwargs = {"language": language, "task": "transcribe"} + else: + generate_kwargs = {"task": "transcribe"} + + # Use the pipeline to infer batch + # Note: We iterate through the iterator returned by pipe + iterator = worker_pipe( + dataset, generate_kwargs=generate_kwargs, batch_size=batch_size + ) + + for i, out in enumerate(iterator): + hypothesis = out["text"].strip() + + ref_item = data_chunk[i] + truth = ref_item["truth_text"] + wav_path = ref_item["wav_path"] + lang_id = ref_item.get("lang_id") + lang_name = ref_item.get("lang_name") + + m = process_one(hypothesis, truth, post_process, lang_id) + m["wav_path"] = wav_path + m["lang_name"] = lang_name + metrics_buffer.append(m) + + except Exception: + logging.error( + f"Worker failed on chunk (Lang: {language}):\n{traceback.format_exc()}" + ) + return [] + + return metrics_buffer + + +def run_eval_worker_paraformer(data_chunk, batch_size): + """ + Worker function for Chinese evaluation using Paraformer. + Uses the global worker_paraformer initialized by process_init_paraformer. 
def main():
    """Evaluate WER/CER per language: Paraformer for Chinese, Whisper otherwise.

    Steps:
      1. Read the test list, keep the samples selected by ``--lang`` whose
         audio file exists, and group them by language name.
      2. Start ``nj_per_gpu`` worker processes per visible GPU.
      3. Split every language group into chunks of ``--chunk-size`` items.
         Groups named "Chinese" or whose language id starts with "zh" go to
         the Paraformer pool, all others to the Whisper pool.
      4. Run the Whisper pool, then the Paraformer pool.
      5. Aggregate error counts per language and overall, log them, and write
         them to ``--decode-path`` when it is given.

    Raises:
        AssertionError: If no CUDA device is visible.
    """
    parser = get_parser()
    args = parser.parse_args()

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
        force=True,
    )

    # 1. Prepare Data
    logging.info("Reading test list...")
    data_by_lang = defaultdict(list)
    total_files = 0
    wav_root = Path(args.wav_path)

    for s in read_test_list(args.test_list):
        # Filter by language first, so samples that are not evaluated do not
        # produce "File missing" warnings.
        if args.lang and s.get("language_id") != args.lang:
            continue

        wav_path = str(wav_root / f"{s['id']}.{args.extension}")
        if not os.path.exists(wav_path):
            logging.warning(f"File missing: {wav_path}")
            continue

        lang_id = s.get("language_id") or "unknown"
        lang_name = s.get("language_name") or "unknown"

        data_by_lang[lang_name].append(
            {
                "wav_path": wav_path,
                "truth_text": s["text"],
                "lang_id": lang_id,
                "lang_name": lang_name,
            }
        )
        total_files += 1

    logging.info(f"Total files: {total_files} in {len(data_by_lang)} languages.")

    # 2. Worker config
    num_gpus = torch.cuda.device_count()
    assert num_gpus > 0, "No GPU found. GPU is required."
    total_workers = num_gpus * args.nj_per_gpu

    mp.set_start_method("spawn", force=True)
    manager = mp.Manager()

    # 3. Scheduling: Chinese goes to Paraformer, everything else to Whisper.
    # Chunks are cut *within* one language group. Each Whisper task carries a
    # single language that is applied to the whole chunk, so a chunk must
    # never mix languages.
    chunk_size = max(1, args.chunk_size)  # a step of 0 would crash range()

    whisper_tasks = []
    paraformer_tasks = []
    num_whisper_files = 0
    num_zh_files = 0
    for lang_name, items in data_by_lang.items():
        lang_id = items[0].get("lang_id", "") if items else ""
        is_zh = lang_name == "Chinese" or bool(lang_id and lang_id.startswith("zh"))
        if is_zh:
            num_zh_files += len(items)
        else:
            num_whisper_files += len(items)
        for i in range(0, len(items), chunk_size):
            chunk = items[i : i + chunk_size]
            if is_zh:
                paraformer_tasks.append((chunk, args.batch_size))
            else:
                whisper_tasks.append((chunk, lang_name, args.batch_size))

    logging.info(
        f"Whisper tasks: {len(whisper_tasks)} chunks ({num_whisper_files} files). "
        f"Paraformer tasks: {len(paraformer_tasks)} chunks ({num_zh_files} files). "
        f"Spawning {total_workers} workers per pool."
    )

    # 4. Execution — run Whisper and Paraformer pools sequentially
    results = []

    def run_pool(label, initializer, worker_fn, task_args, num_files):
        """Run one worker pool over ``task_args`` and collect its metrics."""
        if not task_args:
            return

        # One queue entry per worker so each initializer pops one GPU index.
        rank_queue = manager.Queue()
        for _ in range(args.nj_per_gpu):
            for rank in range(num_gpus):
                rank_queue.put(rank)

        with ProcessPoolExecutor(
            max_workers=total_workers,
            initializer=initializer,
            initargs=(rank_queue, args.model_dir),
        ) as executor:
            futures = [executor.submit(worker_fn, *one) for one in task_args]

            with tqdm(
                total=num_files,
                desc=f"{label} Eval",
                dynamic_ncols=True,
            ) as pbar:
                for future in as_completed(futures):
                    try:
                        chunk_metrics = future.result()
                    except Exception as e:
                        logging.error(f"{label} task failed: {e}")
                        continue
                    results.extend(chunk_metrics)
                    pbar.update(len(chunk_metrics))

    # 4a. Whisper pool for non-Chinese languages
    run_pool("Whisper", process_init, run_eval_worker, whisper_tasks, num_whisper_files)
    # 4b. Paraformer pool for Chinese
    run_pool(
        "Paraformer",
        process_init_paraformer,
        run_eval_worker_paraformer,
        paraformer_tasks,
        num_zh_files,
    )

    # 5. Metrics Aggregation
    inses, deles, subses = [], [], []
    word_nums = 0

    # Store metrics per language
    lang_stats = {}

    fout = None
    if args.decode_path:
        # dirname is "" for a bare file name, and os.makedirs("") raises.
        out_dir = os.path.dirname(args.decode_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        logging.info(f"Saving detailed WER results to: {args.decode_path}")
        fout = open(args.decode_path, "w", encoding="utf-8")

    try:
        for res in results:
            ins = float(res["insertions"])
            dele = float(res["deletions"])
            sub = float(res["substitutions"])
            inses.append(ins)
            deles.append(dele)
            subses.append(sub)
            word_nums += res["word_num"]

            if fout:
                fout.write(
                    f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
                    f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
                    f"{res['substitutions']}\n"
                )

            # Per language stats
            stats = lang_stats.setdefault(
                res["lang_name"],
                {"inses": [], "deles": [], "subses": [], "word_nums": 0},
            )
            stats["inses"].append(ins)
            stats["deles"].append(dele)
            stats["subses"].append(sub)
            stats["word_nums"] += res["word_num"]

        print("-" * 50)
        # Log per-language stats
        per_lang_wers = []
        for lang in sorted(lang_stats):
            stats = lang_stats[lang]
            if stats["word_nums"] > 0:
                lang_wer = log_metrics(
                    fout,
                    f"[{lang}]",
                    stats["inses"],
                    stats["deles"],
                    stats["subses"],
                    stats["word_nums"],
                    ndigits=3,
                )
                per_lang_wers.append(lang_wer)
        print("-" * 50)

        # Log Macro-average WER (unweighted mean of the per-language rates)
        if len(per_lang_wers) > 1:
            macro_wer = np.mean(per_lang_wers)
            logging.info(
                f"Macro-average WER over {len(per_lang_wers)} languages: {macro_wer:.2f}%"
            )
            if fout:
                fout.write(
                    f"Macro-average WER over {len(per_lang_wers)} languages: {macro_wer:.2f}%\n"
                )

        # Log overall stats
        if word_nums > 0:
            log_metrics(fout, "Overall", inses, deles, subses, word_nums)
    finally:
        # Close the report even if aggregation fails half-way.
        if fout:
            fout.close()
# ---------------------------------------------------------------------------
# Building blocks of the text-normalization configuration.
#
# Most constants below are fragments of regular-expression source text (raw
# strings holding "\uXXXX" escapes or "A-B" ranges). They are concatenated
# into the strings stored in ``norm_config``; presumably the consumer places
# them inside a regex character class — confirm in the normalizer that reads
# ``norm_config``.
# ---------------------------------------------------------------------------

colon = ":"
comma = ","
exclamation_mark = "!"
period = re.escape(".")
question_mark = re.escape("?")
semicolon = ";"

left_curly_bracket = "{"
right_curly_bracket = "}"
quotation_mark = '"'

basic_punc = (
    period
    + question_mark
    + comma
    + colon
    + exclamation_mark
    + left_curly_bracket
    + right_curly_bracket
)

# General punc unicode block (0x2000-0x206F)
zero_width_space = r"\u200B"
zero_width_nonjoiner = r"\u200C"
left_to_right_mark = r"\u200E"
right_to_left_mark = r"\u200F"
left_to_right_embedding = r"\u202A"
pop_directional_formatting = r"\u202C"

# Here are some commonly ill-typed versions of apostrophe
right_single_quotation_mark = r"\u2019"
left_single_quotation_mark = r"\u2018"

# Language specific definitions
# Spanish
inverted_exclamation_mark = r"\u00A1"
inverted_question_mark = r"\u00BF"


# Hindi
hindi_danda = "\u0964"

# Egyptian Arabic
# arabic_percent = r"\u066A"
arabic_comma = r"\u060C"
arabic_question_mark = r"\u061F"
arabic_semicolon = r"\u061B"
arabic_diacritics = r"\u064B-\u0652"


arabic_subscript_alef_and_inverted_damma = r"\u0656-\u0657"


# Chinese
full_stop = r"\u3002"
full_comma = r"\uFF0C"
full_exclamation_mark = r"\uFF01"
full_question_mark = r"\uFF1F"
full_semicolon = r"\uFF1B"
full_colon = r"\uFF1A"
full_parentheses = r"\uFF08\uFF09"
quotation_mark_horizontal = r"\u300C-\u300F"
# NOTE(review): U+FF41-U+FF44 are the fullwidth Latin letters a-d, not
# quotation marks; presumably U+FE41-U+FE44 was intended. Left unchanged to
# stay identical to the upstream normalizer — confirm before changing.
quotation_mark_vertical = r"\uFF41-\uFF44"
title_marks = r"\u3008-\u300B"
wavy_low_line = r"\uFE4F"
ellipsis = r"\u22EF"
enumeration_comma = r"\u3001"
hyphenation_point = r"\u2027"
forward_slash = r"\uFF0F"
wavy_dash = r"\uFF5E"
box_drawings_light_horizontal = r"\u2500"
fullwidth_low_line = r"\uFF3F"
chinese_punc = (
    full_stop
    + full_comma
    + full_exclamation_mark
    + full_question_mark
    + full_semicolon
    + full_colon
    + full_parentheses
    + quotation_mark_horizontal
    + quotation_mark_vertical
    + title_marks
    + wavy_low_line
    + ellipsis
    + enumeration_comma
    + hyphenation_point
    + forward_slash
    + wavy_dash
    + box_drawings_light_horizontal
    + fullwidth_low_line
)

# Armenian
armenian_apostrophe = r"\u055A"
emphasis_mark = r"\u055B"
# NOTE: this rebinds ``exclamation_mark`` (previously "!"). ``basic_punc`` was
# built before this line and still contains "!"; only ``armenian_punc`` below
# sees the Armenian value.
exclamation_mark = r"\u055C"
armenian_comma = r"\u055D"
armenian_question_mark = r"\u055E"
abbreviation_mark = r"\u055F"
armenian_full_stop = r"\u0589"
armenian_punc = (
    armenian_apostrophe
    + emphasis_mark
    + exclamation_mark
    + armenian_comma
    + armenian_question_mark
    + abbreviation_mark
    + armenian_full_stop
)

lesser_than_symbol = r"<"
greater_than_symbol = r">"

lesser_than_sign = r"\u003c"
greater_than_sign = r"\u003e"

nbsp_written_form = r" "

# Quotation marks
left_double_quotes = r"\u201c"
right_double_quotes = r"\u201d"
left_double_angle = r"\u00ab"
right_double_angle = r"\u00bb"
left_single_angle = r"\u2039"
right_single_angle = r"\u203a"
low_double_quotes = r"\u201e"
low_single_quotes = r"\u201a"
high_double_quotes = r"\u201f"
high_single_quotes = r"\u201b"

all_punct_quotes = (
    left_double_quotes
    + right_double_quotes
    + left_double_angle
    + right_double_angle
    + left_single_angle
    + right_single_angle
    + low_double_quotes
    + low_single_quotes
    + high_double_quotes
    + high_single_quotes
    + right_single_quotation_mark
    + left_single_quotation_mark
)
# Character class of apostrophe look-alikes; used in ``shared_mappping`` to
# rewrite a look-alike between two non-space runs into a plain "'".
mapping_quotes = (
    "["
    + high_single_quotes
    + right_single_quotation_mark
    + left_single_quotation_mark
    + "]"
)


# Digits

english_digits = r"\u0030-\u0039"
bengali_digits = r"\u09e6-\u09ef"
khmer_digits = r"\u17e0-\u17e9"
devanagari_digits = r"\u0966-\u096f"
oriya_digits = r"\u0b66-\u0b6f"
extended_arabic_indic_digits = r"\u06f0-\u06f9"
kayah_li_digits = r"\ua900-\ua909"
fullwidth_digits = r"\uff10-\uff19"
malayam_digits = r"\u0d66-\u0d6f"
myanmar_digits = r"\u1040-\u1049"
roman_numeral = r"\u2170-\u2179"
nominal_digit_shapes = r"\u206f"

# Load punctuations
# The list contains many non-ASCII characters, so it is decoded explicitly as
# UTF-8 instead of relying on the platform's locale encoding.
# NOTE(review): lines whose stripped text starts with "#" are skipped as
# comments, which also drops a data row for the "#" character itself if the
# list has one — verify against punctuations.lst.
with open(
    f"{os.path.dirname(__file__)}/punctuations.lst", "r", encoding="utf-8"
) as punc_f:
    punc_list = [
        line
        for line in punc_f.readlines()
        if line.strip() and not line.strip().startswith("#")
    ]

punct_pattern = r""
for punc in punc_list:
    # the first character in the tab separated line is the punc to be removed
    punct_pattern += re.escape(punc.split("\t")[0])

shared_digits = (
    english_digits
    + bengali_digits
    + khmer_digits
    + devanagari_digits
    + oriya_digits
    + extended_arabic_indic_digits
    + kayah_li_digits
    + fullwidth_digits
    + malayam_digits
    + myanmar_digits
    + roman_numeral
    + nominal_digit_shapes
)

shared_punc_list = (
    basic_punc
    + all_punct_quotes
    + greater_than_sign
    + lesser_than_sign
    + inverted_question_mark
    + full_stop
    + semicolon
    + armenian_punc
    + inverted_exclamation_mark
    + arabic_comma
    + enumeration_comma
    + hindi_danda
    + quotation_mark
    + arabic_semicolon
    + arabic_question_mark
    + chinese_punc
    + punct_pattern
)

# Pattern -> replacement pairs. The last entry is a regex with two groups
# whose replacement joins them with a plain apostrophe.
shared_mappping = {
    lesser_than_symbol: "",
    greater_than_symbol: "",
    nbsp_written_form: "",
    r"(\S+)" + mapping_quotes + r"(\S+)": r"\1'\2",
}

shared_deletion_list = (
    left_to_right_mark
    + zero_width_nonjoiner
    + arabic_subscript_alef_and_inverted_damma
    + zero_width_space
    + arabic_diacritics
    + pop_directional_formatting
    + right_to_left_mark
    + left_to_right_embedding
)

# Default configuration under the key "*"; the language-specific entries
# below start as copies of it.
norm_config = {
    "*": {
        "lower_case": True,
        "punc_set": shared_punc_list,
        "del_set": shared_deletion_list,
        "mapping": shared_mappping,
        "digit_set": shared_digits,
        "unicode_norm": "NFKC",
        "rm_diacritics": False,
    }
}

# NOTE: ``dict.copy()`` is shallow. ``+=`` on the string entries creates a new
# string per language, but every copy shares the single ``shared_mappping``
# dict object.

# =============== Mongolian ===============#

norm_config["mon"] = norm_config["*"].copy()
# add soft hyphen to punc list to match with fleurs
norm_config["mon"]["del_set"] += r"\u00AD"

norm_config["khk"] = norm_config["mon"].copy()

# =============== Hebrew ===============#

norm_config["heb"] = norm_config["*"].copy()
# add "HEBREW POINT" symbols to match with fleurs
norm_config["heb"]["del_set"] += r"\u05B0-\u05BF\u05C0-\u05CF"

# =============== Thai ===============#

norm_config["tha"] = norm_config["*"].copy()
# add "Zero width joiner" symbols to match with fleurs
norm_config["tha"]["punc_set"] += r"\u200D"

# =============== Arabic ===============#
norm_config["ara"] = norm_config["*"].copy()
# NOTE(review): this mutates the shared mapping dict, so the replacement is
# added to every language's config, not only Arabic. Kept as is to match the
# upstream normalizer's output — confirm whether that is intended.
norm_config["ara"]["mapping"]["ٱ"] = "ا"
norm_config["arb"] = norm_config["ara"].copy()

# =============== Javanese ===============#
norm_config["jav"] = norm_config["*"].copy()
norm_config["jav"]["rm_diacritics"] = True
4815 INVALID UNICODE 0xf173 + 4789 INVALID UNICODE 0xe514 + 4409 INVALID UNICODE 0xe293 + 3881 INVALID UNICODE 0xf523 + 3788 INVALID UNICODE 0xe233 + 2448 INVALID UNICODE 0xf50f + 2177 INVALID UNICODE 0xe232 + 1955 INVALID UNICODE 0xea7b + 1926 INVALID UNICODE 0xf172 + 973 INVALID UNICODE 0xe290 + 972 INVALID UNICODE 0xf519 + 661 INVALID UNICODE 0xe292 + 591 INVALID UNICODE 0xe328 + 509 INVALID UNICODE 0xe2fa + 458 INVALID UNICODE 0xe234 + 446 INVALID UNICODE 0xe043 + 419 INVALID UNICODE 0xe040 + 399 INVALID UNICODE 0xe2fb + 387 INVALID UNICODE 0xe32b + 381 INVALID UNICODE 0xe236 + 374 INVALID UNICODE 0xf511 + 314 INVALID UNICODE 0xe517 + 296 INVALID UNICODE 0xe2fe + 293 INVALID UNICODE 0xe492 + 291 INVALID UNICODE 0xf52d + 289 INVALID UNICODE 0xe2fc + 195 INVALID UNICODE 0xf521 + 190 INVALID UNICODE 0xe516 + 182 INVALID UNICODE 0xe041 + 178 INVALID UNICODE 0xf529 + 113 INVALID UNICODE 0xe2f9 + 87 INVALID UNICODE 0xe2d9 + 78 INVALID UNICODE 0xe32a + 76 INVALID UNICODE 0xe291 + 74 INVALID UNICODE 0xe296 + 66 INVALID UNICODE 0xe518 + 52 INVALID UNICODE 0xe32c + 46 INVALID UNICODE 0xe2db + 41 INVALID UNICODE 0xe231 + 34 INVALID UNICODE 0xf522 + 33 INVALID UNICODE 0xf518 + 32 INVALID UNICODE 0xf513 + 27 INVALID UNICODE 0xe32d + 25 INVALID UNICODE 0xe32e + 23 INVALID UNICODE 0xe06b + 15 INVALID UNICODE 0xea01 + 12 INVALID UNICODE 0xe294 + 11 INVALID UNICODE 0xe203 + 8 INVALID UNICODE 0xf218 + 7 INVALID UNICODE 0xe070 + 7 INVALID UNICODE 0xe013 + 5 INVALID UNICODE 0xe2de + 4 INVALID UNICODE 0xe493 + 3 INVALID UNICODE 0xf7e8 + 3 INVALID UNICODE 0xf7d0 + 3 INVALID UNICODE 0xe313 + 2 INVALID UNICODE 0xe329 + 2 INVALID UNICODE 0xe06d + 2 INVALID UNICODE 0xe003 + 1 INVALID UNICODE 0xf50e + 1 INVALID UNICODE 0xf171 + 1 INVALID UNICODE 0xe01d + 71 NOMINAL DIGIT SHAPES 0x206f +⁠ 3 WORD JOINER 0x2060 +― 126545 HORIZONTAL BAR 0x2015 +־ 1028 HEBREW PUNCTUATION MAQAF 0x5be +) 98429 RIGHT PARENTHESIS 0x29 +] 27108 RIGHT 
SQUARE BRACKET 0x5d +⌋ 1567 RIGHT FLOOR 0x230b +〕 97 RIGHT TORTOISE SHELL BRACKET 0x3015 +】 36 RIGHT BLACK LENTICULAR BRACKET 0x3011 +﴾ 14 ORNATE LEFT PARENTHESIS 0xfd3e +& 170517 AMPERSAND 0x26 +། 106330 TIBETAN MARK SHAD 0xf0d +። 90203 ETHIOPIC FULL STOP 0x1362 +፥ 60484 ETHIOPIC COLON 0x1365 +༌ 60464 TIBETAN MARK DELIMITER TSHEG BSTAR 0xf0c +။ 51567 MYANMAR SIGN SECTION 0x104b +/ 46929 SOLIDUS 0x2f +၊ 38042 MYANMAR SIGN LITTLE SECTION 0x104a +· 37985 MIDDLE DOT 0xb7 +‸ 36310 CARET 0x2038 +* 34793 ASTERISK 0x2a +۔ 32432 ARABIC FULL STOP 0x6d4 +፤ 31906 ETHIOPIC SEMICOLON 0x1364 +၏ 21519 MYANMAR SYMBOL GENITIVE 0x104f +។ 20834 KHMER SIGN KHAN 0x17d4 +꓾ 15773 LISU PUNCTUATION COMMA 0xa4fe +᙮ 13473 CANADIAN SYLLABICS FULL STOP 0x166e +꤯ 12892 KAYAH LI SIGN SHYA 0xa92f +⵰ 11478 TIFINAGH SEPARATOR MARK 0x2d70 +꓿ 11118 LISU PUNCTUATION FULL STOP 0xa4ff +॥ 10763 DEVANAGARI DOUBLE DANDA 0x965 +؞ 10403 ARABIC TRIPLE DOT PUNCTUATION MARK 0x61e +၍ 8936 MYANMAR SYMBOL COMPLETED 0x104d +· 8431 GREEK ANO TELEIA 0x387 +† 7477 DAGGER 0x2020 +၌ 6632 MYANMAR SYMBOL LOCATIVE 0x104c +፣ 5719 ETHIOPIC COMMA 0x1363 +៖ 5528 KHMER SIGN CAMNUC PII KUUH 0x17d6 +꤮ 4791 KAYAH LI SIGN CWI 0xa92e +※ 3439 REFERENCE MARK 0x203b +፦ 2727 ETHIOPIC PREFACE COLON 0x1366 +• 1749 BULLET 0x2022 +¶ 1507 PILCROW SIGN 0xb6 +၎ 1386 MYANMAR SYMBOL AFOREMENTIONED 0x104e +﹖ 1224 SMALL QUESTION MARK 0xfe56 +; 975 GREEK QUESTION MARK 0x37e +… 827 HORIZONTAL ELLIPSIS 0x2026 +% 617 PERCENT SIGN 0x25 +・ 468 KATAKANA MIDDLE DOT 0x30fb +༎ 306 TIBETAN MARK NYIS SHAD 0xf0e +‡ 140 DOUBLE DAGGER 0x2021 +# 137 NUMBER SIGN 0x23 +@ 125 COMMERCIAL AT 0x40 +፡ 121 ETHIOPIC WORDSPACE 0x1361 +៚ 55 KHMER SIGN KOOMUUT 0x17da +៕ 49 KHMER SIGN BARIYOOSAN 0x17d5 +﹐ 10 SMALL COMMA 0xfe50 +༅ 6 TIBETAN MARK CLOSING YIG MGO SGAB MA 0xf05 +༄ 6 TIBETAN MARK INITIAL YIG MGO MDUN MA 0xf04 +. 
2 FULLWIDTH FULL STOP 0xff0e +﹗ 2 SMALL EXCLAMATION MARK 0xfe57 +﹕ 2 SMALL COLON 0xfe55 +‰ 2 PER MILLE SIGN 0x2030 +・ 1 HALFWIDTH KATAKANA MIDDLE DOT 0xff65 +( 98504 LEFT PARENTHESIS 0x28 +[ 27245 LEFT SQUARE BRACKET 0x5b +⌊ 1567 LEFT FLOOR 0x230a +〔 95 LEFT TORTOISE SHELL BRACKET 0x3014 +【 36 LEFT BLACK LENTICULAR BRACKET 0x3010 +﴿ 14 ORNATE RIGHT PARENTHESIS 0xfd3f +_ 4851 LOW LINE 0x5f +$ 72 DOLLAR SIGN 0x24 +€ 14 EURO SIGN 0x20ac +£ 2 POUND SIGN 0xa3 +~ 27462 TILDE 0x7e += 11450 EQUALS SIGN 0x3d +| 8430 VERTICAL LINE 0x7c +− 3971 MINUS SIGN 0x2212 +≫ 1904 MUCH GREATER-THAN 0x226b +≪ 1903 MUCH LESS-THAN 0x226a ++ 1450 PLUS SIGN 0x2b +< 345 FULLWIDTH LESS-THAN SIGN 0xff1c +> 344 FULLWIDTH GREATER-THAN SIGN 0xff1e +¬ 5 NOT SIGN 0xac +× 4 MULTIPLICATION SIGN 0xd7 +→ 2 RIGHTWARDS ARROW 0x2192 +᙭ 537 CANADIAN SYLLABICS CHI SIGN 0x166d +° 499 DEGREE SIGN 0xb0 +႟ 421 MYANMAR SYMBOL SHAN EXCLAMATION 0x109f +� 192 REPLACEMENT CHARACTER 0xfffd +⌟ 54 BOTTOM RIGHT CORNER 0x231f +⌞ 54 BOTTOM LEFT CORNER 0x231e +© 2 COPYRIGHT SIGN 0xa9 +  40 NARROW NO-BREAK SPACE 0x202f +  1 SIX-PER-EM SPACE 0x2006 +˜ 40261 SMALL TILDE 0x2dc +^ 6469 CIRCUMFLEX ACCENT 0x5e +¯ 20 MACRON 0xaf +ˇ 191442 CARON 0x2c7 +ⁿ 38144 SUPERSCRIPT LATIN SMALL LETTER N 0x207f +ـ 9440 ARABIC TATWEEL 0x640 +ๆ 6766 THAI CHARACTER MAIYAMOK 0xe46 +ៗ 3310 KHMER SIGN LEK TOO 0x17d7 +々 678 IDEOGRAPHIC ITERATION MARK 0x3005 +ໆ 430 LAO KO LA 0xec6 +ー 319 KATAKANA-HIRAGANA PROLONGED SOUND MARK 0x30fc +ⁱ 137 SUPERSCRIPT LATIN SMALL LETTER I 0x2071 +৷ 11056 BENGALI CURRENCY NUMERATOR FOUR 0x9f7 +⅓ 26 VULGAR FRACTION ONE THIRD 0x2153 +½ 26 VULGAR FRACTION ONE HALF 0xbd +¼ 4 VULGAR FRACTION ONE QUARTER 0xbc +⅟ 1 FRACTION NUMERATOR ONE 0x215f +⁄ 57 FRACTION SLASH 0x2044 diff --git a/omnivoice/eval/wer/seedtts.py b/omnivoice/eval/wer/seedtts.py new file mode 100644 index 0000000000000000000000000000000000000000..7d1ba98f186f8b36a4ab658e4814553c484e684f --- /dev/null +++ b/omnivoice/eval/wer/seedtts.py @@ -0,0 +1,413 @@ 
def get_parser():
    """Build the command-line parser of the Seed-TTS WER evaluation script.

    Returns:
        argparse.ArgumentParser: parser exposing ``--wav-path``,
        ``--extension``, ``--decode-path``, ``--model-dir``, ``--test-list``,
        ``--lang``, ``--batch-size`` and ``--nj-per-gpu``.
    """
    # (flag, add_argument options), registered in this order.
    option_table = [
        (
            "--wav-path",
            {
                "type": str,
                "required": True,
                "help": "Path to the directory containing speech files.",
            },
        ),
        (
            "--extension",
            {
                "type": str,
                "default": "wav",
                "help": "Extension of the speech files. Default: wav",
            },
        ),
        (
            "--decode-path",
            {
                "type": str,
                "default": None,
                "help": "Path to the output file where WER information will be saved. "
                "If not provided, results are only printed to console.",
            },
        ),
        (
            "--model-dir",
            {
                "type": str,
                "required": True,
                "help": "Local path of evaluation models repository. "
                "Download from https://huggingface.co/k2-fsa/TTS_eval_models. "
                "This script expects 'tts_eval_models/wer/whisper-large-v3/' for English "
                "and 'tts_eval_models/wer/paraformer-zh/' for Chinese within this directory.",
            },
        ),
        (
            "--test-list",
            {
                "type": str,
                "default": "test.jsonl",
                "help": "path of the JSONL test list. Each line is a JSON object "
                "with fields: id, text, ref_audio, ref_text, language_id, language_name.",
            },
        ),
        (
            "--lang",
            {
                "type": str,
                "choices": ["zh", "en"],
                "required": True,
                "help": "Language of the audio and transcripts for "
                "decoding ('zh' for Chinese or 'en' for English).",
            },
        ),
        (
            "--batch-size",
            {
                "type": int,
                "default": 16,
                "help": "Batch size for decoding with the Hugging Face pipeline.",
            },
        ),
        (
            "--nj-per-gpu",
            {"type": int, "default": 1, "help": "Number of workers per GPU."},
        ),
    ]

    cli = argparse.ArgumentParser(
        description="Computes WER with Whisper/Paraformer.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli
def load_whisper_model(model_dir, device):
    """Create the Whisper speech-recognition pipeline used for English.

    Args:
        model_dir: root of the evaluation-models directory; the model is
            looked up under ``wer/whisper-large-v3/`` inside it.
        device: device handed to ``transformers.pipeline``.

    Returns:
        The pipeline object, or ``None`` (after logging an error) when the
        model directory does not exist.
    """
    whisper_dir = os.path.join(model_dir, "wer/whisper-large-v3/")
    if not os.path.exists(whisper_dir):
        logging.error(f"Whisper model not found at {whisper_dir}.")
        return None

    logging.debug(f"Loading Whisper model on {device}...")

    import transformers

    # Suppress transformers logging
    transformers.logging.set_verbosity_error()

    # Half precision on CUDA devices, full precision everywhere else.
    on_cuda = "cuda" in str(device)
    return transformers.pipeline(
        "automatic-speech-recognition",
        model=whisper_dir,
        dtype=torch.float16 if on_cuda else torch.float32,
        device=device,
    )


def load_paraformer_model(model_dir, device):
    """Create the FunASR Paraformer model used for Chinese.

    Args:
        model_dir: root of the evaluation-models directory; the model is
            looked up under ``wer/paraformer-zh/`` inside it.
        device: target device; passed to FunASR as a string.

    Returns:
        The FunASR ``AutoModel`` instance, or ``None`` (after logging an
        error) when the model directory does not exist.
    """
    paraformer_dir = os.path.join(model_dir, "wer/paraformer-zh/")
    if not os.path.exists(paraformer_dir):
        logging.error(f"Paraformer model not found at {paraformer_dir}.")
        return None

    logging.debug(f"Loading Paraformer model on {device}...")

    # Silence all logging while FunASR loads, then restore the previous
    # global disable level even if loading fails.
    saved_disable_level = logging.root.manager.disable
    logging.disable(logging.CRITICAL)
    try:
        from funasr import AutoModel

        # FunASR AutoModel accepts "cuda:0" string or torch.device
        asr_model = AutoModel(
            model=paraformer_dir,
            device=str(device),
            disable_update=True,
            disable_pbar=True,
            verbose=False,
        )
    finally:
        logging.disable(saved_disable_level)

    return asr_model
def process_init(rank_queue, model_dir, lang):
    """Initialize one worker process: pick a GPU and load the ASR model once.

    The loaded model and the chosen device are stored in the module-level
    ``worker_pipe`` / ``worker_device`` globals for later use by the worker
    function running in the same process.

    Args:
        rank_queue: queue-like object whose ``get(timeout=...)`` yields the
            GPU index assigned to this worker.
        model_dir: root directory of the evaluation models.
        lang: ``"en"`` (Whisper) or ``"zh"`` (Paraformer).

    Raises:
        RuntimeError: if no GPU rank can be obtained, CUDA is unavailable,
            ``lang`` is unsupported, or the model cannot be loaded.
    """
    global worker_pipe, worker_device

    torch.set_num_threads(2)

    try:
        rank = rank_queue.get(timeout=10)
    except Exception as err:
        # Keep the original cause attached for easier debugging.
        raise RuntimeError("Failed to get GPU rank from queue.") from err

    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is required but not available.")
    worker_device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(rank)

    logging.info(f"Initializing worker on device: {worker_device}")

    try:
        if lang == "en":
            worker_pipe = load_whisper_model(model_dir, worker_device)
        elif lang == "zh":
            worker_pipe = load_paraformer_model(model_dir, worker_device)
        else:
            raise RuntimeError(f"Unsupported language: {lang!r} (expected 'en' or 'zh').")
        if worker_pipe is None:
            raise RuntimeError("Model loading failed.")
    except Exception as e:
        logging.critical(f"Failed to load model on {worker_device}: {e}")
        raise
def run_eval_worker(data_chunk, lang, batch_size):
    """Transcribe one chunk of samples and score each against its reference.

    Uses the module-level ``worker_pipe`` prepared by the process initializer.

    Args:
        data_chunk: list of dicts with ``"wav_path"`` and ``"truth_text"``.
        lang: ``"en"`` or ``"zh"``; selects how ``worker_pipe`` is invoked.
        batch_size: decoding batch size.

    Returns:
        list: one metrics dict per sample (as returned by ``process_one``,
        plus ``"wav_path"``). An empty list is returned when the pipeline is
        not initialized or when anything fails while processing the chunk.
    """
    global worker_pipe
    if worker_pipe is None:
        logging.error("Worker pipeline is not initialized!")
        return []

    def _score(hypothesis, sample):
        # Score a single hypothesis and tag the result with its audio path.
        truth = sample["truth_text"]
        wav_path = sample["wav_path"]
        metrics = process_one(hypothesis, truth, post_process, lang)
        metrics["wav_path"] = wav_path
        return metrics

    collected = []
    try:
        if lang == "en":
            # Load waveforms as arrays, truncating to 30s
            max_samples = 16000 * 30
            dataset = []
            for sample in data_chunk:
                waveform = load_eval_waveform(
                    sample["wav_path"], sample_rate=16000, return_numpy=True
                )
                dataset.append(
                    {"array": waveform[:max_samples], "sampling_rate": 16000}
                )

            outputs = worker_pipe(
                dataset,
                generate_kwargs={"language": "english", "task": "transcribe"},
                batch_size=batch_size,
            )
            for idx, out in enumerate(outputs):
                collected.append(_score(out["text"].strip(), data_chunk[idx]))

        elif lang == "zh":
            all_paths = [sample["wav_path"] for sample in data_chunk]
            for start in range(0, len(all_paths), batch_size):
                decoded = worker_pipe.generate(
                    input=all_paths[start : start + batch_size],
                    batch_size=batch_size,
                    disable_pbar=True,
                )
                # Results are matched to samples by position within the chunk.
                for pos, res in enumerate(decoded, start=start):
                    hypothesis = zhconv.convert(res["text"], "zh-cn")
                    collected.append(_score(hypothesis, data_chunk[pos]))

    except Exception:
        logging.error(
            f"Worker failed on chunk (Lang: {lang}):\n{traceback.format_exc()}"
        )
        return []

    return collected
def main():
    """Run the Seed-TTS WER evaluation end to end.

    Parses the command line, collects the audio files named by the test
    list, distributes them in chunks over one worker process per GPU slot,
    and logs the averaged and weighted WER. When ``--decode-path`` is given,
    per-file results and the summary are also written to that file.
    """
    parser = get_parser()
    args = parser.parse_args()

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
        force=True,
    )

    logging.info(f"Calculating WER for {args.wav_path}")

    # 1. Prepare Data
    logging.info("Reading test list...")
    data_list = []
    samples = read_test_list(args.test_list)
    for s in samples:
        wav_path = str(Path(args.wav_path) / f"{s['id']}.{args.extension}")
        if not os.path.exists(wav_path):
            logging.warning(f"File missing: {wav_path}")
            continue
        data_list.append({"wav_path": wav_path, "truth_text": s["text"]})
    total_files = len(data_list)
    logging.info(f"Total files: {total_files}.")

    # 2. Worker config
    num_gpus = torch.cuda.device_count()
    assert num_gpus > 0, "No GPU found. GPU is required."
    total_workers = num_gpus * args.nj_per_gpu

    mp.set_start_method("spawn", force=True)
    manager = mp.Manager()
    rank_queue = manager.Queue()

    # One queue entry per worker; each worker pops its GPU index on start-up.
    for _ in range(args.nj_per_gpu):
        for rank in range(num_gpus):
            rank_queue.put(rank)

    # 3. Scheduling: Split data into chunks for better load balancing
    chunk_size = max(1, args.batch_size)
    tasks = [
        data_list[i : i + chunk_size] for i in range(0, total_files, chunk_size)
    ]

    logging.info(
        f"Split data into {len(tasks)} chunks (size ~{chunk_size}). "
        f"Spawning {total_workers} workers."
    )

    # 4. Execution
    results = []

    with ProcessPoolExecutor(
        max_workers=total_workers,
        initializer=process_init,
        initargs=(rank_queue, args.model_dir, args.lang),
    ) as executor:

        futures = [
            executor.submit(run_eval_worker, chunk, args.lang, args.batch_size)
            for chunk in tasks
        ]

        # Unified progress bar
        with tqdm(total=total_files, desc="Eval Progress", dynamic_ncols=True) as pbar:
            for future in as_completed(futures):
                try:
                    chunk_metrics = future.result()
                    results.extend(chunk_metrics)
                    pbar.update(len(chunk_metrics))
                except Exception as e:
                    logging.error(f"Task failed: {e}")

    wers, inses, deles, subses = [], [], [], []
    word_nums = 0

    fout = None
    if args.decode_path:
        # A bare file name has an empty dirname, and os.makedirs("") raises.
        out_dir = os.path.dirname(args.decode_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        fout = open(args.decode_path, "w", encoding="utf8")

    # try/finally so the report file is closed even if aggregation fails.
    try:
        if fout:
            logging.info(f"Saving detailed WER results to: {args.decode_path}")
            fout.write(
                "Name\tWER\tTruth\tHypothesis\tInsertions\tDeletions\tSubstitutions\n"
            )

        for res in results:
            wers.append(float(res["wer"]))
            inses.append(float(res["insertions"]))
            deles.append(float(res["deletions"]))
            subses.append(float(res["substitutions"]))
            word_nums += res["word_num"]

            if fout:
                fout.write(
                    f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
                    f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
                    f"{res['substitutions']}\n"
                )

        wer_avg = round(np.mean(wers) * 100, 2) if wers else float("nan")
        wer_weighted = (
            round(
                (np.sum(subses) + np.sum(deles) + np.sum(inses)) / word_nums * 100, 2
            )
            if word_nums > 0
            else float("nan")
        )

        inse_sum = np.sum(inses)
        dele_sum = np.sum(deles)
        subs_sum = np.sum(subses)

        print("-" * 50)
        logging.info(f"Processed {len(results)}/{total_files} files.")
        seedtts_wer_info = f"Seed-TTS WER (Avg of WERs): {wer_avg}%"
        wer_info = f"WER (Weighted): {wer_weighted}%"
        detailed_info = (
            f"Errors: {inse_sum} ins, {dele_sum} del, {subs_sum} sub / {word_nums} words"
        )
        logging.info(seedtts_wer_info)
        logging.info(wer_info)
        logging.info(detailed_info)
        print("-" * 50)

        if fout:
            fout.write(seedtts_wer_info + "\n" + wer_info + "\n" + detailed_info + "\n")
    finally:
        if fout:
            fout.close()
+""" + +import argparse +import logging +import multiprocessing as mp +import os +import re +import traceback +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path + +import cn2an +import torch +import zhconv +from tqdm import tqdm + +from omnivoice.eval.wer.common import log_metrics, process_one +from omnivoice.eval.wer.text_norm_omni import text_normalize +from omnivoice.utils.data_utils import read_test_list + +# --- Global variables for worker processes --- +worker_sensevoice = None +worker_device = None + + +def get_parser(): + parser = argparse.ArgumentParser( + description="Computes CER for Cantonese using SenseVoiceSmall.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "--wav-path", + type=str, + required=True, + help="Path to the directory containing speech files.", + ) + + parser.add_argument( + "--extension", + type=str, + default="wav", + help="Extension of the speech files. Default: wav", + ) + + parser.add_argument( + "--decode-path", + type=str, + default=None, + help="Path to the output file where CER information will be saved. ", + ) + parser.add_argument( + "--model-dir", + type=str, + required=True, + help="Local path of evaluation models repository. ", + ) + parser.add_argument( + "--test-list", + type=str, + default="test.jsonl", + help="path of the JSONL test list.", + ) + parser.add_argument( + "--batch-size", + type=int, + default=16, + help="Batch size for decoding.", + ) + parser.add_argument( + "--nj-per-gpu", type=int, default=1, help="Number of workers per GPU." 
def load_sensevoice_model(model_dir, device):
    """Load the SenseVoiceSmall model through FunASR.

    The local copy under ``<model_dir>/wer/SenseVoiceSmall`` is used when it
    exists; otherwise a warning is logged and the hub identifier
    ``"iic/SenseVoiceSmall"`` is passed to FunASR instead.

    Args:
        model_dir: root directory of the evaluation models.
        device: target device; passed to FunASR as a string.

    Returns:
        The FunASR ``AutoModel`` instance.
    """
    model_path = os.path.join(model_dir, "wer/SenseVoiceSmall")
    if os.path.exists(model_path):
        # Previously the local path was checked but never used.
        model_source = model_path
    else:
        logging.warning(
            f"SenseVoiceSmall not found at {model_path}. "
            f"Please ensure it is present in eval models. "
            f"Falling back to 'iic/SenseVoiceSmall'."
        )
        model_source = "iic/SenseVoiceSmall"

    logging.info(f"Loading SenseVoice model on {device}...")

    # Silence all logging while FunASR loads, then restore the previous
    # global disable level even if loading fails.
    previous_level = logging.root.manager.disable
    logging.disable(logging.CRITICAL)

    try:
        from funasr import AutoModel

        model = AutoModel(
            model=model_source,
            device=str(device),
            disable_update=True,
            disable_pbar=True,
            verbose=False,
        )
    finally:
        logging.disable(previous_level)

    return model
def run_eval_worker_sensevoice(data_chunk, batch_size):
    """Transcribe one chunk of Cantonese samples and score each of them.

    Uses the module-level ``worker_sensevoice`` prepared by the process
    initializer.

    Args:
        data_chunk: list of dicts with ``"wav_path"`` and ``"truth_text"``
            (``"lang_name"`` is optional).
        batch_size: decoding batch size.

    Returns:
        list: one metrics dict per sample (as returned by ``process_one``,
        plus ``"wav_path"`` and ``"lang_name"``). An empty list is returned
        when the model is not initialized or when processing the chunk fails.
    """
    global worker_sensevoice
    if worker_sensevoice is None:
        logging.error("SenseVoice worker pipeline is not initialized!")
        return []

    # SenseVoice may format output with language tags such as <|yue|>.
    tag_pattern = r"<\|[^|]*\|>"

    scored = []
    try:
        all_paths = [sample["wav_path"] for sample in data_chunk]

        for start in range(0, len(all_paths), batch_size):
            # SenseVoice generate call, target lang mapped to yue
            decoded = worker_sensevoice.generate(
                input=all_paths[start : start + batch_size],
                batch_size=batch_size,
                language="yue",
                use_itn=False,
                disable_pbar=True,
            )

            # Results are matched to samples by position within the chunk.
            for pos, res in enumerate(decoded, start=start):
                hypothesis = re.sub(tag_pattern, "", res["text"]).strip()

                sample = data_chunk[pos]
                truth = sample["truth_text"]
                wav_path = sample["wav_path"]
                lang_name = sample.get("lang_name")

                metrics = process_one(hypothesis, truth, post_process, "yue")
                metrics["wav_path"] = wav_path
                metrics["lang_name"] = lang_name
                scored.append(metrics)

    except Exception:
        logging.error(f"SenseVoice worker failed on chunk:\n{traceback.format_exc()}")
        return []

    return scored
"yue": + continue + + wav_path = str(wav_root / f"{s['id']}.{args.extension}") + if not os.path.exists(wav_path): + logging.warning(f"File missing: {wav_path}") + continue + + yue_items.append( + { + "wav_path": wav_path, + "truth_text": s["text"], + "lang_id": "yue", + "lang_name": s.get("language_name", "Cantonese"), + } + ) + + logging.info(f"Total Cantonese files found: {len(yue_items)}.") + if len(yue_items) == 0: + logging.warning("No files to evaluate. Exiting.") + return + + num_gpus = torch.cuda.device_count() + assert num_gpus > 0, "No GPU found. GPU is required." + total_workers = num_gpus * args.nj_per_gpu + + mp.set_start_method("spawn", force=True) + manager = mp.Manager() + + chunk_size = args.chunk_size + tasks = [] + for i in range(0, len(yue_items), chunk_size): + tasks.append(yue_items[i : i + chunk_size]) + + results = [] + rank_queue = manager.Queue() + for _ in range(args.nj_per_gpu): + for rank in range(num_gpus): + rank_queue.put(rank) + + with ProcessPoolExecutor( + max_workers=total_workers, + initializer=process_init_sensevoice, + initargs=(rank_queue, args.model_dir), + ) as executor: + + futures = [] + for chunk in tasks: + futures.append( + executor.submit(run_eval_worker_sensevoice, chunk, args.batch_size) + ) + + with tqdm( + total=len(yue_items), + desc="SenseVoice Eval (Cantonese)", + dynamic_ncols=True, + ) as pbar: + for future in as_completed(futures): + try: + chunk_metrics = future.result() + results.extend(chunk_metrics) + pbar.update(len(chunk_metrics)) + except Exception as e: + logging.error(f"Task failed: {e}") + + # Metrics Aggregation + inses, deles, subses = [], [], [] + word_nums = 0 + + fout = None + if args.decode_path: + os.makedirs(os.path.dirname(args.decode_path), exist_ok=True) + logging.info(f"Saving detailed CER results to: {args.decode_path}") + fout = open(args.decode_path, "w", encoding="utf-8") + + for res in results: + inses.append(float(res["insertions"])) + deles.append(float(res["deletions"])) + 
def main():
    """Run the Cantonese (yue) CER evaluation with SenseVoiceSmall.

    Parses the command line, keeps the test-list entries whose
    ``language_id`` is ``"yue"`` and whose audio file exists, distributes
    them in chunks over one worker process per GPU slot, and logs the
    aggregated metrics. When ``--decode-path`` is given, per-file results
    are also written to that file.
    """
    parser = get_parser()
    args = parser.parse_args()

    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
        force=True,
    )

    logging.info("Reading test list and filtering for Cantonese (yue)...")
    yue_items = []
    wav_root = Path(args.wav_path)

    samples = read_test_list(args.test_list)
    for s in samples:
        lang_id = s.get("language_id", "")
        if lang_id != "yue":
            continue

        wav_path = str(wav_root / f"{s['id']}.{args.extension}")
        if not os.path.exists(wav_path):
            logging.warning(f"File missing: {wav_path}")
            continue

        yue_items.append(
            {
                "wav_path": wav_path,
                "truth_text": s["text"],
                "lang_id": "yue",
                "lang_name": s.get("language_name", "Cantonese"),
            }
        )

    logging.info(f"Total Cantonese files found: {len(yue_items)}.")
    if len(yue_items) == 0:
        logging.warning("No files to evaluate. Exiting.")
        return

    num_gpus = torch.cuda.device_count()
    assert num_gpus > 0, "No GPU found. GPU is required."
    total_workers = num_gpus * args.nj_per_gpu

    mp.set_start_method("spawn", force=True)
    manager = mp.Manager()

    chunk_size = args.chunk_size
    tasks = [
        yue_items[i : i + chunk_size] for i in range(0, len(yue_items), chunk_size)
    ]

    results = []
    # One queue entry per worker; each worker pops its GPU index on start-up.
    rank_queue = manager.Queue()
    for _ in range(args.nj_per_gpu):
        for rank in range(num_gpus):
            rank_queue.put(rank)

    with ProcessPoolExecutor(
        max_workers=total_workers,
        initializer=process_init_sensevoice,
        initargs=(rank_queue, args.model_dir),
    ) as executor:

        futures = [
            executor.submit(run_eval_worker_sensevoice, chunk, args.batch_size)
            for chunk in tasks
        ]

        with tqdm(
            total=len(yue_items),
            desc="SenseVoice Eval (Cantonese)",
            dynamic_ncols=True,
        ) as pbar:
            for future in as_completed(futures):
                try:
                    chunk_metrics = future.result()
                    results.extend(chunk_metrics)
                    pbar.update(len(chunk_metrics))
                except Exception as e:
                    logging.error(f"Task failed: {e}")

    # Metrics Aggregation
    inses, deles, subses = [], [], []
    word_nums = 0

    fout = None
    if args.decode_path:
        # A bare file name has an empty dirname, and os.makedirs("") raises.
        out_dir = os.path.dirname(args.decode_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        logging.info(f"Saving detailed CER results to: {args.decode_path}")
        fout = open(args.decode_path, "w", encoding="utf-8")

    # try/finally so the report file is closed even if aggregation fails.
    try:
        for res in results:
            inses.append(float(res["insertions"]))
            deles.append(float(res["deletions"]))
            subses.append(float(res["substitutions"]))
            word_nums += res["word_num"]

            if fout:
                fout.write(
                    f"{res['wav_path']}\t{res['wer']}\t{res['truth']}\t"
                    f"{res['hypo']}\t{res['insertions']}\t{res['deletions']}\t"
                    f"{res['substitutions']}\n"
                )

        print("-" * 50)
        if word_nums > 0:
            log_metrics(fout, "[yue] Cantonese", inses, deles, subses, word_nums)
    finally:
        if fout:
            fout.close()
def text_normalize(
    text, iso_code, lower_case=True, remove_numbers=True, remove_brackets=False
):
    """Normalize a transcript for WER/CER scoring.

    Steps, in order: Unicode normalization, optional lower-casing, removal of
    parenthesized spans, regex mappings, punctuation -> space, deletion of
    unwanted characters, optional removal of digit-only words, optional
    transliteration via ``unidecode``, and whitespace collapsing.

    Args:
        text: The string to be normalized.
        iso_code: Key into ``norm_config``; unknown codes use the ``"*"``
            entry. Settings missing from a language entry fall back to the
            ``"*"`` entry as well.
        lower_case: Lower-case the text (only if the config also enables it).
        remove_numbers: Remove words that consist only of digits.
        remove_brackets: Remove every parenthesized span. Spans containing a
            digit, e.g. "(Sam 23:17)", are always removed.

    Returns:
        The normalized string.
    """
    defaults = norm_config["*"]
    # Merge into a fresh dict: the shared config table is left untouched and
    # every setting (including "rm_diacritics") gets its default.
    config = {**defaults, **norm_config.get(iso_code, defaults)}

    text = unicodedata.normalize(config["unicode_norm"], text)

    # Convert to lower case
    if config["lower_case"] and lower_case:
        text = text.lower()

    # Always remove text inside brackets with numbers in them.
    # Usually corresponds to "(Sam 23:17)"
    text = re.sub(r"\([^\)]*\d[^\)]*\)", " ", text)
    if remove_brackets:
        text = re.sub(r"\([^\)]*\)", " ", text)

    # Apply mappings (keys are regular expressions)
    for old, new in config["mapping"].items():
        text = re.sub(old, new, text)

    # Replace punctuations with space
    punct_pattern = r"[" + config["punc_set"] + "]"
    normalized_text = re.sub(punct_pattern, " ", text)

    # Remove characters in delete list
    delete_pattern = r"[" + config["del_set"] + "]"
    normalized_text = re.sub(delete_pattern, "", normalized_text)

    # Remove words containing only digits.
    # Three cases: a) the text starts with a number, b) a number is somewhere
    # in the middle, c) the text ends with a number. Lookarounds require the
    # digits to be delimited by whitespace and allow overlapping matches
    # (e.g. "1 2 3") to all be replaced.
    if remove_numbers:
        digits_pattern = "[" + config["digit_set"] + "]+"
        complete_digit_pattern = (
            r"^"
            + digits_pattern
            + r"(?=\s)|(?<=\s)"
            + digits_pattern
            + r"(?=\s)|(?<=\s)"
            + digits_pattern
            + "$"
        )
        normalized_text = re.sub(complete_digit_pattern, " ", normalized_text)

    if config["rm_diacritics"]:
        normalized_text = unidecode(normalized_text)

    # Remove extra spaces
    normalized_text = re.sub(r"\s+", " ", normalized_text).strip()

    return normalized_text
Corp. (authors: Han Zhu) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Core OmniVoice model implementation. + +Defines the ``OmniVoice`` model class, generation config, and inference pipeline. +This is the main entry point for both inference and training: + +- **Inference**: ``OmniVoice.from_pretrained()`` loads the model, then + ``model.generate()`` supports voice cloning, voice design, and auto voice. +- **Training**: ``model.forward()`` computes the training loss; the model is + built and used by ``omnivoice.training.builder`` and ``omnivoice.training.trainer``. 
+ +""" + +import difflib +import logging +import math +import os +import re +from dataclasses import dataclass, fields +from functools import partial +from typing import Any, List, Optional, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio + +try: + from torch.nn.attention.flex_attention import create_block_mask + + _flex_attention_available = True +except ImportError: + _flex_attention_available = False +from transformers import ( + AutoFeatureExtractor, + AutoModel, + AutoTokenizer, + HiggsAudioV2TokenizerModel, + PretrainedConfig, + PreTrainedModel, +) +from transformers.modeling_outputs import ModelOutput +from transformers.models.auto import CONFIG_MAPPING, AutoConfig + +from omnivoice.utils.audio import ( + cross_fade_chunks, + fade_and_pad_audio, + load_audio, + remove_silence, + trim_long_audio, +) +from omnivoice.utils.duration import RuleDurationEstimator +from omnivoice.utils.lang_map import LANG_IDS, LANG_NAMES +from omnivoice.utils.text import add_punctuation, chunk_text_punctuation +from omnivoice.utils.voice_design import ( + _INSTRUCT_ALL_VALID, + _INSTRUCT_EN_TO_ZH, + _INSTRUCT_MUTUALLY_EXCLUSIVE, + _INSTRUCT_VALID_EN, + _INSTRUCT_VALID_ZH, + _INSTRUCT_ZH_TO_EN, + _ZH_RE, +) + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Dataclasses +# --------------------------------------------------------------------------- + + +@dataclass +class VoiceClonePrompt: + ref_audio_tokens: torch.Tensor # (C, T) + ref_text: str + ref_rms: float + + +@dataclass +class OmniVoiceGenerationConfig: + num_step: int = 32 + guidance_scale: float = 2.0 + t_shift: float = 0.1 + layer_penalty_factor: float = 5.0 + position_temperature: float = 5.0 + class_temperature: float = 0.0 + denoise: bool = True + preprocess_prompt: bool = True + postprocess_output: bool = True + audio_chunk_duration: float = 15.0 + audio_chunk_threshold: float 
@dataclass
class GenerationTask:
    """Per-item inputs for one batched generation request.

    All list fields are index-aligned: entry ``i`` of every list belongs
    to the same batch item.
    """

    batch_size: int  # number of items described by the lists below
    texts: List[str]  # target text per item
    target_lens: List[int]  # number of audio tokens to generate per item
    langs: List[Optional[str]]  # language per item, or None
    instructs: List[Optional[str]]  # voice-design instruction per item, or None
    ref_texts: List[Optional[str]]  # reference transcript per item, or None
    ref_audio_tokens: List[Optional[torch.Tensor]]  # reference tokens per item, or None
    ref_rms: List[Optional[float]]  # reference-audio RMS per item, or None
    speed: Optional[List[float]] = None  # per-item speed factors, if any

    def get_indices(self, config: "OmniVoiceGenerationConfig", frame_rate: int):
        """Split item indices into "short" and "long" groups.

        Args:
            config: Generation config; only ``audio_chunk_threshold``
                (seconds) is read.
            frame_rate: Audio-token frames per second.

        Returns:
            ``(short_idx, long_idx)``: indices whose target length is at most
            the threshold (in frames), and indices whose target length
            exceeds it. Both lists keep the original order.
        """
        limit = int(config.audio_chunk_threshold * frame_rate)
        short_idx: List[int] = []
        long_idx: List[int] = []
        for position, num_tokens in enumerate(self.target_lens):
            bucket = long_idx if num_tokens > limit else short_idx
            bucket.append(position)
        return short_idx, long_idx

    def slice_task(self, indices: List[int]):
        """Build a new task holding only the items at ``indices``.

        Returns:
            A ``GenerationTask`` with ``len(indices)`` items, or ``None``
            when ``indices`` is empty.
        """
        if not indices:
            return None

        def pick(values):
            # Gather the selected entries in the order given by ``indices``.
            return [values[i] for i in indices]

        return GenerationTask(
            batch_size=len(indices),
            texts=pick(self.texts),
            target_lens=pick(self.target_lens),
            langs=pick(self.langs),
            instructs=pick(self.instructs),
            ref_texts=pick(self.ref_texts),
            ref_audio_tokens=pick(self.ref_audio_tokens),
            ref_rms=pick(self.ref_rms),
            speed=pick(self.speed) if self.speed else None,
        )
llm_config: Optional[Union[dict, PretrainedConfig]] = None, + **kwargs, + ): + + if isinstance(llm_config, dict): + llm_config = CONFIG_MAPPING[llm_config["model_type"]](**llm_config) + + self.llm_config = llm_config + + super().__init__(**kwargs) + self.audio_vocab_size = audio_vocab_size + self.audio_mask_id = audio_mask_id + self.num_audio_codebook = num_audio_codebook + if audio_codebook_weights is None: + audio_codebook_weights = [8, 8, 6, 6, 4, 4, 2, 2] + self.audio_codebook_weights = audio_codebook_weights + + +def _resolve_model_path(name_or_path: str) -> str: + if os.path.isdir(name_or_path): + return name_or_path + from huggingface_hub import snapshot_download + + return snapshot_download(name_or_path) + + +class OmniVoice(PreTrainedModel): + _supports_flex_attn = True + _supports_flash_attn_2 = True + _supports_sdpa = True + config_class = OmniVoiceConfig + + def __init__(self, config: OmniVoiceConfig, llm: Optional[PreTrainedModel] = None): + super().__init__(config) + + if llm is not None: + # If an LLM instance is provided, use it directly + # (skipping config-based init). + self.llm = llm + else: + # Otherwise, initialize the LLM from the config. 
+ self.llm = AutoModel.from_config(self.config.llm_config) + + self.audio_embeddings = nn.Embedding( + config.num_audio_codebook * config.audio_vocab_size, + self.config.llm_config.hidden_size, + ) + self.register_buffer( + "codebook_layer_offsets", + torch.arange(config.num_audio_codebook) * config.audio_vocab_size, + ) + + self.audio_heads = nn.Linear( + self.config.llm_config.hidden_size, + config.num_audio_codebook * config.audio_vocab_size, + bias=False, + ) + + self.normalized_audio_codebook_weights = [ + w / sum(config.audio_codebook_weights) + for w in config.audio_codebook_weights + ] + + self.post_init() + + # Inference-only attributes (set by from_pretrained when not in train mode) + self.text_tokenizer = None + self.audio_tokenizer = None + self.duration_estimator = None + self.sampling_rate = None + self._asr_pipe = None + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + train_mode = kwargs.pop("train", False) + load_asr = kwargs.pop("load_asr", False) + asr_model_name = kwargs.pop("asr_model_name", "openai/whisper-large-v3-turbo") + + # Suppress noisy INFO logs from transformers/huggingface_hub during loading + _prev_disable = logging.root.manager.disable + logging.disable(logging.INFO) + + try: + # Resolve to local path first; download only if not already cached + resolved_path = _resolve_model_path(pretrained_model_name_or_path) + + model = super().from_pretrained(resolved_path, *args, **kwargs) + + if not train_mode: + model.text_tokenizer = AutoTokenizer.from_pretrained(resolved_path) + + audio_tokenizer_path = os.path.join(resolved_path, "audio_tokenizer") + + if not os.path.isdir(audio_tokenizer_path): + audio_tokenizer_path = _resolve_model_path( + "eustlb/higgs-audio-v2-tokenizer" + ) + + # higgs-audio-v2-tokenizer does not support MPS + # (output channels > 65536) + tokenizer_device = ( + "cpu" if str(model.device).startswith("mps") else model.device + ) + model.audio_tokenizer = 
def load_asr_model(self, model_name: str = "openai/whisper-large-v3-turbo"):
    """Create the speech-recognition pipeline used to transcribe reference audio.

    The pipeline is stored on ``self._asr_pipe``. It is built with float16
    when ``self.device`` is a CUDA device and float32 otherwise.

    Args:
        model_name: HuggingFace model name or local path for the Whisper model.
    """
    from transformers import pipeline as hf_pipeline

    logger.info("Loading ASR model %s ...", model_name)

    if str(self.device).startswith("cuda"):
        asr_dtype = torch.float16
    else:
        asr_dtype = torch.float32

    # Resolve to a local directory (downloading a snapshot if necessary).
    local_model = _resolve_model_path(model_name)

    pipeline_options = {
        "model": local_model,
        "dtype": asr_dtype,
        "device_map": self.device,
    }
    self._asr_pipe = hf_pipeline("automatic-speech-recognition", **pipeline_options)
    logger.info("ASR model loaded on %s.", self.device)
+ ) + + if isinstance(audio, str): + return self._asr_pipe(audio)["text"].strip() + else: + waveform, sr = audio + if isinstance(waveform, torch.Tensor): + waveform = waveform.cpu().numpy() + waveform = np.squeeze(waveform) # (1, T) or (T,) → (T,) + audio_input = { + "array": waveform, + "sampling_rate": sr, + } + return self._asr_pipe(audio_input)["text"].strip() + + def get_input_embeddings(self): + return self.llm.get_input_embeddings() + + def set_input_embeddings(self, value): + self.llm.set_input_embeddings(value) + + def _prepare_embed_inputs( + self, input_ids: torch.Tensor, audio_mask: torch.Tensor + ) -> torch.Tensor: + """ + Prepares embeddings from input_ids of shape (batch_size, layers, seq_length). + Embedding shape is (batch_size, seq_length, hidden_size). + """ + text_embeds = self.get_input_embeddings()(input_ids[:, 0, :]) + + # Apply shift to audio IDs based on codebook layer + # audio_ids: [Batch, 8, Seq] + # codebook_layer_offsets: [1, 8, 1] + # Result: Layer 0 ID Layer 1 ID + Layer 2 ID + 2050... + shifted_ids = ( + input_ids * audio_mask.unsqueeze(1) + ) + self.codebook_layer_offsets.view(1, -1, 1) + + # input: [Batch, 8, Seq] -> output: [Batch, Seq, Hidden] + audio_embeds = self.audio_embeddings(shifted_ids).sum(dim=1) + + return torch.where(audio_mask.unsqueeze(-1), audio_embeds, text_embeds) + + def forward( + self, + input_ids: torch.LongTensor, + audio_mask: torch.Tensor, + labels: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + document_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + ): + + inputs_embeds = self._prepare_embed_inputs(input_ids, audio_mask) + + if attention_mask is None and document_ids is not None: + if not _flex_attention_available: + raise RuntimeError( + "flex_attention is not available in the current environment. " + "If you do not need flex_attention, set " + '"attn_implementation": "sdpa" in your training config.' 
@torch.inference_mode()
def generate(
    self,
    text: Union[str, list[str]],
    language: Union[str, list[str], None] = None,
    ref_text: Union[str, list[str], None] = None,
    ref_audio: Union[
        str,
        list[str],
        tuple[torch.Tensor, int],
        list[tuple[torch.Tensor, int]],
        None,
    ] = None,
    voice_clone_prompt: Union[
        VoiceClonePrompt, list[VoiceClonePrompt], None
    ] = None,
    instruct: Union[str, list[str], None] = None,
    duration: Union[float, list[Optional[float]], None] = None,
    speed: Union[float, list[Optional[float]], None] = None,
    generation_config: Optional[OmniVoiceGenerationConfig] = None,
    **kwargs,
) -> list[np.ndarray]:
    """Generate speech audio given text in various modes.

    Supports three modes:

    1. **Voice clone** — clone the voice style from the reference audio.
       Provide ``voice_clone_prompt`` (from
       :meth:`create_voice_clone_prompt`) or ``ref_text`` + ``ref_audio``.
    2. **Voice design** — provide ``instruct`` text describing the desired
       voice style; no reference audio needed.
    3. **Auto** — provide neither; the model picks a voice itself.

    Args:
        text: Target text (single string or list for batch).
        language: Language name (e.g. ``"English"``) or code
            (e.g. ``"en"``). ``None`` for language-agnostic mode.
            Performance is slightly better if you specify the language.
        ref_text: Optional reference text for voice cloning mode.
        ref_audio: Optional reference audio for voice cloning mode.
            Can be a file path or a (waveform, sample_rate) tuple.
        voice_clone_prompt: Reusable prompt from
            :meth:`create_voice_clone_prompt`. If provided, it overrides
            ``ref_text`` and ``ref_audio``.
        instruct: Style instruction for voice design mode.
        duration: Fixed output duration in seconds. If a single float,
            applies to all items; if a list, one value per item.
            ``None`` (default) lets the model estimate duration from text.
            Overrides ``speed`` when both are provided.
        speed: Speaking speed factor. ``> 1.0`` for faster, ``< 1.0`` for
            slower. If a list, one value per item. ``None`` (default) uses
            the model's default estimation.
        generation_config: Explicit config object. If provided, takes
            precedence over ``**kwargs`` (which are then ignored with a
            warning).
        **kwargs: Fields of :class:`OmniVoiceGenerationConfig`. Names that
            are not config fields are ignored with a warning.
                denoise: Whether to prepend the ``<|denoise|>`` token.
                num_step: Number of iterative decoding steps.
                guidance_scale: Classifier-free guidance scale.
                t_shift: Time-step shift (smaller → emphasise low-SNR).
                postprocess_output: Post-process output (remove silence,
                    fade-in/out, pad edges).
                layer_penalty_factor: Penalty encouraging earlier codebook
                    layers to unmask first.
                position_temperature: Temperature for position selection.
                class_temperature: Temperature for token sampling
                    (0 = greedy).
                audio_chunk_duration: If > 0, split long text into chunks
                    of this duration (seconds) and generate chunk by chunk.
                audio_chunk_threshold: Only apply chunking if estimated
                    audio duration exceeds this threshold (seconds).

    Returns:
        ``audios`` a list of 1-D ``np.ndarray`` with shape ``(T,)`` and
        sampling rate consistent with the model's audio tokenizer
        (usually 24 000 Hz). Can be saved directly with
        ``soundfile.write("out.wav", audios[0], model.sampling_rate)``.

    Raises:
        RuntimeError: If the tokenizers are not loaded, or if an item
            unexpectedly received no generation result.
    """
    if self.audio_tokenizer is None or self.text_tokenizer is None:
        raise RuntimeError(
            "Model is not loaded with audio/text tokenizers. Make sure you "
            "loaded the model with OmniVoice.from_pretrained()."
        )

    if generation_config is not None:
        gen_config = generation_config
        if kwargs:
            # The explicit config wins; make the dropped overrides visible.
            logger.warning(
                "generation_config was provided; ignoring keyword arguments: %s",
                ", ".join(sorted(kwargs)),
            )
    else:
        # from_dict() silently drops unknown keys, which hides typos such
        # as ``num_steps``; report them before building the config.
        known_keys = {f.name for f in fields(OmniVoiceGenerationConfig)}
        unknown_keys = sorted(k for k in kwargs if k not in known_keys)
        if unknown_keys:
            logger.warning(
                "Ignoring unknown generation arguments: %s",
                ", ".join(unknown_keys),
            )
        gen_config = OmniVoiceGenerationConfig.from_dict(kwargs)

    self.eval()

    full_task = self._preprocess_all(
        text=text,
        language=language,
        ref_text=ref_text,
        ref_audio=ref_audio,
        voice_clone_prompt=voice_clone_prompt,
        instruct=instruct,
        preprocess_prompt=gen_config.preprocess_prompt,
        speed=speed,
        duration=duration,
    )

    # Items at or below the chunking threshold are generated in one pass;
    # longer ones go through chunked generation.
    short_idx, long_idx = full_task.get_indices(
        gen_config, self.audio_tokenizer.config.frame_rate
    )

    results = [None] * full_task.batch_size

    if short_idx:
        short_task = full_task.slice_task(short_idx)
        short_results = self._generate_iterative(short_task, gen_config)
        for idx, res in zip(short_idx, short_results):
            results[idx] = res

    if long_idx:
        long_task = full_task.slice_task(long_idx)
        long_results = self._generate_chunked(long_task, gen_config)
        for idx, res in zip(long_idx, long_results):
            results[idx] = res

    generated_audios = []
    for i, (item_tokens, item_rms) in enumerate(zip(results, full_task.ref_rms)):
        if item_tokens is None:
            # Explicit raise: an ``assert`` would be stripped under ``-O``.
            raise RuntimeError(f"Result {i} was not generated")
        generated_audios.append(
            self._decode_and_post_process(item_tokens, item_rms, gen_config)
        )

    return generated_audios
+ preprocess_prompt: If ``True`` (default), apply silence removal and + trimming to the reference audio, add punctuation in the end + of reference text (if not already) + + Returns: + A :class:`VoiceClonePrompt` that can be passed to :meth:`generate`. + """ + if self.audio_tokenizer is None: + raise RuntimeError( + "Audio tokenizer is not loaded. Make sure you loaded the model " + "with OmniVoice.from_pretrained()." + ) + + if isinstance(ref_audio, str): + ref_wav = load_audio(ref_audio, self.sampling_rate) + else: + waveform, sr = ref_audio + if isinstance(waveform, torch.Tensor): + waveform = waveform.cpu().numpy() + if waveform.ndim == 1: + waveform = waveform[np.newaxis, :] + if waveform.shape[0] > 1: + waveform = np.mean(waveform, axis=0, keepdims=True) + if sr != self.sampling_rate: + waveform = torchaudio.functional.resample( + torch.from_numpy(waveform), + orig_freq=sr, + new_freq=self.sampling_rate, + ).numpy() + ref_wav = waveform + + ref_rms = float(np.sqrt(np.mean(ref_wav**2))) + if 0 < ref_rms < 0.1: + ref_wav = ref_wav * 0.1 / ref_rms + + if preprocess_prompt: + # Trim long reference audio (>20s) by splitting at the largest silence gap. + # Skip trimming when ref_text is user-provided, otherwise the + # trimmed audio will no longer match the full transcript. + if ref_text is None: + ref_wav = trim_long_audio( + ref_wav, self.sampling_rate, trim_threshold=20.0 + ) + ref_wav = remove_silence( + ref_wav, + self.sampling_rate, + mid_sil=200, + lead_sil=100, + trail_sil=200, + ) + if ref_wav.shape[-1] == 0: + raise ValueError( + "Reference audio is empty after silence removal. " + "Try setting preprocess_prompt=False." + ) + + ref_duration = ref_wav.shape[-1] / self.sampling_rate + if ref_duration > 20.0: + logger.warning( + "Reference audio is %.1fs long (>20s). This may cause slower " + "generation, higher memory usage, and degraded voice cloning " + "quality. 
We recommend trimming it to 3-10s.", + ref_duration, + ) + + # Auto-transcribe if ref_text not provided + if ref_text is None: + if self._asr_pipe is None: + logger.info("ASR model not loaded yet, loading on-the-fly ...") + self.load_asr_model() + ref_text = self.transcribe((ref_wav, self.sampling_rate)) + logger.debug("Auto-transcribed ref_text: %s", ref_text) + + chunk_size = self.audio_tokenizer.config.hop_length + clip_size = int(ref_wav.shape[-1] % chunk_size) + ref_wav = ref_wav[:, :-clip_size] if clip_size > 0 else ref_wav + # numpy → torch at tokenizer boundary + ref_wav_tensor = torch.from_numpy(ref_wav).to(self.audio_tokenizer.device) + ref_audio_tokens = self.audio_tokenizer.encode( + ref_wav_tensor.unsqueeze(0), + ).audio_codes.squeeze( + 0 + ) # (C, T) + + if preprocess_prompt: + ref_text = add_punctuation(ref_text) + + return VoiceClonePrompt( + ref_audio_tokens=ref_audio_tokens, + ref_text=ref_text, + ref_rms=ref_rms, + ) + + def _decode_and_post_process( + self, + tokens: Union[torch.Tensor, List[torch.Tensor]], + rms: Union[float, None], + gen_config: OmniVoiceGenerationConfig, + ) -> np.ndarray: + """ + Args: + tokens: Audio tokens — either a single tensor of shape + (num_codebooks, seq_len) or a list of chunk tensors. + rms: RMS of the reference audio for volume adjustment. + gen_config: Generation config for post-processing options. + Returns: + Decoded and post-processed audio array of shape (T,). 
+ """ + tokenizer_device = self.audio_tokenizer.device + if isinstance(tokens, list): + chunk_audios = [ + self.audio_tokenizer.decode(t.to(tokenizer_device).unsqueeze(0)) + .audio_values[0] + .cpu() + .numpy() + for t in tokens + ] + audio_waveform = cross_fade_chunks(chunk_audios, self.sampling_rate) + else: + audio_waveform = ( + self.audio_tokenizer.decode(tokens.to(tokenizer_device).unsqueeze(0)) + .audio_values[0] + .cpu() + .numpy() + ) + + audio_waveform = self._post_process_audio( + audio_waveform, + postprocess_output=gen_config.postprocess_output, + ref_rms=rms, + ) + return audio_waveform.squeeze(0) + + def _post_process_audio( + self, + generated_audio: np.ndarray, + postprocess_output: bool, + ref_rms: Union[float, None], + ) -> np.ndarray: + """Optionally remove long silences, adjust volume, and add edge padding. + + Args: + generated_audio: Numpy array of shape (1, T). + postprocess_output: If True, remove long silences and apply fade/pad. + ref_rms: RMS of the reference audio for volume normalisation. + Returns: + Processed numpy array of shape (1, T). + """ + if postprocess_output: + generated_audio = remove_silence( + generated_audio, + self.sampling_rate, + mid_sil=500, + lead_sil=100, + trail_sil=100, + ) + + if ref_rms is not None and ref_rms < 0.1: + generated_audio = generated_audio * ref_rms / 0.1 + elif ref_rms is None: + peak = np.abs(generated_audio).max() + if peak > 1e-6: + generated_audio = generated_audio / peak * 0.5 + + generated_audio = fade_and_pad_audio( + generated_audio, + sample_rate=self.sampling_rate, + ) + return generated_audio + + def _generate_chunked( + self, task: GenerationTask, gen_config: OmniVoiceGenerationConfig + ) -> List[List[torch.Tensor]]: + """Generate long audio by splitting text into chunks and batching. + + Each item in the returned list corresponds to one input and contains + a list of audio token tensors — one per text chunk. 
+ + Args: + task: A :class:`GenerationTask` with one or more items whose + estimated audio exceeds ``audio_chunk_threshold``. + gen_config: Generation config (``audio_chunk_duration`` controls + chunk size). + Returns: + Per-item list of chunk token-tensor lists. + """ + # Chunk each item's text + all_chunks = [] + for i in range(task.batch_size): + avg_tokens_per_char = task.target_lens[i] / len(task.texts[i]) + text_chunk_len = int( + gen_config.audio_chunk_duration + * self.audio_tokenizer.config.frame_rate + / avg_tokens_per_char + ) + chunks = chunk_text_punctuation( + text=task.texts[i], + chunk_len=text_chunk_len, + min_chunk_len=3, + ) + logger.debug(f"Item {i} chunked into {len(chunks)} pieces: {chunks}") + all_chunks.append(chunks) + + has_ref = [t is not None for t in task.ref_audio_tokens] + assert all(has_ref) or not any(has_ref), ( + "Chunked inference requires all items to either have or not have " + "ref_audio. Mixed ref/non-ref is not supported." + ) + + max_num_chunks = max(len(c) for c in all_chunks) + + # chunk_results[item_idx] = list of generated token tensors per chunk + chunk_results = [[] for _ in range(task.batch_size)] + + def _run_batch(indices, texts, ref_audios, ref_texts): + speed_list = task.speed + target_lens = [ + self._estimate_target_tokens( + texts[j], + ref_texts[j], + ref_audios[j].size(-1) if ref_audios[j] is not None else None, + speed=speed_list[i] if speed_list else 1.0, + ) + for j, i in enumerate(indices) + ] + sub_task = GenerationTask( + batch_size=len(indices), + texts=texts, + target_lens=target_lens, + langs=[task.langs[i] for i in indices], + instructs=[task.instructs[i] for i in indices], + ref_texts=ref_texts, + ref_audio_tokens=ref_audios, + ref_rms=[task.ref_rms[i] for i in indices], + speed=[task.speed[i] for i in indices] if task.speed else None, + ) + gen_tokens = self._generate_iterative(sub_task, gen_config) + for j, idx in enumerate(indices): + chunk_results[idx].append(gen_tokens[j]) + + if 
def _preprocess_all(
    self,
    text: Union[str, list[str]],
    language: Union[str, list[str], None] = None,
    ref_text: Union[str, list[str], None] = None,
    ref_audio: Union[
        str,
        list[str],
        tuple[torch.Tensor, int],
        list[tuple[torch.Tensor, int]],
        None,
    ] = None,
    voice_clone_prompt: Union[
        VoiceClonePrompt, list[VoiceClonePrompt], None
    ] = None,
    instruct: Union[str, list[str], None] = None,
    preprocess_prompt: bool = True,
    speed: Union[float, list[Optional[float]], None] = None,
    duration: Union[float, list[Optional[float]], None] = None,
) -> GenerationTask:
    """Normalise user inputs into a batch-aligned :class:`GenerationTask`.

    Every argument may be a single value (applied to all items) or a list
    with one entry per text.

    Args:
        text: Target text or list of texts; its length defines the batch size.
        language: Language name/code per item, resolved via
            ``_resolve_language``.
        ref_text: Reference transcript(s); used only when ``ref_audio`` is
            given and ``voice_clone_prompt`` is not.
        ref_audio: Reference audio (path or ``(waveform, sample_rate)``)
            per item. A single value is paired with every ``ref_text`` and
            vice versa.
        voice_clone_prompt: Pre-built prompt(s); takes precedence over
            ``ref_text``/``ref_audio``.
        instruct: Voice-design instruction(s), resolved via
            ``_resolve_instruct``.
        preprocess_prompt: Forwarded to :meth:`create_voice_clone_prompt`.
        speed: Speed factor(s). ``None`` entries mean "no speed change".
        duration: Fixed duration(s) in seconds. A non-``None`` entry
            overrides ``speed`` for that item.

    Returns:
        A :class:`GenerationTask` whose lists all have ``len(text)`` entries.

    Raises:
        ValueError: If a list argument has a length other than 1 or the
            batch size.
    """
    if isinstance(text, str):
        text_list = [text]
    else:
        assert isinstance(
            text, list
        ), "text should be a string or a list of strings"
        text_list = text
    batch_size = len(text_list)

    def _per_item_floats(value, name):
        # Broadcast a scalar / length-1 list to one Optional[float] per item.
        if value is None:
            return None
        if isinstance(value, (int, float)):
            return [float(value)] * batch_size
        values = list(value)
        if len(values) == 1:
            values = values * batch_size
        if len(values) != batch_size:
            raise ValueError(
                f"{name} should have 1 or {batch_size} entries, "
                f"but got {len(values)}"
            )
        return values

    language_list = [
        _resolve_language(lang)
        for lang in self._ensure_list(language, batch_size)
    ]

    # Build a fresh list so a caller-supplied ``instruct`` list is never
    # mutated in place.
    instruct_list = [
        None
        if item is None
        else _resolve_instruct(
            item, use_zh=bool(item_text and _ZH_RE.search(item_text))
        )
        for item, item_text in zip(
            self._ensure_list(instruct, batch_size), text_list
        )
    ]

    if voice_clone_prompt is not None and (
        ref_text is not None or ref_audio is not None
    ):
        logger.warning(
            "Both voice_clone_prompt and ref_text/ref_audio are provided. "
            "ref_text/ref_audio will be ignored."
        )
    if voice_clone_prompt is None and ref_audio is not None:
        # If voice_clone_prompt is not provided, create it from
        # ref_audio (ref_text will be auto-transcribed if not given).
        raw_ref_texts = self._ensure_list(ref_text, batch_size, auto_repeat=False)
        raw_ref_audios = self._ensure_list(ref_audio, batch_size, auto_repeat=False)

        # Pair texts and audios at a common length. When both are single
        # values only one prompt is built (and broadcast below), so the
        # reference audio is encoded once.
        num_prompts = max(len(raw_ref_texts), len(raw_ref_audios))
        if len(raw_ref_texts) == 1:
            raw_ref_texts = raw_ref_texts * num_prompts
        if len(raw_ref_audios) == 1:
            raw_ref_audios = raw_ref_audios * num_prompts

        voice_clone_prompt = [
            self.create_voice_clone_prompt(
                ref_audio=item_audio,
                ref_text=item_text,
                preprocess_prompt=preprocess_prompt,
            )
            for item_audio, item_text in zip(raw_ref_audios, raw_ref_texts)
        ]

    voice_clone_prompt_list = self._ensure_list(voice_clone_prompt, batch_size)
    # Read each prompt individually so an empty batch or a ``None`` entry
    # does not raise.
    ref_text_list = [
        vc.ref_text if vc is not None else None for vc in voice_clone_prompt_list
    ]
    ref_audio_tokens_list = [
        vc.ref_audio_tokens if vc is not None else None
        for vc in voice_clone_prompt_list
    ]
    ref_rms_list = [
        vc.ref_rms if vc is not None else None for vc in voice_clone_prompt_list
    ]

    # Normalize speed/duration to per-item lists (may contain None).
    user_speed = _per_item_floats(speed, "speed")
    durations = _per_item_floats(duration, "duration")

    num_target_tokens_list = []
    for i in range(batch_size):
        # duration[i] overrides speed for estimation: use speed=1.0
        # to get the raw estimate, then override target_lens below.
        has_dur = durations is not None and durations[i] is not None
        if has_dur or not user_speed or user_speed[i] is None:
            item_speed = 1.0
        else:
            item_speed = user_speed[i]
        item_ref_tokens = ref_audio_tokens_list[i]
        num_target_tokens_list.append(
            self._estimate_target_tokens(
                text_list[i],
                ref_text_list[i],
                item_ref_tokens.size(-1) if item_ref_tokens is not None else None,
                speed=item_speed,
            )
        )

    # Per-item duration overrides: set target_lens to exact frame count
    # and compute speed ratio so chunked generation scales proportionally.
    speed_list: Optional[List[float]] = None
    if durations is not None:
        frame_rate = self.audio_tokenizer.config.frame_rate
        speed_list = []
        for i in range(batch_size):
            if durations[i] is not None:
                # Clamped to >= 1, so the division below is always safe.
                target_tokens = max(1, int(durations[i] * frame_rate))
                speed_list.append(num_target_tokens_list[i] / target_tokens)
                num_target_tokens_list[i] = target_tokens
            else:
                item_speed = user_speed[i] if user_speed else None
                speed_list.append(item_speed if item_speed is not None else 1.0)
    elif user_speed is not None:
        speed_list = [s if s is not None else 1.0 for s in user_speed]

    return GenerationTask(
        batch_size=batch_size,
        texts=text_list,
        target_lens=num_target_tokens_list,
        langs=language_list,
        instructs=instruct_list,
        ref_texts=ref_text_list,
        ref_audio_tokens=ref_audio_tokens_list,
        ref_rms=ref_rms_list,
        speed=speed_list,
    )
+ num_ref_audio_tokens = 25 + + est = self.duration_estimator.estimate_duration( + text, ref_text, num_ref_audio_tokens + ) + if speed > 0 and speed != 1.0: + est = est / speed + return max(1, int(est)) + + def _ensure_list( + self, x: Union[Any, List[Any]], batch_size: int, auto_repeat: bool = True + ) -> List[Any]: + x_list = x if isinstance(x, list) else [x] + if len(x_list) not in ( + 1, + batch_size, + ): + raise ValueError( + f"should be either the number of the text or 1, but got {len(x_list)}" + ) + if auto_repeat and len(x_list) == 1 and batch_size is not None: + x_list = x_list * batch_size + return x_list + + def _prepare_inference_inputs( + self, + text: str, + num_target_tokens: int, + ref_text: Optional[str] = None, + ref_audio_tokens: Optional[torch.Tensor] = None, + lang: Optional[str] = None, + instruct: Optional[str] = None, + denoise: bool = True, + ): + """Prepare input_ids and audio masks for inference. + Args: + text: Target text to generate. + num_target_tokens: Number of audio tokens to generate. + ref_text: Optional reference text for voice cloning. + ref_audio_tokens: Optional reference audio tokens for voice cloning. + with shape (C, T). + lang: Optional language ID. + instruct: Optional style instruction for voice design. + denoise: Whether to include the <|denoise|> token. 
+ """ + + # Build style tokens: <|denoise|> + <|lang_start|>...<|lang_end|> + # + <|instruct_start|>...<|instruct_end|> + style_text = "" + if denoise and ref_audio_tokens is not None: + style_text += "<|denoise|>" + lang_str = lang if lang else "None" + instruct_str = instruct if instruct else "None" + style_text += f"<|lang_start|>{lang_str}<|lang_end|>" + style_text += f"<|instruct_start|>{instruct_str}<|instruct_end|>" + + style_tokens = ( + self.text_tokenizer(style_text, return_tensors="pt") + .input_ids.repeat(self.config.num_audio_codebook, 1) + .unsqueeze(0) + ).to( + self.device + ) # [1, C, N1] + + # Build text tokens + full_text = _combine_text(ref_text=ref_text, text=text) + wrapped_text = f"<|text_start|>{full_text}<|text_end|>" + text_tokens = ( + _tokenize_with_nonverbal_tags(wrapped_text, self.text_tokenizer) + .repeat(self.config.num_audio_codebook, 1) + .unsqueeze(0) + ).to( + self.device + ) # [1, C, N2] + + # Target: all MASK + target_audio_tokens = torch.full( + (1, self.config.num_audio_codebook, num_target_tokens), + self.config.audio_mask_id, + dtype=torch.long, + device=self.device, + ) + + # Conditional input + parts = [style_tokens, text_tokens] + if ref_audio_tokens is not None: + parts.append(ref_audio_tokens.unsqueeze(0).to(self.device)) + parts.append(target_audio_tokens) + cond_input_ids = torch.cat(parts, dim=2) + + cond_total_length = cond_input_ids.shape[2] + cond_audio_start_idx = cond_total_length - num_target_tokens + if ref_audio_tokens is not None: + cond_audio_start_idx -= ref_audio_tokens.size(-1) + + cond_audio_mask = torch.zeros( + 1, cond_total_length, dtype=torch.bool, device=self.device + ) + cond_audio_mask[0, cond_audio_start_idx:] = True + + return { + "input_ids": cond_input_ids, + "audio_mask": cond_audio_mask, + } + + def _generate_iterative( + self, task: GenerationTask, gen_config: OmniVoiceGenerationConfig + ) -> List[torch.Tensor]: + """N-step iterative unmasked decoding. 
+ + Args: + task: A :class:`GenerationTask` containing batch texts, target + lengths, languages, instructions, and optional reference data. + gen_config: A :class:`OmniVoiceGenerationConfig` controlling + decoding steps, guidance, temperatures, etc. + Returns: + List of generated audio token tensors of shape (C, T) (one per + input text). + """ + + B = task.batch_size + + for i in range(B): + logger.debug( + "Item %d — text: %s | ref_text: %s | instruct: %s | lang: %s | target_tokens: %d", + i, + task.texts[i], + task.ref_texts[i], + task.instructs[i], + task.langs[i], + task.target_lens[i], + ) + + inputs_list = [ + self._prepare_inference_inputs( + task.texts[i], + task.target_lens[i], + task.ref_texts[i], + task.ref_audio_tokens[i], + task.langs[i], + task.instructs[i], + gen_config.denoise, + ) + for i in range(B) + ] + + c_lens = [inp["input_ids"].size(2) for inp in inputs_list] + max_c_len = max(c_lens) + pad_id = self.config.audio_mask_id # Or any other tokens + + batch_input_ids = torch.full( + (2 * B, self.config.num_audio_codebook, max_c_len), + pad_id, + dtype=torch.long, + device=self.device, + ) + batch_audio_mask = torch.zeros( + (2 * B, max_c_len), dtype=torch.bool, device=self.device + ) + batch_attention_mask = torch.zeros( + (2 * B, 1, max_c_len, max_c_len), dtype=torch.bool, device=self.device + ) + + for i, inp in enumerate(inputs_list): + c_len, u_len = c_lens[i], task.target_lens[i] + + # Cond (0 ~ B-1) + batch_input_ids[i, :, :c_len] = inp["input_ids"] + batch_audio_mask[i, :c_len] = inp["audio_mask"] + batch_attention_mask[i, :, :c_len, :c_len] = True + + # Uncond (B ~ 2B-1) + batch_input_ids[B + i, :, :u_len] = inp["input_ids"][..., -u_len:] + batch_audio_mask[B + i, :u_len] = inp["audio_mask"][..., -u_len:] + batch_attention_mask[B + i, :, :u_len, :u_len] = True + if max_c_len > u_len: + pad_diag = torch.arange(u_len, max_c_len, device=self.device) + batch_attention_mask[B + i, :, pad_diag, pad_diag] = True + + tokens = torch.full( + (B, 
self.config.num_audio_codebook, max(task.target_lens)), + self.config.audio_mask_id, + dtype=torch.long, + device=self.device, + ) + + timesteps = _get_time_steps( + t_start=0.0, + t_end=1.0, + num_step=gen_config.num_step, + t_shift=gen_config.t_shift, + ).tolist() + schedules = [] + for t_len in task.target_lens: + total_mask = t_len * self.config.num_audio_codebook + rem = total_mask + sched = [] + for step in range(gen_config.num_step): + num = ( + rem + if step == gen_config.num_step - 1 + else min( + math.ceil(total_mask * (timesteps[step + 1] - timesteps[step])), + rem, + ) + ) + sched.append(int(num)) + rem -= int(num) + schedules.append(sched) + + layer_ids = torch.arange( + self.config.num_audio_codebook, device=self.device + ).view(1, -1, 1) + + for step in range(gen_config.num_step): + batch_logits = self( + input_ids=batch_input_ids, + audio_mask=batch_audio_mask, + attention_mask=batch_attention_mask, + ).logits.to(torch.float32) + + for i in range(B): + k = schedules[i][step] + if k <= 0: + continue + + c_len, t_len = c_lens[i], task.target_lens[i] + + # Extract real target Logits + # [1, C, T, V] + c_logits = batch_logits[i : i + 1, :, c_len - t_len : c_len, :] + u_logits = batch_logits[B + i : B + i + 1, :, :t_len, :] + + pred_tokens, scores = self._predict_tokens_with_scoring( + c_logits, u_logits, gen_config + ) + + scores = scores - (layer_ids * gen_config.layer_penalty_factor) + + if gen_config.position_temperature > 0.0: + scores = _gumbel_sample(scores, gen_config.position_temperature) + + sample_tokens = tokens[i : i + 1, :, :t_len] + scores.masked_fill_( + sample_tokens != self.config.audio_mask_id, -float("inf") + ) + + _, topk_idx = torch.topk(scores.flatten(), k) + flat_tokens = sample_tokens.flatten() + flat_tokens[topk_idx] = pred_tokens.flatten()[topk_idx] + sample_tokens.copy_(flat_tokens.view_as(sample_tokens)) + + # Update individual slices into batched structure + tokens[i : i + 1, :, :t_len] = sample_tokens + batch_input_ids[i : 
i + 1, :, c_len - t_len : c_len] = sample_tokens + batch_input_ids[B + i : B + i + 1, :, :t_len] = sample_tokens + + return [tokens[i, :, : task.target_lens[i]] for i in range(B)] + + def _predict_tokens_with_scoring(self, c_logits, u_logits, gen_config): + if gen_config.guidance_scale != 0: + c_log_probs = F.log_softmax(c_logits, dim=-1) + u_log_probs = F.log_softmax(u_logits, dim=-1) + log_probs = torch.log_softmax( + c_log_probs + gen_config.guidance_scale * (c_log_probs - u_log_probs), + dim=-1, + ) + else: + log_probs = F.log_softmax(c_logits, dim=-1) + + log_probs[..., self.config.audio_mask_id] = -float("inf") + + if gen_config.class_temperature > 0.0: + filtered_probs = _filter_top_k(log_probs, ratio=0.1) + pred_tokens = _gumbel_sample( + filtered_probs, gen_config.class_temperature + ).argmax(dim=-1) + else: + pred_tokens = log_probs.argmax(dim=-1) + + confidence_scores = log_probs.max(dim=-1)[0] + + return pred_tokens, confidence_scores + + +# --------------------------------------------------------------------------- +# Standalone helpers +# --------------------------------------------------------------------------- + + +def _get_packed_mask(document_ids): + return partial(_mask_mod_packed, document_ids) + + +def _mask_mod_packed(document_ids, b, h, q_idx, kv_idx): + # 1. Sequence Packing Logic: Tokens must belong to the same document. + # Note: The doc_id for padding tokens is -1, which will automatically not match + # (if handled correctly) or be ignored. + same_doc = document_ids[q_idx] == document_ids[kv_idx] + return same_doc + + +def _resolve_language(language: Optional[str]) -> Union[str, None]: + from omnivoice.utils.lang_map import LANG_IDS, LANG_NAME_TO_ID + + if language is None or language.lower() == "none": + return None + if language in LANG_IDS: + return language + key = language.lower() + if key in LANG_NAME_TO_ID: + return LANG_NAME_TO_ID[key] + logger.warning( + f"Language '{language}' is not recognized. 
" + f"Please use a valid language ID (e.g., 'en', 'zh', 'ja', 'de') " + f"or a full language name (e.g., 'English', 'Chinese', 'Japanese'). " + f"See supported_language_ids() or supported_language_names() for details. " + f"Falling back to None (language-agnostic mode)." + ) + return None + + +def _resolve_instruct( + instruct: Optional[str], use_zh: bool = False +) -> Union[str, None]: + """Validate and normalise a voice-design instruct string. + + Supported instruct items (case-insensitive for English): + + English (comma + space separated): + gender: male, female + age: child, teenager, young adult, middle-aged, elderly + pitch: very low pitch, low pitch, moderate pitch, + high pitch, very high pitch + style: whisper + accent: american accent, british accent, australian accent, ... + + Chinese (full-width comma separated): + gender: 男, 女 + age: 儿童, 少年, 青年, 中年, 老年 + pitch: 极低音调, 低音调, 中音调, 高音调, 极高音调 + style: 耳语 + dialect: 河南话, 陕西话, 四川话, 贵州话, 云南话, + 桂林话, 济南话, 石家庄话, 甘肃话, 宁夏话, + 青岛话, 东北话 + + Minor issues (auto-fixed): + - Wrong separator (half-width comma in Chinese instruct or + full-width comma in English instruct) + - Leading / trailing commas + + Major issues (raise ``ValueError``): + - Unsupported or misspelled instruct items + - Suggestions are offered for close matches + + Args: + instruct: Raw instruct string, or ``None``. + use_zh: If True, normalise all items to Chinese (used when the + synthesis text contains Chinese and no accent is specified). + + Returns: + Normalised instruct string, or ``None``. + + Raises: + ValueError: if any instruct item is unsupported or misspelled. 
+ """ + if instruct is None: + return None + + instruct_str = instruct.strip() + if not instruct_str: + return None + + # Split on both half-width and full-width commas + raw_items = re.split(r"\s*[,,]\s*", instruct_str) + raw_items = [x for x in raw_items if x] + + # Validate each item + unknown = [] + normalised = [] + for raw in raw_items: + n = raw.strip().lower() + if n in _INSTRUCT_ALL_VALID: + normalised.append(n) + else: + sug = difflib.get_close_matches(n, _INSTRUCT_ALL_VALID, n=1, cutoff=0.6) + unknown.append((raw, n, sug[0] if sug else None)) + + if unknown: + lines = [] + for raw, n, sug in unknown: + if sug: + lines.append(f" '{raw}' -> '{n}' (unsupported; did you mean '{sug}'?)") + else: + lines.append(f" '{raw}' -> '{n}' (unsupported)") + err = ( + f"Unsupported instruct items found in {instruct_str}:\n" + + "\n".join(lines) + + "\n\nValid English items: " + + ", ".join(sorted(_INSTRUCT_VALID_EN)) + + "\nValid Chinese items: " + + ",".join(sorted(_INSTRUCT_VALID_ZH)) + + "\n\nTip: Use only English or only Chinese instructs. " + "English instructs should use comma + space (e.g. " + "'male, indian accent'),\nChinese instructs should use full-width " + "comma (e.g. '男,河南话')." + ) + raise ValueError(err) + + # --- Language consistency: dialect forces Chinese, accent forces English --- + has_dialect = any(n.endswith("话") for n in normalised) + has_accent = any(" accent" in n for n in normalised) + + if has_dialect and has_accent: + raise ValueError( + "Cannot mix Chinese dialect and English accent in a single instruct. " + "Dialects are for Chinese speech, accents for English speech." 
+ ) + + if has_dialect: + use_zh = True + elif has_accent: + use_zh = False + + # --- Unify to single language --- + if use_zh: + normalised = [_INSTRUCT_EN_TO_ZH.get(n, n) for n in normalised] + else: + normalised = [_INSTRUCT_ZH_TO_EN.get(n, n) for n in normalised] + + # --- Category conflict check --- + conflicts = [] + for cat in _INSTRUCT_MUTUALLY_EXCLUSIVE: + hits = [n for n in normalised if n in cat] + if len(hits) > 1: + conflicts.append(hits) + if conflicts: + parts = [] + for group in conflicts: + parts.append(" vs ".join(f"'{x}'" for x in group)) + raise ValueError( + "Conflicting instruct items within the same category: " + + "; ".join(parts) + + ". Each category (gender, age, pitch, style, accent, dialect) " + "allows at most one item." + ) + + # Determine separator based on language + has_zh = any(any("\u4e00" <= c <= "\u9fff" for c in n) for n in normalised) + separator = "," if has_zh else ", " + + return separator.join(normalised) + + +def _filter_top_k(logits: torch.Tensor, ratio: float = 0.1) -> torch.Tensor: + k = math.ceil(ratio * logits.shape[-1]) + val, ind = logits.topk(k, dim=-1) + probs = torch.full_like(logits, float("-inf")) + probs.scatter_(-1, ind, val) + return probs + + +def _gumbel_sample(logits: torch.Tensor, temperature: float) -> torch.Tensor: + scaled_logits = logits / temperature + u = torch.rand_like(scaled_logits) + gumbel_noise = -torch.log(-torch.log(u + 1e-10) + 1e-10) + return scaled_logits + gumbel_noise + + +def _get_time_steps( + t_start: float = 0.0, + t_end: float = 1.0, + num_step: int = 10, + t_shift: float = 1.0, + device: torch.device = torch.device("cpu"), +) -> torch.Tensor: + timesteps = torch.linspace(t_start, t_end, num_step + 1).to(device) + timesteps = t_shift * timesteps / (1 + (t_shift - 1) * timesteps) + return timesteps + + +_NONVERBAL_PATTERN = re.compile( + r"\[(laughter|sigh|confirmation-en|question-en|question-ah|question-oh|" + r"question-ei|question-yi|surprise-ah|surprise-oh|surprise-wa|" + 
r"surprise-yo|dissatisfaction-hnn)\]" +) + + +def _tokenize_with_nonverbal_tags(text: str, tokenizer) -> torch.Tensor: + """Tokenize text containing non-verbal tags, handling each tag independently. + + Non-verbal tags are tokenized standalone to guarantee consistent token + IDs regardless of surrounding language context (Chinese, English, etc.). + + Args: + text: Full text string potentially containing non-verbal tags. + tokenizer: HuggingFace text tokenizer instance. + Returns: + Token IDs tensor of shape (1, seq_len). + """ + parts = [] + last_end = 0 + for m in _NONVERBAL_PATTERN.finditer(text): + if m.start() > last_end: + segment = text[last_end : m.start()] + ids = tokenizer(segment, add_special_tokens=False).input_ids + if ids: + parts.append(ids) + tag_ids = tokenizer(m.group(), add_special_tokens=False).input_ids + if tag_ids: + parts.append(tag_ids) + last_end = m.end() + if last_end < len(text): + segment = text[last_end:] + ids = tokenizer(segment, add_special_tokens=False).input_ids + if ids: + parts.append(ids) + + if not parts: + result = tokenizer(text, return_tensors="pt").input_ids + else: + combined = [] + for p in parts: + combined.extend(p) + result = torch.tensor([combined], dtype=torch.long) + return result + + +def _combine_text(text, ref_text: Optional[str] = None) -> str: + + # combine with reference text if not None + if ref_text: + full_text = ref_text.strip() + " " + text.strip() + else: + full_text = text.strip() + + # filter out newline / carriage-return characters + full_text = re.sub(r"[\r\n]+", "", full_text) + + # replace Chinese parentheses with English ones + full_text = full_text.replace("\uff08", "(").replace("\uff09", ")") + + # collapse consecutive spaces / tabs into a single space + full_text = re.sub(r"[ \t]+", " ", full_text) + + # remove spaces around chinese characters + chinese_range = r"[\u4e00-\u9fff]" + pattern = rf"(?<={chinese_range})\s+|\s+(?={chinese_range})" + full_text = re.sub(pattern, "", full_text) + + 
return full_text + + +# --------------------------------------------------------------------------- +# Register with HuggingFace Auto classes +# --------------------------------------------------------------------------- + +AutoConfig.register("omnivoice", OmniVoiceConfig) +AutoModel.register(OmniVoiceConfig, OmniVoice) diff --git a/omnivoice/scripts/__init__.py b/omnivoice/scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/omnivoice/scripts/denoise_audio.py b/omnivoice/scripts/denoise_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..27fce910d6028ef3e0acebca8eeb1864ec4456fc --- /dev/null +++ b/omnivoice/scripts/denoise_audio.py @@ -0,0 +1,1049 @@ +#!/usr/bin/env python3 +# Copyright 2026 Xiaomi Corp. (authors: Han Zhu) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Denoise audio with Sidon and pack results into WebDataset shards. + +Supports two input modes: + +1. WebDataset manifest (data.lst): + python denoise_audio.py \ + --input_manifest data.lst \ + --tar_output_pattern output/audios/shard-%06d.tar \ + --jsonl_output_pattern output/txts/shard-%06d.jsonl \ + --feature_extractor_path sidon-v0.1/feature_extractor_cuda.pt \ + --decoder_path sidon-v0.1/decoder_cuda.pt + +2. 
Raw JSONL (each line: {"id": "...", "audio_path": "...", ...}): + python denoise_audio.py \ + --input_jsonl data.jsonl \ + --tar_output_pattern output/audios/shard-%06d.tar \ + --jsonl_output_pattern output/txts/shard-%06d.jsonl \ + --feature_extractor_path sidon-v0.1/feature_extractor_cuda.pt \ + --decoder_path sidon-v0.1/decoder_cuda.pt + +Output structure: + output_dir/ + ├── audios/ # WebDataset tar shards (.flac audio + .json metadata) + │ ├── shard_000000.tar + │ └── ... + ├── txts/ # Per-shard JSONL metadata + │ ├── shard_000000.jsonl + │ └── ... + ├── data.lst # Manifest: + └── errors.jsonl # Failed samples with error details +""" + +from __future__ import annotations + +import argparse +import io +import json +import logging +import os +import pickle +import struct +import subprocess +import sys +import threading +from concurrent.futures import FIRST_COMPLETED, Future, wait +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence, Union + +import numpy as np +import torch +import torchaudio +import webdataset as wds +from torch.utils.data import DataLoader +from tqdm.auto import tqdm + +from omnivoice.data.batching import StreamLengthGroupDataset +from omnivoice.data.dataset import JsonlDatasetReader, WebDatasetReader +import soundfile as sf +from omnivoice.utils.common import str2bool + +SIDON_INPUT_SAMPLE_RATE = 16_000 +SIDON_OUTPUT_SAMPLE_RATE = 48_000 + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description=__doc__) + + # ── Input (mutually exclusive) ── + parser.add_argument( + "--input_manifest", + default=None, + help="WebDataset manifest (data.lst). Each line: " + " ", + ) + parser.add_argument( + "--input_jsonl", + default=None, + help="Raw JSONL file. Each line: " '{"id": "...", "audio_path": "...", ...}', + ) + + # ── Output ── + parser.add_argument( + "--tar_output_pattern", + default=None, + help="Tar shard pattern, e.g. 
output/audios/shard_%%06d.tar", + ) + parser.add_argument( + "--jsonl_output_pattern", + default=None, + help="JSONL shard pattern, e.g. output/txts/shard_%%06d.jsonl", + ) + parser.add_argument( + "--samples_per_shard", + type=int, + default=1_000, + help="Maximum records per output shard", + ) + + # ── Model ── + parser.add_argument( + "--feature_extractor_path", + default=None, + help="Path to feature_extractor_cuda.pt", + ) + parser.add_argument( + "--decoder_path", + default=None, + help="Path to decoder_cuda.pt", + ) + parser.add_argument( + "--target_sample_rate", + type=int, + default=24_000, + help="Sample rate of the denoised output audio", + ) + + # ── Filtering ── + parser.add_argument( + "--min_length", + type=float, + default=0.0, + help="Minimum audio duration in seconds", + ) + parser.add_argument( + "--max_length", + type=float, + default=80.0, + help="Maximum audio duration in seconds", + ) + + # ── Batching ── + parser.add_argument( + "--batch_duration", + type=float, + default=200.0, + help="Target batch duration in seconds for dynamic batching", + ) + parser.add_argument( + "--max_sample", + type=int, + default=32, + help="Maximum samples per batch for dynamic batching", + ) + + # ── Distributed ── + parser.add_argument( + "--num_machines", + type=int, + default=1, + help="Total number of machines for distributed runs", + ) + parser.add_argument( + "--machine_index", + type=int, + default=0, + help="Zero-based machine index when distributing across multiple " + "machines (e.g. 0, 1, ... 
def count_lines(path: str) -> int:
    """Return the number of newline bytes in the file at ``path``.

    The file is read in binary mode in 1 MiB chunks, so it is never loaded
    into memory whole. A final line without a trailing newline is not counted.
    """
    newline_total = 0
    with open(path, "rb") as handle:
        while True:
            block = handle.read(1 << 20)
            if not block:
                break
            newline_total += block.count(b"\n")
    return newline_total
"""Extract SeamlessM4T features using Torch-only operators.""" + if not isinstance(raw_speech, list): + raw_speech = [raw_speech] + + processed_speech = [ + torch.as_tensor(sample, dtype=torch.float32, device=device) + for sample in raw_speech + ] + + features: List[torch.Tensor] = [] + for waveform in processed_speech: + if waveform.ndim > 1: + waveform = waveform[0] + waveform_tensor = waveform.unsqueeze(0) + feature = torchaudio.compliance.kaldi.fbank( + waveform=waveform_tensor, + sample_frequency=sampling_rate, + num_mel_bins=num_mel_bins, + frame_length=frame_length, + frame_shift=frame_shift, + dither=dither, + preemphasis_coefficient=preemphasis_coefficient, + remove_dc_offset=True, + window_type=window_type, + use_energy=False, + energy_floor=1.192092955078125e-07, + ) + features.append(feature.squeeze(0)) + + if do_normalize_per_mel_bins: + normalised: List[torch.Tensor] = [] + for feature in features: + mean = feature.mean(0, keepdim=True) + var = feature.var(0, keepdim=True) + normalised.append((feature - mean) / torch.sqrt(var + 1e-5)) + features = normalised + + def _pad_batch( + features: List[torch.Tensor], + padding_strategy: PaddingStrategy = "longest", + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + padding_value: float = 0.0, + ) -> tuple[torch.Tensor, torch.Tensor]: + if padding_strategy == "longest": + target_length = max(f.shape[0] for f in features) + elif max_length is not None: + target_length = max_length + else: + raise ValueError( + "max_length must be provided when padding_strategy is not 'longest'" + ) + + if pad_to_multiple_of is not None: + target_length = ( + (target_length + pad_to_multiple_of - 1) + // pad_to_multiple_of + * pad_to_multiple_of + ) + + batch_size = len(features) + feature_dim = features[0].shape[1] + device = features[0].device + + padded_features = torch.full( + (batch_size, target_length, feature_dim), + padding_value, + dtype=torch.float32, + device=device, + ) + attention_mask 
= torch.zeros( + (batch_size, target_length), + dtype=torch.int64, + device=device, + ) + + for index, feature_tensor in enumerate(features): + seq_len = feature_tensor.shape[0] + padded_features[index, :seq_len] = feature_tensor + attention_mask[index, :seq_len] = 1 + + return padded_features, attention_mask + + input_features, attention_mask = _pad_batch( + features, + padding_strategy=padding, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + padding_value=padding_value, + ) + + batch_size, num_frames, num_channels = input_features.shape + new_num_frames = (num_frames // stride) * stride + input_features = input_features[:, :new_num_frames, :] + if return_attention_mask: + attention_mask = attention_mask[:, :new_num_frames] + + input_features = input_features.reshape( + batch_size, new_num_frames // stride, num_channels * stride + ) + + output: Dict[str, ReturnType] = {"input_features": input_features} + if return_attention_mask: + output["attention_mask"] = attention_mask[:, 1::stride] + + if return_tensors == "np": + for key, value in output.items(): + output[key] = value.cpu().numpy() # type: ignore[assignment] + + return output + + +def serialise_flac(key: str, waveform: torch.Tensor, sample_rate: int) -> dict: + buffer = io.BytesIO() + audio = waveform.to(dtype=torch.float32).cpu().numpy() + if audio.ndim == 2: + audio = audio.T # (C, T) → (T, C) for soundfile + sf.write(buffer, audio, sample_rate, format="FLAC") + return {"__key__": key, "flac": buffer.getvalue()} + + +def _normalise_value(value: Any) -> Any: + """Convert tensors and NumPy scalars to serialisable Python objects.""" + if isinstance(value, torch.Tensor): + if value.ndim == 0: + return value.item() + return value.cpu().tolist() + if isinstance(value, np.generic): + return value.item() + if isinstance(value, np.ndarray): + return value.tolist() + return value + + +def _encode_metadata(metadata: dict[str, Any]) -> bytes: + cleaned: dict[str, Any] = {} + for key, value in 
class SpeechDenoisingProcessor:
    """Run the TorchScripted feature extractor and decoder.

    Both modules are loaded with ``torch.jit.load`` onto ``device`` and put
    into eval mode. ``process_batch`` turns input waveforms into restored
    waveforms whose lengths are padded or trimmed to the expected length.
    """

    def __init__(
        self,
        feature_extractor_path: str,
        decoder_path: str,
        device: str,
    ) -> None:
        """Load both TorchScript modules onto ``device``.

        Args:
            feature_extractor_path: Path to the TorchScript feature extractor.
            decoder_path: Path to the TorchScript decoder.
            device: Torch device string (e.g. ``"cuda"`` or ``"cpu"``).
        """
        self.device = torch.device(device)
        self.feature_extractor = torch.jit.load(
            feature_extractor_path, map_location=self.device
        )
        self.decoder = torch.jit.load(decoder_path, map_location=self.device)
        self.feature_extractor.eval()
        self.decoder.eval()

    @torch.inference_mode()
    def process(self, waveform: torch.Tensor, sample_rate: int) -> torch.Tensor:
        """Denoise a single waveform; thin wrapper around ``process_batch``."""
        return self.process_batch([waveform], [sample_rate])[0]

    @torch.inference_mode()
    def process_batch(
        self,
        waveforms: Sequence[torch.Tensor] | torch.Tensor,
        sample_rates: Optional[Sequence[int]] = None,
        expected_lengths: Optional[Sequence[int]] = None,
    ) -> List[torch.Tensor]:
        """Denoise a batch of waveforms.

        Args:
            waveforms: Either an already padded batch tensor, or a sequence of
                per-sample waveforms (which may differ in length).
            sample_rates: Per-sample input sample rates. Required when
                ``expected_lengths`` is not given.
            expected_lengths: Per-sample output lengths in samples. When
                omitted, each is derived from the waveform duration at
                ``SIDON_OUTPUT_SAMPLE_RATE``.

        Returns:
            One 1-D restored waveform (on CPU) per input sample, zero-padded
            or trimmed to its expected length when that length is positive.

        Raises:
            ValueError: If both ``sample_rates`` and ``expected_lengths`` are
                ``None``.
        """
        if len(waveforms) == 0:
            return []

        if expected_lengths is None:
            if sample_rates is None:
                raise ValueError(
                    "Either sample_rates or expected_lengths must be provided"
                )
            expected_lengths = [
                int(
                    round(
                        waveform.shape[-1]
                        / float(sample_rate)
                        * SIDON_OUTPUT_SAMPLE_RATE
                    )
                )
                for waveform, sample_rate in zip(waveforms, sample_rates)
            ]

        # torch.nn.functional.pad only accepts a Tensor. A sequence of
        # waveforms (as passed by ``process``) is stacked into one
        # right-padded batch first; multi-channel items keep channel 0 only,
        # matching what the feature extraction does.
        if not isinstance(waveforms, torch.Tensor):
            mono = []
            for waveform in waveforms:
                waveform = torch.as_tensor(waveform)
                mono.append(waveform[0] if waveform.ndim > 1 else waveform)
            waveforms = torch.nn.utils.rnn.pad_sequence(mono, batch_first=True)

        # Append 24000 zero samples to every item; the restored output is cut
        # back to the expected length below.
        waveforms = torch.nn.functional.pad(waveforms, (0, 24000))

        features = extract_seamless_m4t_features(
            [x for x in waveforms],
            return_tensors="pt",
            padding_value=1.0,
            device=self.device,
        )
        feature_tensor = self.feature_extractor(
            features["input_features"].to(self.device)
        )["last_hidden_state"]
        restored_waveforms = self.decoder(feature_tensor.transpose(1, 2)).cpu()

        results: List[torch.Tensor] = []
        for sample_idx, sample in enumerate(restored_waveforms):
            restored_waveform = sample.view(-1)
            target_length = expected_lengths[sample_idx]
            # A non-positive target length means "leave the output as is".
            if target_length > 0:
                diff = target_length - restored_waveform.shape[-1]
                if diff > 0:
                    restored_waveform = torch.nn.functional.pad(
                        restored_waveform, (0, diff)
                    )
                elif diff < 0:
                    restored_waveform = restored_waveform[:target_length]
            results.append(restored_waveform.contiguous())

        return results
Neither +# forkserver nor spawn lets us change CUDA_VISIBLE_DEVICES *before* the CUDA +# runtime captures the device list. The only reliable approach is to launch +# each worker as a **subprocess** with CUDA_VISIBLE_DEVICES set in the +# subprocess environment, guaranteeing it takes effect before `import torch`. +# +# Protocol (parent ↔ child, length-prefixed pickle over stdin/stdout): +# Parent → child: 4-byte LE uint32 length + pickle(CollatedBatch) +# Child → parent: 4-byte LE uint32 length + pickle(result dict) +# Shutdown signal: 4 zero bytes (length == 0) + + +def _subprocess_recv(): + """Read a length-prefixed pickled object from stdin. Returns None on shutdown.""" + raw = sys.stdin.buffer.read(4) + if len(raw) < 4: + return None + (length,) = struct.unpack(" Future: + worker = self.workers[self._rr % len(self.workers)] + self._rr += 1 + with self._futures_lock: + req_id = self._next_id + self._next_id += 1 + fut = Future() + self._futures[req_id] = fut + batch_dict = { + "_req_id": req_id, + "_batch": batch, + } + worker.submit(batch_dict) + return fut + + def shutdown(self): + for worker in self.workers: + worker.shutdown() + for t in self._reader_threads: + t.join(timeout=5) + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + logging.basicConfig(format=formatter, level=logging.INFO, force=True) + parser = build_parser() + args = parser.parse_args() + + # ── Subprocess worker mode ── + if args._subprocess_worker: + subprocess_worker_main() + return + + # Validate input arguments + assert args.tar_output_pattern is not None, "--tar_output_pattern is required." + assert args.jsonl_output_pattern is not None, "--jsonl_output_pattern is required." 
+ assert bool(args.input_manifest) != bool( + args.input_jsonl + ), "Exactly one of --input_manifest or --input_jsonl must be provided." + + if args.num_machines > 1: + assert ( + 0 <= args.machine_index < args.num_machines + ), f"machine_index {args.machine_index} must be in [0, {args.num_machines})" + + # ── Build base dataset and count total samples ── + if args.input_jsonl: + logging.info(f"Input mode: raw JSONL ({args.input_jsonl})") + total_samples = count_lines(args.input_jsonl) + base_dataset = JsonlDatasetReader( + args.input_jsonl, + sample_rate=SIDON_INPUT_SAMPLE_RATE, + shuffle=args.shuffle, + shuffle_seed=args.shuffle_seed, + ) + loader_workers = args.loader_workers + else: + logging.info(f"Input mode: WebDataset manifest ({args.input_manifest})") + manifest_num_lines = count_lines(args.input_manifest) + loader_workers = min(args.loader_workers, manifest_num_lines) + total_samples = 0 + manifests = [] + with open(args.input_manifest, "r", encoding="utf-8") as f: + for line_id, line in tqdm( + enumerate(f), + total=manifest_num_lines, + desc="Calculating dataset length", + ): + items = line.strip().split(" ") + tar_path, jsonl_path, num_items, duration = ( + items[0], + items[1], + int(items[2]), + float(items[3]), + ) + assert os.path.exists(tar_path), f"File {tar_path} does not exist." + assert os.path.exists(jsonl_path), f"File {jsonl_path} does not exist." + assert jsonl_path.endswith( + ".jsonl" + ), f"File {jsonl_path} is not a .jsonl file." 
+ if ( + args.num_machines > 1 + and line_id % args.num_machines != args.machine_index + ): + continue + total_samples += num_items + manifests.append((tar_path, jsonl_path, num_items, duration)) + logging.info( + f"Total shards: {manifest_num_lines}, " + f"Shards for current index: {len(manifests)}" + ) + base_dataset = WebDatasetReader( + manifests=manifests, + sample_rate=SIDON_INPUT_SAMPLE_RATE, + evaluation=True, + ) + + # ── Dynamic batching + DataLoader ── + batched_dataset = StreamLengthGroupDataset( + dataset=base_dataset, + batch_duration=args.batch_duration, + max_sample=args.max_sample, + min_length=args.min_length, + max_length=args.max_length, + ) + + collate_fn = CollateFunction( + skip_errors=args.skip_errors, + sample_rate=SIDON_INPUT_SAMPLE_RATE, + ) + + dataloader = DataLoader( + dataset=batched_dataset, + batch_size=None, + collate_fn=collate_fn, + num_workers=loader_workers, + prefetch_factor=10 if loader_workers > 0 else None, + pin_memory=True, + persistent_workers=loader_workers > 0, + ) + + # ── Multi-GPU process pool ── + num_devices = torch.cuda.device_count() + if num_devices == 0: + logging.warning("No GPUs detected - using CPU for processing") + num_processes = args.nj_per_gpu + else: + num_processes = num_devices * args.nj_per_gpu + logging.info( + f"GPU count: {num_devices}, Processes per GPU: {args.nj_per_gpu}, " + f"Total processes: {num_processes}" + ) + + # Build a list of (physical_gpu_id, num_workers) for each pool. + # When num_devices == 0 we use a single CPU pool. 
+ if num_devices == 0: + pool_specs = [(None, num_processes)] + else: + pool_specs = [(gpu_id, args.nj_per_gpu) for gpu_id in range(num_devices)] + + # ── Output paths ── + tar_output_pattern = str(Path(args.tar_output_pattern).expanduser()) + jsonl_output_pattern = str(Path(args.jsonl_output_pattern).expanduser()) + Path(tar_output_pattern).parent.mkdir(parents=True, exist_ok=True) + Path(jsonl_output_pattern).parent.mkdir(parents=True, exist_ok=True) + + output_dir = Path(tar_output_pattern).parent.parent + error_log_path = str(output_dir / "errors.jsonl") + manifest_path = str(output_dir / "data.lst") + + error_logger = logging.getLogger("error_log") + error_logger.setLevel(logging.ERROR) + error_logger.handlers.clear() + error_fh = logging.FileHandler(error_log_path, mode="w", encoding="utf-8") + error_fh.setFormatter(logging.Formatter("%(message)s")) + error_logger.addHandler(error_fh) + + # ── Progress and shard tracking ── + processed_count = 0 + error_count = 0 + write_error_count = 0 + failed_ids = [] + shard_idx = 0 + shard_sample_count = 0 + shard_duration = 0.0 + samples_per_shard = args.samples_per_shard + shard_manifest = {} + target_sample_rate = args.target_sample_rate + + tar_writer = None + jsonl_file = None + + def open_new_shard(): + nonlocal tar_writer, jsonl_file, shard_idx, shard_sample_count, shard_duration + if tar_writer is not None: + tar_writer.close() + if jsonl_file is not None: + jsonl_file.close() + if shard_idx > 0 and shard_sample_count > 0: + prev_idx = shard_idx - 1 + shard_manifest[prev_idx] = ( + os.path.abspath(tar_output_pattern % prev_idx), + os.path.abspath(jsonl_output_pattern % prev_idx), + shard_sample_count, + shard_duration, + ) + tar_fname = tar_output_pattern % shard_idx + jsonl_fname = jsonl_output_pattern % shard_idx + tar_writer = wds.TarWriter(tar_fname) + jsonl_file = open(jsonl_fname, "w", encoding="utf-8") + shard_idx += 1 + shard_sample_count = 0 + shard_duration = 0.0 + + def write_sample(key, waveform, 
metadata): + nonlocal shard_sample_count, write_error_count, shard_duration + assert tar_writer is not None and jsonl_file is not None + try: + if target_sample_rate != SIDON_OUTPUT_SAMPLE_RATE: + waveform = torchaudio.functional.resample( + waveform, + orig_freq=SIDON_OUTPUT_SAMPLE_RATE, + new_freq=target_sample_rate, + ) + waveform = (waveform / (waveform.abs().max() + 1e-7)) * 0.6 + + record = serialise_flac(key, waveform, target_sample_rate) + jsonl_record = _encode_metadata(metadata) + tar_writer.write(record) + jsonl_file.write(jsonl_record.decode("utf-8") + "\n") + shard_sample_count += 1 + shard_duration += metadata.get("audio_duration", 0.0) + except Exception as exc: + write_error_count += 1 + failed_ids.append(key) + error_logger.error( + json.dumps({"id": key, "reason": str(exc)}, ensure_ascii=False) + ) + logging.error(f"Write failed for sample {key}: {exc}") + + def handle_result(result): + nonlocal processed_count, error_count + if result["status"] == "success": + for key, cleaned, metadata in zip( + result["keys"], result["results"], result["metadata"] + ): + if tar_writer is None or shard_sample_count >= samples_per_shard: + open_new_shard() + write_sample(key, cleaned, metadata) + processed_count += 1 + else: + error_count += result["size"] + failed_ids.extend(result["keys"]) + for key in result["keys"]: + error_logger.error( + json.dumps( + {"id": key, "reason": result["error"]}, + ensure_ascii=False, + ) + ) + if not args.skip_errors: + raise RuntimeError( + f"Batch starting with {result['keys'][0]} failed - terminating" + ) + logging.warning( + f"Skipping failed batch starting with {result['keys'][0]}: " + f"{result['error']}" + ) + + # ── Main processing loop ── + main_progress = tqdm(total=total_samples, desc="Denoising Audio") + + # Launch subprocess-based GPU workers. CUDA_VISIBLE_DEVICES is set in the + # subprocess Popen environment so it takes effect before import torch. 
+ pool = GPUWorkerPool(pool_specs, args.feature_extractor_path, args.decoder_path) + logging.info(f"Submitting tasks... ({num_processes} subprocess workers)") + try: + futures = set() + max_pending = num_processes * 2 + + def drain_completed(): + nonlocal futures + done, _ = wait(futures, return_when=FIRST_COMPLETED) + for f in done: + futures.discard(f) + result = f.result() + main_progress.update(result["size"]) + handle_result(result) + main_progress.set_postfix( + OK=processed_count, + Err=error_count, + ) + + for batch in dataloader: + if batch.size == 0: + continue + if len(futures) >= max_pending: + drain_completed() + futures.add(pool.submit(batch)) + + logging.info("Processing remaining pending batches...") + while futures: + drain_completed() + + except Exception: + logging.error("Critical error during processing", exc_info=True) + raise + finally: + pool.shutdown() + main_progress.close() + if tar_writer is not None: + tar_writer.close() + if jsonl_file is not None: + jsonl_file.close() + if shard_idx > 0 and shard_sample_count > 0: + last_idx = shard_idx - 1 + shard_manifest[last_idx] = ( + os.path.abspath(tar_output_pattern % last_idx), + os.path.abspath(jsonl_output_pattern % last_idx), + shard_sample_count, + shard_duration, + ) + + # ── Write manifest (data.lst) ── + with open(manifest_path, "w", encoding="utf-8") as mf: + for idx in sorted(shard_manifest.keys()): + tar_path, jsonl_path, count, duration = shard_manifest[idx] + mf.write(f"{tar_path} {jsonl_path} {count} {duration:.3f}\n") + + # ── Summary ── + total_failed = error_count + write_error_count + filtered_and_skipped = total_samples - processed_count - total_failed + logging.info( + f"Processing Complete - Successful: {processed_count}, Failed: {total_failed}, " + f"Filtered/Skipped: {filtered_and_skipped}, Shards written: {shard_idx}" + ) + logging.info(f"Manifest written to: {manifest_path} ({len(shard_manifest)} shards)") + if total_failed > 0: + logging.info(f"Error details: 
{error_log_path}") + if failed_ids and args.skip_errors: + logging.warning( + f"Failed sample IDs (count: {len(failed_ids)}): {failed_ids[:100]}..." + ) + if write_error_count > 0 and not args.skip_errors: + raise RuntimeError( + f"{write_error_count} samples failed to write - check logs for details" + ) + + +if __name__ == "__main__": + main() diff --git a/omnivoice/scripts/extract_audio_tokens.py b/omnivoice/scripts/extract_audio_tokens.py new file mode 100644 index 0000000000000000000000000000000000000000..2f03a4fc712c789bd9d743a70461e0436497822b --- /dev/null +++ b/omnivoice/scripts/extract_audio_tokens.py @@ -0,0 +1,625 @@ +#!/usr/bin/env python3 +# Copyright 2026 Xiaomi Corp. (authors: Han Zhu) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Extract audio tokens from audio data and pack them into WebDataset shards. + +Supports two input modes: + +1. WebDataset manifest (data.lst): + python extract_audio_tokens.py \ + --input_manifest data.lst \ + --tar_output_pattern output/audios/shard-%06d.tar \ + --jsonl_output_pattern output/txts/shard-%06d.jsonl + +2. 
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the audio-token extraction script.

    The module docstring is used as the parser description.

    Returns:
        An ``argparse.ArgumentParser`` exposing the input-selection
        (``--input_manifest`` / ``--input_jsonl``), output-pattern, sharding,
        tokenizer, length-filter, multi-machine, worker-count and shuffle
        options.
    """
    parser = argparse.ArgumentParser(description=__doc__)

    # ── Input selection ──
    parser.add_argument(
        "--input_manifest",
        default=None,
        help="Path to input dataset manifest (data.lst).",
    )
    parser.add_argument(
        "--input_jsonl",
        default=None,
        help="Path to raw JSONL file (alternative to --input_manifest).",
    )

    # ── Output sharding ──
    parser.add_argument(
        "--tar_output_pattern",
        required=True,
        help="Tar shard pattern passed to WebDataset",
    )
    parser.add_argument(
        "--jsonl_output_pattern",
        required=True,
        help="Jsonl shard pattern passed to WebDataset",
    )
    parser.add_argument(
        "--samples_per_shard",
        type=int,
        default=1000,
        help="Maximum records per shard",
    )
    parser.add_argument(
        "--min_num_shards",
        type=int,
        default=32,
        help="Minimum number of output shards (use to ensure "
        "shard count >= num_gpu * num_workers)",
    )

    # ── Tokenizer and error policy ──
    parser.add_argument(
        "--tokenizer_path",
        type=str,
        default="eustlb/higgs-audio-v2-tokenizer",
        help="Path to audio tokenizer.",
    )
    parser.add_argument(
        "--skip_errors", action="store_true", help="Skip items that fail to process"
    )

    # ── Duration filter ──
    parser.add_argument(
        "--min_length",
        type=float,
        default=0.0,
        help="Minimum audio duration in seconds (e.g. 2.0)",
    )
    parser.add_argument(
        "--max_length",
        type=float,
        default=float("inf"),
        help="Maximum audio duration in seconds (e.g. 15.0)",
    )

    # ── Multi-machine / parallelism ──
    parser.add_argument(
        "--num_machines",
        type=int,
        default=1,
        help="Total number of machines for distributed runs",
    )
    parser.add_argument(
        "--machine_index",
        type=int,
        default=0,
        help="Zero-based machine index when distributing across multiple "
        "machines (e.g. 0, 1, ... num_machines-1)",
    )
    parser.add_argument(
        "--nj_per_gpu",
        type=int,
        default=3,
        help="Number of worker processes to spawn per GPU.",
    )
    parser.add_argument(
        "--loader_workers",
        type=int,
        default=24,
        help="Number of DataLoader workers for streaming IterableDataset.",
    )

    # ── Shuffling ──
    parser.add_argument(
        "--shuffle",
        type=str2bool,
        default=True,
        help="Shuffle data by default.",
    )
    # Every other flag uses underscores; accept that spelling here too while
    # keeping the original hyphenated form as an alias. argparse derives the
    # destination from the first long option, so it is still ``shuffle_seed``.
    parser.add_argument(
        "--shuffle_seed",
        "--shuffle-seed",
        type=int,
        default=42,
        help="Random seed for shuffle (default: 42).",
    )
    return parser
def process_single_sample(sample: dict[str, Any]) -> dict[str, Any]:
    """Tokenize one audio sample inside a worker process.

    Uses the module-level ``worker_feature_extractor`` and
    ``worker_tokenizer`` that ``process_init`` sets up for the worker.

    Args:
        sample: Mapping with an ``"audio"`` tensor and a ``"label"`` metadata
            dict that contains at least ``"id"``.

    Returns:
        A dict with keys ``status`` (``"success"`` or ``"error"``), ``key``,
        ``audio_tokens`` (int16 NumPy array, or ``None`` on error),
        ``metadata`` (label dict plus ``num_tokens``, or ``None`` on error)
        and ``error_msg`` (``None`` on success). Failures are reported through
        the returned dict rather than raised, so the caller decides whether to
        skip or abort.
    """
    try:
        audio_tensor = sample.get("audio", None)  # shape (1, T)
        if audio_tensor is None:
            raise ValueError("Sample missing 'audio' field")

        with torch.inference_mode():
            key = sample["label"]["id"]
            inputs = worker_feature_extractor(
                raw_audio=audio_tensor.squeeze(0).numpy(),
                sampling_rate=HIGGS_INPUT_SAMPLE_RATE,
                return_tensors="pt",
            ).to(worker_tokenizer.device)
            audio_tokens = worker_tokenizer.encode(
                inputs["input_values"],
            ).audio_codes.squeeze(0)

            # Explicit check instead of ``assert``: asserts are stripped under
            # ``python -O`` and an AssertionError has an empty message, which
            # left ``error_msg`` blank in errors.jsonl.
            if audio_tokens.dim() != 2 or audio_tokens.size(0) != 8:
                raise ValueError(
                    f"Unexpected audio token shape {tuple(audio_tokens.shape)}; "
                    "expected (8, T)"
                )

            num_tokens = audio_tokens.size(1)
            # Shallow copy so the caller's label dict is not mutated.
            metadata = dict(sample["label"])
            metadata["num_tokens"] = num_tokens

            # Convert to numpy format for subsequent serialization
            # (int16 to save space)
            audio_tokens_np = audio_tokens.to(torch.int16).cpu().numpy()

        return {
            "status": "success",
            "key": key,
            "audio_tokens": audio_tokens_np,
            "metadata": metadata,
            "error_msg": None,
        }
    except Exception as e:
        # ``label`` may be missing or not a dict; the handler itself must not
        # raise, otherwise the failure escapes the structured error path.
        label = sample.get("label") if isinstance(sample, dict) else None
        sample_id = label.get("id", "unknown") if isinstance(label, dict) else "unknown"
        # Fall back to repr() for exceptions whose str() is empty.
        error_msg = str(e) or repr(e)
        logging.error(f"Failed to process sample {sample_id}: {error_msg}")
        return {
            "status": "error",
            "key": sample_id,
            "audio_tokens": None,
            "metadata": None,
            "error_msg": error_msg,
        }
def main() -> None:
    """Extract audio tokens for a dataset and pack them into output shards.

    Steps:
      1. Parse and validate CLI arguments.
      2. Build the input reader (raw JSONL or WebDataset manifest) and count
         the expected number of samples.
      3. Stream samples through a length filter and a ``DataLoader``.
      4. Tokenize samples in a ``ProcessPoolExecutor`` whose workers are
         initialised by ``process_init`` (one rank per worker via a queue).
      5. Write tokens to tar shards and metadata to per-shard JSONL files,
         then write ``data.lst`` and log a summary.

    Raises:
        SystemExit: On invalid argument combinations (via ``parser.error``).
        FileNotFoundError / ValueError: On an invalid manifest entry.
        RuntimeError: When a sample fails and ``--skip_errors`` is not set.
    """
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
    parser = build_parser()
    args = parser.parse_args()
    mp.set_start_method("spawn", force=True)

    # Validate input arguments. ``parser.error`` is used instead of ``assert``
    # because asserts are stripped when Python runs with ``-O``.
    if bool(args.input_manifest) == bool(args.input_jsonl):
        parser.error(
            "Exactly one of --input_manifest or --input_jsonl must be provided."
        )
    if args.num_machines > 1 and not 0 <= args.machine_index < args.num_machines:
        parser.error(
            f"machine_index {args.machine_index} must be in [0, {args.num_machines})"
        )

    # Build base dataset and count total samples based on input mode
    if args.input_jsonl:
        logging.info(f"Input mode: raw JSONL ({args.input_jsonl})")
        total_samples = count_lines(args.input_jsonl)
        base_dataset = JsonlDatasetReader(
            args.input_jsonl,
            sample_rate=HIGGS_INPUT_SAMPLE_RATE,
            shuffle=args.shuffle,
            shuffle_seed=args.shuffle_seed,
        )
        loader_workers = args.loader_workers
    else:
        logging.info(f"Input mode: WebDataset manifest ({args.input_manifest})")
        manifest_num_lines = count_lines(args.input_manifest)
        loader_workers = min(args.loader_workers, manifest_num_lines)
        total_samples = 0
        manifests = []
        with open(args.input_manifest, "r", encoding="utf-8") as f:
            for line_id, line in tqdm(
                enumerate(f),
                total=manifest_num_lines,
                desc="Calculating dataset length",
            ):
                items = line.split()
                if not items:
                    # Tolerate blank lines (e.g. a stray newline at EOF)
                    # instead of failing with an IndexError.
                    continue
                if len(items) < 4:
                    raise ValueError(
                        f"Malformed manifest line {line_id + 1} in "
                        f"{args.input_manifest}: expected "
                        f"'tar_path jsonl_path num_items duration', got {line!r}"
                    )
                tar_path, jsonl_path, num_items, duration = (
                    items[0],
                    items[1],
                    int(items[2]),
                    float(items[3]),
                )
                if not os.path.exists(tar_path):
                    raise FileNotFoundError(f"File {tar_path} does not exist.")
                if not os.path.exists(jsonl_path):
                    raise FileNotFoundError(f"File {jsonl_path} does not exist.")
                if not jsonl_path.endswith(".jsonl"):
                    raise ValueError(f"File {jsonl_path} is not a .jsonl file.")
                # Round-robin shard assignment across machines.
                if (
                    args.num_machines > 1
                    and line_id % args.num_machines != args.machine_index
                ):
                    continue
                total_samples += num_items
                manifests.append((tar_path, jsonl_path, num_items, duration))
        logging.info(
            f"Total shards: {manifest_num_lines}, "
            f"Shards for current index: {len(manifests)}"
        )
        base_dataset = WebDatasetReader(
            manifests=manifests,
            sample_rate=HIGGS_INPUT_SAMPLE_RATE,
            evaluation=True,
        )

    # Adjust samples_per_shard if min_num_shards would be violated
    samples_per_shard = args.samples_per_shard
    if total_samples > 0:
        estimated_shards = max(
            1, (total_samples + samples_per_shard - 1) // samples_per_shard
        )
        if estimated_shards < args.min_num_shards:
            samples_per_shard = max(1, total_samples // args.min_num_shards)
            logging.info(
                f"Adjusted samples_per_shard from {args.samples_per_shard} to "
                f"{samples_per_shard} to meet min_num_shards={args.min_num_shards} "
                f"(total_samples={total_samples})"
            )

    # Apply length filter and create DataLoader
    filtered_dataset = StreamingLengthFilteredDataset(
        base_iterable=base_dataset,
        min_len=args.min_length,
        max_len=args.max_length,
        sr=HIGGS_INPUT_SAMPLE_RATE,
    )
    dataloader = DataLoader(
        dataset=filtered_dataset,
        batch_size=None,
        num_workers=loader_workers,
        persistent_workers=loader_workers > 0,
        pin_memory=False,
    )

    # Configure multi-GPU multi-process setup
    num_devices = torch.cuda.device_count()
    if num_devices == 0:
        logging.warning("No GPUs detected - using CPU for processing")
        num_processes = args.nj_per_gpu
    else:
        num_processes = num_devices * args.nj_per_gpu
        logging.info(
            f"GPU count: {num_devices}, Processes per GPU: {args.nj_per_gpu}, "
            f"Total processes: {num_processes}"
        )

    # Shared GPU rank queue for process assignment: each worker pops exactly
    # one rank in process_init (-1 means "run on CPU").
    manager = mp.Manager()
    rank_queue = manager.Queue()
    for rank in list(range(num_devices)) * args.nj_per_gpu:
        rank_queue.put(rank)
    if num_devices == 0:
        for _ in range(num_processes):
            rank_queue.put(-1)

    # Prepare output paths
    tar_output_pattern = str(Path(args.tar_output_pattern).expanduser())
    jsonl_output_pattern = str(Path(args.jsonl_output_pattern).expanduser())
    Path(tar_output_pattern).parent.mkdir(parents=True, exist_ok=True)
    Path(jsonl_output_pattern).parent.mkdir(parents=True, exist_ok=True)

    # Determine output directory from tar_output_pattern
    output_dir = Path(tar_output_pattern).parent.parent
    error_log_path = str(output_dir / "errors.jsonl")
    manifest_path = str(output_dir / "data.lst")

    # Setup error logger (writes to errors.jsonl)
    error_logger = logging.getLogger("error_log")
    error_logger.setLevel(logging.ERROR)
    error_logger.handlers.clear()
    error_fh = logging.FileHandler(error_log_path, mode="w", encoding="utf-8")
    error_fh.setFormatter(logging.Formatter("%(message)s"))
    error_logger.addHandler(error_fh)

    # Progress and error tracking
    processed_count = 0
    error_count = 0
    write_error_count = 0
    failed_ids = []
    shard_idx = 0  # index of the NEXT shard to open
    shard_sample_count = 0
    shard_duration = 0.0
    shard_manifest = {}  # shard_idx -> (tar_path, jsonl_path, count, duration)

    tar_writer = None
    jsonl_file = None

    def close_current_shard():
        """Close the open shard, if any, and record it when it is non-empty."""
        nonlocal tar_writer, jsonl_file
        if tar_writer is not None:
            tar_writer.close()
            tar_writer = None
        if jsonl_file is not None:
            jsonl_file.close()
            jsonl_file = None
        if shard_idx > 0 and shard_sample_count > 0:
            current_idx = shard_idx - 1
            shard_manifest[current_idx] = (
                os.path.abspath(tar_output_pattern % current_idx),
                os.path.abspath(jsonl_output_pattern % current_idx),
                shard_sample_count,
                shard_duration,
            )

    def open_new_shard():
        """Finalise the current shard and open writers for the next one."""
        nonlocal tar_writer, jsonl_file, shard_idx, shard_sample_count, shard_duration
        close_current_shard()
        tar_fname = tar_output_pattern % shard_idx
        jsonl_fname = jsonl_output_pattern % shard_idx
        tar_writer = wds.TarWriter(tar_fname)
        jsonl_file = open(jsonl_fname, "w", encoding="utf-8")
        shard_idx += 1
        shard_sample_count = 0
        shard_duration = 0.0

    def write_sample(key, audio_tokens_np, metadata):
        """Write one sample to the open shard; return True on success."""
        nonlocal shard_sample_count, write_error_count, shard_duration
        assert tar_writer is not None and jsonl_file is not None
        try:
            token_record = serialise_numpy(key, audio_tokens_np)
            json_record = _encode_metadata(metadata)
            tar_writer.write(token_record)
            jsonl_file.write(json_record.decode("utf-8") + "\n")
            shard_sample_count += 1
            shard_duration += metadata.get("audio_duration", 0.0)
        except Exception as exc:
            write_error_count += 1
            failed_ids.append(key)
            error_logger.error(
                json.dumps({"id": key, "reason": str(exc)}, ensure_ascii=False)
            )
            logging.error(f"Write failed for sample {key}: {exc}")
            return False
        return True

    def handle_result(result):
        """Route one worker result to the shard writer or the error log."""
        nonlocal processed_count, error_count
        if result["status"] == "success":
            # Rotate shard if needed
            if tar_writer is None or shard_sample_count >= samples_per_shard:
                open_new_shard()
            # Count as processed only when the write succeeded; otherwise the
            # sample would be tallied as both successful and failed.
            if write_sample(result["key"], result["audio_tokens"], result["metadata"]):
                processed_count += 1
        else:
            error_count += 1
            failed_ids.append(result["key"])
            error_logger.error(
                json.dumps(
                    {"id": result["key"], "reason": result["error_msg"]},
                    ensure_ascii=False,
                )
            )
            if not args.skip_errors:
                raise RuntimeError(
                    f"Sample {result['key']} processing failed due "
                    f"to {result['error_msg']} - terminating"
                )
            logging.warning(
                f"Skipping failed sample {result['key']}: {result['error_msg']}"
            )

    main_progress = tqdm(total=total_samples, desc="Extracting Audio Tokens")

    try:
        with ProcessPoolExecutor(
            max_workers=num_processes,
            initializer=process_init,
            initargs=(rank_queue, args.tokenizer_path),
        ) as executor:
            logging.info(f"Submitting tasks... ({num_processes} workers)")
            futures = set()
            # Bound the number of in-flight samples to limit memory use.
            max_pending = num_processes * 10

            def drain_completed():
                """Wait for at least one future to complete, process all done."""
                done, _ = wait(futures, return_when=FIRST_COMPLETED)
                for f in done:
                    futures.discard(f)
                    result = f.result()
                    main_progress.update(1)
                    handle_result(result)
                    main_progress.set_postfix(
                        Samples=processed_count,
                        Errors=error_count,
                    )

            # Stream samples from DataLoader
            for sample in dataloader:
                if len(futures) >= max_pending:
                    drain_completed()

                future = executor.submit(process_single_sample, sample)
                futures.add(future)

            # Process remaining futures
            logging.info("Processing remaining pending samples...")
            while futures:
                drain_completed()

    except Exception:
        logging.error("Critical error during processing", exc_info=True)
        raise
    finally:
        main_progress.close()
        # Close writers and record the last shard in the manifest
        close_current_shard()

    # Write manifest file (data.lst)
    with open(manifest_path, "w", encoding="utf-8") as mf:
        for idx in sorted(shard_manifest.keys()):
            tar_path, jsonl_path, count, duration = shard_manifest[idx]
            mf.write(f"{tar_path} {jsonl_path} {count} {duration:.3f}\n")

    # Output final statistics
    total_failed = error_count + write_error_count
    filtered_and_skipped = total_samples - processed_count - total_failed
    logging.info(
        f"Processing Complete - Successful: {processed_count}, Failed: {total_failed}, "
        f"Filtered/Skipped: {filtered_and_skipped}, Shards written: {shard_idx}"
    )
    logging.info(f"Manifest written to: {manifest_path} ({len(shard_manifest)} shards)")
    if total_failed > 0:
        logging.info(f"Error details: {error_log_path}")
    if failed_ids and args.skip_errors:
        logging.warning(
            f"Failed sample IDs (count: {len(failed_ids)}): {failed_ids[:100]}..."
        )
    if write_error_count > 0 and not args.skip_errors:
        raise RuntimeError(
            f"{write_error_count} samples failed to write - check logs for details"
        )
Raw JSONL (each line: {"id": "...", "audio_path": "...", "text": "...", ...}): + python extract_audio_tokens_add_noise.py \\ + --input_jsonl data.jsonl \\ + --noise_manifest noise.lst \\ + --tar_output_pattern output/audios/shard-%06d.tar \\ + --jsonl_output_pattern output/txts/shard-%06d.jsonl + +Output structure: + output_dir/ + ├── audios/ # WebDataset tar shards (.npy audio tokens + .json metadata) + │ ├── shard_000000.tar + │ └── ... + ├── txts/ # Per-shard JSONL metadata + │ ├── shard_000000.jsonl + │ └── ... + ├── data.lst # Manifest: + └── errors.jsonl # Failed samples with error details +""" + +import argparse +import io +import json +import logging +import math +import multiprocessing as mp +import os +import random +import warnings +from concurrent.futures import FIRST_COMPLETED, ProcessPoolExecutor, wait +from pathlib import Path +from typing import Any + +import numpy as np +import torch +import torch.nn.functional as F +import webdataset as wds +from torch.utils.data import DataLoader, IterableDataset +from tqdm.auto import tqdm +from transformers import AutoFeatureExtractor, HiggsAudioV2TokenizerModel + +from omnivoice.data.dataset import JsonlDatasetReader, WebDatasetReader +from omnivoice.utils.audio import load_audio_bytes +from omnivoice.utils.common import str2bool + +warnings.filterwarnings( + "ignore", category=FutureWarning, module="torch.nn.utils.weight_norm" +) + +HIGGS_INPUT_SAMPLE_RATE = 24_000 + +# Global variables: Store tokenizer and device for each worker process +worker_tokenizer = None +worker_feature_extractor = None +worker_noise_sampler = None +worker_rir_sampler = None + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--input_manifest", + default=None, + help="Path to input dataset manifest (data.lst).", + ) + parser.add_argument( + "--input_jsonl", + default=None, + help="Path to raw JSONL file (alternative to --input_manifest).", + ) + 
parser.add_argument( + "--tar_output_pattern", + required=True, + help="Tar shard pattern passed to WebDataset", + ) + parser.add_argument( + "--jsonl_output_pattern", + required=True, + help="Jsonl shard pattern passed to WebDataset", + ) + parser.add_argument( + "--samples_per_shard", + type=int, + default=1000, + help="Maximum records per shard", + ) + parser.add_argument( + "--min_num_shards", + type=int, + default=32, + help="Minimum number of output shards (use to ensure " + "shard count >= num_gpu * num_workers)", + ) + parser.add_argument( + "--tokenizer_path", + type=str, + default="eustlb/higgs-audio-v2-tokenizer", + help="Path to audio tokenizer.", + ) + parser.add_argument( + "--skip_errors", action="store_true", help="Skip items that fail to process" + ) + parser.add_argument( + "--min_length", + type=float, + default=0.0, + help="Minimum audio duration in seconds (e.g. 2.0)", + ) + parser.add_argument( + "--max_length", + type=float, + default=float("inf"), + help="Maximum audio duration in seconds (e.g. 15.0)", + ) + parser.add_argument( + "--num_machines", + type=int, + default=1, + help="Total number of machines for distributed runs", + ) + parser.add_argument( + "--machine_index", + type=int, + default=0, + help="Zero-based machine index when distributing across multiple " + "machines (e.g. 0, 1, ... num_machines-1)", + ) + parser.add_argument( + "--nj_per_gpu", + type=int, + default=3, + help="Number of worker processes to spawn per GPU.", + ) + parser.add_argument( + "--loader_workers", + type=int, + default=24, + help="Number of DataLoader workers for streaming IterableDataset.", + ) + parser.add_argument( + "--shuffle", + type=str2bool, + default=True, + help="Shuffle data by default.", + ) + parser.add_argument( + "--shuffle-seed", + type=int, + default=42, + help="Random seed for shuffle (default: 42).", + ) + parser.add_argument( + "--noise_manifest", + default=None, + help="Path to noise manifest (list of tar files). 
def count_lines(path):
    """Return the number of lines in the file at ``path``.

    The file is read in binary mode in 1 MiB chunks and newline bytes are
    counted, so memory use stays constant for large files. A final line that
    is not terminated by a newline is counted as well; counting newline bytes
    alone would miss it.

    Args:
        path: Path of the file to inspect.

    Returns:
        The line count; ``0`` for an empty file.
    """
    newline_count = 0
    last_byte = b""
    with open(path, "rb") as f:
        while chunk := f.read(1 << 20):
            newline_count += chunk.count(b"\n")
            last_byte = chunk[-1:]
    # Non-empty file whose last line lacks a trailing newline.
    if last_byte and last_byte != b"\n":
        newline_count += 1
    return newline_count
def _convolve1d(signal: torch.Tensor, kernel: torch.Tensor) -> torch.Tensor:
    """Full linear convolution of ``signal`` and ``kernel`` along the last axis.

    Computed via FFT; the output's last dimension has length
    ``signal.size(-1) + kernel.size(-1) - 1``. Leading dimensions are
    preserved.
    """
    m = signal.size(-1)
    n = kernel.size(-1)
    padded_size = m + n - 1
    f_signal = torch.fft.rfft(signal, n=padded_size)
    f_kernel = torch.fft.rfft(kernel, n=padded_size)
    result = torch.fft.irfft(f_signal * f_kernel, n=padded_size)
    # Slice the time axis (last), not axis 0, so batched inputs keep every row.
    return result[..., :padded_size]


def _apply_rir(audio, rir, mix_ratio=0.5):
    """Reverberate the first channel of ``audio`` with the first channel of ``rir``.

    The convolved signal is aligned to the RIR's strongest tap, trimmed to the
    input length, rescaled to the input's energy and blended with the dry
    signal: ``(1 - mix_ratio) * dry + mix_ratio * wet``.

    Returns a tensor of shape ``(1, audio.shape[-1])``.
    """
    rir_scaling_factor = 0.5**15
    n_in = audio.shape[-1]
    rir_d = rir[0, :] * rir_scaling_factor
    aug_d = _convolve1d(audio[0], rir_d)
    # Start at the RIR peak so the direct path lines up with the dry signal.
    shift_index = int(torch.argmax(torch.abs(rir_d)))
    end_index = shift_index + n_in
    if end_index > aug_d.shape[0]:
        augmented = F.pad(aug_d[shift_index:], (0, end_index - aug_d.shape[0]))
    else:
        augmented = aug_d[shift_index:end_index]
    power_before = torch.sum(audio[0] ** 2)
    power_after = torch.sum(augmented**2)
    if power_after > 0:
        # Match the wet signal's energy to the dry signal's.
        augmented = augmented * torch.sqrt(power_before / power_after)
    mixed = (1 - mix_ratio) * audio[0] + mix_ratio * augmented
    return mixed.unsqueeze(0)
def process_init(rank_queue, tokenizer_path, noise_manifest=None, rir_manifest=None):
    """
    Initialization function for each worker process.
    Assigns a specific GPU to the process and loads the tokenizer.

    Args:
        rank_queue: Queue holding one device rank per worker; ``-1`` selects CPU.
        tokenizer_path: Identifier passed to ``from_pretrained`` for both the
            feature extractor and the tokenizer model.
        noise_manifest: Optional text file whose first whitespace-separated
            field on each non-blank line is a noise tar path.
        rir_manifest: Optional text file in the same format listing RIR tars.
    """
    global worker_tokenizer, worker_feature_extractor
    global worker_noise_sampler, worker_rir_sampler

    # Bind both sampler globals up front so later reads never hit an unbound
    # name, whether or not a manifest is given or loads successfully.
    worker_noise_sampler = None
    worker_rir_sampler = None

    # Configure worker process logging
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d]"
        " [Worker %(process)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)

    # Get assigned GPU rank
    rank = rank_queue.get()
    # Determine device
    if rank != -1 and torch.cuda.is_available():
        worker_device = torch.device(f"cuda:{rank}")
    else:
        worker_device = torch.device("cpu")

    logging.debug(f"Worker process initialized with device: {worker_device}")
    # Load tokenizer onto the specified device
    worker_feature_extractor = AutoFeatureExtractor.from_pretrained(tokenizer_path)
    worker_tokenizer = HiggsAudioV2TokenizerModel.from_pretrained(
        tokenizer_path, device_map=worker_device
    )
    logging.debug(f"Tokenizer loaded successfully on device {worker_device}")

    def _build_sampler(manifest_path, label):
        """Create a sampler from a manifest; return None (and warn) on failure."""
        try:
            with open(manifest_path, "r", encoding="utf-8") as f:
                tars = [line.strip().split()[0] for line in f if line.strip()]
            return SimpleWorkerSampler(tars, sample_rate=HIGGS_INPUT_SAMPLE_RATE)
        except Exception as e:
            # Augmentation is optional: a bad manifest disables it rather
            # than killing the worker.
            logging.warning(f"Failed to load {label} manifest: {e}")
            return None

    # Initialize augmentation samplers (optional)
    if noise_manifest:
        worker_noise_sampler = _build_sampler(noise_manifest, "noise")
        if worker_noise_sampler is not None:
            logging.debug("Noise sampler initialized.")

    if rir_manifest:
        worker_rir_sampler = _build_sampler(rir_manifest, "RIR")
        if worker_rir_sampler is not None:
            logging.debug("RIR sampler initialized.")
def _augment_prompt(audio_tensor: torch.Tensor) -> tuple[torch.Tensor, int]:
    """Apply noise/reverb augmentation to the front portion of audio.

    The audio is peak-normalised to 0.6, a random 10-30% prefix is augmented
    using whichever of ``worker_noise_sampler`` / ``worker_rir_sampler`` is
    set, and the result is peak-normalised to 0.9. The caller's tensor is not
    modified.

    Returns the augmented audio and the sample index where clean audio starts.
    """
    # Pre-normalization (creates a new tensor, so the input stays untouched)
    max_val = audio_tensor.abs().max() + 1e-7
    audio_tensor = (audio_tensor / max_val) * 0.6

    total_len = audio_tensor.size(-1)
    ratio = random.uniform(0.1, 0.3)
    split_idx = int(total_len * ratio)

    # Very short inputs round the prefix down to zero samples; there is
    # nothing to augment, and the RMS maths below would divide by zero.
    if split_idx > 0:
        front_part = audio_tensor[:, :split_idx].clone()

        # Apply noise
        if worker_noise_sampler is not None:
            noise = worker_noise_sampler.sample_segment(split_idx)
            snr_db = random.uniform(5, 15)
            sig_rms = front_part.norm(p=2) / (split_idx**0.5)
            noise_rms = noise.norm(p=2) / (split_idx**0.5)
            if noise_rms > 1e-9:
                # Amplitude ratio for the requested SNR in dB.
                snr = 10 ** (snr_db / 20)
                scale = sig_rms / (snr * noise_rms + 1e-8)
                front_part = front_part + noise * scale

        # Apply RIR (30% probability)
        if worker_rir_sampler is not None and random.random() < 0.3:
            rir = worker_rir_sampler.sample_segment(split_idx, allow_repeat=False)
            reverb_amt = random.uniform(0.3, 1.0)
            try:
                front_part = _apply_rir(front_part, rir, reverb_amt)
            except Exception as e:
                # Best effort: keep the un-reverberated prefix on failure.
                logging.warning(f"RIR failed: {e}")

        # Merge back
        if front_part.device != audio_tensor.device:
            front_part = front_part.to(audio_tensor.device)
        audio_tensor[:, :split_idx] = front_part

    # Post-normalization
    max_val = audio_tensor.abs().max() + 1e-7
    audio_tensor = (audio_tensor / max_val) * 0.9

    return audio_tensor, split_idx
def process_single_sample(sample: dict[str, Any]) -> dict[str, Any]:
    """
    Single-sample processing function executed in worker processes.
    Skips invalid samples during streaming processing.

    Never raises: every failure is reported through the returned dict.

    Returns:
        A dict with keys ``status`` (``"success"`` or ``"error"``), ``key``,
        ``audio_tokens`` (int16 NumPy array or ``None``), ``metadata`` (the
        sample's ``label`` dict extended with token counts, or ``None``) and
        ``error_msg``.
    """
    try:
        audio_tensor = sample.get("audio", None)  # shape (1, T)
        if audio_tensor is None:
            raise ValueError("Sample missing 'audio' field")

        # Apply prompt augmentation if noise/rir samplers are available
        enable_aug = worker_noise_sampler is not None or worker_rir_sampler is not None
        clean_sample_idx = 0
        if enable_aug:
            audio_tensor, clean_sample_idx = _augment_prompt(audio_tensor)

        with torch.inference_mode():
            key = sample["label"]["id"]

            inputs = worker_feature_extractor(
                raw_audio=audio_tensor.squeeze(0).numpy(),
                sampling_rate=HIGGS_INPUT_SAMPLE_RATE,
                return_tensors="pt",
            ).to(worker_tokenizer.device)
            audio_tokens = worker_tokenizer.encode(
                inputs["input_values"],
            ).audio_codes.squeeze(0)

            # Explicit checks instead of bare asserts: they survive
            # ``python -O`` and give the error log a readable reason.
            if audio_tokens.dim() != 2:
                raise ValueError(
                    f"Expected 2-D audio tokens, got shape {tuple(audio_tokens.shape)}"
                )
            if audio_tokens.size(0) != 8:
                raise ValueError(
                    f"Expected 8 codebooks, got {audio_tokens.size(0)}"
                )

            num_tokens = audio_tokens.size(1)
            metadata = sample["label"]
            metadata["num_tokens"] = num_tokens

            if enable_aug:
                # First token index that lies fully inside the clean region.
                clean_token_idx = math.ceil(
                    clean_sample_idx / worker_tokenizer.config.hop_length
                )
                metadata["clean_start_token_idx"] = clean_token_idx

            # Convert to numpy format for subsequent serialization (int16 to save space)
            audio_tokens_np = audio_tokens.to(torch.int16).cpu().numpy()

        return {
            "status": "success",
            "key": key,
            "audio_tokens": audio_tokens_np,
            "metadata": metadata,
            "error_msg": None,
        }
    except Exception as e:
        # Tolerate a missing or None label when building the error report.
        label = sample.get("label") or {}
        sample_id = label.get("id", "unknown")
        logging.error(f"Failed to process sample {sample_id}: {e}")
        return {
            "status": "error",
            "key": sample_id,
            "audio_tokens": None,
            "metadata": None,
            "error_msg": str(e),
        }


def _normalise_value(value: Any) -> Any:
    """Convert tensors and NumPy scalars to serialisable Python objects.

    0-dim tensors and NumPy scalars become Python scalars, other tensors and
    arrays become (nested) lists; any other value is returned unchanged.
    """
    if isinstance(value, torch.Tensor):
        if value.ndim == 0:
            return value.item()
        return value.cpu().tolist()
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return value
def _encode_metadata(metadata: dict[str, Any]) -> bytes:
    """Serialise ``metadata`` to UTF-8 JSON bytes, dropping ``None`` values.

    Remaining values are passed through ``_normalise_value`` first.
    """
    cleaned: dict[str, Any] = {
        name: _normalise_value(entry)
        for name, entry in metadata.items()
        if entry is not None
    }
    payload = json.dumps(cleaned, ensure_ascii=False)
    return payload.encode("utf-8")


class StreamingLengthFilteredDataset(IterableDataset):
    """Iterable wrapper that passes through samples whose duration is in range.

    Duration is ``sample["audio"].size(-1) / sr`` and is kept when it lies in
    ``[min_len, max_len]``. Samples that raise while being inspected are
    logged and skipped.
    """

    def __init__(
        self,
        base_iterable,
        min_len: float,
        max_len: float,
        sr: int,
    ):
        self.base_iterable = base_iterable
        self.sr = sr
        self.min_len, self.max_len = min_len, max_len
        # Number of samples rejected for being out of range.
        self.filtered_count = 0

    def __iter__(self):
        """Stream samples one by one and filter on the fly."""
        for item in self.base_iterable:
            try:
                seconds = item["audio"].size(-1) / self.sr
                out_of_range = not (self.min_len <= seconds <= self.max_len)
                if out_of_range:
                    self.filtered_count += 1
                    logging.warning(
                        f"Filtered sample (duration out of range): "
                        f"{item['label']['id']} ({seconds:.2f}s)"
                    )
                    continue
                yield item
            except Exception as e:
                logging.warning(f"Skipped invalid sample during streaming: {e}")
def main() -> None:
    """Tokenize a dataset into paired WebDataset tar / JSONL shards.

    Reads either a raw JSONL file or a WebDataset manifest, filters samples
    by duration, tokenizes them in a pool of worker processes (one or more
    per GPU, or CPU-only), and writes token shards, an ``errors.jsonl`` log
    and a ``data.lst`` manifest.

    Raises:
        SystemExit: On invalid command-line arguments.
        RuntimeError: If a sample fails and ``--skip_errors`` is not set.
    """
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO, force=True)
    parser = build_parser()
    args = parser.parse_args()
    mp.set_start_method("spawn", force=True)

    # Validate input arguments (parser.error, unlike assert, survives -O)
    if bool(args.input_manifest) == bool(args.input_jsonl):
        parser.error(
            "Exactly one of --input_manifest or --input_jsonl must be provided."
        )

    if args.num_machines > 1 and not 0 <= args.machine_index < args.num_machines:
        parser.error(
            f"machine_index {args.machine_index} must be in [0, {args.num_machines})"
        )

    # Build base dataset and count total samples based on input mode
    if args.input_jsonl:
        logging.info(f"Input mode: raw JSONL ({args.input_jsonl})")
        total_samples = count_lines(args.input_jsonl)
        base_dataset = JsonlDatasetReader(
            args.input_jsonl,
            sample_rate=HIGGS_INPUT_SAMPLE_RATE,
            shuffle=args.shuffle,
            shuffle_seed=args.shuffle_seed,
        )
        loader_workers = args.loader_workers
    else:
        logging.info(f"Input mode: WebDataset manifest ({args.input_manifest})")
        manifest_num_lines = count_lines(args.input_manifest)
        loader_workers = min(args.loader_workers, manifest_num_lines)
        total_samples = 0
        manifests = []
        with open(args.input_manifest, "r", encoding="utf-8") as f:
            for line_id, line in tqdm(
                enumerate(f),
                total=manifest_num_lines,
                desc="Calculating dataset length",
            ):
                line = line.strip()
                if not line:
                    # Tolerate blank lines instead of crashing on items[1].
                    continue
                items = line.split(" ")
                tar_path, jsonl_path, num_items, duration = (
                    items[0],
                    items[1],
                    int(items[2]),
                    float(items[3]),
                )
                if not os.path.exists(tar_path):
                    raise FileNotFoundError(f"File {tar_path} does not exist.")
                if not os.path.exists(jsonl_path):
                    raise FileNotFoundError(f"File {jsonl_path} does not exist.")
                if not jsonl_path.endswith(".jsonl"):
                    raise ValueError(f"File {jsonl_path} is not a .jsonl file.")
                # Round-robin shard assignment across machines.
                if (
                    args.num_machines > 1
                    and line_id % args.num_machines != args.machine_index
                ):
                    continue
                total_samples += num_items
                manifests.append((tar_path, jsonl_path, num_items, duration))
        logging.info(
            f"Total shards: {manifest_num_lines}, "
            f"Shards for current index: {len(manifests)}"
        )
        base_dataset = WebDatasetReader(
            manifests=manifests,
            sample_rate=HIGGS_INPUT_SAMPLE_RATE,
            evaluation=True,
        )

    # Apply length filter and create DataLoader
    filtered_dataset = StreamingLengthFilteredDataset(
        base_iterable=base_dataset,
        min_len=args.min_length,
        max_len=args.max_length,
        sr=HIGGS_INPUT_SAMPLE_RATE,
    )
    dataloader = DataLoader(
        dataset=filtered_dataset,
        batch_size=None,
        num_workers=loader_workers,
        persistent_workers=loader_workers > 0,
        pin_memory=False,
    )

    # Adjust samples_per_shard if min_num_shards would be violated
    samples_per_shard = args.samples_per_shard
    if total_samples > 0:
        estimated_shards = max(
            1, (total_samples + samples_per_shard - 1) // samples_per_shard
        )
        if estimated_shards < args.min_num_shards:
            samples_per_shard = max(1, total_samples // args.min_num_shards)
            logging.info(
                f"Adjusted samples_per_shard from {args.samples_per_shard} to "
                f"{samples_per_shard} to meet min_num_shards={args.min_num_shards} "
                f"(total_samples={total_samples})"
            )

    # Configure multi-GPU multi-process setup
    num_devices = torch.cuda.device_count()
    if num_devices == 0:
        logging.warning("No GPUs detected - using CPU for processing")
        num_processes = args.nj_per_gpu
    else:
        num_processes = num_devices * args.nj_per_gpu
    logging.info(
        f"GPU count: {num_devices}, Processes per GPU: {args.nj_per_gpu}, "
        f"Total processes: {num_processes}"
    )
    if args.noise_manifest or args.rir_manifest:
        logging.info(
            f"Prompt augmentation enabled - "
            f"noise: {args.noise_manifest or 'off'}, rir: {args.rir_manifest or 'off'}"
        )

    # Shared GPU rank queue for process assignment: each worker pops one rank
    # in its initializer; -1 means CPU.
    manager = mp.Manager()
    rank_queue = manager.Queue()
    for rank in list(range(num_devices)) * args.nj_per_gpu:
        rank_queue.put(rank)
    if num_devices == 0:
        for _ in range(num_processes):
            rank_queue.put(-1)

    # Prepare output paths
    tar_output_pattern = str(Path(args.tar_output_pattern).expanduser())
    jsonl_output_pattern = str(Path(args.jsonl_output_pattern).expanduser())
    Path(tar_output_pattern).parent.mkdir(parents=True, exist_ok=True)
    Path(jsonl_output_pattern).parent.mkdir(parents=True, exist_ok=True)

    # Determine output directory from tar_output_pattern (two levels up)
    output_dir = Path(tar_output_pattern).parent.parent
    error_log_path = str(output_dir / "errors.jsonl")
    manifest_path = str(output_dir / "data.lst")

    # Setup error logger (writes to errors.jsonl)
    error_logger = logging.getLogger("error_log")
    error_logger.setLevel(logging.ERROR)
    error_logger.handlers.clear()
    error_fh = logging.FileHandler(error_log_path, mode="w", encoding="utf-8")
    error_fh.setFormatter(logging.Formatter("%(message)s"))
    error_logger.addHandler(error_fh)

    # Progress and error tracking
    processed_count = 0
    error_count = 0
    write_error_count = 0
    failed_ids = []
    shard_idx = 0
    shard_sample_count = 0
    shard_duration = 0.0
    shard_manifest = {}  # shard_idx -> (tar_path, jsonl_path, count, duration)

    tar_writer = None
    jsonl_file = None

    def open_new_shard():
        """Close the current shard (recording it) and open the next one."""
        nonlocal tar_writer, jsonl_file, shard_idx, shard_sample_count, shard_duration
        if tar_writer is not None:
            tar_writer.close()
        if jsonl_file is not None:
            jsonl_file.close()
        # Record manifest for the previous shard
        if shard_idx > 0 and shard_sample_count > 0:
            prev_idx = shard_idx - 1
            shard_manifest[prev_idx] = (
                os.path.abspath(tar_output_pattern % prev_idx),
                os.path.abspath(jsonl_output_pattern % prev_idx),
                shard_sample_count,
                shard_duration,
            )
        tar_fname = tar_output_pattern % shard_idx
        jsonl_fname = jsonl_output_pattern % shard_idx
        tar_writer = wds.TarWriter(tar_fname)
        jsonl_file = open(jsonl_fname, "w", encoding="utf-8")
        shard_idx += 1
        shard_sample_count = 0
        shard_duration = 0.0

    def write_sample(key, audio_tokens_np, metadata):
        """Write one sample to the open shard; return True on success."""
        nonlocal shard_sample_count, write_error_count, shard_duration
        assert tar_writer is not None and jsonl_file is not None
        try:
            token_record = serialise_numpy(key, audio_tokens_np)
            json_record = _encode_metadata(metadata)
            tar_writer.write(token_record)
            jsonl_file.write(json_record.decode("utf-8") + "\n")
            shard_sample_count += 1
            shard_duration += metadata.get("audio_duration", 0.0)
            return True
        except Exception as exc:
            write_error_count += 1
            failed_ids.append(key)
            error_logger.error(
                json.dumps({"id": key, "reason": str(exc)}, ensure_ascii=False)
            )
            logging.error(f"Write failed for sample {key}: {exc}")
            return False

    def handle_result(result):
        """Route one worker result to the shard writer or the error log."""
        nonlocal processed_count, error_count
        if result["status"] == "success":
            # Rotate shard if needed
            if tar_writer is None or shard_sample_count >= samples_per_shard:
                open_new_shard()
            # Count the sample only when the write succeeded; failed writes
            # are tracked separately in write_error_count.
            if write_sample(result["key"], result["audio_tokens"], result["metadata"]):
                processed_count += 1
        else:
            error_count += 1
            failed_ids.append(result["key"])
            error_logger.error(
                json.dumps(
                    {"id": result["key"], "reason": result["error_msg"]},
                    ensure_ascii=False,
                )
            )
            if not args.skip_errors:
                raise RuntimeError(
                    f"Sample {result['key']} processing failed due "
                    f"to {result['error_msg']} - terminating"
                )
            logging.warning(
                f"Skipping failed sample {result['key']}: {result['error_msg']}"
            )

    main_progress = tqdm(total=total_samples, desc="Extracting Audio Tokens")

    try:
        with ProcessPoolExecutor(
            max_workers=num_processes,
            initializer=process_init,
            initargs=(
                rank_queue,
                args.tokenizer_path,
                args.noise_manifest,
                args.rir_manifest,
            ),
        ) as executor:
            logging.info(f"Submitting tasks... ({num_processes} workers)")
            futures = set()
            # Bound the number of in-flight samples to cap memory use.
            max_pending = num_processes * 10

            def drain_completed():
                """Wait for at least one future to complete, process all done."""
                done, _ = wait(futures, return_when=FIRST_COMPLETED)
                for f in done:
                    futures.discard(f)
                    result = f.result()
                    main_progress.update(1)
                    handle_result(result)
                    main_progress.set_postfix(
                        Samples=processed_count,
                        Errors=error_count,
                    )

            # Stream samples from DataLoader
            for sample in dataloader:
                if len(futures) >= max_pending:
                    drain_completed()

                future = executor.submit(process_single_sample, sample)
                futures.add(future)

            # Process remaining futures
            logging.info("Processing remaining pending samples...")
            while futures:
                drain_completed()

    except Exception:
        logging.error("Critical error during processing", exc_info=True)
        raise
    finally:
        main_progress.close()
        if tar_writer is not None:
            tar_writer.close()
        if jsonl_file is not None:
            jsonl_file.close()
        # Record the last shard in the manifest
        if shard_idx > 0 and shard_sample_count > 0:
            last_idx = shard_idx - 1
            shard_manifest[last_idx] = (
                os.path.abspath(tar_output_pattern % last_idx),
                os.path.abspath(jsonl_output_pattern % last_idx),
                shard_sample_count,
                shard_duration,
            )
        # Flush and release errors.jsonl; nothing logs to it after this point.
        error_logger.removeHandler(error_fh)
        error_fh.close()

    # Write manifest file (data.lst)
    with open(manifest_path, "w", encoding="utf-8") as mf:
        for idx in sorted(shard_manifest.keys()):
            tar_path, jsonl_path, count, duration = shard_manifest[idx]
            mf.write(f"{tar_path} {jsonl_path} {count} {duration:.3f}\n")

    # Output final statistics
    total_failed = error_count + write_error_count
    filtered_and_skipped = total_samples - processed_count - total_failed
    logging.info(
        f"Processing Complete - Successful: {processed_count}, Failed: {total_failed}, "
        f"Filtered/Skipped: {filtered_and_skipped}, Shards written: {shard_idx}"
    )
    logging.info(f"Manifest written to: {manifest_path} ({len(shard_manifest)} shards)")
    if total_failed > 0:
        logging.info(f"Error details: {error_log_path}")
    if failed_ids and args.skip_errors:
        logging.warning(
            f"Failed sample IDs (count: {len(failed_ids)}): {failed_ids[:100]}..."
        )
    if write_error_count > 0 and not args.skip_errors:
        raise RuntimeError(
            f"{write_error_count} samples failed to write - check logs for details"
        )
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the JSONL-to-WebDataset packer."""
    parser = argparse.ArgumentParser(
        description="Pack JSONL audio dataset into WebDataset shards."
    )
    # (flag, add_argument keyword options), registered in declaration order.
    option_table = (
        (
            "--input",
            {"type": str, "default": "data.jsonl", "help": "Path to input JSONL file"},
        ),
        (
            "--output",
            {"type": str, "default": "emilia", "help": "Path to output directory"},
        ),
        (
            "--workers",
            {
                "type": int,
                "default": 16,
                "help": "Number of worker processes (default: 16)",
            },
        ),
        (
            "--threads",
            {
                "type": int,
                "default": 4,
                "help": "Number of threads per worker process.",
            },
        ),
        (
            "--shard-size",
            {
                "type": int,
                "default": 1000,
                "help": "Number of samples per shard (default: 1000)",
            },
        ),
        (
            "--sr",
            {
                "type": int,
                "default": 24000,
                "help": "Target sample rate (default: 24000)",
            },
        ),
        (
            "--shuffle",
            {"type": str2bool, "default": True, "help": "Shuffle data by default."},
        ),
        (
            "--shuffle-seed",
            {
                "type": int,
                "default": 42,
                "help": "Random seed for shuffle (default: 42)",
            },
        ),
        (
            "--min-duration",
            {
                "type": float,
                "default": None,
                "help": "Filter out samples shorter than this (seconds).",
            },
        ),
        (
            "--max-duration",
            {
                "type": float,
                "default": None,
                "help": "Filter out samples >= this duration (seconds).",
            },
        ),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser
def read_jsonl(file_path):
    """Yield one parsed JSON object per non-blank line of ``file_path``."""
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue
            yield json.loads(text)


def chunked_reader(iterator, chunk_size):
    """Yield successive lists of up to ``chunk_size`` items from ``iterator``."""
    source = iter(iterator)
    while True:
        batch = list(islice(source, chunk_size))
        if not batch:
            return
        yield batch


def process_audio_item(meta, target_sr):
    """Load, optionally resample and FLAC-encode the audio described by ``meta``.

    On success returns ``{"ok": (sample, meta)}`` where ``sample`` holds the
    ``__key__`` and ``flac`` bytes and ``meta`` has ``audio_duration`` added.
    On any failure returns ``{"error": {...}}`` with the id, path and reason.
    """
    key = meta.get("id")
    audio_path = meta.get("audio_path")

    if not (key and audio_path):
        failure = {
            "id": key,
            "audio_path": audio_path,
            "reason": "missing id or audio_path",
        }
        return {"error": failure}

    try:
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"{audio_path} not found")

        waveform, sr = load_waveform(audio_path)
        # Duration is measured on the source audio, before any resampling.
        meta["audio_duration"] = waveform.shape[1] / sr

        needs_resample = target_sr and sr != target_sr
        if needs_resample:
            resampled = torchaudio.functional.resample(
                torch.from_numpy(waveform), orig_freq=sr, new_freq=target_sr
            )
            waveform = resampled.numpy()
            sr = target_sr

        flac_buffer = io.BytesIO()
        sf.write(flac_buffer, waveform.T, sr, format="FLAC")

        sample = {"__key__": key, "flac": flac_buffer.getvalue()}
        return {"ok": (sample, meta)}

    except Exception as e:
        return {"error": {"id": key, "audio_path": audio_path, "reason": str(e)}}
def process_single_shard(
    shard_idx,
    records,
    output_tar_pattern,
    output_jsonl_pattern,
    target_sr,
    num_threads=4,
    min_duration=None,
    max_duration=None,
):
    """Encode ``records`` into one tar shard plus its JSONL metadata file.

    Audio items are processed in a thread pool and written in completion
    order. Items with ``audio_duration`` below ``min_duration`` or at/above
    ``max_duration`` are dropped. If nothing is written, both output files
    are removed.

    Returns:
        ``(shard_idx, processed_count, error_count, filtered_count,
        total_duration, errors)`` where ``errors`` is a list of error dicts.
    """
    tar_fname = output_tar_pattern % shard_idx
    jsonl_fname = output_jsonl_pattern % shard_idx

    processed_count = 0
    filtered_count = 0
    error_count = 0
    total_duration = 0.0
    errors = []

    with wds.TarWriter(tar_fname) as sink, open(
        jsonl_fname, "w", encoding="utf-8"
    ) as jsonl_f:

        with ThreadPoolExecutor(max_workers=num_threads) as thread_pool:
            futures = [
                thread_pool.submit(process_audio_item, meta, target_sr)
                for meta in records
            ]

            for f in as_completed(futures):
                result = f.result()

                if "error" in result:
                    error_count += 1
                    errors.append(result["error"])
                    continue

                sample, meta = result["ok"]
                dur = meta.get("audio_duration", 0.0)

                # Duration filtering (based on the audio_duration stored in meta)
                if min_duration is not None and dur < min_duration:
                    filtered_count += 1
                    continue
                if max_duration is not None and dur >= max_duration:
                    filtered_count += 1
                    continue

                sink.write(sample)
                jsonl_f.write(json.dumps(meta, ensure_ascii=False) + "\n")

                total_duration += dur
                processed_count += 1

    # Clean up empty shard files (after both writers are closed)
    if processed_count == 0:
        for p in (tar_fname, jsonl_fname):
            if os.path.exists(p):
                os.remove(p)

    return (
        shard_idx,
        processed_count,
        error_count,
        filtered_count,
        total_duration,
        errors,
    )


def count_lines(path):
    """Count the lines of ``path`` by scanning it in 1 MiB binary chunks.

    A final line that is not terminated by ``\\n`` is counted as well.
    """
    newline_count = 0
    last_byte = b""
    with open(path, "rb") as f:
        for buf in iter(lambda: f.read(1 << 20), b""):
            newline_count += buf.count(b"\n")
            last_byte = buf[-1:]
    # One more line than newlines when the file does not end with a newline.
    if last_byte and last_byte != b"\n":
        newline_count += 1
    return newline_count
def pack_dataset(
    input_jsonl,
    output_dir,
    samples_per_shard=5000,
    num_workers=16,
    target_sr=24000,
    threads_per_worker=4,
    shuffle=False,
    shuffle_seed=None,
    min_duration=None,
    max_duration=None,
):
    """Pack a JSONL audio dataset into tar/JSONL shard pairs under ``output_dir``.

    Shards are built in parallel by ``num_workers`` processes, each using
    ``threads_per_worker`` threads. Writes ``audios/shard-%06d.tar``,
    ``txts/shard-%06d.jsonl``, ``errors.jsonl`` and the ``data.lst`` manifest,
    then prints a summary.

    Args:
        input_jsonl: Path to the input JSONL file.
        output_dir: Directory receiving all outputs (created if missing).
        samples_per_shard: Number of input records assigned to each shard.
        num_workers: Number of worker processes.
        target_sr: Sample rate passed to the per-item processing.
        threads_per_worker: Threads used inside each worker process.
        shuffle: Load all entries and shuffle them before sharding.
        shuffle_seed: Seed passed to ``random.seed`` when shuffling.
        min_duration: Drop samples shorter than this many seconds.
        max_duration: Drop samples at or above this many seconds.
    """
    input_path = Path(input_jsonl)
    output_dir = Path(output_dir)
    output_tar_dir = output_dir / "audios"
    output_tar_dir.mkdir(parents=True, exist_ok=True)
    output_jsonl_dir = output_dir / "txts"
    output_jsonl_dir.mkdir(parents=True, exist_ok=True)

    output_tar_pattern = str(output_tar_dir / "shard-%06d.tar")
    output_jsonl_pattern = str(output_jsonl_dir / "shard-%06d.jsonl")

    error_log_path = str(output_dir / "errors.jsonl")

    # Setup error logger
    error_logger = logging.getLogger("error_log")
    error_logger.setLevel(logging.ERROR)
    error_logger.handlers.clear()
    fh = logging.FileHandler(error_log_path, mode="w", encoding="utf-8")
    fh.setFormatter(logging.Formatter("%(message)s"))
    error_logger.addHandler(fh)

    shard_manifest = {}

    print(f"Reading input: {input_path}")
    print(f"Output dir: {output_dir}")
    print(f"Strategy: {num_workers} Processes x {threads_per_worker} Threads")

    if shuffle:
        print("Load input dataset...")
        entries = list(read_jsonl(input_path))
        random.seed(shuffle_seed)
        random.shuffle(entries)
        print(f"Shuffled {len(entries)} entries (seed={shuffle_seed})")
        total_lines = len(entries)
        chunk_gen = chunked_reader(iter(entries), samples_per_shard)
    else:
        print("Calculating total lines...")
        total_lines = count_lines(input_path)
        chunk_gen = chunked_reader(read_jsonl(input_path), samples_per_shard)

    if min_duration is not None or max_duration is not None:
        print(
            f"Duration filter: [{min_duration or 0:.2f}s"
            f", {max_duration or float('inf'):.1f}s) (applied after audio decoding)"
        )

    total_shards_est = (total_lines + samples_per_shard - 1) // samples_per_shard
    print(f"Total samples: {total_lines}, Estimated shards: {total_shards_est}")

    total_processed = 0
    total_errors = 0
    total_filtered = 0
    failed_shards = 0

    try:
        with ProcessPoolExecutor(max_workers=num_workers) as executor:

            futures = set()

            shard_idx = 0

            pbar = tqdm(
                total=total_shards_est,
                desc="Shards Processed",
                unit="shard",
            )

            def submit_next_chunks(limit):
                """Pull up to `limit` chunks from generator, submit them."""
                nonlocal shard_idx
                submitted = 0
                for chunk in chunk_gen:
                    f = executor.submit(
                        process_single_shard,
                        shard_idx,
                        chunk,
                        output_tar_pattern,
                        output_jsonl_pattern,
                        target_sr,
                        threads_per_worker,
                        min_duration,
                        max_duration,
                    )
                    futures.add(f)
                    shard_idx += 1
                    submitted += 1
                    if submitted >= limit:
                        break

            # Keep roughly two shards queued per worker.
            submit_next_chunks(num_workers * 2)

            while futures:
                done, _ = wait(futures, return_when=FIRST_COMPLETED)

                for f in done:
                    futures.remove(f)

                    try:
                        (
                            s_idx,
                            p_count,
                            e_count,
                            f_count,
                            s_duration,
                            errors,
                        ) = f.result()
                        total_processed += p_count
                        total_errors += e_count
                        total_filtered += f_count

                        # Write error log
                        for err in errors:
                            err["shard_idx"] = s_idx
                            error_logger.error(json.dumps(err, ensure_ascii=False))

                        # Only shards that actually contain samples are listed.
                        if p_count > 0:
                            tar_abs = os.path.abspath(output_tar_pattern % s_idx)
                            jsonl_abs = os.path.abspath(output_jsonl_pattern % s_idx)
                            shard_manifest[s_idx] = (
                                tar_abs,
                                jsonl_abs,
                                p_count,
                                s_duration,
                            )

                        pbar.set_postfix(
                            {
                                "Samples": total_processed,
                                "Filtered": total_filtered,
                                "Errors": total_errors,
                            }
                        )
                        pbar.update(1)
                    except Exception as e:
                        # A whole shard task failed: count it and still advance
                        # the bar so progress reaches the total.
                        failed_shards += 1
                        print(f"Shard task failed: {e}")
                        pbar.update(1)

                    # Replace each finished task with one new chunk.
                    submit_next_chunks(1)

            pbar.close()
    finally:
        # Flush and release errors.jsonl even if packing aborts.
        error_logger.removeHandler(fh)
        fh.close()

    # Write final manifest file (data.lst)
    manifest_path = str(output_dir / "data.lst")
    with open(manifest_path, "w", encoding="utf-8") as mf:
        for idx in sorted(shard_manifest.keys()):
            tar_path, jsonl_path, count, duration = shard_manifest[idx]
            mf.write(f"{tar_path} {jsonl_path} {count} {duration:.3f}\n")

    print(f"\nDone! Output saved to {output_dir}")
    print(f"Successfully packed: {total_processed}")
    print(f"Filtered by duration: {total_filtered}")
    print(f"Failed: {total_errors}")
    if failed_shards > 0:
        print(f"Failed shard tasks: {failed_shards}")
    print(f"Manifest written to: {manifest_path} ({len(shard_manifest)} shards)")
    if total_errors > 0:
        print(f"Error details: {error_log_path}")
0000000000000000000000000000000000000000..4af2fc72b34dbdbd5de7c0d8f3386e8188386d41 --- /dev/null +++ b/omnivoice/training/builder.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 +# Copyright 2026 Xiaomi Corp. (authors: Han Zhu) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Builders for constructing training components. + +Provides factory functions to assemble the model, tokenizer, and data loaders +from a ``TrainingConfig``. Called by ``omnivoice.cli.train`` to set up training. + +Key functions: +- ``build_model_and_tokenizer()``: Loads the model and text tokenizer. +- ``build_dataloaders()``: Builds train/eval data loaders from a data config JSON. + The batching strategy is chosen based on ``TrainingConfig.attn_implementation``: + + - ``"flex_attention"``: sequence packing via ``PackingIterableDataset`` + + ``PackingDataCollator``. Batch shape is ``[1, C, batch_tokens]``. + - other (e.g. ``"sdpa"``): length-grouped padding via + ``StreamLengthGroupDataset`` + ``PaddingDataCollator``. Batch shape + is ``[B, C, max_len]`` where B ≥ 1 and max_len ≤ batch_tokens. 
def build_model_and_tokenizer(
    config: TrainingConfig,
) -> Tuple[OmniVoice, AutoTokenizer]:
    """Load Tokenizer and Model, handle resizing and special tokens.

    The tokenizer comes from ``config.init_from_checkpoint`` when set,
    otherwise from ``config.llm_name_or_path``. Missing special tokens are
    added, the model is either restored from the checkpoint or built around
    a freshly loaded LLM, its embeddings are resized to the tokenizer length,
    and the pad/bos/eos ids are copied into the model config.

    Returns:
        The ``(model, tokenizer)`` pair.
    """
    logger.info("Initializing Model & Tokenizer...")

    # 1. Tokenizer
    tokenizer_path = (
        config.init_from_checkpoint
        if config.init_from_checkpoint
        else config.llm_name_or_path
    )
    tokenizer_path = _resolve_model_path(tokenizer_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    new_tokens = [
        "<|denoise|>",
        "<|lang_start|>",
        "<|lang_end|>",
        "<|instruct_start|>",
        "<|instruct_end|>",
        "<|text_start|>",
        "<|text_end|>",
    ]

    # Fetch the vocabulary once instead of once per candidate token.
    vocab = tokenizer.get_vocab()
    tokens_to_add = [t for t in new_tokens if t not in vocab]
    if tokens_to_add:
        tokenizer.add_special_tokens({"additional_special_tokens": tokens_to_add})

    # 2. Model
    if config.init_from_checkpoint:
        logger.info(f"Loading weights from {config.init_from_checkpoint}")
        model = OmniVoice.from_pretrained(
            config.init_from_checkpoint,
            attn_implementation=config.attn_implementation,
            dtype=torch.float32,
            train=True,
        )
    else:
        resolved_llm = _resolve_model_path(config.llm_name_or_path)
        llm_config = AutoConfig.from_pretrained(resolved_llm)

        ov_config = OmniVoiceConfig(
            audio_vocab_size=config.audio_vocab_size,
            audio_mask_id=config.audio_mask_id,
            num_audio_codebook=config.num_audio_codebook,
            audio_codebook_weights=config.audio_codebook_weights,
            llm_config=llm_config,
        )

        original_level = hf_logging.get_verbosity()
        hf_logging.set_verbosity_error()  # suppress expected lm_head.weight warnings
        try:
            llm = AutoModel.from_pretrained(
                resolved_llm,
                attn_implementation=config.attn_implementation,
                dtype=torch.float32,
            )
        finally:
            # Restore verbosity even if loading fails.
            hf_logging.set_verbosity(original_level)
        model = OmniVoice(config=ov_config, llm=llm)

    # 3. Resize Embeddings
    if len(tokenizer) != model.config.llm_config.vocab_size:
        model.llm.resize_token_embeddings(len(tokenizer))
        model.config.llm_config.vocab_size = len(tokenizer)

    # 4. Config IDs
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.eos_token_id = tokenizer.eos_token_id

    return model, tokenizer
Config IDs + model.config.pad_token_id = tokenizer.pad_token_id + model.config.bos_token_id = tokenizer.bos_token_id + model.config.eos_token_id = tokenizer.eos_token_id + + return model, tokenizer + + +def build_dataloaders( + config: TrainingConfig, tokenizer: AutoTokenizer +) -> Tuple[DataLoader, DataLoader]: + """Setup Data Pipeline: Manifests -> WDS -> Batching -> Loaders. + + Batching strategy depends on ``config.attn_implementation``: + - ``"flex_attention"``: sequence packing (PackingIterableDataset + + PackingDataCollator). All samples are concatenated into one long sequence. + - other (e.g. ``"sdpa"``): length-grouped padding + (LengthGroupedIterableDataset + PaddingDataCollator). Samples with + similar token lengths are batched together and padded to the same length. + """ + logger.info("Initializing Data Readers...") + + processor = OmniVoiceSampleProcessor( + text_tokenizer=tokenizer, + num_channels=config.num_audio_codebook, + audio_mask_id=config.audio_mask_id, + prompt_ratio_range=config.prompt_ratio_range, + mask_ratio_range=config.mask_ratio_range, + drop_cond_ratio=config.drop_cond_ratio, + language_ratio=config.language_ratio, + use_pinyin_ratio=config.use_pinyin_ratio, + instruct_ratio=config.instruct_ratio, + only_instruct_ratio=config.only_instruct_ratio, + ) + + train_manifests, dev_manifests = prepare_data_manifests_from_json( + config.data_config + ) + raw_train_ds = WebDatasetReader(manifests=train_manifests, evaluation=False) + + use_packing = config.attn_implementation == "flex_attention" + + if use_packing: + train_dataset = PackingIterableDataset( + raw_train_ds, processor, config.batch_tokens + ) + collate_fn = PackingDataCollator(processor, config.batch_tokens) + else: + train_dataset = StreamLengthGroupDataset( + raw_train_ds, + batch_duration=config.batch_tokens, + min_length=config.min_sample_tokens, + max_length=config.max_sample_tokens, + max_sample=config.max_batch_size, + processor=processor, + length_fn=lambda s: 
s["length"], + ) + collate_fn = PaddingDataCollator(processor, config.batch_tokens) + + logger.info( + "Using %s (attn_implementation=%s)", + "sequence packing" if use_packing else "length-grouped padding", + config.attn_implementation, + ) + + init_fn = partial( + seed_worker, + num_workers=config.num_workers, + rank=( + torch.distributed.get_rank() + if torch.distributed.is_initialized() + else 0 + ), + ) + + train_loader = DataLoader( + train_dataset, + batch_size=None, + num_workers=config.num_workers, + collate_fn=collate_fn, + worker_init_fn=init_fn, + pin_memory=True, + prefetch_factor=4, + ) + + eval_loader = None + if dev_manifests: + raw_dev_ds = WebDatasetReader( + manifests=dev_manifests, evaluation=True + ) + if use_packing: + dev_dataset = PackingIterableDataset( + raw_dev_ds, processor, config.batch_tokens + ) + else: + dev_dataset = StreamLengthGroupDataset( + raw_dev_ds, + batch_duration=config.batch_tokens, + min_length=config.min_sample_tokens, + max_length=config.max_sample_tokens, + max_sample=config.max_batch_size, + processor=processor, + length_fn=lambda s: s["length"], + ) + eval_loader = DataLoader( + dev_dataset, + batch_size=None, # Each item is already a collated batch + num_workers=1, + collate_fn=collate_fn, + pin_memory=True, + prefetch_factor=2, + ) + + return train_loader, eval_loader diff --git a/omnivoice/training/checkpoint.py b/omnivoice/training/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..4bbf969d727708225a8edbd0d05611302db418b9 --- /dev/null +++ b/omnivoice/training/checkpoint.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +# Copyright 2026 Xiaomi Corp. (authors: Han Zhu) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
logger = logging.getLogger(__name__)

# Directory-name prefix of every checkpoint written by ``save_checkpoint``.
_CHECKPOINT_PREFIX = "checkpoint-"


def _checkpoint_step(dirname: str) -> Optional[int]:
    """Return the step in ``checkpoint-<step>``, or None for any other name."""
    if not dirname.startswith(_CHECKPOINT_PREFIX):
        return None
    suffix = dirname[len(_CHECKPOINT_PREFIX):]
    # isascii() guards against Unicode digits that isdigit() accepts but
    # int() rejects.
    if suffix.isascii() and suffix.isdigit():
        return int(suffix)
    return None


class TrainLogger:
    """
    Handles logging to console and trackers (TensorBoard/WandB)
    """

    def __init__(self, accelerator: Accelerator, total_steps: int, logging_steps: int):
        """Store the accelerator and step settings; no bar exists until start()."""
        self.accelerator = accelerator
        self.total_steps = total_steps
        self.logging_steps = logging_steps
        self.start_time = None
        self.progress_bar = None

    def start(self, start_step: int = 0):
        """Record the start time and, on the main process, create the tqdm bar.

        Args:
            start_step: Initial position of the bar (non-zero when resuming).
        """
        self.start_time = time.time()

        if self.accelerator.is_main_process:
            self.progress_bar = tqdm(
                total=self.total_steps,
                initial=start_step,
                desc="Training",
                dynamic_ncols=True,
                disable=not self.accelerator.is_local_main_process,
            )

    def update(
        self, step: int, loss: Optional[float] = None, lr: Optional[float] = None
    ):
        """
        Called every step to update the progress bar UI.
        """
        # Compare against None explicitly: a tqdm bar's truthiness is derived
        # from its ``total``, not from whether the bar exists.
        if self.progress_bar is None:
            return

        self.progress_bar.update(1)

        # Update real-time metrics on the progress bar itself
        postfix = {}
        if loss is not None:
            postfix["loss"] = f"{loss:.4f}"
        if lr is not None:
            postfix["lr"] = f"{lr:.2e}"

        if postfix:
            self.progress_bar.set_postfix(postfix)

    def log_metrics(self, step: int, metrics: Dict[str, Any]):
        """
        Called periodically to log to TensorBoard/WandB and console.
        """
        # Log to trackers (TensorBoard, etc.)
        self.accelerator.log(metrics, step=step)

        if not self.accelerator.is_main_process:
            return

        # Format a one-line console summary (separate from the tqdm postfix).
        formatted_metrics = []
        for k, v in metrics.items():
            if isinstance(v, float):
                val_str = f"{v:.4f}"
                # Tiny non-zero values would print as 0.0000; switch to
                # scientific notation so they stay visible.
                if val_str == "0.0000" and v != 0:
                    formatted_metrics.append(f"{k}: {v:.2e}")
                else:
                    formatted_metrics.append(f"{k}: {val_str}")
            else:
                formatted_metrics.append(f"{k}: {v}")

        msg = f"Step {step} | " + " | ".join(formatted_metrics)
        if self.progress_bar is not None:
            # tqdm.write prints without breaking the progress bar.
            self.progress_bar.write(msg)
        else:
            logger.info(msg)

    def close(self):
        """Close the progress bar if one was created."""
        if self.progress_bar is not None:
            self.progress_bar.close()


def save_checkpoint(
    accelerator: Accelerator,
    model: torch.nn.Module,
    tokenizer: Any,
    output_dir: str,
    step: int,
    keep_last_n: int = 3,
):
    """
    Saves model, tokenizer, and accelerator states (optimizer/scheduler).
    Manages rotation of checkpoints.

    Args:
        accelerator: Accelerator used for state saving and model unwrapping.
        model: The (possibly wrapped) model to save.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        output_dir: Parent directory holding ``checkpoint-<step>`` folders.
        step: Training step used to name the checkpoint directory.
        keep_last_n: Number of most recent checkpoints to keep; values <= 0
            disable rotation.
    """
    checkpoint_dir = os.path.join(output_dir, f"{_CHECKPOINT_PREFIX}{step}")

    # 1. Save Accelerator State (Optimizer, Scheduler, RNG, Scaler)
    accelerator.save_state(checkpoint_dir)

    # 2. Save Model in HF format (config.json + pytorch_model.bin/safetensors)
    unwrap_model = accelerator.unwrap_model(model)
    unwrap_model.save_pretrained(
        checkpoint_dir,
        is_main_process=accelerator.is_main_process,
        save_function=accelerator.save,
    )

    # 3. Save Tokenizer
    if accelerator.is_main_process:
        tokenizer.save_pretrained(checkpoint_dir)

    logger.info(f"Saved checkpoint to {checkpoint_dir}")

    # 4. Rotate checkpoints (Keep last N)
    if accelerator.is_main_process and keep_last_n > 0:
        # Only directories named exactly ``checkpoint-<int>`` take part in the
        # rotation; others (e.g. ``checkpoint-best``) are left untouched
        # instead of crashing the step-number sort.
        steps_by_dir = {}
        for entry in os.listdir(output_dir):
            entry_step = _checkpoint_step(entry)
            if entry_step is not None and os.path.isdir(
                os.path.join(output_dir, entry)
            ):
                steps_by_dir[entry] = entry_step

        ordered = sorted(steps_by_dir, key=steps_by_dir.get)
        for stale in ordered[:-keep_last_n]:
            shutil.rmtree(os.path.join(output_dir, stale))
            logger.info(f"Removed old checkpoint {stale}")


def load_checkpoint(accelerator: Accelerator, checkpoint_path: str):
    """
    Resumes training state.

    Args:
        accelerator: Accelerator whose state is restored via ``load_state``.
        checkpoint_path: Checkpoint directory; a trailing path separator is
            tolerated.

    Returns:
        The integer after the last ``-`` in the directory name, or 0 when that
        suffix is not an integer.
    """
    logger.info(f"Resuming from {checkpoint_path}")
    accelerator.load_state(checkpoint_path)

    # Try to infer step
    clean_path = os.path.normpath(checkpoint_path)
    try:
        return int(os.path.basename(clean_path).split("-")[-1])
    except ValueError:
        return 0
+ +"""Training configuration dataclass. + +Defines ``TrainingConfig``, a dataclass that holds all hyperparameters and paths +for training. Loaded from a JSON config file via ``TrainingConfig.from_json()`` +in ``omnivoice.cli.train``. +""" + +import json +from dataclasses import asdict, dataclass, field +from typing import List, Optional, Tuple + + +@dataclass +class TrainingConfig: + # Key Paths + output_dir: Optional[str] = None + data_config: Optional[str] = None + + # Model Specific + llm_name_or_path: str = "Qwen/Qwen3-0.6B" + audio_vocab_size: int = 1025 # valid vocab size + 1 (mask token) + audio_mask_id: int = 1024 # 1024 is the 1025-th token + num_audio_codebook: int = 8 + + # Model Training Specific + audio_codebook_weights: List[float | int] = field( + default_factory=lambda: [8, 8, 6, 6, 4, 4, 2, 2] + ) + drop_cond_ratio: float = 0.1 + prompt_ratio_range: Tuple[float, float] = field(default_factory=lambda: (0.0, 0.3)) + mask_ratio_range: Tuple[float, float] = field(default_factory=lambda: (0.0, 1.0)) + language_ratio: float = 0.8 + use_pinyin_ratio: float = 0.3 + instruct_ratio: float = 1.0 + only_instruct_ratio: float = 0.5 + + # Init settings + resume_from_checkpoint: Optional[str] = None + init_from_checkpoint: Optional[str] = None + + # Training Hyperparams + learning_rate: float = 1e-4 + weight_decay: float = 0.01 + max_grad_norm: float = 1.0 + steps: int = 300000 + seed: int = 42 + lr_scheduler_type: str = "cosine" + warmup_type: str = "ratio" + warmup_ratio: float = 0.03 + warmup_steps: int = 2000 + + # Data + batch_tokens: int = 8192 + gradient_accumulation_steps: int = 1 + num_workers: int = 8 + + # System + mixed_precision: str = "bf16" + allow_tf32: bool = True + use_deepspeed: bool = False + deepspeed_config: Optional[str] = None + attn_implementation: str = "flex_attention" + + # Length-grouped batching (only used when attn_implementation != "flex_attention") + max_sample_tokens: int = 2000 + min_sample_tokens: int = 50 + max_batch_size: int 
= 64 + + # Logging + logging_steps: int = 100 + eval_steps: int = 1000 + save_steps: int = 10000 + keep_last_n_checkpoints: int = -1 + + @classmethod + def from_json(cls, json_path: str): + with open(json_path, "r") as f: + cfg_dict = json.load(f) + valid_keys = cls.__annotations__.keys() + filtered_dict = {k: v for k, v in cfg_dict.items() if k in valid_keys} + instance = cls(**filtered_dict) + return instance + + def save_to_json(self, json_path: str): + data = asdict(self) + with open(json_path, "w") as f: + json.dump(data, f, indent=4) diff --git a/omnivoice/training/trainer.py b/omnivoice/training/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..081955ea8ec5a0f7a23bdcffeedc6bdde751330a --- /dev/null +++ b/omnivoice/training/trainer.py @@ -0,0 +1,353 @@ +#!/usr/bin/env python3 +# Copyright 2026 Xiaomi Corp. (authors: Han Zhu) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Training loop for OmniVoice. + +Wraps the HuggingFace Accelerate training loop with checkpoint saving/resuming, +evaluation, gradient accumulation, and learning rate scheduling. +Launched via ``omnivoice.cli.train``. 
+""" + +import logging +import math +import os +import sys +import time +from datetime import timedelta +from typing import Any, Optional + +import torch +from accelerate import Accelerator, DistributedDataParallelKwargs +from accelerate.utils import DeepSpeedPlugin, InitProcessGroupKwargs, set_seed +from torch.utils.data import DataLoader +from transformers import ( + get_cosine_schedule_with_warmup, + get_constant_schedule_with_warmup, +) + +from omnivoice.training.checkpoint import TrainLogger, load_checkpoint +from omnivoice.training.checkpoint import save_checkpoint as engine_save_checkpoint + +logger = logging.getLogger(__name__) + + +def _to_device(batch, device): + """Move all tensors in a batch dict to the target device.""" + return { + k: v.to(device, non_blocking=True) if isinstance(v, torch.Tensor) else v + for k, v in batch.items() + } + + +class OmniTrainer: + def __init__( + self, + model: torch.nn.Module, + config: Any, # TrainingConfig + train_dataloader: DataLoader, + eval_dataloader: Optional[DataLoader] = None, + tokenizer: Optional[Any] = None, + optimizer: Optional[torch.optim.Optimizer] = None, + lr_scheduler: Optional[Any] = None, + ): + self.config = config + self.model = model + self.tokenizer = tokenizer + self.train_dataloader = train_dataloader + self.eval_dataloader = eval_dataloader + + # 1. Initialize Accelerator + self.accelerator = self._init_accelerator() + + # 2. Setup Optimizer & Scheduler if not provided + if optimizer is None: + self.optimizer, self.lr_scheduler = self.create_optimizer_and_scheduler() + else: + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + + # 3. DeepSpeed Hack (Batch Size fix) + if self.accelerator.distributed_type == "DEEPSPEED": + self.accelerator.state.deepspeed_plugin.deepspeed_config[ + "train_micro_batch_size_per_gpu" + ] = 1 + + # 4. 
Prepare with Accelerator + (self.model, self.optimizer, self.lr_scheduler,) = self.accelerator.prepare( + self.model, + self.optimizer, + self.lr_scheduler, + ) + + self.global_step = 0 + self.epoch = 0 + + def _init_accelerator(self) -> Accelerator: + """Initialize Accelerator, DeepSpeed, and Logging.""" + # TF32 setup + if getattr(self.config, "allow_tf32", False): + torch.set_float32_matmul_precision("high") + + # Init handlers + ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False) + init_kwargs = InitProcessGroupKwargs(timeout=timedelta(minutes=60)) + + # DeepSpeed setup + deepspeed_plugin = None + if self.config.use_deepspeed and self.config.deepspeed_config: + if not os.path.exists(self.config.deepspeed_config): + raise FileNotFoundError( + f"DeepSpeed config not found: {self.config.deepspeed_config}" + ) + deepspeed_plugin = DeepSpeedPlugin( + hf_ds_config=self.config.deepspeed_config, + gradient_accumulation_steps=self.config.gradient_accumulation_steps, + gradient_clipping=self.config.max_grad_norm, + ) + + accelerator = Accelerator( + gradient_accumulation_steps=self.config.gradient_accumulation_steps, + mixed_precision=self.config.mixed_precision, + log_with="tensorboard", + project_dir=self.config.output_dir, + step_scheduler_with_optimizer=False, + kwargs_handlers=[ddp_kwargs, init_kwargs], + deepspeed_plugin=deepspeed_plugin, + split_batches=False, + ) + + # Logging setup + if accelerator.is_main_process: + os.makedirs(self.config.output_dir, exist_ok=True) + # Try to save config if it has the method + if hasattr(self.config, "save_to_json"): + self.config.save_to_json( + os.path.join(self.config.output_dir, "initial_config.json") + ) + + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + handlers=[ + logging.StreamHandler(sys.stdout), + logging.FileHandler( + os.path.join(self.config.output_dir, "train.log") + ), + ], + ) + else: + 
logging.basicConfig(level=logging.ERROR) + + logger.info(f"Loaded Config: {self.config}") + set_seed(self.config.seed) + accelerator.init_trackers("tensorboard") + return accelerator + + def create_optimizer_and_scheduler(self): + """Default AdamW + configurable LR Scheduler.""" + optimizer = torch.optim.AdamW( + self.model.parameters(), + lr=self.config.learning_rate, + weight_decay=self.config.weight_decay, + ) + + if self.config.warmup_type == "ratio": + final_warmup_steps = math.ceil(self.config.steps * self.config.warmup_ratio) + else: + final_warmup_steps = self.config.warmup_steps + + if self.config.lr_scheduler_type == "constant": + lr_scheduler = get_constant_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=final_warmup_steps, + ) + else: + lr_scheduler = get_cosine_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=final_warmup_steps, + num_training_steps=self.config.steps, + ) + return optimizer, lr_scheduler + + def save_checkpoint(self, step): + """Wrapper for engine save_checkpoint.""" + engine_save_checkpoint( + self.accelerator, + self.model, + self.tokenizer, + self.config.output_dir, + step, + self.config.keep_last_n_checkpoints, + ) + # Save config copy for convenience + if self.accelerator.is_main_process and hasattr(self.config, "save_to_json"): + checkpoint_dir = os.path.join(self.config.output_dir, f"checkpoint-{step}") + self.config.save_to_json(os.path.join(checkpoint_dir, "train_config.json")) + + def load_checkpoint(self, checkpoint_path): + """Wrapper for loading.""" + step = load_checkpoint(self.accelerator, checkpoint_path) + self.global_step = step + logger.info(f"Resumed from step {self.global_step}") + return step + + def evaluate(self): + """Evaluation loop.""" + if self.eval_dataloader is None: + return {} + + self.model.eval() + logger.info(f"Running evaluation at step {self.global_step}...") + + local_loss_sum = torch.tensor(0.0, device=self.accelerator.device) + eval_count = 0 + + with 
torch.no_grad(): + for eval_batch in self.eval_dataloader: + eval_batch = _to_device(eval_batch, self.accelerator.device) + outputs = self.model(**eval_batch) + local_loss_sum += outputs.loss.detach() + eval_count += 1 + + if eval_count > 0: + local_mean = local_loss_sum / eval_count + else: + local_mean = torch.tensor(0.0, device=self.accelerator.device) + + all_means = self.accelerator.gather(local_mean) + final_eval_loss = all_means.mean().item() + + eval_metrics = {"eval/loss": final_eval_loss} + self.accelerator.log(eval_metrics, step=self.global_step) + logger.info(f"Eval Loss: {final_eval_loss:.4f}") + + self.accelerator.wait_for_everyone() + self.model.train() + return eval_metrics + + def train(self): + """Main training loop.""" + logger.info("Starting Training Loop...") + + # Resume if configured + if self.config.resume_from_checkpoint: + self.load_checkpoint(self.config.resume_from_checkpoint) + + # Handle IterableDataset Epochs + if hasattr(self.train_dataloader.dataset, "set_epoch"): + self.train_dataloader.dataset.set_epoch(self.epoch) + + # Logger + train_logger = TrainLogger( + self.accelerator, self.config.steps, self.config.logging_steps + ) + train_logger.start(self.global_step) + + self.model.train() + train_iterator = iter(self.train_dataloader) + + logging_start_time = time.time() + logging_start_step = self.global_step + tr_loss = torch.tensor(0.0).to(self.accelerator.device) + logging_loss_scalar = 0.0 + + while self.global_step < self.config.steps: + try: + batch = next(train_iterator) + except StopIteration: + self.epoch += 1 + logger.info(f"Epoch {self.epoch} starting. 
Resetting dataloader...") + if hasattr(self.train_dataloader.dataset, "set_epoch"): + self.train_dataloader.dataset.set_epoch(self.epoch) + + train_iterator = iter(self.train_dataloader) + batch = next(train_iterator) + + batch = _to_device(batch, self.accelerator.device) + + with self.accelerator.accumulate(self.model): + outputs = self.model(**batch) + loss = outputs.loss + tr_loss += loss.detach() + self.accelerator.backward(loss) + + if self.accelerator.sync_gradients: + # Clipping + grad_norm = 0.0 + if self.config.max_grad_norm > 0: + grad_norm = self.accelerator.clip_grad_norm_( + self.model.parameters(), self.config.max_grad_norm + ) + grad_norm = ( + grad_norm.item() if grad_norm is not None else 0.0 + ) + + self.optimizer.step() + self.lr_scheduler.step() + self.optimizer.zero_grad() + self.global_step += 1 + + # Logging + current_lr = self.lr_scheduler.get_last_lr()[0] + train_logger.update( + step=self.global_step, loss=loss.item(), lr=current_lr + ) + + if self.global_step % self.config.logging_steps == 0: + elapsed = time.time() - logging_start_time + steps_per_sec = ( + (self.global_step - logging_start_step) / elapsed + if elapsed > 0 + else 0 + ) + + tr_loss_scalar = self.accelerator.gather(tr_loss).mean().item() + current_interval_loss = tr_loss_scalar - logging_loss_scalar + avg_loss = current_interval_loss / ( + self.config.logging_steps + * self.config.gradient_accumulation_steps + ) + logging_loss_scalar = tr_loss_scalar + + logs = { + "train/loss": avg_loss, + "train/learning_rate": current_lr, + "train/grad_norm": grad_norm, + "train/epoch": self.epoch, + "train/steps_per_sec": steps_per_sec, + } + train_logger.log_metrics(step=self.global_step, metrics=logs) + + logging_start_time = time.time() + logging_start_step = self.global_step + + # Evaluate + if ( + self.eval_dataloader is not None + and self.global_step % self.config.eval_steps == 0 + ): + self.evaluate() + + # Save + if self.global_step % self.config.save_steps == 0: + 
self.save_checkpoint(self.global_step) + + # Final Save + self.save_checkpoint(self.global_step) + train_logger.close() + self.accelerator.end_training() diff --git a/omnivoice/utils/__init__.py b/omnivoice/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/omnivoice/utils/audio.py b/omnivoice/utils/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..02c40100727df676e5545d748c3f0fb4c1bf37f1 --- /dev/null +++ b/omnivoice/utils/audio.py @@ -0,0 +1,343 @@ +#!/usr/bin/env python3 +# Copyright 2026 Xiaomi Corp. (authors: Han Zhu) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Audio I/O and processing utilities. + +Provides functions for loading, resampling, silence removal, +chunking, cross-fading, and format conversion. + +All public functions in this module operate on **numpy float32 arrays** +with shape ``(C, T)`` (channels-first). 
+""" + +import io +import logging + +import numpy as np +import soundfile as sf +import torch +import torchaudio +from pydub import AudioSegment +from pydub.silence import detect_leading_silence, detect_nonsilent, split_on_silence + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Loading +# --------------------------------------------------------------------------- + + +def load_waveform(audio_path: str): + """Load audio from a file path, returning (data, sample_rate). + + Tries two backends in order: + 1. soundfile — covers WAV/FLAC/OGG etc., no ffmpeg needed. + 2. librosa — covers MP3/M4A etc. via audioread + ffmpeg. + + Returns: + (data, sample_rate) where data is a numpy float32 array of + shape (C, T). + """ + try: + data, sr = sf.read(audio_path, dtype="float32", always_2d=True) + return data.T, sr # (T, C) → (C, T) + except Exception: + # soundfile cannot handle MP3/M4A etc., fall back to librosa. + import librosa + + data, sr = librosa.load(audio_path, sr=None, mono=False) + if data.ndim == 1: + data = data[np.newaxis, :] + return data, sr + + +def load_audio(audio_path: str, sampling_rate: int) -> np.ndarray: + """Load a waveform from file and resample to the target rate. + + Parameters: + audio_path: path of the audio. + sampling_rate: target sampling rate. + + Returns: + Numpy float32 array of shape (1, T). + """ + data, sr = load_waveform(audio_path) + + if data.shape[0] > 1: + data = np.mean(data, axis=0, keepdims=True) + if sr != sampling_rate: + data = torchaudio.functional.resample( + torch.from_numpy(data), orig_freq=sr, new_freq=sampling_rate + ).numpy() + + return data + + +def load_audio_bytes(raw: bytes, sampling_rate: int) -> np.ndarray: + """Load audio from in-memory bytes and resample. + + Parameters: + raw: raw audio file bytes (e.g. from WebDataset). + sampling_rate: target sampling rate. + + Returns: + Numpy float32 array of shape (1, T). 
+ """ + buf = io.BytesIO(raw) + + try: + data, sr = sf.read(buf, dtype="float32", always_2d=True) + data = data.T # (T, C) → (C, T) + except Exception: + import librosa + + buf.seek(0) + data, sr = librosa.load(buf, sr=None, mono=False) + if data.ndim == 1: + data = data[np.newaxis, :] + + if data.shape[0] > 1: + data = np.mean(data, axis=0, keepdims=True) + if sr != sampling_rate: + data = torchaudio.functional.resample( + torch.from_numpy(data), orig_freq=sr, new_freq=sampling_rate + ).numpy() + + return data + + +# --------------------------------------------------------------------------- +# Audio processing (all numpy in / numpy out) +# --------------------------------------------------------------------------- + + +def numpy_to_audiosegment(audio: np.ndarray, sample_rate: int) -> AudioSegment: + """Convert a numpy float32 array of shape (C, T) to a pydub AudioSegment.""" + audio_int = (audio * 32768.0).clip(-32768, 32767).astype(np.int16) + if audio_int.shape[0] > 1: + audio_int = audio_int.T.flatten() # interleave channels + return AudioSegment( + data=audio_int.tobytes(), + sample_width=2, + frame_rate=sample_rate, + channels=audio.shape[0], + ) + + +def audiosegment_to_numpy(aseg: AudioSegment) -> np.ndarray: + """Convert a pydub AudioSegment to a numpy float32 array of shape (C, T).""" + data = np.array(aseg.get_array_of_samples()).astype(np.float32) / 32768.0 + if aseg.channels == 1: + return data[np.newaxis, :] + return data.reshape(-1, aseg.channels).T + + +def remove_silence( + audio: np.ndarray, + sampling_rate: int, + mid_sil: int = 300, + lead_sil: int = 100, + trail_sil: int = 300, +) -> np.ndarray: + """Remove middle silences longer than *mid_sil* ms and trim edge silences. + + Parameters: + audio: numpy array with shape (C, T). + sampling_rate: sampling rate of the audio. + mid_sil: middle-silence threshold in ms (0 to skip). + lead_sil: kept leading silence in ms. + trail_sil: kept trailing silence in ms. 
+ + Returns: + Numpy array with shape (C, T'). + """ + wave = numpy_to_audiosegment(audio, sampling_rate) + + if mid_sil > 0: + non_silent_segs = split_on_silence( + wave, + min_silence_len=mid_sil, + silence_thresh=-50, + keep_silence=mid_sil, + seek_step=10, + ) + wave = AudioSegment.silent(duration=0) + for seg in non_silent_segs: + wave += seg + + wave = remove_silence_edges(wave, lead_sil, trail_sil, -50) + + return audiosegment_to_numpy(wave) + + +def remove_silence_edges( + audio: AudioSegment, + lead_sil: int = 100, + trail_sil: int = 300, + silence_threshold: float = -50, +) -> AudioSegment: + """Remove edge silences, keeping *lead_sil* / *trail_sil* ms.""" + start_idx = detect_leading_silence(audio, silence_threshold=silence_threshold) + start_idx = max(0, start_idx - lead_sil) + audio = audio[start_idx:] + + audio = audio.reverse() + start_idx = detect_leading_silence(audio, silence_threshold=silence_threshold) + start_idx = max(0, start_idx - trail_sil) + audio = audio[start_idx:] + audio = audio.reverse() + + return audio + + +def fade_and_pad_audio( + audio: np.ndarray, + pad_duration: float = 0.1, + fade_duration: float = 0.1, + sample_rate: int = 24000, +) -> np.ndarray: + """Apply fade-in/out and pad with silence to prevent clicks. + + Args: + audio: numpy array of shape (C, T). + pad_duration: silence padding duration per side (seconds). + fade_duration: fade curve duration (seconds). + sample_rate: audio sampling rate. + + Returns: + Processed numpy array of shape (C, T_new). 
+ """ + if audio.shape[-1] == 0: + return audio + + fade_samples = int(fade_duration * sample_rate) + pad_samples = int(pad_duration * sample_rate) + + processed = audio.copy() + + if fade_samples > 0: + k = min(fade_samples, processed.shape[-1] // 2) + if k > 0: + fade_in = np.linspace(0, 1, k, dtype=np.float32)[np.newaxis, :] + processed[..., :k] *= fade_in + + fade_out = np.linspace(1, 0, k, dtype=np.float32)[np.newaxis, :] + processed[..., -k:] *= fade_out + + if pad_samples > 0: + silence = np.zeros( + (processed.shape[0], pad_samples), + dtype=processed.dtype, + ) + processed = np.concatenate([silence, processed, silence], axis=-1) + + return processed + + +def trim_long_audio( + audio: np.ndarray, + sampling_rate: int, + max_duration: float = 15.0, + min_duration: float = 3.0, + trim_threshold: float = 20.0, +) -> np.ndarray: + """Trim audio to <= *max_duration* by splitting at the largest silence gap. + + Only trims when the audio exceeds *trim_threshold* seconds. + + Args: + audio: numpy array of shape (C, T). + sampling_rate: audio sampling rate. + max_duration: maximum duration in seconds. + min_duration: minimum duration in seconds. + trim_threshold: only trim if audio is longer than this (seconds). + + Returns: + Trimmed numpy array. 
+ """ + duration = audio.shape[-1] / sampling_rate + if duration <= trim_threshold: + return audio + + seg = numpy_to_audiosegment(audio, sampling_rate) + nonsilent = detect_nonsilent( + seg, min_silence_len=100, silence_thresh=-40, seek_step=10 + ) + if not nonsilent: + return audio + + max_ms = int(max_duration * 1000) + min_ms = int(min_duration * 1000) + + best_split = 0 + for start, end in nonsilent: + if start > best_split and start <= max_ms: + best_split = start + if end > max_ms: + break + + if best_split < min_ms: + best_split = min(max_ms, len(seg)) + + trimmed = seg[:best_split] + return audiosegment_to_numpy(trimmed) + + +def cross_fade_chunks( + chunks: list[np.ndarray], + sample_rate: int, + silence_duration: float = 0.3, +) -> np.ndarray: + """Concatenate audio chunks with silence gaps and cross-fade at boundaries. + + Args: + chunks: list of numpy arrays, each (C, T). + sample_rate: audio sample rate. + silence_duration: total silence gap duration in seconds. + + Returns: + Merged numpy array (C, T_total). 
def cross_fade_chunks(
    chunks: list[np.ndarray],
    sample_rate: int,
    silence_duration: float = 0.3,
) -> np.ndarray:
    """Join audio chunks, fading each boundary through a short silence.

    The gap budget ``int(silence_duration * sample_rate)`` is split into
    thirds: a linear fade-out at the end of the preceding chunk, a block of
    zeros, and a linear fade-in at the start of the following chunk. The
    fades are applied to copies; the input arrays are never modified.

    Args:
        chunks: non-empty list of numpy arrays, each (C, T).
        sample_rate: audio sample rate.
        silence_duration: total gap budget in seconds (fade-out + silence +
            fade-in); the inserted silence itself is one third of it.

    Returns:
        Merged numpy array (C, T_total). When a single chunk is given, that
        same array object is returned (no copy).

    Raises:
        ValueError: if *chunks* is empty.
    """
    if len(chunks) == 0:
        # Previously surfaced as an opaque IndexError from ``chunks[0]``.
        raise ValueError("cross_fade_chunks() requires at least one chunk")
    if len(chunks) == 1:
        return chunks[0]

    fade_n = int(silence_duration * sample_rate) // 3
    silence_n = fade_n
    gap = np.zeros((chunks[0].shape[0], silence_n), dtype=np.float32)

    last_index = len(chunks) - 1
    pieces = []
    for index, chunk in enumerate(chunks):
        piece = chunk.copy()
        # The ramp is bounded by this chunk's own length, so a short chunk
        # gets a full 1 -> 0 / 0 -> 1 ramp instead of a truncated one.
        ramp_n = min(fade_n, piece.shape[-1])
        if ramp_n > 0:
            if index > 0:
                fade_in = np.linspace(0, 1, ramp_n, dtype=np.float32)
                piece[..., :ramp_n] *= fade_in[np.newaxis, :]
            if index < last_index:
                fade_out = np.linspace(1, 0, ramp_n, dtype=np.float32)
                piece[..., -ramp_n:] *= fade_out[np.newaxis, :]
        if index > 0:
            pieces.append(gap)
        pieces.append(piece)

    # One concatenation at the end keeps the total copying linear in the
    # output length instead of re-copying the merged buffer per chunk.
    return np.concatenate(pieces, axis=-1)
def str2bool(v):
    """Parse a command-line style boolean.

    Intended for ``argparse.ArgumentParser.add_argument(type=str2bool)``.
    Matching is case-insensitive and ignores surrounding whitespace:

    - yes, true, t, y, 1 -> True
    - no, false, f, n, 0 -> False

    Real ``bool`` values are returned unchanged. Any other value is
    converted with ``str()`` first, so e.g. the integers ``1`` and ``0`` are
    accepted instead of failing on a missing ``.lower()``.

    See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse  # noqa

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised boolean.
    """
    if isinstance(v, bool):
        return v
    normalized = str(v).strip().lower()
    if normalized in ("yes", "true", "t", "y", "1"):
        return True
    if normalized in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")


def fix_random_seed(random_seed: int):
    """Seed the global random number generators.

    Seeds the ``random`` module, numpy's legacy global generator and torch
    with the same value so that subsequent draws are reproducible.

    Args:
        random_seed: the seed applied to all three libraries.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.random.manual_seed(random_seed)
def read_test_list(path):
    """Read a JSONL test list file.

    Each non-blank line should be a JSON object. The following keys are
    copied into the result; any key missing from a line becomes ``None``
    (this function does not enforce that ``id`` and ``text`` are present):
        id, text, ref_audio, ref_text, instruct,
        language_id, language_name, duration, speed

    Lines that are not valid JSON, or that are valid JSON but not an object
    (for example a bare list or string), are skipped with a warning.

    Args:
        path: path to the JSONL file (``str`` or ``pathlib.Path``).

    Returns:
        A list of dicts, one per accepted line, in file order.
    """
    fields = (
        "id",
        "text",
        "ref_audio",
        "ref_text",
        "language_id",
        "language_name",
        "duration",
        "speed",
        "instruct",
    )
    samples = []
    with Path(path).open("r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, 1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                logging.warning(
                    "Skipping malformed JSON at line %s: %s", line_no, line
                )
                continue
            if not isinstance(obj, dict):
                # json.loads also accepts lists/strings/numbers, which have
                # no ``.get``; treat them like any other unusable line.
                logging.warning(
                    "Skipping non-object JSON at line %s: %s", line_no, line
                )
                continue
            samples.append({key: obj.get(key) for key in fields})
    return samples
class RuleDurationEstimator:
    """Estimate speech duration from text using per-character weights.

    Every character is mapped to a relative "speaking time" weight based on
    its Unicode category and script block. A target text's duration is then
    obtained by scaling a reference (text, duration) pair by the ratio of
    the total weights.
    """

    def __init__(self):
        # ==========================================
        # 1. Phonetic Weights Table
        # ==========================================
        # The weight represents the relative speaking time compared to
        # a standard Latin letter.
        # Benchmark: 1.0 = One Latin Character (~40-50ms)
        self.weights = {
            # --- Logographic (1 char = full syllable/word) ---
            "cjk": 3.0,  # Chinese, Japanese Kanji, etc.
            # --- Syllabic / Blocks
            "hangul": 2.5,  # Korean Hangul
            "kana": 2.2,  # Japanese Hiragana/Katakana
            "ethiopic": 3.0,  # Amharic/Ge'ez
            "yi": 3.0,  # Yi script
            # --- Abugida (Consonant-Vowel complexes) ---
            "indic": 1.8,  # Hindi, Bengali, Tamil, etc.
            "thai_lao": 1.5,  # Thai, Lao
            "khmer_myanmar": 1.8,  # Khmer, Myanmar
            # --- Abjad (Consonant-heavy) ---
            "arabic": 1.5,  # Arabic, Persian, Urdu
            "hebrew": 1.5,  # Hebrew
            # --- Alphabet (Segmental) ---
            "latin": 1.0,  # English, Spanish, French, Vietnamese, etc. (Baseline)
            "cyrillic": 1.0,  # Russian, Ukrainian
            "greek": 1.0,  # Greek
            "armenian": 1.0,  # Armenian
            "georgian": 1.0,  # Georgian
            # --- Symbols & Misc ---
            "punctuation": 0.5,  # Pause capability
            "space": 0.2,  # Word boundary/Breath (0.05 / 0.22)
            "digit": 3.5,  # Numbers
            "mark": 0.0,  # Diacritics/Accents (Silent modifiers)
            "default": 1.0,  # Fallback for unknown scripts
        }

        # ==========================================
        # 2. Unicode Range Mapping
        # ==========================================
        # Format: (End_Codepoint, Type_Key)
        # Used for fast binary search (bisect). Entry i covers every code
        # point above the previous entry's end up to and including its own
        # end, so the table must be ascending and must not skip blocks:
        # a skipped block silently inherits the NEXT entry's type.
        self.ranges = [
            (0x02AF, "latin"),  # Latin (Basic, Supplement, Ext, IPA)
            (0x03FF, "greek"),  # Greek & Coptic
            (0x052F, "cyrillic"),  # Cyrillic
            (0x058F, "armenian"),  # Armenian
            (0x05FF, "hebrew"),  # Hebrew
            (0x077F, "arabic"),  # Arabic, Syriac, Arabic Supplement
            (0x089F, "arabic"),  # Arabic Extended-B (+ Syriac Supp)
            (0x08FF, "arabic"),  # Arabic Extended-A
            (0x097F, "indic"),  # Devanagari
            (0x09FF, "indic"),  # Bengali
            (0x0A7F, "indic"),  # Gurmukhi
            (0x0AFF, "indic"),  # Gujarati
            (0x0B7F, "indic"),  # Oriya
            (0x0BFF, "indic"),  # Tamil
            (0x0C7F, "indic"),  # Telugu
            (0x0CFF, "indic"),  # Kannada
            (0x0D7F, "indic"),  # Malayalam
            (0x0DFF, "indic"),  # Sinhala
            (0x0EFF, "thai_lao"),  # Thai & Lao
            (0x0FFF, "indic"),  # Tibetan (Abugida)
            (0x109F, "khmer_myanmar"),  # Myanmar
            (0x10FF, "georgian"),  # Georgian
            (0x11FF, "hangul"),  # Hangul Jamo
            (0x137F, "ethiopic"),  # Ethiopic
            (0x139F, "ethiopic"),  # Ethiopic Supplement
            (0x13FF, "default"),  # Cherokee
            (0x167F, "default"),  # Canadian Aboriginal Syllabics
            (0x169F, "default"),  # Ogham
            (0x16FF, "default"),  # Runic
            (0x171F, "default"),  # Tagalog (Baybayin)
            (0x173F, "default"),  # Hanunoo
            (0x175F, "default"),  # Buhid
            (0x177F, "default"),  # Tagbanwa
            (0x17FF, "khmer_myanmar"),  # Khmer
            (0x18AF, "default"),  # Mongolian
            (0x18FF, "default"),  # Canadian Aboriginal Syllabics Ext
            (0x194F, "indic"),  # Limbu
            (0x19DF, "indic"),  # Tai Le & New Tai Lue
            (0x19FF, "khmer_myanmar"),  # Khmer Symbols
            (0x1A1F, "indic"),  # Buginese
            (0x1AAF, "indic"),  # Tai Tham
            (0x1B7F, "indic"),  # Balinese
            (0x1BBF, "indic"),  # Sundanese
            (0x1BFF, "indic"),  # Batak
            (0x1C4F, "indic"),  # Lepcha
            (0x1C7F, "indic"),  # Ol Chiki (Santali)
            (0x1C8F, "cyrillic"),  # Cyrillic Extended-C
            (0x1CBF, "georgian"),  # Georgian Extended
            (0x1CCF, "indic"),  # Sundanese Supplement
            (0x1CFF, "indic"),  # Vedic Extensions
            (0x1D7F, "latin"),  # Phonetic Extensions
            (0x1DBF, "latin"),  # Phonetic Extensions Supplement
            (0x1DFF, "default"),  # Combining Diacritical Marks Supplement
            (0x1EFF, "latin"),  # Latin Extended Additional (Vietnamese)
            # The blocks between U+1F00 and U+303F used to be missing, so
            # their letters fell through to the "kana" entry below.
            (0x1FFF, "greek"),  # Greek Extended (polytonic)
            (0x2BFF, "default"),  # General Punctuation .. Misc Symbols and Arrows
            (0x2C5F, "default"),  # Glagolitic
            (0x2C7F, "latin"),  # Latin Extended-C
            (0x2CFF, "default"),  # Coptic
            (0x2D2F, "georgian"),  # Georgian Supplement
            (0x2D7F, "default"),  # Tifinagh
            (0x2DDF, "ethiopic"),  # Ethiopic Extended
            (0x2DFF, "cyrillic"),  # Cyrillic Extended-A
            (0x2E7F, "default"),  # Supplemental Punctuation
            (0x303F, "cjk"),  # CJK/Kangxi Radicals, CJK Symbols & Punctuation
            (0x309F, "kana"),  # Hiragana
            (0x30FF, "kana"),  # Katakana
            (0x312F, "cjk"),  # Bopomofo (Pinyin)
            (0x318F, "hangul"),  # Hangul Compatibility Jamo
            (0x9FFF, "cjk"),  # CJK Unified Ideographs (Main)
            (0xA4CF, "yi"),  # Yi Syllables
            (0xA4FF, "default"),  # Lisu
            (0xA63F, "default"),  # Vai
            (0xA69F, "cyrillic"),  # Cyrillic Extended-B
            (0xA6FF, "default"),  # Bamum
            (0xA7FF, "latin"),  # Latin Extended-D
            (0xA82F, "indic"),  # Syloti Nagri
            (0xA87F, "default"),  # Phags-pa
            (0xA8DF, "indic"),  # Saurashtra
            (0xA8FF, "indic"),  # Devanagari Extended
            (0xA92F, "indic"),  # Kayah Li
            (0xA95F, "indic"),  # Rejang
            (0xA97F, "hangul"),  # Hangul Jamo Extended-A
            (0xA9DF, "indic"),  # Javanese
            (0xA9FF, "khmer_myanmar"),  # Myanmar Extended-B
            (0xAA5F, "indic"),  # Cham
            (0xAA7F, "khmer_myanmar"),  # Myanmar Extended-A
            (0xAADF, "indic"),  # Tai Viet
            (0xAAFF, "indic"),  # Meetei Mayek Extensions
            (0xAB2F, "ethiopic"),  # Ethiopic Extended-A
            (0xAB6F, "latin"),  # Latin Extended-E
            (0xABBF, "default"),  # Cherokee Supplement
            (0xABFF, "indic"),  # Meetei Mayek
            (0xD7AF, "hangul"),  # Hangul Syllables
            (0xFAFF, "cjk"),  # CJK Compatibility
            (0xFDFF, "arabic"),  # Arabic Presentation Forms-A
            (0xFE6F, "default"),  # Variation Selectors
            (0xFEFF, "arabic"),  # Arabic Presentation Forms-B
            (0xFFEF, "latin"),  # Fullwidth Latin
        ]
        self.breakpoints = [r[0] for r in self.ranges]

    # NOTE(review): lru_cache on a method keys on ``self`` and keeps cached
    # instances alive; fine if estimators are few and long-lived -- confirm.
    @lru_cache(maxsize=4096)
    def _get_char_weight(self, char):
        """Return the speaking-time weight of a single character."""
        code = ord(char)
        # Fast path: ASCII letters and the plain space.
        if (65 <= code <= 90) or (97 <= code <= 122):
            return self.weights["latin"]
        if code == 32:
            return self.weights["space"]

        # Ignore arabic Tatweel
        if code == 0x0640:
            return self.weights["mark"]

        # Category-based classes take precedence over the script block.
        category = unicodedata.category(char)

        if category.startswith("M"):
            return self.weights["mark"]

        if category.startswith(("P", "S")):
            return self.weights["punctuation"]

        if category.startswith("Z"):
            return self.weights["space"]

        if category.startswith("N"):
            return self.weights["digit"]

        # Binary search for the Unicode block. Marks, punctuation, symbols,
        # separators and numbers were already handled above, so only the
        # remaining categories reach this lookup.
        idx = bisect.bisect_left(self.breakpoints, code)
        if idx < len(self.ranges):
            script_type = self.ranges[idx][1]
            return self.weights.get(script_type, self.weights["default"])

        # Upper planes (CJK Ext B/C/D, historic scripts). U+20000 itself is
        # the first CJK Extension B ideograph, hence ``>=``.
        if code >= 0x20000:
            return self.weights["cjk"]

        return self.weights["default"]

    def calculate_total_weight(self, text):
        """Sums up the normalized weights for a string."""
        return sum(self._get_char_weight(c) for c in text)

    def estimate_duration(
        self,
        target_text: str,
        ref_text: str,
        ref_duration: float,
        low_threshold: Optional[float] = 50,
        boost_strength: float = 3,
    ) -> float:
        """Estimate how long *target_text* takes to speak.

        The reference pair gives a speaking rate (weight per unit of
        duration); the target's total weight is divided by that rate. The
        result is in the same unit as *ref_duration*.

        Args:
            target_text (str): The text for which we want to estimate the duration.
            ref_text (str): The reference text that was used to measure
                the ref_duration.
            ref_duration (float): The actual duration it took
                to speak the ref_text.
            low_threshold (float): Estimates below this value are lifted
                along a power curve towards it; ``None`` disables the boost.
            boost_strength (float): Controls the power-curve boost for short durations.
                Higher values boost small durations more aggressively.
                1 = no boost (linear), 2 = sqrt-like

        Returns:
            float: The estimated duration for the target_text based
            on the ref_text and ref_duration, or 0.0 when the reference is
            unusable (empty text, non-positive duration, or zero weight).
        """
        if ref_duration <= 0 or not ref_text:
            return 0.0

        ref_weight = self.calculate_total_weight(ref_text)
        if ref_weight == 0:
            return 0.0

        speed_factor = ref_weight / ref_duration
        target_weight = self.calculate_total_weight(target_text)

        estimated_duration = target_weight / speed_factor
        if low_threshold is not None and estimated_duration < low_threshold:
            # threshold * (estimate / threshold) ** (1 / boost_strength)
            alpha = 1.0 / boost_strength
            return low_threshold * (estimated_duration / low_threshold) ** alpha
        else:
            return estimated_duration


# ==========================================
# Example Usage
# ==========================================
if __name__ == "__main__":
    estimator = RuleDurationEstimator()

    ref_txt = "Hello, world."
    ref_dur = 1.5

    test_cases = [
        ("Hindi (With complex marks)", "नमस्ते दुनिया"),
        ("Arabic (With vowels)", "مَرْحَبًا بِالْعَالَم"),
        ("Vietnamese (Lots of diacritics)", "Chào thế giới"),
        ("Chinese", "你好,世界!"),
        ("Mixed Emoji", "Hello 🌍! This is fun 🎉"),
    ]

    print("--- Reference ---")
    print(f"Reference Text: '{ref_txt}'")
    print(f"Reference Duration: {ref_dur}s")
    print("-" * 30)

    for lang, txt in test_cases:
        est_time = estimator.estimate_duration(txt, ref_txt, ref_dur)
        weight = estimator.calculate_total_weight(txt)

        print(f"[{lang}]")
        print(f"Text: {txt}")
        print(f"Total Weight: {weight:.2f}")
        print(f"Estimated Duration: {est_time:.2f} s")
        print("-" * 30)
(authors: Han Zhu) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Language name to ISO 639-3 code mapping. + +Auto-generated from ``docs/lang_id_name_map.tsv``. Provides ``LANG_NAME_TO_ID`` +(for resolving language names to codes) and ``LANG_IDS`` (the set of supported +ISO 639-3 codes). Used by ``OmniVoice.generate()`` to resolve user-provided +language names. +""" + +# Auto-generated from docs/lang_id_name_map.tsv +# Maps lowercase language name -> language ID code + +LANG_NAME_TO_ID = { + "abadi": "kbt", + "abkhazian": "ab", + "abron": "abr", + "abua": "abn", + "adamawa fulfulde": "fub", + "adyghe": "ady", + "afade": "aal", + "afrikaans": "af", + "agwagwune": "yay", + "aja (benin)": "ajg", + "akebu": "keu", + "alago": "ala", + "albanian": "sq", + "algerian arabic": "arq", + "algerian saharan arabic": "aao", + "ambo-pasco quechua": "qva", + "ambonese malay": "abs", + "amdo tibetan": "adx", + "amharic": "am", + "anaang": "anw", + "angika": "anp", + "antankarana malagasy": "xmv", + "aragonese": "an", + "arbëreshë albanian": "aae", + "arequipa-la unión quechua": "qxu", + "armenian": "hy", + "ashe": "ahs", + "ashéninka perené": "prq", + "askopan": "eiv", + "assamese": "as", + "asturian": "ast", + "atayal": "tay", + "awak": "awo", + "ayacucho quechua": "quy", + "azerbaijani": "az", + "baatonum": "bba", + "bacama": "bcy", + "bade": "bde", + "bafia": "ksf", + "bafut": "bfd", + "bagirmi fulfulde": 
"fui", + "bago-kusuntu": "bqg", + "baharna arabic": "abv", + "bakoko": "bkh", + "balanta-ganja": "bjt", + "balti": "bft", + "bamenyam": "bce", + "bamun": "bax", + "bangwinji": "bsj", + "banjar": "bjn", + "bankon": "abb", + "baoulé": "bci", + "bara malagasy": "bhr", + "barok": "bjk", + "basa (cameroon)": "bas", + "basa (nigeria)": "bzw", + "bashkir": "ba", + "basque": "eu", + "batak mandailing": "btm", + "batanga": "bnm", + "bateri": "btv", + "bats": "bbl", + "bayot": "bda", + "bebele": "beb", + "belarusian": "be", + "bengali": "bn", + "betawi": "bew", + "bhili": "bhb", + "bhojpuri": "bho", + "bilur": "bxf", + "bima": "bhp", + "bodo": "brx", + "boghom": "bux", + "bokyi": "bky", + "bomu": "bmq", + "bondei": "bou", + "borgu fulfulde": "fue", + "bosnian": "bs", + "brahui": "brh", + "braj": "bra", + "breton": "br", + "buduma": "bdm", + "buginese": "bug", + "bukharic": "bhh", + "bulgarian": "bg", + "bulu (cameroon)": "bum", + "bundeli": "bns", + "bunun": "bnn", + "bura-pabir": "bwr", + "burak": "bys", + "burmese": "my", + "burushaski": "bsk", + "cacaloxtepec mixtec": "miu", + "cajatambo north lima quechua": "qvl", + "cakfem-mushere": "cky", + "cameroon pidgin": "wes", + "campidanese sardinian": "sro", + "cantonese": "yue", + "catalan": "ca", + "cebuano": "ceb", + "cen": "cen", + "central kurdish": "ckb", + "central nahuatl": "nhn", + "central pame": "pbs", + "central pashto": "pst", + "central puebla nahuatl": "ncx", + "central tarahumara": "tar", + "central yupik": "esu", + "central-eastern niger fulfulde": "fuq", + "chadian arabic": "shu", + "chichewa": "ny", + "chichicapan zapotec": "zpv", + "chiga": "cgg", + "chimalapa zoque": "zoh", + "chimborazo highland quichua": "qug", + "chinese": "zh", + "chiquián ancash quechua": "qxa", + "chitwania tharu": "the", + "chokwe": "cjk", + "chuvash": "cv", + "cibak": "ckl", + "coastal konjo": "kjc", + "copainalá zoque": "zoc", + "cornish": "kw", + "corongo ancash quechua": "qwa", + "croatian": "hr", + "cross river mbembe": "mfn", + 
"cuyamecalco mixtec": "xtu", + "czech": "cs", + "dadiya": "dbd", + "dagbani": "dag", + "dameli": "dml", + "danish": "da", + "dargwa": "dar", + "dazaga": "dzg", + "deccan": "dcc", + "degema": "deg", + "dera (nigeria)": "kna", + "dghwede": "dgh", + "dhatki": "mki", + "dhivehi": "dv", + "dhofari arabic": "adf", + "dijim-bwilim": "cfa", + "dogri": "dgo", + "domaaki": "dmk", + "dotyali": "dty", + "duala": "dua", + "dutch": "nl", + "dũya": "ldb", + "dyula": "dyu", + "eastern balochi": "bgp", + "eastern bolivian guaraní": "gui", + "eastern egyptian bedawi arabic": "avl", + "eastern krahn": "kqo", + "eastern mari": "mhr", + "eastern yiddish": "ydd", + "ebrié": "ebr", + "eggon": "ego", + "egyptian arabic": "arz", + "ejagham": "etu", + "eleme": "elm", + "eloyi": "afo", + "embu": "ebu", + "english": "en", + "erzya": "myv", + "esan": "ish", + "esperanto": "eo", + "estonian": "et", + "eton (cameroon)": "eto", + "ewondo": "ewo", + "extremaduran": "ext", + "fang (equatorial guinea)": "fan", + "fanti": "fat", + "farefare": "gur", + "fe'fe'": "fmp", + "filipino": "fil", + "filomena mata-coahuitlán totonac": "tlp", + "finnish": "fi", + "fipa": "fip", + "french": "fr", + "fulah": "ff", + "galician": "gl", + "gambian wolof": "wof", + "ganda": "lg", + "garhwali": "gbm", + "gawar-bati": "gwt", + "gawri": "gwc", + "gbagyi": "gbr", + "gbari": "gby", + "geji": "gyz", + "gen": "gej", + "georgian": "ka", + "german": "de", + "geser-gorom": "ges", + "gheg albanian": "aln", + "ghomálá'": "bbj", + "gidar": "gid", + "glavda": "glw", + "goan konkani": "gom", + "goaria": "gig", + "goemai": "ank", + "gola": "gol", + "greek": "el", + "guarani": "gn", + "guduf-gava": "gdf", + "guerrero amuzgo": "amu", + "gujarati": "gu", + "gujari": "gju", + "gulf arabic": "afb", + "gurgula": "ggg", + "gusii": "guz", + "gusilay": "gsl", + "gweno": "gwe", + "güilá zapotec": "ztu", + "hadothi": "hoj", + "hahon": "hah", + "haitian": "ht", + "hakha chin": "cnh", + "hakö": "hao", + "halia": "hla", + "hausa": "ha", + 
"hawaiian": "haw", + "hazaragi": "haz", + "hebrew": "he", + "hemba": "hem", + "herero": "hz", + "highland konjo": "kjk", + "hijazi arabic": "acw", + "hindi": "hi", + "huarijio": "var", + "huautla mazatec": "mau", + "huaxcaleca nahuatl": "nhq", + "huba": "hbb", + "huitepec mixtec": "mxs", + "hula": "hul", + "hungarian": "hu", + "hunjara-kaina ke": "hkk", + "hwana": "hwo", + "ibibio": "ibb", + "icelandic": "is", + "idakho-isukha-tiriki": "ida", + "idoma": "idu", + "igbo": "ig", + "igo": "ahl", + "ikposo": "kpo", + "ikwere": "ikw", + "imbabura highland quichua": "qvi", + "indonesian": "id", + "indus kohistani": "mvy", + "interlingua (international auxiliary language association)": "ia", + "inupiaq": "ik", + "irish": "ga", + "iron ossetic": "os", + "isekiri": "its", + "isoko": "iso", + "italian": "it", + "ito": "itw", + "itzá": "itz", + "ixtayutla mixtec": "vmj", + "izon": "ijc", + "jambi malay": "jax", + "japanese": "ja", + "jaqaru": "jqr", + "jauja wanca quechua": "qxw", + "jaunsari": "jns", + "javanese": "jv", + "jiba": "juo", + "jju": "kaj", + "judeo-moroccan arabic": "aju", + "juxtlahuaca mixtec": "vmc", + "kabardian": "kbd", + "kabras": "lkb", + "kabuverdianu": "kea", + "kabyle": "kab", + "kachi koli": "gjk", + "kairak": "ckr", + "kalabari": "ijn", + "kalasha": "kls", + "kalenjin": "kln", + "kalkoti": "xka", + "kamba": "kam", + "kamo": "kcq", + "kanauji": "bjj", + "kanembu": "kbl", + "kannada": "kn", + "karekare": "kai", + "kashmiri": "ks", + "kathoriya tharu": "tkt", + "kati": "bsh", + "kazakh": "kk", + "keiyo": "eyo", + "khams tibetan": "khg", + "khana": "ogo", + "khetrani": "xhe", + "khmer": "km", + "khowar": "khw", + "kinga": "zga", + "kinnauri": "kfk", + "kinyarwanda": "rw", + "kirghiz": "ky", + "kirya-konzəl": "fkk", + "kochila tharu": "thq", + "kohistani shina": "plk", + "kohumono": "bcs", + "kok borok": "trp", + "kol (papua new guinea)": "kol", + "kom (cameroon)": "bkm", + "koma": "kmy", + "konkani": "knn", + "konzo": "koo", + "korean": "ko", + "korwa": 
"kfp", + "kota (india)": "kfe", + "koti": "eko", + "kuanua": "ksd", + "kuanyama": "kj", + "kui (india)": "uki", + "kulung (nigeria)": "bbu", + "kuot": "kto", + "kushi": "kuh", + "kwambi": "kwm", + "kwasio": "nmg", + "lala-roba": "lla", + "lamang": "hia", + "lao": "lo", + "larike-wakasihu": "alo", + "lasi": "lss", + "latgalian": "ltg", + "latvian": "lv", + "levantine arabic": "apc", + "liana-seti": "ste", + "liberia kpelle": "xpe", + "liberian english": "lir", + "libyan arabic": "ayl", + "ligurian": "lij", + "lijili": "mgi", + "lingala": "ln", + "lithuanian": "lt", + "loarki": "lrk", + "logooli": "rag", + "logudorese sardinian": "src", + "loja highland quichua": "qvj", + "loloda": "loa", + "longuda": "lnu", + "loxicha zapotec": "ztp", + "luba-lulua": "lua", + "luo": "luo", + "lushai": "lus", + "luxembourgish": "lb", + "maasina fulfulde": "ffm", + "maba (chad)": "mde", + "macedo-romanian": "rup", + "macedonian": "mk", + "mada (cameroon)": "mxu", + "mafa": "maf", + "maithili": "mai", + "malay": "ms", + "malayalam": "ml", + "mali": "gcc", + "malinaltepec me'phaa": "tcf", + "maltese": "mt", + "mandara": "tbf", + "mandjak": "mfv", + "manggarai": "mqy", + "manipuri": "mni", + "mansoanka": "msw", + "manx": "gv", + "maori": "mi", + "marathi": "mr", + "marghi central": "mrt", + "marghi south": "mfm", + "maria (india)": "mrr", + "marwari (pakistan)": "mve", + "masana": "mcn", + "masikoro malagasy": "msh", + "matsés": "mcf", + "mazaltepec zapotec": "zpy", + "mazatlán mazatec": "vmz", + "mazatlán mixe": "mzl", + "mbe": "mfo", + "mbo (cameroon)": "mbo", + "mbum": "mdd", + "medumba": "byv", + "mekeo": "mek", + "meru": "mer", + "mesopotamian arabic": "acm", + "mewari": "mtr", + "min nan chinese": "nan", + "mingrelian": "xmf", + "mitlatongo mixtec": "vmm", + "miya": "mkf", + "mokpwe": "bri", + "moksha": "mdf", + "mom jango": "ver", + "mongolian": "mn", + "moroccan arabic": "ary", + "motu": "meu", + "mpiemo": "mcx", + "mpumpong": "mgg", + "mundang": "mua", + "mungaka": "mhk", + 
"musey": "mse", + "musgu": "mug", + "musi": "mui", + "naba": "mne", + "najdi arabic": "ars", + "nalik": "nal", + "nawdm": "nmz", + "ndonga": "ng", + "neapolitan": "nap", + "nepali": "npi", + "ngamo": "nbh", + "ngas": "anc", + "ngiemboon": "nnh", + "ngizim": "ngi", + "ngomba": "jgo", + "ngombale": "nla", + "nigerian fulfulde": "fuv", + "nigerian pidgin": "pcm", + "nimadi": "noe", + "nobiin": "fia", + "north mesopotamian arabic": "ayp", + "north moluccan malay": "max", + "northern betsimisaraka malagasy": "bmm", + "northern hindko": "hno", + "northern kurdish": "kmr", + "northern pame": "pmq", + "northern pashto": "pbu", + "northern uzbek": "uzn", + "northwest gbaya": "gya", + "norwegian": "no", + "norwegian bokmål": "nb", + "norwegian nynorsk": "nn", + "notsi": "ncf", + "nyankpa": "yes", + "nyungwe": "nyu", + "nzanyi": "nja", + "nüpode huitoto": "hux", + "occitan": "oc", + "od": "odk", + "odia": "ory", + "odual": "odu", + "omani arabic": "acx", + "orizaba nahuatl": "nlv", + "orma": "orc", + "ormuri": "oru", + "oromo": "om", + "pahari-potwari": "phr", + "paiwan": "pwn", + "panjabi": "pa", + "papuan malay": "pmy", + "parkari koli": "kvx", + "pedi": "nso", + "pero": "pip", + "persian": "fa", + "petats": "pex", + "phalura": "phl", + "piemontese": "pms", + "piya-kwonci": "piy", + "plateau malagasy": "plt", + "polish": "pl", + "poqomam": "poc", + "portuguese": "pt", + "pulaar": "fuc", + "pular": "fuf", + "puno quechua": "qxp", + "pushto": "ps", + "pökoot": "pko", + "qaqet": "byx", + "quiotepec chinantec": "chq", + "rana tharu": "thr", + "rangi": "lag", + "rapoisi": "kyx", + "ratahan": "rth", + "rayón zoque": "zor", + "romanian": "ro", + "romansh": "rm", + "rombo": "rof", + "rotokas": "roo", + "rukai": "dru", + "russian": "ru", + "sacapulteco": "quv", + "saidi arabic": "aec", + "sakalava malagasy": "skg", + "sakizaya": "szy", + "saleman": "sau", + "samba daka": "ccg", + "samba leko": "ndi", + "san felipe otlaltepec popoloca": "pow", + "san francisco del mar huave": "hue", 
+ "san juan atzingo popoloca": "poe", + "san martín itunyoso triqui": "trq", + "san miguel el grande mixtec": "mig", + "sansi": "ssi", + "sanskrit": "sa", + "santa ana de tusi pasco quechua": "qxt", + "santa catarina albarradas zapotec": "ztn", + "santali": "sat", + "santiago del estero quichua": "qus", + "saposa": "sps", + "saraiki": "skr", + "sardinian": "sc", + "saya": "say", + "sediq": "trv", + "serbian": "sr", + "seri": "sei", + "shina": "scl", + "shona": "sn", + "siar-lak": "sjr", + "sibe": "nco", + "sicilian": "scn", + "sihuas ancash quechua": "qws", + "sikkimese": "sip", + "sinaugoro": "snc", + "sindhi": "sd", + "sindhi bhil": "sbn", + "sinhala": "si", + "sinicahua mixtec": "xti", + "sipacapense": "qum", + "siwai": "siw", + "slovak": "sk", + "slovenian": "sl", + "solos": "sol", + "somali": "so", + "soninke": "snk", + "south giziga": "giz", + "south ucayali ashéninka": "cpy", + "southeastern nochixtlán mixtec": "mxy", + "southern betsimisaraka malagasy": "bzc", + "southern pashto": "pbt", + "southern pastaza quechua": "qup", + "soyaltepec mazatec": "vmp", + "spanish": "es", + "standard arabic": "arb", + "standard moroccan tamazight": "zgh", + "sudanese arabic": "apd", + "sulka": "sua", + "svan": "sva", + "swahili": "sw", + "swedish": "sv", + "tae'": "rob", + "tahaggart tamahaq": "thv", + "taita": "dav", + "tajik": "tg", + "tamil": "ta", + "tandroy-mahafaly malagasy": "tdx", + "tangale": "tan", + "tanosy malagasy": "txy", + "tarok": "yer", + "tatar": "tt", + "tedaga": "tuq", + "telugu": "te", + "tem": "kdh", + "teop": "tio", + "tepeuxila cuicatec": "cux", + "tepinapa chinantec": "cte", + "tera": "ttr", + "terei": "buo", + "termanu": "twu", + "tesaka malagasy": "tkg", + "tetelcingo nahuatl": "nhg", + "teutila cuicatec": "cut", + "thai": "th", + "tibetan": "bo", + "tidaá mixtec": "mtx", + "tidore": "tvo", + "tigak": "tgc", + "tigre": "tig", + "tigrinya": "ti", + "tilquiapan zapotec": "zts", + "tinputz": "tpz", + "tlacoapa me'phaa": "tpl", + "tlacoatzintepec 
chinantec": "ctl", + "tlingit": "tli", + "toki pona": "tok", + "tomoip": "tqp", + "tondano": "tdn", + "tonsea": "txs", + "tooro": "ttj", + "torau": "ttu", + "torwali": "trw", + "tsimihety malagasy": "xmw", + "tsotso": "lto", + "tswana": "tn", + "tugen": "tuy", + "tuki": "bag", + "tula": "tul", + "tulu": "tcy", + "tunen": "tvu", + "tungag": "lcm", + "tunisian arabic": "aeb", + "tupuri": "tui", + "turkana": "tuv", + "turkish": "tr", + "turkmen": "tk", + "tututepec mixtec": "mtu", + "twi": "tw", + "ubaghara": "byc", + "uighur": "ug", + "ukrainian": "uk", + "umbundu": "umb", + "upper sorbian": "hsb", + "urdu": "ur", + "ushojo": "ush", + "uzbek": "uz", + "vai": "vai", + "vietnamese": "vi", + "votic": "vot", + "võro": "vro", + "waci gbe": "wci", + "wadiyara koli": "kxp", + "waja": "wja", + "wakhi": "wbl", + "wanga": "lwg", + "wapan": "juk", + "warji": "wji", + "welsh": "cy", + "wemale": "weo", + "western frisian": "fy", + "western highland purepecha": "pua", + "western juxtlahuaca mixtec": "jmx", + "western maninkakan": "mlq", + "western mari": "mrj", + "western niger fulfulde": "fuh", + "western panjabi": "pnb", + "wolof": "wo", + "wuzlam": "udl", + "xanaguía zapotec": "ztg", + "xhosa": "xh", + "yace": "ekr", + "yakut": "sah", + "yalahatan": "jal", + "yanahuanca pasco quechua": "qur", + "yangben": "yav", + "yaqui": "yaq", + "yauyos quechua": "qux", + "yekhee": "ets", + "yiddish": "yi", + "yidgha": "ydg", + "yoruba": "yo", + "yutanduchi mixtec": "mab", + "zacatlán-ahuacatlán-tepetzintla nahuatl": "nhi", + "zarma": "dje", + "zaza": "zza", + "zulu": "zu", + "ömie": "aom", +} + +LANG_NAMES = set(LANG_NAME_TO_ID.keys()) +LANG_IDS = set(LANG_NAME_TO_ID.values()) + +# Exceptions where .title() doesn't match the canonical casing from the TSV. 
# Names whose display form differs from what str.title() would produce.
_TITLE_EXCEPTIONS = {
    "fe'fe'": "Fe'fe'",
    "dũya": "Dũya",
    "santiago del estero quichua": "Santiago del Estero Quichua",
    "santa ana de tusi pasco quechua": "Santa Ana de Tusi Pasco Quechua",
    "malinaltepec me'phaa": "Malinaltepec Me'phaa",
    "tlacoapa me'phaa": "Tlacoapa Me'phaa",
}


def lang_display_name(name: str) -> str:
    """Convert a lowercase language name into its display form.

    Names listed in ``_TITLE_EXCEPTIONS`` use the hand-written casing stored
    there (apostrophes, lowercase "de"/"del"); every other name is simply
    title-cased with ``str.title()``.
    """
    title_cased = name.title()
    if name in _TITLE_EXCEPTIONS:
        return _TITLE_EXCEPTIONS[name]
    return title_cased
from typing import List, Optional


# Characters after which the text may be cut into a new sentence
# (half-width followed by full-width forms).
SPLIT_PUNCTUATION = set(".,;:!?。,;:!?")

# Closing quotes / brackets that belong to the sentence that just ended.
# The curly quotes are spelled as escapes on purpose: written literally they
# are easily normalised to ASCII quotes, which silently turns the literal into
# an implicit string concatenation and drops them from the set.
CLOSING_MARKS = set("\"'\u201d\u2019)]》>」】")

# Characters that already count as terminal punctuation for add_punctuation().
# NOTE(review): the curly-quote entries are assumed to be the closing forms
# (U+201D / U+2019) — confirm against the canonical upstream file.
END_PUNCTUATION = {
    # Half-width
    ";",
    ":",
    ",",
    ".",
    "!",
    "?",
    "…",
    ")",
    "]",
    "}",
    '"',
    "'",
    # Curly closing quotes, written as escapes (see CLOSING_MARKS). A bare
    # ``"""`` here would open a triple-quoted string and swallow every entry
    # up to the next one.
    "\u201d",
    "\u2019",
    # Full-width
    ";",
    ":",
    ",",
    "。",
    "!",
    "?",
    "、",
    # Two characters long, so it can never equal ``text[-1]``; kept for
    # parity with the original table.
    "……",
    ")",
    "】",
}


ABBREVIATIONS = {
    "Mr.",
    "Mrs.",
    "Ms.",
    "Dr.",
    "Prof.",
    "Sr.",
    "Jr.",
    "Rev.",
    "Fr.",
    "Hon.",
    "Pres.",
    "Gov.",
    "Capt.",
    "Gen.",
    "Sen.",
    "Rep.",
    "Col.",
    "Maj.",
    "Lt.",
    "Cmdr.",
    "Sgt.",
    "Cpl.",
    "Co.",
    "Corp.",
    "Inc.",
    "Ltd.",
    "Est.",
    "Dept.",
    "St.",
    "Ave.",
    "Blvd.",
    "Rd.",
    "Mt.",
    "Ft.",
    "No.",
    "Jan.",
    "Feb.",
    "Mar.",
    "Apr.",
    "Aug.",
    "Sep.",
    "Sept.",
    "Oct.",
    "Nov.",
    "Dec.",
    "i.e.",
    "e.g.",
    "vs.",
    "Vs.",
    "Etc.",
    "approx.",
    "fig.",
    "def.",
}


def _split_into_sentences(text: str) -> List[List[str]]:
    """Cut *text* into sentences (lists of characters) at SPLIT_PUNCTUATION."""
    sentences: List[List[str]] = []
    current: List[str] = []

    for char in text:
        # Punctuation or a closing mark arriving right after a split belongs
        # to the sentence that was just closed ("?!", "...", '."').
        if (
            not current
            and sentences
            and (char in SPLIT_PUNCTUATION or char in CLOSING_MARKS)
        ):
            sentences[-1].append(char)
            # Do not fall through to the split logic: that would append an
            # empty sentence, and the next mark of the run would then attach
            # to it and become an orphan sentence of its own.
            continue

        current.append(char)
        if char not in SPLIT_PUNCTUATION:
            continue

        if char == ".":
            # A period that finishes a known abbreviation does not end the
            # sentence ("Dr. Smith").
            words = "".join(current).split()
            if words and words[-1] in ABBREVIATIONS:
                continue

        sentences.append(current)
        current = []

    # Whatever trails the last split point is a sentence too.
    if current:
        sentences.append(current)
    return sentences


def _merge_sentences(
    sentences: List[List[str]], chunk_len: int
) -> List[List[str]]:
    """Greedily pack consecutive sentences into chunks of <= chunk_len chars."""
    chunks: List[List[str]] = []
    current: List[str] = []
    for sentence in sentences:
        if len(current) + len(sentence) <= chunk_len:
            current.extend(sentence)
            continue
        if current:
            chunks.append(current)
        # A sentence longer than chunk_len is not split any further; it
        # simply becomes an oversized chunk.
        current = list(sentence)
    if current:
        chunks.append(current)
    return chunks


def _absorb_short_chunks(
    chunks: List[List[str]], min_chunk_len: int
) -> List[List[str]]:
    """Fold chunks shorter than min_chunk_len into a neighbouring chunk."""
    first_is_short = bool(chunks) and len(chunks[0]) < min_chunk_len
    result: List[List[str]] = []
    for index, chunk in enumerate(chunks):
        if not result:
            # The first chunk has no predecessor to merge into.
            result.append(chunk)
        elif (index == 1 and first_is_short) or len(chunk) < min_chunk_len:
            # Either the first chunk was short (so it takes its successor),
            # or this chunk is short (so it joins its predecessor).
            result[-1].extend(chunk)
        else:
            result.append(chunk)
    return result


def chunk_text_punctuation(
    text: str,
    chunk_len: int,
    min_chunk_len: Optional[int] = None,
) -> List[str]:
    """Split text into chunks at punctuation, respecting abbreviations.

    The text is processed character by character: it is first cut into
    sentences at ``SPLIT_PUNCTUATION`` (a period that completes an entry of
    ``ABBREVIATIONS``, e.g. "Mr." or "No.", does not cut), then consecutive
    sentences are packed into chunks.

    Args:
        text: The text to split.
        chunk_len: Maximum number of characters packed into one chunk. A
            single sentence longer than this is kept whole.
        min_chunk_len: If given, chunks shorter than this are merged into the
            previous chunk; a short first chunk is merged with the next one.
            Merged chunks may exceed ``chunk_len``.

    Returns:
        The chunks as stripped, non-empty strings, in text order.
    """
    # 1. Split the text into sentences at punctuation.
    sentences = _split_into_sentences(text)

    # 2. Merge short sentences into chunks of at most chunk_len characters.
    chunks = _merge_sentences(sentences, chunk_len)

    # 3. Post-process: merge undersized chunks into a neighbour.
    if min_chunk_len is not None:
        chunks = _absorb_short_chunks(chunks, min_chunk_len)

    stripped = ("".join(chunk).strip() for chunk in chunks)
    return [chunk_str for chunk_str in stripped if chunk_str]


def add_punctuation(text: str) -> str:
    """Append a final period when the text lacks end punctuation.

    Args:
        text: Input text; surrounding whitespace is stripped.

    Returns:
        The stripped text, unchanged when it is empty or its last character
        is in ``END_PUNCTUATION``. Otherwise "。" is appended if the text
        contains any character in U+4E00..U+9FFF, else ".".
    """
    text = text.strip()
    if not text or text[-1] in END_PUNCTUATION:
        return text

    is_chinese = any("\u4e00" <= char <= "\u9fff" for char in text)
    return text + ("。" if is_chinese else ".")


# ---------------------------------------------------------------------------
# Prelude of omnivoice/utils/voice_design.py. It shares the last source line
# of this span, so it is kept here; the instruct tables that follow in the
# dump rely on ``_ZH_RE``.
# ---------------------------------------------------------------------------
import re

_ZH_RE = re.compile(r'[\u4e00-\u9fff]')
# Each entry is one group of mutually exclusive speaker attributes.
#   * dict -> {english: chinese} pairs that translate into each other
#   * set  -> single-language tags (accents: English, dialects: Chinese)
_INSTRUCT_CATEGORIES = [
    # Gender
    {"male": "男", "female": "女"},
    # Age
    {
        "child": "儿童",
        "teenager": "少年",
        "young adult": "青年",
        "middle-aged": "中年",
        "elderly": "老年",
    },
    # Pitch
    {
        "very low pitch": "极低音调",
        "low pitch": "低音调",
        "moderate pitch": "中音调",
        "high pitch": "高音调",
        "very high pitch": "极高音调",
    },
    # Whisper
    {"whisper": "耳语"},
    # Accent (English-only, no Chinese counterpart)
    {
        "american accent",
        "british accent",
        "australian accent",
        "chinese accent",
        "canadian accent",
        "indian accent",
        "korean accent",
        "portuguese accent",
        "russian accent",
        "japanese accent",
    },
    # Dialect (Chinese-only, no English counterpart)
    {
        "河南话",
        "陕西话",
        "四川话",
        "贵州话",
        "云南话",
        "桂林话",
        "济南话",
        "石家庄话",
        "甘肃话",
        "宁夏话",
        "青岛话",
        "东北话",
    },
]

# Translation tables (both directions) and, per category, the set of tags
# that exclude one another.
_INSTRUCT_EN_TO_ZH = {}
_INSTRUCT_ZH_TO_EN = {}
_INSTRUCT_MUTUALLY_EXCLUSIVE = []
for _cat in _INSTRUCT_CATEGORIES:
    if not isinstance(_cat, dict):
        # Accent / dialect sets have nothing to translate.
        _INSTRUCT_MUTUALLY_EXCLUSIVE.append(set(_cat))
        continue
    _INSTRUCT_EN_TO_ZH.update(_cat)
    _INSTRUCT_ZH_TO_EN.update(zip(_cat.values(), _cat.keys()))
    _INSTRUCT_MUTUALLY_EXCLUSIVE.append({*_cat.keys(), *_cat.values()})

# Every accepted tag. The dict categories contribute their English keys and
# Chinese values, the two trailing sets contribute accents and dialects, so
# the union of all exclusivity groups is exactly the full vocabulary.
_INSTRUCT_ALL_VALID = set().union(*_INSTRUCT_MUTUALLY_EXCLUSIVE)

# Partition by script: a tag is Chinese when it contains a CJK character.
_INSTRUCT_VALID_ZH = frozenset(filter(_ZH_RE.search, _INSTRUCT_ALL_VALID))
_INSTRUCT_VALID_EN = frozenset(_INSTRUCT_ALL_VALID - _INSTRUCT_VALID_ZH)
"instruct_ratio": 0.0, + "only_instruct_ratio": 0.0, + + "resume_from_checkpoint": null, + "init_from_checkpoint": "oddadmix/lahgtna-omnivoice-v2", + + "learning_rate": 1e-5, + "weight_decay": 0.01, + "max_grad_norm": 1.0, + "steps": 5000, + "seed": 42, + "warmup_type": "ratio", + "warmup_ratio": 0.01, + "warmup_steps": 0, + + "batch_tokens": 4096, + "gradient_accumulation_steps": 2, + "num_workers": 3, + + "mixed_precision": "bf16", + "allow_tf32": true, + "attn_implementation": "sdpa", + + "logging_steps": 50, + "eval_steps": 500, + "save_steps": 500, + "keep_last_n_checkpoints": -1 +} diff --git a/sync_data/configs/data_saudi.json b/sync_data/configs/data_saudi.json new file mode 100644 index 0000000000000000000000000000000000000000..62db32ff7ede8e2787aac0c3b91157127ca1df26 --- /dev/null +++ b/sync_data/configs/data_saudi.json @@ -0,0 +1,16 @@ +{ + "train": [ + { + "language_id": "ar", + "manifest_path": ["/home/riftuser/OmniVoice/sync_data/tokens/train/data.lst"], + "repeat": 1 + } + ], + "dev": [ + { + "language_id": "ar", + "manifest_path": ["/home/riftuser/OmniVoice/sync_data/tokens/dev/data.lst"], + "repeat": 1 + } + ] +} diff --git a/sync_data/data/dev_raw.jsonl b/sync_data/data/dev_raw.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..44c9ae666d7bb98f654b4f1ebac3dea0228b0a92 --- /dev/null +++ b/sync_data/data/dev_raw.jsonl @@ -0,0 +1,15 @@ +{"id": "sample_000194", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000194.wav", "text": "تحت نور القمر، الشوارع لنا.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000186", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000186.wav", "text": "قبل السباق، نعطيهم تمر للطاقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000122", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000122.wav", "text": "الحراس جايين! 
بسرعة، اختبي في الظلام!", "language_id": "ar", "instruct": "saudi, conversational, tense"} +{"id": "sample_000089", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000089.wav", "text": "شوف! لقيت عملة من زمان!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000090", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000090.wav", "text": "نجهز للحج! وش لازم نجيب معنا؟", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000307", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000307.wav", "text": "يلا نبدا الدرس! تأكد من HP مالك وتحضّر للمستوى الجديد على صفحة ستة وخمسين.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000119", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000119.wav", "text": "يا هيوستن، طرنا! مهمتنا للمريخ تبدأ الحين!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000452", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000452.wav", "text": "شغل عدل! ادزلبتس استراتيجية الدفاع عن الواحة. خلنا نحتفل بكوب من القهوة مع GameMaster. رقم الصفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000127", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000127.wav", "text": "الحراس يغيرون النوبة. 
هذي فرصتنا نتسلل!", "language_id": "ar", "instruct": "saudi, conversational, tense"} +{"id": "sample_000146", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000146.wav", "text": "الواحة تطلع مثل السراب، بس صدق.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000189", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000189.wav", "text": "غير الكفر بسرعة، ولا ما نطلع من هنا!", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000200", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000200.wav", "text": "غلي المويه، بعدين حط الهيل عشان الطعم يصير مضبوط.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000215", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000215.wav", "text": "افرك الورق على الجرح، العشب هذا مستخدم من زمان.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000016", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000016.wav", "text": "لازم نعبي موية زيادة للسفرة!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000132", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000132.wav", "text": "النجوم ترشدنا بليل البر.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} diff --git a/sync_data/data/train_raw.jsonl b/sync_data/data/train_raw.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..33c3b92aaec619d50667738a58d4ae43548126b9 --- /dev/null +++ b/sync_data/data/train_raw.jsonl @@ -0,0 +1,280 @@ +{"id": "sample_000364", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000364.wav", "text": "مبروك! انت حليت آخر لغز وربحت جائزة مميزة. 
تْشوف التفاصيل في صفحة مئة واثنين وأربعين!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000257", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000257.wav", "text": "شوف! هذا نجم الشمال. المسافرين أول كانوا يستخدمونه عشان يلقون طريقهم في الصحرا.", "language_id": "ar", "instruct": "saudi, conversational, awe-inspired"} +{"id": "sample_000099", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000099.wav", "text": "يالله بسرعة! عدل حلاوة العيد قبل لا يجون الضيوف!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000093", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000093.wav", "text": "ياسلام! هالفلس العتيق يضبط بالضبط في يد التمثال!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000427", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000427.wav", "text": "يلا نخلي اللغز ذا سوا! شوفي صفحة مئة واثنين وأربعين للhint.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000294", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000294.wav", "text": "استعدوا يا رجيل، العدو جاي من الشرق عقب خمس عشرة دقيقة. صفر واحد تسعة صفر اثنان اثنان واحد اثنان ثلاثة ثلاثة اثنان", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000426", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000426.wav", "text": "شفتي الجمل يرقص جنب الواحَهْ البارحة؟ كان العرض عند ستة وخمسين شارع النجدي. 
It was super!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000130", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000130.wav", "text": "الاستعداد لهالرحلة شرف ومسؤولية.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000229", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000229.wav", "text": "القنبلة جاية! خبّ نفسك!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000166", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000166.wav", "text": "يتقدمون علينا! اثبتوا!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000113", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000113.wav", "text": "من البيوت الطينية للأبراج الزجاجية، سماء الرياض تحكي قصة تطورنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000475", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000475.wav", "text": "حياك الله في مجلس الصحارى. مهمتك تبتدي عند الغروب بْتمامَه في ستة وخمسين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000330", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000330.wav", "text": "أبغى منك تنظم تجمع العائلة في ستة وخمسين شارع النجدي. لا تنسى ترسل الدعوة بواتساب.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000318", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000318.wav", "text": "شفت شي غريب حول الواحة القديمه ليلة أمس؟ هذا مكتوب في سجل رقم خمسة ستة.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000396", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000396.wav", "text": "حياكم الله في المجلس الاستراتيجي. 
فريقنا مِستعد يواجه تحديات جديدة في صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000161", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000161.wav", "text": "كل خطوة قدام تقربنا من النصر.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000005", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000005.wav", "text": "الرمل يشبه كثبان الفضا من فوق!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000447", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000447.wav", "text": "لو سمحت اشرح لي كيف تعاملت مع موقف صعب في شغلك الأخير في Office خمسة وستين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000180", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000180.wav", "text": "يالله نلعب! أراهن إني أفوز!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000213", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000213.wav", "text": "تدرب على الدعوات، بتساعدك في الرحلة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000148", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000148.wav", "text": "أسمع شي بالظلام... يمكن الهوى.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000375", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000375.wav", "text": "يا شريك، صار الوقت نقول وداع! 
لا تنسى puzzle party يوم الجمعة في خمسة ستة شارع النجدي!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000214", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000214.wav", "text": "رم الشبكة لما تكون الموية راكدة، الصبر هو المفتاح.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000163", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000163.wav", "text": "لا تطلقون النار لين يقربون!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000009", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000009.wav", "text": "طور شريحة السرعة في جملك الروبوت للسباق!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000438", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000438.wav", "text": "قم اتبع الصقر إلى الواحَهْ باستخدام الmap في صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000256", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000256.wav", "text": "ياخي، اللعبة ذي بالنظارات قوية! حسيت إني أطير فوق الرياض صدق!", "language_id": "ar", "instruct": "saudi, conversational, thrilled"} +{"id": "sample_000188", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000188.wav", "text": "دوس بنزين! قاعدين نفقدهم!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000343", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000343.wav", "text": "شوف ذا النقش القديم! كنه لقى كنز مخفي في صفحة مئة واثنين وأربعين من دليل اللعبة.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000123", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000123.wav", "text": "الأكسجين عندنا في خطر. 
لازم نوسع البيت الأخضر على طول.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000136", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000136.wav", "text": "كل خطوة تقربك من الراحة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000370", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000370.wav", "text": "قال الملك، 'وشلون الجمل صار في ستة وخمسين شارع Royal؟' يمكن يتفرج على Netflix!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000403", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000403.wav", "text": "مرحبًا بك في المقابلة! اجلس واستمتع بـ qahwa المشهورة في صفحة ستة وخمسين من دليلنا.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000287", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000287.wav", "text": "يا ولد العم! شفت الجمل الجديد للجد؟ اسمه 'Speedster ثلاثة آلاف'!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000160", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000160.wav", "text": "استعدوا، المعركة هذي بتكون صعبة.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000112", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000112.wav", "text": "الهبوب جاي! بسرعة، اضرب الخيمة وربط البعارين!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000460", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000460.wav", "text": "العدو قْرب من الكْثبان. جهز حرس القَصر للمعركة على صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000440", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000440.wav", "text": "يا هلا بالمسافر، مرحب فيك عند Checkpoint خمسة ستة! 
استانس بسباقات الجمال اللي عندنا.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000260", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000260.wav", "text": "الرموز القديمة على صخرة الفيل تطلع بس تحت ضوء القمر. يلا، فك رموزها قبل يطلع الفجر!", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000157", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000157.wav", "text": "إذا سجلنا الحين، المشروبات علي!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000350", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000350.wav", "text": "جهّز نفسَك للحج! تأكد من Google Maps وقابلنا عند الوّاحات في ستة وخمسين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000404", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000404.wav", "text": "مرحبتس في Strategy واحد صفر واحد: كيف تدزلب الجمال وتغلب الكثبان! صفحة مئة وثلاثة وعشرون", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000305", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000305.wav", "text": "تذكر كيف كان الهوا الصحراوي على الكْثُبان عند الأوسيَس صفحة مئة واثنان وأربعون؟", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000419", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000419.wav", "text": "تعلمتوا الاستراتيجيات. الحين طبقوها علشان تغزون أراضي الصحرة. 
شوفوا صفحة مئة واثنين وأربعين في كتاب Game Manual.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000209", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000209.wav", "text": "قل القصيدة الحربية، تذكرنا بشجاعة أجدادنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000010", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000010.wav", "text": "يلّا نلحق القطار قبل لا يفوتنا!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000236", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000236.wav", "text": "لازم نسيطر على الأهرامات، اندفع الحين!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000101", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000101.wav", "text": "تجهيز شنطة الحج مثل حل بازل مقدس!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000367", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000367.wav", "text": "لا تْقَلِق، الجمل عنده GPS. بس اتبع الإحداثيات على صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000342", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000342.wav", "text": "عرضك ظليل. ما أقدر أبيع بأقل من مئتي coins.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000477", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000477.wav", "text": "حياكم في المهرجان! تعالوا عند بوث خمسة ستة لتجربة ما تنساها.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000423", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000423.wav", "text": "هلا بك يا قايد! حضّر عساكرك للمهمة الأولى في واحة خمسة وستين! 
شوف الخريطة في جهازك الـTablet.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000253", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000253.wav", "text": "الشاورما خلصت! دق على المورد واطلب دجاج زيادة، بسرعة!", "language_id": "ar", "instruct": "saudi, conversational, busy"} +{"id": "sample_000446", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000446.wav", "text": "تْذكّر أيام الوَاحَه، وين وُلِدَت Legends في صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000140", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000140.wav", "text": "الدلة جاهزة! اسكبها للمعازيم.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000109", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000109.wav", "text": "بالعيد، نبدا بزيارة الكبار، وبعدين الصغار. كذا نكرم الحكمة والبراءة مع بعض.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000450", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000450.wav", "text": "ليه الجمل ما رضى يتفاهم؟ خايف يقسم له الoasis برقم ستة وخمسين!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000164", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000164.wav", "text": "نكسر خط دفاعهم هنا، نفوز بالمعركة.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000273", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000273.wav", "text": "يلا، لازم نهرب قبل ما الجمل يزهق! 
ليفل خمسة يستنانا.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000366", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000366.wav", "text": "في الصحرَهْ، لقيت رسالة قديمة من جدّي على صفحة اثنين وأربعين في كتاب Desert Wisdom.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000153", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000153.wav", "text": "إذا ما سرعت، بنخسر قدام سكوتر!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000107", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000107.wav", "text": "في العيد، طلبت بساط طاير، بس جاني مكنسة كهرب بداله!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000359", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000359.wav", "text": "حذر! ذا الجمل يحب ياكل guidebooks. تلقاه عند ستة وخمسين شارع النجدي بعد العصر.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000117", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000117.wav", "text": "امش في الجسر اللي بين أبراج مركز المملكة. شوف الرياض من فوق، يا سلام!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000008", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000008.wav", "text": "كسّر الجدار عشان توقف هجمة الفيروس!", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000223", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000223.wav", "text": "اللي ينسى أصله، ما له مستقبل.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000339", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000339.wav", "text": "هلا والله! أنا دليلك اللي بيخذك في مغامرة وسط النْجود. 
يلا نبدأ من ستة وخمسين شارع النجدي!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000182", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000182.wav", "text": "الجمل ذا سلالته صافية، يستاهل كل ريال.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000013", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000013.wav", "text": "اقبض الطارات على السيف وقت العرضة!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000196", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000196.wav", "text": "الأساطير تنولد هنا، في شوارعنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000226", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000226.wav", "text": "عبي الشوزن، قاعدين يهجمون علينا!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000221", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000221.wav", "text": "الصحراء تعلم الصبر، مثل ما يقولون البدو.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000243", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000243.wav", "text": "خلك منتبه! إذا أطلقت الصقر بدري، بنخسر السباق!", "language_id": "ar", "instruct": "saudi, conversational, competitive"} +{"id": "sample_000474", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000474.wav", "text": "خلنا نخطط الstrategy لوحتنا! 
اتصل على صفر واحد تسعة صفر اثنان اثنان واحد اثنان ثلاثة ثلاثة اثنان لعقد مجلس العائلة.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000293", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000293.wav", "text": "تذكر سوالف جدودنا واحنا واقفين عند بوابة المدينة رقم خمسة ستة جنب برج Kingdom.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000175", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000175.wav", "text": "طلقة مضبوطه تغير كل شي.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000275", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000275.wav", "text": "ليش الجمل دخل قبيلة Puzzle؟ عشان يحل ألغاز القفر في level ثلاثة!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000106", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000106.wav", "text": "من جسر السما في برج المملكة بالرياض، تقدر تلمس الغيوم!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000456", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000456.wav", "text": "جهز الإمدادات لرحلتنا. لازم نوصل ل Oasis أربعة اثنان قبل غروب الشمس.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000480", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000480.wav", "text": "علمتني وش شفت في ستة وخمسين شارع النجدي. 
كان فيه أحد مشبوه؟", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000141", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000141.wav", "text": "كل خطوة بهالرحلة تقربك للإيمان.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000241", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000241.wav", "text": "الهوى قوي الليلة، ثبت الخيمة زين في الأرض.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000421", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000421.wav", "text": "اقوموا يا محاربين! حتى الـcamel تخاف من خطتنا! الصفحة مئة واثنان وأربعون في الدليل يشرح الخطة.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000126", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000126.wav", "text": "عشان نوصل للمرحلة الجاية، لازم نعيد ترتيب هالرموز العتيقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000245", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000245.wav", "text": "احمِ المها من الصيادين، هي مهددة بالانقراض وتحتاج حمايتنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000444", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000444.wav", "text": "اسرع، حل اللغز! 
المفتاح مخفي في صفحة مئة واثنين وأربعين من الmanual.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000199", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000199.wav", "text": "افرد العجينة وزيد الصلصة بترتيب.", "language_id": "ar", "instruct": "saudi, conversational, creative"} +{"id": "sample_000207", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000207.wav", "text": "جدل السعف زين، السلة بتشيل كثير تمر.", "language_id": "ar", "instruct": "saudi, conversational, creative"} +{"id": "sample_000259", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000259.wav", "text": "الشركة حقتنا في البلوكتشين محتاجة استثمار أكثر. يلا نعرض على شركات التمويل الجريء السعودية!", "language_id": "ar", "instruct": "saudi, conversational, ambitious"} +{"id": "sample_000152", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000152.wav", "text": "الموضوع مو بس عن الفوز، هو عن الدقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000195", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000195.wav", "text": "آخر جولة، ما بقى إلا المحترفين!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000108", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000108.wav", "text": "اللفافة العتيقة تقول: 'الكنز في المكان اللي ظل أطول منارة يبوس أقدم بير وقت العصر.'", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000095", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000095.wav", "text": "ياهوه! 
جا وقت نعلق فوانيس رمضان!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000145", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000145.wav", "text": "الخريطة تقول الكنز مدفون تحت الرمال اللي تتحرك.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000135", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000135.wav", "text": "إذا ما قتلنا الزحمة، الحر بيقتلنا!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000159", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000159.wav", "text": "نتقدم، لا توقفون الحين!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000198", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000198.wav", "text": "حرك الجريش لين يثقل، وبعدين زيد البهارات.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000387", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000387.wav", "text": "مبروك! الحين وصلت لآخر level، يلا نحتفل في ستة وخمسين شارع.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000219", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000219.wav", "text": "امش بالليل عشان تتفادى حرارة الصحرا.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000277", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000277.wav", "text": "ليه الجمل قدّم للوظيفَه؟ يبي stable position!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000268", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000268.wav", "text": "الفخ النبطي القديم يشتغل! 
بسرعة، حل لغز خريطة النجوم عشان توقفه!", "language_id": "ar", "instruct": "saudi, conversational, thrilling"} +{"id": "sample_000190", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000190.wav", "text": "حط القذائف مضبوط، لو غلطنا نضيع!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000328", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000328.wav", "text": "وش عذرتس يا بنت عند ستة وخمسين شارع الواحة الساعه سبعة؟ شفتي سباق الجمل؟", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000179", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000179.wav", "text": "إذا تثق بصقرك، بيجيب الفريسة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000014", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000014.wav", "text": "الغرفة السرية تكشف الأسرار القديمة وقت الغروب.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000147", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000147.wav", "text": "القافلة تريح، بس البر ما ينام.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000232", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000232.wav", "text": "حط الحاجز هنا، بيكون وقفتنا الأخيرة.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000011", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000011.wav", "text": "ساعدني ألقى حرامي الضايع في خربطة الشنط!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000105", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000105.wav", "text": "تجهيز شنط الحج مثل حل لعبة المكعبات بحبات السبحة!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000225", "audio_path": 
"/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000225.wav", "text": "عبي السلاح بسرعة! إحنا تحت ضرب النار!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000178", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000178.wav", "text": "عجل بالحصان! قربنا نوصل للنهاية!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000181", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000181.wav", "text": "لازم نلقى لنا ملجأ قبل ما تجي العاصفة.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000121", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000121.wav", "text": "البطولة لنا! تعب فريقنا ما راح على الفاضي!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000270", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000270.wav", "text": "العملاق النبطي قاعد يصحى! بسرعة، استخدم العود السحري حقك عشان تصعقه، وبعدين اضربه بسيفك المعقوف!", "language_id": "ar", "instruct": "saudi, conversational, epic"} +{"id": "sample_000373", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000373.wav", "text": "دور على المفتاح المخفي عشان تفتح الباب، وإلا بتظل هنا للأبد! ترى أقرب phone في الدور ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000285", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000285.wav", "text": "دور على الأثر القديم في manual صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000283", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000283.wav", "text": "بسرعة! لازم نلقى الscroll المخفي في المكتبه قبل يرجعون الحراس! 
المكتبه في دور ثلاثة.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000206", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000206.wav", "text": "حكم الخيوط زين، هالسجادة بتحمل قصص أهلنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000262", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000262.wav", "text": "لازم نوازن بين الاستدامة والتقنية. زيد المزارع العمودية في المنطقة خمسة.", "language_id": "ar", "instruct": "saudi, conversational, innovative"} +{"id": "sample_000435", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000435.wav", "text": "بسرعة، علمني، وش هي عاصمة أستراليا؟ تلميحة: شوفي صفحة مئة واثنين وأربعين في الڤايد!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000291", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000291.wav", "text": "اتفقت القبيلة! بنلتقي في واحة ستة وخمسون. جبو أحسن strategies عندكم!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000280", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000280.wav", "text": "عدّينا كثبان واجد سوا؛ وداعًا يا رفيقي لين نرجع نلتقي في ليفل عشرة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000174", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000174.wav", "text": "محاصرين! لازم نطلعهم بقوة!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000197", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000197.wav", "text": "سر الكبسة يجي مع ظبط البهارات.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000278", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000278.wav", "text": "دير بالك! الجمل سرق قْهَوَتِس في المهرجان! 
الظاهر إنه عنده مئة HP مثل Boss في اللعبة!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000346", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000346.wav", "text": "هلا بك في الفريق، يا القايد. قاعدتك على ستة وخمسين شارع النجدي. جهّز استراتيجيتك.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000228", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000228.wav", "text": "عجل! عبي المدفع قبل ما يضربونا!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000170", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000170.wav", "text": "ما عندنا مؤونة كفاية، لازم ننسحب!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000234", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000234.wav", "text": "تراجع الحين! تجمع عند نقطة التفتيش الجاية!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000125", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000125.wav", "text": "ثلاثة، اثنان، واحد، انطلق! 
دعس البنزين وكون الأول!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000208", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000208.wav", "text": "طبخ الرز واللحم على نار هادية، الضيوف بيجون قريب.", "language_id": "ar", "instruct": "saudi, conversational, creative"} +{"id": "sample_000151", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000151.wav", "text": "العدو قريب، لازم نهرب الحين!", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000102", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000102.wav", "text": "من فوق برج المملكة، تقدر تشوف باكر من اليوم!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000204", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000204.wav", "text": "الصبار فيه موية تكفينا أيام.", "language_id": "ar", "instruct": "saudi, conversational, creative"} +{"id": "sample_000120", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000120.wav", "text": "يا ربعي! علامة الضرب تبين المكان على خريطة الكنز ذي!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000203", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000203.wav", "text": "خيط الشماغ للعريس، خلي الأطراف زينة.", "language_id": "ar", "instruct": "saudi, conversational, creative"} +{"id": "sample_000424", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000424.wav", "text": "يا بطل! وش اسمك الملحمي قبل نخلّص العالم؟ حط اسمك هنا وشوف صفحة مئة وواحد.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000003", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000003.wav", "text": "حطينا في الدرعية القديمة! 
يالله بسرعة، عدّل طريقة كلامك لا يحسبونا جنّ الناس هنا!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000379", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000379.wav", "text": "ذي الأحجار القديمة في Ruins خمسة ستة تحكي سَالْفة ضايعة مع الزمان.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000381", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000381.wav", "text": "العدو جاي من ورى الرِّمال. جهّز الدفاعات في سيكتور خمسة ستة.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000454", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000454.wav", "text": "أذكر الأيام الزينات في الصحرا تحت ضو الگمر. تشبه Level خمسة باللعبة في صفحة مئة وثلاثة وعشرون.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000400", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000400.wav", "text": "النصر لنا! خلونا نركب الجمال ونحتفل، بس لا تنسى تسجل رقم خمسة ستة شارع النجدي في GPS.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000384", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000384.wav", "text": "لازم تسوي سعر أحسن لهالبضاعة، ولا بتخاطر تخسر الصفقة. شوف الشروط بصفحة ستة وخمسين من كتيب Steam.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000162", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000162.wav", "text": "اليد اللي ثابتة تفوز، مو اليد السريعة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000265", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000265.wav", "text": "الآلة القديمة تشتغل! 
بسرعة، حل لغز الكتابة القديمة عشان تدخل الغرفة المخبية!", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000248", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000248.wav", "text": "يبه، ليه صخرة الفيل وكل الصخور ذي أشكالها غريبة كذا؟", "language_id": "ar", "instruct": "saudi, conversational, curious"} +{"id": "sample_000104", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000104.wav", "text": "شد حيلك! هالبقي على الكثبان أخشن من بعير فيه الزغطة!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000006", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000006.wav", "text": "البس نظارة الواقع عشان تشوف خريطة الكنز المخبية!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000393", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000393.wav", "text": "مرحبا بكم في بداية مغامرتنا! خلونا نشوف وش فيه بصفحة ستة وخمسون سوى!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000139", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000139.wav", "text": "الرموز باهتة، بس معناها قوي.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000463", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000463.wav", "text": "بسرعة، قولي لي وش شفتي ب ستة وخمسين شارع النجدي! كان مجلس Falcon؟", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000453", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000453.wav", "text": "شوف ذا النقوش القديمة! يمكن تكشف secret strategy عن خطة سرية. 
العنوان: ستة وخمسون شارع النجدي", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000231", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000231.wav", "text": "تراجع للغطاء، الوضع هنا يضيع!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000279", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000279.wav", "text": "جَمِّع السرعة عشان تفوز بكأس الصحراء. Level ثلاثة ينتظرك.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000173", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000173.wav", "text": "طلقة مضبوطة يعني فوز مضبوط.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000264", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000264.wav", "text": "المنطقة الآمنة تضيق! روح برج المملكة عشان تاخذ ميزة المكان العالي!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000227", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000227.wav", "text": "سوق الدبابة للمرتفع، نحتاج الأفضلية!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000183", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000183.wav", "text": "الحصان باسمه القوي يشيل فخر القبيلة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000115", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000115.wav", "text": "النقوش في الحجر تتكلم عن طرق التجارة القديمة. تقدر تفك شفرة كلامها؟", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000114", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000114.wav", "text": "جا وقت نعلق فانوس رمضان! 
يلا ننور بيتنا للشهر الفضيل.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000103", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000103.wav", "text": "في العيد، تمنيت بعير... بس جاني لعبة محشية بداله!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000478", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000478.wav", "text": "مرحبا بكم يا رْجال الصغار! خلونا نبدا الدرس بقوة وشجاعة. شوفوا صفحة مئة واثنان وأربعون في كتاب game.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000097", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000097.wav", "text": "الجليب العتيق يوسوس، 'حط ريال وتمن أمنية!'", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000201", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000201.wav", "text": "هشّك الرز بالشوكة عشان يطلع خفيف.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000205", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000205.wav", "text": "دربه يرجع بالصيد، الثقة تبني مع الوقت.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000276", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000276.wav", "text": "سويتِها! الحين الصحراء صارت بأمان. تعالوا نلتقي عند الواحة لاحتفال كبير مع الشلة في ليفل واحد صفر!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000274", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000274.wav", "text": "ذا الجمل اللي في حوشتس عميل سري ولا بس يعشق قهوة وPlayStation؟", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000374", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000374.wav", "text": "حيّاكم الله في المهرجان! 
انبسطوا بسباق الجمال وخذوا تمرات تس مجانًا من booth خمسة ستة.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000266", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000266.wav", "text": "انتبه! درونات الشركات تفحص المكان. استخدم جهاز الإخفاء السايبر حقك عشان ما ينكشف وجودك.", "language_id": "ar", "instruct": "saudi, conversational, futuristic"} +{"id": "sample_000344", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000344.wav", "text": "حياك الله في هالمستوى من Puzzle! لا تضيع جملَك في متاهة النفود عند سبعة وستين شارع النبطي.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000184", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000184.wav", "text": "كلّم حصانك بهدوء، وبيثق فيك وقت المعركة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000224", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000224.wav", "text": "النار تدفي قلوبنا، والقصص تدفي الأرواح.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000252", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000252.wav", "text": "يالله، وزع الرماة على طرف طويق! ما نبي يخترقون القلعة!", "language_id": "ar", "instruct": "saudi, conversational, competitive"} +{"id": "sample_000212", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000212.wav", "text": "الغزلان سريعة، امش على آثارها بحذر.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000218", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000218.wav", "text": "النقوش هذي تحكي قصة سقوط المدينة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000401", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000401.wav", "text": "السلام عليكم يا زائرٍ نبيل. 
أنا رفيقتس اللي بيدزلك في هذي الألغاز الغامضة في Level خمسة من اللعبة.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000110", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000110.wav", "text": "هالنقوش على الحجر تبين ساعة الماء القديمة في مدائن صالح. تقدر تشغلها من جديد؟", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000142", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000142.wav", "text": "الرمل يخبّي أكثر من العظام.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000111", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000111.wav", "text": "ترتيب النقل الجماعي لحجاج حملتنا مثل حل لغز صعب. كل واحد له طلبات مختلفة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000192", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000192.wav", "text": "دوّر السيارة! 
خلك ملك الشوارع!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000239", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000239.wav", "text": "خل القهوة تغلي ببطء، الطعم يصير مضبوط.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000448", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000448.wav", "text": "هلا، لقيت الجمل الضايع؟ دور في صفحة مئة واثنين وأربعين من الmanual!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000202", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000202.wav", "text": "خيط الثوب زين، كل شي مهم للعريس.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000242", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000242.wav", "text": "ابني الطاحونة في وجه الهوى، لازم تدور بدون مشاكل.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000263", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000263.wav", "text": "دبابات العدو جاية من الشرق! حمّل الطلقات اللي تخترق الدروع وصوّب!", "language_id": "ar", "instruct": "saudi, conversational, intense"} +{"id": "sample_000267", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000267.wav", "text": "السباق يبدأ بعد خمسة دقايق! خذ لك سيارة سريعة وتعال لموقف استاد الملك فهد.", "language_id": "ar", "instruct": "saudi, conversational, satirical"} +{"id": "sample_000414", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000414.wav", "text": "بسرعة! حل اللغز عشان تفتح الجمل قبل توصلنا القبيلة المنافسة. 
لا تنسى تسجل الرقم في صفحة ستة وخمسون بالدليل!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000220", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000220.wav", "text": "خلنا ندور أفضل الصفقات في السوق قبل ما نكمل.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000129", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000129.wav", "text": "لازم نلقى المفتاح المخفي علشان ندخل القبر.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000457", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000457.wav", "text": "شوف ذا الباب القديم! يحسسك إنه Portal مخفي لعالم ثاني، في خريطة ليفل ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000308", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000308.wav", "text": "تسذكر يوم اجتمعنا تحت النجوم بالصحراء؟ كنا نسولف عن PlayStation وشارع خمسة ستة بالرياض.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000177", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000177.wav", "text": "النجوم بتدلنا بالليل.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000155", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000155.wav", "text": "ما نخليهم ياخذون التل. 
اثبتوا!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000012", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000012.wav", "text": "بسرعة، طلّق الصقور على الطريدة اللي تفر!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000399", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000399.wav", "text": "تأكد من إرسال تفاصيل المعاهدة إلى المجلس عند الساعة العاشرة صباحًا عبر email.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000018", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000018.wav", "text": "النجوم تدلنا في الرمل اللي ما له نهاية.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000271", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000271.wav", "text": "انتبِه لخطواتك. الطريق هنا في ليفل خمسة خطير جدًا وsteep.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000158", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000158.wav", "text": "ثبت في مكانك، لا تفقد تركيزك!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000250", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000250.wav", "text": "المترو زحمة! لازم نلقى طريق أسرع للمول.", "language_id": "ar", "instruct": "saudi, conversational, stressed"} +{"id": "sample_000131", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000131.wav", "text": "لازم نزين الخيمة للاحتفال الكبير!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000430", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000430.wav", "text": "هلا، خلونا نحفر هنا ونلقي treasure قديم! 
يمكن نلقى fossil الجمل؟ صفحة ثلاثمئة واثنا عشر في الدليل.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000412", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000412.wav", "text": "شفت سباق الجمال عند واحة ستة وخمسين؟ كان رهيب!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000238", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000238.wav", "text": "الهواء مثالي، خلنا نسبقهم لخط النهاية!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000144", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000144.wav", "text": "شلون كل محل هنا يبيع نفس الشي؟", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000340", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000340.wav", "text": "مبروك يا بطل! قِمتَ بقود قبيْلتك للمجد. لين نلتقي مرة ثانية في الويْحَه على صفحة مئة واثنين وأربعين من دليل اللعبة.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000193", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000193.wav", "text": "دوس بنزين! نتسابق على ملك الشوارع!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000326", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000326.wav", "text": "تذكر يوم اللي جالك تحدي وغلبته في مقابلة مع شركة RiyadhTech. رقم الصفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000137", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000137.wav", "text": "الرمل يتحرك مع كل خطوة، يخفي الطريق قدامنا.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000261", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000261.wav", "text": "الخط الأزرق زحمة! 
حوّل الركاب للخط الأخضر عشان يوصلون المول بالوقت!", "language_id": "ar", "instruct": "saudi, conversational, urgent"} +{"id": "sample_000211", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000211.wav", "text": "شوف رجوله، قوية. بيكون ممتاز للقافلة.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000118", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000118.wav", "text": "من بيوت الدرعية الطينية لأبراج الرياض، عمارتنا تحكي قصتنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000337", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000337.wav", "text": "سمعتْ؟ خُطتْ المَلِگ الجديدة سرية مثل رقم الهاتف صفر خمسة صفر واحد اثنان ثلاثة أربعة خمسة ستة سبعة!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000230", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000230.wav", "text": "حط المتفجرات وارجع، بنفجر الجسر!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000092", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000092.wav", "text": "قلت 'قزازة موية'، مو 'بعير موية'!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000240", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000240.wav", "text": "ارفع السيف فوق، العرضة بتبدأ قريب!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000433", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000433.wav", "text": "تتذكر حكايات Al-Majlis القديمة؟ دايمًا تلهم الشجاعة. تذكر صفحة مئة واثنان وأربعون بالكتاب.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000254", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000254.wav", "text": "عشان نفتح الدور الجاي، لازم نحل اللغز الاقتصادي ذا. 
فكر مثل المستثمر!", "language_id": "ar", "instruct": "saudi, conversational, challenging"} +{"id": "sample_000143", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000143.wav", "text": "الصقر حلق، ودانا وجهتنا.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000246", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000246.wav", "text": "يا سلام، قطعنا البلاد بساعة وحدة! الهايبرلوب ذا شي ثاني!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000128", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000128.wav", "text": "طيارات العدو وراك! سو حركات المراوغة!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000165", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000165.wav", "text": "قريبين نفوز! لا تخففون الضغط!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000169", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000169.wav", "text": "تحركوا! ما نقدر نجلس هنا أكثر!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000116", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000116.wav", "text": "المشوار لمكة مو بس للجسم. أنت جاهز روحياً للحج؟", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000168", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000168.wav", "text": "لو تسجل، العشا علي!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000297", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000297.wav", "text": "يا زين! جملنا جاب VIP pass للْحَجّ. 
دق على صفر خمسة صفر واحد اثنان ثلاثة أربعة خمسة ستة سبعة للتفاصيل.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000237", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000237.wav", "text": "نحتاج تعزيزات نفك الحصار عن بغداد!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000191", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000191.wav", "text": "انتبه من الحفر! ما نبي نطيح!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000468", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000468.wav", "text": "قوّموا المكينة! نْسابق صوب الويحات بسرعة صفر واحد تسعة صفر اثنان اثنان واحد اثنان ثلاثة ثلاثة اثنان!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000172", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000172.wav", "text": "بهالسرعة بنفوت العشا!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000100", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000100.wav", "text": "رمول الزمان خشت المفتاح؛ ما يشوفه إلا عين الصقر.", "language_id": "ar", "instruct": "saudi, conversational, mysterious"} +{"id": "sample_000323", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000323.wav", "text": "مرحبا بك في عالم الصحراء. استعد للمعركة، يا بطل. رحلتك تبدأ الآن في صفحة ستة وخمسون من Game Guide.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000134", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000134.wav", "text": "لقينا الخريطة! يالله نبدأ المغامرة!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000314", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000314.wav", "text": "حياك الله في مجلس التخطيط. من فضلك اذكر اسمك ودورك. 
حنا في غرفة رقم اثني عشر.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000394", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000394.wav", "text": "مرحبًا بك يا محارب. قول لنا عن رحلتك قُدّام المجلس على صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000149", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000149.wav", "text": "أحسن لك تكون أسرع من الهوى، وإلا راحت عليك!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000272", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000272.wav", "text": "صدى الحكمة القديمة باقي يتردد في هالأطلال المْنسية، كنك تلقى شي زي ليفل خمسة في لعبة strategy.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000133", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000133.wav", "text": "ضرب عازف العود على الأوتار، وبدأوا الناس يصفقون.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000244", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000244.wav", "text": "تأكد أن كل التوصيلات على الوقت، زحمة المدينة ممكن تسبب تأخير.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000015", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000015.wav", "text": "شوف! لقيت كتابة قديمة على الجدار ذا!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000091", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000091.wav", "text": "يبه، ليه نلبس ثياب يديدة للعيد؟", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000255", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000255.wav", "text": "وناسة عالم الشتاء! 
نجرب اللعبة الدوارة ولا صالة الجليد أول؟", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000418", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000418.wav", "text": "مرحبًا يا محاربين! خلونا نغزو الديرة ونسجل مئة نقطة!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000222", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000222.wav", "text": "القبيلة واقفة مع بعض مثل الجدار المتين.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000167", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000167.wav", "text": "يدك ثابتة وعقلك صافٍ، كذا تفوز.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000320", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000320.wav", "text": "مبروك! دزيت هالمستوى. لا تنسى تشيك على وقود الجمل بصفحة مئة واثنين وأربعين من الـ User Manual.", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000338", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000338.wav", "text": "تتذگر طعم القهوة مع التمر بالمجلس؟ أظن إنه كان في صالة رقم اثنان ثلاثة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000233", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000233.wav", "text": "حط الفخ هنا، بننصب لهم كمين مع الفجر.", "language_id": "ar", "instruct": "saudi, conversational, critical"} +{"id": "sample_000235", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000235.wav", "text": "دور القناص قبل ما يضرب مرة ثانية!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000295", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000295.wav", "text": "بسرعة، دوّر على المفتاح المخفي في القاعة majestic قبل ما يرجعون الحراس! 
شوف صفحة مئة واثنين وأربعين للخطوات.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000150", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000150.wav", "text": "لازم نرجع نرتب الصفوف قبل ما يهجمون مرة ثانية.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000216", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000216.wav", "text": "اطلع بحذر، أفضل التمر فوق.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000094", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000094.wav", "text": "شوف! الكعبة قدامنا على طول!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000171", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000171.wav", "text": "شكله الكفر انفجر، لازم نصلحه بسرعة.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000356", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000356.wav", "text": "ارسل الكشافة لجهة الكثبان الشرقية يدورون عن إشارات كمين. لا تنسى تحدث الخريطة في جهاز GPS اللي معك للإصدار اثنان ثلاثة.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000392", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000392.wav", "text": "تشوف الصقور يطرن فوق الجبل؟ خلنا ننضم معهم في الصفحة مئة واثنان وأربعون من Mountain Quest!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000138", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000138.wav", "text": "الغنم جاهزة! يالله نبدأ العزيمة!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000351", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000351.wav", "text": "حياكم الله في سوق التجارة! 
شوفوا السرج الخاص بالجمل عندنا بسعر مئة وتسعة وتسعين ريال!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000096", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000096.wav", "text": "التجهيز للحج مثل لعبة الطناخة بس حقيقية!", "language_id": "ar", "instruct": "saudi, conversational, humorous"} +{"id": "sample_000210", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000210.wav", "text": "السعر غالي، خلنا نتفق على شي ينفعنا اثنيننا.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000251", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000251.wav", "text": "متأكد إن البالون ذا يشيلنا كلنا؟ أحس إنه يترنح شوي فوق!", "language_id": "ar", "instruct": "saudi, conversational, nervous"} +{"id": "sample_000258", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000258.wav", "text": "ياخي، هذا مرشد سياحي تصويري؟ كأنك تمشي بالماضي والمستقبل مع بعض!", "language_id": "ar", "instruct": "saudi, conversational, curious"} +{"id": "sample_000124", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000124.wav", "text": "العاصفة تقرب! 
لازم نوصل المنطقة الآمنة بسرعة!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000269", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000269.wav", "text": "فوت الكورة بين رجلين المدافع وسو تمريرة حايط مع ربيعك عشان تسجل في زقاق السوق!", "language_id": "ar", "instruct": "saudi, conversational, energetic"} +{"id": "sample_000455", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000455.wav", "text": "اذكر يوم تعلمنا عن قبائل Bedouin وتقاليدهم في صفحة أربعة وثلاثين من كتاب History.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000467", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000467.wav", "text": "لقَ ذا الواحَهْ المخفّية قبل غروب الشمس وقَم المخيم قريب من الأنقاض القديمة في شارع الملك عبد الله رقم سبعة تسعة.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000247", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000247.wav", "text": "شوف الحفلة التصويرية ذي! التقنية غيرت جو الترفيه عندنا بشكل!", "language_id": "ar", "instruct": "saudi, conversational, amazed"} +{"id": "sample_000185", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000185.wav", "text": "حصان الفارس أعز صديق له وقت القتال.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000154", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000154.wav", "text": "باقي لك هدف واحد. خلها تضبط.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000442", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000442.wav", "text": "الاجتماع العائلي في الواحة مهم جداً. 
لازم نسوي الخطة في صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000249", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000249.wav", "text": "السيارات الكهربا ساكتة مرة! كأننا نسابق في المستقبل!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000459", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000459.wav", "text": "تذكر يوم تسابقنا فوق الكْثبان الرملية في Desert Racer، ندزلب غروب الشمس؟ قابلني عند ستة وخمسين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000156", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000156.wav", "text": "قريب توصل، باقي لك لفة!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000007", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000007.wav", "text": "برمج الدرونات تسقي المزارع اللي فوق!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000217", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000217.wav", "text": "الحجارة هذي من قرون واقفة، اسمع قصصها.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000341", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000341.wav", "text": "تْفَكَّر فِي الرِّحْلَة عبر الصّحْرَا لِـ Mecca. تذَكَّر سَوالِف أَجْدَادْنَا اللي قالوها في صفحة مئة واثنين وأربعين.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000187", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000187.wav", "text": "الدبابة جايه من الشرق! 
جهز المدفع!", "language_id": "ar", "instruct": "saudi, conversational, serious"} +{"id": "sample_000302", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000302.wav", "text": "شفت أحد مشبوه قريب من الوَحَه حول الساعه صفر تسعة: صفر صفر؟ يمكن كان لابس قميص مكتوب عليه Desert Eagle.", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000176", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000176.wav", "text": "فاضي! مرر الكورة!", "language_id": "ar", "instruct": "saudi, conversational, excited"} +{"id": "sample_000017", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000017.wav", "text": "نقوش الحنا حق جدتي مثل القصيد اللي يسيل.", "language_id": "ar", "instruct": "saudi, conversational, reflective"} +{"id": "sample_000407", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000407.wav", "text": "شوف! الكثبان تلمع مثل fireflies! يلا نحل هال puzzle بسرعه!", "language_id": "ar", "instruct": "saudi, conversational, excited"} diff --git a/sync_data/tokens/dev/data.lst b/sync_data/tokens/dev/data.lst new file mode 100644 index 0000000000000000000000000000000000000000..ec6c1f750231f415e86915481bba0bbd683b8889 --- /dev/null +++ b/sync_data/tokens/dev/data.lst @@ -0,0 +1,15 @@ +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000000.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000000.jsonl 1 4.200 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000001.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000001.jsonl 1 2.840 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000002.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000002.jsonl 1 4.840 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000003.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000003.jsonl 1 4.840 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000004.tar 
/home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000004.jsonl 1 2.720 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000005.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000005.jsonl 1 7.280 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000006.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000006.jsonl 1 3.480 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000007.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000007.jsonl 1 2.840 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000008.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000008.jsonl 1 9.720 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000009.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000009.jsonl 1 3.800 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000010.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000010.jsonl 1 3.360 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000011.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000011.jsonl 1 4.880 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000012.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000012.jsonl 1 3.840 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000013.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000013.jsonl 1 3.160 +/home/riftuser/OmniVoice/sync_data/tokens/dev/audios/shard-000014.tar /home/riftuser/OmniVoice/sync_data/tokens/dev/txts/shard-000014.jsonl 1 3.360 diff --git a/sync_data/tokens/dev/errors.jsonl b/sync_data/tokens/dev/errors.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sync_data/tokens/dev/txts/shard-000000.jsonl b/sync_data/tokens/dev/txts/shard-000000.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..77facbe9da1adbeea0c2e54840db31915a59ff99 --- /dev/null +++ 
b/sync_data/tokens/dev/txts/shard-000000.jsonl @@ -0,0 +1 @@ +{"id": "sample_000127", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000127.wav", "text": "الحراس يغيرون النوبة. هذي فرصتنا نتسلل!", "language_id": "ar", "instruct": "saudi, conversational, tense", "audio_duration": 4.2, "num_tokens": 105} diff --git a/sync_data/tokens/dev/txts/shard-000001.jsonl b/sync_data/tokens/dev/txts/shard-000001.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ebc3da34ac4249d958409d971a5dc73ffdbb8c67 --- /dev/null +++ b/sync_data/tokens/dev/txts/shard-000001.jsonl @@ -0,0 +1 @@ +{"id": "sample_000016", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000016.wav", "text": "لازم نعبي موية زيادة للسفرة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 2.84, "num_tokens": 71} diff --git a/sync_data/tokens/dev/txts/shard-000002.jsonl b/sync_data/tokens/dev/txts/shard-000002.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c39504e0df05dc4fb38434fba140f64a44f34a19 --- /dev/null +++ b/sync_data/tokens/dev/txts/shard-000002.jsonl @@ -0,0 +1 @@ +{"id": "sample_000119", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000119.wav", "text": "يا هيوستن، طرنا! 
مهمتنا للمريخ تبدأ الحين!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.84, "num_tokens": 121} diff --git a/sync_data/tokens/dev/txts/shard-000003.jsonl b/sync_data/tokens/dev/txts/shard-000003.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5b91fdd6d124f2d4e5880c9172a4c16f2c03802 --- /dev/null +++ b/sync_data/tokens/dev/txts/shard-000003.jsonl @@ -0,0 +1 @@ +{"id": "sample_000215", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000215.wav", "text": "افرك الورق على الجرح، العشب هذا مستخدم من زمان.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.84, "num_tokens": 121} diff --git a/sync_data/tokens/dev/txts/shard-000004.jsonl b/sync_data/tokens/dev/txts/shard-000004.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f4449cfbf53bbbde7546c823403cdb8ca1946577 --- /dev/null +++ b/sync_data/tokens/dev/txts/shard-000004.jsonl @@ -0,0 +1 @@ +{"id": "sample_000132", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000132.wav", "text": "النجوم ترشدنا بليل البر.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 2.72, "num_tokens": 68} diff --git a/sync_data/tokens/dev/txts/shard-000007.jsonl b/sync_data/tokens/dev/txts/shard-000007.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0a7b73c9d3ac6b7d65c8448c22161b273c381d83 --- /dev/null +++ b/sync_data/tokens/dev/txts/shard-000007.jsonl @@ -0,0 +1 @@ +{"id": "sample_000089", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000089.wav", "text": "شوف! 
لقيت عملة من زمان!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 2.84, "num_tokens": 71} diff --git a/sync_data/tokens/dev/txts/shard-000008.jsonl b/sync_data/tokens/dev/txts/shard-000008.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bbf8a7d77e89f6365826b23d827a91d62ced464e --- /dev/null +++ b/sync_data/tokens/dev/txts/shard-000008.jsonl @@ -0,0 +1 @@ +{"id": "sample_000452", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000452.wav", "text": "شغل عدل! ادزلبتس استراتيجية الدفاع عن الواحة. خلنا نحتفل بكوب من القهوة مع GameMaster. رقم الصفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 9.72, "num_tokens": 243} diff --git a/sync_data/tokens/dev/txts/shard-000010.jsonl b/sync_data/tokens/dev/txts/shard-000010.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..74ecadc871e8ef90121f0acfce58334c58a57146 --- /dev/null +++ b/sync_data/tokens/dev/txts/shard-000010.jsonl @@ -0,0 +1 @@ +{"id": "sample_000090", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000090.wav", "text": "نجهز للحج! 
وش لازم نجيب معنا؟", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.36, "num_tokens": 84} diff --git a/sync_data/tokens/dev/txts/shard-000011.jsonl b/sync_data/tokens/dev/txts/shard-000011.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..96e7fe923f5b1c4c389143fe9d6be0d5e07e49e1 --- /dev/null +++ b/sync_data/tokens/dev/txts/shard-000011.jsonl @@ -0,0 +1 @@ +{"id": "sample_000200", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000200.wav", "text": "غلي المويه، بعدين حط الهيل عشان الطعم يصير مضبوط.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.88, "num_tokens": 122} diff --git a/sync_data/tokens/dev/txts/shard-000013.jsonl b/sync_data/tokens/dev/txts/shard-000013.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e186e53bb82e7f180a3f7dff9e2efebb4b0acbd7 --- /dev/null +++ b/sync_data/tokens/dev/txts/shard-000013.jsonl @@ -0,0 +1 @@ +{"id": "sample_000186", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000186.wav", "text": "قبل السباق، نعطيهم تمر للطاقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.16, "num_tokens": 79} diff --git a/sync_data/tokens/dev/txts/shard-000014.jsonl b/sync_data/tokens/dev/txts/shard-000014.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5a2fc147d3014e4b9faf44d87cda601068b1263d --- /dev/null +++ b/sync_data/tokens/dev/txts/shard-000014.jsonl @@ -0,0 +1 @@ +{"id": "sample_000189", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000189.wav", "text": "غير الكفر بسرعة، ولا ما نطلع من هنا!", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.36, "num_tokens": 84} diff --git a/sync_data/tokens/train/data.lst b/sync_data/tokens/train/data.lst new file mode 100644 index 
0000000000000000000000000000000000000000..610df5b33763993ec28fc04b5e0c2e062b28baa7 --- /dev/null +++ b/sync_data/tokens/train/data.lst @@ -0,0 +1,35 @@ +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000000.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000000.jsonl 8 42.280 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000001.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000001.jsonl 8 44.000 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000002.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000002.jsonl 8 48.000 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000003.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000003.jsonl 8 35.560 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000004.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000004.jsonl 8 45.360 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000005.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000005.jsonl 8 41.680 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000006.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000006.jsonl 8 49.120 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000007.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000007.jsonl 8 47.360 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000008.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000008.jsonl 8 41.840 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000009.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000009.jsonl 8 35.160 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000010.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000010.jsonl 8 41.360 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000011.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000011.jsonl 8 33.080 
+/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000012.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000012.jsonl 8 41.360 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000013.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000013.jsonl 8 46.160 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000014.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000014.jsonl 8 43.760 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000015.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000015.jsonl 8 47.520 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000016.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000016.jsonl 8 42.800 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000017.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000017.jsonl 8 30.440 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000018.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000018.jsonl 8 42.040 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000019.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000019.jsonl 8 44.720 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000020.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000020.jsonl 8 37.600 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000021.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000021.jsonl 8 43.320 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000022.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000022.jsonl 8 44.960 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000023.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000023.jsonl 8 43.880 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000024.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000024.jsonl 8 
51.960 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000025.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000025.jsonl 8 38.240 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000026.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000026.jsonl 8 48.400 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000027.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000027.jsonl 8 40.320 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000028.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000028.jsonl 8 33.960 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000029.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000029.jsonl 8 56.080 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000030.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000030.jsonl 8 44.080 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000031.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000031.jsonl 8 37.160 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000032.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000032.jsonl 8 44.600 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000033.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000033.jsonl 8 35.480 +/home/riftuser/OmniVoice/sync_data/tokens/train/audios/shard-000034.tar /home/riftuser/OmniVoice/sync_data/tokens/train/txts/shard-000034.jsonl 8 50.640 diff --git a/sync_data/tokens/train/errors.jsonl b/sync_data/tokens/train/errors.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/sync_data/tokens/train/txts/shard-000000.jsonl b/sync_data/tokens/train/txts/shard-000000.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..824a974d463b8266fdef90892b107674e770a844 --- /dev/null +++ 
b/sync_data/tokens/train/txts/shard-000000.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000144", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000144.wav", "text": "شلون كل محل هنا يبيع نفس الشي؟", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 2.96, "num_tokens": 74} +{"id": "sample_000171", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000171.wav", "text": "شكله الكفر انفجر، لازم نصلحه بسرعة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.64, "num_tokens": 91} +{"id": "sample_000294", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000294.wav", "text": "استعدوا يا رجيل، العدو جاي من الشرق عقب خمس عشرة دقيقة. صفر واحد تسعة صفر اثنان اثنان واحد اثنان ثلاثة ثلاثة اثنان", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.44, "num_tokens": 111} +{"id": "sample_000167", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000167.wav", "text": "يدك ثابتة وعقلك صافٍ، كذا تفوز.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.52, "num_tokens": 113} +{"id": "sample_000399", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000399.wav", "text": "تأكد من إرسال تفاصيل المعاهدة إلى المجلس عند الساعة العاشرة صباحًا عبر email.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 6.48, "num_tokens": 162} +{"id": "sample_000421", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000421.wav", "text": "اقوموا يا محاربين! حتى الـcamel تخاف من خطتنا! الصفحة مئة واثنان وأربعون في الدليل يشرح الخطة.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 9.6, "num_tokens": 240} +{"id": "sample_000166", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000166.wav", "text": "يتقدمون علينا! 
اثبتوا!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 2.68, "num_tokens": 67} +{"id": "sample_000426", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000426.wav", "text": "شفتي الجمل يرقص جنب الواحَهْ البارحة؟ كان العرض عند ستة وخمسين شارع النجدي. It was super!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 7.96, "num_tokens": 199} diff --git a/sync_data/tokens/train/txts/shard-000001.jsonl b/sync_data/tokens/train/txts/shard-000001.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..ca8bbf77cb397c134773fcfa1c0fc8bbb863d6af --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000001.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000367", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000367.wav", "text": "لا تْقَلِق، الجمل عنده GPS. بس اتبع الإحداثيات على صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 6.32, "num_tokens": 158} +{"id": "sample_000193", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000193.wav", "text": "دوس بنزين! نتسابق على ملك الشوارع!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.72, "num_tokens": 93} +{"id": "sample_000320", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000320.wav", "text": "مبروك! دزيت هالمستوى. لا تنسى تشيك على وقود الجمل بصفحة مئة واثنين وأربعين من الـ User Manual.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 8.68, "num_tokens": 217} +{"id": "sample_000270", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000270.wav", "text": "العملاق النبطي قاعد يصحى! 
بسرعة، استخدم العود السحري حقك عشان تصعقه، وبعدين اضربه بسيفك المعقوف!", "language_id": "ar", "instruct": "saudi, conversational, epic", "audio_duration": 8.56, "num_tokens": 214} +{"id": "sample_000393", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000393.wav", "text": "مرحبا بكم في بداية مغامرتنا! خلونا نشوف وش فيه بصفحة ستة وخمسون سوى!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.72, "num_tokens": 168} +{"id": "sample_000094", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000094.wav", "text": "شوف! الكعبة قدامنا على طول!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.0, "num_tokens": 75} +{"id": "sample_000152", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000152.wav", "text": "الموضوع مو بس عن الفوز، هو عن الدقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.84, "num_tokens": 96} +{"id": "sample_000204", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000204.wav", "text": "الصبار فيه موية تكفينا أيام.", "language_id": "ar", "instruct": "saudi, conversational, creative", "audio_duration": 3.16, "num_tokens": 79} diff --git a/sync_data/tokens/train/txts/shard-000002.jsonl b/sync_data/tokens/train/txts/shard-000002.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..bcc07be20a53c26f3e1d10c199e0bbc8df790869 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000002.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000134", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000134.wav", "text": "لقينا الخريطة! يالله نبدأ المغامرة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.6, "num_tokens": 90} +{"id": "sample_000381", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000381.wav", "text": "العدو جاي من ورى الرِّمال. 
جهّز الدفاعات في سيكتور خمسة ستة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.0, "num_tokens": 125} +{"id": "sample_000379", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000379.wav", "text": "ذي الأحجار القديمة في Ruins خمسة ستة تحكي سَالْفة ضايعة مع الزمان.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 7.92, "num_tokens": 198} +{"id": "sample_000014", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000014.wav", "text": "الغرفة السرية تكشف الأسرار القديمة وقت الغروب.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 4.84, "num_tokens": 121} +{"id": "sample_000278", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000278.wav", "text": "دير بالك! الجمل سرق قْهَوَتِس في المهرجان! الظاهر إنه عنده مئة HP مثل Boss في اللعبة!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 8.2, "num_tokens": 205} +{"id": "sample_000107", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000107.wav", "text": "في العيد، طلبت بساط طاير، بس جاني مكنسة كهرب بداله!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.28, "num_tokens": 132} +{"id": "sample_000178", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000178.wav", "text": "عجل بالحصان! قربنا نوصل للنهاية!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.56, "num_tokens": 89} +{"id": "sample_000474", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000474.wav", "text": "خلنا نخطط الstrategy لوحتنا! 
اتصل على صفر واحد تسعة صفر اثنان اثنان واحد اثنان ثلاثة ثلاثة اثنان لعقد مجلس العائلة.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 9.6, "num_tokens": 240} diff --git a/sync_data/tokens/train/txts/shard-000003.jsonl b/sync_data/tokens/train/txts/shard-000003.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1261997f398a8637e39d7ac3a5e36201dd543b6d --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000003.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000477", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000477.wav", "text": "حياكم في المهرجان! تعالوا عند بوث خمسة ستة لتجربة ما تنساها.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.72, "num_tokens": 143} +{"id": "sample_000128", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000128.wav", "text": "طيارات العدو وراك! سو حركات المراوغة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.72, "num_tokens": 93} +{"id": "sample_000217", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000217.wav", "text": "الحجارة هذي من قرون واقفة، اسمع قصصها.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.12, "num_tokens": 103} +{"id": "sample_000187", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000187.wav", "text": "الدبابة جايه من الشرق! جهز المدفع!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.48, "num_tokens": 87} +{"id": "sample_000151", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000151.wav", "text": "العدو قريب، لازم نهرب الحين!", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 2.84, "num_tokens": 71} +{"id": "sample_000456", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000456.wav", "text": "جهز الإمدادات لرحلتنا. 
لازم نوصل ل Oasis أربعة اثنان قبل غروب الشمس.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 6.56, "num_tokens": 164} +{"id": "sample_000168", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000168.wav", "text": "لو تسجل، العشا علي!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 2.72, "num_tokens": 68} +{"id": "sample_000118", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000118.wav", "text": "من بيوت الدرعية الطينية لأبراج الرياض، عمارتنا تحكي قصتنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 6.4, "num_tokens": 160} diff --git a/sync_data/tokens/train/txts/shard-000004.jsonl b/sync_data/tokens/train/txts/shard-000004.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d18a5416fdd9fc52b691dee3abf628b90340f5d7 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000004.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000295", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000295.wav", "text": "بسرعة، دوّر على المفتاح المخفي في القاعة majestic قبل ما يرجعون الحراس! شوف صفحة مئة واثنين وأربعين للخطوات.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.8, "num_tokens": 170} +{"id": "sample_000092", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000092.wav", "text": "قلت 'قزازة موية'، مو 'بعير موية'!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 3.72, "num_tokens": 93} +{"id": "sample_000007", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000007.wav", "text": "برمج الدرونات تسقي المزارع اللي فوق!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.2, "num_tokens": 80} +{"id": "sample_000340", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000340.wav", "text": "مبروك يا بطل! 
قِمتَ بقود قبيْلتك للمجد. لين نلتقي مرة ثانية في الويْحَه على صفحة مئة واثنين وأربعين من دليل اللعبة.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 9.92, "num_tokens": 248} +{"id": "sample_000227", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000227.wav", "text": "سوق الدبابة للمرتفع، نحتاج الأفضلية!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.64, "num_tokens": 91} +{"id": "sample_000274", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000274.wav", "text": "ذا الجمل اللي في حوشتس عميل سري ولا بس يعشق قهوة وPlayStation؟", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.92, "num_tokens": 148} +{"id": "sample_000478", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000478.wav", "text": "مرحبا بكم يا رْجال الصغار! خلونا نبدا الدرس بقوة وشجاعة. شوفوا صفحة مئة واثنان وأربعون في كتاب game.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 8.92, "num_tokens": 223} +{"id": "sample_000160", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000160.wav", "text": "استعدوا، المعركة هذي بتكون صعبة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.24, "num_tokens": 81} diff --git a/sync_data/tokens/train/txts/shard-000005.jsonl b/sync_data/tokens/train/txts/shard-000005.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..952c7b62277bc4d917902b33dea843029746d819 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000005.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000305", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000305.wav", "text": "تذكر كيف كان الهوا الصحراوي على الكْثُبان عند الأوسيَس صفحة مئة واثنان وأربعون؟", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 7.08, "num_tokens": 177} +{"id": "sample_000250", 
"audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000250.wav", "text": "المترو زحمة! لازم نلقى طريق أسرع للمول.", "language_id": "ar", "instruct": "saudi, conversational, stressed", "audio_duration": 4.0, "num_tokens": 100} +{"id": "sample_000223", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000223.wav", "text": "اللي ينسى أصله، ما له مستقبل.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.2, "num_tokens": 80} +{"id": "sample_000237", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000237.wav", "text": "نحتاج تعزيزات نفك الحصار عن بغداد!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.48, "num_tokens": 87} +{"id": "sample_000245", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000245.wav", "text": "احمِ المها من الصيادين، هي مهددة بالانقراض وتحتاج حمايتنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 5.2, "num_tokens": 130} +{"id": "sample_000403", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000403.wav", "text": "مرحبًا بك في المقابلة! اجلس واستمتع بـ qahwa المشهورة في صفحة ستة وخمسين من دليلنا.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 7.04, "num_tokens": 176} +{"id": "sample_000116", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000116.wav", "text": "المشوار لمكة مو بس للجسم. أنت جاهز روحياً للحج؟", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 5.36, "num_tokens": 134} +{"id": "sample_000394", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000394.wav", "text": "مرحبًا بك يا محارب. 
قول لنا عن رحلتك قُدّام المجلس على صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 6.32, "num_tokens": 158} diff --git a/sync_data/tokens/train/txts/shard-000006.jsonl b/sync_data/tokens/train/txts/shard-000006.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d353c62bba197b9472cb6975cae984e8b079a5c9 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000006.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000150", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000150.wav", "text": "لازم نرجع نرتب الصفوف قبل ما يهجمون مرة ثانية.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.04, "num_tokens": 101} +{"id": "sample_000433", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000433.wav", "text": "تتذكر حكايات Al-Majlis القديمة؟ دايمًا تلهم الشجاعة. تذكر صفحة مئة واثنان وأربعون بالكتاب.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 9.28, "num_tokens": 232} +{"id": "sample_000418", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000418.wav", "text": "مرحبًا يا محاربين! خلونا نغزو الديرة ونسجل مئة نقطة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.88, "num_tokens": 147} +{"id": "sample_000125", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000125.wav", "text": "ثلاثة، اثنان، واحد، انطلق! دعس البنزين وكون الأول!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 7.72, "num_tokens": 193} +{"id": "sample_000247", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000247.wav", "text": "شوف الحفلة التصويرية ذي! 
التقنية غيرت جو الترفيه عندنا بشكل!", "language_id": "ar", "instruct": "saudi, conversational, amazed", "audio_duration": 6.08, "num_tokens": 152} +{"id": "sample_000010", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000010.wav", "text": "يلّا نلحق القطار قبل لا يفوتنا!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 2.52, "num_tokens": 63} +{"id": "sample_000427", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000427.wav", "text": "يلا نخلي اللغز ذا سوا! شوفي صفحة مئة واثنين وأربعين للhint.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.04, "num_tokens": 126} +{"id": "sample_000430", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000430.wav", "text": "هلا، خلونا نحفر هنا ونلقي treasure قديم! يمكن نلقى fossil الجمل؟ صفحة ثلاثمئة واثنا عشر في الدليل.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 8.56, "num_tokens": 214} diff --git a/sync_data/tokens/train/txts/shard-000007.jsonl b/sync_data/tokens/train/txts/shard-000007.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..636b66b6321ff31ead332d6a0fcd3b1984acb1e1 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000007.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000285", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000285.wav", "text": "دور على الأثر القديم في manual صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.52, "num_tokens": 113} +{"id": "sample_000268", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000268.wav", "text": "الفخ النبطي القديم يشتغل! 
بسرعة، حل لغز خريطة النجوم عشان توقفه!", "language_id": "ar", "instruct": "saudi, conversational, thrilling", "audio_duration": 6.16, "num_tokens": 154} +{"id": "sample_000159", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000159.wav", "text": "نتقدم، لا توقفون الحين!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 2.64, "num_tokens": 66} +{"id": "sample_000453", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000453.wav", "text": "شوف ذا النقوش القديمة! يمكن تكشف secret strategy عن خطة سرية. العنوان: ستة وخمسون شارع النجدي", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 9.04, "num_tokens": 226} +{"id": "sample_000257", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000257.wav", "text": "شوف! هذا نجم الشمال. المسافرين أول كانوا يستخدمونه عشان يلقون طريقهم في الصحرا.", "language_id": "ar", "instruct": "saudi, conversational, awe-inspired", "audio_duration": 6.92, "num_tokens": 173} +{"id": "sample_000480", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000480.wav", "text": "علمتني وش شفت في ستة وخمسين شارع النجدي. كان فيه أحد مشبوه؟", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.28, "num_tokens": 132} +{"id": "sample_000440", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000440.wav", "text": "يا هلا بالمسافر، مرحب فيك عند Checkpoint خمسة ستة! استانس بسباقات الجمال اللي عندنا.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 7.48, "num_tokens": 187} +{"id": "sample_000444", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000444.wav", "text": "اسرع، حل اللغز! 
المفتاح مخفي في صفحة مئة واثنين وأربعين من الmanual.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.32, "num_tokens": 133} diff --git a/sync_data/tokens/train/txts/shard-000008.jsonl b/sync_data/tokens/train/txts/shard-000008.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1af0dc657a8d0f4566ae90b7a4a1dcb3968f31ed --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000008.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000102", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000102.wav", "text": "من فوق برج المملكة، تقدر تشوف باكر من اليوم!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.4, "num_tokens": 110} +{"id": "sample_000149", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000149.wav", "text": "أحسن لك تكون أسرع من الهوى، وإلا راحت عليك!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 4.36, "num_tokens": 109} +{"id": "sample_000407", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000407.wav", "text": "شوف! الكثبان تلمع مثل fireflies! يلا نحل هال puzzle بسرعه!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.12, "num_tokens": 153} +{"id": "sample_000249", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000249.wav", "text": "السيارات الكهربا ساكتة مرة! 
كأننا نسابق في المستقبل!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.84, "num_tokens": 121} +{"id": "sample_000231", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000231.wav", "text": "تراجع للغطاء، الوضع هنا يضيع!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.4, "num_tokens": 85} +{"id": "sample_000232", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000232.wav", "text": "حط الحاجز هنا، بيكون وقفتنا الأخيرة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.16, "num_tokens": 79} +{"id": "sample_000339", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000339.wav", "text": "هلا والله! أنا دليلك اللي بيخذك في مغامرة وسط النْجود. يلا نبدأ من ستة وخمسين شارع النجدي!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 8.36, "num_tokens": 209} +{"id": "sample_000423", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000423.wav", "text": "هلا بك يا قايد! حضّر عساكرك للمهمة الأولى في واحة خمسة وستين! 
شوف الخريطة في جهازك الـTablet.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 7.2, "num_tokens": 180} diff --git a/sync_data/tokens/train/txts/shard-000009.jsonl b/sync_data/tokens/train/txts/shard-000009.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b6f57f02e2a3472dea2e6887bb7d30ae1761a412 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000009.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000179", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000179.wav", "text": "إذا تثق بصقرك، بيجيب الفريسة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.56, "num_tokens": 89} +{"id": "sample_000172", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000172.wav", "text": "بهالسرعة بنفوت العشا!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 2.12, "num_tokens": 53} +{"id": "sample_000188", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000188.wav", "text": "دوس بنزين! قاعدين نفقدهم!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.4, "num_tokens": 85} +{"id": "sample_000137", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000137.wav", "text": "الرمل يتحرك مع كل خطوة، يخفي الطريق قدامنا.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 4.52, "num_tokens": 113} +{"id": "sample_000184", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000184.wav", "text": "كلّم حصانك بهدوء، وبيثق فيك وقت المعركة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.52, "num_tokens": 113} +{"id": "sample_000246", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000246.wav", "text": "يا سلام، قطعنا البلاد بساعة وحدة! 
الهايبرلوب ذا شي ثاني!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.96, "num_tokens": 149} +{"id": "sample_000375", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000375.wav", "text": "يا شريك، صار الوقت نقول وداع! لا تنسى puzzle party يوم الجمعة في خمسة ستة شارع النجدي!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 7.88, "num_tokens": 197} +{"id": "sample_000153", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000153.wav", "text": "إذا ما سرعت، بنخسر قدام سكوتر!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 3.2, "num_tokens": 80} diff --git a/sync_data/tokens/train/txts/shard-000010.jsonl b/sync_data/tokens/train/txts/shard-000010.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6bb76bb51479d5182bea7a33dc6b3924aa6037f6 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000010.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000226", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000226.wav", "text": "عبي الشوزن، قاعدين يهجمون علينا!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 2.96, "num_tokens": 74} +{"id": "sample_000448", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000448.wav", "text": "هلا، لقيت الجمل الضايع؟ دور في صفحة مئة واثنين وأربعين من الmanual!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 6.28, "num_tokens": 157} +{"id": "sample_000271", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000271.wav", "text": "انتبِه لخطواتك. الطريق هنا في ليفل خمسة خطير جدًا وsteep.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.56, "num_tokens": 139} +{"id": "sample_000263", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000263.wav", "text": "دبابات العدو جاية من الشرق! 
حمّل الطلقات اللي تخترق الدروع وصوّب!", "language_id": "ar", "instruct": "saudi, conversational, intense", "audio_duration": 6.2, "num_tokens": 155} +{"id": "sample_000120", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000120.wav", "text": "يا ربعي! علامة الضرب تبين المكان على خريطة الكنز ذي!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 4.76, "num_tokens": 119} +{"id": "sample_000147", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000147.wav", "text": "القافلة تريح، بس البر ما ينام.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.64, "num_tokens": 91} +{"id": "sample_000222", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000222.wav", "text": "القبيلة واقفة مع بعض مثل الجدار المتين.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.64, "num_tokens": 116} +{"id": "sample_000111", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000111.wav", "text": "ترتيب النقل الجماعي لحجاج حملتنا مثل حل لغز صعب. 
كل واحد له طلبات مختلفة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 7.32, "num_tokens": 183} diff --git a/sync_data/tokens/train/txts/shard-000011.jsonl b/sync_data/tokens/train/txts/shard-000011.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..eda9e738ea11ed679f5a661d08d7fe82635f79d7 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000011.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000113", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000113.wav", "text": "من البيوت الطينية للأبراج الزجاجية، سماء الرياض تحكي قصة تطورنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 6.92, "num_tokens": 173} +{"id": "sample_000197", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000197.wav", "text": "سر الكبسة يجي مع ظبط البهارات.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.72, "num_tokens": 93} +{"id": "sample_000243", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000243.wav", "text": "خلك منتبه! إذا أطلقت الصقر بدري، بنخسر السباق!", "language_id": "ar", "instruct": "saudi, conversational, competitive", "audio_duration": 4.48, "num_tokens": 112} +{"id": "sample_000136", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000136.wav", "text": "كل خطوة تقربك من الراحة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 2.84, "num_tokens": 71} +{"id": "sample_000130", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000130.wav", "text": "الاستعداد لهالرحلة شرف ومسؤولية.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.36, "num_tokens": 84} +{"id": "sample_000252", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000252.wav", "text": "يالله، وزع الرماة على طرف طويق! 
ما نبي يخترقون القلعة!", "language_id": "ar", "instruct": "saudi, conversational, competitive", "audio_duration": 5.32, "num_tokens": 133} +{"id": "sample_000169", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000169.wav", "text": "تحركوا! ما نقدر نجلس هنا أكثر!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.04, "num_tokens": 76} +{"id": "sample_000236", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000236.wav", "text": "لازم نسيطر على الأهرامات، اندفع الحين!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.4, "num_tokens": 85} diff --git a/sync_data/tokens/train/txts/shard-000012.jsonl b/sync_data/tokens/train/txts/shard-000012.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..53b339edecb158d5b7704540ad1ff837c8bd6e3e --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000012.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000012", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000012.wav", "text": "بسرعة، طلّق الصقور على الطريدة اللي تفر!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.68, "num_tokens": 92} +{"id": "sample_000275", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000275.wav", "text": "ليش الجمل دخل قبيلة Puzzle؟ عشان يحل ألغاز القفر في level ثلاثة!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 6.08, "num_tokens": 152} +{"id": "sample_000177", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000177.wav", "text": "النجوم بتدلنا بالليل.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 2.44, "num_tokens": 61} +{"id": "sample_000266", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000266.wav", "text": "انتبه! درونات الشركات تفحص المكان. 
استخدم جهاز الإخفاء السايبر حقك عشان ما ينكشف وجودك.", "language_id": "ar", "instruct": "saudi, conversational, futuristic", "audio_duration": 7.8, "num_tokens": 195} +{"id": "sample_000468", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000468.wav", "text": "قوّموا المكينة! نْسابق صوب الويحات بسرعة صفر واحد تسعة صفر اثنان اثنان واحد اثنان ثلاثة ثلاثة اثنان!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.84, "num_tokens": 96} +{"id": "sample_000308", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000308.wav", "text": "تسذكر يوم اجتمعنا تحت النجوم بالصحراء؟ كنا نسولف عن PlayStation وشارع خمسة ستة بالرياض.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 8.44, "num_tokens": 211} +{"id": "sample_000277", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000277.wav", "text": "ليه الجمل قدّم للوظيفَه؟ يبي stable position!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.28, "num_tokens": 132} +{"id": "sample_000220", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000220.wav", "text": "خلنا ندور أفضل الصفقات في السوق قبل ما نكمل.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.8, "num_tokens": 95} diff --git a/sync_data/tokens/train/txts/shard-000013.jsonl b/sync_data/tokens/train/txts/shard-000013.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..66284a14be7ad236137b58d8542b475829762965 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000013.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000291", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000291.wav", "text": "اتفقت القبيلة! بنلتقي في واحة ستة وخمسون. 
جبو أحسن strategies عندكم!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.56, "num_tokens": 164} +{"id": "sample_000269", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000269.wav", "text": "فوت الكورة بين رجلين المدافع وسو تمريرة حايط مع ربيعك عشان تسجل في زقاق السوق!", "language_id": "ar", "instruct": "saudi, conversational, energetic", "audio_duration": 7.4, "num_tokens": 185} +{"id": "sample_000442", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000442.wav", "text": "الاجتماع العائلي في الواحة مهم جداً. لازم نسوي الخطة في صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 6.64, "num_tokens": 166} +{"id": "sample_000213", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000213.wav", "text": "تدرب على الدعوات، بتساعدك في الرحلة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.4, "num_tokens": 85} +{"id": "sample_000106", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000106.wav", "text": "من جسر السما في برج المملكة بالرياض، تقدر تلمس الغيوم!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.12, "num_tokens": 128} +{"id": "sample_000258", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000258.wav", "text": "ياخي، هذا مرشد سياحي تصويري؟ كأنك تمشي بالماضي والمستقبل مع بعض!", "language_id": "ar", "instruct": "saudi, conversational, curious", "audio_duration": 6.64, "num_tokens": 166} +{"id": "sample_000457", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000457.wav", "text": "شوف ذا الباب القديم! 
يحسسك إنه Portal مخفي لعالم ثاني، في خريطة ليفل ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.92, "num_tokens": 173} +{"id": "sample_000011", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000011.wav", "text": "ساعدني ألقى حرامي الضايع في خربطة الشنط!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.48, "num_tokens": 87} diff --git a/sync_data/tokens/train/txts/shard-000014.jsonl b/sync_data/tokens/train/txts/shard-000014.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..02c83cfd4487b274760750042b634ee204e0575d --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000014.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000096", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000096.wav", "text": "التجهيز للحج مثل لعبة الطناخة بس حقيقية!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.24, "num_tokens": 131} +{"id": "sample_000260", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000260.wav", "text": "الرموز القديمة على صخرة الفيل تطلع بس تحت ضوء القمر. يلا، فك رموزها قبل يطلع الفجر!", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 7.64, "num_tokens": 191} +{"id": "sample_000463", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000463.wav", "text": "بسرعة، قولي لي وش شفتي ب ستة وخمسين شارع النجدي! 
كان مجلس Falcon؟", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.36, "num_tokens": 134} +{"id": "sample_000238", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000238.wav", "text": "الهواء مثالي، خلنا نسبقهم لخط النهاية!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.76, "num_tokens": 94} +{"id": "sample_000234", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000234.wav", "text": "تراجع الحين! تجمع عند نقطة التفتيش الجاية!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.12, "num_tokens": 103} +{"id": "sample_000467", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000467.wav", "text": "لقَ ذا الواحَهْ المخفّية قبل غروب الشمس وقَم المخيم قريب من الأنقاض القديمة في شارع الملك عبد الله رقم سبعة تسعة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 9.56, "num_tokens": 239} +{"id": "sample_000214", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000214.wav", "text": "رم الشبكة لما تكون الموية راكدة، الصبر هو المفتاح.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 5.24, "num_tokens": 131} +{"id": "sample_000176", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000176.wav", "text": "فاضي! 
مرر الكورة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 2.84, "num_tokens": 71} diff --git a/sync_data/tokens/train/txts/shard-000015.jsonl b/sync_data/tokens/train/txts/shard-000015.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b71a12b848bfe7b19abe21389c8dfcd23aa94877 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000015.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000280", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000280.wav", "text": "عدّينا كثبان واجد سوا؛ وداعًا يا رفيقي لين نرجع نلتقي في ليفل عشرة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 8.8, "num_tokens": 220} +{"id": "sample_000287", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000287.wav", "text": "يا ولد العم! شفت الجمل الجديد للجد؟ اسمه 'Speedster ثلاثة آلاف'!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 6.16, "num_tokens": 154} +{"id": "sample_000114", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000114.wav", "text": "جا وقت نعلق فانوس رمضان! يلا ننور بيتنا للشهر الفضيل.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.0, "num_tokens": 125} +{"id": "sample_000279", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000279.wav", "text": "جَمِّع السرعة عشان تفوز بكأس الصحراء. Level ثلاثة ينتظرك.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.36, "num_tokens": 134} +{"id": "sample_000124", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000124.wav", "text": "العاصفة تقرب! 
لازم نوصل المنطقة الآمنة بسرعة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.76, "num_tokens": 94} +{"id": "sample_000302", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000302.wav", "text": "شفت أحد مشبوه قريب من الوَحَه حول الساعه صفر تسعة: صفر صفر؟ يمكن كان لابس قميص مكتوب عليه Desert Eagle.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 7.12, "num_tokens": 178} +{"id": "sample_000174", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000174.wav", "text": "محاصرين! لازم نطلعهم بقوة!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.28, "num_tokens": 82} +{"id": "sample_000364", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000364.wav", "text": "مبروك! انت حليت آخر لغز وربحت جائزة مميزة. تْشوف التفاصيل في صفحة مئة واثنين وأربعين!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 8.04, "num_tokens": 201} diff --git a/sync_data/tokens/train/txts/shard-000016.jsonl b/sync_data/tokens/train/txts/shard-000016.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c44dd9c96b26f03c7d5a93aee6044cea1b32d42a --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000016.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000154", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000154.wav", "text": "باقي لك هدف واحد. خلها تضبط.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.4, "num_tokens": 85} +{"id": "sample_000265", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000265.wav", "text": "الآلة القديمة تشتغل! 
بسرعة، حل لغز الكتابة القديمة عشان تدخل الغرفة المخبية!", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 6.84, "num_tokens": 171} +{"id": "sample_000211", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000211.wav", "text": "شوف رجوله، قوية. بيكون ممتاز للقافلة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.64, "num_tokens": 116} +{"id": "sample_000104", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000104.wav", "text": "شد حيلك! هالبقي على الكثبان أخشن من بعير فيه الزغطة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.28, "num_tokens": 132} +{"id": "sample_000323", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000323.wav", "text": "مرحبا بك في عالم الصحراء. استعد للمعركة، يا بطل. رحلتك تبدأ الآن في صفحة ستة وخمسون من Game Guide.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 9.92, "num_tokens": 248} +{"id": "sample_000202", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000202.wav", "text": "خيط الثوب زين، كل شي مهم للعريس.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.76, "num_tokens": 94} +{"id": "sample_000123", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000123.wav", "text": "الأكسجين عندنا في خطر. 
لازم نوسع البيت الأخضر على طول.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.8, "num_tokens": 120} +{"id": "sample_000207", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000207.wav", "text": "جدل السعف زين، السلة بتشيل كثير تمر.", "language_id": "ar", "instruct": "saudi, conversational, creative", "audio_duration": 4.16, "num_tokens": 104} diff --git a/sync_data/tokens/train/txts/shard-000017.jsonl b/sync_data/tokens/train/txts/shard-000017.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..675a3f5b300cb07c140c75e49a7f6a2a932c3330 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000017.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000099", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000099.wav", "text": "يالله بسرعة! عدل حلاوة العيد قبل لا يجون الضيوف!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.68, "num_tokens": 92} +{"id": "sample_000008", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000008.wav", "text": "كسّر الجدار عشان توقف هجمة الفيروس!", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 3.56, "num_tokens": 89} +{"id": "sample_000185", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000185.wav", "text": "حصان الفارس أعز صديق له وقت القتال.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.8, "num_tokens": 95} +{"id": "sample_000129", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000129.wav", "text": "لازم نلقى المفتاح المخفي علشان ندخل القبر.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 3.64, "num_tokens": 91} +{"id": "sample_000203", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000203.wav", "text": "خيط الشماغ للعريس، خلي الأطراف زينة.", "language_id": "ar", "instruct": "saudi, conversational, 
creative", "audio_duration": 3.8, "num_tokens": 95} +{"id": "sample_000219", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000219.wav", "text": "امش بالليل عشان تتفادى حرارة الصحرا.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.16, "num_tokens": 79} +{"id": "sample_000105", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000105.wav", "text": "تجهيز شنط الحج مثل حل لعبة المكعبات بحبات السبحة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.56, "num_tokens": 139} +{"id": "sample_000131", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000131.wav", "text": "لازم نزين الخيمة للاحتفال الكبير!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.24, "num_tokens": 81} diff --git a/sync_data/tokens/train/txts/shard-000018.jsonl b/sync_data/tokens/train/txts/shard-000018.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1fb755b18e787e6a98dc4c22478699328f5f5330 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000018.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000350", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000350.wav", "text": "جهّز نفسَك للحج! تأكد من Google Maps وقابلنا عند الوّاحات في ستة وخمسين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.56, "num_tokens": 164} +{"id": "sample_000343", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000343.wav", "text": "شوف ذا النقش القديم! 
كنه لقى كنز مخفي في صفحة مئة واثنين وأربعين من دليل اللعبة.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.16, "num_tokens": 154} +{"id": "sample_000017", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000017.wav", "text": "نقوش الحنا حق جدتي مثل القصيد اللي يسيل.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.4, "num_tokens": 110} +{"id": "sample_000356", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000356.wav", "text": "ارسل الكشافة لجهة الكثبان الشرقية يدورون عن إشارات كمين. لا تنسى تحدث الخريطة في جهاز GPS اللي معك للإصدار اثنان ثلاثة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 9.88, "num_tokens": 247} +{"id": "sample_000235", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000235.wav", "text": "دور القناص قبل ما يضرب مرة ثانية!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 2.88, "num_tokens": 72} +{"id": "sample_000228", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000228.wav", "text": "عجل! 
عبي المدفع قبل ما يضربونا!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.32, "num_tokens": 83} +{"id": "sample_000216", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000216.wav", "text": "اطلع بحذر، أفضل التمر فوق.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.48, "num_tokens": 87} +{"id": "sample_000242", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000242.wav", "text": "ابني الطاحونة في وجه الهوى، لازم تدور بدون مشاكل.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.36, "num_tokens": 134} diff --git a/sync_data/tokens/train/txts/shard-000019.jsonl b/sync_data/tokens/train/txts/shard-000019.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2efe0a6331df6640e503d372e12297f539c2eec9 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000019.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000208", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000208.wav", "text": "طبخ الرز واللحم على نار هادية، الضيوف بيجون قريب.", "language_id": "ar", "instruct": "saudi, conversational, creative", "audio_duration": 4.36, "num_tokens": 109} +{"id": "sample_000297", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000297.wav", "text": "يا زين! جملنا جاب VIP pass للْحَجّ. دق على صفر خمسة صفر واحد اثنان ثلاثة أربعة خمسة ستة سبعة للتفاصيل.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 9.56, "num_tokens": 239} +{"id": "sample_000267", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000267.wav", "text": "السباق يبدأ بعد خمسة دقايق! 
خذ لك سيارة سريعة وتعال لموقف استاد الملك فهد.", "language_id": "ar", "instruct": "saudi, conversational, satirical", "audio_duration": 6.56, "num_tokens": 164} +{"id": "sample_000183", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000183.wav", "text": "الحصان باسمه القوي يشيل فخر القبيلة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.16, "num_tokens": 104} +{"id": "sample_000225", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000225.wav", "text": "عبي السلاح بسرعة! إحنا تحت ضرب النار!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.4, "num_tokens": 85} +{"id": "sample_000139", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000139.wav", "text": "الرموز باهتة، بس معناها قوي.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.36, "num_tokens": 84} +{"id": "sample_000162", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000162.wav", "text": "اليد اللي ثابتة تفوز، مو اليد السريعة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.0, "num_tokens": 100} +{"id": "sample_000272", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000272.wav", "text": "صدى الحكمة القديمة باقي يتردد في هالأطلال المْنسية، كنك تلقى شي زي ليفل خمسة في لعبة strategy.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 9.32, "num_tokens": 233} diff --git a/sync_data/tokens/train/txts/shard-000020.jsonl b/sync_data/tokens/train/txts/shard-000020.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b9e79b8af285bd6e95aeffa52f2edef51df7bc3c --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000020.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000112", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000112.wav", "text": "الهبوب جاي! 
بسرعة، اضرب الخيمة وربط البعارين!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.8, "num_tokens": 120} +{"id": "sample_000155", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000155.wav", "text": "ما نخليهم ياخذون التل. اثبتوا!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.2, "num_tokens": 80} +{"id": "sample_000117", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000117.wav", "text": "امش في الجسر اللي بين أبراج مركز المملكة. شوف الرياض من فوق، يا سلام!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.68, "num_tokens": 142} +{"id": "sample_000255", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000255.wav", "text": "وناسة عالم الشتاء! نجرب اللعبة الدوارة ولا صالة الجليد أول؟", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.68, "num_tokens": 142} +{"id": "sample_000205", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000205.wav", "text": "دربه يرجع بالصيد، الثقة تبني مع الوقت.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.0, "num_tokens": 100} +{"id": "sample_000133", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000133.wav", "text": "ضرب عازف العود على الأوتار، وبدأوا الناس يصفقون.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.04, "num_tokens": 101} +{"id": "sample_000455", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000455.wav", "text": "اذكر يوم تعلمنا عن قبائل Bedouin وتقاليدهم في صفحة أربعة وثلاثين من كتاب History.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 6.64, "num_tokens": 166} +{"id": "sample_000170", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000170.wav", "text": "ما عندنا مؤونة كفاية، لازم ننسحب!", "language_id": 
"ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.56, "num_tokens": 89} diff --git a/sync_data/tokens/train/txts/shard-000021.jsonl b/sync_data/tokens/train/txts/shard-000021.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..39e0734309b8205fbd4f3ee9414be6788c7635e4 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000021.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000460", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000460.wav", "text": "العدو قْرب من الكْثبان. جهز حرس القَصر للمعركة على صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 6.52, "num_tokens": 163} +{"id": "sample_000142", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000142.wav", "text": "الرمل يخبّي أكثر من العظام.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 3.24, "num_tokens": 81} +{"id": "sample_000212", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000212.wav", "text": "الغزلان سريعة، امش على آثارها بحذر.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.72, "num_tokens": 93} +{"id": "sample_000374", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000374.wav", "text": "حيّاكم الله في المهرجان! انبسطوا بسباق الجمال وخذوا تمرات تس مجانًا من booth خمسة ستة.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 7.6, "num_tokens": 190} +{"id": "sample_000459", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000459.wav", "text": "تذكر يوم تسابقنا فوق الكْثبان الرملية في Desert Racer، ندزلب غروب الشمس؟ قابلني عند ستة وخمسين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 8.88, "num_tokens": 222} +{"id": "sample_000140", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000140.wav", "text": "الدلة جاهزة! 
اسكبها للمعازيم.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.48, "num_tokens": 87} +{"id": "sample_000248", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000248.wav", "text": "يبه، ليه صخرة الفيل وكل الصخور ذي أشكالها غريبة كذا؟", "language_id": "ar", "instruct": "saudi, conversational, curious", "audio_duration": 5.24, "num_tokens": 131} +{"id": "sample_000412", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000412.wav", "text": "شفت سباق الجمال عند واحة ستة وخمسين؟ كان رهيب!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.64, "num_tokens": 116} diff --git a/sync_data/tokens/train/txts/shard-000022.jsonl b/sync_data/tokens/train/txts/shard-000022.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5b790d74a2ad426f99d0db48f180d2a329ffcbca --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000022.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000141", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000141.wav", "text": "كل خطوة بهالرحلة تقربك للإيمان.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.64, "num_tokens": 91} +{"id": "sample_000143", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000143.wav", "text": "الصقر حلق، ودانا وجهتنا.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.28, "num_tokens": 82} +{"id": "sample_000328", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000328.wav", "text": "وش عذرتس يا بنت عند ستة وخمسين شارع الواحة الساعه سبعة؟ شفتي سباق الجمل؟", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.8, "num_tokens": 145} +{"id": "sample_000229", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000229.wav", "text": "القنبلة جاية! 
خبّ نفسك!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 2.68, "num_tokens": 67} +{"id": "sample_000396", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000396.wav", "text": "حياكم الله في المجلس الاستراتيجي. فريقنا مِستعد يواجه تحديات جديدة في صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 8.64, "num_tokens": 216} +{"id": "sample_000401", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000401.wav", "text": "السلام عليكم يا زائرٍ نبيل. أنا رفيقتس اللي بيدزلك في هذي الألغاز الغامضة في Level خمسة من اللعبة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 8.68, "num_tokens": 217} +{"id": "sample_000126", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000126.wav", "text": "عشان نوصل للمرحلة الجاية، لازم نعيد ترتيب هالرموز العتيقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 5.64, "num_tokens": 141} +{"id": "sample_000450", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000450.wav", "text": "ليه الجمل ما رضى يتفاهم؟ خايف يقسم له الoasis برقم ستة وخمسين!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 6.6, "num_tokens": 165} diff --git a/sync_data/tokens/train/txts/shard-000023.jsonl b/sync_data/tokens/train/txts/shard-000023.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f8f9b1dc5e60071654d36535fe7fe4b8c69ad8b5 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000023.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000251", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000251.wav", "text": "متأكد إن البالون ذا يشيلنا كلنا؟ أحس إنه يترنح شوي فوق!", "language_id": "ar", "instruct": "saudi, conversational, nervous", "audio_duration": 5.72, "num_tokens": 143} +{"id": "sample_000201", "audio_path": 
"/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000201.wav", "text": "هشّك الرز بالشوكة عشان يطلع خفيف.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.16, "num_tokens": 79} +{"id": "sample_000110", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000110.wav", "text": "هالنقوش على الحجر تبين ساعة الماء القديمة في مدائن صالح. تقدر تشغلها من جديد؟", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 6.88, "num_tokens": 172} +{"id": "sample_000196", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000196.wav", "text": "الأساطير تنولد هنا، في شوارعنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.12, "num_tokens": 78} +{"id": "sample_000091", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000091.wav", "text": "يبه، ليه نلبس ثياب يديدة للعيد؟", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.2, "num_tokens": 80} +{"id": "sample_000337", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000337.wav", "text": "سمعتْ؟ خُطتْ المَلِگ الجديدة سرية مثل رقم الهاتف صفر خمسة صفر واحد اثنان ثلاثة أربعة خمسة ستة سبعة!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 9.68, "num_tokens": 242} +{"id": "sample_000262", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000262.wav", "text": "لازم نوازن بين الاستدامة والتقنية. 
زيد المزارع العمودية في المنطقة خمسة.", "language_id": "ar", "instruct": "saudi, conversational, innovative", "audio_duration": 6.44, "num_tokens": 161} +{"id": "sample_000244", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000244.wav", "text": "تأكد أن كل التوصيلات على الوقت، زحمة المدينة ممكن تسبب تأخير.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 5.68, "num_tokens": 142} diff --git a/sync_data/tokens/train/txts/shard-000024.jsonl b/sync_data/tokens/train/txts/shard-000024.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1a0f7b3e1f2ac82c99bd01f9f13ab4069a4914de --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000024.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000359", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000359.wav", "text": "حذر! ذا الجمل يحب ياكل guidebooks. تلقاه عند ستة وخمسين شارع النجدي بعد العصر.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 8.0, "num_tokens": 200} +{"id": "sample_000261", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000261.wav", "text": "الخط الأزرق زحمة! حوّل الركاب للخط الأخضر عشان يوصلون المول بالوقت!", "language_id": "ar", "instruct": "saudi, conversational, urgent", "audio_duration": 6.32, "num_tokens": 158} +{"id": "sample_000424", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000424.wav", "text": "يا بطل! وش اسمك الملحمي قبل نخلّص العالم؟ حط اسمك هنا وشوف صفحة مئة وواحد.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 7.6, "num_tokens": 190} +{"id": "sample_000264", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000264.wav", "text": "المنطقة الآمنة تضيق! 
روح برج المملكة عشان تاخذ ميزة المكان العالي!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 5.36, "num_tokens": 134} +{"id": "sample_000419", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000419.wav", "text": "تعلمتوا الاستراتيجيات. الحين طبقوها علشان تغزون أراضي الصحرة. شوفوا صفحة مئة واثنين وأربعين في كتاب Game Manual.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 8.88, "num_tokens": 222} +{"id": "sample_000180", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000180.wav", "text": "يالله نلعب! أراهن إني أفوز!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 2.88, "num_tokens": 72} +{"id": "sample_000276", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000276.wav", "text": "سويتِها! الحين الصحراء صارت بأمان. تعالوا نلتقي عند الواحة لاحتفال كبير مع الشلة في ليفل واحد صفر!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 9.08, "num_tokens": 227} +{"id": "sample_000192", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000192.wav", "text": "دوّر السيارة! خلك ملك الشوارع!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.84, "num_tokens": 96} diff --git a/sync_data/tokens/train/txts/shard-000025.jsonl b/sync_data/tokens/train/txts/shard-000025.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a7fbf26e9c0a44945b224eceaeaffbdc146b3b53 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000025.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000138", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000138.wav", "text": "الغنم جاهزة! 
يالله نبدأ العزيمة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.12, "num_tokens": 78} +{"id": "sample_000013", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000013.wav", "text": "اقبض الطارات على السيف وقت العرضة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.08, "num_tokens": 77} +{"id": "sample_000195", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000195.wav", "text": "آخر جولة، ما بقى إلا المحترفين!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.48, "num_tokens": 87} +{"id": "sample_000224", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000224.wav", "text": "النار تدفي قلوبنا، والقصص تدفي الأرواح.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.36, "num_tokens": 109} +{"id": "sample_000135", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000135.wav", "text": "إذا ما قتلنا الزحمة، الحر بيقتلنا!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 3.52, "num_tokens": 88} +{"id": "sample_000446", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000446.wav", "text": "تْذكّر أيام الوَاحَه، وين وُلِدَت Legends في صفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 7.08, "num_tokens": 177} +{"id": "sample_000404", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000404.wav", "text": "مرحبتس في Strategy واحد صفر واحد: كيف تدزلب الجمال وتغلب الكثبان! 
صفحة مئة وثلاثة وعشرون", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 8.88, "num_tokens": 222} +{"id": "sample_000145", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000145.wav", "text": "الخريطة تقول الكنز مدفون تحت الرمال اللي تتحرك.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 4.72, "num_tokens": 118} diff --git a/sync_data/tokens/train/txts/shard-000026.jsonl b/sync_data/tokens/train/txts/shard-000026.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..da051815cbaf7b8045c803177df7a367532860eb --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000026.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000283", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000283.wav", "text": "بسرعة! لازم نلقى الscroll المخفي في المكتبه قبل يرجعون الحراس! المكتبه في دور ثلاثة.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.48, "num_tokens": 162} +{"id": "sample_000097", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000097.wav", "text": "الجليب العتيق يوسوس، 'حط ريال وتمن أمنية!'", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 4.6, "num_tokens": 115} +{"id": "sample_000392", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000392.wav", "text": "تشوف الصقور يطرن فوق الجبل؟ خلنا ننضم معهم في الصفحة مئة واثنان وأربعون من Mountain Quest!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.64, "num_tokens": 166} +{"id": "sample_000233", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000233.wav", "text": "حط الفخ هنا، بننصب لهم كمين مع الفجر.", "language_id": "ar", "instruct": "saudi, conversational, critical", "audio_duration": 3.76, "num_tokens": 94} +{"id": "sample_000366", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000366.wav", "text": "في 
الصحرَهْ، لقيت رسالة قديمة من جدّي على صفحة اثنين وأربعين في كتاب Desert Wisdom.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 8.8, "num_tokens": 220} +{"id": "sample_000438", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000438.wav", "text": "قم اتبع الصقر إلى الواحَهْ باستخدام الmap في صفحة ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.72, "num_tokens": 118} +{"id": "sample_000314", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000314.wav", "text": "حياك الله في مجلس التخطيط. من فضلك اذكر اسمك ودورك. حنا في غرفة رقم اثني عشر.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 6.8, "num_tokens": 170} +{"id": "sample_000293", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000293.wav", "text": "تذكر سوالف جدودنا واحنا واقفين عند بوابة المدينة رقم خمسة ستة جنب برج Kingdom.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 6.6, "num_tokens": 165} diff --git a/sync_data/tokens/train/txts/shard-000027.jsonl b/sync_data/tokens/train/txts/shard-000027.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..037eaf50532e928945c8e42b77bb857606dc31fb --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000027.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000344", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000344.wav", "text": "حياك الله في هالمستوى من Puzzle! 
لا تضيع جملَك في متاهة النفود عند سبعة وستين شارع النبطي.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 7.56, "num_tokens": 189} +{"id": "sample_000158", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000158.wav", "text": "ثبت في مكانك، لا تفقد تركيزك!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.88, "num_tokens": 97} +{"id": "sample_000273", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000273.wav", "text": "يلا، لازم نهرب قبل ما الجمل يزهق! ليفل خمسة يستنانا.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.44, "num_tokens": 136} +{"id": "sample_000239", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000239.wav", "text": "خل القهوة تغلي ببطء، الطعم يصير مضبوط.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.8, "num_tokens": 95} +{"id": "sample_000015", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000015.wav", "text": "شوف! لقيت كتابة قديمة على الجدار ذا!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.32, "num_tokens": 83} +{"id": "sample_000447", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000447.wav", "text": "لو سمحت اشرح لي كيف تعاملت مع موقف صعب في شغلك الأخير في Office خمسة وستين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 7.24, "num_tokens": 181} +{"id": "sample_000006", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000006.wav", "text": "البس نظارة الواقع عشان تشوف خريطة الكنز المخبية!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.6, "num_tokens": 115} +{"id": "sample_000342", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000342.wav", "text": "عرضك ظليل. 
ما أقدر أبيع بأقل من مئتي coins.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.48, "num_tokens": 112} diff --git a/sync_data/tokens/train/txts/shard-000028.jsonl b/sync_data/tokens/train/txts/shard-000028.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d89eae66f751df8acfb0a5fbbb36834b2984a67a --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000028.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000165", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000165.wav", "text": "قريبين نفوز! لا تخففون الضغط!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.44, "num_tokens": 86} +{"id": "sample_000095", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000095.wav", "text": "ياهوه! جا وقت نعلق فوانيس رمضان!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.32, "num_tokens": 108} +{"id": "sample_000191", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000191.wav", "text": "انتبه من الحفر! ما نبي نطيح!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 3.36, "num_tokens": 84} +{"id": "sample_000115", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000115.wav", "text": "النقوش في الحجر تتكلم عن طرق التجارة القديمة. تقدر تفك شفرة كلامها؟", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 5.88, "num_tokens": 147} +{"id": "sample_000103", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000103.wav", "text": "في العيد، تمنيت بعير... 
بس جاني لعبة محشية بداله!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 5.2, "num_tokens": 130} +{"id": "sample_000175", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000175.wav", "text": "طلقة مضبوطه تغير كل شي.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.4, "num_tokens": 85} +{"id": "sample_000230", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000230.wav", "text": "حط المتفجرات وارجع، بنفجر الجسر!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.52, "num_tokens": 88} +{"id": "sample_000198", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000198.wav", "text": "حرك الجريش لين يثقل، وبعدين زيد البهارات.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.84, "num_tokens": 121} diff --git a/sync_data/tokens/train/txts/shard-000029.jsonl b/sync_data/tokens/train/txts/shard-000029.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7cf277b6c9f4e5f51e4af1e745e77d27273c4596 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000029.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000148", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000148.wav", "text": "أسمع شي بالظلام... يمكن الهوى.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 3.64, "num_tokens": 91} +{"id": "sample_000400", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000400.wav", "text": "النصر لنا! 
خلونا نركب الجمال ونحتفل، بس لا تنسى تسجل رقم خمسة ستة شارع النجدي في GPS.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 9.24, "num_tokens": 231} +{"id": "sample_000370", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000370.wav", "text": "قال الملك، 'وشلون الجمل صار في ستة وخمسين شارع Royal؟' يمكن يتفرج على Netflix!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 6.96, "num_tokens": 174} +{"id": "sample_000003", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000003.wav", "text": "حطينا في الدرعية القديمة! يالله بسرعة، عدّل طريقة كلامك لا يحسبونا جنّ الناس هنا!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.48, "num_tokens": 162} +{"id": "sample_000206", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000206.wav", "text": "حكم الخيوط زين، هالسجادة بتحمل قصص أهلنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.64, "num_tokens": 116} +{"id": "sample_000414", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000414.wav", "text": "بسرعة! حل اللغز عشان تفتح الجمل قبل توصلنا القبيلة المنافسة. لا تنسى تسجل الرقم في صفحة ستة وخمسون بالدليل!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 8.24, "num_tokens": 206} +{"id": "sample_000475", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000475.wav", "text": "حياك الله في مجلس الصحارى. مهمتك تبتدي عند الغروب بْتمامَه في ستة وخمسين شارع النجدي.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 8.56, "num_tokens": 214} +{"id": "sample_000326", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000326.wav", "text": "تذكر يوم اللي جالك تحدي وغلبته في مقابلة مع شركة RiyadhTech. 
رقم الصفحة مئة واثنان وأربعون.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 8.32, "num_tokens": 208} diff --git a/sync_data/tokens/train/txts/shard-000030.jsonl b/sync_data/tokens/train/txts/shard-000030.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..07e30946093766b89e88a0914614d2150dcfc6fd --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000030.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000173", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000173.wav", "text": "طلقة مضبوطة يعني فوز مضبوط.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.52, "num_tokens": 88} +{"id": "sample_000351", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000351.wav", "text": "حياكم الله في سوق التجارة! شوفوا السرج الخاص بالجمل عندنا بسعر مئة وتسعة وتسعين ريال!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 7.56, "num_tokens": 189} +{"id": "sample_000182", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000182.wav", "text": "الجمل ذا سلالته صافية، يستاهل كل ريال.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.12, "num_tokens": 103} +{"id": "sample_000259", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000259.wav", "text": "الشركة حقتنا في البلوكتشين محتاجة استثمار أكثر. 
يلا نعرض على شركات التمويل الجريء السعودية!", "language_id": "ar", "instruct": "saudi, conversational, ambitious", "audio_duration": 7.12, "num_tokens": 178} +{"id": "sample_000108", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000108.wav", "text": "اللفافة العتيقة تقول: 'الكنز في المكان اللي ظل أطول منارة يبوس أقدم بير وقت العصر.'", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 8.6, "num_tokens": 215} +{"id": "sample_000009", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000009.wav", "text": "طور شريحة السرعة في جملك الروبوت للسباق!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 4.24, "num_tokens": 106} +{"id": "sample_000163", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000163.wav", "text": "لا تطلقون النار لين يقربون!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 2.68, "num_tokens": 67} +{"id": "sample_000387", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000387.wav", "text": "مبروك! الحين وصلت لآخر level، يلا نحتفل في ستة وخمسين شارع.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.24, "num_tokens": 156} diff --git a/sync_data/tokens/train/txts/shard-000031.jsonl b/sync_data/tokens/train/txts/shard-000031.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..423dcc1ce7a4d3433167f96968955b6911593e11 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000031.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000256", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000256.wav", "text": "ياخي، اللعبة ذي بالنظارات قوية! 
حسيت إني أطير فوق الرياض صدق!", "language_id": "ar", "instruct": "saudi, conversational, thrilled", "audio_duration": 6.52, "num_tokens": 163} +{"id": "sample_000199", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000199.wav", "text": "افرد العجينة وزيد الصلصة بترتيب.", "language_id": "ar", "instruct": "saudi, conversational, creative", "audio_duration": 3.48, "num_tokens": 87} +{"id": "sample_000018", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000018.wav", "text": "النجوم تدلنا في الرمل اللي ما له نهاية.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.84, "num_tokens": 96} +{"id": "sample_000157", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000157.wav", "text": "إذا سجلنا الحين، المشروبات علي!", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 3.0, "num_tokens": 75} +{"id": "sample_000338", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000338.wav", "text": "تتذگر طعم القهوة مع التمر بالمجلس؟ أظن إنه كان في صالة رقم اثنان ثلاثة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 7.04, "num_tokens": 176} +{"id": "sample_000109", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000109.wav", "text": "بالعيد، نبدا بزيارة الكبار، وبعدين الصغار. 
كذا نكرم الحكمة والبراءة مع بعض.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 7.04, "num_tokens": 176} +{"id": "sample_000218", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000218.wav", "text": "النقوش هذي تحكي قصة سقوط المدينة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.4, "num_tokens": 85} +{"id": "sample_000156", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000156.wav", "text": "قريب توصل، باقي لك لفة!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 2.84, "num_tokens": 71} diff --git a/sync_data/tokens/train/txts/shard-000032.jsonl b/sync_data/tokens/train/txts/shard-000032.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..46abe43b5831596dd75a36870c751c5a52449c12 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000032.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000241", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000241.wav", "text": "الهوى قوي الليلة، ثبت الخيمة زين في الأرض.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.24, "num_tokens": 106} +{"id": "sample_000341", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000341.wav", "text": "تْفَكَّر فِي الرِّحْلَة عبر الصّحْرَا لِـ Mecca. تذَكَّر سَوالِف أَجْدَادْنَا اللي قالوها في صفحة مئة واثنين وأربعين.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 8.28, "num_tokens": 207} +{"id": "sample_000093", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000093.wav", "text": "ياسلام! 
هالفلس العتيق يضبط بالضبط في يد التمثال!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.84, "num_tokens": 121} +{"id": "sample_000384", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000384.wav", "text": "لازم تسوي سعر أحسن لهالبضاعة، ولا بتخاطر تخسر الصفقة. شوف الشروط بصفحة ستة وخمسين من كتيب Steam.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 8.48, "num_tokens": 212} +{"id": "sample_000100", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000100.wav", "text": "رمول الزمان خشت المفتاح؛ ما يشوفه إلا عين الصقر.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 4.84, "num_tokens": 121} +{"id": "sample_000121", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000121.wav", "text": "البطولة لنا! تعب فريقنا ما راح على الفاضي!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 4.12, "num_tokens": 103} +{"id": "sample_000190", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000190.wav", "text": "حط القذائف مضبوط، لو غلطنا نضيع!", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.72, "num_tokens": 93} +{"id": "sample_000318", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000318.wav", "text": "شفت شي غريب حول الواحة القديمه ليلة أمس؟ هذا مكتوب في سجل رقم خمسة ستة.", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 6.08, "num_tokens": 152} diff --git a/sync_data/tokens/train/txts/shard-000033.jsonl b/sync_data/tokens/train/txts/shard-000033.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3150ae595ccf1439a33fadbc81cc51110a37a000 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000033.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000240", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000240.wav", "text": "ارفع 
السيف فوق، العرضة بتبدأ قريب!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.6, "num_tokens": 90} +{"id": "sample_000181", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000181.wav", "text": "لازم نلقى لنا ملجأ قبل ما تجي العاصفة.", "language_id": "ar", "instruct": "saudi, conversational, mysterious", "audio_duration": 3.28, "num_tokens": 82} +{"id": "sample_000210", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000210.wav", "text": "السعر غالي، خلنا نتفق على شي ينفعنا اثنيننا.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 4.2, "num_tokens": 105} +{"id": "sample_000435", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000435.wav", "text": "بسرعة، علمني، وش هي عاصمة أستراليا؟ تلميحة: شوفي صفحة مئة واثنين وأربعين في الڤايد!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 7.32, "num_tokens": 183} +{"id": "sample_000005", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000005.wav", "text": "الرمل يشبه كثبان الفضا من فوق!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.12, "num_tokens": 78} +{"id": "sample_000101", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000101.wav", "text": "تجهيز شنطة الحج مثل حل بازل مقدس!", "language_id": "ar", "instruct": "saudi, conversational, excited", "audio_duration": 3.96, "num_tokens": 99} +{"id": "sample_000161", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000161.wav", "text": "كل خطوة قدام تقربنا من النصر.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.32, "num_tokens": 83} +{"id": "sample_000254", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000254.wav", "text": "عشان نفتح الدور الجاي، لازم نحل اللغز الاقتصادي ذا. 
فكر مثل المستثمر!", "language_id": "ar", "instruct": "saudi, conversational, challenging", "audio_duration": 6.68, "num_tokens": 167} diff --git a/sync_data/tokens/train/txts/shard-000034.jsonl b/sync_data/tokens/train/txts/shard-000034.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7dab3246cfc9edd10b0f895d49a0d37e1fcfd5c5 --- /dev/null +++ b/sync_data/tokens/train/txts/shard-000034.jsonl @@ -0,0 +1,8 @@ +{"id": "sample_000209", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000209.wav", "text": "قل القصيدة الحربية، تذكرنا بشجاعة أجدادنا.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 4.44, "num_tokens": 111} +{"id": "sample_000253", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000253.wav", "text": "الشاورما خلصت! دق على المورد واطلب دجاج زيادة، بسرعة!", "language_id": "ar", "instruct": "saudi, conversational, busy", "audio_duration": 4.68, "num_tokens": 117} +{"id": "sample_000373", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000373.wav", "text": "دور على المفتاح المخفي عشان تفتح الباب، وإلا بتظل هنا للأبد! ترى أقرب phone في الدور ستة وخمسون.", "language_id": "ar", "instruct": "saudi, conversational, humorous", "audio_duration": 9.04, "num_tokens": 226} +{"id": "sample_000221", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000221.wav", "text": "الصحراء تعلم الصبر، مثل ما يقولون البدو.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.8, "num_tokens": 95} +{"id": "sample_000346", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000346.wav", "text": "هلا بك في الفريق، يا القايد. قاعدتك على ستة وخمسين شارع النجدي. 
جهّز استراتيجيتك.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 7.52, "num_tokens": 188} +{"id": "sample_000164", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000164.wav", "text": "نكسر خط دفاعهم هنا، نفوز بالمعركة.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 3.72, "num_tokens": 93} +{"id": "sample_000454", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000454.wav", "text": "أذكر الأيام الزينات في الصحرا تحت ضو الگمر. تشبه Level خمسة باللعبة في صفحة مئة وثلاثة وعشرون.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 10.0, "num_tokens": 250} +{"id": "sample_000330", "audio_path": "/home/riftuser/OmniVoice/sync_data/data/wavs/sample_000330.wav", "text": "أبغى منك تنظم تجمع العائلة في ستة وخمسين شارع النجدي. لا تنسى ترسل الدعوة بواتساب.", "language_id": "ar", "instruct": "saudi, conversational, serious", "audio_duration": 7.44, "num_tokens": 186} diff --git a/training_data/sync_data/tokens/dev/txts/shard-000012.jsonl b/training_data/sync_data/tokens/dev/txts/shard-000012.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b26ca14951bd5ea2c214b1fee206edcbaec24d28 --- /dev/null +++ b/training_data/sync_data/tokens/dev/txts/shard-000012.jsonl @@ -0,0 +1 @@ +{"id": "sample_000186", "audio_path": "/home/riftuser/OmniVoice/sync_data/wavs/sample_000186.wav", "text": "قبل السباق، نعطيهم تمر للطاقة.", "language_id": "ar", "instruct": "saudi, conversational, reflective", "audio_duration": 3.16, "num_tokens": 79}