File size: 9,427 Bytes

a4d9876

#!/bin/bash

# Evaluate OmniVoice models on TTS benchmarks.

# Stage 1: Download the test sets and evaluation models.
# Stage 2: LibriSpeech-PC
# Stage 3: seedtts_en
# Stage 4: seedtts_zh
# Stage 5: minimax
# Stage 6: fleurs

# Abort on any failing command, on use of an unset variable, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Specify the stages to run by setting the `stage` and `stop_stage` variables.
stage=1
stop_stage=6

# Available GPUs for evaluation. Adjust this according to your setup.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"

# Specify the checkpoint to evaluate.
CHECKPOINT=k2-fsa/OmniVoice
emilia_checkpoint=false

# CHECKPOINT=k2-fsa/OmniVoice
# emilia_checkpoint=true

# For the OmniVoice-Emilia checkpoint, we set denoise to False and lang_id to
# None, as the model is trained without prompt denoising or language id.
#
# NOTE: infer_options is one whitespace-separated string of flags. The stages
# below expand it unquoted on purpose so that it splits into separate
# arguments; do not put values containing spaces or glob characters in it.
if [ "${emilia_checkpoint}" = true ]; then
    infer_options="--preprocess_prompt False \
        --postprocess_output False \
        --batch_duration 600 \
        --denoise False \
        --lang_id None \
        --audio_chunk_threshold 1000"
else
    infer_options="--preprocess_prompt False \
        --postprocess_output False \
        --batch_duration 600 \
        --audio_chunk_threshold 1000"
fi

# Prepend the parent of this script's directory to PYTHONPATH.
# The path is resolved in a plain assignment first: `export VAR="$(cmd)"`
# always returns the status of `export`, so a failing `cd` would go unnoticed
# by `set -e` and PYTHONPATH would silently start with an empty entry.
repo_root="$(cd "$(dirname "$0")/.." && pwd)"
export PYTHONPATH="${repo_root}:${PYTHONPATH:-}"

# Locations (relative to the current working directory) of the downloaded
# evaluation models and test sets.
download_dir="download"
TTS_EVAL_MODEL_DIR="${download_dir}/tts_eval_models/"
TTS_EVAL_DATA_DIR="${download_dir}/tts_eval_datasets/"

# Map a test-set name to the path of its test.jsonl under TTS_EVAL_DATA_DIR.
#
# Globals:   TTS_EVAL_DATA_DIR (read)
# Arguments: $1 - test name: librispeech_pc, seedtts_en, seedtts_zh, minimax
#                 or fleurs
# Outputs:   the jsonl path on stdout. For an unknown (or missing) name, an
#            empty line on stdout and a diagnostic on stderr.
# Returns:   0 on success, 1 if the test name is not recognised.
get_test_list() {
    # ${1:-} keeps a missing argument from tripping `set -u`; it is reported
    # through the normal "unknown test name" path instead.
    local test_name="${1:-}"
    # Drop one trailing slash so the joined path does not contain "//".
    local data_dir="${TTS_EVAL_DATA_DIR%/}"
    local list_file

    case "${test_name}" in
        librispeech_pc) list_file="librispeech_pc_test_clean.jsonl" ;;
        seedtts_en)     list_file="seedtts_test_en.jsonl" ;;
        seedtts_zh)     list_file="seedtts_test_zh.jsonl" ;;
        minimax)        list_file="minimax_multilingual_24.jsonl" ;;
        fleurs)         list_file="fleurs_multilingual_102.jsonl" ;;
        *)
            # Callers capture stdout, so the explanation must go to stderr;
            # without it the script would stop under `set -e` with no message.
            echo "get_test_list: unknown test name '${test_name}'" >&2
            echo ""
            return 1
            ;;
    esac

    echo "${data_dir}/${list_file}"
}

# ============================================================
# Stage 1: Prepare the test sets and evaluation models
# ============================================================

# Runs only when stage 1 lies within [stage, stop_stage].
# Every expansion is quoted so that a download directory or repo name
# containing spaces or glob characters is passed through as a single argument.
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "Stage 1: Download test sets and evaluation models"

    hf_repo=k2-fsa/TTS_eval_datasets
    mkdir -p "${TTS_EVAL_DATA_DIR}/"

    # Test lists: plain jsonl files, downloaded one by one.
    for file in \
        librispeech_pc_test_clean.jsonl \
        librispeech_pc_test_clean_transcript.jsonl \
        seedtts_test_en.jsonl \
        seedtts_test_zh.jsonl \
        minimax_multilingual_24.jsonl \
        fleurs_multilingual_102.jsonl; do
        echo "Downloading ${file}..."
        huggingface-cli download \
                --repo-type dataset \
                --local-dir "${TTS_EVAL_DATA_DIR}/" \
                "${hf_repo}" \
                "${file}"
    done

    # Test-set archives: each one is downloaded and then unpacked in place
    # into the dataset directory.
    for file in \
        librispeech_pc_testset.tar.gz \
        seedtts_testset.tar.gz \
        minimax_multilingual_24.tar.gz \
        fleurs_multilingual_102.tar.gz; do
        echo "Downloading ${file}..."
        huggingface-cli download \
                --repo-type dataset \
                --local-dir "${TTS_EVAL_DATA_DIR}/" \
                "${hf_repo}" \
                "${file}"

        echo "Extracting ${file}..."
        tar -xzf "${TTS_EVAL_DATA_DIR}/${file}" -C "${TTS_EVAL_DATA_DIR}/"
    done

    # The evaluation-model repo is fetched as a whole (no file argument).
    echo "Download all evaluation models"
    hf_repo=k2-fsa/TTS_eval_models
    mkdir -p "${TTS_EVAL_MODEL_DIR}"
    huggingface-cli download \
        --local-dir "${TTS_EVAL_MODEL_DIR}" \
        "${hf_repo}"
fi

# ============================================================
# Stage 2: Evaluation on LibriSpeech-PC
# ============================================================


# Runs only when stage 2 lies within [stage, stop_stage].
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "Stage 2: Evaluation on LibriSpeech-PC"
    wav_path="results/librispeech_pc"
    test_jsonl="$(get_test_list librispeech_pc)"
    transcript_jsonl="${TTS_EVAL_DATA_DIR}/librispeech_pc_test_clean_transcript.jsonl"

    # Batch inference with the selected checkpoint; --res_dir is wav_path.
    # infer_options is a whitespace-separated flag string and must be left
    # unquoted so that it splits into individual arguments.
    # shellcheck disable=SC2086
    python -m omnivoice.cli.infer_batch \
        --model "${CHECKPOINT}" \
        --test_list "${test_jsonl}" \
        --res_dir "${wav_path}" ${infer_options}

    # Run the three evaluation modules in a fixed order (sim, wer, mos).
    # Each gets "${wav_path}.<metric>.log" as its --decode-path. Only the WER
    # module is given the transcript list; the other two use the test list.
    for metric in sim wer mos; do
        case "${metric}" in
            sim)
                eval_module=omnivoice.eval.speaker_similarity.sim
                eval_list="${test_jsonl}"
                ;;
            wer)
                eval_module=omnivoice.eval.wer.hubert
                eval_list="${transcript_jsonl}"
                ;;
            mos)
                eval_module=omnivoice.eval.mos.utmos
                eval_list="${test_jsonl}"
                ;;
        esac

        python -m "${eval_module}" \
            --wav-path "${wav_path}" \
            --test-list "${eval_list}" \
            --decode-path "${wav_path}.${metric}.log" \
            --model-dir "${TTS_EVAL_MODEL_DIR}"
    done
fi


# ============================================================
# Stage 3: Evaluation on Seed-TTS en
# ============================================================

# Runs only when stage 3 lies within [stage, stop_stage].
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "Stage 3: Evaluation on Seed-TTS en"
    wav_path="results/seedtts_en"
    test_jsonl="$(get_test_list seedtts_en)"

    # Batch inference with the selected checkpoint; --res_dir is wav_path.
    # infer_options is a whitespace-separated flag string and must be left
    # unquoted so that it splits into individual arguments.
    # shellcheck disable=SC2086
    python -m omnivoice.cli.infer_batch \
        --model "${CHECKPOINT}" \
        --test_list "${test_jsonl}" \
        --res_dir "${wav_path}"  ${infer_options}

    # Arguments shared by every evaluation module below, kept in arrays so
    # each value stays a single word.
    eval_inputs=(--wav-path "${wav_path}" --test-list "${test_jsonl}")
    eval_models=(--model-dir "${TTS_EVAL_MODEL_DIR}")

    python -m omnivoice.eval.speaker_similarity.sim \
        "${eval_inputs[@]}" \
        --decode-path "${wav_path}.sim.log" \
        "${eval_models[@]}"

    # The Seed-TTS WER module additionally takes the language.
    python -m omnivoice.eval.wer.seedtts \
        "${eval_inputs[@]}" \
        --decode-path "${wav_path}.wer.log" \
        "${eval_models[@]}" \
        --lang en

    python -m omnivoice.eval.mos.utmos \
        "${eval_inputs[@]}" \
        --decode-path "${wav_path}.mos.log" \
        "${eval_models[@]}"
fi


# ============================================================
# Stage 4: Evaluation on Seed-TTS zh
# ============================================================

# Runs only when stage 4 lies within [stage, stop_stage].
if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
    echo "Stage 4: Evaluation on Seed-TTS zh"
    wav_path="results/seedtts_zh"
    test_jsonl="$(get_test_list seedtts_zh)"

    # Batch inference with the selected checkpoint; --res_dir is wav_path.
    # infer_options is a whitespace-separated flag string and must be left
    # unquoted so that it splits into individual arguments.
    # shellcheck disable=SC2086
    python -m omnivoice.cli.infer_batch \
        --model "${CHECKPOINT}" \
        --test_list "${test_jsonl}" \
        --res_dir "${wav_path}"  ${infer_options}

    # Arguments shared by every evaluation module below, kept in arrays so
    # each value stays a single word.
    eval_inputs=(--wav-path "${wav_path}" --test-list "${test_jsonl}")
    eval_models=(--model-dir "${TTS_EVAL_MODEL_DIR}")

    python -m omnivoice.eval.speaker_similarity.sim \
        "${eval_inputs[@]}" \
        --decode-path "${wav_path}.sim.log" \
        "${eval_models[@]}"

    # The Seed-TTS WER module additionally takes the language.
    python -m omnivoice.eval.wer.seedtts \
        "${eval_inputs[@]}" \
        --decode-path "${wav_path}.wer.log" \
        "${eval_models[@]}" \
        --lang zh

    python -m omnivoice.eval.mos.utmos \
        "${eval_inputs[@]}" \
        --decode-path "${wav_path}.mos.log" \
        "${eval_models[@]}"
fi



# ============================================================
# Stage 5: Evaluation on MiniMax multilingual
# ============================================================

# Runs only when stage 5 lies within [stage, stop_stage].
if [ "${stage}" -le 5 ] && [ "${stop_stage}" -ge 5 ]; then
    echo "Stage 5: Evaluation on MiniMax multilingual"
    wav_path="results/minimax"
    test_jsonl="$(get_test_list minimax)"

    # Batch inference with the selected checkpoint; --res_dir is wav_path.
    # infer_options is a whitespace-separated flag string and must be left
    # unquoted so that it splits into individual arguments.
    # shellcheck disable=SC2086
    python -m omnivoice.cli.infer_batch \
        --model "${CHECKPOINT}" \
        --test_list "${test_jsonl}" \
        --res_dir "${wav_path}"  ${infer_options}

    # Each spec is "<metric>:<python module>". The metric names the log file
    # passed as --decode-path. The modules run in the listed order.
    for eval_spec in \
        "sim:omnivoice.eval.speaker_similarity.sim" \
        "wer:omnivoice.eval.wer.minimax"; do
        metric="${eval_spec%%:*}"
        eval_module="${eval_spec#*:}"

        python -m "${eval_module}" \
            --wav-path "${wav_path}" \
            --test-list "${test_jsonl}" \
            --decode-path "${wav_path}.${metric}.log" \
            --model-dir "${TTS_EVAL_MODEL_DIR}"
    done
fi


# ============================================================
# Stage 6: Evaluation on FLEURS multilingual
# ============================================================

# Runs only when stage 6 lies within [stage, stop_stage].
if [ "${stage}" -le 6 ] && [ "${stop_stage}" -ge 6 ]; then
    echo "Stage 6: Evaluation on FLEURS multilingual"
    wav_path="results/fleurs"
    test_jsonl="$(get_test_list fleurs)"

    # Batch inference with the selected checkpoint; --res_dir is wav_path.
    # infer_options is a whitespace-separated flag string and must be left
    # unquoted so that it splits into individual arguments.
    # shellcheck disable=SC2086
    python -m omnivoice.cli.infer_batch \
        --model "${CHECKPOINT}" \
        --test_list "${test_jsonl}" \
        --res_dir "${wav_path}"  ${infer_options}


    python -m omnivoice.eval.speaker_similarity.sim \
        --wav-path "${wav_path}" \
        --test-list "${test_jsonl}" \
        --decode-path "${wav_path}.sim.log" \
        --model-dir "${TTS_EVAL_MODEL_DIR}"

    # Evaluation on FLEURS requires omnilingual-asr, which has dependencies that
    # conflict with other packages (at least the transformers package) in our project.

    # To evaluate on FLEURS, we suggest that users set up a separate virtual
    # environment in which to install omnilingual-asr. Installation
    # instructions can be found at
    # https://github.com/facebookresearch/omnilingual-asr

    # Resolve fleurs.py from this script's location (the same root that
    # PYTHONPATH is built from) rather than from the caller's working
    # directory, and quote the path so directories with spaces work.
    repo_root="$(cd "$(dirname "$0")/.." && pwd)"
    python "${repo_root}/omnivoice/eval/wer/fleurs.py" \
        --wav-path "${wav_path}" \
        --test-list "${test_jsonl}" \
        --decode-path "${wav_path}.wer.log" \
        --model-card omniASR_LLM_Unlimited_7B_v2 \
        --chunk-size 100 \
        --batch-size 50
fi