Add files using upload-large-folder tool

a4d9876 verified 26 days ago

9.43 kB

	#!/bin/bash

	# Evaluate OmniVoice models on TTS benchmarks.

	# Stage 1: Download the test sets and evaluation models.
	# Stage 2: LibriSpeech-PC
	# Stage 3: seedtts_en
	# Stage 4: seedtts_zh
	# Stage 5: minimax
	# Stage 6: fleurs

	# Abort on the first failing command, on use of an unset variable, and on a
	# failure in any stage of a pipeline.
	set -euo pipefail

	# Specify the stages to run by setting the `stage` and `stop_stage` variables.
	stage=1
	stop_stage=6

	# Available GPUs for evaluation. Adjust this according to your setup.
	# Note: this assignment replaces any CUDA_VISIBLE_DEVICES inherited from the
	# calling environment.
	export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"

	# Specify the checkpoint to evaluate.
	CHECKPOINT=k2-fsa/OmniVoice
	emilia_checkpoint=false

	# NOTE(review): the commented-out alternative below repeats the same
	# checkpoint id as above; presumably it should name the OmniVoice-Emilia
	# checkpoint -- confirm before enabling it.
	# CHECKPOINT=k2-fsa/OmniVoice
	# emilia_checkpoint=true

	# Extra flags passed to omnivoice.cli.infer_batch by every evaluation stage.
	# They are kept in one space-separated string because the stages expand
	# ${infer_options} unquoted and rely on word splitting.
	infer_options="--preprocess_prompt False"
	infer_options+=" --postprocess_output False"
	infer_options+=" --batch_duration 600"
	if [[ "${emilia_checkpoint}" == true ]]; then
	  # For the OmniVoice-Emilia checkpoint, denoise is set to False and lang_id
	  # to None, because that model is trained without prompt denoising or
	  # language id.
	  infer_options+=" --denoise False"
	  infer_options+=" --lang_id None"
	fi
	infer_options+=" --audio_chunk_threshold 1000"

	# Put the parent of this script's directory on PYTHONPATH so that
	# `python -m omnivoice...` resolves. The directory is computed in its own
	# assignment so that a failing `cd` aborts the script instead of being masked
	# by `export`, and the ":" separator is only added when PYTHONPATH already has
	# a value (a trailing ":" would add an empty path entry).
	repo_root="$(cd "$(dirname -- "$0")/.." && pwd)"
	export PYTHONPATH="${repo_root}${PYTHONPATH:+:${PYTHONPATH}}"

	# Download locations for the evaluation models and test sets (relative to the
	# current working directory).
	download_dir="download"
	TTS_EVAL_MODEL_DIR="${download_dir}/tts_eval_models/"
	TTS_EVAL_DATA_DIR="${download_dir}/tts_eval_datasets/"

	#######################################
	# Map a test-set name to the path of its test-list JSONL file.
	# Globals:   TTS_EVAL_DATA_DIR (read)
	# Arguments: $1 - test name: librispeech_pc, seedtts_en, seedtts_zh,
	#                 minimax or fleurs
	# Outputs:   the path on stdout; for an unknown or missing name, an empty
	#            line on stdout and a diagnostic on stderr
	# Returns:   0 on success, 1 for an unknown or missing name
	#######################################
	get_test_list() {
	  local test_name="${1:-}"
	  # Drop one trailing slash so the joined path does not contain "//".
	  local data_dir="${TTS_EVAL_DATA_DIR%/}"

	  case "${test_name}" in
	    librispeech_pc) echo "${data_dir}/librispeech_pc_test_clean.jsonl" ;;
	    seedtts_en) echo "${data_dir}/seedtts_test_en.jsonl" ;;
	    seedtts_zh) echo "${data_dir}/seedtts_test_zh.jsonl" ;;
	    minimax) echo "${data_dir}/minimax_multilingual_24.jsonl" ;;
	    fleurs) echo "${data_dir}/fleurs_multilingual_102.jsonl" ;;
	    *)
	      # Callers capture stdout under `set -e`; without this message the
	      # script would exit with no hint about what went wrong.
	      echo "get_test_list: unknown test name '${test_name}'" >&2
	      echo ""
	      return 1
	      ;;
	  esac
	}

	# ============================================================
	# Stage 1: Prepare the test sets and evaluation models
	# ============================================================

	# Stage 1 downloads the test lists, the test-set archives (extracting each
	# one after download) and the evaluation models from the Hugging Face Hub.
	# Every expansion is quoted so paths with spaces or glob characters survive.
	if [[ "${stage}" -le 1 && "${stop_stage}" -ge 1 ]]; then
	  echo "Stage 1: Download test sets and evaluation models"

	  # Drop one trailing slash so joined paths do not contain "//".
	  data_dir="${TTS_EVAL_DATA_DIR%/}"

	  hf_repo=k2-fsa/TTS_eval_datasets
	  mkdir -p -- "${data_dir}"

	  # Test lists, plus the LibriSpeech-PC transcript list.
	  for file in \
	    librispeech_pc_test_clean.jsonl \
	    librispeech_pc_test_clean_transcript.jsonl \
	    seedtts_test_en.jsonl \
	    seedtts_test_zh.jsonl \
	    minimax_multilingual_24.jsonl \
	    fleurs_multilingual_102.jsonl; do
	    echo "Downloading ${file}..."
	    huggingface-cli download \
	      --repo-type dataset \
	      --local-dir "${data_dir}" \
	      "${hf_repo}" \
	      "${file}"
	  done

	  # Test-set archives: each one is unpacked into the data directory right
	  # after it has been downloaded.
	  for file in \
	    librispeech_pc_testset.tar.gz \
	    seedtts_testset.tar.gz \
	    minimax_multilingual_24.tar.gz \
	    fleurs_multilingual_102.tar.gz; do
	    echo "Downloading ${file}..."
	    huggingface-cli download \
	      --repo-type dataset \
	      --local-dir "${data_dir}" \
	      "${hf_repo}" \
	      "${file}"

	    echo "Extracting ${file}..."
	    tar -xzf "${data_dir}/${file}" -C "${data_dir}"
	  done

	  echo "Download all evaluation models"
	  hf_repo=k2-fsa/TTS_eval_models
	  mkdir -p -- "${TTS_EVAL_MODEL_DIR}"
	  # No file name is given here, so the whole repository is requested.
	  huggingface-cli download \
	    --local-dir "${TTS_EVAL_MODEL_DIR}" \
	    "${hf_repo}"
	fi

	# ============================================================
	# Stage 2: Evaluation on LibriSpeech-PC
	# ============================================================


	# Stage 2 runs batch inference on the LibriSpeech-PC test list, then runs the
	# speaker_similarity, WER (hubert) and MOS (utmos) evaluation modules on the
	# generated audio. Each evaluation writes to "${wav_path}.<metric>.log".
	if [[ "${stage}" -le 2 && "${stop_stage}" -ge 2 ]]; then
	  echo "Stage 2: Evaluation on LibriSpeech-PC"
	  wav_path="results/librispeech_pc"
	  test_jsonl="$(get_test_list librispeech_pc)"
	  transcript_jsonl="${TTS_EVAL_DATA_DIR%/}/librispeech_pc_test_clean_transcript.jsonl"

	  # Fail fast: both lists are downloaded by stage 1, and the transcript list
	  # is only read after inference has already finished.
	  for required in "${test_jsonl}" "${transcript_jsonl}"; do
	    if [[ ! -f "${required}" ]]; then
	      echo "Stage 2: missing ${required}; run stage 1 first" >&2
	      exit 1
	    fi
	  done

	  # infer_options is a space-separated flag string, so it is expanded
	  # unquoted on purpose to split it into separate arguments.
	  # shellcheck disable=SC2086
	  python -m omnivoice.cli.infer_batch \
	    --model "${CHECKPOINT}" \
	    --test_list "${test_jsonl}" \
	    --res_dir "${wav_path}" ${infer_options}

	  python -m omnivoice.eval.speaker_similarity.sim \
	    --wav-path "${wav_path}" \
	    --test-list "${test_jsonl}" \
	    --decode-path "${wav_path}.sim.log" \
	    --model-dir "${TTS_EVAL_MODEL_DIR}"

	  # The WER step uses the transcript list rather than the inference list.
	  python -m omnivoice.eval.wer.hubert \
	    --wav-path "${wav_path}" \
	    --test-list "${transcript_jsonl}" \
	    --decode-path "${wav_path}.wer.log" \
	    --model-dir "${TTS_EVAL_MODEL_DIR}"

	  python -m omnivoice.eval.mos.utmos \
	    --wav-path "${wav_path}" \
	    --test-list "${test_jsonl}" \
	    --decode-path "${wav_path}.mos.log" \
	    --model-dir "${TTS_EVAL_MODEL_DIR}"
	fi


	# ============================================================
	# Stage 3: Evaluation on Seed-TTS en
	# ============================================================

	# Stage 3 runs batch inference on the Seed-TTS English test list, then runs
	# the speaker_similarity, WER (seedtts, --lang en) and MOS (utmos) evaluation
	# modules on the generated audio. Each evaluation writes to
	# "${wav_path}.<metric>.log".
	if [[ "${stage}" -le 3 && "${stop_stage}" -ge 3 ]]; then
	  echo "Stage 3: Evaluation on Seed-TTS en"
	  wav_path="results/seedtts_en"
	  test_jsonl="$(get_test_list seedtts_en)"

	  # Fail fast: the test list is downloaded by stage 1.
	  if [[ ! -f "${test_jsonl}" ]]; then
	    echo "Stage 3: missing ${test_jsonl}; run stage 1 first" >&2
	    exit 1
	  fi

	  # infer_options is a space-separated flag string, so it is expanded
	  # unquoted on purpose to split it into separate arguments.
	  # shellcheck disable=SC2086
	  python -m omnivoice.cli.infer_batch \
	    --model "${CHECKPOINT}" \
	    --test_list "${test_jsonl}" \
	    --res_dir "${wav_path}" ${infer_options}

	  python -m omnivoice.eval.speaker_similarity.sim \
	    --wav-path "${wav_path}" \
	    --test-list "${test_jsonl}" \
	    --decode-path "${wav_path}.sim.log" \
	    --model-dir "${TTS_EVAL_MODEL_DIR}"

	  python -m omnivoice.eval.wer.seedtts \
	    --wav-path "${wav_path}" \
	    --test-list "${test_jsonl}" \
	    --decode-path "${wav_path}.wer.log" \
	    --model-dir "${TTS_EVAL_MODEL_DIR}" \
	    --lang en

	  python -m omnivoice.eval.mos.utmos \
	    --wav-path "${wav_path}" \
	    --test-list "${test_jsonl}" \
	    --decode-path "${wav_path}.mos.log" \
	    --model-dir "${TTS_EVAL_MODEL_DIR}"
	fi


	# ============================================================
	# Stage 4: Evaluation on Seed-TTS zh
	# ============================================================

	# Stage 4 runs batch inference on the Seed-TTS Chinese test list, then runs
	# the speaker_similarity, WER (seedtts, --lang zh) and MOS (utmos) evaluation
	# modules on the generated audio. Each evaluation writes to
	# "${wav_path}.<metric>.log".
	if [[ "${stage}" -le 4 && "${stop_stage}" -ge 4 ]]; then
	  echo "Stage 4: Evaluation on Seed-TTS zh"
	  wav_path="results/seedtts_zh"
	  test_jsonl="$(get_test_list seedtts_zh)"

	  # Fail fast: the test list is downloaded by stage 1.
	  if [[ ! -f "${test_jsonl}" ]]; then
	    echo "Stage 4: missing ${test_jsonl}; run stage 1 first" >&2
	    exit 1
	  fi

	  # infer_options is a space-separated flag string, so it is expanded
	  # unquoted on purpose to split it into separate arguments.
	  # shellcheck disable=SC2086
	  python -m omnivoice.cli.infer_batch \
	    --model "${CHECKPOINT}" \
	    --test_list "${test_jsonl}" \
	    --res_dir "${wav_path}" ${infer_options}

	  python -m omnivoice.eval.speaker_similarity.sim \
	    --wav-path "${wav_path}" \
	    --test-list "${test_jsonl}" \
	    --decode-path "${wav_path}.sim.log" \
	    --model-dir "${TTS_EVAL_MODEL_DIR}"

	  python -m omnivoice.eval.wer.seedtts \
	    --wav-path "${wav_path}" \
	    --test-list "${test_jsonl}" \
	    --decode-path "${wav_path}.wer.log" \
	    --model-dir "${TTS_EVAL_MODEL_DIR}" \
	    --lang zh

	  python -m omnivoice.eval.mos.utmos \
	    --wav-path "${wav_path}" \
	    --test-list "${test_jsonl}" \
	    --decode-path "${wav_path}.mos.log" \
	    --model-dir "${TTS_EVAL_MODEL_DIR}"
	fi



	# ============================================================
	# Stage 5: Evaluation on MiniMax multilingual
	# ============================================================

	# Stage 5 runs batch inference on the MiniMax multilingual test list, then
	# runs the speaker_similarity and WER (minimax) evaluation modules on the
	# generated audio. No MOS evaluation is run in this stage.
	if [[ "${stage}" -le 5 && "${stop_stage}" -ge 5 ]]; then
	  echo "Stage 5: Evaluation on MiniMax multilingual"
	  wav_path="results/minimax"
	  test_jsonl="$(get_test_list minimax)"

	  # Fail fast: the test list is downloaded by stage 1.
	  if [[ ! -f "${test_jsonl}" ]]; then
	    echo "Stage 5: missing ${test_jsonl}; run stage 1 first" >&2
	    exit 1
	  fi

	  # infer_options is a space-separated flag string, so it is expanded
	  # unquoted on purpose to split it into separate arguments.
	  # shellcheck disable=SC2086
	  python -m omnivoice.cli.infer_batch \
	    --model "${CHECKPOINT}" \
	    --test_list "${test_jsonl}" \
	    --res_dir "${wav_path}" ${infer_options}

	  python -m omnivoice.eval.speaker_similarity.sim \
	    --wav-path "${wav_path}" \
	    --test-list "${test_jsonl}" \
	    --decode-path "${wav_path}.sim.log" \
	    --model-dir "${TTS_EVAL_MODEL_DIR}"

	  python -m omnivoice.eval.wer.minimax \
	    --wav-path "${wav_path}" \
	    --test-list "${test_jsonl}" \
	    --decode-path "${wav_path}.wer.log" \
	    --model-dir "${TTS_EVAL_MODEL_DIR}"
	fi


	# ============================================================
	# Stage 6: Evaluation on FLEURS multilingual
	# ============================================================

	# Stage 6 runs batch inference on the FLEURS multilingual test list, then
	# runs the speaker_similarity module and the FLEURS WER script on the
	# generated audio. No MOS evaluation is run in this stage.
	if [[ "${stage}" -le 6 && "${stop_stage}" -ge 6 ]]; then
	  echo "Stage 6: Evaluation on FLEURS multilingual"
	  wav_path="results/fleurs"
	  test_jsonl="$(get_test_list fleurs)"

	  # Fail fast: the test list is downloaded by stage 1.
	  if [[ ! -f "${test_jsonl}" ]]; then
	    echo "Stage 6: missing ${test_jsonl}; run stage 1 first" >&2
	    exit 1
	  fi

	  # infer_options is a space-separated flag string, so it is expanded
	  # unquoted on purpose to split it into separate arguments.
	  # shellcheck disable=SC2086
	  python -m omnivoice.cli.infer_batch \
	    --model "${CHECKPOINT}" \
	    --test_list "${test_jsonl}" \
	    --res_dir "${wav_path}" ${infer_options}

	  python -m omnivoice.eval.speaker_similarity.sim \
	    --wav-path "${wav_path}" \
	    --test-list "${test_jsonl}" \
	    --decode-path "${wav_path}.sim.log" \
	    --model-dir "${TTS_EVAL_MODEL_DIR}"

	  # Evaluation on FLEURS requires omnilingual-asr, which has dependencies that
	  # conflict with other packages (at least the transformers package) in our
	  # project.

	  # To evaluate on FLEURS, we suggest that users set up a separate virtual
	  # environment in which to install omnilingual-asr. Install instructions can
	  # be found at https://github.com/facebookresearch/omnilingual-asr

	  # Locate the WER script relative to this script's own directory (the same
	  # root that is put on PYTHONPATH) instead of ${PWD}, so the stage also works
	  # when the script is launched from another directory, and quote the path so
	  # that directories containing spaces are handled.
	  fleurs_wer_script="$(cd "$(dirname -- "$0")/.." && pwd)/omnivoice/eval/wer/fleurs.py"

	  python "${fleurs_wer_script}" \
	    --wav-path "${wav_path}" \
	    --test-list "${test_jsonl}" \
	    --decode-path "${wav_path}.wer.log" \
	    --model-card omniASR_LLM_Unlimited_7B_v2 \
	    --chunk-size 100 \
	    --batch-size 50
	fi