| | import re |
| | from typing import Iterable, List, Tuple |
| | import cn2an |
| | from english_utils.abbreviations import expand_abbreviations |
| | from english_utils.time_norm import expand_time_english |
| | from english_utils.number_norm import normalize_numbers as replace_numbers_en |
| |
|
| |
|
def merge_short_sentences_zh(sens):
    """Avoid short sentences by merging them with a neighbouring sentence.

    A sentence of two characters or fewer is glued onto the following
    sentence while scanning; if the very last sentence is still short, it
    is folded back into the one before it.

    Args:
        sens (List[str]): list of input sentences.

    Returns:
        List[str]: list of output sentences.
    """
    sens_out = []
    for s in sens:
        # If the previously emitted sentence is still short (<= 2 chars),
        # extend it with the current sentence instead of starting a new one.
        if sens_out and len(sens_out[-1]) <= 2:
            sens_out[-1] = sens_out[-1] + " " + s
        else:
            sens_out.append(s)
    # If the final sentence ended up short, fold it into the previous one.
    # The original code used a bare try/except to skip the 0/1-sentence
    # case; an explicit length check does the same without hiding errors.
    if len(sens_out) > 1 and len(sens_out[-1]) <= 2:
        sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
        sens_out.pop()
    return sens_out
| |
|
| |
|
def split_sentences_zh(text, min_len=10):
    """Split Chinese text into sentence chunks of at least ``min_len`` chars.

    Punctuation is normalized to ASCII, consecutive whitespace is collapsed,
    and pieces are accumulated until they exceed ``min_len`` characters.
    Short leftovers are merged by :func:`merge_short_sentences_zh`.

    Args:
        text (str): input text.
        min_len (int): minimum accumulated length before a chunk is emitted.

    Returns:
        List[str]: list of sentence chunks.
    """
    # Normalize terminal punctuation and whitespace.
    text = re.sub('[。!?;]', '.', text)
    text = re.sub('[,]', ',', text)
    text = re.sub('[\n\t ]+', ' ', text)
    # Tag every punctuation mark with a sentinel so the text can be split on it.
    text = re.sub('([,.!?;])', r'\1 $#!', text)

    pieces = [piece.strip() for piece in text.split('$#!')]
    if not pieces[-1]:
        del pieces[-1]

    chunks = []
    pending = []
    pending_len = 0
    for idx, piece in enumerate(pieces):
        pending.append(piece)
        pending_len += len(piece)
        # Flush once the accumulated length is large enough, or at the end.
        if pending_len > min_len or idx == len(pieces) - 1:
            chunks.append(' '.join(pending))
            pending = []
            pending_len = 0
    return merge_short_sentences_zh(chunks)
| |
|
| |
|
def intersperse(lst, item):
    """Return a new list with ``item`` before, between, and after each element of ``lst``."""
    out = [item]
    for element in lst:
        out.append(element)
        out.append(item)
    return out
| |
|
| |
|
def replace_numbers_zh(text):
    """Replace every Arabic numeral run in ``text`` with its Chinese reading.

    Uses ``cn2an.an2cn`` on each match; matches are converted left to right,
    exactly as the original findall-then-replace loop did (converted output
    contains no digits, so later matches are unaffected).
    """
    return re.sub(r"\d+(?:\.?\d+)?", lambda m: cn2an.an2cn(m.group()), text)
| |
|
| |
|
def replace_punctuation(text):
    """Map Chinese/full-width punctuation onto a small ASCII-ish set.

    Replacements are applied sequentially in declaration order; this matters
    because e.g. "。。。" first becomes "..." and is then collapsed to "…".
    """
    replacements = (
        (":", ","), (";", ","), (",", ","), ("。", "."),
        ("!", "!"), ("?", "?"), ("\n", "."), ("·", ","),
        ("、", ","), ("...", "…"), ("$", "."),
        ("“", "'"), ("”", "'"), ("‘", "'"), ("’", "'"),
        ("(", "'"), (")", "'"), ("(", "'"), (")", "'"),
        ("《", "'"), ("》", "'"), ("【", "'"), ("】", "'"),
        ("[", "'"), ("]", "'"), ("—", "-"), ("~", "-"),
        ("~", "-"), ("「", "'"), ("」", "'"),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text
| |
|
| |
|
| | class Lexicon: |
| | def __init__(self, lexion_filename: str, tokens_filename: str): |
| | tokens = dict() |
| | with open(tokens_filename, encoding="utf-8") as f: |
| | for line in f: |
| | s, i = line.split() |
| | tokens[s] = int(i) |
| |
|
| | lexicon = dict() |
| | with open(lexion_filename, encoding="utf-8") as f: |
| | for line in f: |
| | splits = line.split() |
| | word_or_phrase = splits[0] |
| | phone_tone_list = splits[1:] |
| | assert len(phone_tone_list) & 1 == 0, len(phone_tone_list) |
| | phone_str = phone_tone_list[: len(phone_tone_list) // 2] |
| | phones = [tokens[p] for p in phone_str] |
| |
|
| | tones = phone_tone_list[len(phone_tone_list) // 2 :] |
| | tones = [int(t) for t in tones] |
| |
|
| | lexicon[word_or_phrase] = (phone_str, phones, tones) |
| | lexicon["呣"] = lexicon["母"] |
| | lexicon["嗯"] = lexicon["恩"] |
| | self.lexicon = lexicon |
| |
|
| | punctuation = ["!", "?", "…", ",", ".", "'", "-"] |
| | for p in punctuation: |
| | i = tokens[p] |
| | tone = 0 |
| | self.lexicon[p] = ([p], [i], [tone]) |
| | self.lexicon[" "] = ([" "], [tokens["_"]], [0]) |
| |
|
| | def g2p_zh_mix_en(self, text: str) -> Tuple[List[int], List[int]]: |
| | phone_str = [] |
| | phones = [] |
| | tones = [] |
| |
|
| | if text not in self.lexicon: |
| | |
| | if len(text) > 1: |
| | for w in text: |
| | |
| | s, _, p, t = self.convert(w) |
| | if p: |
| | phone_str += s |
| | phones += p |
| | tones += t |
| | return phone_str, phones, tones |
| |
|
| | phone_str, phones, tones = self.lexicon[text] |
| | return phone_str, phones, tones |
| | |
| | |
| | def split_zh_en(self, text): |
| | if re.search(r'[a-zA-Z]+', text): |
| | spliter = '#$&^!@' |
| | |
| | text = re.sub(r'[a-zA-Z]+', lambda x: f'{spliter}{x.group()}{spliter}', text) |
| | texts = text.split(spliter) |
| | texts = [t for t in texts if len(t) > 0] |
| | return texts |
| | else: |
| | return [text] |
| | |
| | |
| | def normalize_english(self, text): |
| | text = text.lower() |
| | text = expand_time_english(text) |
| | text = replace_numbers_en(text) |
| | text = expand_abbreviations(text) |
| | return text |
| |
|
| | def normalize_chinese(self, text): |
| | text = replace_numbers_zh(text) |
| | return text |
| | |
| |
|
| | def is_english(self, text): |
| | return 1 if re.match(r'[a-zA-Z\s]+', text) else 0 |
| |
|
| | def convert(self, text: Iterable[str]) -> Tuple[List[int], List[int]]: |
| | phone_str = [] |
| | yinjie_num = [] |
| | phones = [] |
| | tones = [] |
| |
|
| | text = replace_punctuation(text) |
| | texts_zh_en = self.split_zh_en(text) |
| | en_num = sum([self.is_english(i) for i in texts_zh_en]) |
| | if en_num * 2 >= len(texts_zh_en): |
| | texts_zh_en = self.split_zh_en(self.normalize_english(text)) |
| | else: |
| | texts_zh_en = self.split_zh_en(self.normalize_chinese(text)) |
| | for text_one_lang in texts_zh_en: |
| | if self.is_english(text_one_lang): |
| | |
| | s, p, t = self.g2p_zh_mix_en(text_one_lang) |
| |
|
| | phone_str += s |
| | yinjie_num.append(len(s)) |
| | phones += p |
| | tones += t |
| | else: |
| | |
| | for tl in text_one_lang: |
| | s, p, t = self.g2p_zh_mix_en(tl) |
| |
|
| | phone_str += s |
| | yinjie_num.append(len(s)) |
| | phones += p |
| | tones += t |
| | |
| | return phone_str, yinjie_num, phones, tones |