| | import re |
| | from typing import Iterable, List, Tuple |
| | import cn2an |
| | from english_utils.abbreviations import expand_abbreviations |
| | from english_utils.time_norm import expand_time_english |
| | from english_utils.number_norm import normalize_numbers as replace_numbers_en |
| |
|
| |
|
def merge_short_sentences_zh(sens):
    """Avoid short sentences by merging them with a neighbouring sentence.

    A sentence of two characters or fewer is glued onto the following
    sentence while scanning; if the very last sentence is still short, it
    is folded back into the one before it.

    Args:
        sens (List[str]): list of input sentences.

    Returns:
        List[str]: list of output sentences.
    """
    sens_out = []
    for s in sens:
        # If the previously emitted sentence is still short (<= 2 chars),
        # extend it with the current sentence instead of starting a new one.
        if sens_out and len(sens_out[-1]) <= 2:
            sens_out[-1] = sens_out[-1] + " " + s
        else:
            sens_out.append(s)
    # If the final sentence ended up short, fold it into the previous one.
    # The original code used a bare try/except to skip the 0/1-sentence
    # case; an explicit length check does the same without hiding errors.
    if len(sens_out) > 1 and len(sens_out[-1]) <= 2:
        sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
        sens_out.pop()
    return sens_out
| |
|
| |
|
def split_sentences_zh(text, min_len=10):
    """Split Chinese text into sentence chunks of at least ``min_len`` chars.

    Punctuation is normalized to ASCII, consecutive whitespace is collapsed,
    and pieces are accumulated until they exceed ``min_len`` characters.
    Short leftovers are merged by :func:`merge_short_sentences_zh`.

    Args:
        text (str): input text.
        min_len (int): minimum accumulated length before a chunk is emitted.

    Returns:
        List[str]: list of sentence chunks.
    """
    # Normalize terminal punctuation and whitespace.
    text = re.sub('[。!?;]', '.', text)
    text = re.sub('[,]', ',', text)
    text = re.sub('[\n\t ]+', ' ', text)
    # Tag every punctuation mark with a sentinel so the text can be split on it.
    text = re.sub('([,.!?;])', r'\1 $#!', text)

    pieces = [piece.strip() for piece in text.split('$#!')]
    if not pieces[-1]:
        del pieces[-1]

    chunks = []
    pending = []
    pending_len = 0
    for idx, piece in enumerate(pieces):
        pending.append(piece)
        pending_len += len(piece)
        # Flush once the accumulated length is large enough, or at the end.
        if pending_len > min_len or idx == len(pieces) - 1:
            chunks.append(' '.join(pending))
            pending = []
            pending_len = 0
    return merge_short_sentences_zh(chunks)
| |
|
| |
|
def intersperse(lst, item):
    """Return a new list with ``item`` before, between, and after each element of ``lst``."""
    out = [item]
    for element in lst:
        out.append(element)
        out.append(item)
    return out
| |
|
| |
|
def replace_numbers_zh(text):
    """Replace every Arabic numeral run in ``text`` with its Chinese reading.

    Uses ``cn2an.an2cn`` on each match; matches are converted left to right,
    exactly as the original findall-then-replace loop did (converted output
    contains no digits, so later matches are unaffected).
    """
    return re.sub(r"\d+(?:\.?\d+)?", lambda m: cn2an.an2cn(m.group()), text)
| |
|
| |
|
def replace_punctuation(text):
    """Map Chinese/full-width punctuation onto a small ASCII-ish set.

    Replacements are applied sequentially in declaration order; this matters
    because e.g. "。。。" first becomes "..." and is then collapsed to "…".
    """
    replacements = (
        (":", ","), (";", ","), (",", ","), ("。", "."),
        ("!", "!"), ("?", "?"), ("\n", "."), ("·", ","),
        ("、", ","), ("...", "…"), ("$", "."),
        ("“", "'"), ("”", "'"), ("‘", "'"), ("’", "'"),
        ("(", "'"), (")", "'"), ("(", "'"), (")", "'"),
        ("《", "'"), ("》", "'"), ("【", "'"), ("】", "'"),
        ("[", "'"), ("]", "'"), ("—", "-"), ("~", "-"),
        ("~", "-"), ("「", "'"), ("」", "'"),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text
| |
|
| |
|
| | class Lexicon: |
| | def __init__(self, lexion_filename: str, tokens_filename: str): |
| | tokens = dict() |
| | with open(tokens_filename, encoding="utf-8") as f: |
| | for line in f: |
| | s, i = line.split() |
| | tokens[s] = int(i) |
| |
|
| | lexicon = dict() |
| | with open(lexion_filename, encoding="utf-8") as f: |
| | for line in f: |
| | splits = line.split() |
| | word_or_phrase = splits[0] |
| | phone_tone_list = splits[1:] |
| | assert len(phone_tone_list) & 1 == 0, len(phone_tone_list) |
| | phone_str = phone_tone_list[: len(phone_tone_list) // 2] |
| | phones = [tokens[p] for p in phone_str] |
| |
|
| | tones = phone_tone_list[len(phone_tone_list) // 2 :] |
| | tones = [int(t) for t in tones] |
| |
|
| | lexicon[word_or_phrase] = (phone_str, phones, tones) |
| | lexicon["呣"] = lexicon["母"] |
| | lexicon["嗯"] = lexicon["恩"] |
| | self.lexicon = lexicon |
| |
|
| | punctuation = ["!", "?", "…", ",", ".", "'", "-"] |
| | for p in punctuation: |
| | i = tokens[p] |
| | tone = 0 |
| | self.lexicon[p] = ([p], [i], [tone]) |
| | self.lexicon[" "] = ([" "], [tokens["_"]], [0]) |
| |
|
| | def g2p_zh_mix_en(self, text: str) -> Tuple[List[int], List[int]]: |
| | phone_str = [] |
| | phones = [] |
| | tones = [] |
| |
|
| | if text not in self.lexicon: |
| | |
| | if len(text) > 1: |
| | for w in text: |
| | |
| | s, _, p, t = self.convert(w) |
| | if p: |
| | phone_str += s |
| | phones += p |
| | tones += t |
| | return phone_str, phones, tones |
| |
|
| | phone_str, phones, tones = self.lexicon[text] |
| | return phone_str, phones, tones |
| | |
| | |
| | def split_zh_en(self, text): |
| | if re.search(r'[a-zA-Z]+', text): |
| | spliter = '#$&^!@' |
| | |
| | text = re.sub(r'[a-zA-Z]+', lambda x: f'{spliter}{x.group()}{spliter}', text) |
| | texts = text.split(spliter) |
| | texts = [t for t in texts if len(t) > 0] |
| | return texts |
| | else: |
| | return [text] |
| | |
| | |
| | def normalize_english(self, text): |
| | text = text.lower() |
| | text = expand_time_english(text) |
| | text = replace_numbers_en(text) |
| | text = expand_abbreviations(text) |
| | return text |
| |
|
| | def normalize_chinese(self, text): |
| | text = replace_numbers_zh(text) |
| | return text |
| | |
| |
|
| | def is_english(self, text): |
| | return 1 if re.match(r'[a-zA-Z\s]+', text) else 0 |
| |
|
| | def convert(self, text: Iterable[str]) -> Tuple[List[int], List[int]]: |
| | phone_str = [] |
| | yinjie_num = [] |
| | phones = [] |
| | tones = [] |
| |
|
| | text = replace_punctuation(text) |
| | texts_zh_en = self.split_zh_en(text) |
| | en_num = sum([self.is_english(i) for i in texts_zh_en]) |
| | if en_num * 2 >= len(texts_zh_en): |
| | texts_zh_en = self.split_zh_en(self.normalize_english(text)) |
| | else: |
| | texts_zh_en = self.split_zh_en(self.normalize_chinese(text)) |
| | for text_one_lang in texts_zh_en: |
| | if self.is_english(text_one_lang): |
| | |
| | s, p, t = self.g2p_zh_mix_en(text_one_lang) |
| |
|
| | phone_str += s |
| | yinjie_num.append(len(s)) |
| | phones += p |
| | tones += t |
| | else: |
| | |
| | for tl in text_one_lang: |
| | s, p, t = self.g2p_zh_mix_en(tl) |
| |
|
| | phone_str += s |
| | yinjie_num.append(len(s)) |
| | phones += p |
| | tones += t |
| | |
| | return phone_str, yinjie_num, phones, tones |