| | import json |
| |
|
| |
|
| | |
class HomophonesReplacer:
    """
    Homophones Replacer

    Replace mispronounced characters with correctly pronounced ones.

    The replacement table is a JSON object loaded from ``map_file_path``;
    each key is looked up one character at a time and, when present, is
    substituted by its mapped value.

    Creation process of homophones_map.json:

    1. Establish a word corpus using the [Tencent AI Lab Embedding Corpora
       v0.2.0 large] with 12 million entries. After cleaning, approximately
       1.8 million entries remain. Use ChatTTS to infer the text.
    2. Record discrepancies between the inferred and input text, identifying
       about 180,000 misread words.
    3. Create a pinyin-to-common-characters mapping using characters that
       ChatTTS reads correctly.
    4. For each discrepancy, extract the correct pinyin using [python-pinyin]
       and find homophones with the correct pronunciation from the mapping.

    Thanks to:
    [Tencent AI Lab Embedding Corpora for Chinese and English Words and Phrases](https://ai.tencent.com/ailab/nlp/en/embedding.html)
    [python-pinyin](https://github.com/mozillazg/python-pinyin)

    """

    def __init__(self, map_file_path):
        """Load the replacement table from the JSON file at *map_file_path*."""
        self.homophones_map = self.load_homophones_map(map_file_path)

    def load_homophones_map(self, map_file_path):
        """Read *map_file_path* as UTF-8 JSON and return the decoded object.

        Errors raised by ``open`` or ``json.load`` (missing file, invalid
        JSON) are propagated unchanged.
        """
        with open(map_file_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def replace(self, text: str) -> str:
        """Return *text* with every mapped character substituted.

        Characters are examined one at a time; a character that is a key of
        ``self.homophones_map`` is replaced by its mapped value, any other
        character is copied through unchanged.
        """
        mapping = self.homophones_map
        # get(ch, ch) folds the membership test and the lookup into one step:
        # unmapped characters fall back to themselves.
        return "".join(mapping.get(ch, ch) for ch in text)
| |
|