| """Set of default text cleaners"""
|
|
|
|
|
| import re
|
|
|
|
|
# Pre-compiled pattern matching a run of one or more whitespace characters;
# built once at import time and used by collapse_whitespace().
_whitespace_re = re.compile(r"\s+")
|
|
|
# Maps punctuation variants (CJK / fullwidth forms, typographic quotes,
# brackets, dashes) to the small ASCII set the cleaners work with. An empty
# value means the symbol is deleted.
# Non-ASCII keys are spelled as \u escapes so the table cannot be corrupted
# by reading or saving the file with the wrong encoding.
# NOTE(review): the non-ASCII keys were reconstructed from mis-decoded bytes;
# confirm U+2014 (em dash) is the dash that is meant to be deleted.
rep_map = {
    "\uff1a": ",",  # fullwidth colon
    "\uff1b": ",",  # fullwidth semicolon
    "\uff0c": ",",  # fullwidth comma
    "\u3002": ".",  # ideographic full stop
    "\uff01": "!",  # fullwidth exclamation mark
    "\uff1f": "?",  # fullwidth question mark
    "\n": ".",
    "\u00b7": ",",  # middle dot
    "\u3001": ",",  # ideographic comma
    "...": ".",
    "\u2026": ".",  # horizontal ellipsis
    "$": ".",
    "\u201c": "'",  # left double quotation mark
    "\u201d": "'",  # right double quotation mark
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
    "\uff08": "'",  # fullwidth left parenthesis
    "\uff09": "'",  # fullwidth right parenthesis
    "(": "'",
    ")": "'",
    "\u300a": "'",  # left double angle bracket
    "\u300b": "'",  # right double angle bracket
    "\u3010": "'",  # left black lenticular bracket
    "\u3011": "'",  # right black lenticular bracket
    "[": "'",
    "]": "'",
    "\u2014": "",  # em dash (removed)
    "\uff5e": "-",  # fullwidth tilde
    "~": "-",
    "\u300c": "'",  # left corner bracket
    "\u300d": "'",  # right corner bracket
}
|
|
|
def replace_punctuation(text):
    """Substitute every occurrence of a ``rep_map`` key found in *text*.

    A single alternation pattern is built from the (regex-escaped) keys of
    ``rep_map`` and each match is replaced by the value stored under the
    matched string.

    Args:
        text: Input text.

    Returns:
        The text with all mapped symbols replaced.
    """
    alternatives = [re.escape(symbol) for symbol in rep_map.keys()]
    matcher = re.compile("|".join(alternatives))

    def _lookup(match):
        # The matched text is always one of the keys the pattern was built from.
        return rep_map[match.group()]

    return matcher.sub(_lookup, text)
|
|
|
def lowercase(text):
    """Return *text* converted to lower case via its ``lower()`` method."""
    lowered = text.lower()
    return lowered
|
|
|
|
|
def collapse_whitespace(text):
    """Squeeze each whitespace run in *text* to one space and trim both ends.

    Args:
        text: Input text.

    Returns:
        The text with every match of ``_whitespace_re`` replaced by a single
        space and leading/trailing whitespace stripped.
    """
    single_spaced = _whitespace_re.sub(" ", text)
    return single_spaced.strip()
|
|
|
def remove_punctuation_at_begin(text):
    """Strip a leading run of ``,``, ``.``, ``!`` or ``?`` from *text*.

    Only the very start of the string is affected; punctuation elsewhere,
    including at the start of later lines, is left untouched.

    Args:
        text: Input text.

    Returns:
        The text without its leading punctuation run.
    """
    leading_punctuation = r'^[,.!?]+'
    stripped = re.sub(leading_punctuation, '', text)
    return stripped
|
|
|
def remove_aux_symbols(text):
    """Delete bracketing and quoting symbols from *text*.

    The removed characters are ``<``, ``>``, ``(``, ``)``, ``[``, ``]``,
    ``"``, ``'`` and the guillemets U+00AB / U+00BB. Nothing is inserted in
    their place, so the surrounding characters become adjacent.

    Args:
        text: Input text.

    Returns:
        The text with all of the symbols above removed.
    """
    # The guillemets are written as \u escapes (understood by ``re``) so the
    # character class cannot pick up stray letters if the file is ever
    # decoded with the wrong encoding.
    return re.sub(r"[<>()\[\]\"\u00ab\u00bb']+", "", text)
|
|
|
|
|
# Spoken form of "&" per language tag. Every entry is padded with spaces so
# the conjunction never fuses with the neighbouring words ("a&b" -> "a y b").
_AMPERSAND_BY_LANG = {
    "en": " and ",
    "fr": " et ",
    "pt": " e ",
    "ca": " i ",
    "es": " y ",
}

# Language tags for which apostrophes are dropped from the text.
_APOSTROPHE_FREE_LANGS = frozenset({"ca", "es"})


def replace_symbols(text, lang="en"):
    """Replace symbols based on the language tag.

    For every language ``;`` and ``:`` become ``,``. A hyphen becomes a
    space, except for ``"ca"`` where it is removed. ``&`` is replaced by the
    language's conjunction (surrounded by spaces) for ``"en"``, ``"fr"``,
    ``"pt"``, ``"ca"`` and ``"es"``; for any other tag it is left as is.
    Apostrophes are removed for ``"ca"`` and ``"es"`` only.

    Args:
        text:
            Input text.
        lang:
            Language identifier. ex: "en", "fr", "pt", "ca", "es".

    Returns:
        The modified text

    example:
        input args:
            text: "si l'avi cau, diguem-ho"
            lang: "ca"
        Output:
            text: "si lavi cau, diguemho"
    """
    text = text.replace(";", ",")
    text = text.replace("-", "" if lang == "ca" else " ")
    text = text.replace(":", ",")

    conjunction = _AMPERSAND_BY_LANG.get(lang)
    if conjunction is not None:
        text = text.replace("&", conjunction)

    if lang in _APOSTROPHE_FREE_LANGS:
        text = text.replace("'", "")
    return text
|
|
|
def unicleaners(text, cased=False, lang='en'):
    """Basic multilingual cleaning pipeline.

    The steps run in this order: optional lowercasing, punctuation
    normalisation, language-dependent symbol replacement, removal of
    auxiliary symbols, removal of leading punctuation, whitespace collapsing
    and finally making sure the text ends with a punctuation mark.

    Abbreviations and numbers are not expanded here; per the original
    author's note the phonemizer already takes care of that.

    Args:
        text: Input text.
        cased: When false (the default) the text is lowercased first.
        lang: Language identifier forwarded to ``replace_symbols``.

    Returns:
        The cleaned text.
    """
    if not cased:
        text = lowercase(text)
    text = replace_punctuation(text)
    text = replace_symbols(text, lang=lang)
    text = remove_aux_symbols(text)
    text = remove_punctuation_at_begin(text)
    text = collapse_whitespace(text)
    # Append a final period unless the text is empty or already ends in one
    # of . , ! ? - or a horizontal ellipsis. The ellipsis is written as a
    # \u escape so the character class cannot be corrupted by a wrong file
    # encoding.
    text = re.sub(r'([^.,!?\-\u2026])$', r'\1.', text)
    return text
|
|
|
|
|