| import re |
| import string |
| from statistics import mode |
|
|
| import emoji |
| from langdetect import detect |
| from spellchecker import SpellChecker |
|
|
|
|
def clean_text(text: str) -> str:
    """Run ``text`` through the cleaning pipeline and return the result.

    The steps are applied in this order, each one receiving the output of
    the previous step: URL removal, HTML tag removal, hashtag removal.

    Args:
        text: The raw input string.

    Returns:
        The string after every pipeline step has been applied.
    """
    pipeline = (remove_URL, remove_html, remove_hashtags)
    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned
|
|
|
|
def remove_URL(text: str) -> str:
    """Strip URLs from ``text``.

    A URL here is anything that starts with ``http://``, ``https://`` or
    ``www.`` and runs up to the next whitespace character. Surrounding
    whitespace is left untouched.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)
|
|
|
|
def remove_hashtags(text: str) -> str:
    """Delete every hashtag from ``text``.

    A hashtag is a ``#`` immediately followed by one or more
    non-whitespace characters; the whole run is removed. A ``#`` that is
    followed by whitespace (or ends the string) is kept.
    """
    return re.sub(r"#\S+", "", text)
|
|
|
|
def remove_html(text: str) -> str:
    """Remove HTML-style tags from ``text``.

    Every shortest ``<...>`` span on a single line is deleted; the text
    between tags is preserved.
    """
    return re.sub(r"<.*?>", "", text)
|
|
|
|
def remove_emojis(text: str) -> str:
    """Remove emojis from ``text``.

    Each emoji is first rewritten by ``emoji.demojize`` into a textual name
    wrapped in an unlikely delimiter, and those delimited names are then
    deleted with a regular expression.

    Args:
        text: The input string, possibly containing emojis.

    Returns:
        ``text`` with the emojis removed; all other characters are kept.
    """
    delimiter = "#4="
    escaped = re.escape(delimiter)
    # Lazy quantifier: stop at the *first* closing delimiter so that plain
    # text sandwiched between two emojis without whitespace survives.
    marker = re.compile(rf"{escaped}\S+?{escaped}")

    # The original ran the demojize/strip pass five times; keep that upper
    # bound but stop as soon as a pass leaves the text unchanged, since
    # further passes would then be no-ops.
    for _ in range(5):
        demojized = emoji.demojize(string=text, delimiters=(delimiter, delimiter))
        stripped = marker.sub("", demojized)
        if stripped == text:
            break
        text = stripped
    return text
|
|
|
|
def remove_punct(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters dropped are exactly those in ``string.punctuation``;
    everything else (letters, digits, whitespace) is kept in order.
    """
    punctuation = set(string.punctuation)
    return "".join(char for char in text if char not in punctuation)
|
|
|
|
_SPELL_CHECKER = None


def _get_spell_checker():
    """Return a shared ``SpellChecker``, creating it on first use."""
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = SpellChecker()
    return _SPELL_CHECKER


def correct_spellings(text):
    """Replace misspelled words in ``text`` with the suggested correction.

    The text is split on whitespace. Each word reported as unknown by the
    spell checker is replaced by its suggested correction; words that are
    known, or for which no correction is available, are kept as they are.
    The words are re-joined with single spaces.

    Args:
        text: The input string.

    Returns:
        The corrected string, words separated by single spaces.
    """
    # NOTE(review): pyspellchecker's ``unknown()`` appears to return
    # lower-cased words by default, so a misspelled word containing capital
    # letters would never match below and is left unchanged -- confirm
    # whether that is intended.
    spell = _get_spell_checker()
    words = text.split()
    misspelled_words = spell.unknown(words)

    corrected_text = []
    for word in words:
        # Only ask for a correction when the word is actually flagged;
        # computing corrections for known words is wasted work.
        replacement = spell.correction(word) if word in misspelled_words else None
        corrected_text.append(word if replacement is None else replacement)
    return " ".join(corrected_text)
|
|
|
|
def remove_backslashes(text: str) -> str:
    """Remove backslash-led tokens from ``text``.

    Each backslash together with the run of non-whitespace characters that
    directly follows it is deleted. A backslash followed by whitespace (or
    ending the string) is left in place.
    """
    return re.sub(r"\\\S+", "", text)
|
|
|
|
| def detect_language(list_of_texts: list[str]) -> str | None: |
|
|
| if len(list_of_texts) == 0: |
| return None |
|
|
| languages = [] |
|
|
| for text in list_of_texts: |
| try: |
| lan = detect(text) |
| languages.append(lan) |
| except Exception: |
| continue |
|
|
| return mode(languages) if len(languages) else None |
|
|