| | from typing import List |
| | import regex as re |
| | import sys |
| | from tqdm import tqdm |
| | from joblib import Parallel, delayed |
| | from indic_num_map import INDIC_NUM_MAP |
| |
|
| |
|
# Regular expressions selecting spans that are wrapped in <dnt> tags.

# URLs: an optional http/https/ftp scheme, one or more dot-terminated labels,
# then a run of word/path/query characters.
URL_PATTERN = r'\b(?<![\w/.])(?:(?:https?|ftp)://)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b'

# E-mail addresses: local part, "@", domain, and a TLD of two or more letters.
# The TLD class is `[A-Za-z]`; a `|` inside a character class is a literal
# pipe, not an alternation, so it must not appear there.
EMAIL_PATTERN = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'

# Numeric expressions: percentages and percentage ranges (optionally prefixed
# with "~"), and two or three digit groups joined by punctuation such as
# - / . , : ' +.
NUMERAL_PATTERN = r"(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-\/.,:']\d+[-\/.,:'+]\d+(?:\.\d+)?|\d+[-\/.:'+]\d+(?:\.\d+)?)"

# Word characters introduced by "#" or "@", optionally preceded by ASCII
# alphanumerics.
# NOTE(review): the `|` inside `[#|@]` is a literal pipe, so "|word" is matched
# too -- confirm whether that is intended before changing it.
OTHER_PATTERN = r'[A-Za-z0-9]*[#|@]\w+'
| |
|
| |
|
def normalize_indic_numerals(line: str) -> str:
    """
    Replace every character of the input that has an entry in `INDIC_NUM_MAP`.

    Each character is looked up in `INDIC_NUM_MAP`; characters without an entry
    are kept unchanged. The map is presumably native-script Indic digits to
    Roman digits (it is defined in `indic_num_map`, not in this file).

    Args:
        line (str): an input string that may contain Indic numerals.

    Returns:
        str: the input string with every mapped character substituted.
    """
    mapped_chars = (INDIC_NUM_MAP.get(ch, ch) for ch in line)
    return "".join(mapped_chars)
| |
|
| |
|
def wrap_with_dnt_tag(text: str, pattern: str) -> str:
    """
    Wrap every match of a pattern in the input string with do-not-translate
    tags (`<dnt>` {matched span} `</dnt>`). This is useful when some span of
    the input string needs to be forwarded as is and not translated.

    The substitution is done in a single left-to-right regex pass over the
    original text, so the output is deterministic, each matched span is
    wrapped exactly once, and a match that is a substring of another match
    (e.g. "1-2" inside "1-2-3") cannot corrupt the longer span. The whole
    match is wrapped regardless of capturing groups in `pattern`.

    Args:
        text (str): input string.
        pattern (str): pattern to search for in the input string.

    Returns:
        str: input string with matched spans wrapped in `<dnt>` and `</dnt>`
            tags, and with every run of whitespace collapsed to one space.
    """

    def _wrap(match):
        span = match.group(0)
        # A zero-width match carries nothing to protect; leave the text as is.
        if not span:
            return span
        return f' <dnt> {span} </dnt> '

    text = re.sub(pattern, _wrap, text)

    # The padding around the tags can create double spaces; squeeze them.
    return re.sub(r"\s+", " ", text)
| |
|
| |
|
def normalize(text: str, patterns: List[str]) -> str:
    """
    Normalize numerals and tag do-not-translate spans in the input string.

    Newline characters are stripped from both ends of the input, which is then
    passed through `normalize_indic_numerals`. The result is run through
    `wrap_with_dnt_tag` once per pattern, in the order given, each pass
    operating on the output of the previous one.

    Args:
        text (str): input string.
        patterns (List[str]): list of patterns to search for in the input string.

    Returns:
        str: normalized input string wrapped with `<dnt>` and `</dnt>` tags.
    """
    tagged = normalize_indic_numerals(text.strip("\n"))
    for dnt_pattern in patterns:
        tagged = wrap_with_dnt_tag(tagged, dnt_pattern)
    return tagged
| |
|
| |
|
if __name__ == "__main__":
    # Command-line entry point: read the input file line by line, normalize
    # each line, and write the result to the output file.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <src_infname> <src_outfname>")

    src_infname = sys.argv[1]
    src_outfname = sys.argv[2]

    # Count the lines up front so the progress bar has a total. Use a context
    # manager so the handle is closed, and the same encoding as the main pass.
    with open(src_infname, "r", encoding="utf-8") as count_file:
        num_lines = sum(1 for _ in count_file)

    # Patterns are applied in this order to every line.
    patterns = [EMAIL_PATTERN, URL_PATTERN, NUMERAL_PATTERN, OTHER_PATTERN]

    with open(src_infname, "r", encoding="utf-8") as src_infile, \
            open(src_outfname, "w", encoding="utf-8") as src_outfile:

        for src_line in tqdm(src_infile, total=num_lines):
            src_line = normalize(src_line, patterns)
            src_outfile.write(src_line.strip() + "\n")
| |
|