| | import argparse |
| | import unicodedata |
| | import re |
| | from tqdm import tqdm |
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| | import re |
| | import unicodedata |
| |
|
| | PUNCTS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~«»' |
| | UNICODE_PUNCT = { |
| | ",": ",", |
| | "。": ".", |
| | "、": ",", |
| | "„": '"', |
| | "”": '"', |
| | "“": '"', |
| | "«": '"', |
| | "»": '"', |
| | "1": '"', |
| | "」": '"', |
| | "「": '"', |
| | "《": '"', |
| | "》": '"', |
| | "´": "'", |
| | "∶": ":", |
| | ":": ":", |
| | "?": "?", |
| | "!": "!", |
| | "(": "(", |
| | ")": ")", |
| | ";": ";", |
| | "–": "-", |
| | "—": " - ", |
| | ".": ". ", |
| | "~": "~", |
| | "’": "'", |
| | "…": "...", |
| | "━": "-", |
| | "〈": "<", |
| | "〉": ">", |
| | "【": "[", |
| | "】": "]", |
| | "%": "%", |
| | "►": "-", |
| | "■": " ", |
| | } |
| |
|
| | UNICODE_PUNCT_RE = re.compile(f"[{''.join(UNICODE_PUNCT.keys())}]") |
| |
|
| |
|
| | def replace_unicode_punct(text: str) -> str: |
| | return "".join(UNICODE_PUNCT.get(c, c) for c in text) |
| |
|
| |
|
| | def remove_unicode_punct(text: str) -> str: |
| | """More aggressive version of replace_unicode_punct but also faster.""" |
| | return UNICODE_PUNCT_RE.sub("", text) |
| |
|
| |
|
| | def strip_accents(line: str) -> str: |
| | """Strips accents from a piece of text.""" |
| | nfd = unicodedata.normalize("NFD", line) |
| | output = [c for c in nfd if unicodedata.category(c) != "Mn"] |
| | if len(output) == line: |
| | return line |
| | return "".join(output) |
| |
|
| |
|
| | |
| | NON_PRINTING_CHARS_RE = re.compile( |
| | f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]" |
| | ) |
| | DIGIT_RE = re.compile(r"\d") |
| | PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile( |
| | (UNICODE_PUNCT_RE.pattern + NON_PRINTING_CHARS_RE.pattern).replace("][", "") |
| | ) |
| |
|
| |
|
| | def remove_non_printing_char(text: str) -> str: |
| | return NON_PRINTING_CHARS_RE.sub("", text) |
| |
|
| |
|
| | def normalize(line: str, accent=True, case=True, numbers=True, punct=1) -> str: |
| | line = line.strip() |
| | if not line: |
| | return line |
| | if case: |
| | line = line.lower() |
| | if accent: |
| | line = strip_accents(line) |
| | if numbers: |
| | line = DIGIT_RE.sub("0", line) |
| | if punct == 1: |
| | line = replace_unicode_punct(line) |
| | elif punct == 2: |
| | line = remove_unicode_punct(line) |
| | line = remove_non_printing_char(line) |
| | return line |
| |
|
| |
|
| | def slow_normalize_for_dedup(line: str) -> str: |
| | return normalize(line, accent=False, case=True, numbers=True, punct=2) |
| |
|
| |
|
| | def normalize_for_dedup(line: str) -> str: |
| | line = line.strip() |
| | if not line: |
| | return line |
| | |
| | line = line.lower() |
| | |
| | line = DIGIT_RE.sub("0", line) |
| | line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line) |
| | return line |
| |
|
| | |
| | def normalize_text(line): |
| | normalized_line = unicodedata.normalize('NFKC', line).lower() |
| |
|
| | |
| | normalized_line = normalized_line.rstrip() |
| | if normalized_line and normalized_line[-1] not in PUNCTS: |
| | normalized_line += '.' |
| |
|
| | |
| | |
| | normalized_line = normalize(normalized_line, accent=False, case=True, numbers=True, punct=1) |
| | return normalized_line |
| |
|
| |
|
| | def normalize_file(input_file, output_file, cutoff=None): |
| | with (open(output_file, 'w', encoding='utf-8') as f, |
| | open(input_file, 'r', encoding='utf-8') as lines): |
| | for line_count, line in tqdm(enumerate(lines), desc="Processing"): |
| | f.write(normalize_text(line) + "\n") |
| | if cutoff and line_count >= cutoff: |
| | break |
| |
|
| |
|
| | if __name__ == "__main__": |
| | parser = argparse.ArgumentParser(description='Normalize text file line by line, ensure trailing punctuation, replace newlines with spaces, and show progress.') |
| | parser.add_argument('input_file', type=str, help='Input file path') |
| | parser.add_argument('output_file', type=str, help='Output file path') |
| | parser.add_argument('--cutoff', required=False, type=int, help='Max number of lines to process') |
| |
|
| | args = parser.parse_args() |
| |
|
| | normalize_file(args.input_file, args.output_file, args.cutoff) |
| |
|