# code_SAS_VLM2Vec/src/text_utils/normalize_text.py
# MgGladys's picture
# Add files using upload-large-folder tool
# 0a937d7 verified
#: Control characters: U+0001-U+0008, U+000E, U+000F and U+0011-U+001B.
CONTROLS = {
    chr(code_point)
    for code_point in (*range(0x01, 0x09), 0x0e, 0x0f, *range(0x11, 0x1c))
}
# Further control characters exist, but unicode normalization replaces those with a space instead:
# '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c', '\u001d', '\u001e', '\u001f'
# NOTE(review): U+0000 and U+0010 appear in neither this set nor the list above - confirm that is intended.
#: Hyphen and dash characters.
#: Written as escapes so the members survive any re-encoding of this file.
HYPHENS = {
    '\u002d',  # Hyphen-minus
    '\u2010',  # Hyphen
    '\u2011',  # Non-breaking hyphen
    '\u2043',  # Hyphen bullet
    '\u2012',  # Figure dash
    '\u2013',  # En dash
    '\u2014',  # Em dash
    '\u2015',  # Horizontal bar
}
#: Minus characters.
#: Written as escapes so the members survive any re-encoding of this file.
MINUSES = {
    '\u002d',  # Hyphen-minus
    '\u2212',  # Minus
    '\uff0d',  # Full-width hyphen-minus
    '\u207b',  # Superscript minus
}
#: Plus characters.
#: Written as escapes so the members survive any re-encoding of this file.
PLUSES = {
    '\u002b',  # Plus
    '\uff0b',  # Full-width plus
    '\u207a',  # Superscript plus
}
#: Slash characters.
#: Written as escapes so the members survive any re-encoding of this file.
SLASHES = {
    '\u002f',  # Solidus
    '\u2044',  # Fraction slash
    '\u2215',  # Division slash
}
#: Tilde characters.
#: Written as escapes so the members survive any re-encoding of this file.
TILDES = {
    '\u007e',  # Tilde
    '\u02dc',  # Small tilde
    '\u2053',  # Swung dash
    '\u223c',  # Tilde operator (in mbert vocab)
    '\u223d',  # Reversed tilde
    '\u223f',  # Sine wave
    '\u301c',  # Wave dash (in mbert vocab)
    '\uff5e',  # Full-width tilde (in mbert vocab)
}
#: Apostrophe characters.
#: Written as escapes so the members survive any re-encoding of this file.
APOSTROPHES = {
    '\u0027',  # Apostrophe
    '\u2019',  # Right single quotation mark
    '\u055a',  # Armenian apostrophe
    '\ua78b',  # Latin capital letter saltillo
    '\ua78c',  # Latin small letter saltillo
    '\uff07',  # Full-width apostrophe
}
#: Single quote characters.
#: Written as escapes so the members survive any re-encoding of this file.
SINGLE_QUOTES = {
    '\u0027',  # Apostrophe
    '\u2018',  # Left single quotation mark
    '\u2019',  # Right single quotation mark
    '\u201a',  # Single low-9 quotation mark
    '\u201b',  # Single high-reversed-9 quotation mark
}
#: Double quote characters.
#: Written as escapes so the members survive any re-encoding of this file.
DOUBLE_QUOTES = {
    '\u0022',  # Quotation mark
    '\u201c',  # Left double quotation mark
    '\u201d',  # Right double quotation mark
    '\u201e',  # Double low-9 quotation mark
    '\u201f',  # Double high-reversed-9 quotation mark
}
#: Accent characters.
#: Written as escapes so the members survive any re-encoding of this file.
ACCENTS = {
    '\u0060',  # Grave accent
    '\u00b4',  # Acute accent
}
#: Prime characters.
#: Written as escapes so the members survive any re-encoding of this file.
PRIMES = {
    '\u2032',  # Prime
    '\u2033',  # Double prime
    '\u2034',  # Triple prime
    '\u2035',  # Reversed prime
    '\u2036',  # Reversed double prime
    '\u2037',  # Reversed triple prime
    '\u2057',  # Quadruple prime
}
#: Every quote-like character: the union of the apostrophe, single-quote, double-quote, accent and prime sets.
QUOTES = set().union(APOSTROPHES, SINGLE_QUOTES, DOUBLE_QUOTES, ACCENTS, PRIMES)
def normalize(text):
    """Return ``text`` with punctuation variants collapsed to plain ASCII forms.

    The following rewrites are applied, in order:

    * every member of ``CONTROLS`` is deleted;
    * vertical tab (U+000B), form feed (U+000C) and next line (U+0085)
      become a space;
    * every member of ``HYPHENS`` and ``MINUSES`` becomes ``-`` and the soft
      hyphen (U+00AD) is deleted;
    * every member of ``DOUBLE_QUOTES`` becomes a straight double quote;
    * every member of ``SINGLE_QUOTES``, ``APOSTROPHES`` and ``ACCENTS``
      becomes a straight single quote;
    * primes become one to four straight single quotes, matching their
      multiplicity;
    * the ellipsis character (U+2026) becomes ``...`` and a spaced
      `` . . . `` becomes `` ... ``;
    * every member of ``SLASHES`` becomes ``/``.

    Members of ``TILDES`` are left untouched.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    for control in CONTROLS:
        text = text.replace(control, '')
    # Vertical tab, form feed and NEL (next line) are turned into spaces.
    for whitespace in ('\u000b', '\u000c', '\u0085'):
        text = text.replace(whitespace, ' ')

    for dash in HYPHENS | MINUSES:
        text = text.replace(dash, '-')
    text = text.replace('\u00ad', '')  # soft hyphen

    for double_quote in DOUBLE_QUOTES:
        text = text.replace(double_quote, '"')  # \u0022
    for single_quote in SINGLE_QUOTES | APOSTROPHES | ACCENTS:
        text = text.replace(single_quote, "'")  # \u0027

    # Code points are spelled as escapes so that a re-encoding of this file
    # cannot silently turn a prime into some other character.
    prime_replacements = (
        ('\u2032', "'"),     # prime
        ('\u2035', "'"),     # reversed prime
        ('\u2033', "''"),    # double prime
        ('\u2036', "''"),    # reversed double prime
        ('\u2034', "'''"),   # triple prime
        ('\u2037', "'''"),   # reversed triple prime
        ('\u2057', "''''"),  # quadruple prime
    )
    for prime, straight_quotes in prime_replacements:
        text = text.replace(prime, straight_quotes)

    # Ellipsis character first, then the spaced-out ASCII form.
    text = text.replace('\u2026', '...').replace(' . . . ', ' ... ')

    for slash in SLASHES:
        text = text.replace(slash, '/')

    # NOTE: normalization of TILDES to '~' is disabled here; tilde variants pass through unchanged.
    return text