# File size: 3,647 Bytes
# 0a937d7
#: Control characters that ``normalize`` deletes outright.
#: Covers U+0001-U+0008, U+000E, U+000F and U+0011-U+001B.
# NOTE(review): U+0000 and U+0010 appear neither here nor in the list below - confirm that is intended.
CONTROLS = {
    chr(code_point)
    for code_point in list(range(0x01, 0x09)) + [0x0e, 0x0f] + list(range(0x11, 0x1c))
}
# The remaining control characters are not listed here because unicode normalization
# replaces them with a space instead:
# '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c', '\u001d', '\u001e', '\u001f'
#: Hyphen and dash characters.
# Written as \u escapes so the members survive any re-encoding of this source file.
HYPHENS = {
    '-',       # \u002d Hyphen-minus
    '\u2010',  # Hyphen
    '\u2011',  # Non-breaking hyphen
    '\u2043',  # Hyphen bullet
    '\u2012',  # Figure dash
    '\u2013',  # En dash
    '\u2014',  # Em dash
    '\u2015',  # Horizontal bar
}
#: Minus characters.
# Written as \u escapes so the members survive any re-encoding of this source file.
MINUSES = {
    '-',       # \u002d Hyphen-minus
    '\u2212',  # Minus
    '\uff0d',  # Full-width Hyphen-minus
    '\u207b',  # Superscript minus
}
#: Plus characters.
# Written as \u escapes so the members survive any re-encoding of this source file.
PLUSES = {
    '+',       # \u002b Plus
    '\uff0b',  # Full-width Plus
    '\u207a',  # Superscript plus
}
#: Slash characters.
# Written as \u escapes so the members survive any re-encoding of this source file.
SLASHES = {
    '/',       # \u002f Solidus
    '\u2044',  # Fraction slash
    '\u2215',  # Division slash
}
#: Tilde characters.
# Written as \u escapes so the members survive any re-encoding of this source file.
TILDES = {
    '~',       # \u007e Tilde
    '\u02dc',  # Small tilde
    '\u2053',  # Swung dash
    '\u223c',  # Tilde operator (in mbert vocab)
    '\u223d',  # Reversed tilde
    '\u223f',  # Sine wave
    '\u301c',  # Wave dash (in mbert vocab)
    '\uff5e',  # Full-width tilde (in mbert vocab)
}
#: Apostrophe characters.
# Written as \u escapes so the members survive any re-encoding of this source file.
APOSTROPHES = {
    "'",       # \u0027 Apostrophe
    '\u2019',  # Right single quotation mark
    '\u055a',  # Armenian apostrophe
    '\ua78b',  # Latin capital letter saltillo
    '\ua78c',  # Latin small letter saltillo
    '\uff07',  # Full-width apostrophe
}
#: Single quote characters.
# Written as \u escapes so the members survive any re-encoding of this source file.
SINGLE_QUOTES = {
    "'",       # \u0027 Apostrophe
    '\u2018',  # Left single quotation mark
    '\u2019',  # Right single quotation mark
    '\u201a',  # Single low-9 quotation mark
    '\u201b',  # Single high-reversed-9 quotation mark
}
#: Double quote characters.
# Written as \u escapes so the members survive any re-encoding of this source file.
DOUBLE_QUOTES = {
    '"',       # \u0022 Quotation mark
    '\u201c',  # Left double quotation mark
    '\u201d',  # Right double quotation mark
    '\u201e',  # Double low-9 quotation mark
    '\u201f',  # Double high-reversed-9 quotation mark
}
#: Accent characters.
# Written as \u escapes so the members survive any re-encoding of this source file.
ACCENTS = {
    '`',       # \u0060 Grave accent
    '\u00b4',  # Acute accent
}
#: Prime characters.
# Written as \u escapes so the members survive any re-encoding of this source file.
PRIMES = {
    '\u2032',  # Prime
    '\u2033',  # Double prime
    '\u2034',  # Triple prime
    '\u2035',  # Reversed prime
    '\u2036',  # Reversed double prime
    '\u2037',  # Reversed triple prime
    '\u2057',  # Quadruple prime
}
#: Every quote-like character: the union of the apostrophe, single-quote, double-quote,
#: accent and prime sets.
QUOTES = set().union(APOSTROPHES, SINGLE_QUOTES, DOUBLE_QUOTES, ACCENTS, PRIMES)
def normalize(text):
    """Return *text* with control, dash, quote, prime, ellipsis and slash variants normalized.

    The steps are applied in this order:

    1. Every character in ``CONTROLS`` is deleted.
    2. Vertical tab (U+000B), form feed (U+000C) and next line (U+0085) become a space.
    3. Every character in ``HYPHENS`` or ``MINUSES`` becomes ``-``.
    4. Soft hyphens (U+00AD) are deleted.
    5. Every character in ``DOUBLE_QUOTES`` becomes ``"``.
    6. Every character in ``SINGLE_QUOTES``, ``APOSTROPHES`` or ``ACCENTS`` becomes ``'``.
    7. Prime characters become one to four ``'`` characters, matching their multiplicity.
    8. The ellipsis character (U+2026) becomes ``...`` and a spaced-out `` . . . ``
       becomes `` ... ``.
    9. Every character in ``SLASHES`` becomes ``/``.

    Tilde variants (``TILDES``) are left untouched.

    :param text: The string to normalize.
    :returns: The normalized string.
    """
    for control in CONTROLS:
        text = text.replace(control, '')
    text = text.replace('\u000b', ' ').replace('\u000c', ' ').replace('\u0085', ' ')

    for hyphen in HYPHENS | MINUSES:
        text = text.replace(hyphen, '-')
    text = text.replace('\u00ad', '')  # soft hyphen

    for double_quote in DOUBLE_QUOTES:
        text = text.replace(double_quote, '"')  # \u0022
    for single_quote in (SINGLE_QUOTES | APOSTROPHES | ACCENTS):
        text = text.replace(single_quote, "'")  # \u0027

    # Primes expand to as many apostrophes as their multiplicity. The literals are
    # \u escapes so that they still match after the source file is re-encoded.
    text = text.replace('\u2032', "'")  # prime
    text = text.replace('\u2035', "'")  # reversed prime
    text = text.replace('\u2033', "''")  # double prime
    text = text.replace('\u2036', "''")  # reversed double prime
    text = text.replace('\u2034', "'''")  # triple prime
    text = text.replace('\u2037', "'''")  # reversed triple prime
    text = text.replace('\u2057', "''''")  # quadruple prime

    # \u2026 horizontal ellipsis, then collapse the spaced-out form.
    text = text.replace('\u2026', '...').replace(' . . . ', ' ... ')

    for slash in SLASHES:
        text = text.replace(slash, '/')

    # Tilde normalization is disabled:
    # for tilde in TILDES:
    #     text = text.replace(tilde, '~')
    return text