#: Control characters that ``normalize`` deletes outright.
CONTROLS = {
    '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008',
    '\u000e', '\u000f', '\u0011', '\u0012', '\u0013', '\u0014', '\u0015', '\u0016',
    '\u0017', '\u0018', '\u0019', '\u001a', '\u001b',
}
# There are further control characters, but they are instead replaced with a space
# by unicode normalization:
# '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c', '\u001d', '\u001e', '\u001f'
# NOTE(review): '\u0000', '\u0010' and '\u007f' appear in neither list -- confirm
# whether leaving them untouched is intentional.
#: Hyphen and dash characters.
#: Written as escapes so the members survive any re-encoding of this file.
HYPHENS = {
    '-',       # \u002d Hyphen-minus
    '\u2010',  # Hyphen
    '\u2011',  # Non-breaking hyphen
    '\u2043',  # Hyphen bullet
    '\u2012',  # Figure dash
    '\u2013',  # En dash
    '\u2014',  # Em dash
    '\u2015',  # Horizontal bar
}
#: Minus characters.
#: Written as escapes so the members survive any re-encoding of this file.
MINUSES = {
    '-',       # \u002d Hyphen-minus
    '\u2212',  # Minus
    '\uff0d',  # Full-width hyphen-minus
    '\u207b',  # Superscript minus
}
#: Plus characters.
#: Written as escapes so the members survive any re-encoding of this file.
PLUSES = {
    '+',       # \u002b Plus
    '\uff0b',  # Full-width plus
    '\u207a',  # Superscript plus
}
#: Slash characters.
#: Written as escapes so the members survive any re-encoding of this file.
SLASHES = {
    '/',       # \u002f Solidus
    '\u2044',  # Fraction slash
    '\u2215',  # Division slash
}
#: Tilde characters.
#: Written as escapes so the members survive any re-encoding of this file.
TILDES = {
    '~',       # \u007e Tilde
    '\u02dc',  # Small tilde
    '\u2053',  # Swung dash
    '\u223c',  # Tilde operator (in mBERT vocab)
    '\u223d',  # Reversed tilde
    '\u223f',  # Sine wave
    '\u301c',  # Wave dash (in mBERT vocab)
    '\uff5e',  # Full-width tilde (in mBERT vocab)
}
#: Apostrophe characters.
#: Written as escapes so the members survive any re-encoding of this file.
APOSTROPHES = {
    "'",       # \u0027 Apostrophe
    '\u2019',  # Right single quotation mark
    '\u055a',  # Armenian apostrophe
    '\ua78b',  # Latin capital letter saltillo
    '\ua78c',  # Latin small letter saltillo
    '\uff07',  # Full-width apostrophe
}
#: Single quote characters.
#: Written as escapes so the members survive any re-encoding of this file.
SINGLE_QUOTES = {
    "'",       # \u0027 Apostrophe
    '\u2018',  # Left single quotation mark
    '\u2019',  # Right single quotation mark
    '\u201a',  # Single low-9 quotation mark
    '\u201b',  # Single high-reversed-9 quotation mark
}
#: Double quote characters.
#: Written as escapes so the members survive any re-encoding of this file.
DOUBLE_QUOTES = {
    '"',       # \u0022 Quotation mark
    '\u201c',  # Left double quotation mark
    '\u201d',  # Right double quotation mark
    '\u201e',  # Double low-9 quotation mark
    '\u201f',  # Double high-reversed-9 quotation mark
}
#: Accent characters.
#: Written as escapes so the members survive any re-encoding of this file.
ACCENTS = {
    '`',       # \u0060 Grave accent
    '\u00b4',  # Acute accent
}
#: Prime characters.
#: Written as escapes so the members survive any re-encoding of this file.
PRIMES = {
    '\u2032',  # Prime
    '\u2033',  # Double prime
    '\u2034',  # Triple prime
    '\u2035',  # Reversed prime
    '\u2036',  # Reversed double prime
    '\u2037',  # Reversed triple prime
    '\u2057',  # Quadruple prime
}
#: Quote characters, including apostrophes, single quotes, double quotes, accents and primes.
QUOTES = APOSTROPHES | SINGLE_QUOTES | DOUBLE_QUOTES | ACCENTS | PRIMES
def normalize(text: str) -> str:
    """Return *text* with control characters removed and punctuation variants unified.

    The replacements are applied in this order:

    1. Every character in ``CONTROLS`` is deleted.
    2. Vertical tab (U+000B), form feed (U+000C) and U+0085 become a space.
    3. Every character in ``HYPHENS`` or ``MINUSES`` becomes ``-``; the soft
       hyphen (U+00AD) is deleted.
    4. Every character in ``DOUBLE_QUOTES`` becomes ``"``; every character in
       ``SINGLE_QUOTES``, ``APOSTROPHES`` or ``ACCENTS`` becomes ``'``.
    5. Prime characters become one to four ``'`` according to their multiplicity.
    6. The ellipsis character (U+2026) becomes ``...`` and the spaced form
       ``' . . . '`` becomes ``' ... '``.
    7. Every character in ``SLASHES`` becomes ``/``.

    Characters in ``TILDES`` are left unchanged.

    :param text: The string to normalize.
    :returns: The normalized string.
    """
    for control in CONTROLS:
        text = text.replace(control, '')
    text = text.replace('\u000b', ' ').replace('\u000c', ' ').replace('\u0085', ' ')

    for hyphen in HYPHENS | MINUSES:
        text = text.replace(hyphen, '-')
    text = text.replace('\u00ad', '')  # soft hyphen

    for double_quote in DOUBLE_QUOTES:
        text = text.replace(double_quote, '"')  # \u0022
    for single_quote in SINGLE_QUOTES | APOSTROPHES | ACCENTS:
        text = text.replace(single_quote, "'")  # \u0027

    # Primes are spelled as escapes (not literal glyphs) so that a re-encoding
    # of this file cannot silently turn them into strings that never match.
    text = text.replace('\u2032', "'")     # prime
    text = text.replace('\u2035', "'")     # reversed prime
    text = text.replace('\u2033', "''")    # double prime
    text = text.replace('\u2036', "''")    # reversed double prime
    text = text.replace('\u2034', "'''")   # triple prime
    text = text.replace('\u2037', "'''")   # reversed triple prime
    text = text.replace('\u2057', "''''")  # quadruple prime

    # \u2026 ellipsis character first, then the hand-typed spaced variant.
    text = text.replace('\u2026', '...').replace(' . . . ', ' ... ')

    for slash in SLASHES:
        text = text.replace(slash, '/')

    # Tilde variants (TILDES) are intentionally not mapped to '~'; that step
    # was disabled in the original code.
    # NOTE(review): presumably because several of them are in the mBERT
    # vocabulary (see the notes on TILDES) -- confirm before re-enabling.

    return text