Buckets:
MisterAI/LocalAI_Demo_backends / cpu-diffusers.upgrade-tmp /venv /lib /python3.10 /site-packages /ftfy /chardata.py
| """ | |
| This gives other modules access to the gritty details about characters and the | |
| encodings that use them. | |
| """ | |
| from __future__ import annotations | |
| import html | |
| import itertools | |
| import re | |
| import unicodedata | |
| # These are the encodings we will try to fix in ftfy, in the | |
| # order that they should be tried. | |
| CHARMAP_ENCODINGS = [ | |
| "latin-1", | |
| "sloppy-windows-1252", | |
| "sloppy-windows-1251", | |
| "sloppy-windows-1250", | |
| "sloppy-windows-1253", | |
| "sloppy-windows-1254", | |
| "sloppy-windows-1257", | |
| "iso-8859-2", | |
| "macroman", | |
| "cp437", | |
| ] | |
| SINGLE_QUOTE_RE = re.compile("[\u02bc\u2018-\u201b]") | |
| DOUBLE_QUOTE_RE = re.compile("[\u201c-\u201f]") | |
| def _build_regexes() -> dict[str, re.Pattern[str]]: | |
| """ | |
| ENCODING_REGEXES contain reasonably fast ways to detect if we | |
| could represent a given string in a given encoding. The simplest one is | |
| the 'ascii' detector, which of course just determines if all characters | |
| are between U+0000 and U+007F. | |
| """ | |
| # Define a regex that matches ASCII text. | |
| encoding_regexes = {"ascii": re.compile("^[\x00-\x7f]*$")} | |
| for encoding in CHARMAP_ENCODINGS: | |
| # Make a sequence of characters that bytes \x80 to \xFF decode to | |
| # in each encoding, as well as byte \x1A, which is used to represent | |
| # the replacement character � in the sloppy-* encodings. | |
| byte_range = bytes(list(range(0x80, 0x100)) + [0x1A]) | |
| charlist = byte_range.decode(encoding) | |
| # The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B | |
| # to \x7F -- will decode as those ASCII characters in any encoding we | |
| # support, so we can just include them as ranges. This also lets us | |
| # not worry about escaping regex special characters, because all of | |
| # them are in the \x1B to \x7F range. | |
| regex = f"^[\x00-\x19\x1b-\x7f{charlist}]*$" | |
| encoding_regexes[encoding] = re.compile(regex) | |
| return encoding_regexes | |
| ENCODING_REGEXES = _build_regexes() | |
| def _build_html_entities() -> dict[str, str]: | |
| entities = {} | |
| # Create a dictionary based on the built-in HTML5 entity dictionary. | |
| # Add a limited set of HTML entities that we'll also decode if they've | |
| # been case-folded to uppercase, such as decoding Ñ as "Ñ". | |
| for name, char in html.entities.html5.items(): # type: ignore | |
| if name.endswith(";"): | |
| entities["&" + name] = char | |
| # Restrict the set of characters we can attempt to decode if their | |
| # name has been uppercased. If we tried to handle all entity names, | |
| # the results would be ambiguous. | |
| if name == name.lower(): | |
| name_upper = name.upper() | |
| entity_upper = "&" + name_upper | |
| if html.unescape(entity_upper) == entity_upper: | |
| entities[entity_upper] = char.upper() | |
| return entities | |
| HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};") | |
| HTML_ENTITIES = _build_html_entities() | |
| def possible_encoding(text: str, encoding: str) -> bool: | |
| """ | |
| Given text and a single-byte encoding, check whether that text could have | |
| been decoded from that single-byte encoding. | |
| In other words, check whether it can be encoded in that encoding, possibly | |
| sloppily. | |
| """ | |
| return bool(ENCODING_REGEXES[encoding].match(text)) | |
| def _build_control_char_mapping() -> dict[int, None]: | |
| """ | |
| Build a translate mapping that strips likely-unintended control characters. | |
| See :func:`ftfy.fixes.remove_control_chars` for a description of these | |
| codepoint ranges and why they should be removed. | |
| """ | |
| control_chars: dict[int, None] = {} | |
| for i in itertools.chain( | |
| range(0x00, 0x09), | |
| [0x0B], | |
| range(0x0E, 0x20), | |
| [0x7F], | |
| range(0x206A, 0x2070), | |
| [0xFEFF], | |
| range(0xFFF9, 0xFFFD), | |
| ): | |
| control_chars[i] = None | |
| return control_chars | |
| CONTROL_CHARS = _build_control_char_mapping() | |
| # Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0' | |
| # that some Windows-1252 program converted to a plain space. | |
| # | |
| # The smaller values are included on a case-by-case basis, because we don't want | |
| # to decode likely input sequences to unlikely characters. These are the ones | |
| # that *do* form likely characters before 0xa0: | |
| # | |
| # 0xc2 -> U+A0 NO-BREAK SPACE | |
| # 0xc3 -> U+E0 LATIN SMALL LETTER A WITH GRAVE | |
| # 0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON | |
| # 0xce -> U+3A0 GREEK CAPITAL LETTER PI | |
| # 0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER | |
| # 0xd9 -> U+660 ARABIC-INDIC DIGIT ZERO | |
| # | |
| # In three-character sequences, we exclude some lead bytes in some cases. | |
| # | |
| # When the lead byte is immediately followed by 0xA0, we shouldn't accept | |
| # a space there, because it leads to some less-likely character ranges: | |
| # | |
| # 0xe0 -> Samaritan script | |
| # 0xe1 -> Mongolian script (corresponds to Latin-1 'á' which is too common) | |
| # | |
| # We accept 0xe2 and 0xe3, which cover many scripts. Bytes 0xe4 and | |
| # higher point mostly to CJK characters, which we generally don't want to | |
| # decode near Latin lowercase letters. | |
| # | |
| # In four-character sequences, the lead byte must be F0, because that accounts | |
| # for almost all of the usage of high-numbered codepoints (tag characters whose | |
| # UTF-8 starts with the byte F3 are only used in some rare new emoji sequences). | |
| # | |
| # This is meant to be applied to encodings of text that tests true for `is_bad`. | |
| # Any of these could represent characters that legitimately appear surrounded by | |
| # spaces, particularly U+C5 (Å), which is a word in multiple languages! | |
| # | |
| # We should consider checking for b'\x85' being converted to ... in the future. | |
| # I've seen it once, but the text still wasn't recoverable. | |
| ALTERED_UTF8_RE = re.compile( | |
| b"[\xc2\xc3\xc5\xce\xd0\xd9][ ]" | |
| b"|[\xe2\xe3][ ][\x80-\x84\x86-\x9f\xa1-\xbf]" | |
| b"|[\xe0-\xe3][\x80-\x84\x86-\x9f\xa1-\xbf][ ]" | |
| b"|[\xf0][ ][\x80-\xbf][\x80-\xbf]" | |
| b"|[\xf0][\x80-\xbf][ ][\x80-\xbf]" | |
| b"|[\xf0][\x80-\xbf][\x80-\xbf][ ]" | |
| ) | |
| # This expression matches UTF-8 and CESU-8 sequences where some of the | |
| # continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is | |
| # used within ftfy to represent a byte that produced the replacement character | |
| # \ufffd. We don't know which byte it was, but we can at least decode the UTF-8 | |
| # sequence as \ufffd instead of failing to re-decode it at all. | |
| # | |
| # In some cases, we allow the ASCII '?' in place of \ufffd, but at most once per | |
| # sequence. | |
| LOSSY_UTF8_RE = re.compile( | |
| b"[\xc2-\xdf][\x1a]" | |
| b"|[\xc2-\xc3][?]" | |
| b"|\xed[\xa0-\xaf][\x1a?]\xed[\xb0-\xbf][\x1a?\x80-\xbf]" | |
| b"|\xed[\xa0-\xaf][\x1a?\x80-\xbf]\xed[\xb0-\xbf][\x1a?]" | |
| b"|[\xe0-\xef][\x1a?][\x1a\x80-\xbf]" | |
| b"|[\xe0-\xef][\x1a\x80-\xbf][\x1a?]" | |
| b"|[\xf0-\xf4][\x1a?][\x1a\x80-\xbf][\x1a\x80-\xbf]" | |
| b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a?][\x1a\x80-\xbf]" | |
| b"|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a?]" | |
| b"|\x1a" | |
| ) | |
| # This regex matches C1 control characters, which occupy some of the positions | |
| # in the Latin-1 character map that Windows assigns to other characters instead. | |
| C1_CONTROL_RE = re.compile(r"[\x80-\x9f]") | |
| # A translate mapping that breaks ligatures made of Latin letters. While | |
| # ligatures may be important to the representation of other languages, in Latin | |
| # letters they tend to represent a copy/paste error. It omits ligatures such | |
| # as æ that are frequently used intentionally. | |
| # | |
| # This list additionally includes some Latin digraphs that represent two | |
| # characters for legacy encoding reasons, not for typographical reasons. | |
| # | |
| # Ligatures and digraphs may also be separated by NFKC normalization, but that | |
| # is sometimes more normalization than you want. | |
| LIGATURES = { | |
| ord("IJ"): "IJ", # Dutch ligatures | |
| ord("ij"): "ij", | |
| ord("ʼn"): "ʼn", # Afrikaans digraph meant to avoid auto-curled quote | |
| ord("DZ"): "DZ", # Serbian/Croatian digraphs for Cyrillic conversion | |
| ord("Dz"): "Dz", | |
| ord("dz"): "dz", | |
| ord("DŽ"): "DŽ", | |
| ord("Dž"): "Dž", | |
| ord("dž"): "dž", | |
| ord("LJ"): "LJ", | |
| ord("Lj"): "Lj", | |
| ord("lj"): "lj", | |
| ord("NJ"): "NJ", | |
| ord("Nj"): "Nj", | |
| ord("nj"): "nj", | |
| ord("ff"): "ff", # Latin typographical ligatures | |
| ord("fi"): "fi", | |
| ord("fl"): "fl", | |
| ord("ffi"): "ffi", | |
| ord("ffl"): "ffl", | |
| ord("ſt"): "ſt", | |
| ord("st"): "st", | |
| } | |
| def _build_width_map() -> dict[int, str]: | |
| """ | |
| Build a translate mapping that replaces halfwidth and fullwidth forms | |
| with their standard-width forms. | |
| """ | |
| # Though it's not listed as a fullwidth character, we'll want to convert | |
| # U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start | |
| # with that in the dictionary. | |
| width_map = {0x3000: " "} | |
| for i in range(0xFF01, 0xFFF0): | |
| char = chr(i) | |
| alternate = unicodedata.normalize("NFKC", char) | |
| if alternate != char: | |
| width_map[i] = alternate | |
| return width_map | |
| WIDTH_MAP = _build_width_map() | |
| # Character classes that help us pinpoint embedded mojibake. These can | |
| # include common characters, because we'll also check them for 'badness'. | |
| # | |
| # Though they go on for many lines, the members of this dictionary are | |
| # single concatenated strings. | |
| # | |
| # This code is generated using scripts/char_data_table.py. | |
| UTF8_CLUES: dict[str, str] = { | |
| # Letters that decode to 0xC2 - 0xDF in a Latin-1-like encoding | |
| "utf8_first_of_2": ( | |
| "\N{LATIN CAPITAL LETTER A WITH BREVE}" # windows-1250:C3 | |
| "\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}" # latin-1:C2 | |
| "\N{LATIN CAPITAL LETTER A WITH DIAERESIS}" # latin-1:C4 | |
| "\N{LATIN CAPITAL LETTER A WITH MACRON}" # windows-1257:C2 | |
| "\N{LATIN CAPITAL LETTER A WITH RING ABOVE}" # latin-1:C5 | |
| "\N{LATIN CAPITAL LETTER A WITH TILDE}" # latin-1:C3 | |
| "\N{LATIN CAPITAL LETTER AE}" # latin-1:C6 | |
| "\N{LATIN CAPITAL LETTER C WITH ACUTE}" # windows-1250:C6 | |
| "\N{LATIN CAPITAL LETTER C WITH CARON}" # windows-1250:C8 | |
| "\N{LATIN CAPITAL LETTER C WITH CEDILLA}" # latin-1:C7 | |
| "\N{LATIN CAPITAL LETTER D WITH CARON}" # windows-1250:CF | |
| "\N{LATIN CAPITAL LETTER D WITH STROKE}" # windows-1250:D0 | |
| "\N{LATIN CAPITAL LETTER E WITH ACUTE}" # latin-1:C9 | |
| "\N{LATIN CAPITAL LETTER E WITH CARON}" # windows-1250:CC | |
| "\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}" # latin-1:CA | |
| "\N{LATIN CAPITAL LETTER E WITH DIAERESIS}" # latin-1:CB | |
| "\N{LATIN CAPITAL LETTER E WITH DOT ABOVE}" # windows-1257:CB | |
| "\N{LATIN CAPITAL LETTER E WITH GRAVE}" # latin-1:C8 | |
| "\N{LATIN CAPITAL LETTER E WITH MACRON}" # windows-1257:C7 | |
| "\N{LATIN CAPITAL LETTER E WITH OGONEK}" # windows-1250:CA | |
| "\N{LATIN CAPITAL LETTER ETH}" # latin-1:D0 | |
| "\N{LATIN CAPITAL LETTER G WITH BREVE}" # windows-1254:D0 | |
| "\N{LATIN CAPITAL LETTER G WITH CEDILLA}" # windows-1257:CC | |
| "\N{LATIN CAPITAL LETTER I WITH ACUTE}" # latin-1:CD | |
| "\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}" # latin-1:CE | |
| "\N{LATIN CAPITAL LETTER I WITH DIAERESIS}" # latin-1:CF | |
| "\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}" # windows-1254:DD | |
| "\N{LATIN CAPITAL LETTER I WITH GRAVE}" # latin-1:CC | |
| "\N{LATIN CAPITAL LETTER I WITH MACRON}" # windows-1257:CE | |
| "\N{LATIN CAPITAL LETTER K WITH CEDILLA}" # windows-1257:CD | |
| "\N{LATIN CAPITAL LETTER L WITH ACUTE}" # windows-1250:C5 | |
| "\N{LATIN CAPITAL LETTER L WITH CEDILLA}" # windows-1257:CF | |
| "\N{LATIN CAPITAL LETTER L WITH STROKE}" # windows-1257:D9 | |
| "\N{LATIN CAPITAL LETTER N WITH ACUTE}" # windows-1250:D1 | |
| "\N{LATIN CAPITAL LETTER N WITH CARON}" # windows-1250:D2 | |
| "\N{LATIN CAPITAL LETTER N WITH CEDILLA}" # windows-1257:D2 | |
| "\N{LATIN CAPITAL LETTER N WITH TILDE}" # latin-1:D1 | |
| "\N{LATIN CAPITAL LETTER O WITH ACUTE}" # latin-1:D3 | |
| "\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}" # latin-1:D4 | |
| "\N{LATIN CAPITAL LETTER O WITH DIAERESIS}" # latin-1:D6 | |
| "\N{LATIN CAPITAL LETTER O WITH DOUBLE ACUTE}" # windows-1250:D5 | |
| "\N{LATIN CAPITAL LETTER O WITH GRAVE}" # latin-1:D2 | |
| "\N{LATIN CAPITAL LETTER O WITH MACRON}" # windows-1257:D4 | |
| "\N{LATIN CAPITAL LETTER O WITH STROKE}" # latin-1:D8 | |
| "\N{LATIN CAPITAL LETTER O WITH TILDE}" # latin-1:D5 | |
| "\N{LATIN CAPITAL LETTER R WITH CARON}" # windows-1250:D8 | |
| "\N{LATIN CAPITAL LETTER S WITH ACUTE}" # windows-1257:DA | |
| "\N{LATIN CAPITAL LETTER S WITH CARON}" # windows-1257:D0 | |
| "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" # windows-1254:DE | |
| "\N{LATIN CAPITAL LETTER T WITH CEDILLA}" # windows-1250:DE | |
| "\N{LATIN CAPITAL LETTER THORN}" # latin-1:DE | |
| "\N{LATIN CAPITAL LETTER U WITH ACUTE}" # latin-1:DA | |
| "\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}" # latin-1:DB | |
| "\N{LATIN CAPITAL LETTER U WITH DIAERESIS}" # latin-1:DC | |
| "\N{LATIN CAPITAL LETTER U WITH DOUBLE ACUTE}" # windows-1250:DB | |
| "\N{LATIN CAPITAL LETTER U WITH GRAVE}" # latin-1:D9 | |
| "\N{LATIN CAPITAL LETTER U WITH MACRON}" # windows-1257:DB | |
| "\N{LATIN CAPITAL LETTER U WITH OGONEK}" # windows-1257:D8 | |
| "\N{LATIN CAPITAL LETTER U WITH RING ABOVE}" # windows-1250:D9 | |
| "\N{LATIN CAPITAL LETTER Y WITH ACUTE}" # latin-1:DD | |
| "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" # windows-1257:CA | |
| "\N{LATIN CAPITAL LETTER Z WITH CARON}" # windows-1257:DE | |
| "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" # windows-1257:DD | |
| "\N{LATIN SMALL LETTER SHARP S}" # latin-1:DF | |
| "\N{MULTIPLICATION SIGN}" # latin-1:D7 | |
| "\N{GREEK CAPITAL LETTER BETA}" # windows-1253:C2 | |
| "\N{GREEK CAPITAL LETTER GAMMA}" # windows-1253:C3 | |
| "\N{GREEK CAPITAL LETTER DELTA}" # windows-1253:C4 | |
| "\N{GREEK CAPITAL LETTER EPSILON}" # windows-1253:C5 | |
| "\N{GREEK CAPITAL LETTER ZETA}" # windows-1253:C6 | |
| "\N{GREEK CAPITAL LETTER ETA}" # windows-1253:C7 | |
| "\N{GREEK CAPITAL LETTER THETA}" # windows-1253:C8 | |
| "\N{GREEK CAPITAL LETTER IOTA}" # windows-1253:C9 | |
| "\N{GREEK CAPITAL LETTER KAPPA}" # windows-1253:CA | |
| "\N{GREEK CAPITAL LETTER LAMDA}" # windows-1253:CB | |
| "\N{GREEK CAPITAL LETTER MU}" # windows-1253:CC | |
| "\N{GREEK CAPITAL LETTER NU}" # windows-1253:CD | |
| "\N{GREEK CAPITAL LETTER XI}" # windows-1253:CE | |
| "\N{GREEK CAPITAL LETTER OMICRON}" # windows-1253:CF | |
| "\N{GREEK CAPITAL LETTER PI}" # windows-1253:D0 | |
| "\N{GREEK CAPITAL LETTER RHO}" # windows-1253:D1 | |
| "\N{GREEK CAPITAL LETTER SIGMA}" # windows-1253:D3 | |
| "\N{GREEK CAPITAL LETTER TAU}" # windows-1253:D4 | |
| "\N{GREEK CAPITAL LETTER UPSILON}" # windows-1253:D5 | |
| "\N{GREEK CAPITAL LETTER PHI}" # windows-1253:D6 | |
| "\N{GREEK CAPITAL LETTER CHI}" # windows-1253:D7 | |
| "\N{GREEK CAPITAL LETTER PSI}" # windows-1253:D8 | |
| "\N{GREEK CAPITAL LETTER OMEGA}" # windows-1253:D9 | |
| "\N{GREEK CAPITAL LETTER IOTA WITH DIALYTIKA}" # windows-1253:DA | |
| "\N{GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA}" # windows-1253:DB | |
| "\N{GREEK SMALL LETTER ALPHA WITH TONOS}" # windows-1253:DC | |
| "\N{GREEK SMALL LETTER EPSILON WITH TONOS}" # windows-1253:DD | |
| "\N{GREEK SMALL LETTER ETA WITH TONOS}" # windows-1253:DE | |
| "\N{GREEK SMALL LETTER IOTA WITH TONOS}" # windows-1253:DF | |
| "\N{CYRILLIC CAPITAL LETTER VE}" # windows-1251:C2 | |
| "\N{CYRILLIC CAPITAL LETTER GHE}" # windows-1251:C3 | |
| "\N{CYRILLIC CAPITAL LETTER DE}" # windows-1251:C4 | |
| "\N{CYRILLIC CAPITAL LETTER IE}" # windows-1251:C5 | |
| "\N{CYRILLIC CAPITAL LETTER ZHE}" # windows-1251:C6 | |
| "\N{CYRILLIC CAPITAL LETTER ZE}" # windows-1251:C7 | |
| "\N{CYRILLIC CAPITAL LETTER I}" # windows-1251:C8 | |
| "\N{CYRILLIC CAPITAL LETTER SHORT I}" # windows-1251:C9 | |
| "\N{CYRILLIC CAPITAL LETTER KA}" # windows-1251:CA | |
| "\N{CYRILLIC CAPITAL LETTER EL}" # windows-1251:CB | |
| "\N{CYRILLIC CAPITAL LETTER EM}" # windows-1251:CC | |
| "\N{CYRILLIC CAPITAL LETTER EN}" # windows-1251:CD | |
| "\N{CYRILLIC CAPITAL LETTER O}" # windows-1251:CE | |
| "\N{CYRILLIC CAPITAL LETTER PE}" # windows-1251:CF | |
| "\N{CYRILLIC CAPITAL LETTER ER}" # windows-1251:D0 | |
| "\N{CYRILLIC CAPITAL LETTER ES}" # windows-1251:D1 | |
| "\N{CYRILLIC CAPITAL LETTER TE}" # windows-1251:D2 | |
| "\N{CYRILLIC CAPITAL LETTER U}" # windows-1251:D3 | |
| "\N{CYRILLIC CAPITAL LETTER EF}" # windows-1251:D4 | |
| "\N{CYRILLIC CAPITAL LETTER HA}" # windows-1251:D5 | |
| "\N{CYRILLIC CAPITAL LETTER TSE}" # windows-1251:D6 | |
| "\N{CYRILLIC CAPITAL LETTER CHE}" # windows-1251:D7 | |
| "\N{CYRILLIC CAPITAL LETTER SHA}" # windows-1251:D8 | |
| "\N{CYRILLIC CAPITAL LETTER SHCHA}" # windows-1251:D9 | |
| "\N{CYRILLIC CAPITAL LETTER HARD SIGN}" # windows-1251:DA | |
| "\N{CYRILLIC CAPITAL LETTER YERU}" # windows-1251:DB | |
| "\N{CYRILLIC CAPITAL LETTER SOFT SIGN}" # windows-1251:DC | |
| "\N{CYRILLIC CAPITAL LETTER E}" # windows-1251:DD | |
| "\N{CYRILLIC CAPITAL LETTER YU}" # windows-1251:DE | |
| "\N{CYRILLIC CAPITAL LETTER YA}" # windows-1251:DF | |
| ), | |
| # Letters that decode to 0xE0 - 0xEF in a Latin-1-like encoding | |
| "utf8_first_of_3": ( | |
| "\N{LATIN SMALL LETTER A WITH ACUTE}" # latin-1:E1 | |
| "\N{LATIN SMALL LETTER A WITH BREVE}" # windows-1250:E3 | |
| "\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}" # latin-1:E2 | |
| "\N{LATIN SMALL LETTER A WITH DIAERESIS}" # latin-1:E4 | |
| "\N{LATIN SMALL LETTER A WITH GRAVE}" # latin-1:E0 | |
| "\N{LATIN SMALL LETTER A WITH MACRON}" # windows-1257:E2 | |
| "\N{LATIN SMALL LETTER A WITH OGONEK}" # windows-1257:E0 | |
| "\N{LATIN SMALL LETTER A WITH RING ABOVE}" # latin-1:E5 | |
| "\N{LATIN SMALL LETTER A WITH TILDE}" # latin-1:E3 | |
| "\N{LATIN SMALL LETTER AE}" # latin-1:E6 | |
| "\N{LATIN SMALL LETTER C WITH ACUTE}" # windows-1250:E6 | |
| "\N{LATIN SMALL LETTER C WITH CARON}" # windows-1250:E8 | |
| "\N{LATIN SMALL LETTER C WITH CEDILLA}" # latin-1:E7 | |
| "\N{LATIN SMALL LETTER D WITH CARON}" # windows-1250:EF | |
| "\N{LATIN SMALL LETTER E WITH ACUTE}" # latin-1:E9 | |
| "\N{LATIN SMALL LETTER E WITH CARON}" # windows-1250:EC | |
| "\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}" # latin-1:EA | |
| "\N{LATIN SMALL LETTER E WITH DIAERESIS}" # latin-1:EB | |
| "\N{LATIN SMALL LETTER E WITH DOT ABOVE}" # windows-1257:EB | |
| "\N{LATIN SMALL LETTER E WITH GRAVE}" # latin-1:E8 | |
| "\N{LATIN SMALL LETTER E WITH MACRON}" # windows-1257:E7 | |
| "\N{LATIN SMALL LETTER E WITH OGONEK}" # windows-1250:EA | |
| "\N{LATIN SMALL LETTER E WITH OGONEK}" # windows-1250:EA | |
| "\N{LATIN SMALL LETTER G WITH CEDILLA}" # windows-1257:EC | |
| "\N{LATIN SMALL LETTER I WITH ACUTE}" # latin-1:ED | |
| "\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}" # latin-1:EE | |
| "\N{LATIN SMALL LETTER I WITH DIAERESIS}" # latin-1:EF | |
| "\N{LATIN SMALL LETTER I WITH GRAVE}" # latin-1:EC | |
| "\N{LATIN SMALL LETTER I WITH MACRON}" # windows-1257:EE | |
| "\N{LATIN SMALL LETTER I WITH OGONEK}" # windows-1257:E1 | |
| "\N{LATIN SMALL LETTER K WITH CEDILLA}" # windows-1257:ED | |
| "\N{LATIN SMALL LETTER L WITH ACUTE}" # windows-1250:E5 | |
| "\N{LATIN SMALL LETTER L WITH CEDILLA}" # windows-1257:EF | |
| "\N{LATIN SMALL LETTER R WITH ACUTE}" # windows-1250:E0 | |
| "\N{LATIN SMALL LETTER Z WITH ACUTE}" # windows-1257:EA | |
| "\N{GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS}" # windows-1253:E0 | |
| "\N{GREEK SMALL LETTER ALPHA}" # windows-1253:E1 | |
| "\N{GREEK SMALL LETTER BETA}" # windows-1253:E2 | |
| "\N{GREEK SMALL LETTER GAMMA}" # windows-1253:E3 | |
| "\N{GREEK SMALL LETTER DELTA}" # windows-1253:E4 | |
| "\N{GREEK SMALL LETTER EPSILON}" # windows-1253:E5 | |
| "\N{GREEK SMALL LETTER ZETA}" # windows-1253:E6 | |
| "\N{GREEK SMALL LETTER ETA}" # windows-1253:E7 | |
| "\N{GREEK SMALL LETTER THETA}" # windows-1253:E8 | |
| "\N{GREEK SMALL LETTER IOTA}" # windows-1253:E9 | |
| "\N{GREEK SMALL LETTER KAPPA}" # windows-1253:EA | |
| "\N{GREEK SMALL LETTER LAMDA}" # windows-1253:EB | |
| "\N{GREEK SMALL LETTER MU}" # windows-1253:EC | |
| "\N{GREEK SMALL LETTER NU}" # windows-1253:ED | |
| "\N{GREEK SMALL LETTER XI}" # windows-1253:EE | |
| "\N{GREEK SMALL LETTER OMICRON}" # windows-1253:EF | |
| "\N{CYRILLIC SMALL LETTER A}" # windows-1251:E0 | |
| "\N{CYRILLIC SMALL LETTER BE}" # windows-1251:E1 | |
| "\N{CYRILLIC SMALL LETTER VE}" # windows-1251:E2 | |
| "\N{CYRILLIC SMALL LETTER GHE}" # windows-1251:E3 | |
| "\N{CYRILLIC SMALL LETTER DE}" # windows-1251:E4 | |
| "\N{CYRILLIC SMALL LETTER IE}" # windows-1251:E5 | |
| "\N{CYRILLIC SMALL LETTER ZHE}" # windows-1251:E6 | |
| "\N{CYRILLIC SMALL LETTER ZE}" # windows-1251:E7 | |
| "\N{CYRILLIC SMALL LETTER I}" # windows-1251:E8 | |
| "\N{CYRILLIC SMALL LETTER SHORT I}" # windows-1251:E9 | |
| "\N{CYRILLIC SMALL LETTER KA}" # windows-1251:EA | |
| "\N{CYRILLIC SMALL LETTER EL}" # windows-1251:EB | |
| "\N{CYRILLIC SMALL LETTER EM}" # windows-1251:EC | |
| "\N{CYRILLIC SMALL LETTER EN}" # windows-1251:ED | |
| "\N{CYRILLIC SMALL LETTER O}" # windows-1251:EE | |
| "\N{CYRILLIC SMALL LETTER PE}" # windows-1251:EF | |
| ), | |
| # Letters that decode to 0xF0 or 0xF3 in a Latin-1-like encoding. | |
| # (Other leading bytes correspond only to unassigned codepoints) | |
| "utf8_first_of_4": ( | |
| "\N{LATIN SMALL LETTER D WITH STROKE}" # windows-1250:F0 | |
| "\N{LATIN SMALL LETTER ETH}" # latin-1:F0 | |
| "\N{LATIN SMALL LETTER G WITH BREVE}" # windows-1254:F0 | |
| "\N{LATIN SMALL LETTER O WITH ACUTE}" # latin-1:F3 | |
| "\N{LATIN SMALL LETTER S WITH CARON}" # windows-1257:F0 | |
| "\N{GREEK SMALL LETTER PI}" # windows-1253:F0 | |
| "\N{GREEK SMALL LETTER SIGMA}" # windows-1253:F3 | |
| "\N{CYRILLIC SMALL LETTER ER}" # windows-1251:F0 | |
| "\N{CYRILLIC SMALL LETTER U}" # windows-1251:F3 | |
| ), | |
| # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding, | |
| # including a space standing in for 0xA0 | |
| "utf8_continuation": ( | |
| "\x80-\xbf" | |
| "\N{SPACE}" # modification of latin-1:A0, NO-BREAK SPACE | |
| "\N{LATIN CAPITAL LETTER A WITH OGONEK}" # windows-1250:A5 | |
| "\N{LATIN CAPITAL LETTER AE}" # windows-1257:AF | |
| "\N{LATIN CAPITAL LETTER L WITH CARON}" # windows-1250:BC | |
| "\N{LATIN CAPITAL LETTER L WITH STROKE}" # windows-1250:A3 | |
| "\N{LATIN CAPITAL LETTER O WITH STROKE}" # windows-1257:A8 | |
| "\N{LATIN CAPITAL LETTER R WITH CEDILLA}" # windows-1257:AA | |
| "\N{LATIN CAPITAL LETTER S WITH ACUTE}" # windows-1250:8C | |
| "\N{LATIN CAPITAL LETTER S WITH CARON}" # windows-1252:8A | |
| "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" # windows-1250:AA | |
| "\N{LATIN CAPITAL LETTER T WITH CARON}" # windows-1250:8D | |
| "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}" # windows-1252:9F | |
| "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" # windows-1250:8F | |
| "\N{LATIN CAPITAL LETTER Z WITH CARON}" # windows-1252:8E | |
| "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" # windows-1250:AF | |
| "\N{LATIN CAPITAL LIGATURE OE}" # windows-1252:8C | |
| "\N{LATIN SMALL LETTER A WITH OGONEK}" # windows-1250:B9 | |
| "\N{LATIN SMALL LETTER AE}" # windows-1257:BF | |
| "\N{LATIN SMALL LETTER F WITH HOOK}" # windows-1252:83 | |
| "\N{LATIN SMALL LETTER L WITH CARON}" # windows-1250:BE | |
| "\N{LATIN SMALL LETTER L WITH STROKE}" # windows-1250:B3 | |
| "\N{LATIN SMALL LETTER O WITH STROKE}" # windows-1257:B8 | |
| "\N{LATIN SMALL LETTER R WITH CEDILLA}" # windows-1257:BA | |
| "\N{LATIN SMALL LETTER S WITH ACUTE}" # windows-1250:9C | |
| "\N{LATIN SMALL LETTER S WITH CARON}" # windows-1252:9A | |
| "\N{LATIN SMALL LETTER S WITH CEDILLA}" # windows-1250:BA | |
| "\N{LATIN SMALL LETTER T WITH CARON}" # windows-1250:9D | |
| "\N{LATIN SMALL LETTER Z WITH ACUTE}" # windows-1250:9F | |
| "\N{LATIN SMALL LETTER Z WITH CARON}" # windows-1252:9E | |
| "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}" # windows-1250:BF | |
| "\N{LATIN SMALL LIGATURE OE}" # windows-1252:9C | |
| "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # windows-1252:88 | |
| "\N{CARON}" # windows-1250:A1 | |
| "\N{BREVE}" # windows-1250:A2 | |
| "\N{OGONEK}" # windows-1250:B2 | |
| "\N{SMALL TILDE}" # windows-1252:98 | |
| "\N{DOUBLE ACUTE ACCENT}" # windows-1250:BD | |
| "\N{GREEK TONOS}" # windows-1253:B4 | |
| "\N{GREEK DIALYTIKA TONOS}" # windows-1253:A1 | |
| "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}" # windows-1253:A2 | |
| "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}" # windows-1253:B8 | |
| "\N{GREEK CAPITAL LETTER ETA WITH TONOS}" # windows-1253:B9 | |
| "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}" # windows-1253:BA | |
| "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}" # windows-1253:BC | |
| "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}" # windows-1253:BE | |
| "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}" # windows-1253:BF | |
| "\N{CYRILLIC CAPITAL LETTER IO}" # windows-1251:A8 | |
| "\N{CYRILLIC CAPITAL LETTER DJE}" # windows-1251:80 | |
| "\N{CYRILLIC CAPITAL LETTER GJE}" # windows-1251:81 | |
| "\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}" # windows-1251:AA | |
| "\N{CYRILLIC CAPITAL LETTER DZE}" # windows-1251:BD | |
| "\N{CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I}" # windows-1251:B2 | |
| "\N{CYRILLIC CAPITAL LETTER YI}" # windows-1251:AF | |
| "\N{CYRILLIC CAPITAL LETTER JE}" # windows-1251:A3 | |
| "\N{CYRILLIC CAPITAL LETTER LJE}" # windows-1251:8A | |
| "\N{CYRILLIC CAPITAL LETTER NJE}" # windows-1251:8C | |
| "\N{CYRILLIC CAPITAL LETTER TSHE}" # windows-1251:8E | |
| "\N{CYRILLIC CAPITAL LETTER KJE}" # windows-1251:8D | |
| "\N{CYRILLIC CAPITAL LETTER SHORT U}" # windows-1251:A1 | |
| "\N{CYRILLIC CAPITAL LETTER DZHE}" # windows-1251:8F | |
| "\N{CYRILLIC SMALL LETTER IO}" # windows-1251:B8 | |
| "\N{CYRILLIC SMALL LETTER DJE}" # windows-1251:90 | |
| "\N{CYRILLIC SMALL LETTER GJE}" # windows-1251:83 | |
| "\N{CYRILLIC SMALL LETTER UKRAINIAN IE}" # windows-1251:BA | |
| "\N{CYRILLIC SMALL LETTER DZE}" # windows-1251:BE | |
| "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}" # windows-1251:B3 | |
| "\N{CYRILLIC SMALL LETTER YI}" # windows-1251:BF | |
| "\N{CYRILLIC SMALL LETTER JE}" # windows-1251:BC | |
| "\N{CYRILLIC SMALL LETTER LJE}" # windows-1251:9A | |
| "\N{CYRILLIC SMALL LETTER NJE}" # windows-1251:9C | |
| "\N{CYRILLIC SMALL LETTER TSHE}" # windows-1251:9E | |
| "\N{CYRILLIC SMALL LETTER KJE}" # windows-1251:9D | |
| "\N{CYRILLIC SMALL LETTER SHORT U}" # windows-1251:A2 | |
| "\N{CYRILLIC SMALL LETTER DZHE}" # windows-1251:9F | |
| "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}" # windows-1251:A5 | |
| "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}" # windows-1251:B4 | |
| "\N{EN DASH}" # windows-1252:96 | |
| "\N{EM DASH}" # windows-1252:97 | |
| "\N{HORIZONTAL BAR}" # windows-1253:AF | |
| "\N{LEFT SINGLE QUOTATION MARK}" # windows-1252:91 | |
| "\N{RIGHT SINGLE QUOTATION MARK}" # windows-1252:92 | |
| "\N{SINGLE LOW-9 QUOTATION MARK}" # windows-1252:82 | |
| "\N{LEFT DOUBLE QUOTATION MARK}" # windows-1252:93 | |
| "\N{RIGHT DOUBLE QUOTATION MARK}" # windows-1252:94 | |
| "\N{DOUBLE LOW-9 QUOTATION MARK}" # windows-1252:84 | |
| "\N{DAGGER}" # windows-1252:86 | |
| "\N{DOUBLE DAGGER}" # windows-1252:87 | |
| "\N{BULLET}" # windows-1252:95 | |
| "\N{HORIZONTAL ELLIPSIS}" # windows-1252:85 | |
| "\N{PER MILLE SIGN}" # windows-1252:89 | |
| "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}" # windows-1252:8B | |
| "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}" # windows-1252:9B | |
| "\N{EURO SIGN}" # windows-1252:80 | |
| "\N{NUMERO SIGN}" # windows-1251:B9 | |
| "\N{TRADE MARK SIGN}" # windows-1252:99 | |
| ), | |
| # Letters that decode to 0x80 - 0xBF in a Latin-1-like encoding, | |
| # and don't usually stand for themselves when adjacent to mojibake. | |
| # This excludes spaces, dashes, 'bullet', quotation marks, and ellipses. | |
| "utf8_continuation_strict": ( | |
| "\x80-\xbf" | |
| "\N{LATIN CAPITAL LETTER A WITH OGONEK}" # windows-1250:A5 | |
| "\N{LATIN CAPITAL LETTER AE}" # windows-1257:AF | |
| "\N{LATIN CAPITAL LETTER L WITH CARON}" # windows-1250:BC | |
| "\N{LATIN CAPITAL LETTER L WITH STROKE}" # windows-1250:A3 | |
| "\N{LATIN CAPITAL LETTER O WITH STROKE}" # windows-1257:A8 | |
| "\N{LATIN CAPITAL LETTER R WITH CEDILLA}" # windows-1257:AA | |
| "\N{LATIN CAPITAL LETTER S WITH ACUTE}" # windows-1250:8C | |
| "\N{LATIN CAPITAL LETTER S WITH CARON}" # windows-1252:8A | |
| "\N{LATIN CAPITAL LETTER S WITH CEDILLA}" # windows-1250:AA | |
| "\N{LATIN CAPITAL LETTER T WITH CARON}" # windows-1250:8D | |
| "\N{LATIN CAPITAL LETTER Y WITH DIAERESIS}" # windows-1252:9F | |
| "\N{LATIN CAPITAL LETTER Z WITH ACUTE}" # windows-1250:8F | |
| "\N{LATIN CAPITAL LETTER Z WITH CARON}" # windows-1252:8E | |
| "\N{LATIN CAPITAL LETTER Z WITH DOT ABOVE}" # windows-1250:AF | |
| "\N{LATIN CAPITAL LIGATURE OE}" # windows-1252:8C | |
| "\N{LATIN SMALL LETTER A WITH OGONEK}" # windows-1250:B9 | |
| "\N{LATIN SMALL LETTER AE}" # windows-1257:BF | |
| "\N{LATIN SMALL LETTER F WITH HOOK}" # windows-1252:83 | |
| "\N{LATIN SMALL LETTER L WITH CARON}" # windows-1250:BE | |
| "\N{LATIN SMALL LETTER L WITH STROKE}" # windows-1250:B3 | |
| "\N{LATIN SMALL LETTER O WITH STROKE}" # windows-1257:B8 | |
| "\N{LATIN SMALL LETTER R WITH CEDILLA}" # windows-1257:BA | |
| "\N{LATIN SMALL LETTER S WITH ACUTE}" # windows-1250:9C | |
| "\N{LATIN SMALL LETTER S WITH CARON}" # windows-1252:9A | |
| "\N{LATIN SMALL LETTER S WITH CEDILLA}" # windows-1250:BA | |
| "\N{LATIN SMALL LETTER T WITH CARON}" # windows-1250:9D | |
| "\N{LATIN SMALL LETTER Z WITH ACUTE}" # windows-1250:9F | |
| "\N{LATIN SMALL LETTER Z WITH CARON}" # windows-1252:9E | |
| "\N{LATIN SMALL LETTER Z WITH DOT ABOVE}" # windows-1250:BF | |
| "\N{LATIN SMALL LIGATURE OE}" # windows-1252:9C | |
| "\N{MODIFIER LETTER CIRCUMFLEX ACCENT}" # windows-1252:88 | |
| "\N{CARON}" # windows-1250:A1 | |
| "\N{BREVE}" # windows-1250:A2 | |
| "\N{OGONEK}" # windows-1250:B2 | |
| "\N{SMALL TILDE}" # windows-1252:98 | |
| "\N{DOUBLE ACUTE ACCENT}" # windows-1250:BD | |
| "\N{GREEK TONOS}" # windows-1253:B4 | |
| "\N{GREEK DIALYTIKA TONOS}" # windows-1253:A1 | |
| "\N{GREEK CAPITAL LETTER ALPHA WITH TONOS}" # windows-1253:A2 | |
| "\N{GREEK CAPITAL LETTER EPSILON WITH TONOS}" # windows-1253:B8 | |
| "\N{GREEK CAPITAL LETTER ETA WITH TONOS}" # windows-1253:B9 | |
| "\N{GREEK CAPITAL LETTER IOTA WITH TONOS}" # windows-1253:BA | |
| "\N{GREEK CAPITAL LETTER OMICRON WITH TONOS}" # windows-1253:BC | |
| "\N{GREEK CAPITAL LETTER UPSILON WITH TONOS}" # windows-1253:BE | |
| "\N{GREEK CAPITAL LETTER OMEGA WITH TONOS}" # windows-1253:BF | |
| "\N{CYRILLIC CAPITAL LETTER IO}" # windows-1251:A8 | |
| "\N{CYRILLIC CAPITAL LETTER DJE}" # windows-1251:80 | |
| "\N{CYRILLIC CAPITAL LETTER GJE}" # windows-1251:81 | |
| "\N{CYRILLIC CAPITAL LETTER UKRAINIAN IE}" # windows-1251:AA | |
| "\N{CYRILLIC CAPITAL LETTER DZE}" # windows-1251:BD | |
| "\N{CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I}" # windows-1251:B2 | |
| "\N{CYRILLIC CAPITAL LETTER YI}" # windows-1251:AF | |
| "\N{CYRILLIC CAPITAL LETTER JE}" # windows-1251:A3 | |
| "\N{CYRILLIC CAPITAL LETTER LJE}" # windows-1251:8A | |
| "\N{CYRILLIC CAPITAL LETTER NJE}" # windows-1251:8C | |
| "\N{CYRILLIC CAPITAL LETTER TSHE}" # windows-1251:8E | |
| "\N{CYRILLIC CAPITAL LETTER KJE}" # windows-1251:8D | |
| "\N{CYRILLIC CAPITAL LETTER SHORT U}" # windows-1251:A1 | |
| "\N{CYRILLIC CAPITAL LETTER DZHE}" # windows-1251:8F | |
| "\N{CYRILLIC SMALL LETTER IO}" # windows-1251:B8 | |
| "\N{CYRILLIC SMALL LETTER DJE}" # windows-1251:90 | |
| "\N{CYRILLIC SMALL LETTER GJE}" # windows-1251:83 | |
| "\N{CYRILLIC SMALL LETTER UKRAINIAN IE}" # windows-1251:BA | |
| "\N{CYRILLIC SMALL LETTER DZE}" # windows-1251:BE | |
| "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}" # windows-1251:B3 | |
| "\N{CYRILLIC SMALL LETTER YI}" # windows-1251:BF | |
| "\N{CYRILLIC SMALL LETTER JE}" # windows-1251:BC | |
| "\N{CYRILLIC SMALL LETTER LJE}" # windows-1251:9A | |
| "\N{CYRILLIC SMALL LETTER NJE}" # windows-1251:9C | |
| "\N{CYRILLIC SMALL LETTER TSHE}" # windows-1251:9E | |
| "\N{CYRILLIC SMALL LETTER KJE}" # windows-1251:9D | |
| "\N{CYRILLIC SMALL LETTER SHORT U}" # windows-1251:A2 | |
| "\N{CYRILLIC SMALL LETTER DZHE}" # windows-1251:9F | |
| "\N{CYRILLIC CAPITAL LETTER GHE WITH UPTURN}" # windows-1251:A5 | |
| "\N{CYRILLIC SMALL LETTER GHE WITH UPTURN}" # windows-1251:B4 | |
| "\N{DAGGER}" # windows-1252:86 | |
| "\N{DOUBLE DAGGER}" # windows-1252:87 | |
| "\N{PER MILLE SIGN}" # windows-1252:89 | |
| "\N{SINGLE LEFT-POINTING ANGLE QUOTATION MARK}" # windows-1252:8B | |
| "\N{SINGLE RIGHT-POINTING ANGLE QUOTATION MARK}" # windows-1252:9B | |
| "\N{EURO SIGN}" # windows-1252:80 | |
| "\N{NUMERO SIGN}" # windows-1251:B9 | |
| "\N{TRADE MARK SIGN}" # windows-1252:99 | |
| ), | |
| } | |
| # This regex uses UTF8_CLUES to find sequences of likely mojibake. | |
| # It matches them with + so that several adjacent UTF-8-looking sequences | |
| # get coalesced into one, allowing them to be fixed more efficiently | |
| # and not requiring every individual subsequence to be detected as 'badness'. | |
| # | |
| # We accept spaces in place of "utf8_continuation", because spaces might have | |
| # been intended to be U+A0 NO-BREAK SPACE. | |
| # | |
| # We do a lookbehind to make sure the previous character isn't a | |
| # "utf8_continuation_strict" character, so that we don't fix just a few | |
| # characters in a huge garble and make the situation worse. | |
| # | |
| # Unfortunately, the matches to this regular expression won't show their | |
| # surrounding context, and including context would make the expression much | |
| # less efficient. The 'badness' rules that require context, such as a preceding | |
| # lowercase letter, will prevent some cases of inconsistent UTF-8 from being | |
| # fixed when they don't see it. | |
| UTF8_DETECTOR_RE = re.compile( | |
| """ | |
| (?<! [{utf8_continuation_strict}]) | |
| ( | |
| [{utf8_first_of_2}] [{utf8_continuation}] | |
| | | |
| [{utf8_first_of_3}] [{utf8_continuation}]{{2}} | |
| | | |
| [{utf8_first_of_4}] [{utf8_continuation}]{{3}} | |
| )+ | |
| """.format(**UTF8_CLUES), | |
| re.VERBOSE, | |
| ) | |
Xet Storage Details
- Size:
- 34.9 kB
- Xet hash:
- 2db534654507fbf3ced4c9eea57a9702aafbf4f27dafbf3d3a34a2babc6cdda8
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.