Home | History | Annotate | Download | only in bs4
      1 # -*- coding: utf-8 -*-
      2 """Beautiful Soup bonus library: Unicode, Dammit
      3 
      4 This library converts a bytestream to Unicode through any means
      5 necessary. It is heavily based on code from Mark Pilgrim's Universal
      6 Feed Parser. It works best on XML and HTML, but it does not rewrite the
      7 XML or HTML to reflect a new encoding; that's the tree builder's job.
      8 """
      9 
     10 import codecs
     11 from htmlentitydefs import codepoint2name
     12 import re
     13 import logging
     14 import string
     15 
     16 # Import a library to autodetect character encodings.
     17 chardet_type = None
     18 try:
     19     # First try the fast C implementation.
     20     #  PyPI package: cchardet
     21     import cchardet
     22     def chardet_dammit(s):
     23         return cchardet.detect(s)['encoding']
     24 except ImportError:
     25     try:
     26         # Fall back to the pure Python implementation
     27         #  Debian package: python-chardet
     28         #  PyPI package: chardet
     29         import chardet
     30         def chardet_dammit(s):
     31             return chardet.detect(s)['encoding']
     32         #import chardet.constants
     33         #chardet.constants._debug = 1
     34     except ImportError:
     35         # No chardet available.
     36         def chardet_dammit(s):
     37             return None
     38 
     39 # Available from http://cjkpython.i18n.org/.
     40 try:
     41     import iconv_codec
     42 except ImportError:
     43     pass
     44 
     45 xml_encoding_re = re.compile(
     46     '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
     47 html_meta_re = re.compile(
     48     '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
     49 
     50 class EntitySubstitution(object):
     51 
     52     """Substitute XML or HTML entities for the corresponding characters."""
     53 
     54     def _populate_class_variables():
     55         lookup = {}
     56         reverse_lookup = {}
     57         characters_for_re = []
     58         for codepoint, name in list(codepoint2name.items()):
     59             character = unichr(codepoint)
     60             if codepoint != 34:
     61                 # There's no point in turning the quotation mark into
     62                 # &quot;, unless it happens within an attribute value, which
     63                 # is handled elsewhere.
     64                 characters_for_re.append(character)
     65                 lookup[character] = name
     66             # But we do want to turn &quot; into the quotation mark.
     67             reverse_lookup[name] = character
     68         re_definition = "[%s]" % "".join(characters_for_re)
     69         return lookup, reverse_lookup, re.compile(re_definition)
     70     (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     71      CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
     72 
     73     CHARACTER_TO_XML_ENTITY = {
     74         "'": "apos",
     75         '"': "quot",
     76         "&": "amp",
     77         "<": "lt",
     78         ">": "gt",
     79         }
     80 
     81     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
     82                                            "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
     83                                            ")")
     84 
     85     AMPERSAND_OR_BRACKET = re.compile("([<>&])")
     86 
     87     @classmethod
     88     def _substitute_html_entity(cls, matchobj):
     89         entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
     90         return "&%s;" % entity
     91 
     92     @classmethod
     93     def _substitute_xml_entity(cls, matchobj):
     94         """Used with a regular expression to substitute the
     95         appropriate XML entity for an XML special character."""
     96         entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
     97         return "&%s;" % entity
     98 
     99     @classmethod
    100     def quoted_attribute_value(self, value):
    101         """Make a value into a quoted XML attribute, possibly escaping it.
    102 
    103          Most strings will be quoted using double quotes.
    104 
    105           Bob's Bar -> "Bob's Bar"
    106 
    107          If a string contains double quotes, it will be quoted using
    108          single quotes.
    109 
    110           Welcome to "my bar" -> 'Welcome to "my bar"'
    111 
    112          If a string contains both single and double quotes, the
    113          double quotes will be escaped, and the string will be quoted
    114          using double quotes.
    115 
    116           Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
    117         """
    118         quote_with = '"'
    119         if '"' in value:
    120             if "'" in value:
    121                 # The string contains both single and double
    122                 # quotes.  Turn the double quotes into
    123                 # entities. We quote the double quotes rather than
    124                 # the single quotes because the entity name is
    125                 # "&quot;" whether this is HTML or XML.  If we
    126                 # quoted the single quotes, we'd have to decide
    127                 # between &apos; and &squot;.
    128                 replace_with = "&quot;"
    129                 value = value.replace('"', replace_with)
    130             else:
    131                 # There are double quotes but no single quotes.
    132                 # We can use single quotes to quote the attribute.
    133                 quote_with = "'"
    134         return quote_with + value + quote_with
    135 
    136     @classmethod
    137     def substitute_xml(cls, value, make_quoted_attribute=False):
    138         """Substitute XML entities for special XML characters.
    139 
    140         :param value: A string to be substituted. The less-than sign
    141           will become &lt;, the greater-than sign will become &gt;,
    142           and any ampersands will become &amp;. If you want ampersands
    143           that appear to be part of an entity definition to be left
    144           alone, use substitute_xml_containing_entities() instead.
    145 
    146         :param make_quoted_attribute: If True, then the string will be
    147          quoted, as befits an attribute value.
    148         """
    149         # Escape angle brackets and ampersands.
    150         value = cls.AMPERSAND_OR_BRACKET.sub(
    151             cls._substitute_xml_entity, value)
    152 
    153         if make_quoted_attribute:
    154             value = cls.quoted_attribute_value(value)
    155         return value
    156 
    157     @classmethod
    158     def substitute_xml_containing_entities(
    159         cls, value, make_quoted_attribute=False):
    160         """Substitute XML entities for special XML characters.
    161 
    162         :param value: A string to be substituted. The less-than sign will
    163           become &lt;, the greater-than sign will become &gt;, and any
    164           ampersands that are not part of an entity defition will
    165           become &amp;.
    166 
    167         :param make_quoted_attribute: If True, then the string will be
    168          quoted, as befits an attribute value.
    169         """
    170         # Escape angle brackets, and ampersands that aren't part of
    171         # entities.
    172         value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
    173             cls._substitute_xml_entity, value)
    174 
    175         if make_quoted_attribute:
    176             value = cls.quoted_attribute_value(value)
    177         return value
    178 
    179     @classmethod
    180     def substitute_html(cls, s):
    181         """Replace certain Unicode characters with named HTML entities.
    182 
    183         This differs from data.encode(encoding, 'xmlcharrefreplace')
    184         in that the goal is to make the result more readable (to those
    185         with ASCII displays) rather than to recover from
    186         errors. There's absolutely nothing wrong with a UTF-8 string
    187         containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
    188         character with "&eacute;" will make it more readable to some
    189         people.
    190         """
    191         return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
    192             cls._substitute_html_entity, s)
    193 
    194 
    195 class EncodingDetector:
    196     """Suggests a number of possible encodings for a bytestring.
    197 
    198     Order of precedence:
    199 
    200     1. Encodings you specifically tell EncodingDetector to try first
    201     (the override_encodings argument to the constructor).
    202 
    203     2. An encoding declared within the bytestring itself, either in an
    204     XML declaration (if the bytestring is to be interpreted as an XML
    205     document), or in a <meta> tag (if the bytestring is to be
    206     interpreted as an HTML document.)
    207 
    208     3. An encoding detected through textual analysis by chardet,
    209     cchardet, or a similar external library.
    210 
    211     4. UTF-8.
    212 
    213     5. Windows-1252.
    214     """
    215     def __init__(self, markup, override_encodings=None, is_html=False):
    216         self.override_encodings = override_encodings or []
    217         self.chardet_encoding = None
    218         self.is_html = is_html
    219         self.declared_encoding = None
    220 
    221         # First order of business: strip a byte-order mark.
    222         self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
    223 
    224     def _usable(self, encoding, tried):
    225         if encoding is not None:
    226             encoding = encoding.lower()
    227             if encoding not in tried:
    228                 tried.add(encoding)
    229                 return True
    230         return False
    231 
    232     @property
    233     def encodings(self):
    234         """Yield a number of encodings that might work for this markup."""
    235         tried = set()
    236         for e in self.override_encodings:
    237             if self._usable(e, tried):
    238                 yield e
    239 
    240         # Did the document originally start with a byte-order mark
    241         # that indicated its encoding?
    242         if self._usable(self.sniffed_encoding, tried):
    243             yield self.sniffed_encoding
    244 
    245         # Look within the document for an XML or HTML encoding
    246         # declaration.
    247         if self.declared_encoding is None:
    248             self.declared_encoding = self.find_declared_encoding(
    249                 self.markup, self.is_html)
    250         if self._usable(self.declared_encoding, tried):
    251             yield self.declared_encoding
    252 
    253         # Use third-party character set detection to guess at the
    254         # encoding.
    255         if self.chardet_encoding is None:
    256             self.chardet_encoding = chardet_dammit(self.markup)
    257         if self._usable(self.chardet_encoding, tried):
    258             yield self.chardet_encoding
    259 
    260         # As a last-ditch effort, try utf-8 and windows-1252.
    261         for e in ('utf-8', 'windows-1252'):
    262             if self._usable(e, tried):
    263                 yield e
    264 
    265     @classmethod
    266     def strip_byte_order_mark(cls, data):
    267         """If a byte-order mark is present, strip it and return the encoding it implies."""
    268         encoding = None
    269         if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
    270                and (data[2:4] != '\x00\x00'):
    271             encoding = 'utf-16be'
    272             data = data[2:]
    273         elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
    274                  and (data[2:4] != '\x00\x00'):
    275             encoding = 'utf-16le'
    276             data = data[2:]
    277         elif data[:3] == b'\xef\xbb\xbf':
    278             encoding = 'utf-8'
    279             data = data[3:]
    280         elif data[:4] == b'\x00\x00\xfe\xff':
    281             encoding = 'utf-32be'
    282             data = data[4:]
    283         elif data[:4] == b'\xff\xfe\x00\x00':
    284             encoding = 'utf-32le'
    285             data = data[4:]
    286         return data, encoding
    287 
    288     @classmethod
    289     def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
    290         """Given a document, tries to find its declared encoding.
    291 
    292         An XML encoding is declared at the beginning of the document.
    293 
    294         An HTML encoding is declared in a <meta> tag, hopefully near the
    295         beginning of the document.
    296         """
    297         if search_entire_document:
    298             xml_endpos = html_endpos = len(markup)
    299         else:
    300             xml_endpos = 1024
    301             html_endpos = max(2048, int(len(markup) * 0.05))
    302             
    303         declared_encoding = None
    304         declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
    305         if not declared_encoding_match and is_html:
    306             declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
    307         if declared_encoding_match is not None:
    308             declared_encoding = declared_encoding_match.groups()[0].decode(
    309                 'ascii')
    310         if declared_encoding:
    311             return declared_encoding.lower()
    312         return None
    313 
    314 class UnicodeDammit:
    315     """A class for detecting the encoding of a *ML document and
    316     converting it to a Unicode string. If the source encoding is
    317     windows-1252, can replace MS smart quotes with their HTML or XML
    318     equivalents."""
    319 
    320     # This dictionary maps commonly seen values for "charset" in HTML
    321     # meta tags to the corresponding Python codec names. It only covers
    322     # values that aren't in Python's aliases and can't be determined
    323     # by the heuristics in find_codec.
    324     CHARSET_ALIASES = {"macintosh": "mac-roman",
    325                        "x-sjis": "shift-jis"}
    326 
    327     ENCODINGS_WITH_SMART_QUOTES = [
    328         "windows-1252",
    329         "iso-8859-1",
    330         "iso-8859-2",
    331         ]
    332 
    333     def __init__(self, markup, override_encodings=[],
    334                  smart_quotes_to=None, is_html=False):
    335         self.smart_quotes_to = smart_quotes_to
    336         self.tried_encodings = []
    337         self.contains_replacement_characters = False
    338         self.is_html = is_html
    339 
    340         self.detector = EncodingDetector(markup, override_encodings, is_html)
    341 
    342         # Short-circuit if the data is in Unicode to begin with.
    343         if isinstance(markup, unicode) or markup == '':
    344             self.markup = markup
    345             self.unicode_markup = unicode(markup)
    346             self.original_encoding = None
    347             return
    348 
    349         # The encoding detector may have stripped a byte-order mark.
    350         # Use the stripped markup from this point on.
    351         self.markup = self.detector.markup
    352 
    353         u = None
    354         for encoding in self.detector.encodings:
    355             markup = self.detector.markup
    356             u = self._convert_from(encoding)
    357             if u is not None:
    358                 break
    359 
    360         if not u:
    361             # None of the encodings worked. As an absolute last resort,
    362             # try them again with character replacement.
    363 
    364             for encoding in self.detector.encodings:
    365                 if encoding != "ascii":
    366                     u = self._convert_from(encoding, "replace")
    367                 if u is not None:
    368                     logging.warning(
    369                             "Some characters could not be decoded, and were "
    370                             "replaced with REPLACEMENT CHARACTER.")
    371                     self.contains_replacement_characters = True
    372                     break
    373 
    374         # If none of that worked, we could at this point force it to
    375         # ASCII, but that would destroy so much data that I think
    376         # giving up is better.
    377         self.unicode_markup = u
    378         if not u:
    379             self.original_encoding = None
    380 
    381     def _sub_ms_char(self, match):
    382         """Changes a MS smart quote character to an XML or HTML
    383         entity, or an ASCII character."""
    384         orig = match.group(1)
    385         if self.smart_quotes_to == 'ascii':
    386             sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
    387         else:
    388             sub = self.MS_CHARS.get(orig)
    389             if type(sub) == tuple:
    390                 if self.smart_quotes_to == 'xml':
    391                     sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
    392                 else:
    393                     sub = '&'.encode() + sub[0].encode() + ';'.encode()
    394             else:
    395                 sub = sub.encode()
    396         return sub
    397 
    398     def _convert_from(self, proposed, errors="strict"):
    399         proposed = self.find_codec(proposed)
    400         if not proposed or (proposed, errors) in self.tried_encodings:
    401             return None
    402         self.tried_encodings.append((proposed, errors))
    403         markup = self.markup
    404         # Convert smart quotes to HTML if coming from an encoding
    405         # that might have them.
    406         if (self.smart_quotes_to is not None
    407             and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
    408             smart_quotes_re = b"([\x80-\x9f])"
    409             smart_quotes_compiled = re.compile(smart_quotes_re)
    410             markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
    411 
    412         try:
    413             #print "Trying to convert document to %s (errors=%s)" % (
    414             #    proposed, errors)
    415             u = self._to_unicode(markup, proposed, errors)
    416             self.markup = u
    417             self.original_encoding = proposed
    418         except Exception as e:
    419             #print "That didn't work!"
    420             #print e
    421             return None
    422         #print "Correct encoding: %s" % proposed
    423         return self.markup
    424 
    425     def _to_unicode(self, data, encoding, errors="strict"):
    426         '''Given a string and its encoding, decodes the string into Unicode.
    427         %encoding is a string recognized by encodings.aliases'''
    428         return unicode(data, encoding, errors)
    429 
    430     @property
    431     def declared_html_encoding(self):
    432         if not self.is_html:
    433             return None
    434         return self.detector.declared_encoding
    435 
    436     def find_codec(self, charset):
    437         value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
    438                or (charset and self._codec(charset.replace("-", "")))
    439                or (charset and self._codec(charset.replace("-", "_")))
    440                or (charset and charset.lower())
    441                or charset
    442                 )
    443         if value:
    444             return value.lower()
    445         return None
    446 
    447     def _codec(self, charset):
    448         if not charset:
    449             return charset
    450         codec = None
    451         try:
    452             codecs.lookup(charset)
    453             codec = charset
    454         except (LookupError, ValueError):
    455             pass
    456         return codec
    457 
    458 
    459     # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    460     MS_CHARS = {b'\x80': ('euro', '20AC'),
    461                 b'\x81': ' ',
    462                 b'\x82': ('sbquo', '201A'),
    463                 b'\x83': ('fnof', '192'),
    464                 b'\x84': ('bdquo', '201E'),
    465                 b'\x85': ('hellip', '2026'),
    466                 b'\x86': ('dagger', '2020'),
    467                 b'\x87': ('Dagger', '2021'),
    468                 b'\x88': ('circ', '2C6'),
    469                 b'\x89': ('permil', '2030'),
    470                 b'\x8A': ('Scaron', '160'),
    471                 b'\x8B': ('lsaquo', '2039'),
    472                 b'\x8C': ('OElig', '152'),
    473                 b'\x8D': '?',
    474                 b'\x8E': ('#x17D', '17D'),
    475                 b'\x8F': '?',
    476                 b'\x90': '?',
    477                 b'\x91': ('lsquo', '2018'),
    478                 b'\x92': ('rsquo', '2019'),
    479                 b'\x93': ('ldquo', '201C'),
    480                 b'\x94': ('rdquo', '201D'),
    481                 b'\x95': ('bull', '2022'),
    482                 b'\x96': ('ndash', '2013'),
    483                 b'\x97': ('mdash', '2014'),
    484                 b'\x98': ('tilde', '2DC'),
    485                 b'\x99': ('trade', '2122'),
    486                 b'\x9a': ('scaron', '161'),
    487                 b'\x9b': ('rsaquo', '203A'),
    488                 b'\x9c': ('oelig', '153'),
    489                 b'\x9d': '?',
    490                 b'\x9e': ('#x17E', '17E'),
    491                 b'\x9f': ('Yuml', ''),}
    492 
    493     # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    494     # horrors like stripping diacritical marks to turn  into a, but also
    495     # contains non-horrors like turning  into ".
    496     MS_CHARS_TO_ASCII = {
    497         b'\x80' : 'EUR',
    498         b'\x81' : ' ',
    499         b'\x82' : ',',
    500         b'\x83' : 'f',
    501         b'\x84' : ',,',
    502         b'\x85' : '...',
    503         b'\x86' : '+',
    504         b'\x87' : '++',
    505         b'\x88' : '^',
    506         b'\x89' : '%',
    507         b'\x8a' : 'S',
    508         b'\x8b' : '<',
    509         b'\x8c' : 'OE',
    510         b'\x8d' : '?',
    511         b'\x8e' : 'Z',
    512         b'\x8f' : '?',
    513         b'\x90' : '?',
    514         b'\x91' : "'",
    515         b'\x92' : "'",
    516         b'\x93' : '"',
    517         b'\x94' : '"',
    518         b'\x95' : '*',
    519         b'\x96' : '-',
    520         b'\x97' : '--',
    521         b'\x98' : '~',
    522         b'\x99' : '(TM)',
    523         b'\x9a' : 's',
    524         b'\x9b' : '>',
    525         b'\x9c' : 'oe',
    526         b'\x9d' : '?',
    527         b'\x9e' : 'z',
    528         b'\x9f' : 'Y',
    529         b'\xa0' : ' ',
    530         b'\xa1' : '!',
    531         b'\xa2' : 'c',
    532         b'\xa3' : 'GBP',
    533         b'\xa4' : '$', #This approximation is especially parochial--this is the
    534                        #generic currency symbol.
    535         b'\xa5' : 'YEN',
    536         b'\xa6' : '|',
    537         b'\xa7' : 'S',
    538         b'\xa8' : '..',
    539         b'\xa9' : '',
    540         b'\xaa' : '(th)',
    541         b'\xab' : '<<',
    542         b'\xac' : '!',
    543         b'\xad' : ' ',
    544         b'\xae' : '(R)',
    545         b'\xaf' : '-',
    546         b'\xb0' : 'o',
    547         b'\xb1' : '+-',
    548         b'\xb2' : '2',
    549         b'\xb3' : '3',
    550         b'\xb4' : ("'", 'acute'),
    551         b'\xb5' : 'u',
    552         b'\xb6' : 'P',
    553         b'\xb7' : '*',
    554         b'\xb8' : ',',
    555         b'\xb9' : '1',
    556         b'\xba' : '(th)',
    557         b'\xbb' : '>>',
    558         b'\xbc' : '1/4',
    559         b'\xbd' : '1/2',
    560         b'\xbe' : '3/4',
    561         b'\xbf' : '?',
    562         b'\xc0' : 'A',
    563         b'\xc1' : 'A',
    564         b'\xc2' : 'A',
    565         b'\xc3' : 'A',
    566         b'\xc4' : 'A',
    567         b'\xc5' : 'A',
    568         b'\xc6' : 'AE',
    569         b'\xc7' : 'C',
    570         b'\xc8' : 'E',
    571         b'\xc9' : 'E',
    572         b'\xca' : 'E',
    573         b'\xcb' : 'E',
    574         b'\xcc' : 'I',
    575         b'\xcd' : 'I',
    576         b'\xce' : 'I',
    577         b'\xcf' : 'I',
    578         b'\xd0' : 'D',
    579         b'\xd1' : 'N',
    580         b'\xd2' : 'O',
    581         b'\xd3' : 'O',
    582         b'\xd4' : 'O',
    583         b'\xd5' : 'O',
    584         b'\xd6' : 'O',
    585         b'\xd7' : '*',
    586         b'\xd8' : 'O',
    587         b'\xd9' : 'U',
    588         b'\xda' : 'U',
    589         b'\xdb' : 'U',
    590         b'\xdc' : 'U',
    591         b'\xdd' : 'Y',
    592         b'\xde' : 'b',
    593         b'\xdf' : 'B',
    594         b'\xe0' : 'a',
    595         b'\xe1' : 'a',
    596         b'\xe2' : 'a',
    597         b'\xe3' : 'a',
    598         b'\xe4' : 'a',
    599         b'\xe5' : 'a',
    600         b'\xe6' : 'ae',
    601         b'\xe7' : 'c',
    602         b'\xe8' : 'e',
    603         b'\xe9' : 'e',
    604         b'\xea' : 'e',
    605         b'\xeb' : 'e',
    606         b'\xec' : 'i',
    607         b'\xed' : 'i',
    608         b'\xee' : 'i',
    609         b'\xef' : 'i',
    610         b'\xf0' : 'o',
    611         b'\xf1' : 'n',
    612         b'\xf2' : 'o',
    613         b'\xf3' : 'o',
    614         b'\xf4' : 'o',
    615         b'\xf5' : 'o',
    616         b'\xf6' : 'o',
    617         b'\xf7' : '/',
    618         b'\xf8' : 'o',
    619         b'\xf9' : 'u',
    620         b'\xfa' : 'u',
    621         b'\xfb' : 'u',
    622         b'\xfc' : 'u',
    623         b'\xfd' : 'y',
    624         b'\xfe' : 'b',
    625         b'\xff' : 'y',
    626         }
    627 
    628     # A map used when removing rogue Windows-1252/ISO-8859-1
    629     # characters in otherwise UTF-8 documents.
    630     #
    631     # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    632     # Windows-1252.
    633     WINDOWS_1252_TO_UTF8 = {
    634         0x80 : b'\xe2\x82\xac', # 
    635         0x82 : b'\xe2\x80\x9a', # 
    636         0x83 : b'\xc6\x92',     # 
    637         0x84 : b'\xe2\x80\x9e', # 
    638         0x85 : b'\xe2\x80\xa6', # 
    639         0x86 : b'\xe2\x80\xa0', # 
    640         0x87 : b'\xe2\x80\xa1', # 
    641         0x88 : b'\xcb\x86',     # 
    642         0x89 : b'\xe2\x80\xb0', # 
    643         0x8a : b'\xc5\xa0',     # 
    644         0x8b : b'\xe2\x80\xb9', # 
    645         0x8c : b'\xc5\x92',     # 
    646         0x8e : b'\xc5\xbd',     # 
    647         0x91 : b'\xe2\x80\x98', # 
    648         0x92 : b'\xe2\x80\x99', # 
    649         0x93 : b'\xe2\x80\x9c', # 
    650         0x94 : b'\xe2\x80\x9d', # 
    651         0x95 : b'\xe2\x80\xa2', # 
    652         0x96 : b'\xe2\x80\x93', # 
    653         0x97 : b'\xe2\x80\x94', # 
    654         0x98 : b'\xcb\x9c',     # 
    655         0x99 : b'\xe2\x84\xa2', # 
    656         0x9a : b'\xc5\xa1',     # 
    657         0x9b : b'\xe2\x80\xba', # 
    658         0x9c : b'\xc5\x93',     # 
    659         0x9e : b'\xc5\xbe',     # 
    660         0x9f : b'\xc5\xb8',     # 
    661         0xa0 : b'\xc2\xa0',     # 
    662         0xa1 : b'\xc2\xa1',     # 
    663         0xa2 : b'\xc2\xa2',     # 
    664         0xa3 : b'\xc2\xa3',     # 
    665         0xa4 : b'\xc2\xa4',     # 
    666         0xa5 : b'\xc2\xa5',     # 
    667         0xa6 : b'\xc2\xa6',     # 
    668         0xa7 : b'\xc2\xa7',     # 
    669         0xa8 : b'\xc2\xa8',     # 
    670         0xa9 : b'\xc2\xa9',     # 
    671         0xaa : b'\xc2\xaa',     # 
    672         0xab : b'\xc2\xab',     # 
    673         0xac : b'\xc2\xac',     # 
    674         0xad : b'\xc2\xad',     # 
    675         0xae : b'\xc2\xae',     # 
    676         0xaf : b'\xc2\xaf',     # 
    677         0xb0 : b'\xc2\xb0',     # 
    678         0xb1 : b'\xc2\xb1',     # 
    679         0xb2 : b'\xc2\xb2',     # 
    680         0xb3 : b'\xc2\xb3',     # 
    681         0xb4 : b'\xc2\xb4',     # 
    682         0xb5 : b'\xc2\xb5',     # 
    683         0xb6 : b'\xc2\xb6',     # 
    684         0xb7 : b'\xc2\xb7',     # 
    685         0xb8 : b'\xc2\xb8',     # 
    686         0xb9 : b'\xc2\xb9',     # 
    687         0xba : b'\xc2\xba',     # 
    688         0xbb : b'\xc2\xbb',     # 
    689         0xbc : b'\xc2\xbc',     # 
    690         0xbd : b'\xc2\xbd',     # 
    691         0xbe : b'\xc2\xbe',     # 
    692         0xbf : b'\xc2\xbf',     # 
    693         0xc0 : b'\xc3\x80',     # 
    694         0xc1 : b'\xc3\x81',     # 
    695         0xc2 : b'\xc3\x82',     # 
    696         0xc3 : b'\xc3\x83',     # 
    697         0xc4 : b'\xc3\x84',     # 
    698         0xc5 : b'\xc3\x85',     # 
    699         0xc6 : b'\xc3\x86',     # 
    700         0xc7 : b'\xc3\x87',     # 
    701         0xc8 : b'\xc3\x88',     # 
    702         0xc9 : b'\xc3\x89',     # 
    703         0xca : b'\xc3\x8a',     # 
    704         0xcb : b'\xc3\x8b',     # 
    705         0xcc : b'\xc3\x8c',     # 
    706         0xcd : b'\xc3\x8d',     # 
    707         0xce : b'\xc3\x8e',     # 
    708         0xcf : b'\xc3\x8f',     # 
    709         0xd0 : b'\xc3\x90',     # 
    710         0xd1 : b'\xc3\x91',     # 
    711         0xd2 : b'\xc3\x92',     # 
    712         0xd3 : b'\xc3\x93',     # 
    713         0xd4 : b'\xc3\x94',     # 
    714         0xd5 : b'\xc3\x95',     # 
    715         0xd6 : b'\xc3\x96',     # 
    716         0xd7 : b'\xc3\x97',     # 
    717         0xd8 : b'\xc3\x98',     # 
    718         0xd9 : b'\xc3\x99',     # 
    719         0xda : b'\xc3\x9a',     # 
    720         0xdb : b'\xc3\x9b',     # 
    721         0xdc : b'\xc3\x9c',     # 
    722         0xdd : b'\xc3\x9d',     # 
    723         0xde : b'\xc3\x9e',     # 
    724         0xdf : b'\xc3\x9f',     # 
    725         0xe0 : b'\xc3\xa0',     # 
    726         0xe1 : b'\xa1',     # 
    727         0xe2 : b'\xc3\xa2',     # 
    728         0xe3 : b'\xc3\xa3',     # 
    729         0xe4 : b'\xc3\xa4',     # 
    730         0xe5 : b'\xc3\xa5',     # 
    731         0xe6 : b'\xc3\xa6',     # 
    732         0xe7 : b'\xc3\xa7',     # 
    733         0xe8 : b'\xc3\xa8',     # 
    734         0xe9 : b'\xc3\xa9',     # 
    735         0xea : b'\xc3\xaa',     # 
    736         0xeb : b'\xc3\xab',     # 
    737         0xec : b'\xc3\xac',     # 
    738         0xed : b'\xc3\xad',     # 
    739         0xee : b'\xc3\xae',     # 
    740         0xef : b'\xc3\xaf',     # 
    741         0xf0 : b'\xc3\xb0',     # 
    742         0xf1 : b'\xc3\xb1',     # 
    743         0xf2 : b'\xc3\xb2',     # 
    744         0xf3 : b'\xc3\xb3',     # 
    745         0xf4 : b'\xc3\xb4',     # 
    746         0xf5 : b'\xc3\xb5',     # 
    747         0xf6 : b'\xc3\xb6',     # 
    748         0xf7 : b'\xc3\xb7',     # 
    749         0xf8 : b'\xc3\xb8',     # 
    750         0xf9 : b'\xc3\xb9',     # 
    751         0xfa : b'\xc3\xba',     # 
    752         0xfb : b'\xc3\xbb',     # 
    753         0xfc : b'\xc3\xbc',     # 
    754         0xfd : b'\xc3\xbd',     # 
    755         0xfe : b'\xc3\xbe',     # 
    756         }
    757 
    758     MULTIBYTE_MARKERS_AND_SIZES = [
    759         (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
    760         (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
    761         (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
    762         ]
    763 
    764     FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    765     LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
    766 
    767     @classmethod
    768     def detwingle(cls, in_bytes, main_encoding="utf8",
    769                   embedded_encoding="windows-1252"):
    770         """Fix characters from one encoding embedded in some other encoding.
    771 
    772         Currently the only situation supported is Windows-1252 (or its
    773         subset ISO-8859-1), embedded in UTF-8.
    774 
    775         The input must be a bytestring. If you've already converted
    776         the document to Unicode, you're too late.
    777 
    778         The output is a bytestring in which `embedded_encoding`
    779         characters have been converted to their `main_encoding`
    780         equivalents.
    781         """
    782         if embedded_encoding.replace('_', '-').lower() not in (
    783             'windows-1252', 'windows_1252'):
    784             raise NotImplementedError(
    785                 "Windows-1252 and ISO-8859-1 are the only currently supported "
    786                 "embedded encodings.")
    787 
    788         if main_encoding.lower() not in ('utf8', 'utf-8'):
    789             raise NotImplementedError(
    790                 "UTF-8 is the only currently supported main encoding.")
    791 
    792         byte_chunks = []
    793 
    794         chunk_start = 0
    795         pos = 0
    796         while pos < len(in_bytes):
    797             byte = in_bytes[pos]
    798             if not isinstance(byte, int):
    799                 # Python 2.x
    800                 byte = ord(byte)
    801             if (byte >= cls.FIRST_MULTIBYTE_MARKER
    802                 and byte <= cls.LAST_MULTIBYTE_MARKER):
    803                 # This is the start of a UTF-8 multibyte character. Skip
    804                 # to the end.
    805                 for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
    806                     if byte >= start and byte <= end:
    807                         pos += size
    808                         break
    809             elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
    810                 # We found a Windows-1252 character!
    811                 # Save the string up to this point as a chunk.
    812                 byte_chunks.append(in_bytes[chunk_start:pos])
    813 
    814                 # Now translate the Windows-1252 character into UTF-8
    815                 # and add it as another, one-byte chunk.
    816                 byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
    817                 pos += 1
    818                 chunk_start = pos
    819             else:
    820                 # Go on to the next character.
    821                 pos += 1
    822         if chunk_start == 0:
    823             # The string is unchanged.
    824             return in_bytes
    825         else:
    826             # Store the final chunk.
    827             byte_chunks.append(in_bytes[chunk_start:])
    828         return b''.join(byte_chunks)
    829 
    830