# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""

import codecs
try:
    # Python 2
    from htmlentitydefs import codepoint2name
except ImportError:
    # Python 3
    from html.entities import codepoint2name
import re
import logging
import string

# Private aliases so the module runs unchanged on Python 2 and Python 3.
try:
    _text_type = unicode
    _unichr = unichr
except NameError:
    _text_type = str
    _unichr = chr

# Import a library to autodetect character encodings.
chardet_type = None
try:
    # First try the fast C implementation.
    #  PyPI package: cchardet
    import cchardet
    def chardet_dammit(s):
        """Return cchardet's best guess at the encoding of bytestring `s`."""
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Fall back to the pure Python implementation
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet
        def chardet_dammit(s):
            """Return chardet's best guess at the encoding of bytestring `s`."""
            return chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            """No detection library is installed; always return None."""
            return None

# Available from http://cjkpython.i18n.org/.
try:
    import iconv_codec
except ImportError:
    pass

# Both patterns are compiled as *bytes* patterns because they are run
# against undecoded markup. Raw strings keep the regex escapes intact.
xml_encoding_re = re.compile(
    r'''^<\?.*encoding=['"](.*?)['"].*\?>'''.encode(), re.I)
html_meta_re = re.compile(
    r'''<\s*meta[^>]+charset\s*=\s*["']?([^>]*?)[ /;'">]'''.encode(), re.I)


class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the character<->entity lookup tables and matching regex.

        Called once, while the class body is executed. Returns a tuple
        (character -> entity name, entity name -> character, compiled
        regex matching any character that has a named HTML entity).
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in list(codepoint2name.items()):
            character = _unichr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # Matches an angle bracket, or an ampersand that does not already
    # start a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate named HTML entity for a matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)


class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False):
        """Remember the hints and strip any byte-order mark from `markup`.

        :param markup: The document, normally a bytestring.
        :param override_encodings: Encodings to suggest before any other.
        :param is_html: If True, <meta> tags are also searched for a
         declared encoding.
        """
        self.override_encodings = override_encodings or []
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Return True if `encoding` is set and has not been tried yet.

        `tried` is a set of lowercased names; it is updated in place.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :return: A 2-tuple (data without the BOM, encoding name or None).
        """
        encoding = None
        # A UTF-16 BOM followed by two NUL bytes is really a UTF-32 BOM,
        # so the comparison must be made against a *bytes* literal.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as a bytestring.
        :param is_html: If True, also look for a <meta> charset.
        :param search_entire_document: If True, search all of `markup`
         instead of only its beginning.
        :return: The lowercased declared encoding, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            # 'replace' keeps a garbage (non-ASCII) declaration from
            # raising; the bogus name is simply rejected later on.
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None


class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    # Matches any single byte in the range Windows-1252 uses for smart
    # quotes and friends.
    _SMART_QUOTES_RE = re.compile(b"([\x80-\x9f])")

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False):
        """Convert `markup` to Unicode.

        :param markup: A bytestring (or an already-decoded string).
        :param override_encodings: Encodings to try before any other.
        :param smart_quotes_to: None to leave Microsoft smart quotes
         alone, or one of 'ascii', 'xml', 'html' to replace them.
        :param is_html: If True, the markup is treated as HTML when
         looking for a declared encoding.

        Afterwards `unicode_markup` holds the decoded document (None if
        every encoding failed) and `original_encoding` the codec used.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(markup, override_encodings, is_html)

        # Short-circuit if the data is in Unicode to begin with, or if
        # there is nothing to decode.
        if isinstance(markup, _text_type) or len(markup) == 0:
            self.markup = markup
            if isinstance(markup, _text_type):
                self.unicode_markup = _text_type(markup)
            else:
                self.unicode_markup = _text_type()
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Test against None: an empty string is a *successful* decode
        # (for instance of a document consisting only of a BOM).
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if u is None:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character."""
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (HTML entity name, hexadecimal code point)
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using the `proposed` encoding.

        On success the decoded text is stored in self.markup, the codec
        name in self.original_encoding, and the text is returned. If
        the codec is unknown, was already tried with these `errors`, or
        decoding fails, None is returned.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            markup = self._SMART_QUOTES_RE.sub(self._sub_ms_char, markup)

        try:
            #print "Trying to convert document to %s (errors=%s)" % (
            #    proposed, errors)
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: an unknown codec, undecodable bytes
            # or an already-decoded string all just mean "try the next
            # encoding".
            #print "That didn't work!"
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return _text_type(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding declared inside the document, if it is HTML."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Map a charset name onto a (lowercased) Python codec name.

        Tries the known aliases, then the name with dashes removed,
        then with dashes turned into underscores, and finally falls
        back to the name itself. Returns None for an empty charset.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name, else None."""
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuples are (HTML entity name, hexadecimal code point).
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # LATIN CAPITAL LETTER Y WITH DIAERESIS is U+0178.
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (non-breaking space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        0xff : b'\xc3\xbf',     # ÿ
        }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :raise NotImplementedError: If either encoding is not one of
         the supported ones.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)