1 """Beautiful Soup 2 Elixir and Tonic 3 "The Screen-Scraper's Friend" 4 http://www.crummy.com/software/BeautifulSoup/ 5 6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a 7 tree representation. It provides methods and Pythonic idioms that make 8 it easy to navigate, search, and modify the tree. 9 10 A well-formed XML/HTML document yields a well-formed data 11 structure. An ill-formed XML/HTML document yields a correspondingly 12 ill-formed data structure. If your document is only locally 13 well-formed, you can use this library to find and process the 14 well-formed part of it. 15 16 Beautiful Soup works with Python 2.2 and up. It has no external 17 dependencies, but you'll have more success at converting data to UTF-8 18 if you also install these three packages: 19 20 * chardet, for auto-detecting character encodings 21 http://chardet.feedparser.org/ 22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported 23 by stock Python. 24 http://cjkpython.i18n.org/ 25 26 Beautiful Soup defines classes for two main parsing strategies: 27 28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific 29 language that kind of looks like XML. 30 31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid 32 or invalid. This class has web browser-like heuristics for 33 obtaining a sensible parse tree in the face of common HTML errors. 34 35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting 36 the encoding of an HTML or XML document, and converting it to 37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. 38 39 For more than you ever wanted to know about Beautiful Soup, see the 40 documentation: 41 http://www.crummy.com/software/BeautifulSoup/documentation.html 42 43 Here, have some legalese: 44 45 Copyright (c) 2004-2010, Leonard Richardson 46 47 All rights reserved. 
48 49 Redistribution and use in source and binary forms, with or without 50 modification, are permitted provided that the following conditions are 51 met: 52 53 * Redistributions of source code must retain the above copyright 54 notice, this list of conditions and the following disclaimer. 55 56 * Redistributions in binary form must reproduce the above 57 copyright notice, this list of conditions and the following 58 disclaimer in the documentation and/or other materials provided 59 with the distribution. 60 61 * Neither the name of the the Beautiful Soup Consortium and All 62 Night Kosher Bakery nor the names of its contributors may be 63 used to endorse or promote products derived from this software 64 without specific prior written permission. 65 66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. 
77 78 """ 79 from __future__ import generators 80 81 __author__ = "Leonard Richardson (leonardr (at] segfault.org)" 82 __version__ = "3.2.0" 83 __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson" 84 __license__ = "New-style BSD" 85 86 from sgmllib import SGMLParser, SGMLParseError 87 import codecs 88 import markupbase 89 import types 90 import re 91 import sgmllib 92 try: 93 from htmlentitydefs import name2codepoint 94 except ImportError: 95 name2codepoint = {} 96 try: 97 set 98 except NameError: 99 from sets import Set as set 100 101 #These hacks make Beautiful Soup able to parse XML with namespaces 102 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') 103 markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match 104 105 DEFAULT_OUTPUT_ENCODING = "utf-8" 106 107 def _match_css_class(str): 108 """Build a RE to match the given CSS class.""" 109 return re.compile(r"(^|.*\s)%s($|\s)" % str) 110 111 # First, the classes that represent markup elements. 112 113 class PageElement(object): 114 """Contains the navigational information for some part of the page 115 (either a tag or a piece of text)""" 116 117 def setup(self, parent=None, previous=None): 118 """Sets up the initial relations between this element and 119 other elements.""" 120 self.parent = parent 121 self.previous = previous 122 self.next = None 123 self.previousSibling = None 124 self.nextSibling = None 125 if self.parent and self.parent.contents: 126 self.previousSibling = self.parent.contents[-1] 127 self.previousSibling.nextSibling = self 128 129 def replaceWith(self, replaceWith): 130 oldParent = self.parent 131 myIndex = self.parent.index(self) 132 if hasattr(replaceWith, "parent")\ 133 and replaceWith.parent is self.parent: 134 # We're replacing this element with one of its siblings. 135 index = replaceWith.parent.index(replaceWith) 136 if index and index < myIndex: 137 # Furthermore, it comes before this element. 
That 138 # means that when we extract it, the index of this 139 # element will change. 140 myIndex = myIndex - 1 141 self.extract() 142 oldParent.insert(myIndex, replaceWith) 143 144 def replaceWithChildren(self): 145 myParent = self.parent 146 myIndex = self.parent.index(self) 147 self.extract() 148 reversedChildren = list(self.contents) 149 reversedChildren.reverse() 150 for child in reversedChildren: 151 myParent.insert(myIndex, child) 152 153 def extract(self): 154 """Destructively rips this element out of the tree.""" 155 if self.parent: 156 try: 157 del self.parent.contents[self.parent.index(self)] 158 except ValueError: 159 pass 160 161 #Find the two elements that would be next to each other if 162 #this element (and any children) hadn't been parsed. Connect 163 #the two. 164 lastChild = self._lastRecursiveChild() 165 nextElement = lastChild.next 166 167 if self.previous: 168 self.previous.next = nextElement 169 if nextElement: 170 nextElement.previous = self.previous 171 self.previous = None 172 lastChild.next = None 173 174 self.parent = None 175 if self.previousSibling: 176 self.previousSibling.nextSibling = self.nextSibling 177 if self.nextSibling: 178 self.nextSibling.previousSibling = self.previousSibling 179 self.previousSibling = self.nextSibling = None 180 return self 181 182 def _lastRecursiveChild(self): 183 "Finds the last element beneath this object to be parsed." 184 lastChild = self 185 while hasattr(lastChild, 'contents') and lastChild.contents: 186 lastChild = lastChild.contents[-1] 187 return lastChild 188 189 def insert(self, position, newChild): 190 if isinstance(newChild, basestring) \ 191 and not isinstance(newChild, NavigableString): 192 newChild = NavigableString(newChild) 193 194 position = min(position, len(self.contents)) 195 if hasattr(newChild, 'parent') and newChild.parent is not None: 196 # We're 'inserting' an element that's already one 197 # of this object's children. 
198 if newChild.parent is self: 199 index = self.index(newChild) 200 if index > position: 201 # Furthermore we're moving it further down the 202 # list of this object's children. That means that 203 # when we extract this element, our target index 204 # will jump down one. 205 position = position - 1 206 newChild.extract() 207 208 newChild.parent = self 209 previousChild = None 210 if position == 0: 211 newChild.previousSibling = None 212 newChild.previous = self 213 else: 214 previousChild = self.contents[position-1] 215 newChild.previousSibling = previousChild 216 newChild.previousSibling.nextSibling = newChild 217 newChild.previous = previousChild._lastRecursiveChild() 218 if newChild.previous: 219 newChild.previous.next = newChild 220 221 newChildsLastElement = newChild._lastRecursiveChild() 222 223 if position >= len(self.contents): 224 newChild.nextSibling = None 225 226 parent = self 227 parentsNextSibling = None 228 while not parentsNextSibling: 229 parentsNextSibling = parent.nextSibling 230 parent = parent.parent 231 if not parent: # This is the last element in the document. 
232 break 233 if parentsNextSibling: 234 newChildsLastElement.next = parentsNextSibling 235 else: 236 newChildsLastElement.next = None 237 else: 238 nextChild = self.contents[position] 239 newChild.nextSibling = nextChild 240 if newChild.nextSibling: 241 newChild.nextSibling.previousSibling = newChild 242 newChildsLastElement.next = nextChild 243 244 if newChildsLastElement.next: 245 newChildsLastElement.next.previous = newChildsLastElement 246 self.contents.insert(position, newChild) 247 248 def append(self, tag): 249 """Appends the given tag to the contents of this tag.""" 250 self.insert(len(self.contents), tag) 251 252 def findNext(self, name=None, attrs={}, text=None, **kwargs): 253 """Returns the first item that matches the given criteria and 254 appears after this Tag in the document.""" 255 return self._findOne(self.findAllNext, name, attrs, text, **kwargs) 256 257 def findAllNext(self, name=None, attrs={}, text=None, limit=None, 258 **kwargs): 259 """Returns all items that match the given criteria and appear 260 after this Tag in the document.""" 261 return self._findAll(name, attrs, text, limit, self.nextGenerator, 262 **kwargs) 263 264 def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): 265 """Returns the closest sibling to this Tag that matches the 266 given criteria and appears after this Tag in the document.""" 267 return self._findOne(self.findNextSiblings, name, attrs, text, 268 **kwargs) 269 270 def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, 271 **kwargs): 272 """Returns the siblings of this Tag that match the given 273 criteria and appear after this Tag in the document.""" 274 return self._findAll(name, attrs, text, limit, 275 self.nextSiblingGenerator, **kwargs) 276 fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x 277 278 def findPrevious(self, name=None, attrs={}, text=None, **kwargs): 279 """Returns the first item that matches the given criteria and 280 appears before this Tag in the 
document.""" 281 return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) 282 283 def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, 284 **kwargs): 285 """Returns all items that match the given criteria and appear 286 before this Tag in the document.""" 287 return self._findAll(name, attrs, text, limit, self.previousGenerator, 288 **kwargs) 289 fetchPrevious = findAllPrevious # Compatibility with pre-3.x 290 291 def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): 292 """Returns the closest sibling to this Tag that matches the 293 given criteria and appears before this Tag in the document.""" 294 return self._findOne(self.findPreviousSiblings, name, attrs, text, 295 **kwargs) 296 297 def findPreviousSiblings(self, name=None, attrs={}, text=None, 298 limit=None, **kwargs): 299 """Returns the siblings of this Tag that match the given 300 criteria and appear before this Tag in the document.""" 301 return self._findAll(name, attrs, text, limit, 302 self.previousSiblingGenerator, **kwargs) 303 fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x 304 305 def findParent(self, name=None, attrs={}, **kwargs): 306 """Returns the closest parent of this Tag that matches the given 307 criteria.""" 308 # NOTE: We can't use _findOne because findParents takes a different 309 # set of arguments. 310 r = None 311 l = self.findParents(name, attrs, 1) 312 if l: 313 r = l[0] 314 return r 315 316 def findParents(self, name=None, attrs={}, limit=None, **kwargs): 317 """Returns the parents of this Tag that match the given 318 criteria.""" 319 320 return self._findAll(name, attrs, None, limit, self.parentGenerator, 321 **kwargs) 322 fetchParents = findParents # Compatibility with pre-3.x 323 324 #These methods do the real heavy lifting. 
325 326 def _findOne(self, method, name, attrs, text, **kwargs): 327 r = None 328 l = method(name, attrs, text, 1, **kwargs) 329 if l: 330 r = l[0] 331 return r 332 333 def _findAll(self, name, attrs, text, limit, generator, **kwargs): 334 "Iterates over a generator looking for things that match." 335 336 if isinstance(name, SoupStrainer): 337 strainer = name 338 # (Possibly) special case some findAll*(...) searches 339 elif text is None and not limit and not attrs and not kwargs: 340 # findAll*(True) 341 if name is True: 342 return [element for element in generator() 343 if isinstance(element, Tag)] 344 # findAll*('tag-name') 345 elif isinstance(name, basestring): 346 return [element for element in generator() 347 if isinstance(element, Tag) and 348 element.name == name] 349 else: 350 strainer = SoupStrainer(name, attrs, text, **kwargs) 351 # Build a SoupStrainer 352 else: 353 strainer = SoupStrainer(name, attrs, text, **kwargs) 354 results = ResultSet(strainer) 355 g = generator() 356 while True: 357 try: 358 i = g.next() 359 except StopIteration: 360 break 361 if i: 362 found = strainer.search(i) 363 if found: 364 results.append(found) 365 if limit and len(results) >= limit: 366 break 367 return results 368 369 #These Generators can be used to navigate starting from both 370 #NavigableStrings and Tags. 
371 def nextGenerator(self): 372 i = self 373 while i is not None: 374 i = i.next 375 yield i 376 377 def nextSiblingGenerator(self): 378 i = self 379 while i is not None: 380 i = i.nextSibling 381 yield i 382 383 def previousGenerator(self): 384 i = self 385 while i is not None: 386 i = i.previous 387 yield i 388 389 def previousSiblingGenerator(self): 390 i = self 391 while i is not None: 392 i = i.previousSibling 393 yield i 394 395 def parentGenerator(self): 396 i = self 397 while i is not None: 398 i = i.parent 399 yield i 400 401 # Utility methods 402 def substituteEncoding(self, str, encoding=None): 403 encoding = encoding or "utf-8" 404 return str.replace("%SOUP-ENCODING%", encoding) 405 406 def toEncoding(self, s, encoding=None): 407 """Encodes an object to a string in some encoding, or to Unicode. 408 .""" 409 if isinstance(s, unicode): 410 if encoding: 411 s = s.encode(encoding) 412 elif isinstance(s, str): 413 if encoding: 414 s = s.encode(encoding) 415 else: 416 s = unicode(s) 417 else: 418 if encoding: 419 s = self.toEncoding(str(s), encoding) 420 else: 421 s = unicode(s) 422 return s 423 424 class NavigableString(unicode, PageElement): 425 426 def __new__(cls, value): 427 """Create a new NavigableString. 428 429 When unpickling a NavigableString, this method is called with 430 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 431 passed in to the superclass's __new__ or the superclass won't know 432 how to handle non-ASCII characters. 433 """ 434 if isinstance(value, unicode): 435 return unicode.__new__(cls, value) 436 return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 437 438 def __getnewargs__(self): 439 return (NavigableString.__str__(self),) 440 441 def __getattr__(self, attr): 442 """text.string gives you text. 
This is for backwards 443 compatibility for Navigable*String, but for CData* it lets you 444 get the string without the CData wrapper.""" 445 if attr == 'string': 446 return self 447 else: 448 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) 449 450 def __unicode__(self): 451 return str(self).decode(DEFAULT_OUTPUT_ENCODING) 452 453 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 454 if encoding: 455 return self.encode(encoding) 456 else: 457 return self 458 459 class CData(NavigableString): 460 461 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 462 return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding) 463 464 class ProcessingInstruction(NavigableString): 465 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 466 output = self 467 if "%SOUP-ENCODING%" in output: 468 output = self.substituteEncoding(output, encoding) 469 return "<?%s?>" % self.toEncoding(output, encoding) 470 471 class Comment(NavigableString): 472 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 473 return "<!--%s-->" % NavigableString.__str__(self, encoding) 474 475 class Declaration(NavigableString): 476 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 477 return "<!%s>" % NavigableString.__str__(self, encoding) 478 479 class Tag(PageElement): 480 481 """Represents a found HTML tag with its attributes and contents.""" 482 483 def _invert(h): 484 "Cheap function to invert a hash." 485 i = {} 486 for k,v in h.items(): 487 i[v] = k 488 return i 489 490 XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", 491 "quot" : '"', 492 "amp" : "&", 493 "lt" : "<", 494 "gt" : ">" } 495 496 XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) 497 498 def _convertEntities(self, match): 499 """Used in a call to re.sub to replace HTML, XML, and numeric 500 entities with the appropriate Unicode characters. 
If HTML 501 entities are being converted, any unrecognized entities are 502 escaped.""" 503 x = match.group(1) 504 if self.convertHTMLEntities and x in name2codepoint: 505 return unichr(name2codepoint[x]) 506 elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: 507 if self.convertXMLEntities: 508 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] 509 else: 510 return u'&%s;' % x 511 elif len(x) > 0 and x[0] == '#': 512 # Handle numeric entities 513 if len(x) > 1 and x[1] == 'x': 514 return unichr(int(x[2:], 16)) 515 else: 516 return unichr(int(x[1:])) 517 518 elif self.escapeUnrecognizedEntities: 519 return u'&%s;' % x 520 else: 521 return u'&%s;' % x 522 523 def __init__(self, parser, name, attrs=None, parent=None, 524 previous=None): 525 "Basic constructor." 526 527 # We don't actually store the parser object: that lets extracted 528 # chunks be garbage-collected 529 self.parserClass = parser.__class__ 530 self.isSelfClosing = parser.isSelfClosingTag(name) 531 self.name = name 532 if attrs is None: 533 attrs = [] 534 elif isinstance(attrs, dict): 535 attrs = attrs.items() 536 self.attrs = attrs 537 self.contents = [] 538 self.setup(parent, previous) 539 self.hidden = False 540 self.containsSubstitutions = False 541 self.convertHTMLEntities = parser.convertHTMLEntities 542 self.convertXMLEntities = parser.convertXMLEntities 543 self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities 544 545 # Convert any HTML, XML, or numeric entities in the attribute values. 
546 convert = lambda(k, val): (k, 547 re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", 548 self._convertEntities, 549 val)) 550 self.attrs = map(convert, self.attrs) 551 552 def getString(self): 553 if (len(self.contents) == 1 554 and isinstance(self.contents[0], NavigableString)): 555 return self.contents[0] 556 557 def setString(self, string): 558 """Replace the contents of the tag with a string""" 559 self.clear() 560 self.append(string) 561 562 string = property(getString, setString) 563 564 def getText(self, separator=u""): 565 if not len(self.contents): 566 return u"" 567 stopNode = self._lastRecursiveChild().next 568 strings = [] 569 current = self.contents[0] 570 while current is not stopNode: 571 if isinstance(current, NavigableString): 572 strings.append(current.strip()) 573 current = current.next 574 return separator.join(strings) 575 576 text = property(getText) 577 578 def get(self, key, default=None): 579 """Returns the value of the 'key' attribute for the tag, or 580 the value given for 'default' if it doesn't have that 581 attribute.""" 582 return self._getAttrMap().get(key, default) 583 584 def clear(self): 585 """Extract all children.""" 586 for child in self.contents[:]: 587 child.extract() 588 589 def index(self, element): 590 for i, child in enumerate(self.contents): 591 if child is element: 592 return i 593 raise ValueError("Tag.index: element not in tag") 594 595 def has_key(self, key): 596 return self._getAttrMap().has_key(key) 597 598 def __getitem__(self, key): 599 """tag[key] returns the value of the 'key' attribute for the tag, 600 and throws an exception if it's not there.""" 601 return self._getAttrMap()[key] 602 603 def __iter__(self): 604 "Iterating over a tag iterates over its contents." 605 return iter(self.contents) 606 607 def __len__(self): 608 "The length of a tag is the length of its list of contents." 
609 return len(self.contents) 610 611 def __contains__(self, x): 612 return x in self.contents 613 614 def __nonzero__(self): 615 "A tag is non-None even if it has no contents." 616 return True 617 618 def __setitem__(self, key, value): 619 """Setting tag[key] sets the value of the 'key' attribute for the 620 tag.""" 621 self._getAttrMap() 622 self.attrMap[key] = value 623 found = False 624 for i in range(0, len(self.attrs)): 625 if self.attrs[i][0] == key: 626 self.attrs[i] = (key, value) 627 found = True 628 if not found: 629 self.attrs.append((key, value)) 630 self._getAttrMap()[key] = value 631 632 def __delitem__(self, key): 633 "Deleting tag[key] deletes all 'key' attributes for the tag." 634 for item in self.attrs: 635 if item[0] == key: 636 self.attrs.remove(item) 637 #We don't break because bad HTML can define the same 638 #attribute multiple times. 639 self._getAttrMap() 640 if self.attrMap.has_key(key): 641 del self.attrMap[key] 642 643 def __call__(self, *args, **kwargs): 644 """Calling a tag like a function is the same as calling its 645 findAll() method. Eg. tag('a') returns a list of all the A tags 646 found within this tag.""" 647 return apply(self.findAll, args, kwargs) 648 649 def __getattr__(self, tag): 650 #print "Getattr %s.%s" % (self.__class__, tag) 651 if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: 652 return self.find(tag[:-3]) 653 elif tag.find('__') != 0: 654 return self.find(tag) 655 raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) 656 657 def __eq__(self, other): 658 """Returns true iff this tag has the same name, the same attributes, 659 and the same contents (recursively) as the given tag. 660 661 NOTE: right now this will return false if two tags have the 662 same attributes in a different order. 
Should this be fixed?""" 663 if other is self: 664 return True 665 if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): 666 return False 667 for i in range(0, len(self.contents)): 668 if self.contents[i] != other.contents[i]: 669 return False 670 return True 671 672 def __ne__(self, other): 673 """Returns true iff this tag is not identical to the other tag, 674 as defined in __eq__.""" 675 return not self == other 676 677 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): 678 """Renders this tag as a string.""" 679 return self.__str__(encoding) 680 681 def __unicode__(self): 682 return self.__str__(None) 683 684 BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" 685 + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" 686 + ")") 687 688 def _sub_entity(self, x): 689 """Used with a regular expression to substitute the 690 appropriate XML entity for an XML special character.""" 691 return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" 692 693 def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, 694 prettyPrint=False, indentLevel=0): 695 """Returns a string or Unicode representation of this tag and 696 its contents. To get Unicode, pass None for encoding. 697 698 NOTE: since Python's HTML parser consumes whitespace, this 699 method is not certain to reproduce the whitespace present in 700 the original string.""" 701 702 encodedName = self.toEncoding(self.name, encoding) 703 704 attrs = [] 705 if self.attrs: 706 for key, val in self.attrs: 707 fmt = '%s="%s"' 708 if isinstance(val, basestring): 709 if self.containsSubstitutions and '%SOUP-ENCODING%' in val: 710 val = self.substituteEncoding(val, encoding) 711 712 # The attribute value either: 713 # 714 # * Contains no embedded double quotes or single quotes. 715 # No problem: we enclose it in double quotes. 716 # * Contains embedded single quotes. No problem: 717 # double quotes work here too. 
718 # * Contains embedded double quotes. No problem: 719 # we enclose it in single quotes. 720 # * Embeds both single _and_ double quotes. This 721 # can't happen naturally, but it can happen if 722 # you modify an attribute value after parsing 723 # the document. Now we have a bit of a 724 # problem. We solve it by enclosing the 725 # attribute in single quotes, and escaping any 726 # embedded single quotes to XML entities. 727 if '"' in val: 728 fmt = "%s='%s'" 729 if "'" in val: 730 # TODO: replace with apos when 731 # appropriate. 732 val = val.replace("'", "&squot;") 733 734 # Now we're okay w/r/t quotes. But the attribute 735 # value might also contain angle brackets, or 736 # ampersands that aren't part of entities. We need 737 # to escape those to XML entities too. 738 val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) 739 740 attrs.append(fmt % (self.toEncoding(key, encoding), 741 self.toEncoding(val, encoding))) 742 close = '' 743 closeTag = '' 744 if self.isSelfClosing: 745 close = ' /' 746 else: 747 closeTag = '</%s>' % encodedName 748 749 indentTag, indentContents = 0, 0 750 if prettyPrint: 751 indentTag = indentLevel 752 space = (' ' * (indentTag-1)) 753 indentContents = indentTag + 1 754 contents = self.renderContents(encoding, prettyPrint, indentContents) 755 if self.hidden: 756 s = contents 757 else: 758 s = [] 759 attributeString = '' 760 if attrs: 761 attributeString = ' ' + ' '.join(attrs) 762 if prettyPrint: 763 s.append(space) 764 s.append('<%s%s%s>' % (encodedName, attributeString, close)) 765 if prettyPrint: 766 s.append("\n") 767 s.append(contents) 768 if prettyPrint and contents and contents[-1] != "\n": 769 s.append("\n") 770 if prettyPrint and closeTag: 771 s.append(space) 772 s.append(closeTag) 773 if prettyPrint and closeTag and self.nextSibling: 774 s.append("\n") 775 s = ''.join(s) 776 return s 777 778 def decompose(self): 779 """Recursively destroys the contents of this tree.""" 780 self.extract() 781 if 
len(self.contents) == 0: 782 return 783 current = self.contents[0] 784 while current is not None: 785 next = current.next 786 if isinstance(current, Tag): 787 del current.contents[:] 788 current.parent = None 789 current.previous = None 790 current.previousSibling = None 791 current.next = None 792 current.nextSibling = None 793 current = next 794 795 def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): 796 return self.__str__(encoding, True) 797 798 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 799 prettyPrint=False, indentLevel=0): 800 """Renders the contents of this tag as a string in the given 801 encoding. If encoding is None, returns a Unicode string..""" 802 s=[] 803 for c in self: 804 text = None 805 if isinstance(c, NavigableString): 806 text = c.__str__(encoding) 807 elif isinstance(c, Tag): 808 s.append(c.__str__(encoding, prettyPrint, indentLevel)) 809 if text and prettyPrint: 810 text = text.strip() 811 if text: 812 if prettyPrint: 813 s.append(" " * (indentLevel-1)) 814 s.append(text) 815 if prettyPrint: 816 s.append("\n") 817 return ''.join(s) 818 819 #Soup methods 820 821 def find(self, name=None, attrs={}, recursive=True, text=None, 822 **kwargs): 823 """Return only the first child of this Tag matching the given 824 criteria.""" 825 r = None 826 l = self.findAll(name, attrs, recursive, text, 1, **kwargs) 827 if l: 828 r = l[0] 829 return r 830 findChild = find 831 832 def findAll(self, name=None, attrs={}, recursive=True, text=None, 833 limit=None, **kwargs): 834 """Extracts a list of Tag objects that match the given 835 criteria. You can specify the name of the Tag and any 836 attributes you want the Tag to have. 837 838 The value of a key-value pair in the 'attrs' map can be a 839 string, a list of strings, a regular expression object, or a 840 callable that takes a string and returns whether or not the 841 string matches for some custom definition of 'matches'. 
The 842 same is true of the tag name.""" 843 generator = self.recursiveChildGenerator 844 if not recursive: 845 generator = self.childGenerator 846 return self._findAll(name, attrs, text, limit, generator, **kwargs) 847 findChildren = findAll 848 849 # Pre-3.x compatibility methods 850 first = find 851 fetch = findAll 852 853 def fetchText(self, text=None, recursive=True, limit=None): 854 return self.findAll(text=text, recursive=recursive, limit=limit) 855 856 def firstText(self, text=None, recursive=True): 857 return self.find(text=text, recursive=recursive) 858 859 #Private methods 860 861 def _getAttrMap(self): 862 """Initializes a map representation of this tag's attributes, 863 if not already initialized.""" 864 if not getattr(self, 'attrMap'): 865 self.attrMap = {} 866 for (key, value) in self.attrs: 867 self.attrMap[key] = value 868 return self.attrMap 869 870 #Generator methods 871 def childGenerator(self): 872 # Just use the iterator from the contents 873 return iter(self.contents) 874 875 def recursiveChildGenerator(self): 876 if not len(self.contents): 877 raise StopIteration 878 stopNode = self._lastRecursiveChild().next 879 current = self.contents[0] 880 while current is not stopNode: 881 yield current 882 current = current.next 883 884 885 # Next, a couple classes to represent queries and their results. 
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    A strainer holds up to three criteria: a tag ``name``, a map of
    attribute criteria (``attrs``) and a ``text`` criterion. Each
    criterion may be ``True``, a callable, a regexp-like object (anything
    with a ``match`` attribute), an iterable of acceptable values, or a
    plain value compared for equality; see _matches."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Store the match criteria.

        If ``attrs`` is a string it is treated as a CSS class to match.
        Keyword arguments are merged into (a copy of) ``attrs``."""
        self.name = name
        if isinstance(attrs, basestring):
            kwargs['class'] = _match_css_class(attrs)
            attrs = None
        if kwargs:
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        elif not attrs:
            # Never store None (searchTag iterates self.attrs.items())
            # and never store the shared default-argument dict.
            attrs = {}
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Match a tag against this strainer's name and attribute criteria.

        ``markupName`` is either a Tag object or a tag name; in the
        latter case ``markupAttrs`` supplies the attributes, either as an
        object with a ``get`` method or as a sequence of (key, value)
        pairs. Returns the Tag (or the name that was passed in) on a
        match, and None otherwise."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            markupAttrs = markup
        # A callable name criterion is handed the raw (name, attrs) data
        # only when we were not given a Tag object to begin with.
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if markupAttrMap is None:
                        # Build the attribute lookup lazily, the first
                        # time an attribute criterion needs it.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k, v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Match a Tag, a string, or an iterable of elements.

        Returns the matching element, or None. Raises Exception when
        ``markup`` is of a kind this method does not handle."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, "__iter__") \
               and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, basestring):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception("I don't know how to match against a %s"
                            % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        """Test one piece of markup (a Tag, a string or None) against one
        criterion and return a true value if it matches."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isinstance(markup, basestring):
                markup = unicode(markup)
            #Now we know that markup is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif hasattr(matchAgainst, '__iter__'): # list-like
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # Map-like object that is not iterable: ask the map
                # whether it has the markup as a key. (markup is a
                # string or None here, so it cannot be the receiver.)
                result = matchAgainst.has_key(markup)
            elif matchAgainst and isinstance(markup, basestring):
                # Coerce the criterion to the same string type as the
                # markup so the equality test below is meaningful.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result

class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        # Initialize this list itself, not a throw-away temporary.
        list.__init__(self)
        self.source = source

# Now, some helper functions.

def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps.

    The first argument is the value given to every key that comes from
    a list or a scalar; keys that come from a map keep the map's own
    value. Later portions override earlier ones."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            #It's a map. Merge it.
            for k, v in portion.items():
                built[k] = v
        elif hasattr(portion, '__iter__'): # is a list
            #It's a list. Map each item to the default.
            for k in portion:
                built[k] = default
        else:
            #It's a scalar. Map it to the default.
            built[portion] = default
    return built

# Now, the parser classes.

class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    MARKUP_MASSAGE = [(re.compile(r'(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile(r'<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    ROOT_TAG_NAME = u'[document]'

    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter. Start from "convert nothing" so that the three
        # flags are always defined, even if convertEntities is a true
        # value that is not one of the *_ENTITIES constants.
        self.convertXMLEntities = False
        self.convertHTMLEntities = False
        self.escapeUnrecognizedEntities = False
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True

        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            # Raised by a nested re-parse (see BeautifulSoup.start_meta)
            # to abandon the outer, now-obsolete pass.
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser.

        Returns None unless ``name`` is a decimal number in the ASCII
        range 0-127."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Convert self.markup to Unicode, apply the markup massage,
        reset the parser state and parse the whole document."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # Any true non-iterable value (e.g. True) selects the
                # default massage rules.
                if not hasattr(self.markupMassage, "__iter__"):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError(methodName)

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return name in self.SELF_CLOSING_TAGS \
               or name in self.instanceSelfClosingTags

    def reset(self):
        """Re-initialize this object as an empty, hidden root tag and
        clear all parser state."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pop the innermost open tag off the tag stack and return the
        tag that is current afterwards."""
        self.tagStack.pop()

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append ``tag`` to the current tag's contents (if there is a
        current tag) and make it the new current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flush the buffered character data into the tree as a single
        ``containerClass`` object.

        Text made up entirely of ASCII whitespace is collapsed to a
        single newline or space, unless one of the open tags is listed
        in PRESERVE_WHITESPACE_TAGS."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing selectively, top-level text is kept only if
            # the strainer has a text criterion and the text matches it.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None
        # Search from the top of the stack down; index 0 (the root) is
        # never a candidate.
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        # An empty trigger list still marks the tag as nestable, so test
        # against None rather than for truth.
        isNestable = nestingResetTriggers is not None
        isResetNesting = name in self.RESET_NESTING_TAGS
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and p.name in self.RESET_NESTING_TAGS):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handle a start tag: create a Tag, link it into the tree and
        push it on the tag stack.

        Returns the new Tag, or None when the tag was treated as text
        (inside a quote tag) or rejected by parseOnlyThese."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """Handle an end tag by popping the tag stack down to the
        matching start tag. Inside a quote tag, any other end tag is
        passed through as text."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """Buffer a chunk of character data until the next endData()."""
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            try:
                data = unichr(int(ref))
            except (ValueError, OverflowError):
                # Not a number, or not a code point unichr() accepts:
                # keep the reference as literal text rather than abort
                # the whole parse.
                data = '&#%s;' % ref
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
            data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
            # TODO: We've got a problem here. We're told this is
            # an entity reference, but it's not an XML entity
            # reference or an HTML entity reference. Nonetheless,
            # the logical thing to do is to pass it through as an
            # unrecognized entity reference.
            #
            # Except: when the input is "&carol;" this function
            # will be called with input "carol". When the input is
            # "AT&T", this function will be called with input
            # "T". We have no way of knowing whether a semicolon
            # was present originally, so we don't know whether
            # this is an unknown entity or just a misplaced
            # ampersand.
            #
            # The more common case is a misplaced ampersand, so I
            # escape the ampersand and omit the trailing semicolon.
            data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object.

        Returns the index in self.rawdata at which parsing should
        resume."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                # Unterminated section: take everything that is left.
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j

class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Same arguments as BeautifulStoneSoup, except that smartQuotesTo
        defaults to HTML_ENTITIES and isHTML is always True."""
        if 'smartQuotesTo' not in kwargs:
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        # The nested _feed call above has parsed the
                        # whole document; abandon this outer pass.
                        raise StopParsing
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True

class StopParsing(Exception):
    """Raised by BeautifulSoup.start_meta to abandon the current parse
    after the document has been re-parsed with a new encoding; caught in
    BeautifulStoneSoup.__init__."""
    pass

class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)

class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # The first argument of buildTagMap is the default value, not a
    # tag name; passing only 'noscript' used to build an empty map.
    RESET_NESTING_TAGS = buildTagMap(None, 'noscript')
    NESTABLE_TAGS = {}

class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Before popping, copy a tag's single string child onto its
        parent as an attribute named after the tag, unless the parent
        already has an attribute of that name. Returns what
        BeautifulStoneSoup.popTag returns."""
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                tag.name not in parent.attrMap):
                parent[tag.name] = tag.contents[0]
        # Propagate the new current tag, as the base class does, so
        # that _popToTag can report the most recent tag.
        return BeautifulStoneSoup.popTag(self)

#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
#"RobustParser.py" (or, in cases of extreme enterprisiness,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    pass
class RobustHTMLParser(BeautifulSoup):
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    pass

######################################################
#
# Bonus library: Unicode, Dammit
#
# This class forces XML data into a standard format (usually to UTF-8
# or Unicode). It is heavily based on code from Mark Pilgrim's
# Universal Feed Parser. It does not rewrite the XML or HTML to
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
# (XML) and BeautifulSoup.start_meta (HTML).

# Autodetects character encodings.
1733 # Download from http://chardet.feedparser.org/ 1734 try: 1735 import chardet 1736 # import chardet.constants 1737 # chardet.constants._debug = 1 1738 except ImportError: 1739 chardet = None 1740 1741 # cjkcodecs and iconv_codec make Python know about more character encodings. 1742 # Both are available from http://cjkpython.i18n.org/ 1743 # They're built in if you use Python 2.4. 1744 try: 1745 import cjkcodecs.aliases 1746 except ImportError: 1747 pass 1748 try: 1749 import iconv_codec 1750 except ImportError: 1751 pass 1752 1753 class UnicodeDammit: 1754 """A class for detecting the encoding of a *ML document and 1755 converting it to a Unicode string. If the source encoding is 1756 windows-1252, can replace MS smart quotes with their HTML or XML 1757 equivalents.""" 1758 1759 # This dictionary maps commonly seen values for "charset" in HTML 1760 # meta tags to the corresponding Python codec names. It only covers 1761 # values that aren't in Python's aliases and can't be determined 1762 # by the heuristics in find_codec. 
1763 CHARSET_ALIASES = { "macintosh" : "mac-roman", 1764 "x-sjis" : "shift-jis" } 1765 1766 def __init__(self, markup, overrideEncodings=[], 1767 smartQuotesTo='xml', isHTML=False): 1768 self.declaredHTMLEncoding = None 1769 self.markup, documentEncoding, sniffedEncoding = \ 1770 self._detectEncoding(markup, isHTML) 1771 self.smartQuotesTo = smartQuotesTo 1772 self.triedEncodings = [] 1773 if markup == '' or isinstance(markup, unicode): 1774 self.originalEncoding = None 1775 self.unicode = unicode(markup) 1776 return 1777 1778 u = None 1779 for proposedEncoding in overrideEncodings: 1780 u = self._convertFrom(proposedEncoding) 1781 if u: break 1782 if not u: 1783 for proposedEncoding in (documentEncoding, sniffedEncoding): 1784 u = self._convertFrom(proposedEncoding) 1785 if u: break 1786 1787 # If no luck and we have auto-detection library, try that: 1788 if not u and chardet and not isinstance(self.markup, unicode): 1789 u = self._convertFrom(chardet.detect(self.markup)['encoding']) 1790 1791 # As a last resort, try utf-8 and windows-1252: 1792 if not u: 1793 for proposed_encoding in ("utf-8", "windows-1252"): 1794 u = self._convertFrom(proposed_encoding) 1795 if u: break 1796 1797 self.unicode = u 1798 if not u: self.originalEncoding = None 1799 1800 def _subMSChar(self, orig): 1801 """Changes a MS smart quote character to an XML or HTML 1802 entity.""" 1803 sub = self.MS_CHARS.get(orig) 1804 if isinstance(sub, tuple): 1805 if self.smartQuotesTo == 'xml': 1806 sub = '&#x%s;' % sub[1] 1807 else: 1808 sub = '&%s;' % sub[0] 1809 return sub 1810 1811 def _convertFrom(self, proposed): 1812 proposed = self.find_codec(proposed) 1813 if not proposed or proposed in self.triedEncodings: 1814 return None 1815 self.triedEncodings.append(proposed) 1816 markup = self.markup 1817 1818 # Convert smart quotes to HTML if coming from an encoding 1819 # that might have them. 
1820 if self.smartQuotesTo and proposed.lower() in("windows-1252", 1821 "iso-8859-1", 1822 "iso-8859-2"): 1823 markup = re.compile("([\x80-\x9f])").sub \ 1824 (lambda(x): self._subMSChar(x.group(1)), 1825 markup) 1826 1827 try: 1828 # print "Trying to convert document to %s" % proposed 1829 u = self._toUnicode(markup, proposed) 1830 self.markup = u 1831 self.originalEncoding = proposed 1832 except Exception, e: 1833 # print "That didn't work!" 1834 # print e 1835 return None 1836 #print "Correct encoding: %s" % proposed 1837 return self.markup 1838 1839 def _toUnicode(self, data, encoding): 1840 '''Given a string and its encoding, decodes the string into Unicode. 1841 %encoding is a string recognized by encodings.aliases''' 1842 1843 # strip Byte Order Mark (if present) 1844 if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ 1845 and (data[2:4] != '\x00\x00'): 1846 encoding = 'utf-16be' 1847 data = data[2:] 1848 elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ 1849 and (data[2:4] != '\x00\x00'): 1850 encoding = 'utf-16le' 1851 data = data[2:] 1852 elif data[:3] == '\xef\xbb\xbf': 1853 encoding = 'utf-8' 1854 data = data[3:] 1855 elif data[:4] == '\x00\x00\xfe\xff': 1856 encoding = 'utf-32be' 1857 data = data[4:] 1858 elif data[:4] == '\xff\xfe\x00\x00': 1859 encoding = 'utf-32le' 1860 data = data[4:] 1861 newdata = unicode(data, encoding) 1862 return newdata 1863 1864 def _detectEncoding(self, xml_data, isHTML=False): 1865 """Given a document, tries to detect its XML encoding.""" 1866 xml_encoding = sniffed_xml_encoding = None 1867 try: 1868 if xml_data[:4] == '\x4c\x6f\xa7\x94': 1869 # EBCDIC 1870 xml_data = self._ebcdic_to_ascii(xml_data) 1871 elif xml_data[:4] == '\x00\x3c\x00\x3f': 1872 # UTF-16BE 1873 sniffed_xml_encoding = 'utf-16be' 1874 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') 1875 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ 1876 and (xml_data[2:4] != '\x00\x00'): 1877 # UTF-16BE with BOM 1878 sniffed_xml_encoding = 
'utf-16be' 1879 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') 1880 elif xml_data[:4] == '\x3c\x00\x3f\x00': 1881 # UTF-16LE 1882 sniffed_xml_encoding = 'utf-16le' 1883 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') 1884 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ 1885 (xml_data[2:4] != '\x00\x00'): 1886 # UTF-16LE with BOM 1887 sniffed_xml_encoding = 'utf-16le' 1888 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') 1889 elif xml_data[:4] == '\x00\x00\x00\x3c': 1890 # UTF-32BE 1891 sniffed_xml_encoding = 'utf-32be' 1892 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') 1893 elif xml_data[:4] == '\x3c\x00\x00\x00': 1894 # UTF-32LE 1895 sniffed_xml_encoding = 'utf-32le' 1896 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') 1897 elif xml_data[:4] == '\x00\x00\xfe\xff': 1898 # UTF-32BE with BOM 1899 sniffed_xml_encoding = 'utf-32be' 1900 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') 1901 elif xml_data[:4] == '\xff\xfe\x00\x00': 1902 # UTF-32LE with BOM 1903 sniffed_xml_encoding = 'utf-32le' 1904 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') 1905 elif xml_data[:3] == '\xef\xbb\xbf': 1906 # UTF-8 with BOM 1907 sniffed_xml_encoding = 'utf-8' 1908 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') 1909 else: 1910 sniffed_xml_encoding = 'ascii' 1911 pass 1912 except: 1913 xml_encoding_match = None 1914 xml_encoding_match = re.compile( 1915 '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) 1916 if not xml_encoding_match and isHTML: 1917 regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I) 1918 xml_encoding_match = regexp.search(xml_data) 1919 if xml_encoding_match is not None: 1920 xml_encoding = xml_encoding_match.groups()[0].lower() 1921 if isHTML: 1922 self.declaredHTMLEncoding = xml_encoding 1923 if sniffed_xml_encoding and \ 1924 (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 1925 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 1926 'utf-16', 
'utf-32', 'utf_16', 'utf_32', 1927 'utf16', 'u16')): 1928 xml_encoding = sniffed_xml_encoding 1929 return xml_data, xml_encoding, sniffed_xml_encoding 1930 1931 1932 def find_codec(self, charset): 1933 return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ 1934 or (charset and self._codec(charset.replace("-", ""))) \ 1935 or (charset and self._codec(charset.replace("-", "_"))) \ 1936 or charset 1937 1938 def _codec(self, charset): 1939 if not charset: return charset 1940 codec = None 1941 try: 1942 codecs.lookup(charset) 1943 codec = charset 1944 except (LookupError, ValueError): 1945 pass 1946 return codec 1947 1948 EBCDIC_TO_ASCII_MAP = None 1949 def _ebcdic_to_ascii(self, s): 1950 c = self.__class__ 1951 if not c.EBCDIC_TO_ASCII_MAP: 1952 emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, 1953 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, 1954 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, 1955 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, 1956 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, 1957 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, 1958 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, 1959 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, 1960 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, 1961 201,202,106,107,108,109,110,111,112,113,114,203,204,205, 1962 206,207,208,209,126,115,116,117,118,119,120,121,122,210, 1963 211,212,213,214,215,216,217,218,219,220,221,222,223,224, 1964 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, 1965 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, 1966 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, 1967 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, 1968 250,251,252,253,254,255) 1969 import string 1970 c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ 1971 ''.join(map(chr, range(256))), ''.join(map(chr, emap))) 1972 return s.translate(c.EBCDIC_TO_ASCII_MAP) 1973 1974 MS_CHARS = { '\x80' : ('euro', 
'20AC'), 1975 '\x81' : ' ', 1976 '\x82' : ('sbquo', '201A'), 1977 '\x83' : ('fnof', '192'), 1978 '\x84' : ('bdquo', '201E'), 1979 '\x85' : ('hellip', '2026'), 1980 '\x86' : ('dagger', '2020'), 1981 '\x87' : ('Dagger', '2021'), 1982 '\x88' : ('circ', '2C6'), 1983 '\x89' : ('permil', '2030'), 1984 '\x8A' : ('Scaron', '160'), 1985 '\x8B' : ('lsaquo', '2039'), 1986 '\x8C' : ('OElig', '152'), 1987 '\x8D' : '?', 1988 '\x8E' : ('#x17D', '17D'), 1989 '\x8F' : '?', 1990 '\x90' : '?', 1991 '\x91' : ('lsquo', '2018'), 1992 '\x92' : ('rsquo', '2019'), 1993 '\x93' : ('ldquo', '201C'), 1994 '\x94' : ('rdquo', '201D'), 1995 '\x95' : ('bull', '2022'), 1996 '\x96' : ('ndash', '2013'), 1997 '\x97' : ('mdash', '2014'), 1998 '\x98' : ('tilde', '2DC'), 1999 '\x99' : ('trade', '2122'), 2000 '\x9a' : ('scaron', '161'), 2001 '\x9b' : ('rsaquo', '203A'), 2002 '\x9c' : ('oelig', '153'), 2003 '\x9d' : '?', 2004 '\x9e' : ('#x17E', '17E'), 2005 '\x9f' : ('Yuml', ''),} 2006 2007 ####################################################################### 2008 2009 2010 #By default, act as an HTML pretty-printer. 2011 if __name__ == '__main__': 2012 import sys 2013 soup = BeautifulSoup(sys.stdin) 2014 print soup.prettify() 2015