Home | History | Annotate | Download | only in thirdparty
      1 """Beautiful Soup
      2 Elixir and Tonic
      3 "The Screen-Scraper's Friend"
      4 http://www.crummy.com/software/BeautifulSoup/
      5 
      6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
      7 tree representation. It provides methods and Pythonic idioms that make
      8 it easy to navigate, search, and modify the tree.
      9 
     10 A well-formed XML/HTML document yields a well-formed data
     11 structure. An ill-formed XML/HTML document yields a correspondingly
     12 ill-formed data structure. If your document is only locally
     13 well-formed, you can use this library to find and process the
     14 well-formed part of it.
     15 
     16 Beautiful Soup works with Python 2.2 and up. It has no external
     17 dependencies, but you'll have more success at converting data to UTF-8
     18 if you also install these three packages:
     19 
     20 * chardet, for auto-detecting character encodings
     21   http://chardet.feedparser.org/
     22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
     23   by stock Python.
     24   http://cjkpython.i18n.org/
     25 
     26 Beautiful Soup defines classes for two main parsing strategies:
     27 
     28  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
     29    language that kind of looks like XML.
     30 
     31  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
     32    or invalid. This class has web browser-like heuristics for
     33    obtaining a sensible parse tree in the face of common HTML errors.
     34 
     35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
     36 the encoding of an HTML or XML document, and converting it to
     37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
     38 
     39 For more than you ever wanted to know about Beautiful Soup, see the
     40 documentation:
     41 http://www.crummy.com/software/BeautifulSoup/documentation.html
     42 
     43 Here, have some legalese:
     44 
     45 Copyright (c) 2004-2010, Leonard Richardson
     46 
     47 All rights reserved.
     48 
     49 Redistribution and use in source and binary forms, with or without
     50 modification, are permitted provided that the following conditions are
     51 met:
     52 
     53   * Redistributions of source code must retain the above copyright
     54     notice, this list of conditions and the following disclaimer.
     55 
     56   * Redistributions in binary form must reproduce the above
     57     copyright notice, this list of conditions and the following
     58     disclaimer in the documentation and/or other materials provided
     59     with the distribution.
     60 
     61   * Neither the name of the the Beautiful Soup Consortium and All
     62     Night Kosher Bakery nor the names of its contributors may be
     63     used to endorse or promote products derived from this software
     64     without specific prior written permission.
     65 
     66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
     70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
     77 
     78 """
     79 from __future__ import generators
     80 
     81 __author__ = "Leonard Richardson (leonardr (at] segfault.org)"
     82 __version__ = "3.2.0"
     83 __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
     84 __license__ = "New-style BSD"
     85 
     86 from sgmllib import SGMLParser, SGMLParseError
     87 import codecs
     88 import markupbase
     89 import types
     90 import re
     91 import sgmllib
     92 try:
     93   from htmlentitydefs import name2codepoint
     94 except ImportError:
     95   name2codepoint = {}
     96 try:
     97     set
     98 except NameError:
     99     from sets import Set as set
    100 
#These hacks make Beautiful Soup able to parse XML with namespaces.
#Both patterns accept ':' (as well as '-', '_' and '.') after the first
#letter of a name. Note that these assignments replace attributes of
#the imported sgmllib and markupbase modules themselves.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Default value of the `encoding` argument of the __str__ methods below.
DEFAULT_OUTPUT_ENCODING = "utf-8"
    106 
    107 def _match_css_class(str):
    108     """Build a RE to match the given CSS class."""
    109     return re.compile(r"(^|.*\s)%s($|\s)" % str)
    110 
    111 # First, the classes that represent markup elements.
    112 
    113 class PageElement(object):
    114     """Contains the navigational information for some part of the page
    115     (either a tag or a piece of text)"""
    116 
    117     def setup(self, parent=None, previous=None):
    118         """Sets up the initial relations between this element and
    119         other elements."""
    120         self.parent = parent
    121         self.previous = previous
    122         self.next = None
    123         self.previousSibling = None
    124         self.nextSibling = None
    125         if self.parent and self.parent.contents:
    126             self.previousSibling = self.parent.contents[-1]
    127             self.previousSibling.nextSibling = self
    128 
    129     def replaceWith(self, replaceWith):
    130         oldParent = self.parent
    131         myIndex = self.parent.index(self)
    132         if hasattr(replaceWith, "parent")\
    133                   and replaceWith.parent is self.parent:
    134             # We're replacing this element with one of its siblings.
    135             index = replaceWith.parent.index(replaceWith)
    136             if index and index < myIndex:
    137                 # Furthermore, it comes before this element. That
    138                 # means that when we extract it, the index of this
    139                 # element will change.
    140                 myIndex = myIndex - 1
    141         self.extract()
    142         oldParent.insert(myIndex, replaceWith)
    143 
    144     def replaceWithChildren(self):
    145         myParent = self.parent
    146         myIndex = self.parent.index(self)
    147         self.extract()
    148         reversedChildren = list(self.contents)
    149         reversedChildren.reverse()
    150         for child in reversedChildren:
    151             myParent.insert(myIndex, child)
    152 
    153     def extract(self):
    154         """Destructively rips this element out of the tree."""
    155         if self.parent:
    156             try:
    157                 del self.parent.contents[self.parent.index(self)]
    158             except ValueError:
    159                 pass
    160 
    161         #Find the two elements that would be next to each other if
    162         #this element (and any children) hadn't been parsed. Connect
    163         #the two.
    164         lastChild = self._lastRecursiveChild()
    165         nextElement = lastChild.next
    166 
    167         if self.previous:
    168             self.previous.next = nextElement
    169         if nextElement:
    170             nextElement.previous = self.previous
    171         self.previous = None
    172         lastChild.next = None
    173 
    174         self.parent = None
    175         if self.previousSibling:
    176             self.previousSibling.nextSibling = self.nextSibling
    177         if self.nextSibling:
    178             self.nextSibling.previousSibling = self.previousSibling
    179         self.previousSibling = self.nextSibling = None
    180         return self
    181 
    182     def _lastRecursiveChild(self):
    183         "Finds the last element beneath this object to be parsed."
    184         lastChild = self
    185         while hasattr(lastChild, 'contents') and lastChild.contents:
    186             lastChild = lastChild.contents[-1]
    187         return lastChild
    188 
    189     def insert(self, position, newChild):
    190         if isinstance(newChild, basestring) \
    191             and not isinstance(newChild, NavigableString):
    192             newChild = NavigableString(newChild)
    193 
    194         position =  min(position, len(self.contents))
    195         if hasattr(newChild, 'parent') and newChild.parent is not None:
    196             # We're 'inserting' an element that's already one
    197             # of this object's children.
    198             if newChild.parent is self:
    199                 index = self.index(newChild)
    200                 if index > position:
    201                     # Furthermore we're moving it further down the
    202                     # list of this object's children. That means that
    203                     # when we extract this element, our target index
    204                     # will jump down one.
    205                     position = position - 1
    206             newChild.extract()
    207 
    208         newChild.parent = self
    209         previousChild = None
    210         if position == 0:
    211             newChild.previousSibling = None
    212             newChild.previous = self
    213         else:
    214             previousChild = self.contents[position-1]
    215             newChild.previousSibling = previousChild
    216             newChild.previousSibling.nextSibling = newChild
    217             newChild.previous = previousChild._lastRecursiveChild()
    218         if newChild.previous:
    219             newChild.previous.next = newChild
    220 
    221         newChildsLastElement = newChild._lastRecursiveChild()
    222 
    223         if position >= len(self.contents):
    224             newChild.nextSibling = None
    225 
    226             parent = self
    227             parentsNextSibling = None
    228             while not parentsNextSibling:
    229                 parentsNextSibling = parent.nextSibling
    230                 parent = parent.parent
    231                 if not parent: # This is the last element in the document.
    232                     break
    233             if parentsNextSibling:
    234                 newChildsLastElement.next = parentsNextSibling
    235             else:
    236                 newChildsLastElement.next = None
    237         else:
    238             nextChild = self.contents[position]
    239             newChild.nextSibling = nextChild
    240             if newChild.nextSibling:
    241                 newChild.nextSibling.previousSibling = newChild
    242             newChildsLastElement.next = nextChild
    243 
    244         if newChildsLastElement.next:
    245             newChildsLastElement.next.previous = newChildsLastElement
    246         self.contents.insert(position, newChild)
    247 
    248     def append(self, tag):
    249         """Appends the given tag to the contents of this tag."""
    250         self.insert(len(self.contents), tag)
    251 
    252     def findNext(self, name=None, attrs={}, text=None, **kwargs):
    253         """Returns the first item that matches the given criteria and
    254         appears after this Tag in the document."""
    255         return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
    256 
    257     def findAllNext(self, name=None, attrs={}, text=None, limit=None,
    258                     **kwargs):
    259         """Returns all items that match the given criteria and appear
    260         after this Tag in the document."""
    261         return self._findAll(name, attrs, text, limit, self.nextGenerator,
    262                              **kwargs)
    263 
    264     def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
    265         """Returns the closest sibling to this Tag that matches the
    266         given criteria and appears after this Tag in the document."""
    267         return self._findOne(self.findNextSiblings, name, attrs, text,
    268                              **kwargs)
    269 
    270     def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
    271                          **kwargs):
    272         """Returns the siblings of this Tag that match the given
    273         criteria and appear after this Tag in the document."""
    274         return self._findAll(name, attrs, text, limit,
    275                              self.nextSiblingGenerator, **kwargs)
    276     fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
    277 
    278     def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    279         """Returns the first item that matches the given criteria and
    280         appears before this Tag in the document."""
    281         return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
    282 
    283     def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
    284                         **kwargs):
    285         """Returns all items that match the given criteria and appear
    286         before this Tag in the document."""
    287         return self._findAll(name, attrs, text, limit, self.previousGenerator,
    288                            **kwargs)
    289     fetchPrevious = findAllPrevious # Compatibility with pre-3.x
    290 
    291     def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
    292         """Returns the closest sibling to this Tag that matches the
    293         given criteria and appears before this Tag in the document."""
    294         return self._findOne(self.findPreviousSiblings, name, attrs, text,
    295                              **kwargs)
    296 
    297     def findPreviousSiblings(self, name=None, attrs={}, text=None,
    298                              limit=None, **kwargs):
    299         """Returns the siblings of this Tag that match the given
    300         criteria and appear before this Tag in the document."""
    301         return self._findAll(name, attrs, text, limit,
    302                              self.previousSiblingGenerator, **kwargs)
    303     fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
    304 
    305     def findParent(self, name=None, attrs={}, **kwargs):
    306         """Returns the closest parent of this Tag that matches the given
    307         criteria."""
    308         # NOTE: We can't use _findOne because findParents takes a different
    309         # set of arguments.
    310         r = None
    311         l = self.findParents(name, attrs, 1)
    312         if l:
    313             r = l[0]
    314         return r
    315 
    316     def findParents(self, name=None, attrs={}, limit=None, **kwargs):
    317         """Returns the parents of this Tag that match the given
    318         criteria."""
    319 
    320         return self._findAll(name, attrs, None, limit, self.parentGenerator,
    321                              **kwargs)
    322     fetchParents = findParents # Compatibility with pre-3.x
    323 
    324     #These methods do the real heavy lifting.
    325 
    326     def _findOne(self, method, name, attrs, text, **kwargs):
    327         r = None
    328         l = method(name, attrs, text, 1, **kwargs)
    329         if l:
    330             r = l[0]
    331         return r
    332 
    333     def _findAll(self, name, attrs, text, limit, generator, **kwargs):
    334         "Iterates over a generator looking for things that match."
    335 
    336         if isinstance(name, SoupStrainer):
    337             strainer = name
    338         # (Possibly) special case some findAll*(...) searches
    339         elif text is None and not limit and not attrs and not kwargs:
    340             # findAll*(True)
    341             if name is True:
    342                 return [element for element in generator()
    343                         if isinstance(element, Tag)]
    344             # findAll*('tag-name')
    345             elif isinstance(name, basestring):
    346                 return [element for element in generator()
    347                         if isinstance(element, Tag) and
    348                         element.name == name]
    349             else:
    350                 strainer = SoupStrainer(name, attrs, text, **kwargs)
    351         # Build a SoupStrainer
    352         else:
    353             strainer = SoupStrainer(name, attrs, text, **kwargs)
    354         results = ResultSet(strainer)
    355         g = generator()
    356         while True:
    357             try:
    358                 i = g.next()
    359             except StopIteration:
    360                 break
    361             if i:
    362                 found = strainer.search(i)
    363                 if found:
    364                     results.append(found)
    365                     if limit and len(results) >= limit:
    366                         break
    367         return results
    368 
    369     #These Generators can be used to navigate starting from both
    370     #NavigableStrings and Tags.
    371     def nextGenerator(self):
    372         i = self
    373         while i is not None:
    374             i = i.next
    375             yield i
    376 
    377     def nextSiblingGenerator(self):
    378         i = self
    379         while i is not None:
    380             i = i.nextSibling
    381             yield i
    382 
    383     def previousGenerator(self):
    384         i = self
    385         while i is not None:
    386             i = i.previous
    387             yield i
    388 
    389     def previousSiblingGenerator(self):
    390         i = self
    391         while i is not None:
    392             i = i.previousSibling
    393             yield i
    394 
    395     def parentGenerator(self):
    396         i = self
    397         while i is not None:
    398             i = i.parent
    399             yield i
    400 
    401     # Utility methods
    402     def substituteEncoding(self, str, encoding=None):
    403         encoding = encoding or "utf-8"
    404         return str.replace("%SOUP-ENCODING%", encoding)
    405 
    406     def toEncoding(self, s, encoding=None):
    407         """Encodes an object to a string in some encoding, or to Unicode.
    408         ."""
    409         if isinstance(s, unicode):
    410             if encoding:
    411                 s = s.encode(encoding)
    412         elif isinstance(s, str):
    413             if encoding:
    414                 s = s.encode(encoding)
    415             else:
    416                 s = unicode(s)
    417         else:
    418             if encoding:
    419                 s  = self.toEncoding(str(s), encoding)
    420             else:
    421                 s = unicode(s)
    422         return s
    423 
    424 class NavigableString(unicode, PageElement):
    425 
    426     def __new__(cls, value):
    427         """Create a new NavigableString.
    428 
    429         When unpickling a NavigableString, this method is called with
    430         the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
    431         passed in to the superclass's __new__ or the superclass won't know
    432         how to handle non-ASCII characters.
    433         """
    434         if isinstance(value, unicode):
    435             return unicode.__new__(cls, value)
    436         return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
    437 
    438     def __getnewargs__(self):
    439         return (NavigableString.__str__(self),)
    440 
    441     def __getattr__(self, attr):
    442         """text.string gives you text. This is for backwards
    443         compatibility for Navigable*String, but for CData* it lets you
    444         get the string without the CData wrapper."""
    445         if attr == 'string':
    446             return self
    447         else:
    448             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
    449 
    450     def __unicode__(self):
    451         return str(self).decode(DEFAULT_OUTPUT_ENCODING)
    452 
    453     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    454         if encoding:
    455             return self.encode(encoding)
    456         else:
    457             return self
    458 
    459 class CData(NavigableString):
    460 
    461     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    462         return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
    463 
    464 class ProcessingInstruction(NavigableString):
    465     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    466         output = self
    467         if "%SOUP-ENCODING%" in output:
    468             output = self.substituteEncoding(output, encoding)
    469         return "<?%s?>" % self.toEncoding(output, encoding)
    470 
    471 class Comment(NavigableString):
    472     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    473         return "<!--%s-->" % NavigableString.__str__(self, encoding)
    474 
    475 class Declaration(NavigableString):
    476     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    477         return "<!%s>" % NavigableString.__str__(self, encoding)
    478 
    479 class Tag(PageElement):
    480 
    481     """Represents a found HTML tag with its attributes and contents."""
    482 
    483     def _invert(h):
    484         "Cheap function to invert a hash."
    485         i = {}
    486         for k,v in h.items():
    487             i[v] = k
    488         return i
    489 
    490     XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
    491                                       "quot" : '"',
    492                                       "amp" : "&",
    493                                       "lt" : "<",
    494                                       "gt" : ">" }
    495 
    496     XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
    497 
    498     def _convertEntities(self, match):
    499         """Used in a call to re.sub to replace HTML, XML, and numeric
    500         entities with the appropriate Unicode characters. If HTML
    501         entities are being converted, any unrecognized entities are
    502         escaped."""
    503         x = match.group(1)
    504         if self.convertHTMLEntities and x in name2codepoint:
    505             return unichr(name2codepoint[x])
    506         elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
    507             if self.convertXMLEntities:
    508                 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
    509             else:
    510                 return u'&%s;' % x
    511         elif len(x) > 0 and x[0] == '#':
    512             # Handle numeric entities
    513             if len(x) > 1 and x[1] == 'x':
    514                 return unichr(int(x[2:], 16))
    515             else:
    516                 return unichr(int(x[1:]))
    517 
    518         elif self.escapeUnrecognizedEntities:
    519             return u'&amp;%s;' % x
    520         else:
    521             return u'&%s;' % x
    522 
    523     def __init__(self, parser, name, attrs=None, parent=None,
    524                  previous=None):
    525         "Basic constructor."
    526 
    527         # We don't actually store the parser object: that lets extracted
    528         # chunks be garbage-collected
    529         self.parserClass = parser.__class__
    530         self.isSelfClosing = parser.isSelfClosingTag(name)
    531         self.name = name
    532         if attrs is None:
    533             attrs = []
    534         elif isinstance(attrs, dict):
    535             attrs = attrs.items()
    536         self.attrs = attrs
    537         self.contents = []
    538         self.setup(parent, previous)
    539         self.hidden = False
    540         self.containsSubstitutions = False
    541         self.convertHTMLEntities = parser.convertHTMLEntities
    542         self.convertXMLEntities = parser.convertXMLEntities
    543         self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
    544 
    545         # Convert any HTML, XML, or numeric entities in the attribute values.
    546         convert = lambda(k, val): (k,
    547                                    re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
    548                                           self._convertEntities,
    549                                           val))
    550         self.attrs = map(convert, self.attrs)
    551 
    552     def getString(self):
    553         if (len(self.contents) == 1
    554             and isinstance(self.contents[0], NavigableString)):
    555             return self.contents[0]
    556 
    557     def setString(self, string):
    558         """Replace the contents of the tag with a string"""
    559         self.clear()
    560         self.append(string)
    561 
    562     string = property(getString, setString)
    563 
    564     def getText(self, separator=u""):
    565         if not len(self.contents):
    566             return u""
    567         stopNode = self._lastRecursiveChild().next
    568         strings = []
    569         current = self.contents[0]
    570         while current is not stopNode:
    571             if isinstance(current, NavigableString):
    572                 strings.append(current.strip())
    573             current = current.next
    574         return separator.join(strings)
    575 
    576     text = property(getText)
    577 
    578     def get(self, key, default=None):
    579         """Returns the value of the 'key' attribute for the tag, or
    580         the value given for 'default' if it doesn't have that
    581         attribute."""
    582         return self._getAttrMap().get(key, default)
    583 
    584     def clear(self):
    585         """Extract all children."""
    586         for child in self.contents[:]:
    587             child.extract()
    588 
    589     def index(self, element):
    590         for i, child in enumerate(self.contents):
    591             if child is element:
    592                 return i
    593         raise ValueError("Tag.index: element not in tag")
    594 
    595     def has_key(self, key):
    596         return self._getAttrMap().has_key(key)
    597 
    598     def __getitem__(self, key):
    599         """tag[key] returns the value of the 'key' attribute for the tag,
    600         and throws an exception if it's not there."""
    601         return self._getAttrMap()[key]
    602 
    603     def __iter__(self):
    604         "Iterating over a tag iterates over its contents."
    605         return iter(self.contents)
    606 
    607     def __len__(self):
    608         "The length of a tag is the length of its list of contents."
    609         return len(self.contents)
    610 
    611     def __contains__(self, x):
    612         return x in self.contents
    613 
    614     def __nonzero__(self):
    615         "A tag is non-None even if it has no contents."
    616         return True
    617 
    618     def __setitem__(self, key, value):
    619         """Setting tag[key] sets the value of the 'key' attribute for the
    620         tag."""
    621         self._getAttrMap()
    622         self.attrMap[key] = value
    623         found = False
    624         for i in range(0, len(self.attrs)):
    625             if self.attrs[i][0] == key:
    626                 self.attrs[i] = (key, value)
    627                 found = True
    628         if not found:
    629             self.attrs.append((key, value))
    630         self._getAttrMap()[key] = value
    631 
    632     def __delitem__(self, key):
    633         "Deleting tag[key] deletes all 'key' attributes for the tag."
    634         for item in self.attrs:
    635             if item[0] == key:
    636                 self.attrs.remove(item)
    637                 #We don't break because bad HTML can define the same
    638                 #attribute multiple times.
    639             self._getAttrMap()
    640             if self.attrMap.has_key(key):
    641                 del self.attrMap[key]
    642 
    643     def __call__(self, *args, **kwargs):
    644         """Calling a tag like a function is the same as calling its
    645         findAll() method. Eg. tag('a') returns a list of all the A tags
    646         found within this tag."""
    647         return apply(self.findAll, args, kwargs)
    648 
    649     def __getattr__(self, tag):
    650         #print "Getattr %s.%s" % (self.__class__, tag)
    651         if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
    652             return self.find(tag[:-3])
    653         elif tag.find('__') != 0:
    654             return self.find(tag)
    655         raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
    656 
    657     def __eq__(self, other):
    658         """Returns true iff this tag has the same name, the same attributes,
    659         and the same contents (recursively) as the given tag.
    660 
    661         NOTE: right now this will return false if two tags have the
    662         same attributes in a different order. Should this be fixed?"""
    663         if other is self:
    664             return True
    665         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
    666             return False
    667         for i in range(0, len(self.contents)):
    668             if self.contents[i] != other.contents[i]:
    669                 return False
    670         return True
    671 
    672     def __ne__(self, other):
    673         """Returns true iff this tag is not identical to the other tag,
    674         as defined in __eq__."""
    675         return not self == other
    676 
    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string, by handing `encoding` on to
        __str__."""
        rendered = self.__str__(encoding)
        return rendered
    680 
    681     def __unicode__(self):
    682         return self.__str__(None)
    683 
    684     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
    685                                            + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
    686                                            + ")")
    687 
    688     def _sub_entity(self, x):
    689         """Used with a regular expression to substitute the
    690         appropriate XML entity for an XML special character."""
    691         return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
    692 
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        encoding -- handed to toEncoding() for the tag name and each
                    attribute name and value, and to renderContents().
        prettyPrint -- if true, the opening tag, the contents and the
                    closing tag are put on separate lines and indented.
        indentLevel -- indentation level of this tag when
                    pretty-printing; the contents are rendered one
                    level deeper.

        A hidden tag renders as its contents only.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isinstance(val, basestring):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # A numeric character reference is used
                            # because it is understood by both XML and
                            # HTML consumers; "&squot;" is not a
                            # defined entity in either.
                            val = val.replace("'", "&#39;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        # A self-closing tag is written as <name /> with no end tag;
        # anything else gets an explicit </name>.
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            # 'space' is only defined, and only used, when
            # pretty-printing.
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s
    777 
    def decompose(self):
        """Recursively destroys the contents of this tree.

        The tag is first extract()ed. Then every element reachable
        from its first child by following 'next' links has its
        contents emptied (if it is a Tag) and its parent, previous,
        previousSibling, next and nextSibling links set to None."""
        self.extract()
        if not self.contents:
            return
        node = self.contents[0]
        while node is not None:
            # Remember the successor before the link is cleared.
            following = node.next
            if isinstance(node, Tag):
                del node.contents[:]
            node.parent = node.previous = node.previousSibling = None
            node.next = node.nextSibling = None
            node = following
    794 
    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag through __str__ in the given encoding with
        pretty-printing switched on."""
        prettyPrint = True
        return self.__str__(encoding, prettyPrint)
    797 
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.

        When pretty-printing, text nodes are stripped, empty ones are
        dropped, and each remaining one is written on its own indented
        line. Child tags are rendered by their own __str__."""
        pieces = []
        for child in self:
            if isinstance(child, NavigableString):
                text = child.__str__(encoding)
                if prettyPrint and text:
                    text = text.strip()
                if not text:
                    continue
                if prettyPrint:
                    pieces.append(" " * (indentLevel-1))
                pieces.append(text)
                if prettyPrint:
                    pieces.append("\n")
            elif isinstance(child, Tag):
                pieces.append(child.__str__(encoding, prettyPrint,
                                            indentLevel))
        return ''.join(pieces)
    818 
    819     #Soup methods
    820 
    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria, or None if nothing matches. The search is done by
        findAll() with a limit of 1."""
        matches = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if matches:
            return matches[0]
        return None
    findChild = find
    831 
    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        If 'recursive' is false only the direct children are
        considered; otherwise all descendants are."""
        if recursive:
            generator = self.recursiveChildGenerator
        else:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll
    848 
    # Pre-3.x compatibility methods: the old names are kept as plain
    # aliases of find() and findAll().
    first = find
    fetch = findAll
    852 
    def fetchText(self, text=None, recursive=True, limit=None):
        """Calls findAll() with only the text, recursive and limit
        criteria and returns its result."""
        found = self.findAll(text=text, recursive=recursive, limit=limit)
        return found
    855 
    def firstText(self, text=None, recursive=True):
        """Calls find() with only the text and recursive criteria and
        returns its result."""
        found = self.find(text=text, recursive=recursive)
        return found
    858 
    859     #Private methods
    860 
    861     def _getAttrMap(self):
    862         """Initializes a map representation of this tag's attributes,
    863         if not already initialized."""
    864         if not getattr(self, 'attrMap'):
    865             self.attrMap = {}
    866             for (key, value) in self.attrs:
    867                 self.attrMap[key] = value
    868         return self.attrMap
    869 
    870     #Generator methods
    def childGenerator(self):
        """Returns an iterator over this tag's direct children."""
        # The contents list already holds the children in order.
        children = self.contents
        return iter(children)
    874 
    def recursiveChildGenerator(self):
        """Yields every descendant of this tag, starting at the first
        child and following each element's 'next' link until the
        element after the last recursive child is reached. Yields
        nothing if the tag has no contents."""
        if not len(self.contents):
            # A bare return ends the generator. Raising StopIteration
            # by hand inside a generator body is converted to a
            # RuntimeError under PEP 479.
            return
        stopNode = self._lastRecursiveChild().next
        current = self.contents[0]
        while current is not stopNode:
            yield current
            current = current.next
    883 
    884 
    885 # Next, a couple classes to represent queries and their results.
    886 class SoupStrainer:
    887     """Encapsulates a number of ways of matching a markup element (tag or
    888     text)."""
    889 
    890     def __init__(self, name=None, attrs={}, text=None, **kwargs):
    891         self.name = name
    892         if isinstance(attrs, basestring):
    893             kwargs['class'] = _match_css_class(attrs)
    894             attrs = None
    895         if kwargs:
    896             if attrs:
    897                 attrs = attrs.copy()
    898                 attrs.update(kwargs)
    899             else:
    900                 attrs = kwargs
    901         self.attrs = attrs
    902         self.text = text
    903 
    904     def __str__(self):
    905         if self.text:
    906             return self.text
    907         else:
    908             return "%s|%s" % (self.name, self.attrs)
    909 
    910     def searchTag(self, markupName=None, markupAttrs={}):
    911         found = None
    912         markup = None
    913         if isinstance(markupName, Tag):
    914             markup = markupName
    915             markupAttrs = markup
    916         callFunctionWithTagData = callable(self.name) \
    917                                 and not isinstance(markupName, Tag)
    918 
    919         if (not self.name) \
    920                or callFunctionWithTagData \
    921                or (markup and self._matches(markup, self.name)) \
    922                or (not markup and self._matches(markupName, self.name)):
    923             if callFunctionWithTagData:
    924                 match = self.name(markupName, markupAttrs)
    925             else:
    926                 match = True
    927                 markupAttrMap = None
    928                 for attr, matchAgainst in self.attrs.items():
    929                     if not markupAttrMap:
    930                          if hasattr(markupAttrs, 'get'):
    931                             markupAttrMap = markupAttrs
    932                          else:
    933                             markupAttrMap = {}
    934                             for k,v in markupAttrs:
    935                                 markupAttrMap[k] = v
    936                     attrValue = markupAttrMap.get(attr)
    937                     if not self._matches(attrValue, matchAgainst):
    938                         match = False
    939                         break
    940             if match:
    941                 if markup:
    942                     found = markup
    943                 else:
    944                     found = markupName
    945         return found
    946 
    947     def search(self, markup):
    948         #print 'looking for %s in %s' % (self, markup)
    949         found = None
    950         # If given a list of items, scan it for a text element that
    951         # matches.
    952         if hasattr(markup, "__iter__") \
    953                 and not isinstance(markup, Tag):
    954             for element in markup:
    955                 if isinstance(element, NavigableString) \
    956                        and self.search(element):
    957                     found = element
    958                     break
    959         # If it's a Tag, make sure its name or attributes match.
    960         # Don't bother with Tags if we're searching for text.
    961         elif isinstance(markup, Tag):
    962             if not self.text:
    963                 found = self.searchTag(markup)
    964         # If it's text, make sure the text matches.
    965         elif isinstance(markup, NavigableString) or \
    966                  isinstance(markup, basestring):
    967             if self._matches(markup, self.text):
    968                 found = markup
    969         else:
    970             raise Exception, "I don't know how to match against a %s" \
    971                   % markup.__class__
    972         return found
    973 
    974     def _matches(self, markup, matchAgainst):
    975         #print "Matching %s against %s" % (markup, matchAgainst)
    976         result = False
    977         if matchAgainst is True:
    978             result = markup is not None
    979         elif callable(matchAgainst):
    980             result = matchAgainst(markup)
    981         else:
    982             #Custom match methods take the tag as an argument, but all
    983             #other ways of matching match the tag name as a string.
    984             if isinstance(markup, Tag):
    985                 markup = markup.name
    986             if markup and not isinstance(markup, basestring):
    987                 markup = unicode(markup)
    988             #Now we know that chunk is either a string, or None.
    989             if hasattr(matchAgainst, 'match'):
    990                 # It's a regexp object.
    991                 result = markup and matchAgainst.search(markup)
    992             elif hasattr(matchAgainst, '__iter__'): # list-like
    993                 result = markup in matchAgainst
    994             elif hasattr(matchAgainst, 'items'):
    995                 result = markup.has_key(matchAgainst)
    996             elif matchAgainst and isinstance(markup, basestring):
    997                 if isinstance(markup, unicode):
    998                     matchAgainst = unicode(matchAgainst)
    999                 else:
   1000                     matchAgainst = str(matchAgainst)
   1001 
   1002             if not result:
   1003                 result = matchAgainst == markup
   1004         return result
   1005 
   1006 class ResultSet(list):
   1007     """A ResultSet is just a list that keeps track of the SoupStrainer
   1008     that created it."""
   1009     def __init__(self, source):
   1010         list.__init__([])
   1011         self.source = source
   1012 
   1013 # Now, some helper functions.
   1014 
   1015 def buildTagMap(default, *args):
   1016     """Turns a list of maps, lists, or scalars into a single map.
   1017     Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
   1018     NESTING_RESET_TAGS maps out of lists and partial maps."""
   1019     built = {}
   1020     for portion in args:
   1021         if hasattr(portion, 'items'):
   1022             #It's a map. Merge it.
   1023             for k,v in portion.items():
   1024                 built[k] = v
   1025         elif hasattr(portion, '__iter__'): # is a list
   1026             #It's a list. Map each item to the default.
   1027             for k in portion:
   1028                 built[k] = default
   1029         else:
   1030             #It's a scalar. Map it to the default.
   1031             built[portion] = default
   1032     return built
   1033 
   1034 # Now, the parser classes.
   1035 
   1036 class BeautifulStoneSoup(Tag, SGMLParser):
   1037 
   1038     """This class contains the basic parser and search code. It defines
   1039     a parser that knows nothing about tag behavior except for the
   1040     following:
   1041 
   1042       You can't close a tag without closing all the tags it encloses.
   1043       That is, "<foo><bar></foo>" actually means
   1044       "<foo><bar></bar></foo>".
   1045 
   1046     [Another possible explanation is "<foo><bar /></foo>", but since
   1047     this class defines no SELF_CLOSING_TAGS, it will never use that
   1048     explanation.]
   1049 
   1050     This class is useful for parsing XML or made-up markup languages,
   1051     or when BeautifulSoup makes an assumption counter to what you were
   1052     expecting."""
   1053 
   1054     SELF_CLOSING_TAGS = {}
   1055     NESTABLE_TAGS = {}
   1056     RESET_NESTING_TAGS = {}
   1057     QUOTE_TAGS = {}
   1058     PRESERVE_WHITESPACE_TAGS = []
   1059 
   1060     MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
   1061                        lambda x: x.group(1) + ' />'),
   1062                       (re.compile('<!\s+([^<>]*)>'),
   1063                        lambda x: '<!' + x.group(1) + '>')
   1064                       ]
   1065 
   1066     ROOT_TAG_NAME = u'[document]'
   1067 
   1068     HTML_ENTITIES = "html"
   1069     XML_ENTITIES = "xml"
   1070     XHTML_ENTITIES = "xhtml"
   1071     # TODO: This only exists for backwards-compatibility
   1072     ALL_ENTITIES = XHTML_ENTITIES
   1073 
   1074     # Used when determining whether a text node is all whitespace and
   1075     # can be replaced with a single space. A text node that contains
   1076     # fancy Unicode spaces (usually non-breaking) should be left
   1077     # alone.
   1078     STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
   1079 
   1080     def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
   1081                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
   1082                  convertEntities=None, selfClosingTags=None, isHTML=False):
   1083         """The Soup object is initialized as the 'root tag', and the
   1084         provided markup (which can be a string or a file-like object)
   1085         is fed into the underlying parser.
   1086 
   1087         sgmllib will process most bad HTML, and the BeautifulSoup
   1088         class has some tricks for dealing with some HTML that kills
   1089         sgmllib, but Beautiful Soup can nonetheless choke or lose data
   1090         if your data uses self-closing tags or declarations
   1091         incorrectly.
   1092 
   1093         By default, Beautiful Soup uses regexes to sanitize input,
   1094         avoiding the vast majority of these problems. If the problems
   1095         don't apply to you, pass in False for markupMassage, and
   1096         you'll get better performance.
   1097 
   1098         The default parser massage techniques fix the two most common
   1099         instances of invalid HTML that choke sgmllib:
   1100 
   1101          <br/> (No space between name of closing tag and tag close)
   1102          <! --Comment--> (Extraneous whitespace in declaration)
   1103 
   1104         You can pass in a custom list of (RE object, replace method)
   1105         tuples to get Beautiful Soup to scrub your input the way you
   1106         want."""
   1107 
   1108         self.parseOnlyThese = parseOnlyThese
   1109         self.fromEncoding = fromEncoding
   1110         self.smartQuotesTo = smartQuotesTo
   1111         self.convertEntities = convertEntities
   1112         # Set the rules for how we'll deal with the entities we
   1113         # encounter
   1114         if self.convertEntities:
   1115             # It doesn't make sense to convert encoded characters to
   1116             # entities even while you're converting entities to Unicode.
   1117             # Just convert it all to Unicode.
   1118             self.smartQuotesTo = None
   1119             if convertEntities == self.HTML_ENTITIES:
   1120                 self.convertXMLEntities = False
   1121                 self.convertHTMLEntities = True
   1122                 self.escapeUnrecognizedEntities = True
   1123             elif convertEntities == self.XHTML_ENTITIES:
   1124                 self.convertXMLEntities = True
   1125                 self.convertHTMLEntities = True
   1126                 self.escapeUnrecognizedEntities = False
   1127             elif convertEntities == self.XML_ENTITIES:
   1128                 self.convertXMLEntities = True
   1129                 self.convertHTMLEntities = False
   1130                 self.escapeUnrecognizedEntities = False
   1131         else:
   1132             self.convertXMLEntities = False
   1133             self.convertHTMLEntities = False
   1134             self.escapeUnrecognizedEntities = False
   1135 
   1136         self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
   1137         SGMLParser.__init__(self)
   1138 
   1139         if hasattr(markup, 'read'):        # It's a file-type object.
   1140             markup = markup.read()
   1141         self.markup = markup
   1142         self.markupMassage = markupMassage
   1143         try:
   1144             self._feed(isHTML=isHTML)
   1145         except StopParsing:
   1146             pass
   1147         self.markup = None                 # The markup can now be GCed
   1148 
   1149     def convert_charref(self, name):
   1150         """This method fixes a bug in Python's SGMLParser."""
   1151         try:
   1152             n = int(name)
   1153         except ValueError:
   1154             return
   1155         if not 0 <= n <= 127 : # ASCII ends at 127, not 255
   1156             return
   1157         return self.convert_codepoint(n)
   1158 
    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Converts self.markup to Unicode, applies the markup massage,
        resets the parse state and runs the markup through the SGML
        parser, closing any tags still open at the end.

        inDocumentEncoding -- offered to UnicodeDammit as a candidate
                              encoding, after self.fromEncoding.
        isHTML -- passed through to UnicodeDammit."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            # Already Unicode, so nothing is detected. A None
            # originalEncoding is recorded only if none exists yet.
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # A true value that is not iterable (such as True)
                # selects the default massage list.
                if not hasattr(self.markupMassage, "__iter__"):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()
   1191 
   1192     def __getattr__(self, methodName):
   1193         """This method routes method call requests to either the SGMLParser
   1194         superclass or the Tag superclass, depending on the method name."""
   1195         #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
   1196 
   1197         if methodName.startswith('start_') or methodName.startswith('end_') \
   1198                or methodName.startswith('do_'):
   1199             return SGMLParser.__getattr__(self, methodName)
   1200         elif not methodName.startswith('__'):
   1201             return Tag.__getattr__(self, methodName)
   1202         else:
   1203             raise AttributeError
   1204 
   1205     def isSelfClosingTag(self, name):
   1206         """Returns true iff the given string is the name of a
   1207         self-closing tag according to this parser."""
   1208         return self.SELF_CLOSING_TAGS.has_key(name) \
   1209                or self.instanceSelfClosingTags.has_key(name)
   1210 
    def reset(self):
        """Re-initializes this object as an empty root tag named
        ROOT_TAG_NAME and clears all parse state."""
        # The soup passes itself as the first argument to
        # Tag.__init__ -- presumably the parser slot, as in
        # unknown_starttag; confirm against Tag.__init__.
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        # A hidden tag renders as its contents only.
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        # The root is the bottom entry of the tag stack.
        self.pushTag(self)
   1220 
   1221     def popTag(self):
   1222         tag = self.tagStack.pop()
   1223 
   1224         #print "Pop", tag.name
   1225         if self.tagStack:
   1226             self.currentTag = self.tagStack[-1]
   1227         return self.currentTag
   1228 
   1229     def pushTag(self, tag):
   1230         #print "Push", tag.name
   1231         if self.currentTag:
   1232             self.currentTag.contents.append(tag)
   1233         self.tagStack.append(tag)
   1234         self.currentTag = self.tagStack[-1]
   1235 
   1236     def endData(self, containerClass=NavigableString):
   1237         if self.currentData:
   1238             currentData = u''.join(self.currentData)
   1239             if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
   1240                 not set([tag.name for tag in self.tagStack]).intersection(
   1241                     self.PRESERVE_WHITESPACE_TAGS)):
   1242                 if '\n' in currentData:
   1243                     currentData = '\n'
   1244                 else:
   1245                     currentData = ' '
   1246             self.currentData = []
   1247             if self.parseOnlyThese and len(self.tagStack) <= 1 and \
   1248                    (not self.parseOnlyThese.text or \
   1249                     not self.parseOnlyThese.search(currentData)):
   1250                 return
   1251             o = containerClass(currentData)
   1252             o.setup(self.currentTag, self.previous)
   1253             if self.previous:
   1254                 self.previous.next = o
   1255             self.previous = o
   1256             self.currentTag.contents.append(o)
   1257 
   1258 
   1259     def _popToTag(self, name, inclusivePop=True):
   1260         """Pops the tag stack up to and including the most recent
   1261         instance of the given tag. If inclusivePop is false, pops the tag
   1262         stack up to but *not* including the most recent instqance of
   1263         the given tag."""
   1264         #print "Popping to %s" % name
   1265         if name == self.ROOT_TAG_NAME:
   1266             return
   1267 
   1268         numPops = 0
   1269         mostRecentTag = None
   1270         for i in range(len(self.tagStack)-1, 0, -1):
   1271             if name == self.tagStack[i].name:
   1272                 numPops = len(self.tagStack)-i
   1273                 break
   1274         if not inclusivePop:
   1275             numPops = numPops - 1
   1276 
   1277         for i in range(0, numPops):
   1278             mostRecentTag = self.popTag()
   1279         return mostRecentTag
   1280 
   1281     def _smartPop(self, name):
   1282 
   1283         """We need to pop up to the previous tag of this type, unless
   1284         one of this tag's nesting reset triggers comes between this
   1285         tag and the previous tag of this type, OR unless this tag is a
   1286         generic nesting trigger and another generic nesting trigger
   1287         comes between this tag and the previous tag of this type.
   1288 
   1289         Examples:
   1290          <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
   1291          <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
   1292          <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
   1293 
   1294          <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
   1295          <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
   1296          <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
   1297         """
   1298 
   1299         nestingResetTriggers = self.NESTABLE_TAGS.get(name)
   1300         isNestable = nestingResetTriggers != None
   1301         isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
   1302         popTo = None
   1303         inclusive = True
   1304         for i in range(len(self.tagStack)-1, 0, -1):
   1305             p = self.tagStack[i]
   1306             if (not p or p.name == name) and not isNestable:
   1307                 #Non-nestable tags get popped to the top or to their
   1308                 #last occurance.
   1309                 popTo = name
   1310                 break
   1311             if (nestingResetTriggers is not None
   1312                 and p.name in nestingResetTriggers) \
   1313                 or (nestingResetTriggers is None and isResetNesting
   1314                     and self.RESET_NESTING_TAGS.has_key(p.name)):
   1315 
   1316                 #If we encounter one of the nesting reset triggers
   1317                 #peculiar to this tag, or we encounter another tag
   1318                 #that causes nesting to reset, pop up to but not
   1319                 #including that tag.
   1320                 popTo = p.name
   1321                 inclusive = False
   1322                 break
   1323             p = p.parent
   1324         if popTo:
   1325             self._popToTag(popTo, inclusive)
   1326 
   1327     def unknown_starttag(self, name, attrs, selfClosing=0):
   1328         #print "Start tag %s: %s" % (name, attrs)
   1329         if self.quoteStack:
   1330             #This is not a real tag.
   1331             #print "<%s> is not real!" % name
   1332             attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
   1333             self.handle_data('<%s%s>' % (name, attrs))
   1334             return
   1335         self.endData()
   1336 
   1337         if not self.isSelfClosingTag(name) and not selfClosing:
   1338             self._smartPop(name)
   1339 
   1340         if self.parseOnlyThese and len(self.tagStack) <= 1 \
   1341                and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
   1342             return
   1343 
   1344         tag = Tag(self, name, attrs, self.currentTag, self.previous)
   1345         if self.previous:
   1346             self.previous.next = tag
   1347         self.previous = tag
   1348         self.pushTag(tag)
   1349         if selfClosing or self.isSelfClosingTag(name):
   1350             self.popTag()
   1351         if name in self.QUOTE_TAGS:
   1352             #print "Beginning quote (%s)" % name
   1353             self.quoteStack.append(name)
   1354             self.literal = 1
   1355         return tag
   1356 
   1357     def unknown_endtag(self, name):
   1358         #print "End tag %s" % name
   1359         if self.quoteStack and self.quoteStack[-1] != name:
   1360             #This is not a real end tag.
   1361             #print "</%s> is not real!" % name
   1362             self.handle_data('</%s>' % name)
   1363             return
   1364         self.endData()
   1365         self._popToTag(name)
   1366         if self.quoteStack and self.quoteStack[-1] == name:
   1367             self.quoteStack.pop()
   1368             self.literal = (len(self.quoteStack) > 0)
   1369 
   1370     def handle_data(self, data):
   1371         self.currentData.append(data)
   1372 
    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass.

        text -- the text of the new node.
        subclass -- the class handed to endData() to wrap the text."""
        # Flush any text already buffered so that it becomes a node of
        # its own and is not merged with the new text.
        self.endData()
        self.handle_data(text)
        # Flush again, this time wrapping the text in 'subclass'.
        self.endData(subclass)
   1379 
   1380     def handle_pi(self, text):
   1381         """Handle a processing instruction as a ProcessingInstruction
   1382         object, possibly one with a %SOUP-ENCODING% slot into which an
   1383         encoding will be plugged later."""
   1384         if text[:3] == "xml":
   1385             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
   1386         self._toStringSubclass(text, ProcessingInstruction)
   1387 
   1388     def handle_comment(self, text):
   1389         "Handle comments as Comment objects."
   1390         self._toStringSubclass(text, Comment)
   1391 
   1392     def handle_charref(self, ref):
   1393         "Handle character references as data."
   1394         if self.convertEntities:
   1395             data = unichr(int(ref))
   1396         else:
   1397             data = '&#%s;' % ref
   1398         self.handle_data(data)
   1399 
   1400     def handle_entityref(self, ref):
   1401         """Handle entity references as data, possibly converting known
   1402         HTML and/or XML entity references to the corresponding Unicode
   1403         characters."""
   1404         data = None
   1405         if self.convertHTMLEntities:
   1406             try:
   1407                 data = unichr(name2codepoint[ref])
   1408             except KeyError:
   1409                 pass
   1410 
   1411         if not data and self.convertXMLEntities:
   1412                 data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
   1413 
   1414         if not data and self.convertHTMLEntities and \
   1415             not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
   1416                 # TODO: We've got a problem here. We're told this is
   1417                 # an entity reference, but it's not an XML entity
   1418                 # reference or an HTML entity reference. Nonetheless,
   1419                 # the logical thing to do is to pass it through as an
   1420                 # unrecognized entity reference.
   1421                 #
   1422                 # Except: when the input is "&carol;" this function
   1423                 # will be called with input "carol". When the input is
   1424                 # "AT&T", this function will be called with input
   1425                 # "T". We have no way of knowing whether a semicolon
   1426                 # was present originally, so we don't know whether
   1427                 # this is an unknown entity or just a misplaced
   1428                 # ampersand.
   1429                 #
   1430                 # The more common case is a misplaced ampersand, so I
   1431                 # escape the ampersand and omit the trailing semicolon.
   1432                 data = "&amp;%s" % ref
   1433         if not data:
   1434             # This case is different from the one above, because we
   1435             # haven't already gone through a supposedly comprehensive
   1436             # mapping of entities to Unicode characters. We might not
   1437             # have gone through any mapping at all. So the chances are
   1438             # very high that this is a real entity, and not a
   1439             # misplaced ampersand.
   1440             data = "&%s;" % ref
   1441         self.handle_data(data)
   1442 
   1443     def handle_decl(self, data):
   1444         "Handle DOCTYPEs and the like as Declaration objects."
   1445         self._toStringSubclass(data, Declaration)
   1446 
   1447     def parse_declaration(self, i):
   1448         """Treat a bogus SGML declaration as raw data. Treat a CDATA
   1449         declaration as a CData object."""
   1450         j = None
   1451         if self.rawdata[i:i+9] == '<![CDATA[':
   1452              k = self.rawdata.find(']]>', i)
   1453              if k == -1:
   1454                  k = len(self.rawdata)
   1455              data = self.rawdata[i+9:k]
   1456              j = k+3
   1457              self._toStringSubclass(data, CData)
   1458         else:
   1459             try:
   1460                 j = SGMLParser.parse_declaration(self, i)
   1461             except SGMLParseError:
   1462                 toHandle = self.rawdata[i:]
   1463                 self.handle_data(toHandle)
   1464                 j = i + len(toHandle)
   1465         return j
   1466 
   1467 class BeautifulSoup(BeautifulStoneSoup):
   1468 
   1469     """This parser knows the following facts about HTML:
   1470 
   1471     * Some tags have no closing tag and should be interpreted as being
   1472       closed as soon as they are encountered.
   1473 
   1474     * The text inside some tags (ie. 'script') may contain tags which
   1475       are not really part of the document and which should be parsed
   1476       as text, not tags. If you want to parse the text as tags, you can
   1477       always fetch it and parse it explicitly.
   1478 
   1479     * Tag nesting rules:
   1480 
   1481       Most tags can't be nested at all. For instance, the occurance of
   1482       a <p> tag should implicitly close the previous <p> tag.
   1483 
   1484        <p>Para1<p>Para2
   1485         should be transformed into:
   1486        <p>Para1</p><p>Para2
   1487 
   1488       Some tags can be nested arbitrarily. For instance, the occurance
   1489       of a <blockquote> tag should _not_ implicitly close the previous
   1490       <blockquote> tag.
   1491 
   1492        Alice said: <blockquote>Bob said: <blockquote>Blah
   1493         should NOT be transformed into:
   1494        Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
   1495 
   1496       Some tags can be nested, but the nesting is reset by the
   1497       interposition of other tags. For instance, a <tr> tag should
   1498       implicitly close the previous <tr> tag within the same <table>,
   1499       but not close a <tr> tag in another table.
   1500 
   1501        <table><tr>Blah<tr>Blah
   1502         should be transformed into:
   1503        <table><tr>Blah</tr><tr>Blah
   1504         but,
   1505        <tr>Blah<table><tr>Blah
   1506         should NOT be transformed into
   1507        <tr>Blah<table></tr><tr>Blah
   1508 
   1509     Differing assumptions about tag nesting rules are a major source
   1510     of problems with the BeautifulSoup class. If BeautifulSoup is not
   1511     treating as nestable a tag your page author treats as nestable,
   1512     try ICantBelieveItsBeautifulSoup, MinimalSoup, or
   1513     BeautifulStoneSoup before writing your own subclass."""
   1514 
   1515     def __init__(self, *args, **kwargs):
   1516         if not kwargs.has_key('smartQuotesTo'):
   1517             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
   1518         kwargs['isHTML'] = True
   1519         BeautifulStoneSoup.__init__(self, *args, **kwargs)
   1520 
   1521     SELF_CLOSING_TAGS = buildTagMap(None,
   1522                                     ('br' , 'hr', 'input', 'img', 'meta',
   1523                                     'spacer', 'link', 'frame', 'base', 'col'))
   1524 
   1525     PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
   1526 
   1527     QUOTE_TAGS = {'script' : None, 'textarea' : None}
   1528 
   1529     #According to the HTML standard, each of these inline tags can
   1530     #contain another tag of the same type. Furthermore, it's common
   1531     #to actually use these tags this way.
   1532     NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
   1533                             'center')
   1534 
   1535     #According to the HTML standard, these block tags can contain
   1536     #another tag of the same type. Furthermore, it's common
   1537     #to actually use these tags this way.
   1538     NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')
   1539 
   1540     #Lists can contain other lists, but there are restrictions.
   1541     NESTABLE_LIST_TAGS = { 'ol' : [],
   1542                            'ul' : [],
   1543                            'li' : ['ul', 'ol'],
   1544                            'dl' : [],
   1545                            'dd' : ['dl'],
   1546                            'dt' : ['dl'] }
   1547 
   1548     #Tables can contain other tables, but there are restrictions.
   1549     NESTABLE_TABLE_TAGS = {'table' : [],
   1550                            'tr' : ['table', 'tbody', 'tfoot', 'thead'],
   1551                            'td' : ['tr'],
   1552                            'th' : ['tr'],
   1553                            'thead' : ['table'],
   1554                            'tbody' : ['table'],
   1555                            'tfoot' : ['table'],
   1556                            }
   1557 
   1558     NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')
   1559 
   1560     #If one of these tags is encountered, all tags up to the next tag of
   1561     #this type are popped.
   1562     RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
   1563                                      NON_NESTABLE_BLOCK_TAGS,
   1564                                      NESTABLE_LIST_TAGS,
   1565                                      NESTABLE_TABLE_TAGS)
   1566 
   1567     NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
   1568                                 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
   1569 
   1570     # Used to detect the charset in a META tag; see start_meta
   1571     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
   1572 
   1573     def start_meta(self, attrs):
   1574         """Beautiful Soup can detect a charset included in a META tag,
   1575         try to convert the document to that charset, and re-parse the
   1576         document from the beginning."""
   1577         httpEquiv = None
   1578         contentType = None
   1579         contentTypeIndex = None
   1580         tagNeedsEncodingSubstitution = False
   1581 
   1582         for i in range(0, len(attrs)):
   1583             key, value = attrs[i]
   1584             key = key.lower()
   1585             if key == 'http-equiv':
   1586                 httpEquiv = value
   1587             elif key == 'content':
   1588                 contentType = value
   1589                 contentTypeIndex = i
   1590 
   1591         if httpEquiv and contentType: # It's an interesting meta tag.
   1592             match = self.CHARSET_RE.search(contentType)
   1593             if match:
   1594                 if (self.declaredHTMLEncoding is not None or
   1595                     self.originalEncoding == self.fromEncoding):
   1596                     # An HTML encoding was sniffed while converting
   1597                     # the document to Unicode, or an HTML encoding was
   1598                     # sniffed during a previous pass through the
   1599                     # document, or an encoding was specified
   1600                     # explicitly and it worked. Rewrite the meta tag.
   1601                     def rewrite(match):
   1602                         return match.group(1) + "%SOUP-ENCODING%"
   1603                     newAttr = self.CHARSET_RE.sub(rewrite, contentType)
   1604                     attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
   1605                                                newAttr)
   1606                     tagNeedsEncodingSubstitution = True
   1607                 else:
   1608                     # This is our first pass through the document.
   1609                     # Go through it again with the encoding information.
   1610                     newCharset = match.group(3)
   1611                     if newCharset and newCharset != self.originalEncoding:
   1612                         self.declaredHTMLEncoding = newCharset
   1613                         self._feed(self.declaredHTMLEncoding)
   1614                         raise StopParsing
   1615                     pass
   1616         tag = self.unknown_starttag("meta", attrs)
   1617         if tag and tagNeedsEncodingSubstitution:
   1618             tag.containsSubstitutions = True
   1619 
   1620 class StopParsing(Exception):
   1621     pass
   1622 
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags this class additionally treats as nestable. Note that
    # 'strong' and 'big' each appear twice in the tuple.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big')

    # Block tags this class additionally treats as nestable.
    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    # BeautifulSoup's nestable tags plus the two groups above.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
   1658 
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): the buildTagMap calls in BeautifulSoup pass the
    # default value first and the tag names after it. Here 'noscript'
    # is the only argument, so it presumably lands in the "default"
    # position and no tag is actually mapped. That would agree with
    # "no assumptions about tag nesting" above -- confirm against
    # buildTagMap before changing it.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    # Empty map: no tag names are registered as nestable here.
    NESTABLE_TAGS = {}
   1671 
   1672 class BeautifulSOAP(BeautifulStoneSoup):
   1673     """This class will push a tag with only a single string child into
   1674     the tag's parent as an attribute. The attribute's name is the tag
   1675     name, and the value is the string child. An example should give
   1676     the flavor of the change:
   1677 
   1678     <foo><bar>baz</bar></foo>
   1679      =>
   1680     <foo bar="baz"><bar>baz</bar></foo>
   1681 
   1682     You can then access fooTag['bar'] instead of fooTag.barTag.string.
   1683 
   1684     This is, of course, useful for scraping structures that tend to
   1685     use subelements instead of attributes, such as SOAP messages. Note
   1686     that it modifies its input, so don't print the modified version
   1687     out.
   1688 
   1689     I'm not sure how many people really want to use this class; let me
   1690     know if you do. Mainly I like the name."""
   1691 
   1692     def popTag(self):
   1693         if len(self.tagStack) > 1:
   1694             tag = self.tagStack[-1]
   1695             parent = self.tagStack[-2]
   1696             parent._getAttrMap()
   1697             if (isinstance(tag, Tag) and len(tag.contents) == 1 and
   1698                 isinstance(tag.contents[0], NavigableString) and
   1699                 not parent.attrMap.has_key(tag.name)):
   1700                 parent[tag.name] = tag.contents[0]
   1701         BeautifulStoneSoup.popTag(self)
   1702 
   1703 #Enterprise class names! It has come to our attention that some people
   1704 #think the names of the Beautiful Soup parser classes are too silly
   1705 #and "unprofessional" for use in enterprise screen-scraping. We feel
   1706 #your pain! For such-minded folk, the Beautiful Soup Consortium And
   1707 #All-Night Kosher Bakery recommends renaming this file to
   1708 #"RobustParser.py" (or, in cases of extreme enterprisiness,
   1709 #"RobustParserBeanInterface.class") and using the following
   1710 #enterprise-friendly class aliases:
   1711 class RobustXMLParser(BeautifulStoneSoup):
   1712     pass
   1713 class RobustHTMLParser(BeautifulSoup):
   1714     pass
   1715 class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
   1716     pass
   1717 class RobustInsanelyWackAssHTMLParser(MinimalSoup):
   1718     pass
   1719 class SimplifyingSOAPParser(BeautifulSOAP):
   1720     pass
   1721 
   1722 ######################################################
   1723 #
   1724 # Bonus library: Unicode, Dammit
   1725 #
   1726 # This class forces XML data into a standard format (usually to UTF-8
   1727 # or Unicode).  It is heavily based on code from Mark Pilgrim's
   1728 # Universal Feed Parser. It does not rewrite the XML or HTML to
   1729 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
   1730 # (XML) and BeautifulSoup.start_meta (HTML).
   1731 
   1732 # Autodetects character encodings.
   1733 # Download from http://chardet.feedparser.org/
   1734 try:
   1735     import chardet
   1736 #    import chardet.constants
   1737 #    chardet.constants._debug = 1
   1738 except ImportError:
   1739     chardet = None
   1740 
   1741 # cjkcodecs and iconv_codec make Python know about more character encodings.
   1742 # Both are available from http://cjkpython.i18n.org/
   1743 # They're built in if you use Python 2.4.
   1744 try:
   1745     import cjkcodecs.aliases
   1746 except ImportError:
   1747     pass
   1748 try:
   1749     import iconv_codec
   1750 except ImportError:
   1751     pass
   1752 
   1753 class UnicodeDammit:
   1754     """A class for detecting the encoding of a *ML document and
   1755     converting it to a Unicode string. If the source encoding is
   1756     windows-1252, can replace MS smart quotes with their HTML or XML
   1757     equivalents."""
   1758 
   1759     # This dictionary maps commonly seen values for "charset" in HTML
   1760     # meta tags to the corresponding Python codec names. It only covers
   1761     # values that aren't in Python's aliases and can't be determined
   1762     # by the heuristics in find_codec.
   1763     CHARSET_ALIASES = { "macintosh" : "mac-roman",
   1764                         "x-sjis" : "shift-jis" }
   1765 
   1766     def __init__(self, markup, overrideEncodings=[],
   1767                  smartQuotesTo='xml', isHTML=False):
   1768         self.declaredHTMLEncoding = None
   1769         self.markup, documentEncoding, sniffedEncoding = \
   1770                      self._detectEncoding(markup, isHTML)
   1771         self.smartQuotesTo = smartQuotesTo
   1772         self.triedEncodings = []
   1773         if markup == '' or isinstance(markup, unicode):
   1774             self.originalEncoding = None
   1775             self.unicode = unicode(markup)
   1776             return
   1777 
   1778         u = None
   1779         for proposedEncoding in overrideEncodings:
   1780             u = self._convertFrom(proposedEncoding)
   1781             if u: break
   1782         if not u:
   1783             for proposedEncoding in (documentEncoding, sniffedEncoding):
   1784                 u = self._convertFrom(proposedEncoding)
   1785                 if u: break
   1786 
   1787         # If no luck and we have auto-detection library, try that:
   1788         if not u and chardet and not isinstance(self.markup, unicode):
   1789             u = self._convertFrom(chardet.detect(self.markup)['encoding'])
   1790 
   1791         # As a last resort, try utf-8 and windows-1252:
   1792         if not u:
   1793             for proposed_encoding in ("utf-8", "windows-1252"):
   1794                 u = self._convertFrom(proposed_encoding)
   1795                 if u: break
   1796 
   1797         self.unicode = u
   1798         if not u: self.originalEncoding = None
   1799 
   1800     def _subMSChar(self, orig):
   1801         """Changes a MS smart quote character to an XML or HTML
   1802         entity."""
   1803         sub = self.MS_CHARS.get(orig)
   1804         if isinstance(sub, tuple):
   1805             if self.smartQuotesTo == 'xml':
   1806                 sub = '&#x%s;' % sub[1]
   1807             else:
   1808                 sub = '&%s;' % sub[0]
   1809         return sub
   1810 
   1811     def _convertFrom(self, proposed):
   1812         proposed = self.find_codec(proposed)
   1813         if not proposed or proposed in self.triedEncodings:
   1814             return None
   1815         self.triedEncodings.append(proposed)
   1816         markup = self.markup
   1817 
   1818         # Convert smart quotes to HTML if coming from an encoding
   1819         # that might have them.
   1820         if self.smartQuotesTo and proposed.lower() in("windows-1252",
   1821                                                       "iso-8859-1",
   1822                                                       "iso-8859-2"):
   1823             markup = re.compile("([\x80-\x9f])").sub \
   1824                      (lambda(x): self._subMSChar(x.group(1)),
   1825                       markup)
   1826 
   1827         try:
   1828             # print "Trying to convert document to %s" % proposed
   1829             u = self._toUnicode(markup, proposed)
   1830             self.markup = u
   1831             self.originalEncoding = proposed
   1832         except Exception, e:
   1833             # print "That didn't work!"
   1834             # print e
   1835             return None
   1836         #print "Correct encoding: %s" % proposed
   1837         return self.markup
   1838 
   1839     def _toUnicode(self, data, encoding):
   1840         '''Given a string and its encoding, decodes the string into Unicode.
   1841         %encoding is a string recognized by encodings.aliases'''
   1842 
   1843         # strip Byte Order Mark (if present)
   1844         if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
   1845                and (data[2:4] != '\x00\x00'):
   1846             encoding = 'utf-16be'
   1847             data = data[2:]
   1848         elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
   1849                  and (data[2:4] != '\x00\x00'):
   1850             encoding = 'utf-16le'
   1851             data = data[2:]
   1852         elif data[:3] == '\xef\xbb\xbf':
   1853             encoding = 'utf-8'
   1854             data = data[3:]
   1855         elif data[:4] == '\x00\x00\xfe\xff':
   1856             encoding = 'utf-32be'
   1857             data = data[4:]
   1858         elif data[:4] == '\xff\xfe\x00\x00':
   1859             encoding = 'utf-32le'
   1860             data = data[4:]
   1861         newdata = unicode(data, encoding)
   1862         return newdata
   1863 
   1864     def _detectEncoding(self, xml_data, isHTML=False):
   1865         """Given a document, tries to detect its XML encoding."""
   1866         xml_encoding = sniffed_xml_encoding = None
   1867         try:
   1868             if xml_data[:4] == '\x4c\x6f\xa7\x94':
   1869                 # EBCDIC
   1870                 xml_data = self._ebcdic_to_ascii(xml_data)
   1871             elif xml_data[:4] == '\x00\x3c\x00\x3f':
   1872                 # UTF-16BE
   1873                 sniffed_xml_encoding = 'utf-16be'
   1874                 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
   1875             elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
   1876                      and (xml_data[2:4] != '\x00\x00'):
   1877                 # UTF-16BE with BOM
   1878                 sniffed_xml_encoding = 'utf-16be'
   1879                 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
   1880             elif xml_data[:4] == '\x3c\x00\x3f\x00':
   1881                 # UTF-16LE
   1882                 sniffed_xml_encoding = 'utf-16le'
   1883                 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
   1884             elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
   1885                      (xml_data[2:4] != '\x00\x00'):
   1886                 # UTF-16LE with BOM
   1887                 sniffed_xml_encoding = 'utf-16le'
   1888                 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
   1889             elif xml_data[:4] == '\x00\x00\x00\x3c':
   1890                 # UTF-32BE
   1891                 sniffed_xml_encoding = 'utf-32be'
   1892                 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
   1893             elif xml_data[:4] == '\x3c\x00\x00\x00':
   1894                 # UTF-32LE
   1895                 sniffed_xml_encoding = 'utf-32le'
   1896                 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
   1897             elif xml_data[:4] == '\x00\x00\xfe\xff':
   1898                 # UTF-32BE with BOM
   1899                 sniffed_xml_encoding = 'utf-32be'
   1900                 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
   1901             elif xml_data[:4] == '\xff\xfe\x00\x00':
   1902                 # UTF-32LE with BOM
   1903                 sniffed_xml_encoding = 'utf-32le'
   1904                 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
   1905             elif xml_data[:3] == '\xef\xbb\xbf':
   1906                 # UTF-8 with BOM
   1907                 sniffed_xml_encoding = 'utf-8'
   1908                 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
   1909             else:
   1910                 sniffed_xml_encoding = 'ascii'
   1911                 pass
   1912         except:
   1913             xml_encoding_match = None
   1914         xml_encoding_match = re.compile(
   1915             '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
   1916         if not xml_encoding_match and isHTML:
   1917             regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
   1918             xml_encoding_match = regexp.search(xml_data)
   1919         if xml_encoding_match is not None:
   1920             xml_encoding = xml_encoding_match.groups()[0].lower()
   1921             if isHTML:
   1922                 self.declaredHTMLEncoding = xml_encoding
   1923             if sniffed_xml_encoding and \
   1924                (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
   1925                                  'iso-10646-ucs-4', 'ucs-4', 'csucs4',
   1926                                  'utf-16', 'utf-32', 'utf_16', 'utf_32',
   1927                                  'utf16', 'u16')):
   1928                 xml_encoding = sniffed_xml_encoding
   1929         return xml_data, xml_encoding, sniffed_xml_encoding
   1930 
   1931 
   1932     def find_codec(self, charset):
   1933         return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
   1934                or (charset and self._codec(charset.replace("-", ""))) \
   1935                or (charset and self._codec(charset.replace("-", "_"))) \
   1936                or charset
   1937 
   1938     def _codec(self, charset):
   1939         if not charset: return charset
   1940         codec = None
   1941         try:
   1942             codecs.lookup(charset)
   1943             codec = charset
   1944         except (LookupError, ValueError):
   1945             pass
   1946         return codec
   1947 
   1948     EBCDIC_TO_ASCII_MAP = None
   1949     def _ebcdic_to_ascii(self, s):
   1950         c = self.__class__
   1951         if not c.EBCDIC_TO_ASCII_MAP:
   1952             emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
   1953                     16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
   1954                     128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
   1955                     144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
   1956                     32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
   1957                     38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
   1958                     45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
   1959                     186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
   1960                     195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
   1961                     201,202,106,107,108,109,110,111,112,113,114,203,204,205,
   1962                     206,207,208,209,126,115,116,117,118,119,120,121,122,210,
   1963                     211,212,213,214,215,216,217,218,219,220,221,222,223,224,
   1964                     225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
   1965                     73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
   1966                     82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
   1967                     90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
   1968                     250,251,252,253,254,255)
   1969             import string
   1970             c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
   1971             ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
   1972         return s.translate(c.EBCDIC_TO_ASCII_MAP)
   1973 
   1974     MS_CHARS = { '\x80' : ('euro', '20AC'),
   1975                  '\x81' : ' ',
   1976                  '\x82' : ('sbquo', '201A'),
   1977                  '\x83' : ('fnof', '192'),
   1978                  '\x84' : ('bdquo', '201E'),
   1979                  '\x85' : ('hellip', '2026'),
   1980                  '\x86' : ('dagger', '2020'),
   1981                  '\x87' : ('Dagger', '2021'),
   1982                  '\x88' : ('circ', '2C6'),
   1983                  '\x89' : ('permil', '2030'),
   1984                  '\x8A' : ('Scaron', '160'),
   1985                  '\x8B' : ('lsaquo', '2039'),
   1986                  '\x8C' : ('OElig', '152'),
   1987                  '\x8D' : '?',
   1988                  '\x8E' : ('#x17D', '17D'),
   1989                  '\x8F' : '?',
   1990                  '\x90' : '?',
   1991                  '\x91' : ('lsquo', '2018'),
   1992                  '\x92' : ('rsquo', '2019'),
   1993                  '\x93' : ('ldquo', '201C'),
   1994                  '\x94' : ('rdquo', '201D'),
   1995                  '\x95' : ('bull', '2022'),
   1996                  '\x96' : ('ndash', '2013'),
   1997                  '\x97' : ('mdash', '2014'),
   1998                  '\x98' : ('tilde', '2DC'),
   1999                  '\x99' : ('trade', '2122'),
   2000                  '\x9a' : ('scaron', '161'),
   2001                  '\x9b' : ('rsaquo', '203A'),
   2002                  '\x9c' : ('oelig', '153'),
   2003                  '\x9d' : '?',
   2004                  '\x9e' : ('#x17E', '17E'),
   2005                  '\x9f' : ('Yuml', ''),}
   2006 
   2007 #######################################################################
   2008 
   2009 
   2010 #By default, act as an HTML pretty-printer.
   2011 if __name__ == '__main__':
   2012     import sys
   2013     soup = BeautifulSoup(sys.stdin)
   2014     print soup.prettify()
   2015