      1 """Beautiful Soup
      2 Elixir and Tonic
      3 "The Screen-Scraper's Friend"
      4 http://www.crummy.com/software/BeautifulSoup/
      5 
      6 Beautiful Soup parses a (possibly invalid) XML or HTML document into a
      7 tree representation. It provides methods and Pythonic idioms that make
      8 it easy to navigate, search, and modify the tree.
      9 
     10 A well-formed XML/HTML document yields a well-formed data
     11 structure. An ill-formed XML/HTML document yields a correspondingly
     12 ill-formed data structure. If your document is only locally
     13 well-formed, you can use this library to find and process the
     14 well-formed part of it.
     15 
     16 Beautiful Soup works with Python 2.2 and up. It has no external
     17 dependencies, but you'll have more success at converting data to UTF-8
     18 if you also install these three packages:
     19 
     20 * chardet, for auto-detecting character encodings
     21   http://chardet.feedparser.org/
     22 * cjkcodecs and iconv_codec, which add more encodings to the ones supported
     23   by stock Python.
     24   http://cjkpython.i18n.org/
     25 
     26 Beautiful Soup defines classes for two main parsing strategies:
     27 
     28  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
     29    language that kind of looks like XML.
     30 
     31  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
     32    or invalid. This class has web browser-like heuristics for
     33    obtaining a sensible parse tree in the face of common HTML errors.
     34 
     35 Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
     36 the encoding of an HTML or XML document, and converting it to
     37 Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
     38 
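        A quick, illustrative taste (the markup and the printed values below are
        made up for this example, not taken from the real documentation):

            from BeautifulSoup import BeautifulSoup
            soup = BeautifulSoup("<html><p class='title'>Hello<p>World</html>")
            print soup.find('p', {'class': 'title'}).string    # prints: Hello
            print [p.text for p in soup.findAll('p')]           # prints: [u'Hello', u'World']
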
     39 For more than you ever wanted to know about Beautiful Soup, see the
     40 documentation:
     41 http://www.crummy.com/software/BeautifulSoup/documentation.html
     42 
     43 Here, have some legalese:
     44 
     45 Copyright (c) 2004-2010, Leonard Richardson
     46 
     47 All rights reserved.
     48 
     49 Redistribution and use in source and binary forms, with or without
     50 modification, are permitted provided that the following conditions are
     51 met:
     52 
     53   * Redistributions of source code must retain the above copyright
     54     notice, this list of conditions and the following disclaimer.
     55 
     56   * Redistributions in binary form must reproduce the above
     57     copyright notice, this list of conditions and the following
     58     disclaimer in the documentation and/or other materials provided
     59     with the distribution.
     60 
     61   * Neither the name of the Beautiful Soup Consortium and All
     62     Night Kosher Bakery nor the names of its contributors may be
     63     used to endorse or promote products derived from this software
     64     without specific prior written permission.
     65 
     66 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     67 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     68 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     69 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
     70 CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     71 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     72 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     73 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
     74 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
     75 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     76 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
     77 
     78 """
     79 from __future__ import generators
     80 
     81 __author__ = "Leonard Richardson (leonardr@segfault.org)"
     82 __version__ = "3.2.1"
     83 __copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
     84 __license__ = "New-style BSD"
     85 
     86 from sgmllib import SGMLParser, SGMLParseError
     87 import codecs
     88 import markupbase
     89 import types
     90 import re
     91 import sgmllib
     92 try:
     93   from htmlentitydefs import name2codepoint
     94 except ImportError:
     95   name2codepoint = {}
     96 try:
     97     set
     98 except NameError:
     99     from sets import Set as set
    100 
    101 #These hacks make Beautiful Soup able to parse XML with namespaces
    102 sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
    103 markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
    104 
    105 DEFAULT_OUTPUT_ENCODING = "utf-8"
    106 
    107 def _match_css_class(str):
    108     """Build a RE to match the given CSS class."""
    109     return re.compile(r"(^|.*\s)%s($|\s)" % str)
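         # Illustrative behaviour (not an exhaustive spec): the regex returned by
         # _match_css_class("foo") matches the class strings "foo", "foo bar" and
         # "bar foo", but not "foobar".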
    110 
    111 # First, the classes that represent markup elements.
    112 
    113 class PageElement(object):
    114     """Contains the navigational information for some part of the page
    115     (either a tag or a piece of text)"""
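             # Two kinds of links are kept for every element: .next/.previous
             # follow the order in which elements were parsed (document order),
             # while .parent, .nextSibling and .previousSibling describe the
             # tree structure. setup() below wires up the initial values.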
    116 
    117     def _invert(h):
    118         "Cheap function to invert a hash."
    119         i = {}
    120         for k,v in h.items():
    121             i[v] = k
    122         return i
    123 
    124     XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
    125                                       "quot" : '"',
    126                                       "amp" : "&",
    127                                       "lt" : "<",
    128                                       "gt" : ">" }
    129 
    130     XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
    131 
    132     def setup(self, parent=None, previous=None):
    133         """Sets up the initial relations between this element and
    134         other elements."""
    135         self.parent = parent
    136         self.previous = previous
    137         self.next = None
    138         self.previousSibling = None
    139         self.nextSibling = None
    140         if self.parent and self.parent.contents:
    141             self.previousSibling = self.parent.contents[-1]
    142             self.previousSibling.nextSibling = self
    143 
    144     def replaceWith(self, replaceWith):
    145         oldParent = self.parent
    146         myIndex = self.parent.index(self)
    147         if hasattr(replaceWith, "parent")\
    148                   and replaceWith.parent is self.parent:
    149             # We're replacing this element with one of its siblings.
    150             index = replaceWith.parent.index(replaceWith)
    151             if index and index < myIndex:
    152                 # Furthermore, it comes before this element. That
    153                 # means that when we extract it, the index of this
    154                 # element will change.
    155                 myIndex = myIndex - 1
    156         self.extract()
    157         oldParent.insert(myIndex, replaceWith)
    158 
    159     def replaceWithChildren(self):
    160         myParent = self.parent
    161         myIndex = self.parent.index(self)
    162         self.extract()
    163         reversedChildren = list(self.contents)
    164         reversedChildren.reverse()
    165         for child in reversedChildren:
    166             myParent.insert(myIndex, child)
    167 
    168     def extract(self):
    169         """Destructively rips this element out of the tree."""
    170         if self.parent:
    171             try:
    172                 del self.parent.contents[self.parent.index(self)]
    173             except ValueError:
    174                 pass
    175 
    176         #Find the two elements that would be next to each other if
    177         #this element (and any children) hadn't been parsed. Connect
    178         #the two.
    179         lastChild = self._lastRecursiveChild()
    180         nextElement = lastChild.next
    181 
    182         if self.previous:
    183             self.previous.next = nextElement
    184         if nextElement:
    185             nextElement.previous = self.previous
    186         self.previous = None
    187         lastChild.next = None
    188 
    189         self.parent = None
    190         if self.previousSibling:
    191             self.previousSibling.nextSibling = self.nextSibling
    192         if self.nextSibling:
    193             self.nextSibling.previousSibling = self.previousSibling
    194         self.previousSibling = self.nextSibling = None
    195         return self
    196 
    197     def _lastRecursiveChild(self):
    198         "Finds the last element beneath this object to be parsed."
    199         lastChild = self
    200         while hasattr(lastChild, 'contents') and lastChild.contents:
    201             lastChild = lastChild.contents[-1]
    202         return lastChild
    203 
    204     def insert(self, position, newChild):
    205         if isinstance(newChild, basestring) \
    206             and not isinstance(newChild, NavigableString):
    207             newChild = NavigableString(newChild)
    208 
    209         position =  min(position, len(self.contents))
    210         if hasattr(newChild, 'parent') and newChild.parent is not None:
    211             # We're 'inserting' an element that's already one
    212             # of this object's children.
    213             if newChild.parent is self:
    214                 index = self.index(newChild)
    215                 if index > position:
    216                     # Furthermore we're moving it further down the
    217                     # list of this object's children. That means that
    218                     # when we extract this element, our target index
    219                     # will jump down one.
    220                     position = position - 1
    221             newChild.extract()
    222 
    223         newChild.parent = self
    224         previousChild = None
    225         if position == 0:
    226             newChild.previousSibling = None
    227             newChild.previous = self
    228         else:
    229             previousChild = self.contents[position-1]
    230             newChild.previousSibling = previousChild
    231             newChild.previousSibling.nextSibling = newChild
    232             newChild.previous = previousChild._lastRecursiveChild()
    233         if newChild.previous:
    234             newChild.previous.next = newChild
    235 
    236         newChildsLastElement = newChild._lastRecursiveChild()
    237 
    238         if position >= len(self.contents):
    239             newChild.nextSibling = None
    240 
    241             parent = self
    242             parentsNextSibling = None
    243             while not parentsNextSibling:
    244                 parentsNextSibling = parent.nextSibling
    245                 parent = parent.parent
    246                 if not parent: # This is the last element in the document.
    247                     break
    248             if parentsNextSibling:
    249                 newChildsLastElement.next = parentsNextSibling
    250             else:
    251                 newChildsLastElement.next = None
    252         else:
    253             nextChild = self.contents[position]
    254             newChild.nextSibling = nextChild
    255             if newChild.nextSibling:
    256                 newChild.nextSibling.previousSibling = newChild
    257             newChildsLastElement.next = nextChild
    258 
    259         if newChildsLastElement.next:
    260             newChildsLastElement.next.previous = newChildsLastElement
    261         self.contents.insert(position, newChild)
    262 
    263     def append(self, tag):
    264         """Appends the given tag to the contents of this tag."""
    265         self.insert(len(self.contents), tag)
    266 
    267     def findNext(self, name=None, attrs={}, text=None, **kwargs):
    268         """Returns the first item that matches the given criteria and
    269         appears after this Tag in the document."""
    270         return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
    271 
    272     def findAllNext(self, name=None, attrs={}, text=None, limit=None,
    273                     **kwargs):
    274         """Returns all items that match the given criteria and appear
    275         after this Tag in the document."""
    276         return self._findAll(name, attrs, text, limit, self.nextGenerator,
    277                              **kwargs)
    278 
    279     def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
    280         """Returns the closest sibling to this Tag that matches the
    281         given criteria and appears after this Tag in the document."""
    282         return self._findOne(self.findNextSiblings, name, attrs, text,
    283                              **kwargs)
    284 
    285     def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
    286                          **kwargs):
    287         """Returns the siblings of this Tag that match the given
    288         criteria and appear after this Tag in the document."""
    289         return self._findAll(name, attrs, text, limit,
    290                              self.nextSiblingGenerator, **kwargs)
    291     fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
    292 
    293     def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    294         """Returns the first item that matches the given criteria and
    295         appears before this Tag in the document."""
    296         return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
    297 
    298     def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
    299                         **kwargs):
    300         """Returns all items that match the given criteria and appear
    301         before this Tag in the document."""
    302         return self._findAll(name, attrs, text, limit, self.previousGenerator,
    303                            **kwargs)
    304     fetchPrevious = findAllPrevious # Compatibility with pre-3.x
    305 
    306     def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
    307         """Returns the closest sibling to this Tag that matches the
    308         given criteria and appears before this Tag in the document."""
    309         return self._findOne(self.findPreviousSiblings, name, attrs, text,
    310                              **kwargs)
    311 
    312     def findPreviousSiblings(self, name=None, attrs={}, text=None,
    313                              limit=None, **kwargs):
    314         """Returns the siblings of this Tag that match the given
    315         criteria and appear before this Tag in the document."""
    316         return self._findAll(name, attrs, text, limit,
    317                              self.previousSiblingGenerator, **kwargs)
    318     fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
    319 
    320     def findParent(self, name=None, attrs={}, **kwargs):
    321         """Returns the closest parent of this Tag that matches the given
    322         criteria."""
    323         # NOTE: We can't use _findOne because findParents takes a different
    324         # set of arguments.
    325         r = None
    326         l = self.findParents(name, attrs, 1)
    327         if l:
    328             r = l[0]
    329         return r
    330 
    331     def findParents(self, name=None, attrs={}, limit=None, **kwargs):
    332         """Returns the parents of this Tag that match the given
    333         criteria."""
    334 
    335         return self._findAll(name, attrs, None, limit, self.parentGenerator,
    336                              **kwargs)
    337     fetchParents = findParents # Compatibility with pre-3.x
    338 
    339     #These methods do the real heavy lifting.
    340 
    341     def _findOne(self, method, name, attrs, text, **kwargs):
    342         r = None
    343         l = method(name, attrs, text, 1, **kwargs)
    344         if l:
    345             r = l[0]
    346         return r
    347 
    348     def _findAll(self, name, attrs, text, limit, generator, **kwargs):
    349         "Iterates over a generator looking for things that match."
    350 
    351         if isinstance(name, SoupStrainer):
    352             strainer = name
    353         # (Possibly) special case some findAll*(...) searches
    354         elif text is None and not limit and not attrs and not kwargs:
    355             # findAll*(True)
    356             if name is True:
    357                 return [element for element in generator()
    358                         if isinstance(element, Tag)]
    359             # findAll*('tag-name')
    360             elif isinstance(name, basestring):
    361                 return [element for element in generator()
    362                         if isinstance(element, Tag) and
    363                         element.name == name]
    364             else:
    365                 strainer = SoupStrainer(name, attrs, text, **kwargs)
    366         # Build a SoupStrainer
    367         else:
    368             strainer = SoupStrainer(name, attrs, text, **kwargs)
    369         results = ResultSet(strainer)
    370         g = generator()
    371         while True:
    372             try:
    373                 i = g.next()
    374             except StopIteration:
    375                 break
    376             if i:
    377                 found = strainer.search(i)
    378                 if found:
    379                     results.append(found)
    380                     if limit and len(results) >= limit:
    381                         break
    382         return results
    383 
    384     #These Generators can be used to navigate starting from both
    385     #NavigableStrings and Tags.
    386     def nextGenerator(self):
    387         i = self
    388         while i is not None:
    389             i = i.next
    390             yield i
    391 
    392     def nextSiblingGenerator(self):
    393         i = self
    394         while i is not None:
    395             i = i.nextSibling
    396             yield i
    397 
    398     def previousGenerator(self):
    399         i = self
    400         while i is not None:
    401             i = i.previous
    402             yield i
    403 
    404     def previousSiblingGenerator(self):
    405         i = self
    406         while i is not None:
    407             i = i.previousSibling
    408             yield i
    409 
    410     def parentGenerator(self):
    411         i = self
    412         while i is not None:
    413             i = i.parent
    414             yield i
    415 
    416     # Utility methods
    417     def substituteEncoding(self, str, encoding=None):
    418         encoding = encoding or "utf-8"
    419         return str.replace("%SOUP-ENCODING%", encoding)
    420 
    421     def toEncoding(self, s, encoding=None):
     422         """Encodes an object to a string in some encoding, or to Unicode.
     423         """
    424         if isinstance(s, unicode):
    425             if encoding:
    426                 s = s.encode(encoding)
    427         elif isinstance(s, str):
    428             if encoding:
    429                 s = s.encode(encoding)
    430             else:
    431                 s = unicode(s)
    432         else:
    433             if encoding:
    434                 s  = self.toEncoding(str(s), encoding)
    435             else:
    436                 s = unicode(s)
    437         return s
    438 
    439     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
    440                                            + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
    441                                            + ")")
    442 
    443     def _sub_entity(self, x):
    444         """Used with a regular expression to substitute the
    445         appropriate XML entity for an XML special character."""
    446         return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
    447 
    448 
    449 class NavigableString(unicode, PageElement):
    450 
    451     def __new__(cls, value):
    452         """Create a new NavigableString.
    453 
    454         When unpickling a NavigableString, this method is called with
    455         the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
    456         passed in to the superclass's __new__ or the superclass won't know
    457         how to handle non-ASCII characters.
    458         """
    459         if isinstance(value, unicode):
    460             return unicode.__new__(cls, value)
    461         return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
    462 
    463     def __getnewargs__(self):
    464         return (NavigableString.__str__(self),)
    465 
    466     def __getattr__(self, attr):
    467         """text.string gives you text. This is for backwards
    468         compatibility for Navigable*String, but for CData* it lets you
    469         get the string without the CData wrapper."""
    470         if attr == 'string':
    471             return self
    472         else:
    473             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
    474 
    475     def __unicode__(self):
    476         return str(self).decode(DEFAULT_OUTPUT_ENCODING)
    477 
    478     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    479         # Substitute outgoing XML entities.
    480         data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)
    481         if encoding:
    482             return data.encode(encoding)
    483         else:
    484             return data
    485 
    486 class CData(NavigableString):
    487 
    488     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    489         return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
    490 
    491 class ProcessingInstruction(NavigableString):
    492     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    493         output = self
    494         if "%SOUP-ENCODING%" in output:
    495             output = self.substituteEncoding(output, encoding)
    496         return "<?%s?>" % self.toEncoding(output, encoding)
    497 
    498 class Comment(NavigableString):
    499     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    500         return "<!--%s-->" % NavigableString.__str__(self, encoding)
    501 
    502 class Declaration(NavigableString):
    503     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    504         return "<!%s>" % NavigableString.__str__(self, encoding)
    505 
    506 class Tag(PageElement):
    507 
    508     """Represents a found HTML tag with its attributes and contents."""
    509 
    510     def _convertEntities(self, match):
    511         """Used in a call to re.sub to replace HTML, XML, and numeric
    512         entities with the appropriate Unicode characters. If HTML
    513         entities are being converted, any unrecognized entities are
    514         escaped."""
    515         x = match.group(1)
    516         if self.convertHTMLEntities and x in name2codepoint:
    517             return unichr(name2codepoint[x])
    518         elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
    519             if self.convertXMLEntities:
    520                 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
    521             else:
    522                 return u'&%s;' % x
    523         elif len(x) > 0 and x[0] == '#':
    524             # Handle numeric entities
    525             if len(x) > 1 and x[1] == 'x':
    526                 return unichr(int(x[2:], 16))
    527             else:
    528                 return unichr(int(x[1:]))
    529 
    530         elif self.escapeUnrecognizedEntities:
    531             return u'&amp;%s;' % x
    532         else:
    533             return u'&%s;' % x
    534 
    535     def __init__(self, parser, name, attrs=None, parent=None,
    536                  previous=None):
    537         "Basic constructor."
    538 
    539         # We don't actually store the parser object: that lets extracted
    540         # chunks be garbage-collected
    541         self.parserClass = parser.__class__
    542         self.isSelfClosing = parser.isSelfClosingTag(name)
    543         self.name = name
    544         if attrs is None:
    545             attrs = []
    546         elif isinstance(attrs, dict):
    547             attrs = attrs.items()
    548         self.attrs = attrs
    549         self.contents = []
    550         self.setup(parent, previous)
    551         self.hidden = False
    552         self.containsSubstitutions = False
    553         self.convertHTMLEntities = parser.convertHTMLEntities
    554         self.convertXMLEntities = parser.convertXMLEntities
    555         self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
    556 
    557         # Convert any HTML, XML, or numeric entities in the attribute values.
    558         convert = lambda(k, val): (k,
    559                                    re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
    560                                           self._convertEntities,
    561                                           val))
    562         self.attrs = map(convert, self.attrs)
    563 
    564     def getString(self):
    565         if (len(self.contents) == 1
    566             and isinstance(self.contents[0], NavigableString)):
    567             return self.contents[0]
    568 
    569     def setString(self, string):
    570         """Replace the contents of the tag with a string"""
    571         self.clear()
    572         self.append(string)
    573 
    574     string = property(getString, setString)
    575 
    576     def getText(self, separator=u""):
    577         if not len(self.contents):
    578             return u""
    579         stopNode = self._lastRecursiveChild().next
    580         strings = []
    581         current = self.contents[0]
    582         while current is not stopNode:
    583             if isinstance(current, NavigableString):
    584                 strings.append(current.strip())
    585             current = current.next
    586         return separator.join(strings)
    587 
    588     text = property(getText)
    589 
    590     def get(self, key, default=None):
    591         """Returns the value of the 'key' attribute for the tag, or
    592         the value given for 'default' if it doesn't have that
    593         attribute."""
    594         return self._getAttrMap().get(key, default)
    595 
    596     def clear(self):
    597         """Extract all children."""
    598         for child in self.contents[:]:
    599             child.extract()
    600 
    601     def index(self, element):
    602         for i, child in enumerate(self.contents):
    603             if child is element:
    604                 return i
    605         raise ValueError("Tag.index: element not in tag")
    606 
    607     def has_key(self, key):
    608         return self._getAttrMap().has_key(key)
    609 
    610     def __getitem__(self, key):
    611         """tag[key] returns the value of the 'key' attribute for the tag,
    612         and throws an exception if it's not there."""
    613         return self._getAttrMap()[key]
    614 
    615     def __iter__(self):
    616         "Iterating over a tag iterates over its contents."
    617         return iter(self.contents)
    618 
    619     def __len__(self):
    620         "The length of a tag is the length of its list of contents."
    621         return len(self.contents)
    622 
    623     def __contains__(self, x):
    624         return x in self.contents
    625 
    626     def __nonzero__(self):
    627         "A tag is non-None even if it has no contents."
    628         return True
    629 
    630     def __setitem__(self, key, value):
    631         """Setting tag[key] sets the value of the 'key' attribute for the
    632         tag."""
    633         self._getAttrMap()
    634         self.attrMap[key] = value
    635         found = False
    636         for i in range(0, len(self.attrs)):
    637             if self.attrs[i][0] == key:
    638                 self.attrs[i] = (key, value)
    639                 found = True
    640         if not found:
    641             self.attrs.append((key, value))
    642         self._getAttrMap()[key] = value
    643 
    644     def __delitem__(self, key):
    645         "Deleting tag[key] deletes all 'key' attributes for the tag."
    646         for item in self.attrs:
    647             if item[0] == key:
    648                 self.attrs.remove(item)
    649                 #We don't break because bad HTML can define the same
    650                 #attribute multiple times.
    651             self._getAttrMap()
    652             if self.attrMap.has_key(key):
    653                 del self.attrMap[key]
    654 
    655     def __call__(self, *args, **kwargs):
    656         """Calling a tag like a function is the same as calling its
    657         findAll() method. Eg. tag('a') returns a list of all the A tags
    658         found within this tag."""
    659         return apply(self.findAll, args, kwargs)
    660 
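             # Attribute access doubles as a search: tag.b is shorthand for
             # tag.find('b'), and the older tag.bTag spelling does the same.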
    661     def __getattr__(self, tag):
    662         #print "Getattr %s.%s" % (self.__class__, tag)
    663         if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
    664             return self.find(tag[:-3])
    665         elif tag.find('__') != 0:
    666             return self.find(tag)
    667         raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
    668 
    669     def __eq__(self, other):
    670         """Returns true iff this tag has the same name, the same attributes,
    671         and the same contents (recursively) as the given tag.
    672 
    673         NOTE: right now this will return false if two tags have the
    674         same attributes in a different order. Should this be fixed?"""
    675         if other is self:
    676             return True
    677         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
    678             return False
    679         for i in range(0, len(self.contents)):
    680             if self.contents[i] != other.contents[i]:
    681                 return False
    682         return True
    683 
    684     def __ne__(self, other):
    685         """Returns true iff this tag is not identical to the other tag,
    686         as defined in __eq__."""
    687         return not self == other
    688 
    689     def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    690         """Renders this tag as a string."""
    691         return self.__str__(encoding)
    692 
    693     def __unicode__(self):
    694         return self.__str__(None)
    695 
    696     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
    697                 prettyPrint=False, indentLevel=0):
    698         """Returns a string or Unicode representation of this tag and
    699         its contents. To get Unicode, pass None for encoding.
    700 
    701         NOTE: since Python's HTML parser consumes whitespace, this
    702         method is not certain to reproduce the whitespace present in
    703         the original string."""
    704 
    705         encodedName = self.toEncoding(self.name, encoding)
    706 
    707         attrs = []
    708         if self.attrs:
    709             for key, val in self.attrs:
    710                 fmt = '%s="%s"'
    711                 if isinstance(val, basestring):
    712                     if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
    713                         val = self.substituteEncoding(val, encoding)
    714 
    715                     # The attribute value either:
    716                     #
    717                     # * Contains no embedded double quotes or single quotes.
    718                     #   No problem: we enclose it in double quotes.
    719                     # * Contains embedded single quotes. No problem:
    720                     #   double quotes work here too.
    721                     # * Contains embedded double quotes. No problem:
    722                     #   we enclose it in single quotes.
    723                     # * Embeds both single _and_ double quotes. This
    724                     #   can't happen naturally, but it can happen if
    725                     #   you modify an attribute value after parsing
    726                     #   the document. Now we have a bit of a
    727                     #   problem. We solve it by enclosing the
    728                     #   attribute in single quotes, and escaping any
    729                     #   embedded single quotes to XML entities.
    730                     if '"' in val:
    731                         fmt = "%s='%s'"
    732                         if "'" in val:
    733                             # TODO: replace with apos when
    734                             # appropriate.
    735                             val = val.replace("'", "&squot;")
    736 
    737                     # Now we're okay w/r/t quotes. But the attribute
    738                     # value might also contain angle brackets, or
    739                     # ampersands that aren't part of entities. We need
    740                     # to escape those to XML entities too.
    741                     val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
    742 
    743                 attrs.append(fmt % (self.toEncoding(key, encoding),
    744                                     self.toEncoding(val, encoding)))
    745         close = ''
    746         closeTag = ''
    747         if self.isSelfClosing:
    748             close = ' /'
    749         else:
    750             closeTag = '</%s>' % encodedName
    751 
    752         indentTag, indentContents = 0, 0
    753         if prettyPrint:
    754             indentTag = indentLevel
    755             space = (' ' * (indentTag-1))
    756             indentContents = indentTag + 1
    757         contents = self.renderContents(encoding, prettyPrint, indentContents)
    758         if self.hidden:
    759             s = contents
    760         else:
    761             s = []
    762             attributeString = ''
    763             if attrs:
    764                 attributeString = ' ' + ' '.join(attrs)
    765             if prettyPrint:
    766                 s.append(space)
    767             s.append('<%s%s%s>' % (encodedName, attributeString, close))
    768             if prettyPrint:
    769                 s.append("\n")
    770             s.append(contents)
    771             if prettyPrint and contents and contents[-1] != "\n":
    772                 s.append("\n")
    773             if prettyPrint and closeTag:
    774                 s.append(space)
    775             s.append(closeTag)
    776             if prettyPrint and closeTag and self.nextSibling:
    777                 s.append("\n")
    778             s = ''.join(s)
    779         return s
    780 
    781     def decompose(self):
    782         """Recursively destroys the contents of this tree."""
    783         self.extract()
    784         if len(self.contents) == 0:
    785             return
    786         current = self.contents[0]
    787         while current is not None:
    788             next = current.next
    789             if isinstance(current, Tag):
    790                 del current.contents[:]
    791             current.parent = None
    792             current.previous = None
    793             current.previousSibling = None
    794             current.next = None
    795             current.nextSibling = None
    796             current = next
    797 
    798     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
    799         return self.__str__(encoding, True)
    800 
    801     def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
    802                        prettyPrint=False, indentLevel=0):
    803         """Renders the contents of this tag as a string in the given
     804         encoding. If encoding is None, returns a Unicode string."""
    805         s=[]
    806         for c in self:
    807             text = None
    808             if isinstance(c, NavigableString):
    809                 text = c.__str__(encoding)
    810             elif isinstance(c, Tag):
    811                 s.append(c.__str__(encoding, prettyPrint, indentLevel))
    812             if text and prettyPrint:
    813                 text = text.strip()
    814             if text:
    815                 if prettyPrint:
    816                     s.append(" " * (indentLevel-1))
    817                 s.append(text)
    818                 if prettyPrint:
    819                     s.append("\n")
    820         return ''.join(s)
    821 
    822     #Soup methods
    823 
    824     def find(self, name=None, attrs={}, recursive=True, text=None,
    825              **kwargs):
    826         """Return only the first child of this Tag matching the given
    827         criteria."""
    828         r = None
    829         l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
    830         if l:
    831             r = l[0]
    832         return r
    833     findChild = find
    834 
    835     def findAll(self, name=None, attrs={}, recursive=True, text=None,
    836                 limit=None, **kwargs):
    837         """Extracts a list of Tag objects that match the given
    838         criteria.  You can specify the name of the Tag and any
    839         attributes you want the Tag to have.
    840 
    841         The value of a key-value pair in the 'attrs' map can be a
    842         string, a list of strings, a regular expression object, or a
    843         callable that takes a string and returns whether or not the
    844         string matches for some custom definition of 'matches'. The
    845         same is true of the tag name."""
    846         generator = self.recursiveChildGenerator
    847         if not recursive:
    848             generator = self.childGenerator
    849         return self._findAll(name, attrs, text, limit, generator, **kwargs)
    850     findChildren = findAll
    851 
    852     # Pre-3.x compatibility methods
    853     first = find
    854     fetch = findAll
    855 
    856     def fetchText(self, text=None, recursive=True, limit=None):
    857         return self.findAll(text=text, recursive=recursive, limit=limit)
    858 
    859     def firstText(self, text=None, recursive=True):
    860         return self.find(text=text, recursive=recursive)
    861 
    862     #Private methods
    863 
    864     def _getAttrMap(self):
    865         """Initializes a map representation of this tag's attributes,
    866         if not already initialized."""
     867         if not getattr(self, 'attrMap', None):  # default avoids Tag.__getattr__ doing a tree search
    868             self.attrMap = {}
    869             for (key, value) in self.attrs:
    870                 self.attrMap[key] = value
    871         return self.attrMap
    872 
    873     #Generator methods
    874     def childGenerator(self):
    875         # Just use the iterator from the contents
    876         return iter(self.contents)
    877 
    878     def recursiveChildGenerator(self):
    879         if not len(self.contents):
    880             raise StopIteration
    881         stopNode = self._lastRecursiveChild().next
    882         current = self.contents[0]
    883         while current is not stopNode:
    884             yield current
    885             current = current.next
    886 
    887 
    888 # Next, a couple classes to represent queries and their results.
    889 class SoupStrainer:
    890     """Encapsulates a number of ways of matching a markup element (tag or
    891     text)."""
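             # Illustrative use (hypothetical values): a strainer built with
             #     SoupStrainer('a', href=re.compile('^http'))
             # matches <a> tags whose href attribute starts with "http". It can be
             # passed to the find*/findAll methods in place of a name, or to the
             # soup constructor as parseOnlyThese to restrict what gets parsed.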
    892 
    893     def __init__(self, name=None, attrs={}, text=None, **kwargs):
    894         self.name = name
    895         if isinstance(attrs, basestring):
    896             kwargs['class'] = _match_css_class(attrs)
    897             attrs = None
    898         if kwargs:
    899             if attrs:
    900                 attrs = attrs.copy()
    901                 attrs.update(kwargs)
    902             else:
    903                 attrs = kwargs
    904         self.attrs = attrs
    905         self.text = text
    906 
    907     def __str__(self):
    908         if self.text:
    909             return self.text
    910         else:
    911             return "%s|%s" % (self.name, self.attrs)
    912 
    913     def searchTag(self, markupName=None, markupAttrs={}):
    914         found = None
    915         markup = None
    916         if isinstance(markupName, Tag):
    917             markup = markupName
    918             markupAttrs = markup
    919         callFunctionWithTagData = callable(self.name) \
    920                                 and not isinstance(markupName, Tag)
    921 
    922         if (not self.name) \
    923                or callFunctionWithTagData \
    924                or (markup and self._matches(markup, self.name)) \
    925                or (not markup and self._matches(markupName, self.name)):
    926             if callFunctionWithTagData:
    927                 match = self.name(markupName, markupAttrs)
    928             else:
    929                 match = True
    930                 markupAttrMap = None
    931                 for attr, matchAgainst in self.attrs.items():
    932                     if not markupAttrMap:
    933                          if hasattr(markupAttrs, 'get'):
    934                             markupAttrMap = markupAttrs
    935                          else:
    936                             markupAttrMap = {}
    937                             for k,v in markupAttrs:
    938                                 markupAttrMap[k] = v
    939                     attrValue = markupAttrMap.get(attr)
    940                     if not self._matches(attrValue, matchAgainst):
    941                         match = False
    942                         break
    943             if match:
    944                 if markup:
    945                     found = markup
    946                 else:
    947                     found = markupName
    948         return found
    949 
    950     def search(self, markup):
    951         #print 'looking for %s in %s' % (self, markup)
    952         found = None
    953         # If given a list of items, scan it for a text element that
    954         # matches.
    955         if hasattr(markup, "__iter__") \
    956                 and not isinstance(markup, Tag):
    957             for element in markup:
    958                 if isinstance(element, NavigableString) \
    959                        and self.search(element):
    960                     found = element
    961                     break
    962         # If it's a Tag, make sure its name or attributes match.
    963         # Don't bother with Tags if we're searching for text.
    964         elif isinstance(markup, Tag):
    965             if not self.text:
    966                 found = self.searchTag(markup)
    967         # If it's text, make sure the text matches.
    968         elif isinstance(markup, NavigableString) or \
    969                  isinstance(markup, basestring):
    970             if self._matches(markup, self.text):
    971                 found = markup
    972         else:
    973             raise Exception, "I don't know how to match against a %s" \
    974                   % markup.__class__
    975         return found
    976 
    977     def _matches(self, markup, matchAgainst):
    978         #print "Matching %s against %s" % (markup, matchAgainst)
    979         result = False
    980         if matchAgainst is True:
    981             result = markup is not None
    982         elif callable(matchAgainst):
    983             result = matchAgainst(markup)
    984         else:
    985             #Custom match methods take the tag as an argument, but all
    986             #other ways of matching match the tag name as a string.
    987             if isinstance(markup, Tag):
    988                 markup = markup.name
    989             if markup and not isinstance(markup, basestring):
    990                 markup = unicode(markup)
    991             #Now we know that chunk is either a string, or None.
    992             if hasattr(matchAgainst, 'match'):
    993                 # It's a regexp object.
    994                 result = markup and matchAgainst.search(markup)
    995             elif hasattr(matchAgainst, '__iter__'): # list-like
    996                 result = markup in matchAgainst
    997             elif hasattr(matchAgainst, 'items'):
    998                 result = markup.has_key(matchAgainst)
    999             elif matchAgainst and isinstance(markup, basestring):
   1000                 if isinstance(markup, unicode):
   1001                     matchAgainst = unicode(matchAgainst)
   1002                 else:
   1003                     matchAgainst = str(matchAgainst)
   1004 
   1005             if not result:
   1006                 result = matchAgainst == markup
   1007         return result
   1008 
   1009 class ResultSet(list):
   1010     """A ResultSet is just a list that keeps track of the SoupStrainer
   1011     that created it."""
   1012     def __init__(self, source):
    1013         list.__init__(self)
   1014         self.source = source
   1015 
   1016 # Now, some helper functions.
   1017 
   1018 def buildTagMap(default, *args):
   1019     """Turns a list of maps, lists, or scalars into a single map.
   1020     Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
   1021     NESTING_RESET_TAGS maps out of lists and partial maps."""
   1022     built = {}
   1023     for portion in args:
   1024         if hasattr(portion, 'items'):
   1025             #It's a map. Merge it.
   1026             for k,v in portion.items():
   1027                 built[k] = v
   1028         elif hasattr(portion, '__iter__'): # is a list
   1029             #It's a list. Map each item to the default.
   1030             for k in portion:
   1031                 built[k] = default
   1032         else:
   1033             #It's a scalar. Map it to the default.
   1034             built[portion] = default
   1035     return built
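         # For example (values are illustrative, not taken from the real tag maps):
         #     buildTagMap([], {'ol': ['ul']}, ['li', 'dd'])
         # returns {'ol': ['ul'], 'li': [], 'dd': []}: explicit map entries are
         # kept, and each bare list item is mapped to the default.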
   1036 
   1037 # Now, the parser classes.
   1038 
   1039 class BeautifulStoneSoup(Tag, SGMLParser):
   1040 
   1041     """This class contains the basic parser and search code. It defines
   1042     a parser that knows nothing about tag behavior except for the
   1043     following:
   1044 
   1045       You can't close a tag without closing all the tags it encloses.
   1046       That is, "<foo><bar></foo>" actually means
   1047       "<foo><bar></bar></foo>".
   1048 
   1049     [Another possible explanation is "<foo><bar /></foo>", but since
   1050     this class defines no SELF_CLOSING_TAGS, it will never use that
   1051     explanation.]
   1052 
   1053     This class is useful for parsing XML or made-up markup languages,
   1054     or when BeautifulSoup makes an assumption counter to what you were
   1055     expecting."""
   1056 
   1057     SELF_CLOSING_TAGS = {}
   1058     NESTABLE_TAGS = {}
   1059     RESET_NESTING_TAGS = {}
   1060     QUOTE_TAGS = {}
   1061     PRESERVE_WHITESPACE_TAGS = []
   1062 
   1063     MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
   1064                        lambda x: x.group(1) + ' />'),
   1065                       (re.compile('<!\s+([^<>]*)>'),
   1066                        lambda x: '<!' + x.group(1) + '>')
   1067                       ]
   1068 
   1069     ROOT_TAG_NAME = u'[document]'
   1070 
   1071     HTML_ENTITIES = "html"
   1072     XML_ENTITIES = "xml"
   1073     XHTML_ENTITIES = "xhtml"
   1074     # TODO: This only exists for backwards-compatibility
   1075     ALL_ENTITIES = XHTML_ENTITIES
   1076 
   1077     # Used when determining whether a text node is all whitespace and
   1078     # can be replaced with a single space. A text node that contains
   1079     # fancy Unicode spaces (usually non-breaking) should be left
   1080     # alone.
   1081     STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
   1082 
   1083     def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
   1084                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
   1085                  convertEntities=None, selfClosingTags=None, isHTML=False):
   1086         """The Soup object is initialized as the 'root tag', and the
   1087         provided markup (which can be a string or a file-like object)
   1088         is fed into the underlying parser.
   1089 
   1090         sgmllib will process most bad HTML, and the BeautifulSoup
   1091         class has some tricks for dealing with some HTML that kills
   1092         sgmllib, but Beautiful Soup can nonetheless choke or lose data
   1093         if your data uses self-closing tags or declarations
   1094         incorrectly.
   1095 
   1096         By default, Beautiful Soup uses regexes to sanitize input,
   1097         avoiding the vast majority of these problems. If the problems
   1098         don't apply to you, pass in False for markupMassage, and
   1099         you'll get better performance.
   1100 
   1101         The default parser massage techniques fix the two most common
   1102         instances of invalid HTML that choke sgmllib:
   1103 
   1104          <br/> (No space between name of closing tag and tag close)
   1105          <! --Comment--> (Extraneous whitespace in declaration)
   1106 
   1107         You can pass in a custom list of (RE object, replace method)
   1108         tuples to get Beautiful Soup to scrub your input the way you
   1109         want."""
   1110 
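                 # A hypothetical custom massage list (sketch only; the pattern and
                 # the name myMassage are made up), appended to the class defaults:
                 #     myMassage = BeautifulStoneSoup.MARKUP_MASSAGE + [
                 #         (re.compile('<!-([^-])'), lambda m: '<!--' + m.group(1))]
                 #     BeautifulStoneSoup(markup, markupMassage=myMassage)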
   1111         self.parseOnlyThese = parseOnlyThese
   1112         self.fromEncoding = fromEncoding
   1113         self.smartQuotesTo = smartQuotesTo
   1114         self.convertEntities = convertEntities
   1115         # Set the rules for how we'll deal with the entities we
   1116         # encounter
   1117         if self.convertEntities:
   1118             # It doesn't make sense to convert encoded characters to
   1119             # entities even while you're converting entities to Unicode.
   1120             # Just convert it all to Unicode.
   1121             self.smartQuotesTo = None
   1122             if convertEntities == self.HTML_ENTITIES:
   1123                 self.convertXMLEntities = False
   1124                 self.convertHTMLEntities = True
   1125                 self.escapeUnrecognizedEntities = True
   1126             elif convertEntities == self.XHTML_ENTITIES:
   1127                 self.convertXMLEntities = True
   1128                 self.convertHTMLEntities = True
   1129                 self.escapeUnrecognizedEntities = False
   1130             elif convertEntities == self.XML_ENTITIES:
   1131                 self.convertXMLEntities = True
   1132                 self.convertHTMLEntities = False
   1133                 self.escapeUnrecognizedEntities = False
   1134         else:
   1135             self.convertXMLEntities = False
   1136             self.convertHTMLEntities = False
   1137             self.escapeUnrecognizedEntities = False
   1138 
   1139         self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
   1140         SGMLParser.__init__(self)
   1141 
   1142         if hasattr(markup, 'read'):        # It's a file-type object.
   1143             markup = markup.read()
   1144         self.markup = markup
   1145         self.markupMassage = markupMassage
   1146         try:
   1147             self._feed(isHTML=isHTML)
   1148         except StopParsing:
   1149             pass
   1150         self.markup = None                 # The markup can now be GCed
   1151 
   1152     def convert_charref(self, name):
   1153         """This method fixes a bug in Python's SGMLParser."""
   1154         try:
   1155             n = int(name)
   1156         except ValueError:
   1157             return
   1158         if not 0 <= n <= 127 : # ASCII ends at 127, not 255
   1159             return
   1160         return self.convert_codepoint(n)
   1161 
   1162     def _feed(self, inDocumentEncoding=None, isHTML=False):
   1163         # Convert the document to Unicode.
   1164         markup = self.markup
   1165         if isinstance(markup, unicode):
   1166             if not hasattr(self, 'originalEncoding'):
   1167                 self.originalEncoding = None
   1168         else:
   1169             dammit = UnicodeDammit\
   1170                      (markup, [self.fromEncoding, inDocumentEncoding],
   1171                       smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
   1172             markup = dammit.unicode
   1173             self.originalEncoding = dammit.originalEncoding
   1174             self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
   1175         if markup:
   1176             if self.markupMassage:
   1177                 if not hasattr(self.markupMassage, "__iter__"):
   1178                     self.markupMassage = self.MARKUP_MASSAGE
   1179                 for fix, m in self.markupMassage:
   1180                     markup = fix.sub(m, markup)
   1181                 # TODO: We get rid of markupMassage so that the
   1182                 # soup object can be deepcopied later on. Some
   1183                 # Python installations can't copy regexes. If anyone
   1184                 # was relying on the existence of markupMassage, this
   1185                 # might cause problems.
   1186                 del(self.markupMassage)
   1187         self.reset()
   1188 
   1189         SGMLParser.feed(self, markup)
   1190         # Close out any unfinished strings and close all the open tags.
   1191         self.endData()
   1192         while self.currentTag.name != self.ROOT_TAG_NAME:
   1193             self.popTag()
   1194 
   1195     def __getattr__(self, methodName):
   1196         """This method routes method call requests to either the SGMLParser
   1197         superclass or the Tag superclass, depending on the method name."""
   1198         #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
   1199 
   1200         if methodName.startswith('start_') or methodName.startswith('end_') \
   1201                or methodName.startswith('do_'):
   1202             return SGMLParser.__getattr__(self, methodName)
   1203         elif not methodName.startswith('__'):
   1204             return Tag.__getattr__(self, methodName)
   1205         else:
   1206             raise AttributeError
   1207 
   1208     def isSelfClosingTag(self, name):
   1209         """Returns true iff the given string is the name of a
   1210         self-closing tag according to this parser."""
   1211         return self.SELF_CLOSING_TAGS.has_key(name) \
   1212                or self.instanceSelfClosingTags.has_key(name)
   1213 
   1214     def reset(self):
   1215         Tag.__init__(self, self, self.ROOT_TAG_NAME)
   1216         self.hidden = 1
   1217         SGMLParser.reset(self)
   1218         self.currentData = []
   1219         self.currentTag = None
   1220         self.tagStack = []
   1221         self.quoteStack = []
   1222         self.pushTag(self)
   1223 
   1224     def popTag(self):
   1225         tag = self.tagStack.pop()
   1226 
   1227         #print "Pop", tag.name
   1228         if self.tagStack:
   1229             self.currentTag = self.tagStack[-1]
   1230         return self.currentTag
   1231 
   1232     def pushTag(self, tag):
   1233         #print "Push", tag.name
   1234         if self.currentTag:
   1235             self.currentTag.contents.append(tag)
   1236         self.tagStack.append(tag)
   1237         self.currentTag = self.tagStack[-1]
   1238 
   1239     def endData(self, containerClass=NavigableString):
   1240         if self.currentData:
   1241             currentData = u''.join(self.currentData)
   1242             if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
   1243                 not set([tag.name for tag in self.tagStack]).intersection(
   1244                     self.PRESERVE_WHITESPACE_TAGS)):
   1245                 if '\n' in currentData:
   1246                     currentData = '\n'
   1247                 else:
   1248                     currentData = ' '
   1249             self.currentData = []
   1250             if self.parseOnlyThese and len(self.tagStack) <= 1 and \
   1251                    (not self.parseOnlyThese.text or \
   1252                     not self.parseOnlyThese.search(currentData)):
   1253                 return
   1254             o = containerClass(currentData)
   1255             o.setup(self.currentTag, self.previous)
   1256             if self.previous:
   1257                 self.previous.next = o
   1258             self.previous = o
   1259             self.currentTag.contents.append(o)
   1260 
   1261 
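            # A minimal sketch of the whitespace collapsing above. PRESERVE_WHITESPACE_TAGS
            # is only populated in the BeautifulSoup subclass below; the markup is made up:
            #
            #   BeautifulSoup('<p>  \n  </p>').p.string       # -> u'\n'      (collapsed)
            #   BeautifulSoup('<pre>  \n  </pre>').pre.string  # -> u'  \n  '  (preserved)
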
   1262     def _popToTag(self, name, inclusivePop=True):
   1263         """Pops the tag stack up to and including the most recent
   1264         instance of the given tag. If inclusivePop is false, pops the tag
   1265         stack up to but *not* including the most recent instance of
   1266         the given tag."""
   1267         #print "Popping to %s" % name
   1268         if name == self.ROOT_TAG_NAME:
   1269             return
   1270 
   1271         numPops = 0
   1272         mostRecentTag = None
   1273         for i in range(len(self.tagStack)-1, 0, -1):
   1274             if name == self.tagStack[i].name:
   1275                 numPops = len(self.tagStack)-i
   1276                 break
   1277         if not inclusivePop:
   1278             numPops = numPops - 1
   1279 
   1280         for i in range(0, numPops):
   1281             mostRecentTag = self.popTag()
   1282         return mostRecentTag
   1283 
   1284     def _smartPop(self, name):
   1285 
   1286         """We need to pop up to the previous tag of this type, unless
   1287         one of this tag's nesting reset triggers comes between this
   1288         tag and the previous tag of this type, OR unless this tag is a
   1289         generic nesting trigger and another generic nesting trigger
   1290         comes between this tag and the previous tag of this type.
   1291 
   1292         Examples:
   1293          <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
   1294          <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
   1295          <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
   1296 
   1297          <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
   1298          <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
   1299          <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
   1300         """
   1301 
   1302         nestingResetTriggers = self.NESTABLE_TAGS.get(name)
   1303         isNestable = nestingResetTriggers is not None
   1304         isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
   1305         popTo = None
   1306         inclusive = True
   1307         for i in range(len(self.tagStack)-1, 0, -1):
   1308             p = self.tagStack[i]
   1309             if (not p or p.name == name) and not isNestable:
   1310                 #Non-nestable tags get popped to the top or to their
   1311                 #last occurrence.
   1312                 popTo = name
   1313                 break
   1314             if (nestingResetTriggers is not None
   1315                 and p.name in nestingResetTriggers) \
   1316                 or (nestingResetTriggers is None and isResetNesting
   1317                     and self.RESET_NESTING_TAGS.has_key(p.name)):
   1318 
   1319                 #If we encounter one of the nesting reset triggers
   1320                 #peculiar to this tag, or we encounter another tag
   1321                 #that causes nesting to reset, pop up to but not
   1322                 #including that tag.
   1323                 popTo = p.name
   1324                 inclusive = False
   1325                 break
   1326             p = p.parent
   1327         if popTo:
   1328             self._popToTag(popTo, inclusive)
   1329 
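            # A minimal sketch of the first example in the docstring above, run through
            # the BeautifulSoup subclass below (where 'p' resets nesting and 'b' is not
            # nestable); the markup is made up:
            #
            #   soup = BeautifulSoup('<p>Foo<b>Bar<p>Baz')
            #   len(soup.findAll('p'))   # -> 2: the second <p> popped the open <b> and <p>
            #   soup.b.parent.name       # -> 'p', the *first* paragraph
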
   1330     def unknown_starttag(self, name, attrs, selfClosing=0):
   1331         #print "Start tag %s: %s" % (name, attrs)
   1332         if self.quoteStack:
   1333             #This is not a real tag.
   1334             #print "<%s> is not real!" % name
   1335             attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
   1336             self.handle_data('<%s%s>' % (name, attrs))
   1337             return
   1338         self.endData()
   1339 
   1340         if not self.isSelfClosingTag(name) and not selfClosing:
   1341             self._smartPop(name)
   1342 
   1343         if self.parseOnlyThese and len(self.tagStack) <= 1 \
   1344                and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
   1345             return
   1346 
   1347         tag = Tag(self, name, attrs, self.currentTag, self.previous)
   1348         if self.previous:
   1349             self.previous.next = tag
   1350         self.previous = tag
   1351         self.pushTag(tag)
   1352         if selfClosing or self.isSelfClosingTag(name):
   1353             self.popTag()
   1354         if name in self.QUOTE_TAGS:
   1355             #print "Beginning quote (%s)" % name
   1356             self.quoteStack.append(name)
   1357             self.literal = 1
   1358         return tag
   1359 
   1360     def unknown_endtag(self, name):
   1361         #print "End tag %s" % name
   1362         if self.quoteStack and self.quoteStack[-1] != name:
   1363             #This is not a real end tag.
   1364             #print "</%s> is not real!" % name
   1365             self.handle_data('</%s>' % name)
   1366             return
   1367         self.endData()
   1368         self._popToTag(name)
   1369         if self.quoteStack and self.quoteStack[-1] == name:
   1370             self.quoteStack.pop()
   1371             self.literal = (len(self.quoteStack) > 0)
   1372 
   1373     def handle_data(self, data):
   1374         self.currentData.append(data)
   1375 
   1376     def _toStringSubclass(self, text, subclass):
   1377         """Adds a certain piece of text to the tree as a NavigableString
   1378         subclass."""
   1379         self.endData()
   1380         self.handle_data(text)
   1381         self.endData(subclass)
   1382 
   1383     def handle_pi(self, text):
   1384         """Handle a processing instruction as a ProcessingInstruction
   1385         object, possibly one with a %SOUP-ENCODING% slot into which an
   1386         encoding will be plugged later."""
   1387         if text[:3] == "xml":
   1388             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
   1389         self._toStringSubclass(text, ProcessingInstruction)
   1390 
   1391     def handle_comment(self, text):
   1392         "Handle comments as Comment objects."
   1393         self._toStringSubclass(text, Comment)
   1394 
   1395     def handle_charref(self, ref):
   1396         "Handle character references as data."
   1397         if self.convertEntities:
   1398             data = unichr(int(ref))
   1399         else:
   1400             data = '&#%s;' % ref
   1401         self.handle_data(data)
   1402 
   1403     def handle_entityref(self, ref):
   1404         """Handle entity references as data, possibly converting known
   1405         HTML and/or XML entity references to the corresponding Unicode
   1406         characters."""
   1407         data = None
   1408         if self.convertHTMLEntities:
   1409             try:
   1410                 data = unichr(name2codepoint[ref])
   1411             except KeyError:
   1412                 pass
   1413 
   1414         if not data and self.convertXMLEntities:
   1415                 data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
   1416 
   1417         if not data and self.convertHTMLEntities and \
   1418             not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
   1419                 # TODO: We've got a problem here. We're told this is
   1420                 # an entity reference, but it's not an XML entity
   1421                 # reference or an HTML entity reference. Nonetheless,
   1422                 # the logical thing to do is to pass it through as an
   1423                 # unrecognized entity reference.
   1424                 #
   1425                 # Except: when the input is "&carol;" this function
   1426                 # will be called with input "carol". When the input is
   1427                 # "AT&T", this function will be called with input
   1428                 # "T". We have no way of knowing whether a semicolon
   1429                 # was present originally, so we don't know whether
   1430                 # this is an unknown entity or just a misplaced
   1431                 # ampersand.
   1432                 #
   1433                 # The more common case is a misplaced ampersand, so I
   1434                 # escape the ampersand and omit the trailing semicolon.
   1435                 data = "&amp;%s" % ref
   1436         if not data:
   1437             # This case is different from the one above, because we
   1438             # haven't already gone through a supposedly comprehensive
   1439             # mapping of entities to Unicode characters. We might not
   1440             # have gone through any mapping at all. So the chances are
   1441             # very high that this is a real entity, and not a
   1442             # misplaced ampersand.
   1443             data = "&%s;" % ref
   1444         self.handle_data(data)
   1445 
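            # A minimal sketch of the branches above (convertEntities and the
            # XML_ENTITIES/HTML_ENTITIES constants are defined earlier in this file;
            # the markup is made up):
            #
            #   xml = BeautifulStoneSoup('<doc>&lt;&eacute;&bogus;</doc>',
            #                            convertEntities=BeautifulStoneSoup.XML_ENTITIES)
            #   xml.doc.string    # -> u'<&eacute;&bogus;'  (only the XML entity converted)
            #
            #   html = BeautifulStoneSoup('<doc>&lt;&eacute;&bogus;</doc>',
            #                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
            #   html.doc.string   # -> u'<\xe9&amp;bogus'   (unknown ref: ampersand escaped,
            #                     #    trailing semicolon dropped, as described above)
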
   1446     def handle_decl(self, data):
   1447         "Handle DOCTYPEs and the like as Declaration objects."
   1448         self._toStringSubclass(data, Declaration)
   1449 
   1450     def parse_declaration(self, i):
   1451         """Treat a bogus SGML declaration as raw data. Treat a CDATA
   1452         declaration as a CData object."""
   1453         j = None
   1454         if self.rawdata[i:i+9] == '<![CDATA[':
   1455              k = self.rawdata.find(']]>', i)
   1456              if k == -1:
   1457                  k = len(self.rawdata)
   1458              data = self.rawdata[i+9:k]
   1459              j = k+3
   1460              self._toStringSubclass(data, CData)
   1461         else:
   1462             try:
   1463                 j = SGMLParser.parse_declaration(self, i)
   1464             except SGMLParseError:
   1465                 toHandle = self.rawdata[i:]
   1466                 self.handle_data(toHandle)
   1467                 j = i + len(toHandle)
   1468         return j
   1469 
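        # A minimal sketch (adapted from the Beautiful Soup documentation) of the
        # per-instance self-closing tags checked by isSelfClosingTag above; the
        # selfClosingTags constructor argument feeds instanceSelfClosingTags:
        #
        #   plain = BeautifulStoneSoup('<tag>Text 1<selfclosing>Text 2')
        #   plain.selfclosing.string     # -> u'Text 2' (swallowed by <selfclosing>)
        #
        #   fixed = BeautifulStoneSoup('<tag>Text 1<selfclosing>Text 2',
        #                              selfClosingTags=['selfclosing'])
        #   fixed.selfclosing.contents   # -> [] ; 'Text 2' stays inside <tag>
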
   1470 class BeautifulSoup(BeautifulStoneSoup):
   1471 
   1472     """This parser knows the following facts about HTML:
   1473 
   1474     * Some tags have no closing tag and should be interpreted as being
   1475       closed as soon as they are encountered.
   1476 
   1477     * The text inside some tags (i.e. 'script') may contain tags which
   1478       are not really part of the document and which should be parsed
   1479       as text, not tags. If you want to parse the text as tags, you can
   1480       always fetch it and parse it explicitly.
   1481 
   1482     * Tag nesting rules:
   1483 
   1484       Most tags can't be nested at all. For instance, the occurrence of
   1485       a <p> tag should implicitly close the previous <p> tag.
   1486 
   1487        <p>Para1<p>Para2
   1488         should be transformed into:
   1489        <p>Para1</p><p>Para2
   1490 
   1491       Some tags can be nested arbitrarily. For instance, the occurrence
   1492       of a <blockquote> tag should _not_ implicitly close the previous
   1493       <blockquote> tag.
   1494 
   1495        Alice said: <blockquote>Bob said: <blockquote>Blah
   1496         should NOT be transformed into:
   1497        Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
   1498 
   1499       Some tags can be nested, but the nesting is reset by the
   1500       interposition of other tags. For instance, a <tr> tag should
   1501       implicitly close the previous <tr> tag within the same <table>,
   1502       but not close a <tr> tag in another table.
   1503 
   1504        <table><tr>Blah<tr>Blah
   1505         should be transformed into:
   1506        <table><tr>Blah</tr><tr>Blah
   1507         but,
   1508        <tr>Blah<table><tr>Blah
   1509         should NOT be transformed into
   1510        <tr>Blah<table></tr><tr>Blah
   1511 
   1512     Differing assumptions about tag nesting rules are a major source
   1513     of problems with the BeautifulSoup class. If BeautifulSoup is not
   1514     treating as nestable a tag your page author treats as nestable,
   1515     try ICantBelieveItsBeautifulSoup, MinimalSoup, or
   1516     BeautifulStoneSoup before writing your own subclass."""
   1517 
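            # A minimal sketch of the nesting rules above (markup made up):
            #
            #   soup = BeautifulSoup('<p>Para1<p>Para2')
            #   [p.contents for p in soup.findAll('p')]    # -> [[u'Para1'], [u'Para2']]
            #
            #   soup = BeautifulSoup('Alice said: <blockquote>Bob said: <blockquote>Blah')
            #   soup.blockquote.blockquote.string          # -> u'Blah' (still nested)
            #
            #   soup = BeautifulSoup('<table><tr>Blah<tr>Blah')
            #   len(soup.table.findAll('tr'))              # -> 2 (second <tr> closed the first)
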
   1518     def __init__(self, *args, **kwargs):
   1519         if not kwargs.has_key('smartQuotesTo'):
   1520             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
   1521         kwargs['isHTML'] = True
   1522         BeautifulStoneSoup.__init__(self, *args, **kwargs)
   1523 
   1524     SELF_CLOSING_TAGS = buildTagMap(None,
   1525                                     ('br' , 'hr', 'input', 'img', 'meta',
   1526                                     'spacer', 'link', 'frame', 'base', 'col'))
   1527 
   1528     PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
   1529 
   1530     QUOTE_TAGS = {'script' : None, 'textarea' : None}
   1531 
   1532     #According to the HTML standard, each of these inline tags can
   1533     #contain another tag of the same type. Furthermore, it's common
   1534     #to actually use these tags this way.
   1535     NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
   1536                             'center')
   1537 
   1538     #According to the HTML standard, these block tags can contain
   1539     #another tag of the same type. Furthermore, it's common
   1540     #to actually use these tags this way.
   1541     NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')
   1542 
   1543     #Lists can contain other lists, but there are restrictions.
   1544     NESTABLE_LIST_TAGS = { 'ol' : [],
   1545                            'ul' : [],
   1546                            'li' : ['ul', 'ol'],
   1547                            'dl' : [],
   1548                            'dd' : ['dl'],
   1549                            'dt' : ['dl'] }
   1550 
   1551     #Tables can contain other tables, but there are restrictions.
   1552     NESTABLE_TABLE_TAGS = {'table' : [],
   1553                            'tr' : ['table', 'tbody', 'tfoot', 'thead'],
   1554                            'td' : ['tr'],
   1555                            'th' : ['tr'],
   1556                            'thead' : ['table'],
   1557                            'tbody' : ['table'],
   1558                            'tfoot' : ['table'],
   1559                            }
   1560 
   1561     NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')
   1562 
   1563     #If one of these tags is encountered, all tags up to the next tag of
   1564     #this type are popped.
   1565     RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
   1566                                      NON_NESTABLE_BLOCK_TAGS,
   1567                                      NESTABLE_LIST_TAGS,
   1568                                      NESTABLE_TABLE_TAGS)
   1569 
   1570     NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
   1571                                 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
   1572 
   1573     # Used to detect the charset in a META tag; see start_meta
   1574     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
   1575 
   1576     def start_meta(self, attrs):
   1577         """Beautiful Soup can detect a charset included in a META tag,
   1578         try to convert the document to that charset, and re-parse the
   1579         document from the beginning."""
   1580         httpEquiv = None
   1581         contentType = None
   1582         contentTypeIndex = None
   1583         tagNeedsEncodingSubstitution = False
   1584 
   1585         for i in range(0, len(attrs)):
   1586             key, value = attrs[i]
   1587             key = key.lower()
   1588             if key == 'http-equiv':
   1589                 httpEquiv = value
   1590             elif key == 'content':
   1591                 contentType = value
   1592                 contentTypeIndex = i
   1593 
   1594         if httpEquiv and contentType: # It's an interesting meta tag.
   1595             match = self.CHARSET_RE.search(contentType)
   1596             if match:
   1597                 if (self.declaredHTMLEncoding is not None or
   1598                     self.originalEncoding == self.fromEncoding):
   1599                     # An HTML encoding was sniffed while converting
   1600                     # the document to Unicode, or an HTML encoding was
   1601                     # sniffed during a previous pass through the
   1602                     # document, or an encoding was specified
   1603                     # explicitly and it worked. Rewrite the meta tag.
   1604                     def rewrite(match):
   1605                         return match.group(1) + "%SOUP-ENCODING%"
   1606                     newAttr = self.CHARSET_RE.sub(rewrite, contentType)
   1607                     attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
   1608                                                newAttr)
   1609                     tagNeedsEncodingSubstitution = True
   1610                 else:
   1611                     # This is our first pass through the document.
   1612                     # Go through it again with the encoding information.
   1613                     newCharset = match.group(3)
   1614                     if newCharset and newCharset != self.originalEncoding:
   1615                         self.declaredHTMLEncoding = newCharset
   1616                         self._feed(self.declaredHTMLEncoding)
   1617                         raise StopParsing
   1618                     pass
   1619         tag = self.unknown_starttag("meta", attrs)
   1620         if tag and tagNeedsEncodingSubstitution:
   1621             tag.containsSubstitutions = True
   1622 
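        # A minimal sketch of the meta-charset handling in start_meta above
        # (the Hebrew snippet is adapted from the Beautiful Soup documentation):
        #
        #   markup = ('<html><head><meta http-equiv="Content-Type" '
        #             'content="text/html; charset=ISO-8859-8">'
        #             '</head><body><p>\xed\xe5\xec\xf9</p></body></html>')
        #   soup = BeautifulSoup(markup)
        #   soup.originalEncoding    # -> 'iso-8859-8'
        #   # The charset in the rewritten content attribute becomes %SOUP-ENCODING%,
        #   # so the rendered meta tag advertises whatever encoding the soup is
        #   # later output in.
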
   1623 class StopParsing(Exception):
   1624     pass
   1625 
   1626 class ICantBelieveItsBeautifulSoup(BeautifulSoup):
   1627 
   1628     """The BeautifulSoup class is oriented towards skipping over
   1629     common HTML errors like unclosed tags. However, sometimes it makes
   1630     errors of its own. For instance, consider this fragment:
   1631 
   1632      <b>Foo<b>Bar</b></b>
   1633 
   1634     This is perfectly valid (if bizarre) HTML. However, the
   1635     BeautifulSoup class will implicitly close the first b tag when it
   1636     encounters the second 'b'. It will think the author wrote
   1637     "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
   1638     there's no real-world reason to bold something that's already
   1639     bold. When it encounters '</b></b>' it will close two more 'b'
   1640     tags, for a grand total of three tags closed instead of two. This
   1641     can throw off the rest of your document structure. The same is
   1642     true of a number of other tags, listed below.
   1643 
   1644     It's much more common for someone to forget to close a 'b' tag
   1645     than to actually use nested 'b' tags, and the BeautifulSoup class
   1646     handles the common case. This class handles the not-so-common
   1647     case: where you can't believe someone wrote what they did, but
   1648     it's valid HTML and BeautifulSoup screwed up by assuming it
   1649     wouldn't be."""
   1650 
   1651     I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
   1652      ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
   1653       'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b')
   1655 
   1656     I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)
   1657 
   1658     NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
   1659                                 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
   1660                                 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
   1661 
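        # A minimal sketch of the difference described above:
        #
        #   BeautifulSoup('<b>Foo<b>Bar</b></b>').b.findAll('b')
        #     # -> [] : the second <b> implicitly closed the first
        #   ICantBelieveItsBeautifulSoup('<b>Foo<b>Bar</b></b>').b.findAll('b')
        #     # -> [<b>Bar</b>] : the inner tag stays nested, as the author intended
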
   1662 class MinimalSoup(BeautifulSoup):
   1663     """The MinimalSoup class is for parsing HTML that contains
   1664     pathologically bad markup. It makes no assumptions about tag
   1665     nesting, but it does know which tags are self-closing, that
   1666     <script> tags contain Javascript and should not be parsed, that
   1667     META tags may contain encoding information, and so on.
   1668 
   1669     This also makes it better for subclassing than BeautifulStoneSoup
   1670     or BeautifulSoup."""
   1671 
   1672     RESET_NESTING_TAGS = buildTagMap('noscript')
   1673     NESTABLE_TAGS = {}
   1674 
   1675 class BeautifulSOAP(BeautifulStoneSoup):
   1676     """This class will push a tag with only a single string child into
   1677     the tag's parent as an attribute. The attribute's name is the tag
   1678     name, and the value is the string child. An example should give
   1679     the flavor of the change:
   1680 
   1681     <foo><bar>baz</bar></foo>
   1682      =>
   1683     <foo bar="baz"><bar>baz</bar></foo>
   1684 
   1685     You can then access fooTag['bar'] instead of fooTag.barTag.string.
   1686 
   1687     This is, of course, useful for scraping structures that tend to
   1688     use subelements instead of attributes, such as SOAP messages. Note
   1689     that it modifies its input, so don't print the modified version
   1690     out.
   1691 
   1692     I'm not sure how many people really want to use this class; let me
   1693     know if you do. Mainly I like the name."""
   1694 
   1695     def popTag(self):
   1696         if len(self.tagStack) > 1:
   1697             tag = self.tagStack[-1]
   1698             parent = self.tagStack[-2]
   1699             parent._getAttrMap()
   1700             if (isinstance(tag, Tag) and len(tag.contents) == 1 and
   1701                 isinstance(tag.contents[0], NavigableString) and
   1702                 not parent.attrMap.has_key(tag.name)):
   1703                 parent[tag.name] = tag.contents[0]
   1704         BeautifulStoneSoup.popTag(self)
   1705 
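        # A minimal sketch of the attribute promotion described above:
        #
        #   soup = BeautifulSOAP('<foo><bar>baz</bar></foo>')
        #   soup.foo['bar']       # -> u'baz' (promoted when </bar> was popped)
        #   soup.foo.bar.string   # -> u'baz' (the original child element is kept)
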
   1706 #Enterprise class names! It has come to our attention that some people
   1707 #think the names of the Beautiful Soup parser classes are too silly
   1708 #and "unprofessional" for use in enterprise screen-scraping. We feel
   1709 #your pain! For such-minded folk, the Beautiful Soup Consortium And
   1710 #All-Night Kosher Bakery recommends renaming this file to
   1711 #"RobustParser.py" (or, in cases of extreme enterprisiness,
   1712 #"RobustParserBeanInterface.class") and using the following
   1713 #enterprise-friendly class aliases:
   1714 class RobustXMLParser(BeautifulStoneSoup):
   1715     pass
   1716 class RobustHTMLParser(BeautifulSoup):
   1717     pass
   1718 class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
   1719     pass
   1720 class RobustInsanelyWackAssHTMLParser(MinimalSoup):
   1721     pass
   1722 class SimplifyingSOAPParser(BeautifulSOAP):
   1723     pass
   1724 
   1725 ######################################################
   1726 #
   1727 # Bonus library: Unicode, Dammit
   1728 #
   1729 # This class forces XML data into a standard format (usually to UTF-8
   1730 # or Unicode).  It is heavily based on code from Mark Pilgrim's
   1731 # Universal Feed Parser. It does not rewrite the XML or HTML to
   1732 # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
   1733 # (XML) and BeautifulSoup.start_meta (HTML).
   1734 
   1735 # Autodetects character encodings.
   1736 # Download from http://chardet.feedparser.org/
   1737 try:
   1738     import chardet
   1739 #    import chardet.constants
   1740 #    chardet.constants._debug = 1
   1741 except ImportError:
   1742     chardet = None
   1743 
   1744 # cjkcodecs and iconv_codec make Python know about more character encodings.
   1745 # Both are available from http://cjkpython.i18n.org/
   1746 # They're built in if you use Python 2.4.
   1747 try:
   1748     import cjkcodecs.aliases
   1749 except ImportError:
   1750     pass
   1751 try:
   1752     import iconv_codec
   1753 except ImportError:
   1754     pass
   1755 
   1756 class UnicodeDammit:
   1757     """A class for detecting the encoding of a *ML document and
   1758     converting it to a Unicode string. If the source encoding is
   1759     windows-1252, can replace MS smart quotes with their HTML or XML
   1760     equivalents."""
   1761 
   1762     # This dictionary maps commonly seen values for "charset" in HTML
   1763     # meta tags to the corresponding Python codec names. It only covers
   1764     # values that aren't in Python's aliases and can't be determined
   1765     # by the heuristics in find_codec.
   1766     CHARSET_ALIASES = { "macintosh" : "mac-roman",
   1767                         "x-sjis" : "shift-jis" }
   1768 
   1769     def __init__(self, markup, overrideEncodings=[],
   1770                  smartQuotesTo='xml', isHTML=False):
   1771         self.declaredHTMLEncoding = None
   1772         self.markup, documentEncoding, sniffedEncoding = \
   1773                      self._detectEncoding(markup, isHTML)
   1774         self.smartQuotesTo = smartQuotesTo
   1775         self.triedEncodings = []
   1776         if markup == '' or isinstance(markup, unicode):
   1777             self.originalEncoding = None
   1778             self.unicode = unicode(markup)
   1779             return
   1780 
   1781         u = None
   1782         for proposedEncoding in overrideEncodings:
   1783             u = self._convertFrom(proposedEncoding)
   1784             if u: break
   1785         if not u:
   1786             for proposedEncoding in (documentEncoding, sniffedEncoding):
   1787                 u = self._convertFrom(proposedEncoding)
   1788                 if u: break
   1789 
   1790         # If no luck and we have an auto-detection library, try that:
   1791         if not u and chardet and not isinstance(self.markup, unicode):
   1792             u = self._convertFrom(chardet.detect(self.markup)['encoding'])
   1793 
   1794         # As a last resort, try utf-8 and windows-1252:
   1795         if not u:
   1796             for proposed_encoding in ("utf-8", "windows-1252"):
   1797                 u = self._convertFrom(proposed_encoding)
   1798                 if u: break
   1799 
   1800         self.unicode = u
   1801         if not u: self.originalEncoding = None
   1802 
   1803     def _subMSChar(self, orig):
   1804         """Changes a MS smart quote character to an XML or HTML
   1805         entity."""
   1806         sub = self.MS_CHARS.get(orig)
   1807         if isinstance(sub, tuple):
   1808             if self.smartQuotesTo == 'xml':
   1809                 sub = '&#x%s;' % sub[1]
   1810             else:
   1811                 sub = '&%s;' % sub[0]
   1812         return sub
   1813 
   1814     def _convertFrom(self, proposed):
   1815         proposed = self.find_codec(proposed)
   1816         if not proposed or proposed in self.triedEncodings:
   1817             return None
   1818         self.triedEncodings.append(proposed)
   1819         markup = self.markup
   1820 
   1821         # Convert smart quotes to HTML if coming from an encoding
   1822         # that might have them.
   1823         if self.smartQuotesTo and proposed.lower() in("windows-1252",
   1824                                                       "iso-8859-1",
   1825                                                       "iso-8859-2"):
   1826             markup = re.compile("([\x80-\x9f])").sub \
   1827                      (lambda(x): self._subMSChar(x.group(1)),
   1828                       markup)
   1829 
   1830         try:
   1831             # print "Trying to convert document to %s" % proposed
   1832             u = self._toUnicode(markup, proposed)
   1833             self.markup = u
   1834             self.originalEncoding = proposed
   1835         except Exception, e:
   1836             # print "That didn't work!"
   1837             # print e
   1838             return None
   1839         #print "Correct encoding: %s" % proposed
   1840         return self.markup
   1841 
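            # A minimal sketch of the smart-quote handling above (assuming chardet is
            # not installed, so the windows-1252 fallback is what succeeds):
            #
            #   UnicodeDammit('\x91Hi\x92', smartQuotesTo='html').unicode  # -> u'&lsquo;Hi&rsquo;'
            #   UnicodeDammit('\x91Hi\x92', smartQuotesTo='xml').unicode   # -> u'&#x2018;Hi&#x2019;'
            #   UnicodeDammit('\x91Hi\x92', smartQuotesTo=None).unicode    # -> u'\u2018Hi\u2019'
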
   1842     def _toUnicode(self, data, encoding):
   1843         '''Given a string and its encoding, decodes the string into Unicode.
   1844         %encoding is a string recognized by encodings.aliases'''
   1845 
   1846         # strip Byte Order Mark (if present)
   1847         if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
   1848                and (data[2:4] != '\x00\x00'):
   1849             encoding = 'utf-16be'
   1850             data = data[2:]
   1851         elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
   1852                  and (data[2:4] != '\x00\x00'):
   1853             encoding = 'utf-16le'
   1854             data = data[2:]
   1855         elif data[:3] == '\xef\xbb\xbf':
   1856             encoding = 'utf-8'
   1857             data = data[3:]
   1858         elif data[:4] == '\x00\x00\xfe\xff':
   1859             encoding = 'utf-32be'
   1860             data = data[4:]
   1861         elif data[:4] == '\xff\xfe\x00\x00':
   1862             encoding = 'utf-32le'
   1863             data = data[4:]
   1864         newdata = unicode(data, encoding)
   1865         return newdata
   1866 
   1867     def _detectEncoding(self, xml_data, isHTML=False):
   1868         """Given a document, tries to detect its XML encoding."""
   1869         xml_encoding = sniffed_xml_encoding = None
   1870         try:
   1871             if xml_data[:4] == '\x4c\x6f\xa7\x94':
   1872                 # EBCDIC
   1873                 xml_data = self._ebcdic_to_ascii(xml_data)
   1874             elif xml_data[:4] == '\x00\x3c\x00\x3f':
   1875                 # UTF-16BE
   1876                 sniffed_xml_encoding = 'utf-16be'
   1877                 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
   1878             elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
   1879                      and (xml_data[2:4] != '\x00\x00'):
   1880                 # UTF-16BE with BOM
   1881                 sniffed_xml_encoding = 'utf-16be'
   1882                 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
   1883             elif xml_data[:4] == '\x3c\x00\x3f\x00':
   1884                 # UTF-16LE
   1885                 sniffed_xml_encoding = 'utf-16le'
   1886                 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
   1887             elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
   1888                      (xml_data[2:4] != '\x00\x00'):
   1889                 # UTF-16LE with BOM
   1890                 sniffed_xml_encoding = 'utf-16le'
   1891                 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
   1892             elif xml_data[:4] == '\x00\x00\x00\x3c':
   1893                 # UTF-32BE
   1894                 sniffed_xml_encoding = 'utf-32be'
   1895                 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
   1896             elif xml_data[:4] == '\x3c\x00\x00\x00':
   1897                 # UTF-32LE
   1898                 sniffed_xml_encoding = 'utf-32le'
   1899                 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
   1900             elif xml_data[:4] == '\x00\x00\xfe\xff':
   1901                 # UTF-32BE with BOM
   1902                 sniffed_xml_encoding = 'utf-32be'
   1903                 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
   1904             elif xml_data[:4] == '\xff\xfe\x00\x00':
   1905                 # UTF-32LE with BOM
   1906                 sniffed_xml_encoding = 'utf-32le'
   1907                 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
   1908             elif xml_data[:3] == '\xef\xbb\xbf':
   1909                 # UTF-8 with BOM
   1910                 sniffed_xml_encoding = 'utf-8'
   1911                 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
   1912             else:
   1913                 sniffed_xml_encoding = 'ascii'
   1914                 pass
   1915         except:
   1916             xml_encoding_match = None
   1917         xml_encoding_match = re.compile(
   1918             '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
   1919         if not xml_encoding_match and isHTML:
   1920             regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
   1921             xml_encoding_match = regexp.search(xml_data)
   1922         if xml_encoding_match is not None:
   1923             xml_encoding = xml_encoding_match.groups()[0].lower()
   1924             if isHTML:
   1925                 self.declaredHTMLEncoding = xml_encoding
   1926             if sniffed_xml_encoding and \
   1927                (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
   1928                                  'iso-10646-ucs-4', 'ucs-4', 'csucs4',
   1929                                  'utf-16', 'utf-32', 'utf_16', 'utf_32',
   1930                                  'utf16', 'u16')):
   1931                 xml_encoding = sniffed_xml_encoding
   1932         return xml_data, xml_encoding, sniffed_xml_encoding
   1933 
   1934 
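            # A minimal sketch of the byte-order-mark sniffing above:
            #
            #   UnicodeDammit('hello').originalEncoding               # -> 'ascii'
            #   UnicodeDammit('\xef\xbb\xbfhello').originalEncoding   # -> 'utf-8'
            #   UnicodeDammit('\xef\xbb\xbfhello').unicode            # -> u'hello' (BOM stripped)
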
   1935     def find_codec(self, charset):
   1936         return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
   1937                or (charset and self._codec(charset.replace("-", ""))) \
   1938                or (charset and self._codec(charset.replace("-", "_"))) \
   1939                or charset
   1940 
   1941     def _codec(self, charset):
   1942         if not charset: return charset
   1943         codec = None
   1944         try:
   1945             codecs.lookup(charset)
   1946             codec = charset
   1947         except (LookupError, ValueError):
   1948             pass
   1949         return codec
   1950 
   1951     EBCDIC_TO_ASCII_MAP = None
   1952     def _ebcdic_to_ascii(self, s):
   1953         c = self.__class__
   1954         if not c.EBCDIC_TO_ASCII_MAP:
   1955             emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
   1956                     16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
   1957                     128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
   1958                     144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
   1959                     32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
   1960                     38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
   1961                     45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
   1962                     186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
   1963                     195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
   1964                     201,202,106,107,108,109,110,111,112,113,114,203,204,205,
   1965                     206,207,208,209,126,115,116,117,118,119,120,121,122,210,
   1966                     211,212,213,214,215,216,217,218,219,220,221,222,223,224,
   1967                     225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
   1968                     73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
   1969                     82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
   1970                     90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
   1971                     250,251,252,253,254,255)
   1972             import string
   1973             c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
   1974             ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
   1975         return s.translate(c.EBCDIC_TO_ASCII_MAP)
   1976 
   1977     MS_CHARS = { '\x80' : ('euro', '20AC'),
   1978                  '\x81' : ' ',
   1979                  '\x82' : ('sbquo', '201A'),
   1980                  '\x83' : ('fnof', '192'),
   1981                  '\x84' : ('bdquo', '201E'),
   1982                  '\x85' : ('hellip', '2026'),
   1983                  '\x86' : ('dagger', '2020'),
   1984                  '\x87' : ('Dagger', '2021'),
   1985                  '\x88' : ('circ', '2C6'),
   1986                  '\x89' : ('permil', '2030'),
   1987                  '\x8A' : ('Scaron', '160'),
   1988                  '\x8B' : ('lsaquo', '2039'),
   1989                  '\x8C' : ('OElig', '152'),
   1990                  '\x8D' : '?',
   1991                  '\x8E' : ('#x17D', '17D'),
   1992                  '\x8F' : '?',
   1993                  '\x90' : '?',
   1994                  '\x91' : ('lsquo', '2018'),
   1995                  '\x92' : ('rsquo', '2019'),
   1996                  '\x93' : ('ldquo', '201C'),
   1997                  '\x94' : ('rdquo', '201D'),
   1998                  '\x95' : ('bull', '2022'),
   1999                  '\x96' : ('ndash', '2013'),
   2000                  '\x97' : ('mdash', '2014'),
   2001                  '\x98' : ('tilde', '2DC'),
   2002                  '\x99' : ('trade', '2122'),
   2003                  '\x9a' : ('scaron', '161'),
   2004                  '\x9b' : ('rsaquo', '203A'),
   2005                  '\x9c' : ('oelig', '153'),
   2006                  '\x9d' : '?',
   2007                  '\x9e' : ('#x17E', '17E'),
   2008                  '\x9f' : ('Yuml', '178'),} # Yuml is U+0178
   2009 
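        # A minimal sketch of using UnicodeDammit on its own (what originalEncoding
        # ends up being depends on whether chardet is installed; without it, the
        # windows-1252 fallback usually wins for Latin-1-ish bytes):
        #
        #   dammit = UnicodeDammit('Sacr\xe9 bleu!', smartQuotesTo=None)
        #   dammit.unicode            # -> u'Sacr\xe9 bleu!'
        #   dammit.originalEncoding   # -> typically 'windows-1252'
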
   2010 #######################################################################
   2011 
   2012 
   2013 #By default, act as an HTML pretty-printer.
   2014 if __name__ == '__main__':
   2015     import sys
   2016     soup = BeautifulSoup(sys.stdin)
   2017     print soup.prettify()
   2018