Home | History | Annotate | Download | only in bs4
      1 """Beautiful Soup
      2 Elixir and Tonic
      3 "The Screen-Scraper's Friend"
      4 http://www.crummy.com/software/BeautifulSoup/
      5 
      6 Beautiful Soup uses a pluggable XML or HTML parser to parse a
      7 (possibly invalid) document into a tree representation. Beautiful Soup
      8 provides methods and Pythonic idioms that make it easy to
      9 navigate, search, and modify the parse tree.
     10 
     11 Beautiful Soup works with Python 2.6 and up. It works better if lxml
     12 and/or html5lib is installed.
     13 
     14 For more than you ever wanted to know about Beautiful Soup, see the
     15 documentation:
     16 http://www.crummy.com/software/BeautifulSoup/bs4/doc/
     17 """
     18 
# Package metadata, exposed as module-level attributes.
__author__ = "Leonard Richardson (leonardr (at] segfault.org)"
__version__ = "4.3.2"
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
__license__ = "MIT"

# Only the main class is exported by a star-import of this module.
__all__ = ['BeautifulSoup']
     25 
     26 import os
     27 import re
     28 import warnings
     29 
     30 from .builder import builder_registry, ParserRejectedMarkup
     31 from .dammit import UnicodeDammit
     32 from .element import (
     33     CData,
     34     Comment,
     35     DEFAULT_OUTPUT_ENCODING,
     36     Declaration,
     37     Doctype,
     38     NavigableString,
     39     PageElement,
     40     ProcessingInstruction,
     41     ResultSet,
     42     SoupStrainer,
     43     Tag,
     44     )
     45 
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
#
# The message is carried by a u'' string literal on purpose: a Python 3
# interpreter that rejects the u prefix (3.0-3.2) stops here with a
# SyntaxError whose traceback displays this source line, and therefore
# the message. Keep the whole literal on a single line so that all of it
# is shown. The variable itself is not used elsewhere in this module.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
     49 
     50 class BeautifulSoup(Tag):
     51     """
     52     This class defines the basic interface called by the tree builders.
     53 
     54     These methods will be called by the parser:
     55       reset()
     56       feed(markup)
     57 
     58     The tree builder may call these methods from its feed() implementation:
     59       handle_starttag(name, attrs) # See note about return value
     60       handle_endtag(name)
     61       handle_data(data) # Appends to the current data node
     62       endData(containerClass=NavigableString) # Ends the current data node
     63 
     64     No matter how complicated the underlying parser is, you should be
     65     able to build a tree using 'start tag' events, 'end tag' events,
     66     'data' events, and "done with data" events.
     67 
     68     If you encounter an empty-element tag (aka a self-closing tag,
     69     like HTML's <br> tag), call handle_starttag and then
     70     handle_endtag.
     71     """
     72     ROOT_TAG_NAME = u'[document]'
     73 
     74     # If the end-user gives no indication which tree builder they
     75     # want, look for one with these features.
     76     DEFAULT_BUILDER_FEATURES = ['html', 'fast']
     77 
     78     ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
     79 
     80     def __init__(self, markup="", features=None, builder=None,
     81                  parse_only=None, from_encoding=None, **kwargs):
     82         """The Soup object is initialized as the 'root tag', and the
     83         provided markup (which can be a string or a file-like object)
     84         is fed into the underlying parser."""
     85 
     86         if 'convertEntities' in kwargs:
     87             warnings.warn(
     88                 "BS4 does not respect the convertEntities argument to the "
     89                 "BeautifulSoup constructor. Entities are always converted "
     90                 "to Unicode characters.")
     91 
     92         if 'markupMassage' in kwargs:
     93             del kwargs['markupMassage']
     94             warnings.warn(
     95                 "BS4 does not respect the markupMassage argument to the "
     96                 "BeautifulSoup constructor. The tree builder is responsible "
     97                 "for any necessary markup massage.")
     98 
     99         if 'smartQuotesTo' in kwargs:
    100             del kwargs['smartQuotesTo']
    101             warnings.warn(
    102                 "BS4 does not respect the smartQuotesTo argument to the "
    103                 "BeautifulSoup constructor. Smart quotes are always converted "
    104                 "to Unicode characters.")
    105 
    106         if 'selfClosingTags' in kwargs:
    107             del kwargs['selfClosingTags']
    108             warnings.warn(
    109                 "BS4 does not respect the selfClosingTags argument to the "
    110                 "BeautifulSoup constructor. The tree builder is responsible "
    111                 "for understanding self-closing tags.")
    112 
    113         if 'isHTML' in kwargs:
    114             del kwargs['isHTML']
    115             warnings.warn(
    116                 "BS4 does not respect the isHTML argument to the "
    117                 "BeautifulSoup constructor. You can pass in features='html' "
    118                 "or features='xml' to get a builder capable of handling "
    119                 "one or the other.")
    120 
    121         def deprecated_argument(old_name, new_name):
    122             if old_name in kwargs:
    123                 warnings.warn(
    124                     'The "%s" argument to the BeautifulSoup constructor '
    125                     'has been renamed to "%s."' % (old_name, new_name))
    126                 value = kwargs[old_name]
    127                 del kwargs[old_name]
    128                 return value
    129             return None
    130 
    131         parse_only = parse_only or deprecated_argument(
    132             "parseOnlyThese", "parse_only")
    133 
    134         from_encoding = from_encoding or deprecated_argument(
    135             "fromEncoding", "from_encoding")
    136 
    137         if len(kwargs) > 0:
    138             arg = kwargs.keys().pop()
    139             raise TypeError(
    140                 "__init__() got an unexpected keyword argument '%s'" % arg)
    141 
    142         if builder is None:
    143             if isinstance(features, basestring):
    144                 features = [features]
    145             if features is None or len(features) == 0:
    146                 features = self.DEFAULT_BUILDER_FEATURES
    147             builder_class = builder_registry.lookup(*features)
    148             if builder_class is None:
    149                 raise FeatureNotFound(
    150                     "Couldn't find a tree builder with the features you "
    151                     "requested: %s. Do you need to install a parser library?"
    152                     % ",".join(features))
    153             builder = builder_class()
    154         self.builder = builder
    155         self.is_xml = builder.is_xml
    156         self.builder.soup = self
    157 
    158         self.parse_only = parse_only
    159 
    160         if hasattr(markup, 'read'):        # It's a file-type object.
    161             markup = markup.read()
    162         elif len(markup) <= 256:
    163             # Print out warnings for a couple beginner problems
    164             # involving passing non-markup to Beautiful Soup.
    165             # Beautiful Soup will still parse the input as markup,
    166             # just in case that's what the user really wants.
    167             if (isinstance(markup, unicode)
    168                 and not os.path.supports_unicode_filenames):
    169                 possible_filename = markup.encode("utf8")
    170             else:
    171                 possible_filename = markup
    172             is_file = False
    173             try:
    174                 is_file = os.path.exists(possible_filename)
    175             except Exception, e:
    176                 # This is almost certainly a problem involving
    177                 # characters not valid in filenames on this
    178                 # system. Just let it go.
    179                 pass
    180             if is_file:
    181                 warnings.warn(
    182                     '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
    183             if markup[:5] == "http:" or markup[:6] == "https:":
    184                 # TODO: This is ugly but I couldn't get it to work in
    185                 # Python 3 otherwise.
    186                 if ((isinstance(markup, bytes) and not b' ' in markup)
    187                     or (isinstance(markup, unicode) and not u' ' in markup)):
    188                     warnings.warn(
    189                         '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
    190 
    191         for (self.markup, self.original_encoding, self.declared_html_encoding,
    192          self.contains_replacement_characters) in (
    193             self.builder.prepare_markup(markup, from_encoding)):
    194             self.reset()
    195             try:
    196                 self._feed()
    197                 break
    198             except ParserRejectedMarkup:
    199                 pass
    200 
    201         # Clear out the markup and remove the builder's circular
    202         # reference to this object.
    203         self.markup = None
    204         self.builder.soup = None
    205 
    206     def _feed(self):
    207         # Convert the document to Unicode.
    208         self.builder.reset()
    209 
    210         self.builder.feed(self.markup)
    211         # Close out any unfinished strings and close all the open tags.
    212         self.endData()
    213         while self.currentTag.name != self.ROOT_TAG_NAME:
    214             self.popTag()
    215 
    216     def reset(self):
    217         Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
    218         self.hidden = 1
    219         self.builder.reset()
    220         self.current_data = []
    221         self.currentTag = None
    222         self.tagStack = []
    223         self.preserve_whitespace_tag_stack = []
    224         self.pushTag(self)
    225 
    226     def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
    227         """Create a new tag associated with this soup."""
    228         return Tag(None, self.builder, name, namespace, nsprefix, attrs)
    229 
    230     def new_string(self, s, subclass=NavigableString):
    231         """Create a new NavigableString associated with this soup."""
    232         navigable = subclass(s)
    233         navigable.setup()
    234         return navigable
    235 
    236     def insert_before(self, successor):
    237         raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
    238 
    239     def insert_after(self, successor):
    240         raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
    241 
    242     def popTag(self):
    243         tag = self.tagStack.pop()
    244         if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
    245             self.preserve_whitespace_tag_stack.pop()
    246         #print "Pop", tag.name
    247         if self.tagStack:
    248             self.currentTag = self.tagStack[-1]
    249         return self.currentTag
    250 
    251     def pushTag(self, tag):
    252         #print "Push", tag.name
    253         if self.currentTag:
    254             self.currentTag.contents.append(tag)
    255         self.tagStack.append(tag)
    256         self.currentTag = self.tagStack[-1]
    257         if tag.name in self.builder.preserve_whitespace_tags:
    258             self.preserve_whitespace_tag_stack.append(tag)
    259 
    260     def endData(self, containerClass=NavigableString):
    261         if self.current_data:
    262             current_data = u''.join(self.current_data)
    263             # If whitespace is not preserved, and this string contains
    264             # nothing but ASCII spaces, replace it with a single space
    265             # or newline.
    266             if not self.preserve_whitespace_tag_stack:
    267                 strippable = True
    268                 for i in current_data:
    269                     if i not in self.ASCII_SPACES:
    270                         strippable = False
    271                         break
    272                 if strippable:
    273                     if '\n' in current_data:
    274                         current_data = '\n'
    275                     else:
    276                         current_data = ' '
    277 
    278             # Reset the data collector.
    279             self.current_data = []
    280 
    281             # Should we add this string to the tree at all?
    282             if self.parse_only and len(self.tagStack) <= 1 and \
    283                    (not self.parse_only.text or \
    284                     not self.parse_only.search(current_data)):
    285                 return
    286 
    287             o = containerClass(current_data)
    288             self.object_was_parsed(o)
    289 
    290     def object_was_parsed(self, o, parent=None, most_recent_element=None):
    291         """Add an object to the parse tree."""
    292         parent = parent or self.currentTag
    293         most_recent_element = most_recent_element or self._most_recent_element
    294         o.setup(parent, most_recent_element)
    295 
    296         if most_recent_element is not None:
    297             most_recent_element.next_element = o
    298         self._most_recent_element = o
    299         parent.contents.append(o)
    300 
    301     def _popToTag(self, name, nsprefix=None, inclusivePop=True):
    302         """Pops the tag stack up to and including the most recent
    303         instance of the given tag. If inclusivePop is false, pops the tag
    304         stack up to but *not* including the most recent instqance of
    305         the given tag."""
    306         #print "Popping to %s" % name
    307         if name == self.ROOT_TAG_NAME:
    308             # The BeautifulSoup object itself can never be popped.
    309             return
    310 
    311         most_recently_popped = None
    312 
    313         stack_size = len(self.tagStack)
    314         for i in range(stack_size - 1, 0, -1):
    315             t = self.tagStack[i]
    316             if (name == t.name and nsprefix == t.prefix):
    317                 if inclusivePop:
    318                     most_recently_popped = self.popTag()
    319                 break
    320             most_recently_popped = self.popTag()
    321 
    322         return most_recently_popped
    323 
    324     def handle_starttag(self, name, namespace, nsprefix, attrs):
    325         """Push a start tag on to the stack.
    326 
    327         If this method returns None, the tag was rejected by the
    328         SoupStrainer. You should proceed as if the tag had not occured
    329         in the document. For instance, if this was a self-closing tag,
    330         don't call handle_endtag.
    331         """
    332 
    333         # print "Start tag %s: %s" % (name, attrs)
    334         self.endData()
    335 
    336         if (self.parse_only and len(self.tagStack) <= 1
    337             and (self.parse_only.text
    338                  or not self.parse_only.search_tag(name, attrs))):
    339             return None
    340 
    341         tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
    342                   self.currentTag, self._most_recent_element)
    343         if tag is None:
    344             return tag
    345         if self._most_recent_element:
    346             self._most_recent_element.next_element = tag
    347         self._most_recent_element = tag
    348         self.pushTag(tag)
    349         return tag
    350 
    351     def handle_endtag(self, name, nsprefix=None):
    352         #print "End tag: " + name
    353         self.endData()
    354         self._popToTag(name, nsprefix)
    355 
    356     def handle_data(self, data):
    357         self.current_data.append(data)
    358 
    359     def decode(self, pretty_print=False,
    360                eventual_encoding=DEFAULT_OUTPUT_ENCODING,
    361                formatter="minimal"):
    362         """Returns a string or Unicode representation of this document.
    363         To get Unicode, pass None for encoding."""
    364 
    365         if self.is_xml:
    366             # Print the XML declaration
    367             encoding_part = ''
    368             if eventual_encoding != None:
    369                 encoding_part = ' encoding="%s"' % eventual_encoding
    370             prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
    371         else:
    372             prefix = u''
    373         if not pretty_print:
    374             indent_level = None
    375         else:
    376             indent_level = 0
    377         return prefix + super(BeautifulSoup, self).decode(
    378             indent_level, eventual_encoding, formatter)
    379 
    380 # Alias to make it easier to type import: 'from bs4 import _soup'
    381 _s = BeautifulSoup
    382 _soup = BeautifulSoup
    383 
    384 class BeautifulStoneSoup(BeautifulSoup):
    385     """Deprecated interface to an XML parser."""
    386 
    387     def __init__(self, *args, **kwargs):
    388         kwargs['features'] = 'xml'
    389         warnings.warn(
    390             'The BeautifulStoneSoup class is deprecated. Instead of using '
    391             'it, pass features="xml" into the BeautifulSoup constructor.')
    392         super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
    393 
    394 
    395 class StopParsing(Exception):
    396     pass
    397 
    398 class FeatureNotFound(ValueError):
    399     pass
    400 
    401 
    402 #By default, act as an HTML pretty-printer.
    403 if __name__ == '__main__':
    404     import sys
    405     soup = BeautifulSoup(sys.stdin)
    406     print soup.prettify()
    407