Home | History | Annotate | Download | only in bs4
      1 """Helper classes for tests."""
      2 
      3 import copy
      4 import functools
      5 import unittest
      6 from unittest import TestCase
      7 from bs4 import BeautifulSoup
      8 from bs4.element import (
      9     CharsetMetaAttributeValue,
     10     Comment,
     11     ContentMetaAttributeValue,
     12     Doctype,
     13     SoupStrainer,
     14 )
     15 
     16 from bs4.builder import HTMLParserTreeBuilder
# Tree builder class that SoupTest instantiates whenever a test does
# not pass its own 'builder' argument.
default_builder = HTMLParserTreeBuilder
     18 
     19 
     20 class SoupTest(unittest.TestCase):
     21 
     22     @property
     23     def default_builder(self):
     24         return default_builder()
     25 
     26     def soup(self, markup, **kwargs):
     27         """Build a Beautiful Soup object from markup."""
     28         builder = kwargs.pop('builder', self.default_builder)
     29         return BeautifulSoup(markup, builder=builder, **kwargs)
     30 
     31     def document_for(self, markup):
     32         """Turn an HTML fragment into a document.
     33 
     34         The details depend on the builder.
     35         """
     36         return self.default_builder.test_fragment_to_document(markup)
     37 
     38     def assertSoupEquals(self, to_parse, compare_parsed_to=None):
     39         builder = self.default_builder
     40         obj = BeautifulSoup(to_parse, builder=builder)
     41         if compare_parsed_to is None:
     42             compare_parsed_to = to_parse
     43 
     44         self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
     45 
     46 
     47 class HTMLTreeBuilderSmokeTest(object):
     48 
     49     """A basic test of a treebuilder's competence.
     50 
     51     Any HTML treebuilder, present or future, should be able to pass
     52     these tests. With invalid markup, there's room for interpretation,
     53     and different parsers can handle it differently. But with the
     54     markup in these tests, there's not much room for interpretation.
     55     """
     56 
     57     def assertDoctypeHandled(self, doctype_fragment):
     58         """Assert that a given doctype string is handled correctly."""
     59         doctype_str, soup = self._document_with_doctype(doctype_fragment)
     60 
     61         # Make sure a Doctype object was created.
     62         doctype = soup.contents[0]
     63         self.assertEqual(doctype.__class__, Doctype)
     64         self.assertEqual(doctype, doctype_fragment)
     65         self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
     66 
     67         # Make sure that the doctype was correctly associated with the
     68         # parse tree and that the rest of the document parsed.
     69         self.assertEqual(soup.p.contents[0], 'foo')
     70 
     71     def _document_with_doctype(self, doctype_fragment):
     72         """Generate and parse a document with the given doctype."""
     73         doctype = '<!DOCTYPE %s>' % doctype_fragment
     74         markup = doctype + '\n<p>foo</p>'
     75         soup = self.soup(markup)
     76         return doctype, soup
     77 
     78     def test_normal_doctypes(self):
     79         """Make sure normal, everyday HTML doctypes are handled correctly."""
     80         self.assertDoctypeHandled("html")
     81         self.assertDoctypeHandled(
     82             'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
     83 
     84     def test_empty_doctype(self):
     85         soup = self.soup("<!DOCTYPE>")
     86         doctype = soup.contents[0]
     87         self.assertEqual("", doctype.strip())
     88 
     89     def test_public_doctype_with_url(self):
     90         doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
     91         self.assertDoctypeHandled(doctype)
     92 
     93     def test_system_doctype(self):
     94         self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
     95 
     96     def test_namespaced_system_doctype(self):
     97         # We can handle a namespaced doctype with a system ID.
     98         self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
     99 
    100     def test_namespaced_public_doctype(self):
    101         # Test a namespaced doctype with a public id.
    102         self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
    103 
    104     def test_real_xhtml_document(self):
    105         """A real XHTML document should come out more or less the same as it went in."""
    106         markup = b"""<?xml version="1.0" encoding="utf-8"?>
    107 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
    108 <html xmlns="http://www.w3.org/1999/xhtml">
    109 <head><title>Hello.</title></head>
    110 <body>Goodbye.</body>
    111 </html>"""
    112         soup = self.soup(markup)
    113         self.assertEqual(
    114             soup.encode("utf-8").replace(b"\n", b""),
    115             markup.replace(b"\n", b""))
    116 
    117     def test_deepcopy(self):
    118         """Make sure you can copy the tree builder.
    119 
    120         This is important because the builder is part of a
    121         BeautifulSoup object, and we want to be able to copy that.
    122         """
    123         copy.deepcopy(self.default_builder)
    124 
    125     def test_p_tag_is_never_empty_element(self):
    126         """A <p> tag is never designated as an empty-element tag.
    127 
    128         Even if the markup shows it as an empty-element tag, it
    129         shouldn't be presented that way.
    130         """
    131         soup = self.soup("<p/>")
    132         self.assertFalse(soup.p.is_empty_element)
    133         self.assertEqual(str(soup.p), "<p></p>")
    134 
    135     def test_unclosed_tags_get_closed(self):
    136         """A tag that's not closed by the end of the document should be closed.
    137 
    138         This applies to all tags except empty-element tags.
    139         """
    140         self.assertSoupEquals("<p>", "<p></p>")
    141         self.assertSoupEquals("<b>", "<b></b>")
    142 
    143         self.assertSoupEquals("<br>", "<br/>")
    144 
    145     def test_br_is_always_empty_element_tag(self):
    146         """A <br> tag is designated as an empty-element tag.
    147 
    148         Some parsers treat <br></br> as one <br/> tag, some parsers as
    149         two tags, but it should always be an empty-element tag.
    150         """
    151         soup = self.soup("<br></br>")
    152         self.assertTrue(soup.br.is_empty_element)
    153         self.assertEqual(str(soup.br), "<br/>")
    154 
    155     def test_nested_formatting_elements(self):
    156         self.assertSoupEquals("<em><em></em></em>")
    157 
    158     def test_comment(self):
    159         # Comments are represented as Comment objects.
    160         markup = "<p>foo<!--foobar-->baz</p>"
    161         self.assertSoupEquals(markup)
    162 
    163         soup = self.soup(markup)
    164         comment = soup.find(text="foobar")
    165         self.assertEqual(comment.__class__, Comment)
    166 
    167         # The comment is properly integrated into the tree.
    168         foo = soup.find(text="foo")
    169         self.assertEqual(comment, foo.next_element)
    170         baz = soup.find(text="baz")
    171         self.assertEqual(comment, baz.previous_element)
    172 
    173     def test_preserved_whitespace_in_pre_and_textarea(self):
    174         """Whitespace must be preserved in <pre> and <textarea> tags."""
    175         self.assertSoupEquals("<pre>   </pre>")
    176         self.assertSoupEquals("<textarea> woo  </textarea>")
    177 
    178     def test_nested_inline_elements(self):
    179         """Inline elements can be nested indefinitely."""
    180         b_tag = "<b>Inside a B tag</b>"
    181         self.assertSoupEquals(b_tag)
    182 
    183         nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
    184         self.assertSoupEquals(nested_b_tag)
    185 
    186         double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
    187         self.assertSoupEquals(nested_b_tag)
    188 
    189     def test_nested_block_level_elements(self):
    190         """Block elements can be nested."""
    191         soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
    192         blockquote = soup.blockquote
    193         self.assertEqual(blockquote.p.b.string, 'Foo')
    194         self.assertEqual(blockquote.b.string, 'Foo')
    195 
    196     def test_correctly_nested_tables(self):
    197         """One table can go inside another one."""
    198         markup = ('<table id="1">'
    199                   '<tr>'
    200                   "<td>Here's another table:"
    201                   '<table id="2">'
    202                   '<tr><td>foo</td></tr>'
    203                   '</table></td>')
    204 
    205         self.assertSoupEquals(
    206             markup,
    207             '<table id="1"><tr><td>Here\'s another table:'
    208             '<table id="2"><tr><td>foo</td></tr></table>'
    209             '</td></tr></table>')
    210 
    211         self.assertSoupEquals(
    212             "<table><thead><tr><td>Foo</td></tr></thead>"
    213             "<tbody><tr><td>Bar</td></tr></tbody>"
    214             "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
    215 
    216     def test_deeply_nested_multivalued_attribute(self):
    217         # html5lib can set the attributes of the same tag many times
    218         # as it rearranges the tree. This has caused problems with
    219         # multivalued attributes.
    220         markup = '<table><div><div class="css"></div></div></table>'
    221         soup = self.soup(markup)
    222         self.assertEqual(["css"], soup.div.div['class'])
    223 
    224     def test_angle_brackets_in_attribute_values_are_escaped(self):
    225         self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
    226 
    227     def test_entities_in_attributes_converted_to_unicode(self):
    228         expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
    229         self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
    230         self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
    231         self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
    232         self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
    233 
    234     def test_entities_in_text_converted_to_unicode(self):
    235         expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
    236         self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
    237         self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
    238         self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
    239         self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
    240 
    241     def test_quot_entity_converted_to_quotation_mark(self):
    242         self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
    243                               '<p>I said "good day!"</p>')
    244 
    245     def test_out_of_range_entity(self):
    246         expect = u"\N{REPLACEMENT CHARACTER}"
    247         self.assertSoupEquals("&#10000000000000;", expect)
    248         self.assertSoupEquals("&#x10000000000000;", expect)
    249         self.assertSoupEquals("&#1000000000;", expect)
    250 
    251     def test_multipart_strings(self):
    252         "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
    253         soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
    254         self.assertEqual("p", soup.h2.string.next_element.name)
    255         self.assertEqual("p", soup.p.name)
    256 
    257     def test_basic_namespaces(self):
    258         """Parsers don't need to *understand* namespaces, but at the
    259         very least they should not choke on namespaces or lose
    260         data."""
    261 
    262         markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
    263         soup = self.soup(markup)
    264         self.assertEqual(markup, soup.encode())
    265         html = soup.html
    266         self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
    267         self.assertEqual(
    268             'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
    269         self.assertEqual(
    270             'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
    271 
    272     def test_multivalued_attribute_value_becomes_list(self):
    273         markup = b'<a class="foo bar">'
    274         soup = self.soup(markup)
    275         self.assertEqual(['foo', 'bar'], soup.a['class'])
    276 
    277     #
    278     # Generally speaking, tests below this point are more tests of
    279     # Beautiful Soup than tests of the tree builders. But parsers are
    280     # weird, so we run these tests separately for every tree builder
    281     # to detect any differences between them.
    282     #
    283 
    284     def test_can_parse_unicode_document(self):
    285         # A seemingly innocuous document... but it's in Unicode! And
    286         # it contains characters that can't be represented in the
    287         # encoding found in the  declaration! The horror!
    288         markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
    289         soup = self.soup(markup)
    290         self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
    291 
    292     def test_soupstrainer(self):
    293         """Parsers should be able to work with SoupStrainers."""
    294         strainer = SoupStrainer("b")
    295         soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
    296                          parse_only=strainer)
    297         self.assertEqual(soup.decode(), "<b>bold</b>")
    298 
    299     def test_single_quote_attribute_values_become_double_quotes(self):
    300         self.assertSoupEquals("<foo attr='bar'></foo>",
    301                               '<foo attr="bar"></foo>')
    302 
    303     def test_attribute_values_with_nested_quotes_are_left_alone(self):
    304         text = """<foo attr='bar "brawls" happen'>a</foo>"""
    305         self.assertSoupEquals(text)
    306 
    307     def test_attribute_values_with_double_nested_quotes_get_quoted(self):
    308         text = """<foo attr='bar "brawls" happen'>a</foo>"""
    309         soup = self.soup(text)
    310         soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
    311         self.assertSoupEquals(
    312             soup.foo.decode(),
    313             """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
    314 
    315     def test_ampersand_in_attribute_value_gets_escaped(self):
    316         self.assertSoupEquals('<this is="really messed up & stuff"></this>',
    317                               '<this is="really messed up &amp; stuff"></this>')
    318 
    319         self.assertSoupEquals(
    320             '<a href="http://example.org?a=1&b=2;3">foo</a>',
    321             '<a href="http://example.org?a=1&b=2;3">foo</a>')
    322 
    323     def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
    324         self.assertSoupEquals('<a href="http://example.org?a=1&b=2;3"></a>')
    325 
    326     def test_entities_in_strings_converted_during_parsing(self):
    327         # Both XML and HTML entities are converted to Unicode characters
    328         # during parsing.
    329         text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
    330         expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
    331         self.assertSoupEquals(text, expected)
    332 
    333     def test_smart_quotes_converted_on_the_way_in(self):
    334         # Microsoft smart quotes are converted to Unicode characters during
    335         # parsing.
    336         quote = b"<p>\x91Foo\x92</p>"
    337         soup = self.soup(quote)
    338         self.assertEqual(
    339             soup.p.string,
    340             u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
    341 
    342     def test_non_breaking_spaces_converted_on_the_way_in(self):
    343         soup = self.soup("<a>&nbsp;&nbsp;</a>")
    344         self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
    345 
    346     def test_entities_converted_on_the_way_out(self):
    347         text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
    348         expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
    349         soup = self.soup(text)
    350         self.assertEqual(soup.p.encode("utf-8"), expected)
    351 
    352     def test_real_iso_latin_document(self):
    353         # Smoke test of interrelated functionality, using an
    354         # easy-to-understand document.
    355 
    356         # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
    357         unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
    358 
    359         # That's because we're going to encode it into ISO-Latin-1, and use
    360         # that to test.
    361         iso_latin_html = unicode_html.encode("iso-8859-1")
    362 
    363         # Parse the ISO-Latin-1 HTML.
    364         soup = self.soup(iso_latin_html)
    365         # Encode it to UTF-8.
    366         result = soup.encode("utf-8")
    367 
    368         # What do we expect the result to look like? Well, it would
    369         # look like unicode_html, except that the META tag would say
    370         # UTF-8 instead of ISO-Latin-1.
    371         expected = unicode_html.replace("ISO-Latin-1", "utf-8")
    372 
    373         # And, of course, it would be in UTF-8, not Unicode.
    374         expected = expected.encode("utf-8")
    375 
    376         # Ta-da!
    377         self.assertEqual(result, expected)
    378 
    379     def test_real_shift_jis_document(self):
    380         # Smoke test to make sure the parser can handle a document in
    381         # Shift-JIS encoding, without choking.
    382         shift_jis_html = (
    383             b'<html><head></head><body><pre>'
    384             b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
    385             b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
    386             b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
    387             b'</pre></body></html>')
    388         unicode_html = shift_jis_html.decode("shift-jis")
    389         soup = self.soup(unicode_html)
    390 
    391         # Make sure the parse tree is correctly encoded to various
    392         # encodings.
    393         self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
    394         self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
    395 
    396     def test_real_hebrew_document(self):
    397         # A real-world test to make sure we can convert ISO-8859-9 (a
    398         # Hebrew encoding) to UTF-8.
    399         hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
    400         soup = self.soup(
    401             hebrew_document, from_encoding="iso8859-8")
    402         self.assertEqual(soup.original_encoding, 'iso8859-8')
    403         self.assertEqual(
    404             soup.encode('utf-8'),
    405             hebrew_document.decode("iso8859-8").encode("utf-8"))
    406 
    407     def test_meta_tag_reflects_current_encoding(self):
    408         # Here's the <meta> tag saying that a document is
    409         # encoded in Shift-JIS.
    410         meta_tag = ('<meta content="text/html; charset=x-sjis" '
    411                     'http-equiv="Content-type"/>')
    412 
    413         # Here's a document incorporating that meta tag.
    414         shift_jis_html = (
    415             '<html><head>\n%s\n'
    416             '<meta http-equiv="Content-language" content="ja"/>'
    417             '</head><body>Shift-JIS markup goes here.') % meta_tag
    418         soup = self.soup(shift_jis_html)
    419 
    420         # Parse the document, and the charset is seemingly unaffected.
    421         parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
    422         content = parsed_meta['content']
    423         self.assertEqual('text/html; charset=x-sjis', content)
    424 
    425         # But that value is actually a ContentMetaAttributeValue object.
    426         self.assertTrue(isinstance(content, ContentMetaAttributeValue))
    427 
    428         # And it will take on a value that reflects its current
    429         # encoding.
    430         self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
    431 
    432         # For the rest of the story, see TestSubstitutions in
    433         # test_tree.py.
    434 
    435     def test_html5_style_meta_tag_reflects_current_encoding(self):
    436         # Here's the <meta> tag saying that a document is
    437         # encoded in Shift-JIS.
    438         meta_tag = ('<meta id="encoding" charset="x-sjis" />')
    439 
    440         # Here's a document incorporating that meta tag.
    441         shift_jis_html = (
    442             '<html><head>\n%s\n'
    443             '<meta http-equiv="Content-language" content="ja"/>'
    444             '</head><body>Shift-JIS markup goes here.') % meta_tag
    445         soup = self.soup(shift_jis_html)
    446 
    447         # Parse the document, and the charset is seemingly unaffected.
    448         parsed_meta = soup.find('meta', id="encoding")
    449         charset = parsed_meta['charset']
    450         self.assertEqual('x-sjis', charset)
    451 
    452         # But that value is actually a CharsetMetaAttributeValue object.
    453         self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
    454 
    455         # And it will take on a value that reflects its current
    456         # encoding.
    457         self.assertEqual('utf8', charset.encode("utf8"))
    458 
    459     def test_tag_with_no_attributes_can_have_attributes_added(self):
    460         data = self.soup("<a>text</a>")
    461         data.a['foo'] = 'bar'
    462         self.assertEqual('<a foo="bar">text</a>', data.a.decode())
    463 
    464 class XMLTreeBuilderSmokeTest(object):
    465 
    466     def test_docstring_generated(self):
    467         soup = self.soup("<root/>")
    468         self.assertEqual(
    469             soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
    470 
    471     def test_real_xhtml_document(self):
    472         """A real XHTML document should come out *exactly* the same as it went in."""
    473         markup = b"""<?xml version="1.0" encoding="utf-8"?>
    474 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
    475 <html xmlns="http://www.w3.org/1999/xhtml">
    476 <head><title>Hello.</title></head>
    477 <body>Goodbye.</body>
    478 </html>"""
    479         soup = self.soup(markup)
    480         self.assertEqual(
    481             soup.encode("utf-8"), markup)
    482 
    483     def test_formatter_processes_script_tag_for_xml_documents(self):
    484         doc = """
    485   <script type="text/javascript">
    486   </script>
    487 """
    488         soup = BeautifulSoup(doc, "xml")
    489         # lxml would have stripped this while parsing, but we can add
    490         # it later.
    491         soup.script.string = 'console.log("< < hey > > ");'
    492         encoded = soup.encode()
    493         self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
    494 
    495     def test_can_parse_unicode_document(self):
    496         markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
    497         soup = self.soup(markup)
    498         self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
    499 
    500     def test_popping_namespaced_tag(self):
    501         markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
    502         soup = self.soup(markup)
    503         self.assertEqual(
    504             unicode(soup.rss), markup)
    505 
    506     def test_docstring_includes_correct_encoding(self):
    507         soup = self.soup("<root/>")
    508         self.assertEqual(
    509             soup.encode("latin1"),
    510             b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
    511 
    512     def test_large_xml_document(self):
    513         """A large XML document should come out the same as it went in."""
    514         markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
    515                   + b'0' * (2**12)
    516                   + b'</root>')
    517         soup = self.soup(markup)
    518         self.assertEqual(soup.encode("utf-8"), markup)
    519 
    520 
    521     def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
    522         self.assertSoupEquals("<p>", "<p/>")
    523         self.assertSoupEquals("<p>foo</p>")
    524 
    525     def test_namespaces_are_preserved(self):
    526         markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
    527         soup = self.soup(markup)
    528         root = soup.root
    529         self.assertEqual("http://example.com/", root['xmlns:a'])
    530         self.assertEqual("http://example.net/", root['xmlns:b'])
    531 
    532     def test_closing_namespaced_tag(self):
    533         markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
    534         soup = self.soup(markup)
    535         self.assertEqual(unicode(soup.p), markup)
    536 
    537     def test_namespaced_attributes(self):
    538         markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
    539         soup = self.soup(markup)
    540         self.assertEqual(unicode(soup.foo), markup)
    541 
    542     def test_namespaced_attributes_xml_namespace(self):
    543         markup = '<foo xml:lang="fr">bar</foo>'
    544         soup = self.soup(markup)
    545         self.assertEqual(unicode(soup.foo), markup)
    546 
    547 class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    548     """Smoke test for a tree builder that supports HTML5."""
    549 
    550     def test_real_xhtml_document(self):
    551         # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
    552         # XHTML documents in any particular way.
    553         pass
    554 
    555     def test_html_tags_have_namespace(self):
    556         markup = "<a>"
    557         soup = self.soup(markup)
    558         self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
    559 
    560     def test_svg_tags_have_namespace(self):
    561         markup = '<svg><circle/></svg>'
    562         soup = self.soup(markup)
    563         namespace = "http://www.w3.org/2000/svg"
    564         self.assertEqual(namespace, soup.svg.namespace)
    565         self.assertEqual(namespace, soup.circle.namespace)
    566 
    567 
    568     def test_mathml_tags_have_namespace(self):
    569         markup = '<math><msqrt>5</msqrt></math>'
    570         soup = self.soup(markup)
    571         namespace = 'http://www.w3.org/1998/Math/MathML'
    572         self.assertEqual(namespace, soup.math.namespace)
    573         self.assertEqual(namespace, soup.msqrt.namespace)
    574 
    575     def test_xml_declaration_becomes_comment(self):
    576         markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
    577         soup = self.soup(markup)
    578         self.assertTrue(isinstance(soup.contents[0], Comment))
    579         self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
    580         self.assertEqual("html", soup.contents[0].next_element.name)
    581 
    582 def skipIf(condition, reason):
    583    def nothing(test, *args, **kwargs):
    584        return None
    585 
    586    def decorator(test_item):
    587        if condition:
    588            return nothing
    589        else:
    590            return test_item
    591 
    592    return decorator
    593