1 """Helper classes for tests.""" 2 3 import copy 4 import functools 5 import unittest 6 from unittest import TestCase 7 from bs4 import BeautifulSoup 8 from bs4.element import ( 9 CharsetMetaAttributeValue, 10 Comment, 11 ContentMetaAttributeValue, 12 Doctype, 13 SoupStrainer, 14 ) 15 16 from bs4.builder import HTMLParserTreeBuilder 17 default_builder = HTMLParserTreeBuilder 18 19 20 class SoupTest(unittest.TestCase): 21 22 @property 23 def default_builder(self): 24 return default_builder() 25 26 def soup(self, markup, **kwargs): 27 """Build a Beautiful Soup object from markup.""" 28 builder = kwargs.pop('builder', self.default_builder) 29 return BeautifulSoup(markup, builder=builder, **kwargs) 30 31 def document_for(self, markup): 32 """Turn an HTML fragment into a document. 33 34 The details depend on the builder. 35 """ 36 return self.default_builder.test_fragment_to_document(markup) 37 38 def assertSoupEquals(self, to_parse, compare_parsed_to=None): 39 builder = self.default_builder 40 obj = BeautifulSoup(to_parse, builder=builder) 41 if compare_parsed_to is None: 42 compare_parsed_to = to_parse 43 44 self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) 45 46 47 class HTMLTreeBuilderSmokeTest(object): 48 49 """A basic test of a treebuilder's competence. 50 51 Any HTML treebuilder, present or future, should be able to pass 52 these tests. With invalid markup, there's room for interpretation, 53 and different parsers can handle it differently. But with the 54 markup in these tests, there's not much room for interpretation. 55 """ 56 57 def assertDoctypeHandled(self, doctype_fragment): 58 """Assert that a given doctype string is handled correctly.""" 59 doctype_str, soup = self._document_with_doctype(doctype_fragment) 60 61 # Make sure a Doctype object was created. 62 doctype = soup.contents[0] 63 self.assertEqual(doctype.__class__, Doctype) 64 self.assertEqual(doctype, doctype_fragment) 65 self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) 66 67 # Make sure that the doctype was correctly associated with the 68 # parse tree and that the rest of the document parsed. 69 self.assertEqual(soup.p.contents[0], 'foo') 70 71 def _document_with_doctype(self, doctype_fragment): 72 """Generate and parse a document with the given doctype.""" 73 doctype = '<!DOCTYPE %s>' % doctype_fragment 74 markup = doctype + '\n<p>foo</p>' 75 soup = self.soup(markup) 76 return doctype, soup 77 78 def test_normal_doctypes(self): 79 """Make sure normal, everyday HTML doctypes are handled correctly.""" 80 self.assertDoctypeHandled("html") 81 self.assertDoctypeHandled( 82 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') 83 84 def test_empty_doctype(self): 85 soup = self.soup("<!DOCTYPE>") 86 doctype = soup.contents[0] 87 self.assertEqual("", doctype.strip()) 88 89 def test_public_doctype_with_url(self): 90 doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' 91 self.assertDoctypeHandled(doctype) 92 93 def test_system_doctype(self): 94 self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') 95 96 def test_namespaced_system_doctype(self): 97 # We can handle a namespaced doctype with a system ID. 98 self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') 99 100 def test_namespaced_public_doctype(self): 101 # Test a namespaced doctype with a public id. 


class HTMLTreeBuilderSmokeTest(object):

    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.
    """

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype."""
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre> </pre>")
        self.assertSoupEquals("<textarea> woo </textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        expect = u"\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        html = soup.html
        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_can_parse_unicode_document(self):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the declaration! The horror!
        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
329 text = "<p><<sacré bleu!>></p>" 330 expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>" 331 self.assertSoupEquals(text, expected) 332 333 def test_smart_quotes_converted_on_the_way_in(self): 334 # Microsoft smart quotes are converted to Unicode characters during 335 # parsing. 336 quote = b"<p>\x91Foo\x92</p>" 337 soup = self.soup(quote) 338 self.assertEqual( 339 soup.p.string, 340 u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") 341 342 def test_non_breaking_spaces_converted_on_the_way_in(self): 343 soup = self.soup("<a> </a>") 344 self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) 345 346 def test_entities_converted_on_the_way_out(self): 347 text = "<p><<sacré bleu!>></p>" 348 expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8") 349 soup = self.soup(text) 350 self.assertEqual(soup.p.encode("utf-8"), expected) 351 352 def test_real_iso_latin_document(self): 353 # Smoke test of interrelated functionality, using an 354 # easy-to-understand document. 355 356 # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. 357 unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' 358 359 # That's because we're going to encode it into ISO-Latin-1, and use 360 # that to test. 361 iso_latin_html = unicode_html.encode("iso-8859-1") 362 363 # Parse the ISO-Latin-1 HTML. 364 soup = self.soup(iso_latin_html) 365 # Encode it to UTF-8. 366 result = soup.encode("utf-8") 367 368 # What do we expect the result to look like? Well, it would 369 # look like unicode_html, except that the META tag would say 370 # UTF-8 instead of ISO-Latin-1. 371 expected = unicode_html.replace("ISO-Latin-1", "utf-8") 372 373 # And, of course, it would be in UTF-8, not Unicode. 374 expected = expected.encode("utf-8") 375 376 # Ta-da! 377 self.assertEqual(result, expected) 378 379 def test_real_shift_jis_document(self): 380 # Smoke test to make sure the parser can handle a document in 381 # Shift-JIS encoding, without choking. 382 shift_jis_html = ( 383 b'<html><head></head><body><pre>' 384 b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' 385 b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' 386 b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B' 387 b'</pre></body></html>') 388 unicode_html = shift_jis_html.decode("shift-jis") 389 soup = self.soup(unicode_html) 390 391 # Make sure the parse tree is correctly encoded to various 392 # encodings. 393 self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) 394 self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) 395 396 def test_real_hebrew_document(self): 397 # A real-world test to make sure we can convert ISO-8859-9 (a 398 # Hebrew encoding) to UTF-8. 399 hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>' 400 soup = self.soup( 401 hebrew_document, from_encoding="iso8859-8") 402 self.assertEqual(soup.original_encoding, 'iso8859-8') 403 self.assertEqual( 404 soup.encode('utf-8'), 405 hebrew_document.decode("iso8859-8").encode("utf-8")) 406 407 def test_meta_tag_reflects_current_encoding(self): 408 # Here's the <meta> tag saying that a document is 409 # encoded in Shift-JIS. 
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertTrue(isinstance(content, ContentMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
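

# Illustrative sketch of how a builder-specific test module is expected to use
# HTMLTreeBuilderSmokeTest: mix it into a SoupTest subclass and override
# default_builder to return the builder under test. The class name below is
# hypothetical.
#
#   from bs4.builder import HTMLParserTreeBuilder
#
#   class HTMLParserSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
#
#       @property
#       def default_builder(self):
#           return HTMLParserTreeBuilder()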


class XMLTreeBuilderSmokeTest(object):

    def test_docstring_generated(self):
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8"), markup)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
<script type="text/javascript">
</script>
"""
        soup = BeautifulSoup(doc, "xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        encoded = soup.encode()
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)

    def test_can_parse_unicode_document(self):
        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        soup = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)

    def test_popping_namespaced_tag(self):
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
        self.assertEqual(
            unicode(soup.rss), markup)

    def test_docstring_includes_correct_encoding(self):
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode("latin1"),
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + b'0' * (2**12)
                  + b'</root>')
        soup = self.soup(markup)
        self.assertEqual(soup.encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        soup = self.soup(markup)
        root = soup.root
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])

    def test_closing_namespaced_tag(self):
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.p), markup)

    def test_namespaced_attributes(self):
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        markup = '<foo xml:lang="fr">bar</foo>'
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.foo), markup)
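

# XMLTreeBuilderSmokeTest is used the same way, but the default_builder it is
# mixed in with must be an XML-capable builder. A sketch, assuming lxml (and
# its LXMLTreeBuilderForXML) is available; the class name is hypothetical.
#
#   from bs4.builder import LXMLTreeBuilderForXML
#
#   class LXMLXMLSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
#
#       @property
#       def default_builder(self):
#           return LXMLTreeBuilderForXML()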


class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        # XHTML documents in any particular way.
        pass

    def test_html_tags_have_namespace(self):
        markup = "<a>"
        soup = self.soup(markup)
        self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)

    def test_svg_tags_have_namespace(self):
        markup = '<svg><circle/></svg>'
        soup = self.soup(markup)
        namespace = "http://www.w3.org/2000/svg"
        self.assertEqual(namespace, soup.svg.namespace)
        self.assertEqual(namespace, soup.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        markup = '<math><msqrt>5</msqrt></math>'
        soup = self.soup(markup)
        namespace = 'http://www.w3.org/1998/Math/MathML'
        self.assertEqual(namespace, soup.math.namespace)
        self.assertEqual(namespace, soup.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
        soup = self.soup(markup)
        self.assertTrue(isinstance(soup.contents[0], Comment))
        self.assertEqual(
            soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual("html", soup.contents[0].next_element.name)


def skipIf(condition, reason):
    def nothing(test, *args, **kwargs):
        return None

    def decorator(test_item):
        if condition:
            return nothing
        else:
            return test_item

    return decorator
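

# Illustrative use of skipIf (a sketch): applied as a class decorator, it
# replaces the entire test case with a no-op when the condition holds.
# LXML_PRESENT is a hypothetical flag the importing test module would define.
#
#   @skipIf(
#       not LXML_PRESENT,
#       "lxml seems not to be present, not testing its tree builder.")
#   class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
#       ...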