Home | History | Annotate | Download | only in test
      1 """Tests for HTMLParser.py."""
      2 
      3 import HTMLParser
      4 import pprint
      5 import unittest
      6 from test import test_support
      7 
      8 
      9 class EventCollector(HTMLParser.HTMLParser):
     10 
     11     def __init__(self):
     12         self.events = []
     13         self.append = self.events.append
     14         HTMLParser.HTMLParser.__init__(self)
     15 
     16     def get_events(self):
     17         # Normalize the list of events so that buffer artefacts don't
     18         # separate runs of contiguous characters.
     19         L = []
     20         prevtype = None
     21         for event in self.events:
     22             type = event[0]
     23             if type == prevtype == "data":
     24                 L[-1] = ("data", L[-1][1] + event[1])
     25             else:
     26                 L.append(event)
     27             prevtype = type
     28         self.events = L
     29         return L
     30 
     31     # structure markup
     32 
     33     def handle_starttag(self, tag, attrs):
     34         self.append(("starttag", tag, attrs))
     35 
     36     def handle_startendtag(self, tag, attrs):
     37         self.append(("startendtag", tag, attrs))
     38 
     39     def handle_endtag(self, tag):
     40         self.append(("endtag", tag))
     41 
     42     # all other markup
     43 
     44     def handle_comment(self, data):
     45         self.append(("comment", data))
     46 
     47     def handle_charref(self, data):
     48         self.append(("charref", data))
     49 
     50     def handle_data(self, data):
     51         self.append(("data", data))
     52 
     53     def handle_decl(self, data):
     54         self.append(("decl", data))
     55 
     56     def handle_entityref(self, data):
     57         self.append(("entityref", data))
     58 
     59     def handle_pi(self, data):
     60         self.append(("pi", data))
     61 
     62     def unknown_decl(self, decl):
     63         self.append(("unknown decl", decl))
     64 
     65 
     66 class EventCollectorExtra(EventCollector):
     67 
     68     def handle_starttag(self, tag, attrs):
     69         EventCollector.handle_starttag(self, tag, attrs)
     70         self.append(("starttag_text", self.get_starttag_text()))
     71 
     72 
     73 class TestCaseBase(unittest.TestCase):
     74 
     75     def _run_check(self, source, expected_events, collector=EventCollector):
     76         parser = collector()
     77         for s in source:
     78             parser.feed(s)
     79         parser.close()
     80         events = parser.get_events()
     81         if events != expected_events:
     82             self.fail("received events did not match expected events\n"
     83                       "Expected:\n" + pprint.pformat(expected_events) +
     84                       "\nReceived:\n" + pprint.pformat(events))
     85 
     86     def _run_check_extra(self, source, events):
     87         self._run_check(source, events, EventCollectorExtra)
     88 
     89     def _parse_error(self, source):
     90         def parse(source=source):
     91             parser = HTMLParser.HTMLParser()
     92             parser.feed(source)
     93             parser.close()
     94         self.assertRaises(HTMLParser.HTMLParseError, parse)
     95 
     96 
class HTMLParserTestCase(TestCaseBase):
    """Event-level checks of how HTMLParser tokenizes markup.

    Most tests feed input through _run_check() (or _run_check_extra()) and
    compare the collected events with an expected list.
    """

    def test_processing_instruction_only(self):
        # The "pi" event carries the text between "<?" and ">", so the
        # trailing "?" of "?>" is expected to be part of the data.
        self._run_check("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])
        self._run_check("<?processing instruction ?>", [
            ("pi", "processing instruction ?"),
            ])

    def test_simple_html(self):
        # A small document mixing a doctype, tags, references and comments.
        # Tag and attribute names are expected in lower case, while the
        # attribute value 'Bar' keeps its case.
        self._run_check("""
<!DOCTYPE html PUBLIC 'foo'>
<HTML>&entity;&#32;
<!--comment1a
-></foo><bar>&lt;<?pi?></foo<bar
comment1b-->
<Img sRc='Bar' isMAP>sample
text
&#x201C;
<!--comment2a-- --comment2b-->
</Html>
""", [
    ("data", "\n"),
    ("decl", "DOCTYPE html PUBLIC 'foo'"),
    ("data", "\n"),
    ("starttag", "html", []),
    ("entityref", "entity"),
    ("charref", "32"),
    ("data", "\n"),
    ("comment", "comment1a\n-></foo><bar>&lt;<?pi?></foo<bar\ncomment1b"),
    ("data", "\n"),
    ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
    ("data", "sample\ntext\n"),
    ("charref", "x201C"),
    ("data", "\n"),
    ("comment", "comment2a-- --comment2b"),
    ("data", "\n"),
    ("endtag", "html"),
    ("data", "\n"),
    ])

    def test_unclosed_entityref(self):
        # An entity reference without the terminating ";" is still
        # expected to be reported as an entityref event.
        self._run_check("&entityref foo", [
            ("entityref", "entityref"),
            ("data", " foo"),
            ])

    def test_bad_nesting(self):
        # Strangely, this *is* supposed to test that overlapping
        # elements are allowed.  HTMLParser is more geared toward
        # lexing the input than parsing the structure.
        self._run_check("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_bare_ampersands(self):
        # Ampersands that do not start a reference are plain data.
        self._run_check("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def test_bare_pointy_brackets(self):
        # "<" and ">" that do not form a tag are plain data.
        self._run_check("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def test_illegal_declarations(self):
        # A "<!...>" construct that is not a known declaration is expected
        # to be reported as a comment.
        self._run_check('<!spacer type="block" height="25">',
                        [('comment', 'spacer type="block" height="25"')])

    def test_starttag_end_boundary(self):
        # A "<" or ">" inside a quoted attribute value must not be taken
        # as a tag boundary.
        self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])

    def test_buffer_artefacts(self):
        # The same markup is fed as two chunks split at various positions;
        # the resulting events must not depend on where the split falls.
        output = [("starttag", "a", [("b", "<")])]
        self._run_check(["<a b='<'>"], output)
        self._run_check(["<a ", "b='<'>"], output)
        self._run_check(["<a b", "='<'>"], output)
        self._run_check(["<a b=", "'<'>"], output)
        self._run_check(["<a b='<", "'>"], output)
        self._run_check(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self._run_check(["<a b='>'>"], output)
        self._run_check(["<a ", "b='>'>"], output)
        self._run_check(["<a b", "='>'>"], output)
        self._run_check(["<a b=", "'>'>"], output)
        self._run_check(["<a b='>", "'>"], output)
        self._run_check(["<a b='>'", ">"], output)

        output = [("comment", "abc")]
        self._run_check(["", "<!--abc-->"], output)
        self._run_check(["<", "!--abc-->"], output)
        self._run_check(["<!", "--abc-->"], output)
        self._run_check(["<!-", "-abc-->"], output)
        self._run_check(["<!--", "abc-->"], output)
        self._run_check(["<!--a", "bc-->"], output)
        self._run_check(["<!--ab", "c-->"], output)
        self._run_check(["<!--abc", "-->"], output)
        self._run_check(["<!--abc-", "->"], output)
        self._run_check(["<!--abc--", ">"], output)
        self._run_check(["<!--abc-->", ""], output)

    def test_starttag_junk_chars(self):
        # Malformed or truncated tags.  Constructs left incomplete at the
        # end of the input are expected to come out as data.
        self._run_check("</>", [])
        self._run_check("</$>", [('comment', '$')])
        self._run_check("</", [('data', '</')])
        self._run_check("</a", [('data', '</a')])
        # XXX this might be wrong
        self._run_check("<a<a>", [('data', '<a'), ('starttag', 'a', [])])
        self._run_check("</a<a>", [('endtag', 'a<a')])
        self._run_check("<!", [('data', '<!')])
        self._run_check("<a", [('data', '<a')])
        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
        self._run_check("<a foo='bar", [('data', "<a foo='bar")])
        self._run_check("<a foo='>'", [('data', "<a foo='>'")])
        self._run_check("<a foo='>", [('data', "<a foo='>")])

    def test_valid_doctypes(self):
        # Each doctype must be reported as one "decl" event whose text is
        # the declaration without the surrounding "<!" and ">".
        # from http://www.w3.org/QA/2002/04/valid-dtd-list.html
        dtds = ['HTML',  # HTML5 doctype
                ('HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
                 '"http://www.w3.org/TR/html4/strict.dtd"'),
                ('HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
                 '"http://www.w3.org/TR/html4/loose.dtd"'),
                ('html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
                 '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"'),
                ('html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" '
                 '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"'),
                ('math PUBLIC "-//W3C//DTD MathML 2.0//EN" '
                 '"http://www.w3.org/Math/DTD/mathml2/mathml2.dtd"'),
                ('html PUBLIC "-//W3C//DTD '
                 'XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN" '
                 '"http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd"'),
                ('svg PUBLIC "-//W3C//DTD SVG 1.1//EN" '
                 '"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"'),
                'html PUBLIC "-//IETF//DTD HTML 2.0//EN"',
                'html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"']
        for dtd in dtds:
            self._run_check("<!DOCTYPE %s>" % dtd,
                            [('decl', 'DOCTYPE ' + dtd)])

    def test_slashes_in_starttag(self):
        # Stray slashes inside a start tag are expected to be ignored; in
        # the cases below only a "/" directly before ">" produces a
        # "startendtag" event.
        self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
        html = ('<img width=902 height=250px '
                'src="/sites/default/files/images/homepage/foo.jpg" '
                '/*what am I doing here*/ />')
        expected = [(
            'startendtag', 'img',
            [('width', '902'), ('height', '250px'),
             ('src', '/sites/default/files/images/homepage/foo.jpg'),
             ('*what', None), ('am', None), ('i', None),
             ('doing', None), ('here*', None)]
        )]
        self._run_check(html, expected)
        html = ('<a / /foo/ / /=/ / /bar/ / />'
                '<a / /foo/ / /=/ / /bar/ / >')
        expected = [
            ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
            ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
        ]
        self._run_check(html, expected)
        #see issue #14538
        html = ('<meta><meta / ><meta // ><meta / / >'
                '<meta/><meta /><meta //><meta//>')
        expected = [
            ('starttag', 'meta', []), ('starttag', 'meta', []),
            ('starttag', 'meta', []), ('starttag', 'meta', []),
            ('startendtag', 'meta', []), ('startendtag', 'meta', []),
            ('startendtag', 'meta', []), ('startendtag', 'meta', []),
        ]
        self._run_check(html, expected)

    def test_declaration_junk_chars(self):
        # Junk characters inside a DOCTYPE are kept in the "decl" text.
        self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])

    def test_startendtag(self):
        # "<p/>" yields a single "startendtag" event, whereas "<p></p>"
        # yields separate "starttag" and "endtag" events.
        self._run_check("<p/>", [
            ("startendtag", "p", []),
            ])
        self._run_check("<p></p>", [
            ("starttag", "p", []),
            ("endtag", "p"),
            ])
        self._run_check("<p><img src='foo' /></p>", [
            ("starttag", "p", []),
            ("startendtag", "img", [("src", "foo")]),
            ("endtag", "p"),
            ])

    def test_invalid_end_tags(self):
        # A collection of broken end tags. <br> is used as separator.
        # see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state
        # and #13993
        html = ('<br></label</p><br></div end tmAd-leaderBoard><br></<h4><br>'
                '</li class="unit"><br></li\r\n\t\t\t\t\t\t</ul><br></><br>')
        expected = [('starttag', 'br', []),
                    # < is part of the name, / is discarded, p is an attribute
                    ('endtag', 'label<'),
                    ('starttag', 'br', []),
                    # text and attributes are discarded
                    ('endtag', 'div'),
                    ('starttag', 'br', []),
                    # comment because the first char after </ is not a-zA-Z
                    ('comment', '<h4'),
                    ('starttag', 'br', []),
                    # attributes are discarded
                    ('endtag', 'li'),
                    ('starttag', 'br', []),
                    # everything till ul (included) is discarded
                    ('endtag', 'li'),
                    ('starttag', 'br', []),
                    # </> is ignored
                    ('starttag', 'br', [])]
        self._run_check(html, expected)

    def test_broken_invalid_end_tag(self):
        # This is technically wrong (the "> shouldn't be included in the 'data')
        # but is probably not worth fixing it (in addition to all the cases of
        # the previous test, it would require a full attribute parsing).
        # see #13993
        html = '<b>This</b attr=">"> confuses the parser'
        expected = [('starttag', 'b', []),
                    ('data', 'This'),
                    ('endtag', 'b'),
                    ('data', '"> confuses the parser')]
        self._run_check(html, expected)

    def test_get_starttag_text(self):
        # The "starttag_text" event recorded by EventCollectorExtra must
        # equal the start tag exactly as written, whitespace included.
        s = """<foo:bar   \n   one="1"\ttwo=2   >"""
        self._run_check_extra(s, [
            ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
            ("starttag_text", s)])

    def test_cdata_content(self):
        # Inside <script> and <style> elements (in any letter case) the
        # whole content is expected as one "data" event, even when it looks
        # like markup.  Every content string is tried with every element.
        contents = [
            '<!-- not a comment --> &not-an-entity-ref;',
            "<not a='start tag'>",
            '<a href="" /> <p> <span></span>',
            'foo = "</scr" + "ipt>";',
            'foo = "</SCRIPT" + ">";',
            'foo = <\n/script> ',
            '<!-- document.write("</scr" + "ipt>"); -->',
            ('\n//<![CDATA[\n'
             'document.write(\'<s\'+\'cript type="text/javascript" '
             'src="http://www.example.org/r=\'+new '
             'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
            '\n<!-- //\nvar foo = 3.14;\n// -->\n',
            'foo = "</sty" + "le>";',
            u'<!-- \u2603 -->',
            # these two should be invalid according to the HTML 5 spec,
            # section 8.1.2.2
            #'foo = </\nscript>',
            #'foo = </ script>',
        ]
        elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
        for content in contents:
            for element in elements:
                element_lower = element.lower()
                s = u'<{element}>{content}</{element}>'.format(element=element,
                                                               content=content)
                self._run_check(s, [("starttag", element_lower, []),
                                    ("data", content),
                                    ("endtag", element_lower)])

    def test_cdata_with_closing_tags(self):
        # see issue #13358
        # make sure that HTMLParser calls handle_data only once for each CDATA.
        # The normal event collector normalizes the events in get_events,
        # so we override it to return the original list of events.
        class Collector(EventCollector):
            def get_events(self):
                return self.events

        content = """<!-- not a comment --> &not-an-entity-ref;
                  <a href="" /> </p><p> &amp; <span></span></style>
                  '</script' + '>' </html> </head> </scripter>!"""
        # The closing tag name is tried with whitespace before and/or
        # after it.
        for element in [' script', 'script ', ' script ',
                        '\nscript', 'script\n', '\nscript\n']:
            s = u'<script>{content}</{element}>'.format(element=element,
                                                        content=content)
            self._run_check(s, [("starttag", "script", []),
                                ("data", content),
                                ("endtag", "script")],
                            collector=Collector)

    def test_malformatted_charref(self):
        # "&#bad;" is not a valid character reference and is expected to
        # be reported as data.
        self._run_check("<p>&#bad;</p>", [
            ("starttag", "p", []),
            ("data", "&#bad;"),
            ("endtag", "p"),
        ])

    def test_unescape_function(self):
        # unescape() is called directly on a parser instance: an invalid
        # reference is left untouched, a valid numeric one is replaced.
        parser = HTMLParser.HTMLParser()
        self.assertEqual(parser.unescape('&#bad;'),'&#bad;')
        self.assertEqual(parser.unescape('&#0038;'),'&')
    398 
    399 
    400 
    401 class AttributesTestCase(TestCaseBase):
    402 
    403     def test_attr_syntax(self):
    404         output = [
    405           ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
    406         ]
    407         self._run_check("""<a b='v' c="v" d=v e>""", output)
    408         self._run_check("""<a  b = 'v' c = "v" d = v e>""", output)
    409         self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
    410         self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
    411 
    412     def test_attr_values(self):
    413         self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
    414                         [("starttag", "a", [("b", "xxx\n\txxx"),
    415                                             ("c", "yyy\t\nyyy"),
    416                                             ("d", "\txyz\n")])])
    417         self._run_check("""<a b='' c="">""",
    418                         [("starttag", "a", [("b", ""), ("c", "")])])
    419         # Regression test for SF patch #669683.
    420         self._run_check("<e a=rgb(1,2,3)>",
    421                         [("starttag", "e", [("a", "rgb(1,2,3)")])])
    422         # Regression test for SF bug #921657.
    423         self._run_check(
    424             "<a href=mailto:xyz (at] example.com>",
    425             [("starttag", "a", [("href", "mailto:xyz (at] example.com")])])
    426 
    427     def test_attr_nonascii(self):
    428         # see issue 7311
    429         self._run_check(
    430             u"<img src=/foo/bar.png alt=\u4e2d\u6587>",
    431             [("starttag", "img", [("src", "/foo/bar.png"),
    432                                   ("alt", u"\u4e2d\u6587")])])
    433         self._run_check(
    434             u"<a title='\u30c6\u30b9\u30c8' href='\u30c6\u30b9\u30c8.html'>",
    435             [("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
    436                                 ("href", u"\u30c6\u30b9\u30c8.html")])])
    437         self._run_check(
    438             u'<a title="\u30c6\u30b9\u30c8" href="\u30c6\u30b9\u30c8.html">',
    439             [("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
    440                                 ("href", u"\u30c6\u30b9\u30c8.html")])])
    441 
    442     def test_attr_entity_replacement(self):
    443         self._run_check(
    444             "<a b='&amp;&gt;&lt;&quot;&apos;'>",
    445             [("starttag", "a", [("b", "&><\"'")])])
    446 
    447     def test_attr_funky_names(self):
    448         self._run_check(
    449             "<a a.b='v' c:d=v e-f=v>",
    450             [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])])
    451         self._run_check(
    452             "<a $><b $=%><c \=/>",
    453             [("starttag", "a", [("$", None)]),
    454              ("starttag", "b", [("$", "%")]),
    455              ("starttag", "c", [("\\", "/")])])
    456 
    457     def test_entityrefs_in_attributes(self):
    458         self._run_check(
    459             "<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>",
    460             [("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])])
    461 
    462     def test_entities_in_attribute_value(self):
    463         # see #1200313
    464         for entity in ['&', '&amp;', '&#38;', '&#x26;']:
    465             self._run_check('<a href="%s">' % entity,
    466                             [("starttag", "a", [("href", "&")])])
    467             self._run_check("<a href='%s'>" % entity,
    468                             [("starttag", "a", [("href", "&")])])
    469             self._run_check("<a href=%s>" % entity,
    470                             [("starttag", "a", [("href", "&")])])
    471 
    472     def test_malformed_attributes(self):
    473         # see #13357
    474         html = (
    475             "<a href=test'style='color:red;bad1'>test - bad1</a>"
    476             "<a href=test'+style='color:red;ba2'>test - bad2</a>"
    477             "<a href=test'&nbsp;style='color:red;bad3'>test - bad3</a>"
    478             "<a href = test'&nbsp;style='color:red;bad4'  >test - bad4</a>"
    479         )
    480         expected = [
    481             ('starttag', 'a', [('href', "test'style='color:red;bad1'")]),
    482             ('data', 'test - bad1'), ('endtag', 'a'),
    483             ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]),
    484             ('data', 'test - bad2'), ('endtag', 'a'),
    485             ('starttag', 'a', [('href', u"test'\xa0style='color:red;bad3'")]),
    486             ('data', 'test - bad3'), ('endtag', 'a'),
    487             ('starttag', 'a', [('href', u"test'\xa0style='color:red;bad4'")]),
    488             ('data', 'test - bad4'), ('endtag', 'a')
    489         ]
    490         self._run_check(html, expected)
    491 
    492     def test_malformed_adjacent_attributes(self):
    493         # see #12629
    494         self._run_check('<x><y z=""o"" /></x>',
    495                         [('starttag', 'x', []),
    496                             ('startendtag', 'y', [('z', ''), ('o""', None)]),
    497                             ('endtag', 'x')])
    498         self._run_check('<x><y z="""" /></x>',
    499                         [('starttag', 'x', []),
    500                             ('startendtag', 'y', [('z', ''), ('""', None)]),
    501                             ('endtag', 'x')])
    502 
    503     # see #755670 for the following 3 tests
    504     def test_adjacent_attributes(self):
    505         self._run_check('<a width="100%"cellspacing=0>',
    506                         [("starttag", "a",
    507                           [("width", "100%"), ("cellspacing","0")])])
    508 
    509         self._run_check('<a id="foo"class="bar">',
    510                         [("starttag", "a",
    511                           [("id", "foo"), ("class","bar")])])
    512 
    513     def test_missing_attribute_value(self):
    514         self._run_check('<a v=>',
    515                         [("starttag", "a", [("v", "")])])
    516 
    517     def test_javascript_attribute_value(self):
    518         self._run_check("<a href=javascript:popup('/popup/help.html')>",
    519                         [("starttag", "a",
    520                           [("href", "javascript:popup('/popup/help.html')")])])
    521 
    522     def test_end_tag_in_attribute_value(self):
    523         # see #1745761
    524         self._run_check("<a href='http://www.example.org/\">;'>spam</a>",
    525                         [("starttag", "a",
    526                           [("href", "http://www.example.org/\">;")]),
    527                          ("data", "spam"), ("endtag", "a")])
    528 
    529     def test_comments(self):
    530         html = ("<!-- I'm a valid comment -->"
    531                 '<!--me too!-->'
    532                 '<!------>'
    533                 '<!---->'
    534                 '<!----I have many hyphens---->'
    535                 '<!-- I have a > in the middle -->'
    536                 '<!-- and I have -- in the middle! -->')
    537         expected = [('comment', " I'm a valid comment "),
    538                     ('comment', 'me too!'),
    539                     ('comment', '--'),
    540                     ('comment', ''),
    541                     ('comment', '--I have many hyphens--'),
    542                     ('comment', ' I have a > in the middle '),
    543                     ('comment', ' and I have -- in the middle! ')]
    544         self._run_check(html, expected)
    545 
    546     def test_broken_comments(self):
    547         html = ('<! not really a comment >'
    548                 '<! not a comment either -->'
    549                 '<! -- close enough -->'
    550                 '<!><!<-- this was an empty comment>'
    551                 '<!!! another bogus comment !!!>')
    552         expected = [
    553             ('comment', ' not really a comment '),
    554             ('comment', ' not a comment either --'),
    555             ('comment', ' -- close enough --'),
    556             ('comment', ''),
    557             ('comment', '<-- this was an empty comment'),
    558             ('comment', '!! another bogus comment !!!'),
    559         ]
    560         self._run_check(html, expected)
    561 
    562     def test_condcoms(self):
    563         html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->'
    564                 '<!--[if IE 8]>condcoms<![endif]-->'
    565                 '<!--[if lte IE 7]>pretty?<![endif]-->')
    566         expected = [('comment', "[if IE & !(lte IE 8)]>aren't<![endif]"),
    567                     ('comment', '[if IE 8]>condcoms<![endif]'),
    568                     ('comment', '[if lte IE 7]>pretty?<![endif]')]
    569         self._run_check(html, expected)
    570 
    571     def test_broken_condcoms(self):
    572         # these condcoms are missing the '--' after '<!' and before the '>'
    573         html = ('<![if !(IE)]>broken condcom<![endif]>'
    574                 '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
    575                 '<![if !IE 6]><img src="firefox.png" /><![endif]>'
    576                 '<![if !ie 6]><b>foo</b><![endif]>'
    577                 '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
    578         # According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
    579         # and "8.2.4.45 Markup declaration open state", comment tokens should
    580         # be emitted instead of 'unknown decl', but calling unknown_decl
    581         # provides more flexibility.
    582         # See also Lib/_markupbase.py:parse_declaration
    583         expected = [
    584             ('unknown decl', 'if !(IE)'),
    585             ('data', 'broken condcom'),
    586             ('unknown decl', 'endif'),
    587             ('unknown decl', 'if ! IE'),
    588             ('startendtag', 'link', [('href', 'favicon.tiff')]),
    589             ('unknown decl', 'endif'),
    590             ('unknown decl', 'if !IE 6'),
    591             ('startendtag', 'img', [('src', 'firefox.png')]),
    592             ('unknown decl', 'endif'),
    593             ('unknown decl', 'if !ie 6'),
    594             ('starttag', 'b', []),
    595             ('data', 'foo'),
    596             ('endtag', 'b'),
    597             ('unknown decl', 'endif'),
    598             ('unknown decl', 'if (!IE)|(lt IE 9)'),
    599             ('startendtag', 'img', [('src', 'mammoth.bmp')]),
    600             ('unknown decl', 'endif')
    601         ]
    602         self._run_check(html, expected)
    603 
    604 
    605 def test_main():
    606     test_support.run_unittest(HTMLParserTestCase, AttributesTestCase)
    607 
    608 
# Run the tests when this module is executed directly as a script.
if __name__ == "__main__":
    test_main()
    611