"""Tests for HTMLParser.py."""
      2 
      3 import HTMLParser
      4 import pprint
      5 import unittest
      6 from test import test_support
      7 
      8 
      9 class EventCollector(HTMLParser.HTMLParser):
     10 
     11     def __init__(self):
     12         self.events = []
     13         self.append = self.events.append
     14         HTMLParser.HTMLParser.__init__(self)
     15 
     16     def get_events(self):
     17         # Normalize the list of events so that buffer artefacts don't
     18         # separate runs of contiguous characters.
     19         L = []
     20         prevtype = None
     21         for event in self.events:
     22             type = event[0]
     23             if type == prevtype == "data":
     24                 L[-1] = ("data", L[-1][1] + event[1])
     25             else:
     26                 L.append(event)
     27             prevtype = type
     28         self.events = L
     29         return L
     30 
     31     # structure markup
     32 
     33     def handle_starttag(self, tag, attrs):
     34         self.append(("starttag", tag, attrs))
     35 
     36     def handle_startendtag(self, tag, attrs):
     37         self.append(("startendtag", tag, attrs))
     38 
     39     def handle_endtag(self, tag):
     40         self.append(("endtag", tag))
     41 
     42     # all other markup
     43 
     44     def handle_comment(self, data):
     45         self.append(("comment", data))
     46 
     47     def handle_charref(self, data):
     48         self.append(("charref", data))
     49 
     50     def handle_data(self, data):
     51         self.append(("data", data))
     52 
     53     def handle_decl(self, data):
     54         self.append(("decl", data))
     55 
     56     def handle_entityref(self, data):
     57         self.append(("entityref", data))
     58 
     59     def handle_pi(self, data):
     60         self.append(("pi", data))
     61 
     62     def unknown_decl(self, decl):
     63         self.append(("unknown decl", decl))
     64 
     65 
     66 class EventCollectorExtra(EventCollector):
     67 
     68     def handle_starttag(self, tag, attrs):
     69         EventCollector.handle_starttag(self, tag, attrs)
     70         self.append(("starttag_text", self.get_starttag_text()))
     71 
     72 
     73 class TestCaseBase(unittest.TestCase):
     74 
     75     def _run_check(self, source, expected_events, collector=EventCollector):
     76         parser = collector()
     77         for s in source:
     78             parser.feed(s)
     79         parser.close()
     80         events = parser.get_events()
     81         if events != expected_events:
     82             self.fail("received events did not match expected events\n"
     83                       "Expected:\n" + pprint.pformat(expected_events) +
     84                       "\nReceived:\n" + pprint.pformat(events))
     85 
     86     def _run_check_extra(self, source, events):
     87         self._run_check(source, events, EventCollectorExtra)
     88 
     89     def _parse_error(self, source):
     90         def parse(source=source):
     91             parser = HTMLParser.HTMLParser()
     92             parser.feed(source)
     93             parser.close()
     94         self.assertRaises(HTMLParser.HTMLParseError, parse)
     95 
     96 
class HTMLParserTestCase(TestCaseBase):
    """Event-stream checks for general markup handled by HTMLParser.

    Every test feeds markup through TestCaseBase._run_check and compares
    the collected handler events with a literal expected list.
    """

    def test_processing_instruction_only(self):
        # The "pi" payload is everything between "<?" and ">", including a
        # trailing "?" when one is present.
        self._run_check("<?processing instruction>", [
            ("pi", "processing instruction"),
            ])
        self._run_check("<?processing instruction ?>", [
            ("pi", "processing instruction ?"),
            ])

    def test_simple_html(self):
        # One small document mixing a doctype, tags, entity and character
        # references, and comments that contain markup-looking text.
        self._run_check("""
<!DOCTYPE html PUBLIC 'foo'>
<HTML>&entity;&#32;
<!--comment1a
-></foo><bar>&lt;<?pi?></foo<bar
comment1b-->
<Img sRc='Bar' isMAP>sample
text
&#x201C;
<!--comment2a-- --comment2b-->
</Html>
""", [
    ("data", "\n"),
    ("decl", "DOCTYPE html PUBLIC 'foo'"),
    ("data", "\n"),
    ("starttag", "html", []),
    ("entityref", "entity"),
    ("charref", "32"),
    ("data", "\n"),
    ("comment", "comment1a\n-></foo><bar>&lt;<?pi?></foo<bar\ncomment1b"),
    ("data", "\n"),
    ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
    ("data", "sample\ntext\n"),
    ("charref", "x201C"),
    ("data", "\n"),
    ("comment", "comment2a-- --comment2b"),
    ("data", "\n"),
    ("endtag", "html"),
    ("data", "\n"),
    ])

    def test_unclosed_entityref(self):
        # An entity reference without the terminating ";" is still reported
        # as an entityref; the following text is plain data.
        self._run_check("&entityref foo", [
            ("entityref", "entityref"),
            ("data", " foo"),
            ])

    def test_bad_nesting(self):
        # Strangely, this *is* supposed to test that overlapping
        # elements are allowed.  HTMLParser is more geared toward
        # lexing the input than parsing the structure.
        self._run_check("<a><b></a></b>", [
            ("starttag", "a", []),
            ("starttag", "b", []),
            ("endtag", "a"),
            ("endtag", "b"),
            ])

    def test_bare_ampersands(self):
        # Ampersands that do not start a reference stay in the data.
        self._run_check("this text & contains & ampersands &", [
            ("data", "this text & contains & ampersands &"),
            ])

    def test_bare_pointy_brackets(self):
        # "<" and ">" that do not form a tag stay in the data.
        self._run_check("this < text > contains < bare>pointy< brackets", [
            ("data", "this < text > contains < bare>pointy< brackets"),
            ])

    def test_illegal_declarations(self):
        # An unrecognised "<!...>" construct is reported as a comment.
        self._run_check('<!spacer type="block" height="25">',
                        [('comment', 'spacer type="block" height="25"')])

    def test_starttag_end_boundary(self):
        # "<" and ">" inside a quoted attribute value do not end the tag.
        self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
        self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])

    def test_buffer_artefacts(self):
        # The same markup split at every interesting position between two
        # feed() calls must produce the same events as the unsplit form.
        output = [("starttag", "a", [("b", "<")])]
        self._run_check(["<a b='<'>"], output)
        self._run_check(["<a ", "b='<'>"], output)
        self._run_check(["<a b", "='<'>"], output)
        self._run_check(["<a b=", "'<'>"], output)
        self._run_check(["<a b='<", "'>"], output)
        self._run_check(["<a b='<'", ">"], output)

        output = [("starttag", "a", [("b", ">")])]
        self._run_check(["<a b='>'>"], output)
        self._run_check(["<a ", "b='>'>"], output)
        self._run_check(["<a b", "='>'>"], output)
        self._run_check(["<a b=", "'>'>"], output)
        self._run_check(["<a b='>", "'>"], output)
        self._run_check(["<a b='>'", ">"], output)

        output = [("comment", "abc")]
        self._run_check(["", "<!--abc-->"], output)
        self._run_check(["<", "!--abc-->"], output)
        self._run_check(["<!", "--abc-->"], output)
        self._run_check(["<!-", "-abc-->"], output)
        self._run_check(["<!--", "abc-->"], output)
        self._run_check(["<!--a", "bc-->"], output)
        self._run_check(["<!--ab", "c-->"], output)
        self._run_check(["<!--abc", "-->"], output)
        self._run_check(["<!--abc-", "->"], output)
        self._run_check(["<!--abc--", ">"], output)
        self._run_check(["<!--abc-->", ""], output)

    def test_starttag_junk_chars(self):
        # Truncated or junk-laden tags: incomplete input comes back as
        # data, and junk characters become part of the tag name.
        self._run_check("</>", [])
        self._run_check("</$>", [('comment', '$')])
        self._run_check("</", [('data', '</')])
        self._run_check("</a", [('data', '</a')])
        self._run_check("<a<a>", [('starttag', 'a<a', [])])
        self._run_check("</a<a>", [('endtag', 'a<a')])
        self._run_check("<!", [('data', '<!')])
        self._run_check("<a", [('data', '<a')])
        self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
        self._run_check("<a foo='bar", [('data', "<a foo='bar")])
        self._run_check("<a foo='>'", [('data', "<a foo='>'")])
        self._run_check("<a foo='>", [('data', "<a foo='>")])
        self._run_check("<a$>", [('starttag', 'a$', [])])
        self._run_check("<a$b>", [('starttag', 'a$b', [])])
        self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
        self._run_check("<a$b  >", [('starttag', 'a$b', [])])
        self._run_check("<a$b  />", [('startendtag', 'a$b', [])])

    def test_valid_doctypes(self):
        # Each doctype must be reported verbatim as a single "decl" event.
        # from http://www.w3.org/QA/2002/04/valid-dtd-list.html
        dtds = ['HTML',  # HTML5 doctype
                ('HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
                 '"http://www.w3.org/TR/html4/strict.dtd"'),
                ('HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
                 '"http://www.w3.org/TR/html4/loose.dtd"'),
                ('html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
                 '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"'),
                ('html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" '
                 '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"'),
                ('math PUBLIC "-//W3C//DTD MathML 2.0//EN" '
                 '"http://www.w3.org/Math/DTD/mathml2/mathml2.dtd"'),
                ('html PUBLIC "-//W3C//DTD '
                 'XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN" '
                 '"http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd"'),
                ('svg PUBLIC "-//W3C//DTD SVG 1.1//EN" '
                 '"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"'),
                'html PUBLIC "-//IETF//DTD HTML 2.0//EN"',
                'html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"']
        for dtd in dtds:
            self._run_check("<!DOCTYPE %s>" % dtd,
                            [('decl', 'DOCTYPE ' + dtd)])

    def test_slashes_in_starttag(self):
        # Stray "/" characters inside a start tag are skipped; only a "/"
        # directly before ">" turns the tag into a startendtag.
        self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
        html = ('<img width=902 height=250px '
                'src="/sites/default/files/images/homepage/foo.jpg" '
                '/*what am I doing here*/ />')
        expected = [(
            'startendtag', 'img',
            [('width', '902'), ('height', '250px'),
             ('src', '/sites/default/files/images/homepage/foo.jpg'),
             ('*what', None), ('am', None), ('i', None),
             ('doing', None), ('here*', None)]
        )]
        self._run_check(html, expected)
        html = ('<a / /foo/ / /=/ / /bar/ / />'
                '<a / /foo/ / /=/ / /bar/ / >')
        expected = [
            ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
            ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
        ]
        self._run_check(html, expected)
        #see issue #14538
        html = ('<meta><meta / ><meta // ><meta / / >'
                '<meta/><meta /><meta //><meta//>')
        expected = [
            ('starttag', 'meta', []), ('starttag', 'meta', []),
            ('starttag', 'meta', []), ('starttag', 'meta', []),
            ('startendtag', 'meta', []), ('startendtag', 'meta', []),
            ('startendtag', 'meta', []), ('startendtag', 'meta', []),
        ]
        self._run_check(html, expected)

    def test_declaration_junk_chars(self):
        # Junk characters inside a declaration are kept in the payload.
        self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])

    def test_startendtag(self):
        # "<p/>" is one startendtag event; "<p></p>" is two separate events.
        self._run_check("<p/>", [
            ("startendtag", "p", []),
            ])
        self._run_check("<p></p>", [
            ("starttag", "p", []),
            ("endtag", "p"),
            ])
        self._run_check("<p><img src='foo' /></p>", [
            ("starttag", "p", []),
            ("startendtag", "img", [("src", "foo")]),
            ("endtag", "p"),
            ])

    def test_invalid_end_tags(self):
        # A collection of broken end tags. <br> is used as separator.
        # see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state
        # and #13993
        html = ('<br></label</p><br></div end tmAd-leaderBoard><br></<h4><br>'
                '</li class="unit"><br></li\r\n\t\t\t\t\t\t</ul><br></><br>')
        expected = [('starttag', 'br', []),
                    # < is part of the name, / is discarded, p is an attribute
                    ('endtag', 'label<'),
                    ('starttag', 'br', []),
                    # text and attributes are discarded
                    ('endtag', 'div'),
                    ('starttag', 'br', []),
                    # comment because the first char after </ is not a-zA-Z
                    ('comment', '<h4'),
                    ('starttag', 'br', []),
                    # attributes are discarded
                    ('endtag', 'li'),
                    ('starttag', 'br', []),
                    # everything till ul (included) is discarded
                    ('endtag', 'li'),
                    ('starttag', 'br', []),
                    # </> is ignored
                    ('starttag', 'br', [])]
        self._run_check(html, expected)

    def test_broken_invalid_end_tag(self):
        # This is technically wrong (the "> shouldn't be included in the 'data')
        # but is probably not worth fixing it (in addition to all the cases of
        # the previous test, it would require a full attribute parsing).
        # see #13993
        html = '<b>This</b attr=">"> confuses the parser'
        expected = [('starttag', 'b', []),
                    ('data', 'This'),
                    ('endtag', 'b'),
                    ('data', '"> confuses the parser')]
        self._run_check(html, expected)

    def test_get_starttag_text(self):
        # get_starttag_text() must return the tag exactly as written,
        # including its internal whitespace.
        s = """<foo:bar   \n   one="1"\ttwo=2   >"""
        self._run_check_extra(s, [
            ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
            ("starttag_text", s)])

    def test_cdata_content(self):
        # Inside <script>/<style> (in any letter case) the content is
        # passed through as one data event, whatever markup it resembles.
        contents = [
            '<!-- not a comment --> &not-an-entity-ref;',
            "<not a='start tag'>",
            '<a href="" /> <p> <span></span>',
            'foo = "</scr" + "ipt>";',
            'foo = "</SCRIPT" + ">";',
            'foo = <\n/script> ',
            '<!-- document.write("</scr" + "ipt>"); -->',
            ('\n//<![CDATA[\n'
             'document.write(\'<s\'+\'cript type="text/javascript" '
             'src="http://www.example.org/r=\'+new '
             'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
            '\n<!-- //\nvar foo = 3.14;\n// -->\n',
            'foo = "</sty" + "le>";',
            u'<!-- \u2603 -->',
            # these two should be invalid according to the HTML 5 spec,
            # section 8.1.2.2
            #'foo = </\nscript>',
            #'foo = </ script>',
        ]
        elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
        for content in contents:
            for element in elements:
                element_lower = element.lower()
                s = u'<{element}>{content}</{element}>'.format(element=element,
                                                               content=content)
                self._run_check(s, [("starttag", element_lower, []),
                                    ("data", content),
                                    ("endtag", element_lower)])

    def test_cdata_with_closing_tags(self):
        # see issue #13358
        # make sure that HTMLParser calls handle_data only once for each CDATA.
        # The normal event collector normalizes the events in get_events,
        # so we override it to return the original list of events.
        class Collector(EventCollector):
            def get_events(self):
                return self.events

        content = """<!-- not a comment --> &not-an-entity-ref;
                  <a href="" /> </p><p> &amp; <span></span></style>
                  '</script' + '>' </html> </head> </scripter>!"""
        # Whitespace around the name in the closing tag must still end the
        # script element.
        for element in [' script', 'script ', ' script ',
                        '\nscript', 'script\n', '\nscript\n']:
            s = u'<script>{content}</{element}>'.format(element=element,
                                                        content=content)
            self._run_check(s, [("starttag", "script", []),
                                ("data", content),
                                ("endtag", "script")],
                            collector=Collector)

    def test_malformatted_charref(self):
        # "&#bad;" is not a valid character reference and stays as data.
        self._run_check("<p>&#bad;</p>", [
            ("starttag", "p", []),
            ("data", "&#bad;"),
            ("endtag", "p"),
        ])
        # add the [] as a workaround to avoid buffering (see #20288)
        self._run_check(["<div>&#bad;</div>"], [
            ("starttag", "div", []),
            ("data", "&#bad;"),
            ("endtag", "div"),
        ])

    def test_unescape_function(self):
        # unescape() leaves an invalid reference alone and decodes a valid
        # zero-padded decimal one.
        parser = HTMLParser.HTMLParser()
        self.assertEqual(parser.unescape('&#bad;'),'&#bad;')
        self.assertEqual(parser.unescape('&#0038;'),'&')
    408 
    409 
    410 
    411 class AttributesTestCase(TestCaseBase):
    412 
    413     def test_attr_syntax(self):
    414         output = [
    415           ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
    416         ]
    417         self._run_check("""<a b='v' c="v" d=v e>""", output)
    418         self._run_check("""<a  b = 'v' c = "v" d = v e>""", output)
    419         self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
    420         self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
    421 
    422     def test_attr_values(self):
    423         self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
    424                         [("starttag", "a", [("b", "xxx\n\txxx"),
    425                                             ("c", "yyy\t\nyyy"),
    426                                             ("d", "\txyz\n")])])
    427         self._run_check("""<a b='' c="">""",
    428                         [("starttag", "a", [("b", ""), ("c", "")])])
    429         # Regression test for SF patch #669683.
    430         self._run_check("<e a=rgb(1,2,3)>",
    431                         [("starttag", "e", [("a", "rgb(1,2,3)")])])
    432         # Regression test for SF bug #921657.
    433         self._run_check(
    434             "<a href=mailto:xyz (at] example.com>",
    435             [("starttag", "a", [("href", "mailto:xyz (at] example.com")])])
    436 
    437     def test_attr_nonascii(self):
    438         # see issue 7311
    439         self._run_check(
    440             u"<img src=/foo/bar.png alt=\u4e2d\u6587>",
    441             [("starttag", "img", [("src", "/foo/bar.png"),
    442                                   ("alt", u"\u4e2d\u6587")])])
    443         self._run_check(
    444             u"<a title='\u30c6\u30b9\u30c8' href='\u30c6\u30b9\u30c8.html'>",
    445             [("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
    446                                 ("href", u"\u30c6\u30b9\u30c8.html")])])
    447         self._run_check(
    448             u'<a title="\u30c6\u30b9\u30c8" href="\u30c6\u30b9\u30c8.html">',
    449             [("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
    450                                 ("href", u"\u30c6\u30b9\u30c8.html")])])
    451 
    452     def test_attr_entity_replacement(self):
    453         self._run_check(
    454             "<a b='&amp;&gt;&lt;&quot;&apos;'>",
    455             [("starttag", "a", [("b", "&><\"'")])])
    456 
    457     def test_attr_funky_names(self):
    458         self._run_check(
    459             "<a a.b='v' c:d=v e-f=v>",
    460             [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])])
    461         self._run_check(
    462             "<a $><b $=%><c \=/>",
    463             [("starttag", "a", [("$", None)]),
    464              ("starttag", "b", [("$", "%")]),
    465              ("starttag", "c", [("\\", "/")])])
    466 
    467     def test_entityrefs_in_attributes(self):
    468         self._run_check(
    469             "<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>",
    470             [("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])])
    471 
    472     def test_entities_in_attribute_value(self):
    473         # see #1200313
    474         for entity in ['&', '&amp;', '&#38;', '&#x26;']:
    475             self._run_check('<a href="%s">' % entity,
    476                             [("starttag", "a", [("href", "&")])])
    477             self._run_check("<a href='%s'>" % entity,
    478                             [("starttag", "a", [("href", "&")])])
    479             self._run_check("<a href=%s>" % entity,
    480                             [("starttag", "a", [("href", "&")])])
    481 
    482     def test_malformed_attributes(self):
    483         # see #13357
    484         html = (
    485             "<a href=test'style='color:red;bad1'>test - bad1</a>"
    486             "<a href=test'+style='color:red;ba2'>test - bad2</a>"
    487             "<a href=test'&nbsp;style='color:red;bad3'>test - bad3</a>"
    488             "<a href = test'&nbsp;style='color:red;bad4'  >test - bad4</a>"
    489         )
    490         expected = [
    491             ('starttag', 'a', [('href', "test'style='color:red;bad1'")]),
    492             ('data', 'test - bad1'), ('endtag', 'a'),
    493             ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]),
    494             ('data', 'test - bad2'), ('endtag', 'a'),
    495             ('starttag', 'a', [('href', u"test'\xa0style='color:red;bad3'")]),
    496             ('data', 'test - bad3'), ('endtag', 'a'),
    497             ('starttag', 'a', [('href', u"test'\xa0style='color:red;bad4'")]),
    498             ('data', 'test - bad4'), ('endtag', 'a')
    499         ]
    500         self._run_check(html, expected)
    501 
    502     def test_malformed_adjacent_attributes(self):
    503         # see #12629
    504         self._run_check('<x><y z=""o"" /></x>',
    505                         [('starttag', 'x', []),
    506                             ('startendtag', 'y', [('z', ''), ('o""', None)]),
    507                             ('endtag', 'x')])
    508         self._run_check('<x><y z="""" /></x>',
    509                         [('starttag', 'x', []),
    510                             ('startendtag', 'y', [('z', ''), ('""', None)]),
    511                             ('endtag', 'x')])
    512 
    513     # see #755670 for the following 3 tests
    514     def test_adjacent_attributes(self):
    515         self._run_check('<a width="100%"cellspacing=0>',
    516                         [("starttag", "a",
    517                           [("width", "100%"), ("cellspacing","0")])])
    518 
    519         self._run_check('<a id="foo"class="bar">',
    520                         [("starttag", "a",
    521                           [("id", "foo"), ("class","bar")])])
    522 
    523     def test_missing_attribute_value(self):
    524         self._run_check('<a v=>',
    525                         [("starttag", "a", [("v", "")])])
    526 
    527     def test_javascript_attribute_value(self):
    528         self._run_check("<a href=javascript:popup('/popup/help.html')>",
    529                         [("starttag", "a",
    530                           [("href", "javascript:popup('/popup/help.html')")])])
    531 
    532     def test_end_tag_in_attribute_value(self):
    533         # see #1745761
    534         self._run_check("<a href='http://www.example.org/\">;'>spam</a>",
    535                         [("starttag", "a",
    536                           [("href", "http://www.example.org/\">;")]),
    537                          ("data", "spam"), ("endtag", "a")])
    538 
    539     def test_comments(self):
    540         html = ("<!-- I'm a valid comment -->"
    541                 '<!--me too!-->'
    542                 '<!------>'
    543                 '<!---->'
    544                 '<!----I have many hyphens---->'
    545                 '<!-- I have a > in the middle -->'
    546                 '<!-- and I have -- in the middle! -->')
    547         expected = [('comment', " I'm a valid comment "),
    548                     ('comment', 'me too!'),
    549                     ('comment', '--'),
    550                     ('comment', ''),
    551                     ('comment', '--I have many hyphens--'),
    552                     ('comment', ' I have a > in the middle '),
    553                     ('comment', ' and I have -- in the middle! ')]
    554         self._run_check(html, expected)
    555 
    556     def test_broken_comments(self):
    557         html = ('<! not really a comment >'
    558                 '<! not a comment either -->'
    559                 '<! -- close enough -->'
    560                 '<!><!<-- this was an empty comment>'
    561                 '<!!! another bogus comment !!!>')
    562         expected = [
    563             ('comment', ' not really a comment '),
    564             ('comment', ' not a comment either --'),
    565             ('comment', ' -- close enough --'),
    566             ('comment', ''),
    567             ('comment', '<-- this was an empty comment'),
    568             ('comment', '!! another bogus comment !!!'),
    569         ]
    570         self._run_check(html, expected)
    571 
    572     def test_condcoms(self):
    573         html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->'
    574                 '<!--[if IE 8]>condcoms<![endif]-->'
    575                 '<!--[if lte IE 7]>pretty?<![endif]-->')
    576         expected = [('comment', "[if IE & !(lte IE 8)]>aren't<![endif]"),
    577                     ('comment', '[if IE 8]>condcoms<![endif]'),
    578                     ('comment', '[if lte IE 7]>pretty?<![endif]')]
    579         self._run_check(html, expected)
    580 
    581     def test_broken_condcoms(self):
    582         # these condcoms are missing the '--' after '<!' and before the '>'
    583         html = ('<![if !(IE)]>broken condcom<![endif]>'
    584                 '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
    585                 '<![if !IE 6]><img src="firefox.png" /><![endif]>'
    586                 '<![if !ie 6]><b>foo</b><![endif]>'
    587                 '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
    588         # According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
    589         # and "8.2.4.45 Markup declaration open state", comment tokens should
    590         # be emitted instead of 'unknown decl', but calling unknown_decl
    591         # provides more flexibility.
    592         # See also Lib/_markupbase.py:parse_declaration
    593         expected = [
    594             ('unknown decl', 'if !(IE)'),
    595             ('data', 'broken condcom'),
    596             ('unknown decl', 'endif'),
    597             ('unknown decl', 'if ! IE'),
    598             ('startendtag', 'link', [('href', 'favicon.tiff')]),
    599             ('unknown decl', 'endif'),
    600             ('unknown decl', 'if !IE 6'),
    601             ('startendtag', 'img', [('src', 'firefox.png')]),
    602             ('unknown decl', 'endif'),
    603             ('unknown decl', 'if !ie 6'),
    604             ('starttag', 'b', []),
    605             ('data', 'foo'),
    606             ('endtag', 'b'),
    607             ('unknown decl', 'endif'),
    608             ('unknown decl', 'if (!IE)|(lt IE 9)'),
    609             ('startendtag', 'img', [('src', 'mammoth.bmp')]),
    610             ('unknown decl', 'endif')
    611         ]
    612         self._run_check(html, expected)
    613 
    614 
    615 def test_main():
    616     test_support.run_unittest(HTMLParserTestCase, AttributesTestCase)
    617 
    618 
# Run the whole suite when this module is executed directly as a script.
if __name__ == "__main__":
    test_main()
    621