Home | History | Annotate | Download | only in test
      1 import pprint
      2 import re
      3 import unittest
      4 from test import test_support
      5 sgmllib = test_support.import_module('sgmllib', deprecated=True)
      6 
      7 
      8 class EventCollector(sgmllib.SGMLParser):
      9 
     10     def __init__(self):
     11         self.events = []
     12         self.append = self.events.append
     13         sgmllib.SGMLParser.__init__(self)
     14 
     15     def get_events(self):
     16         # Normalize the list of events so that buffer artefacts don't
     17         # separate runs of contiguous characters.
     18         L = []
     19         prevtype = None
     20         for event in self.events:
     21             type = event[0]
     22             if type == prevtype == "data":
     23                 L[-1] = ("data", L[-1][1] + event[1])
     24             else:
     25                 L.append(event)
     26             prevtype = type
     27         self.events = L
     28         return L
     29 
     30     # structure markup
     31 
     32     def unknown_starttag(self, tag, attrs):
     33         self.append(("starttag", tag, attrs))
     34 
     35     def unknown_endtag(self, tag):
     36         self.append(("endtag", tag))
     37 
     38     # all other markup
     39 
     40     def handle_comment(self, data):
     41         self.append(("comment", data))
     42 
     43     def handle_charref(self, data):
     44         self.append(("charref", data))
     45 
     46     def handle_data(self, data):
     47         self.append(("data", data))
     48 
     49     def handle_decl(self, decl):
     50         self.append(("decl", decl))
     51 
     52     def handle_entityref(self, data):
     53         self.append(("entityref", data))
     54 
     55     def handle_pi(self, data):
     56         self.append(("pi", data))
     57 
     58     def unknown_decl(self, decl):
     59         self.append(("unknown decl", decl))
     60 
     61 
     62 class CDATAEventCollector(EventCollector):
     63     def start_cdata(self, attrs):
     64         self.append(("starttag", "cdata", attrs))
     65         self.setliteral()
     66 
     67 
     68 class HTMLEntityCollector(EventCollector):
     69 
     70     entity_or_charref = re.compile('(?:&([a-zA-Z][-.a-zA-Z0-9]*)'
     71         '|&#(x[0-9a-zA-Z]+|[0-9]+))(;?)')
     72 
     73     def convert_charref(self, name):
     74         self.append(("charref", "convert", name))
     75         if name[0] != "x":
     76             return EventCollector.convert_charref(self, name)
     77 
     78     def convert_codepoint(self, codepoint):
     79         self.append(("codepoint", "convert", codepoint))
     80         EventCollector.convert_codepoint(self, codepoint)
     81 
     82     def convert_entityref(self, name):
     83         self.append(("entityref", "convert", name))
     84         return EventCollector.convert_entityref(self, name)
     85 
     86     # These to record that they were called, then pass the call along
     87     # to the default implementation so that it's actions can be
     88     # recorded.
     89 
     90     def handle_charref(self, data):
     91         self.append(("charref", data))
     92         sgmllib.SGMLParser.handle_charref(self, data)
     93 
     94     def handle_entityref(self, data):
     95         self.append(("entityref", data))
     96         sgmllib.SGMLParser.handle_entityref(self, data)
     97 
     98 
     99 class SGMLParserTestCase(unittest.TestCase):
    100 
    101     collector = EventCollector
    102 
    103     def get_events(self, source):
    104         parser = self.collector()
    105         try:
    106             for s in source:
    107                 parser.feed(s)
    108             parser.close()
    109         except:
    110             #self.events = parser.events
    111             raise
    112         return parser.get_events()
    113 
    114     def check_events(self, source, expected_events):
    115         try:
    116             events = self.get_events(source)
    117         except:
    118             #import sys
    119             #print >>sys.stderr, pprint.pformat(self.events)
    120             raise
    121         if events != expected_events:
    122             self.fail("received events did not match expected events\n"
    123                       "Expected:\n" + pprint.pformat(expected_events) +
    124                       "\nReceived:\n" + pprint.pformat(events))
    125 
    126     def check_parse_error(self, source):
    127         parser = EventCollector()
    128         try:
    129             parser.feed(source)
    130             parser.close()
    131         except sgmllib.SGMLParseError:
    132             pass
    133         else:
    134             self.fail("expected SGMLParseError for %r\nReceived:\n%s"
    135                       % (source, pprint.pformat(parser.get_events())))
    136 
    137     def test_doctype_decl_internal(self):
    138         inside = """\
    139 DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'
    140              SYSTEM 'http://www.w3.org/TR/html401/strict.dtd' [
    141   <!ELEMENT html - O EMPTY>
    142   <!ATTLIST html
    143       version CDATA #IMPLIED
    144       profile CDATA 'DublinCore'>
    145   <!NOTATION datatype SYSTEM 'http://xml.python.org/notations/python-module'>
    146   <!ENTITY myEntity 'internal parsed entity'>
    147   <!ENTITY anEntity SYSTEM 'http://xml.python.org/entities/something.xml'>
    148   <!ENTITY % paramEntity 'name|name|name'>
    149   %paramEntity;
    150   <!-- comment -->
    151 ]"""
    152         self.check_events(["<!%s>" % inside], [
    153             ("decl", inside),
    154             ])
    155 
    156     def test_doctype_decl_external(self):
    157         inside = "DOCTYPE html PUBLIC '-//W3C//DTD HTML 4.01//EN'"
    158         self.check_events("<!%s>" % inside, [
    159             ("decl", inside),
    160             ])
    161 
    162     def test_underscore_in_attrname(self):
    163         # SF bug #436621
    164         """Make sure attribute names with underscores are accepted"""
    165         self.check_events("<a has_under _under>", [
    166             ("starttag", "a", [("has_under", "has_under"),
    167                                ("_under", "_under")]),
    168             ])
    169 
    170     def test_underscore_in_tagname(self):
    171         # SF bug #436621
    172         """Make sure tag names with underscores are accepted"""
    173         self.check_events("<has_under></has_under>", [
    174             ("starttag", "has_under", []),
    175             ("endtag", "has_under"),
    176             ])
    177 
    178     def test_quotes_in_unquoted_attrs(self):
    179         # SF bug #436621
    180         """Be sure quotes in unquoted attributes are made part of the value"""
    181         self.check_events("<a href=foo'bar\"baz>", [
    182             ("starttag", "a", [("href", "foo'bar\"baz")]),
    183             ])
    184 
    185     def test_xhtml_empty_tag(self):
    186         """Handling of XHTML-style empty start tags"""
    187         self.check_events("<br />text<i></i>", [
    188             ("starttag", "br", []),
    189             ("data", "text"),
    190             ("starttag", "i", []),
    191             ("endtag", "i"),
    192             ])
    193 
    194     def test_processing_instruction_only(self):
    195         self.check_events("<?processing instruction>", [
    196             ("pi", "processing instruction"),
    197             ])
    198 
    199     def test_bad_nesting(self):
    200         self.check_events("<a><b></a></b>", [
    201             ("starttag", "a", []),
    202             ("starttag", "b", []),
    203             ("endtag", "a"),
    204             ("endtag", "b"),
    205             ])
    206 
    207     def test_bare_ampersands(self):
    208         self.check_events("this text & contains & ampersands &", [
    209             ("data", "this text & contains & ampersands &"),
    210             ])
    211 
    212     def test_bare_pointy_brackets(self):
    213         self.check_events("this < text > contains < bare>pointy< brackets", [
    214             ("data", "this < text > contains < bare>pointy< brackets"),
    215             ])
    216 
    217     def test_attr_syntax(self):
    218         output = [
    219           ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", "e")])
    220           ]
    221         self.check_events("""<a b='v' c="v" d=v e>""", output)
    222         self.check_events("""<a  b = 'v' c = "v" d = v e>""", output)
    223         self.check_events("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
    224         self.check_events("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
    225 
    226     def test_attr_values(self):
    227         self.check_events("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
    228                         [("starttag", "a", [("b", "xxx\n\txxx"),
    229                                             ("c", "yyy\t\nyyy"),
    230                                             ("d", "\txyz\n")])
    231                          ])
    232         self.check_events("""<a b='' c="">""", [
    233             ("starttag", "a", [("b", ""), ("c", "")]),
    234             ])
    235         # URL construction stuff from RFC 1808:
    236         safe = "$-_.+"
    237         extra = "!*'(),"
    238         reserved = ";/?:@&="
    239         url = "http://example.com:8080/path/to/file?%s%s%s" % (
    240             safe, extra, reserved)
    241         self.check_events("""<e a=%s>""" % url, [
    242             ("starttag", "e", [("a", url)]),
    243             ])
    244         # Regression test for SF patch #669683.
    245         self.check_events("<e a=rgb(1,2,3)>", [
    246             ("starttag", "e", [("a", "rgb(1,2,3)")]),
    247             ])
    248 
    249     def test_attr_values_entities(self):
    250         """Substitution of entities and charrefs in attribute values"""
    251         # SF bug #1452246
    252         self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
    253                                 f="&xxx;" g='&#32;&#33;' h='&#500;'
    254                                 i='x?a=b&c=d;'
    255                                 j='&amp;#42;' k='&#38;#42;'>""",
    256             [("starttag", "a", [("b", "<"),
    257                                 ("c", "<>"),
    258                                 ("d", "&lt->"),
    259                                 ("e", "< "),
    260                                 ("f", "&xxx;"),
    261                                 ("g", " !"),
    262                                 ("h", "&#500;"),
    263                                 ("i", "x?a=b&c=d;"),
    264                                 ("j", "&#42;"),
    265                                 ("k", "&#42;"),
    266                                 ])])
    267 
    268     def test_convert_overrides(self):
    269         # This checks that the character and entity reference
    270         # conversion helpers are called at the documented times.  No
    271         # attempt is made to really change what the parser accepts.
    272         #
    273         self.collector = HTMLEntityCollector
    274         self.check_events(('<a title="&ldquo;test&#x201d;">foo</a>'
    275                            '&foobar;&#42;'), [
    276             ('entityref', 'convert', 'ldquo'),
    277             ('charref', 'convert', 'x201d'),
    278             ('starttag', 'a', [('title', '&ldquo;test&#x201d;')]),
    279             ('data', 'foo'),
    280             ('endtag', 'a'),
    281             ('entityref', 'foobar'),
    282             ('entityref', 'convert', 'foobar'),
    283             ('charref', '42'),
    284             ('charref', 'convert', '42'),
    285             ('codepoint', 'convert', 42),
    286             ])
    287 
    288     def test_attr_funky_names(self):
    289         self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
    290             ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
    291             ])
    292 
    293     def test_attr_value_ip6_url(self):
    294         # http://www.python.org/sf/853506
    295         self.check_events(("<a href='http://[1080::8:800:200C:417A]/'>"
    296                            "<a href=http://[1080::8:800:200C:417A]/>"), [
    297             ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
    298             ("starttag", "a", [("href", "http://[1080::8:800:200C:417A]/")]),
    299             ])
    300 
    301     def test_weird_starttags(self):
    302         self.check_events("<a<a>", [
    303             ("starttag", "a", []),
    304             ("starttag", "a", []),
    305             ])
    306         self.check_events("</a<a>", [
    307             ("endtag", "a"),
    308             ("starttag", "a", []),
    309             ])
    310 
    311     def test_declaration_junk_chars(self):
    312         self.check_parse_error("<!DOCTYPE foo $ >")
    313 
    314     def test_get_starttag_text(self):
    315         s = """<foobar   \n   one="1"\ttwo=2   >"""
    316         self.check_events(s, [
    317             ("starttag", "foobar", [("one", "1"), ("two", "2")]),
    318             ])
    319 
    320     def test_cdata_content(self):
    321         s = ("<cdata> <!-- not a comment --> &not-an-entity-ref; </cdata>"
    322              "<notcdata> <!-- comment --> </notcdata>")
    323         self.collector = CDATAEventCollector
    324         self.check_events(s, [
    325             ("starttag", "cdata", []),
    326             ("data", " <!-- not a comment --> &not-an-entity-ref; "),
    327             ("endtag", "cdata"),
    328             ("starttag", "notcdata", []),
    329             ("data", " "),
    330             ("comment", " comment "),
    331             ("data", " "),
    332             ("endtag", "notcdata"),
    333             ])
    334         s = """<cdata> <not a='start tag'> </cdata>"""
    335         self.check_events(s, [
    336             ("starttag", "cdata", []),
    337             ("data", " <not a='start tag'> "),
    338             ("endtag", "cdata"),
    339             ])
    340 
    341     def test_illegal_declarations(self):
    342         s = 'abc<!spacer type="block" height="25">def'
    343         self.check_events(s, [
    344             ("data", "abc"),
    345             ("unknown decl", 'spacer type="block" height="25"'),
    346             ("data", "def"),
    347             ])
    348 
    349     def test_enumerated_attr_type(self):
    350         s = "<!DOCTYPE doc [<!ATTLIST doc attr (a | b) >]>"
    351         self.check_events(s, [
    352             ('decl', 'DOCTYPE doc [<!ATTLIST doc attr (a | b) >]'),
    353             ])
    354 
    355     def test_read_chunks(self):
    356         # SF bug #1541697, this caused sgml parser to hang
    357         # Just verify this code doesn't cause a hang.
    358         CHUNK = 1024  # increasing this to 8212 makes the problem go away
    359 
    360         f = open(test_support.findfile('sgml_input.html'))
    361         fp = sgmllib.SGMLParser()
    362         while 1:
    363             data = f.read(CHUNK)
    364             fp.feed(data)
    365             if len(data) != CHUNK:
    366                 break
    367 
    368     def test_only_decode_ascii(self):
    369         # SF bug #1651995, make sure non-ascii character references are not decoded
    370         s = '<signs exclamation="&#33" copyright="&#169" quoteleft="&#8216;">'
    371         self.check_events(s, [
    372             ('starttag', 'signs',
    373              [('exclamation', '!'), ('copyright', '&#169'),
    374               ('quoteleft', '&#8216;')]),
    375             ])
    376 
    377     # XXX These tests have been disabled by prefixing their names with
    378     # an underscore.  The first two exercise outstanding bugs in the
    379     # sgmllib module, and the third exhibits questionable behavior
    380     # that needs to be carefully considered before changing it.
    381 
    382     def _test_starttag_end_boundary(self):
    383         self.check_events("<a b='<'>", [("starttag", "a", [("b", "<")])])
    384         self.check_events("<a b='>'>", [("starttag", "a", [("b", ">")])])
    385 
    386     def _test_buffer_artefacts(self):
    387         output = [("starttag", "a", [("b", "<")])]
    388         self.check_events(["<a b='<'>"], output)
    389         self.check_events(["<a ", "b='<'>"], output)
    390         self.check_events(["<a b", "='<'>"], output)
    391         self.check_events(["<a b=", "'<'>"], output)
    392         self.check_events(["<a b='<", "'>"], output)
    393         self.check_events(["<a b='<'", ">"], output)
    394 
    395         output = [("starttag", "a", [("b", ">")])]
    396         self.check_events(["<a b='>'>"], output)
    397         self.check_events(["<a ", "b='>'>"], output)
    398         self.check_events(["<a b", "='>'>"], output)
    399         self.check_events(["<a b=", "'>'>"], output)
    400         self.check_events(["<a b='>", "'>"], output)
    401         self.check_events(["<a b='>'", ">"], output)
    402 
    403         output = [("comment", "abc")]
    404         self.check_events(["", "<!--abc-->"], output)
    405         self.check_events(["<", "!--abc-->"], output)
    406         self.check_events(["<!", "--abc-->"], output)
    407         self.check_events(["<!-", "-abc-->"], output)
    408         self.check_events(["<!--", "abc-->"], output)
    409         self.check_events(["<!--a", "bc-->"], output)
    410         self.check_events(["<!--ab", "c-->"], output)
    411         self.check_events(["<!--abc", "-->"], output)
    412         self.check_events(["<!--abc-", "->"], output)
    413         self.check_events(["<!--abc--", ">"], output)
    414         self.check_events(["<!--abc-->", ""], output)
    415 
    416     def _test_starttag_junk_chars(self):
    417         self.check_parse_error("<")
    418         self.check_parse_error("<>")
    419         self.check_parse_error("</$>")
    420         self.check_parse_error("</")
    421         self.check_parse_error("</a")
    422         self.check_parse_error("<$")
    423         self.check_parse_error("<$>")
    424         self.check_parse_error("<!")
    425         self.check_parse_error("<a $>")
    426         self.check_parse_error("<a")
    427         self.check_parse_error("<a foo='bar'")
    428         self.check_parse_error("<a foo='bar")
    429         self.check_parse_error("<a foo='>'")
    430         self.check_parse_error("<a foo='>")
    431         self.check_parse_error("<a foo=>")
    432 
    433 
    434 def test_main():
    435     test_support.run_unittest(SGMLParserTestCase)
    436 
    437 
    438 if __name__ == "__main__":
    439     test_main()
    440