Home | History | Annotate | Download | only in Lib
      1 """Shared support for scanning document type declarations in HTML and XHTML.
      2 
      3 This module is used as a foundation for the html.parser module.  It has no
      4 documented public API and should not be used directly.
      5 
      6 """
      7 
      8 import re
      9 
# Matches a declaration name token (a letter followed by letters, digits,
# '-', '_' or '.') together with any whitespace that follows it.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Matches one single- or double-quoted string literal plus trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# Finds the end of a comment: "--", optional whitespace, then ">".
_commentclose = re.compile(r'--\s*>')
# Finds the end of a standard marked section: "]", "]" and ">", with optional
# whitespace allowed between them.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# Finds the end of an MS Office style marked section: "]", optional
# whitespace, then ">".
_msmarkedsectionclose = re.compile(r']\s*>')

# The re module is only needed to build the patterns above, so the name is
# removed from the module namespace once they are compiled.
del re
     21 
     22 
     23 class ParserBase:
     24     """Parser base class which provides some common support methods used
     25     by the SGML/HTML and XHTML parsers."""
     26 
     27     def __init__(self):
     28         if self.__class__ is ParserBase:
     29             raise RuntimeError(
     30                 "_markupbase.ParserBase must be subclassed")
     31 
     32     def error(self, message):
     33         raise NotImplementedError(
     34             "subclasses of ParserBase must override error()")
     35 
     36     def reset(self):
     37         self.lineno = 1
     38         self.offset = 0
     39 
     40     def getpos(self):
     41         """Return current line number and offset."""
     42         return self.lineno, self.offset
     43 
     44     # Internal -- update line number and offset.  This should be
     45     # called for each piece of data exactly once, in order -- in other
     46     # words the concatenation of all the input strings to this
     47     # function should be exactly the entire input.
     48     def updatepos(self, i, j):
     49         if i >= j:
     50             return j
     51         rawdata = self.rawdata
     52         nlines = rawdata.count("\n", i, j)
     53         if nlines:
     54             self.lineno = self.lineno + nlines
     55             pos = rawdata.rindex("\n", i, j) # Should not fail
     56             self.offset = j-(pos+1)
     57         else:
     58             self.offset = self.offset + j-i
     59         return j
     60 
     61     _decl_otherchars = ''
     62 
     63     # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!...>" declaration that starts at rawdata[i].

        Returns the index just past the closing ">" when a complete
        declaration was consumed, or a negative value when the buffer ends
        before the declaration is complete.  A complete "doctype"
        declaration is passed to self.handle_decl() (not defined in this
        class; presumably supplied by the subclass); any other complete
        declaration is passed to self.unknown_decl().  Unexpected
        characters are reported through self.error().
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        # NOTE(review): the check above already returned -1 whenever the
        # character at j is "-", so the '--' branch below cannot be taken.
        if rawdata[j:j+2] == '--': #comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[': #marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        else: #all other declaration elements
            # decltype is the lowercased declaration keyword; j moves past
            # it and any whitespace that follows.
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        # Consume the rest of the declaration one token at a time until the
        # closing ">" is found or the buffer runs out.
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                # quoted string literal (plus trailing whitespace)
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1 # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                # bare name token; the name itself is not used here
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in {"attlist", "linktype", "link", "element"}:
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
            # A negative j means one of the helpers above ran out of input.
            if j < 0:
                return j
        return -1 # incomplete
    143 
    144     # Internal -- parse a marked section
    145     # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    146     def parse_marked_section(self, i, report=1):
    147         rawdata= self.rawdata
    148         assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
    149         sectName, j = self._scan_name( i+3, i )
    150         if j < 0:
    151             return j
    152         if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}:
    153             # look for standard ]]> ending
    154             match= _markedsectionclose.search(rawdata, i+3)
    155         elif sectName in {"if", "else", "endif"}:
    156             # look for MS Office ]> ending
    157             match= _msmarkedsectionclose.search(rawdata, i+3)
    158         else:
    159             self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
    160         if not match:
    161             return -1
    162         if report:
    163             j = match.start(0)
    164             self.unknown_decl(rawdata[i+3: j])
    165         return match.end(0)
    166 
    # Internal -- parse comment, return the index just past its end, or -1 if not terminated
    168     def parse_comment(self, i, report=1):
    169         rawdata = self.rawdata
    170         if rawdata[i:i+4] != '<!--':
    171             self.error('unexpected call to parse_comment()')
    172         match = _commentclose.search(rawdata, i+4)
    173         if not match:
    174             return -1
    175         if report:
    176             j = match.start(0)
    177             self.handle_comment(rawdata[i+4: j])
    178         return match.end(0)
    179 
    180     # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    181     # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the internal subset of a DOCTYPE declaration.

        i is the index at which scanning starts (parse_declaration passes
        the position just after the opening "[").  declstartpos is the
        start of the enclosing declaration and is only used for position
        and error reporting.

        Returns the index of the ">" that follows the closing "]" and any
        whitespace, or a negative value when the buffer ends before the
        subset is complete.  Unexpected input is reported via self.error().
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                # a nested declaration or comment
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                # NOTE(review): this check is subsumed by the next one,
                # since j + 2 == n implies j + 4 > n.
                if (j + 2) == n:
                    # end of buffer; incomplete
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete
                    return -1
                if rawdata[j:j+4] == "<!--":
                    # comments inside the subset are skipped, not reported
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in {"attlist", "element", "entity", "notation"}:
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                # handle the individual names
                # (dispatches to the matching _parse_doctype_<name> method)
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                # the terminating ";" is optional here
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                # end of the subset: skip whitespace and expect ">"
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1
    247 
    248     # Internal -- scan past <!ELEMENT declarations
    249     def _parse_doctype_element(self, i, declstartpos):
    250         name, j = self._scan_name(i, declstartpos)
    251         if j == -1:
    252             return -1
    253         # style content model; just skip until '>'
    254         rawdata = self.rawdata
    255         if '>' in rawdata[j:]:
    256             return rawdata.find(">", j) + 1
    257         return -1
    258 
    259     # Internal -- scan past <!ATTLIST declarations
    260     def _parse_doctype_attlist(self, i, declstartpos):
    261         rawdata = self.rawdata
    262         name, j = self._scan_name(i, declstartpos)
    263         c = rawdata[j:j+1]
    264         if c == "":
    265             return -1
    266         if c == ">":
    267             return j + 1
    268         while 1:
    269             # scan a series of attribute descriptions; simplified:
    270             #   name type [value] [#constraint]
    271             name, j = self._scan_name(j, declstartpos)
    272             if j < 0:
    273                 return j
    274             c = rawdata[j:j+1]
    275             if c == "":
    276                 return -1
    277             if c == "(":
    278                 # an enumerated type; look for ')'
    279                 if ")" in rawdata[j:]:
    280                     j = rawdata.find(")", j) + 1
    281                 else:
    282                     return -1
    283                 while rawdata[j:j+1].isspace():
    284                     j = j + 1
    285                 if not rawdata[j:]:
    286                     # end of buffer, incomplete
    287                     return -1
    288             else:
    289                 name, j = self._scan_name(j, declstartpos)
    290             c = rawdata[j:j+1]
    291             if not c:
    292                 return -1
    293             if c in "'\"":
    294                 m = _declstringlit_match(rawdata, j)
    295                 if m:
    296                     j = m.end()
    297                 else:
    298                     return -1
    299                 c = rawdata[j:j+1]
    300                 if not c:
    301                     return -1
    302             if c == "#":
    303                 if rawdata[j:] == "#":
    304                     # end of buffer
    305                     return -1
    306                 name, j = self._scan_name(j + 1, declstartpos)
    307                 if j < 0:
    308                     return j
    309                 c = rawdata[j:j+1]
    310                 if not c:
    311                     return -1
    312             if c == '>':
    313                 # all done
    314                 return j + 1
    315 
    316     # Internal -- scan past <!NOTATION declarations
    317     def _parse_doctype_notation(self, i, declstartpos):
    318         name, j = self._scan_name(i, declstartpos)
    319         if j < 0:
    320             return j
    321         rawdata = self.rawdata
    322         while 1:
    323             c = rawdata[j:j+1]
    324             if not c:
    325                 # end of buffer; incomplete
    326                 return -1
    327             if c == '>':
    328                 return j + 1
    329             if c in "'\"":
    330                 m = _declstringlit_match(rawdata, j)
    331                 if not m:
    332                     return -1
    333                 j = m.end()
    334             else:
    335                 name, j = self._scan_name(j, declstartpos)
    336                 if j < 0:
    337                     return j
    338 
    339     # Internal -- scan past <!ENTITY declarations
    340     def _parse_doctype_entity(self, i, declstartpos):
    341         rawdata = self.rawdata
    342         if rawdata[i:i+1] == "%":
    343             j = i + 1
    344             while 1:
    345                 c = rawdata[j:j+1]
    346                 if not c:
    347                     return -1
    348                 if c.isspace():
    349                     j = j + 1
    350                 else:
    351                     break
    352         else:
    353             j = i
    354         name, j = self._scan_name(j, declstartpos)
    355         if j < 0:
    356             return j
    357         while 1:
    358             c = self.rawdata[j:j+1]
    359             if not c:
    360                 return -1
    361             if c in "'\"":
    362                 m = _declstringlit_match(rawdata, j)
    363                 if m:
    364                     j = m.end()
    365                 else:
    366                     return -1    # incomplete
    367             elif c == ">":
    368                 return j + 1
    369             else:
    370                 name, j = self._scan_name(j, declstartpos)
    371                 if j < 0:
    372                     return j
    373 
    # Internal -- scan a name token and return the lowercased token together
    # with the new position, or (None, -1) if we've reached the end of the buffer.
    376     def _scan_name(self, i, declstartpos):
    377         rawdata = self.rawdata
    378         n = len(rawdata)
    379         if i == n:
    380             return None, -1
    381         m = _declname_match(rawdata, i)
    382         if m:
    383             s = m.group()
    384             name = s.strip()
    385             if (i + len(s)) == n:
    386                 return None, -1  # end of buffer
    387             return name.lower(), m.end()
    388         else:
    389             self.updatepos(declstartpos, i)
    390             self.error("expected name token at %r"
    391                        % rawdata[declstartpos:declstartpos+20])
    392 
    393     # To be overridden -- handlers for unknown objects
    394     def unknown_decl(self, data):
    395         pass
    396