Home | History | Annotate | Download | only in python2.7
      1 """Shared support for scanning document type declarations in HTML and XHTML.
      2 
      3 This module is used as a foundation for the HTMLParser and sgmllib
      4 modules (indirectly, for htmllib as well).  It has no documented
      5 public API and should not be used directly.
      6 
      7 """
      8 
      9 import re
     10 
     11 _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
     12 _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
     13 _commentclose = re.compile(r'--\s*>')
     14 _markedsectionclose = re.compile(r']\s*]\s*>')
     15 
     16 # An analysis of the MS-Word extensions is available at
     17 # http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
     18 
     19 _msmarkedsectionclose = re.compile(r']\s*>')
     20 
     21 del re
     22 
     23 
     24 class ParserBase:
     25     """Parser base class which provides some common support methods used
     26     by the SGML/HTML and XHTML parsers."""
     27 
     28     def __init__(self):
     29         if self.__class__ is ParserBase:
     30             raise RuntimeError(
     31                 "markupbase.ParserBase must be subclassed")
     32 
     33     def error(self, message):
     34         raise NotImplementedError(
     35             "subclasses of ParserBase must override error()")
     36 
     37     def reset(self):
     38         self.lineno = 1
     39         self.offset = 0
     40 
     41     def getpos(self):
     42         """Return current line number and offset."""
     43         return self.lineno, self.offset
     44 
     45     # Internal -- update line number and offset.  This should be
     46     # called for each piece of data exactly once, in order -- in other
     47     # words the concatenation of all the input strings to this
     48     # function should be exactly the entire input.
     49     def updatepos(self, i, j):
     50         if i >= j:
     51             return j
     52         rawdata = self.rawdata
     53         nlines = rawdata.count("\n", i, j)
     54         if nlines:
     55             self.lineno = self.lineno + nlines
     56             pos = rawdata.rindex("\n", i, j) # Should not fail
     57             self.offset = j-(pos+1)
     58         else:
     59             self.offset = self.offset + j-i
     60         return j
     61 
     62     _decl_otherchars = ''
     63 
     64     # Internal -- parse declaration (for use by subclasses).
     65     def parse_declaration(self, i):
     66         # This is some sort of declaration; in "HTML as
     67         # deployed," this should only be the document type
     68         # declaration ("<!DOCTYPE html...>").
     69         # ISO 8879:1986, however, has more complex
     70         # declaration syntax for elements in <!...>, including:
     71         # --comment--
     72         # [marked section]
     73         # name in the following list: ENTITY, DOCTYPE, ELEMENT,
     74         # ATTLIST, NOTATION, SHORTREF, USEMAP,
     75         # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
     76         rawdata = self.rawdata
     77         j = i + 2
     78         assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
     79         if rawdata[j:j+1] == ">":
     80             # the empty comment <!>
     81             return j + 1
     82         if rawdata[j:j+1] in ("-", ""):
     83             # Start of comment followed by buffer boundary,
     84             # or just a buffer boundary.
     85             return -1
     86         # A simple, practical version could look like: ((name|stringlit) S*) + '>'
     87         n = len(rawdata)
     88         if rawdata[j:j+2] == '--': #comment
     89             # Locate --.*-- as the body of the comment
     90             return self.parse_comment(i)
     91         elif rawdata[j] == '[': #marked section
     92             # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
     93             # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
     94             # Note that this is extended by Microsoft Office "Save as Web" function
     95             # to include [if...] and [endif].
     96             return self.parse_marked_section(i)
     97         else: #all other declaration elements
     98             decltype, j = self._scan_name(j, i)
     99         if j < 0:
    100             return j
    101         if decltype == "doctype":
    102             self._decl_otherchars = ''
    103         while j < n:
    104             c = rawdata[j]
    105             if c == ">":
    106                 # end of declaration syntax
    107                 data = rawdata[i+2:j]
    108                 if decltype == "doctype":
    109                     self.handle_decl(data)
    110                 else:
    111                     # According to the HTML5 specs sections "8.2.4.44 Bogus
    112                     # comment state" and "8.2.4.45 Markup declaration open
    113                     # state", a comment token should be emitted.
    114                     # Calling unknown_decl provides more flexibility though.
    115                     self.unknown_decl(data)
    116                 return j + 1
    117             if c in "\"'":
    118                 m = _declstringlit_match(rawdata, j)
    119                 if not m:
    120                     return -1 # incomplete
    121                 j = m.end()
    122             elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
    123                 name, j = self._scan_name(j, i)
    124             elif c in self._decl_otherchars:
    125                 j = j + 1
    126             elif c == "[":
    127                 # this could be handled in a separate doctype parser
    128                 if decltype == "doctype":
    129                     j = self._parse_doctype_subset(j + 1, i)
    130                 elif decltype in ("attlist", "linktype", "link", "element"):
    131                     # must tolerate []'d groups in a content model in an element declaration
    132                     # also in data attribute specifications of attlist declaration
    133                     # also link type declaration subsets in linktype declarations
    134                     # also link attribute specification lists in link declarations
    135                     self.error("unsupported '[' char in %s declaration" % decltype)
    136                 else:
    137                     self.error("unexpected '[' char in declaration")
    138             else:
    139                 self.error(
    140                     "unexpected %r char in declaration" % rawdata[j])
    141             if j < 0:
    142                 return j
    143         return -1 # incomplete
    144 
    145     # Internal -- parse a marked section
    146     # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    147     def parse_marked_section(self, i, report=1):
    148         rawdata= self.rawdata
    149         assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
    150         sectName, j = self._scan_name( i+3, i )
    151         if j < 0:
    152             return j
    153         if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
    154             # look for standard ]]> ending
    155             match= _markedsectionclose.search(rawdata, i+3)
    156         elif sectName in ("if", "else", "endif"):
    157             # look for MS Office ]> ending
    158             match= _msmarkedsectionclose.search(rawdata, i+3)
    159         else:
    160             self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
    161         if not match:
    162             return -1
    163         if report:
    164             j = match.start(0)
    165             self.unknown_decl(rawdata[i+3: j])
    166         return match.end(0)
    167 
    168     # Internal -- parse comment, return length or -1 if not terminated
    169     def parse_comment(self, i, report=1):
    170         rawdata = self.rawdata
    171         if rawdata[i:i+4] != '<!--':
    172             self.error('unexpected call to parse_comment()')
    173         match = _commentclose.search(rawdata, i+4)
    174         if not match:
    175             return -1
    176         if report:
    177             j = match.start(0)
    178             self.handle_comment(rawdata[i+4: j])
    179         return match.end(0)
    180 
    181     # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    182     # returning the index just past any whitespace following the trailing ']'.
    183     def _parse_doctype_subset(self, i, declstartpos):
    184         rawdata = self.rawdata
    185         n = len(rawdata)
    186         j = i
    187         while j < n:
    188             c = rawdata[j]
    189             if c == "<":
    190                 s = rawdata[j:j+2]
    191                 if s == "<":
    192                     # end of buffer; incomplete
    193                     return -1
    194                 if s != "<!":
    195                     self.updatepos(declstartpos, j + 1)
    196                     self.error("unexpected char in internal subset (in %r)" % s)
    197                 if (j + 2) == n:
    198                     # end of buffer; incomplete
    199                     return -1
    200                 if (j + 4) > n:
    201                     # end of buffer; incomplete
    202                     return -1
    203                 if rawdata[j:j+4] == "<!--":
    204                     j = self.parse_comment(j, report=0)
    205                     if j < 0:
    206                         return j
    207                     continue
    208                 name, j = self._scan_name(j + 2, declstartpos)
    209                 if j == -1:
    210                     return -1
    211                 if name not in ("attlist", "element", "entity", "notation"):
    212                     self.updatepos(declstartpos, j + 2)
    213                     self.error(
    214                         "unknown declaration %r in internal subset" % name)
    215                 # handle the individual names
    216                 meth = getattr(self, "_parse_doctype_" + name)
    217                 j = meth(j, declstartpos)
    218                 if j < 0:
    219                     return j
    220             elif c == "%":
    221                 # parameter entity reference
    222                 if (j + 1) == n:
    223                     # end of buffer; incomplete
    224                     return -1
    225                 s, j = self._scan_name(j + 1, declstartpos)
    226                 if j < 0:
    227                     return j
    228                 if rawdata[j] == ";":
    229                     j = j + 1
    230             elif c == "]":
    231                 j = j + 1
    232                 while j < n and rawdata[j].isspace():
    233                     j = j + 1
    234                 if j < n:
    235                     if rawdata[j] == ">":
    236                         return j
    237                     self.updatepos(declstartpos, j)
    238                     self.error("unexpected char after internal subset")
    239                 else:
    240                     return -1
    241             elif c.isspace():
    242                 j = j + 1
    243             else:
    244                 self.updatepos(declstartpos, j)
    245                 self.error("unexpected char %r in internal subset" % c)
    246         # end of buffer reached
    247         return -1
    248 
    249     # Internal -- scan past <!ELEMENT declarations
    250     def _parse_doctype_element(self, i, declstartpos):
    251         name, j = self._scan_name(i, declstartpos)
    252         if j == -1:
    253             return -1
    254         # style content model; just skip until '>'
    255         rawdata = self.rawdata
    256         if '>' in rawdata[j:]:
    257             return rawdata.find(">", j) + 1
    258         return -1
    259 
    260     # Internal -- scan past <!ATTLIST declarations
    261     def _parse_doctype_attlist(self, i, declstartpos):
    262         rawdata = self.rawdata
    263         name, j = self._scan_name(i, declstartpos)
    264         c = rawdata[j:j+1]
    265         if c == "":
    266             return -1
    267         if c == ">":
    268             return j + 1
    269         while 1:
    270             # scan a series of attribute descriptions; simplified:
    271             #   name type [value] [#constraint]
    272             name, j = self._scan_name(j, declstartpos)
    273             if j < 0:
    274                 return j
    275             c = rawdata[j:j+1]
    276             if c == "":
    277                 return -1
    278             if c == "(":
    279                 # an enumerated type; look for ')'
    280                 if ")" in rawdata[j:]:
    281                     j = rawdata.find(")", j) + 1
    282                 else:
    283                     return -1
    284                 while rawdata[j:j+1].isspace():
    285                     j = j + 1
    286                 if not rawdata[j:]:
    287                     # end of buffer, incomplete
    288                     return -1
    289             else:
    290                 name, j = self._scan_name(j, declstartpos)
    291             c = rawdata[j:j+1]
    292             if not c:
    293                 return -1
    294             if c in "'\"":
    295                 m = _declstringlit_match(rawdata, j)
    296                 if m:
    297                     j = m.end()
    298                 else:
    299                     return -1
    300                 c = rawdata[j:j+1]
    301                 if not c:
    302                     return -1
    303             if c == "#":
    304                 if rawdata[j:] == "#":
    305                     # end of buffer
    306                     return -1
    307                 name, j = self._scan_name(j + 1, declstartpos)
    308                 if j < 0:
    309                     return j
    310                 c = rawdata[j:j+1]
    311                 if not c:
    312                     return -1
    313             if c == '>':
    314                 # all done
    315                 return j + 1
    316 
    317     # Internal -- scan past <!NOTATION declarations
    318     def _parse_doctype_notation(self, i, declstartpos):
    319         name, j = self._scan_name(i, declstartpos)
    320         if j < 0:
    321             return j
    322         rawdata = self.rawdata
    323         while 1:
    324             c = rawdata[j:j+1]
    325             if not c:
    326                 # end of buffer; incomplete
    327                 return -1
    328             if c == '>':
    329                 return j + 1
    330             if c in "'\"":
    331                 m = _declstringlit_match(rawdata, j)
    332                 if not m:
    333                     return -1
    334                 j = m.end()
    335             else:
    336                 name, j = self._scan_name(j, declstartpos)
    337                 if j < 0:
    338                     return j
    339 
    340     # Internal -- scan past <!ENTITY declarations
    341     def _parse_doctype_entity(self, i, declstartpos):
    342         rawdata = self.rawdata
    343         if rawdata[i:i+1] == "%":
    344             j = i + 1
    345             while 1:
    346                 c = rawdata[j:j+1]
    347                 if not c:
    348                     return -1
    349                 if c.isspace():
    350                     j = j + 1
    351                 else:
    352                     break
    353         else:
    354             j = i
    355         name, j = self._scan_name(j, declstartpos)
    356         if j < 0:
    357             return j
    358         while 1:
    359             c = self.rawdata[j:j+1]
    360             if not c:
    361                 return -1
    362             if c in "'\"":
    363                 m = _declstringlit_match(rawdata, j)
    364                 if m:
    365                     j = m.end()
    366                 else:
    367                     return -1    # incomplete
    368             elif c == ">":
    369                 return j + 1
    370             else:
    371                 name, j = self._scan_name(j, declstartpos)
    372                 if j < 0:
    373                     return j
    374 
    375     # Internal -- scan a name token and the new position and the token, or
    376     # return -1 if we've reached the end of the buffer.
    377     def _scan_name(self, i, declstartpos):
    378         rawdata = self.rawdata
    379         n = len(rawdata)
    380         if i == n:
    381             return None, -1
    382         m = _declname_match(rawdata, i)
    383         if m:
    384             s = m.group()
    385             name = s.strip()
    386             if (i + len(s)) == n:
    387                 return None, -1  # end of buffer
    388             return name.lower(), m.end()
    389         else:
    390             self.updatepos(declstartpos, i)
    391             self.error("expected name token at %r"
    392                        % rawdata[declstartpos:declstartpos+20])
    393 
    394     # To be overridden -- handlers for unknown objects
    395     def unknown_decl(self, data):
    396         pass
    397