Home | History | Annotate | Download | only in Lib
      1 """Shared support for scanning document type declarations in HTML and XHTML.
      2 
      3 This module is used as a foundation for the HTMLParser and sgmllib
      4 modules (indirectly, for htmllib as well).  It has no documented
      5 public API and should not be used directly.
      6 
      7 """
      8 
      9 import re
     10 
     11 _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
     12 _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
     13 _commentclose = re.compile(r'--\s*>')
     14 _markedsectionclose = re.compile(r']\s*]\s*>')
     15 
     16 # An analysis of the MS-Word extensions is available at

     17 # http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

     18 
     19 _msmarkedsectionclose = re.compile(r']\s*>')
     20 
     21 del re
     22 
     23 
     24 class ParserBase:
     25     """Parser base class which provides some common support methods used
     26     by the SGML/HTML and XHTML parsers."""
     27 
     28     def __init__(self):
     29         if self.__class__ is ParserBase:
     30             raise RuntimeError(
     31                 "markupbase.ParserBase must be subclassed")
     32 
     33     def error(self, message):
     34         raise NotImplementedError(
     35             "subclasses of ParserBase must override error()")
     36 
     37     def reset(self):
     38         self.lineno = 1
     39         self.offset = 0
     40 
     41     def getpos(self):
     42         """Return current line number and offset."""
     43         return self.lineno, self.offset
     44 
     45     # Internal -- update line number and offset.  This should be

     46     # called for each piece of data exactly once, in order -- in other

     47     # words the concatenation of all the input strings to this

     48     # function should be exactly the entire input.

     49     def updatepos(self, i, j):
     50         if i >= j:
     51             return j
     52         rawdata = self.rawdata
     53         nlines = rawdata.count("\n", i, j)
     54         if nlines:
     55             self.lineno = self.lineno + nlines
     56             pos = rawdata.rindex("\n", i, j) # Should not fail

     57             self.offset = j-(pos+1)
     58         else:
     59             self.offset = self.offset + j-i
     60         return j
     61 
     62     _decl_otherchars = ''
     63 
     64     # Internal -- parse declaration (for use by subclasses).

     65     def parse_declaration(self, i):
     66         # This is some sort of declaration; in "HTML as

     67         # deployed," this should only be the document type

     68         # declaration ("<!DOCTYPE html...>").

     69         # ISO 8879:1986, however, has more complex

     70         # declaration syntax for elements in <!...>, including:

     71         # --comment--

     72         # [marked section]

     73         # name in the following list: ENTITY, DOCTYPE, ELEMENT,

     74         # ATTLIST, NOTATION, SHORTREF, USEMAP,

     75         # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM

     76         rawdata = self.rawdata
     77         j = i + 2
     78         assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
     79         if rawdata[j:j+1] == ">":
     80             # the empty comment <!>

     81             return j + 1
     82         if rawdata[j:j+1] in ("-", ""):
     83             # Start of comment followed by buffer boundary,

     84             # or just a buffer boundary.

     85             return -1
     86         # A simple, practical version could look like: ((name|stringlit) S*) + '>'

     87         n = len(rawdata)
     88         if rawdata[j:j+2] == '--': #comment

     89             # Locate --.*-- as the body of the comment

     90             return self.parse_comment(i)
     91         elif rawdata[j] == '[': #marked section

     92             # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section

     93             # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA

     94             # Note that this is extended by Microsoft Office "Save as Web" function

     95             # to include [if...] and [endif].

     96             return self.parse_marked_section(i)
     97         else: #all other declaration elements

     98             decltype, j = self._scan_name(j, i)
     99         if j < 0:
    100             return j
    101         if decltype == "doctype":
    102             self._decl_otherchars = ''
    103         while j < n:
    104             c = rawdata[j]
    105             if c == ">":
    106                 # end of declaration syntax

    107                 data = rawdata[i+2:j]
    108                 if decltype == "doctype":
    109                     self.handle_decl(data)
    110                 else:
    111                     self.unknown_decl(data)
    112                 return j + 1
    113             if c in "\"'":
    114                 m = _declstringlit_match(rawdata, j)
    115                 if not m:
    116                     return -1 # incomplete

    117                 j = m.end()
    118             elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
    119                 name, j = self._scan_name(j, i)
    120             elif c in self._decl_otherchars:
    121                 j = j + 1
    122             elif c == "[":
    123                 # this could be handled in a separate doctype parser

    124                 if decltype == "doctype":
    125                     j = self._parse_doctype_subset(j + 1, i)
    126                 elif decltype in ("attlist", "linktype", "link", "element"):
    127                     # must tolerate []'d groups in a content model in an element declaration

    128                     # also in data attribute specifications of attlist declaration

    129                     # also link type declaration subsets in linktype declarations

    130                     # also link attribute specification lists in link declarations

    131                     self.error("unsupported '[' char in %s declaration" % decltype)
    132                 else:
    133                     self.error("unexpected '[' char in declaration")
    134             else:
    135                 self.error(
    136                     "unexpected %r char in declaration" % rawdata[j])
    137             if j < 0:
    138                 return j
    139         return -1 # incomplete

    140 
    141     # Internal -- parse a marked section

    142     # Override this to handle MS-word extension syntax <![if word]>content<![endif]>

    143     def parse_marked_section(self, i, report=1):
    144         rawdata= self.rawdata
    145         assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
    146         sectName, j = self._scan_name( i+3, i )
    147         if j < 0:
    148             return j
    149         if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
    150             # look for standard ]]> ending

    151             match= _markedsectionclose.search(rawdata, i+3)
    152         elif sectName in ("if", "else", "endif"):
    153             # look for MS Office ]> ending

    154             match= _msmarkedsectionclose.search(rawdata, i+3)
    155         else:
    156             self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
    157         if not match:
    158             return -1
    159         if report:
    160             j = match.start(0)
    161             self.unknown_decl(rawdata[i+3: j])
    162         return match.end(0)
    163 
    164     # Internal -- parse comment, return the index just past it or -1 if not terminated

    165     def parse_comment(self, i, report=1):
    166         rawdata = self.rawdata
    167         if rawdata[i:i+4] != '<!--':
    168             self.error('unexpected call to parse_comment()')
    169         match = _commentclose.search(rawdata, i+4)
    170         if not match:
    171             return -1
    172         if report:
    173             j = match.start(0)
    174             self.handle_comment(rawdata[i+4: j])
    175         return match.end(0)
    176 
    177     # Internal -- scan past the internal subset in a <!DOCTYPE declaration,

    178     # returning the index just past any whitespace following the trailing ']'.

    179     def _parse_doctype_subset(self, i, declstartpos):
    180         rawdata = self.rawdata
    181         n = len(rawdata)
    182         j = i
    183         while j < n:
    184             c = rawdata[j]
    185             if c == "<":
    186                 s = rawdata[j:j+2]
    187                 if s == "<":
    188                     # end of buffer; incomplete

    189                     return -1
    190                 if s != "<!":
    191                     self.updatepos(declstartpos, j + 1)
    192                     self.error("unexpected char in internal subset (in %r)" % s)
    193                 if (j + 2) == n:
    194                     # end of buffer; incomplete

    195                     return -1
    196                 if (j + 4) > n:
    197                     # end of buffer; incomplete

    198                     return -1
    199                 if rawdata[j:j+4] == "<!--":
    200                     j = self.parse_comment(j, report=0)
    201                     if j < 0:
    202                         return j
    203                     continue
    204                 name, j = self._scan_name(j + 2, declstartpos)
    205                 if j == -1:
    206                     return -1
    207                 if name not in ("attlist", "element", "entity", "notation"):
    208                     self.updatepos(declstartpos, j + 2)
    209                     self.error(
    210                         "unknown declaration %r in internal subset" % name)
    211                 # handle the individual names

    212                 meth = getattr(self, "_parse_doctype_" + name)
    213                 j = meth(j, declstartpos)
    214                 if j < 0:
    215                     return j
    216             elif c == "%":
    217                 # parameter entity reference

    218                 if (j + 1) == n:
    219                     # end of buffer; incomplete

    220                     return -1
    221                 s, j = self._scan_name(j + 1, declstartpos)
    222                 if j < 0:
    223                     return j
    224                 if rawdata[j] == ";":
    225                     j = j + 1
    226             elif c == "]":
    227                 j = j + 1
    228                 while j < n and rawdata[j].isspace():
    229                     j = j + 1
    230                 if j < n:
    231                     if rawdata[j] == ">":
    232                         return j
    233                     self.updatepos(declstartpos, j)
    234                     self.error("unexpected char after internal subset")
    235                 else:
    236                     return -1
    237             elif c.isspace():
    238                 j = j + 1
    239             else:
    240                 self.updatepos(declstartpos, j)
    241                 self.error("unexpected char %r in internal subset" % c)
    242         # end of buffer reached

    243         return -1
    244 
    245     # Internal -- scan past <!ELEMENT declarations

    246     def _parse_doctype_element(self, i, declstartpos):
    247         name, j = self._scan_name(i, declstartpos)
    248         if j == -1:
    249             return -1
    250         # style content model; just skip until '>'

    251         rawdata = self.rawdata
    252         if '>' in rawdata[j:]:
    253             return rawdata.find(">", j) + 1
    254         return -1
    255 
    256     # Internal -- scan past <!ATTLIST declarations

    257     def _parse_doctype_attlist(self, i, declstartpos):
    258         rawdata = self.rawdata
    259         name, j = self._scan_name(i, declstartpos)
    260         c = rawdata[j:j+1]
    261         if c == "":
    262             return -1
    263         if c == ">":
    264             return j + 1
    265         while 1:
    266             # scan a series of attribute descriptions; simplified:

    267             #   name type [value] [#constraint]

    268             name, j = self._scan_name(j, declstartpos)
    269             if j < 0:
    270                 return j
    271             c = rawdata[j:j+1]
    272             if c == "":
    273                 return -1
    274             if c == "(":
    275                 # an enumerated type; look for ')'

    276                 if ")" in rawdata[j:]:
    277                     j = rawdata.find(")", j) + 1
    278                 else:
    279                     return -1
    280                 while rawdata[j:j+1].isspace():
    281                     j = j + 1
    282                 if not rawdata[j:]:
    283                     # end of buffer, incomplete

    284                     return -1
    285             else:
    286                 name, j = self._scan_name(j, declstartpos)
    287             c = rawdata[j:j+1]
    288             if not c:
    289                 return -1
    290             if c in "'\"":
    291                 m = _declstringlit_match(rawdata, j)
    292                 if m:
    293                     j = m.end()
    294                 else:
    295                     return -1
    296                 c = rawdata[j:j+1]
    297                 if not c:
    298                     return -1
    299             if c == "#":
    300                 if rawdata[j:] == "#":
    301                     # end of buffer

    302                     return -1
    303                 name, j = self._scan_name(j + 1, declstartpos)
    304                 if j < 0:
    305                     return j
    306                 c = rawdata[j:j+1]
    307                 if not c:
    308                     return -1
    309             if c == '>':
    310                 # all done

    311                 return j + 1
    312 
    313     # Internal -- scan past <!NOTATION declarations

    314     def _parse_doctype_notation(self, i, declstartpos):
    315         name, j = self._scan_name(i, declstartpos)
    316         if j < 0:
    317             return j
    318         rawdata = self.rawdata
    319         while 1:
    320             c = rawdata[j:j+1]
    321             if not c:
    322                 # end of buffer; incomplete

    323                 return -1
    324             if c == '>':
    325                 return j + 1
    326             if c in "'\"":
    327                 m = _declstringlit_match(rawdata, j)
    328                 if not m:
    329                     return -1
    330                 j = m.end()
    331             else:
    332                 name, j = self._scan_name(j, declstartpos)
    333                 if j < 0:
    334                     return j
    335 
    336     # Internal -- scan past <!ENTITY declarations

    337     def _parse_doctype_entity(self, i, declstartpos):
    338         rawdata = self.rawdata
    339         if rawdata[i:i+1] == "%":
    340             j = i + 1
    341             while 1:
    342                 c = rawdata[j:j+1]
    343                 if not c:
    344                     return -1
    345                 if c.isspace():
    346                     j = j + 1
    347                 else:
    348                     break
    349         else:
    350             j = i
    351         name, j = self._scan_name(j, declstartpos)
    352         if j < 0:
    353             return j
    354         while 1:
    355             c = self.rawdata[j:j+1]
    356             if not c:
    357                 return -1
    358             if c in "'\"":
    359                 m = _declstringlit_match(rawdata, j)
    360                 if m:
    361                     j = m.end()
    362                 else:
    363                     return -1    # incomplete

    364             elif c == ">":
    365                 return j + 1
    366             else:
    367                 name, j = self._scan_name(j, declstartpos)
    368                 if j < 0:
    369                     return j
    370 
    371     # Internal -- scan a name token; return the token and the new position,

    372     # or (None, -1) if we've reached the end of the buffer.

    373     def _scan_name(self, i, declstartpos):
    374         rawdata = self.rawdata
    375         n = len(rawdata)
    376         if i == n:
    377             return None, -1
    378         m = _declname_match(rawdata, i)
    379         if m:
    380             s = m.group()
    381             name = s.strip()
    382             if (i + len(s)) == n:
    383                 return None, -1  # end of buffer

    384             return name.lower(), m.end()
    385         else:
    386             self.updatepos(declstartpos, i)
    387             self.error("expected name token at %r"
    388                        % rawdata[declstartpos:declstartpos+20])
    389 
    390     # To be overridden -- handlers for unknown objects

    391     def unknown_decl(self, data):
    392         pass
    393