"""Shared support for scanning document type declarations in HTML and XHTML.

This module is used as a foundation for the HTMLParser and sgmllib
modules (indirectly, for htmllib as well).  It has no documented
public API and should not be used directly.

"""

import re

_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

_msmarkedsectionclose = re.compile(r']\s*>')

del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The scanning methods read the buffer from ``self.rawdata``; this class
    never assigns that attribute, so the subclass has to provide it.  They
    also call ``self.handle_decl()`` and ``self.handle_comment()``, which
    are not defined in this class either.

    Scanning methods return the index just past the construct they
    consumed, or a negative number when the construct is not complete in
    the buffer yet.  Malformed input is reported through ``error()``.
    """

    def __init__(self):
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report malformed input; subclasses must override this.

        The scanning methods do not rely on the override raising: when it
        returns normally they stop scanning and return -1 instead of
        continuing with inconsistent state.
        """
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts after the last newline inside the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters tolerated (and skipped) inside a declaration.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!..." declaration starting at rawdata[i].

        Returns the index just past the closing '>', or a negative number
        if the declaration is incomplete (or error() returned).  A
        "doctype" declaration is passed to handle_decl(), anything else
        to unknown_decl().
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            # NOTE: this is a one-character test, so every declaration
            # whose first character is '-' returns here and the '--'
            # branch below is never reached from this method.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                    # j was not advanced; stop instead of looping forever.
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                # j was not advanced; stop instead of looping forever.
                return -1
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the "<![" marked section starting at rawdata[i].

        Returns the index just past the terminator, or a negative number
        if the section is unterminated, the keyword could not be scanned,
        or error() returned for an unknown keyword.  When *report* is
        true the section text is passed to unknown_decl().
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i+3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i+3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i+3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # No terminator pattern applies to an unknown keyword.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the "<!--" comment starting at rawdata[i].

        Returns the index just past the closing "-->" (whitespace is
        allowed between "--" and ">"), or -1 if the comment is not
        terminated.  When *report* is true the comment text is passed to
        handle_comment().
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
            return -1
        match = _commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the internal subset that starts at rawdata[i].

        *i* is the index just after the opening '['; *declstartpos* is
        the start of the enclosing declaration and is only used for
        position/error reporting.  Returns the index of the '>' that
        follows the closing ']', or a negative number if the subset is
        incomplete (or error() returned).
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete (this also covers the
                    # case where the buffer ends right after "<!")
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    # There is no _parse_doctype_<name> handler to call.
                    return -1
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                    return -1
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                # j was not advanced; stop instead of looping forever.
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at rawdata[i].

        Returns the index just past the next '>', or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose name starts at rawdata[i].

        Returns the index just past the closing '>', or a negative number
        if the declaration is incomplete.
        """
        rawdata = self.rawdata
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= len(rawdata):
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if j + 1 == len(rawdata):
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at rawdata[i].

        Returns the index just past the closing '>', or a negative number
        if the declaration is incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at rawdata[i].

        A leading '%' followed by optional whitespace is skipped before
        the entity name.  Returns the index just past the closing '>', or
        a negative number if the declaration is incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token
    def _scan_name(self, i, declstartpos):
        """Scan a name token starting at rawdata[i].

        Returns ``(name, newpos)`` where *name* is the lower-cased token
        and *newpos* is the index just past it and any trailing
        whitespace.  Returns ``(None, -1)`` when the token runs up to the
        end of the buffer (more data might extend it), when *i* is at the
        end of the buffer, or when no name starts at *i* and error()
        returned instead of raising.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])
            # Callers always unpack a 2-tuple, so never fall off the end.
            return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Hook called with the text of a non-doctype declaration."""
        pass