Home | History | Annotate | Download | only in python2.7
      1 """A parser for HTML and XHTML."""
      2 
      3 # This file is based on sgmllib.py, but the API is slightly different.
      4 
      5 # XXX There should be a way to distinguish between PCDATA (parsed
      6 # character data -- the normal case), RCDATA (replaceable character
      7 # data -- only char and entity references and end tags are special)
      8 # and CDATA (character data -- only end tags are special).
      9 
     10 
     11 import markupbase
     12 import re
     13 
     14 # Regular expressions used for parsing
     15 
     16 interesting_normal = re.compile('[&<]')
     17 incomplete = re.compile('&[a-zA-Z#]')
     18 
     19 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
     20 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
     21 
     22 starttagopen = re.compile('<[a-zA-Z]')
     23 piclose = re.compile('>')
     24 commentclose = re.compile(r'--\s*>')
     25 tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
     26 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
     27 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
     28 tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
     29 
     30 attrfind = re.compile(
     31     r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
     32     r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
     33 
     34 locatestarttagend = re.compile(r"""
     35   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
     36   (?:[\s/]*                          # optional whitespace before attribute name
     37     (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
     38       (?:\s*=+\s*                    # value indicator
     39         (?:'[^']*'                   # LITA-enclosed value
     40           |"[^"]*"                   # LIT-enclosed value
     41           |(?!['"])[^>\s]*           # bare value
     42          )
     43        )?(?:\s|/(?!>))*
     44      )*
     45    )?
     46   \s*                                # trailing whitespace
     47 """, re.VERBOSE)
     48 endendtag = re.compile('>')
     49 # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
     50 # </ and the tag name, so maybe this should be fixed
     51 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
     52 
     53 
     54 class HTMLParseError(Exception):
     55     """Exception raised for all parse errors."""
     56 
     57     def __init__(self, msg, position=(None, None)):
     58         assert msg
     59         self.msg = msg
     60         self.lineno = position[0]
     61         self.offset = position[1]
     62 
     63     def __str__(self):
     64         result = self.msg
     65         if self.lineno is not None:
     66             result = result + ", at line %d" % self.lineno
     67         if self.offset is not None:
     68             result = result + ", column %d" % (self.offset + 1)
     69         return result
     70 
     71 
     72 class HTMLParser(markupbase.ParserBase):
     73     """Find tags and other markup and call handler functions.
     74 
     75     Usage:
     76         p = HTMLParser()
     77         p.feed(data)
     78         ...
     79         p.close()
     80 
     81     Start tags are handled by calling self.handle_starttag() or
     82     self.handle_startendtag(); end tags by self.handle_endtag().  The
     83     data between tags is passed from the parser to the derived class
     84     by calling self.handle_data() with the data as argument (the data
     85     may be split up in arbitrary chunks).  Entity references are
     86     passed by calling self.handle_entityref() with the entity
     87     reference as the argument.  Numeric character references are
     88     passed to self.handle_charref() with the string containing the
     89     reference as the argument.
     90     """
     91 
     92     CDATA_CONTENT_ELEMENTS = ("script", "style")
     93 
     94 
     95     def __init__(self):
     96         """Initialize and reset this instance."""
     97         self.reset()
     98 
     99     def reset(self):
    100         """Reset this instance.  Loses all unprocessed data."""
    101         self.rawdata = ''
    102         self.lasttag = '???'
    103         self.interesting = interesting_normal
    104         self.cdata_elem = None
    105         markupbase.ParserBase.reset(self)
    106 
    107     def feed(self, data):
    108         r"""Feed data to the parser.
    109 
    110         Call this as often as you want, with as little or as much text
    111         as you want (may include '\n').
    112         """
    113         self.rawdata = self.rawdata + data
    114         self.goahead(0)
    115 
    116     def close(self):
    117         """Handle any buffered data."""
    118         self.goahead(1)
    119 
    def error(self, message):
        """Raise HTMLParseError for message, tagged with the current position."""
        position = self.getpos()
        raise HTMLParseError(message, position)
    122 
    123     __starttag_text = None
    124 
    125     def get_starttag_text(self):
    126         """Return full source of start tag: '<...>'."""
    127         return self.__starttag_text
    128 
    129     def set_cdata_mode(self, elem):
    130         self.cdata_elem = elem.lower()
    131         self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
    132 
    def clear_cdata_mode(self):
        """Leave CDATA mode: '<' and '&' become the interesting characters again."""
        self.cdata_elem = None
        self.interesting = interesting_normal
    136 
    137     # Internal -- handle data as far as reasonable.  May leave state
    138     # and data to be processed by a subsequent call.  If 'end' is
    139     # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Parse as much of the buffered input as possible.

        Scans self.rawdata from its start, dispatching to the parse_*
        and handle_* methods, and stores whatever was not consumed back
        into self.rawdata so a later call can continue from there.

        end -- when true, the buffer is treated as complete (EOF follows):
               unterminated markup is reported as data instead of being
               kept for more input, and an entity or character reference
               cut off by EOF is reported through self.error().
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # In CDATA mode only the matching end tag is
                    # interesting; keep buffering until it shows up.
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # a '<' that cannot start any markup is plain data
                    self.handle_data("<")
                    k = i + 1
                else:
                    # lone '<' at the end of the buffer: wait for more
                    break
                if k < 0:
                    # the construct is not terminated yet
                    if not end:
                        break
                    # At EOF: report the unterminated construct as data,
                    # up to and including the next '>' if there is one,
                    # else up to the next '<', else just the '<' itself.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # strip the leading '&#' and the terminating character
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # the terminator was not ';', so it is not part
                        # of the reference and must be parsed again
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]: #bail by consuming &#
                        # Emit the two characters at the current position
                        # and step over them; indexing from the start of
                        # the buffer here would report the wrong text and
                        # could move the position backwards.
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                # Raised explicitly (not via assert) so that the loop
                # cannot spin forever when assertions are disabled.
                raise AssertionError("interesting.search() lied")
        # end while
        if end and i < n and not self.cdata_elem:
            # at EOF everything left over is plain data
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]
    230 
    # Internal -- parse html declarations, return end or -1 if not terminated
    232     # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    233     # See also parse_declaration in _markupbase
    234     def parse_html_declaration(self, i):
    235         rawdata = self.rawdata
    236         if rawdata[i:i+2] != '<!':
    237             self.error('unexpected call to parse_html_declaration()')
    238         if rawdata[i:i+4] == '<!--':
    239             # this case is actually already handled in goahead()
    240             return self.parse_comment(i)
    241         elif rawdata[i:i+3] == '<![':
    242             return self.parse_marked_section(i)
    243         elif rawdata[i:i+9].lower() == '<!doctype':
    244             # find the closing >
    245             gtpos = rawdata.find('>', i+9)
    246             if gtpos == -1:
    247                 return -1
    248             self.handle_decl(rawdata[i+2:gtpos])
    249             return gtpos+1
    250         else:
    251             return self.parse_bogus_comment(i)
    252 
    # Internal -- parse bogus comment, return end or -1 if not terminated
    254     # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    255     def parse_bogus_comment(self, i, report=1):
    256         rawdata = self.rawdata
    257         if rawdata[i:i+2] not in ('<!', '</'):
    258             self.error('unexpected call to parse_comment()')
    259         pos = rawdata.find('>', i+2)
    260         if pos == -1:
    261             return -1
    262         if report:
    263             self.handle_comment(rawdata[i+2:pos])
    264         return pos + 1
    265 
    266     # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction ('<?...>') starting at index i.

        The text between '<?' and the first '>' is passed to handle_pi().
        Returns the index just past that '>', or -1 if none is buffered.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        closing = piclose.search(rawdata, i+2) # >
        if closing is None:
            return -1
        self.handle_pi(rawdata[i+2: closing.start()])
        return closing.end()
    277 
    278     # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag beginning at index i of the buffer.

        On success the tag is reported through handle_starttag(), or
        handle_startendtag() for the XHTML-style '<tag ... />' form, with
        the lower-cased tag name and a list of (name, value) pairs.
        Attribute names are lower-cased; values have their surrounding
        quotes removed and are passed through unescape(); an attribute
        without a value gets None.  A tag with junk before its closing
        bracket is passed to handle_data() unchanged.

        Returns the index just past the tag, or a negative number when
        the tag is not complete yet.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # bare attribute without '=value'
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk between the last attribute and the end of the tag:
            # hand the whole tag over as plain data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos
    328 
    329     # Internal -- check to see if we have a complete starttag; return end
    330     # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Locate the end of the start tag that begins at index i.

        Returns the index just past the closing '>' (or '/>') when the
        tag is complete, and -1 when more input is needed.  When the tag
        is followed by some other character, the index of that character
        is returned so the caller can deal with the malformed text.

        Raises AssertionError if the buffer does not hold a start tag at i.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m is None:
            raise AssertionError("we should not get here!")
        j = m.end()
        nextchar = rawdata[j:j+1]
        if nextchar == ">":
            return j + 1
        if nextchar == "/":
            if rawdata.startswith("/>", j):
                return j + 2
            # A '/' that is not followed by '>' within the buffer:
            # buffer boundary, wait for more input.
            return -1
        if nextchar == "":
            # end of input
            return -1
        if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value, or we have the
            # '/' from a '/>' ending
            return -1
        if j > i:
            return j
        return i + 1
    361 
    362     # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag ('</...') starting at index i of the buffer.

        A well-formed end tag is reported through handle_endtag() with the
        lower-cased name and ends CDATA mode.  While in CDATA mode, any
        other end tag is passed to handle_data() as literal text.
        Malformed end tags are handled tolerantly: '</>' is skipped, a tag
        without a usable name becomes a bogus comment, and otherwise
        everything between the name and the next '>' is ignored.

        Returns the index just past the tag, or -1 if no '>' is buffered.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        closer = endendtag.search(rawdata, i+1) # >
        if closer is None:
            return -1
        gtpos = closer.end()

        strict = endtagfind.match(rawdata, i) # </ + tag + >
        if strict is not None:
            elem = strict.group(1).lower()
            if self.cdata_elem is not None and elem != self.cdata_elem:
                # an unrelated end tag inside a CDATA element is just text
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            self.handle_endtag(elem)
            self.clear_cdata_mode()
            return gtpos

        # The tag is not well-formed.
        if self.cdata_elem is not None:
            self.handle_data(rawdata[i:gtpos])
            return gtpos
        # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
        namematch = tagfind_tolerant.match(rawdata, i+2)
        if namematch is None:
            # w3.org/TR/html5/tokenization.html#end-tag-open-state
            if rawdata[i:i+3] == '</>':
                return i+3
            return self.parse_bogus_comment(i)
        tagname = namematch.group().lower()
        # consume and ignore other stuff between the name and the >
        # Note: this is not 100% correct, since we might have things like
        # </tag attr=">">, but looking for > after the name should cover
        # most of the cases and is much simpler
        gtpos = rawdata.find('>', namematch.end())
        self.handle_endtag(tagname)
        return gtpos+1
    401 
    402     # Overridable -- finish processing of start+end tag: <tag.../>
    403     def handle_startendtag(self, tag, attrs):
    404         self.handle_starttag(tag, attrs)
    405         self.handle_endtag(tag)
    406 
    407     # Overridable -- handle start tag
    408     def handle_starttag(self, tag, attrs):
    409         pass
    410 
    411     # Overridable -- handle end tag
    412     def handle_endtag(self, tag):
    413         pass
    414 
    415     # Overridable -- handle character reference
    416     def handle_charref(self, name):
    417         pass
    418 
    419     # Overridable -- handle entity reference
    420     def handle_entityref(self, name):
    421         pass
    422 
    423     # Overridable -- handle data
    424     def handle_data(self, data):
    425         pass
    426 
    427     # Overridable -- handle comment
    428     def handle_comment(self, data):
    429         pass
    430 
    431     # Overridable -- handle declaration
    432     def handle_decl(self, decl):
    433         pass
    434 
    435     # Overridable -- handle processing instruction
    436     def handle_pi(self, data):
    437         pass
    438 
    439     def unknown_decl(self, data):
    440         pass
    441 
    442     # Internal -- helper to remove special character quoting
    443     entitydefs = None
    444     def unescape(self, s):
    445         if '&' not in s:
    446             return s
    447         def replaceEntities(s):
    448             s = s.groups()[0]
    449             try:
    450                 if s[0] == "#":
    451                     s = s[1:]
    452                     if s[0] in ['x','X']:
    453                         c = int(s[1:], 16)
    454                     else:
    455                         c = int(s)
    456                     return unichr(c)
    457             except ValueError:
    458                 return '&#'+s+';'
    459             else:
    460                 # Cannot use name2codepoint directly, because HTMLParser supports apos,
    461                 # which is not part of HTML 4
    462                 import htmlentitydefs
    463                 if HTMLParser.entitydefs is None:
    464                     entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
    465                     for k, v in htmlentitydefs.name2codepoint.iteritems():
    466                         entitydefs[k] = unichr(v)
    467                 try:
    468                     return self.entitydefs[s]
    469                 except KeyError:
    470                     return '&'+s+';'
    471 
    472         return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
    473