Home | History | Annotate | Download | only in html
      1 """A parser for HTML and XHTML."""
      2 
      3 # This file is based on sgmllib.py, but the API is slightly different.
      4 
      5 # XXX There should be a way to distinguish between PCDATA (parsed
      6 # character data -- the normal case), RCDATA (replaceable character
      7 # data -- only char and entity references and end tags are special)
      8 # and CDATA (character data -- only end tags are special).
      9 
     10 
     11 import re
     12 import warnings
     13 import _markupbase
     14 
     15 from html import unescape
     16 
     17 
# Only the parser class is part of the public API of this module.
__all__ = ['HTMLParser']

# Regular expressions used for parsing

# In normal (non-CDATA) mode the only characters that can start markup or
# a reference are '&' and '<'.
interesting_normal = re.compile('[&<]')
# An '&' followed by a letter or '#': the possible beginning of a
# reference that is not (yet) complete.
incomplete = re.compile('&[a-zA-Z#]')

# Named reference: group 1 is the entity name.  The pattern also consumes
# ONE trailing non-alphanumeric character (the terminator, which may or may
# not be ';'), so a match needs at least one character after the name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric reference, decimal ("&#123") or hexadecimal ("&#x1F"), likewise
# followed by ONE terminating character that is not a hex digit.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by an ASCII letter: the opening of a start tag.
starttagopen = re.compile('<[a-zA-Z]')
# A single '>' character.
piclose = re.compile('>')
# '--', optional whitespace, then '>'.
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# Tag name (group 1) plus any following whitespace or '/' characters that
# are not part of a closing '/>'.
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# One attribute: group 1 is the name, group 2 the whole "= value" part
# (None/empty when the attribute has no value) and group 3 the value itself,
# still including its surrounding quotes if it was quoted.
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
# Matches a start tag from '<' up to, but not including, its closing '>' or
# '/>'.  It has no capturing groups; only the end position of the match is
# meaningful.
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
# A single '>' character.
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# Strict end tag: "</", optional whitespace, the tag name (group 1),
# optional whitespace, then ">".
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
     60 
     61 
     62 
     63 class HTMLParser(_markupbase.ParserBase):
     64     """Find tags and other markup and call handler functions.
     65 
     66     Usage:
     67         p = HTMLParser()
     68         p.feed(data)
     69         ...
     70         p.close()
     71 
     72     Start tags are handled by calling self.handle_starttag() or
     73     self.handle_startendtag(); end tags by self.handle_endtag().  The
     74     data between tags is passed from the parser to the derived class
     75     by calling self.handle_data() with the data as argument (the data
     76     may be split up in arbitrary chunks).  If convert_charrefs is
     77     True the character references are converted automatically to the
     78     corresponding Unicode character (and self.handle_data() is no
     79     longer split in chunks), otherwise they are passed by calling
     80     self.handle_entityref() or self.handle_charref() with the string
     81     containing respectively the named or numeric reference as the
     82     argument.
     83     """
     84 
     85     CDATA_CONTENT_ELEMENTS = ("script", "style")
     86 
     87     def __init__(self, *, convert_charrefs=True):
     88         """Initialize and reset this instance.
     89 
     90         If convert_charrefs is True (the default), all character references
     91         are automatically converted to the corresponding Unicode characters.
     92         """
     93         self.convert_charrefs = convert_charrefs
     94         self.reset()
     95 
     96     def reset(self):
     97         """Reset this instance.  Loses all unprocessed data."""
     98         self.rawdata = ''
     99         self.lasttag = '???'
    100         self.interesting = interesting_normal
    101         self.cdata_elem = None
    102         _markupbase.ParserBase.reset(self)
    103 
    104     def feed(self, data):
    105         r"""Feed data to the parser.
    106 
    107         Call this as often as you want, with as little or as much text
    108         as you want (may include '\n').
    109         """
    110         self.rawdata = self.rawdata + data
    111         self.goahead(0)
    112 
    113     def close(self):
    114         """Handle any buffered data."""
    115         self.goahead(1)
    116 
    117     __starttag_text = None
    118 
    119     def get_starttag_text(self):
    120         """Return full source of start tag: '<...>'."""
    121         return self.__starttag_text
    122 
    123     def set_cdata_mode(self, elem):
    124         self.cdata_elem = elem.lower()
    125         self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
    126 
    127     def clear_cdata_mode(self):
    128         self.interesting = interesting_normal
    129         self.cdata_elem = None
    130 
    131     # Internal -- handle data as far as reasonable.  May leave state
    132     # and data to be processed by a subsequent call.  If 'end' is
    133     # true, force handling all data as if followed by EOF marker.
    134     def goahead(self, end):
    135         rawdata = self.rawdata
    136         i = 0
    137         n = len(rawdata)
    138         while i < n:
    139             if self.convert_charrefs and not self.cdata_elem:
    140                 j = rawdata.find('<', i)
    141                 if j < 0:
    142                     # if we can't find the next <, either we are at the end
    143                     # or there's more text incoming.  If the latter is True,
    144                     # we can't pass the text to handle_data in case we have
    145                     # a charref cut in half at end.  Try to determine if
    146                     # this is the case before proceeding by looking for an
    147                     # & near the end and see if it's followed by a space or ;.
    148                     amppos = rawdata.rfind('&', max(i, n-34))
    149                     if (amppos >= 0 and
    150                         not re.compile(r'[\s;]').search(rawdata, amppos)):
    151                         break  # wait till we get all the text
    152                     j = n
    153             else:
    154                 match = self.interesting.search(rawdata, i)  # < or &
    155                 if match:
    156                     j = match.start()
    157                 else:
    158                     if self.cdata_elem:
    159                         break
    160                     j = n
    161             if i < j:
    162                 if self.convert_charrefs and not self.cdata_elem:
    163                     self.handle_data(unescape(rawdata[i:j]))
    164                 else:
    165                     self.handle_data(rawdata[i:j])
    166             i = self.updatepos(i, j)
    167             if i == n: break
    168             startswith = rawdata.startswith
    169             if startswith('<', i):
    170                 if starttagopen.match(rawdata, i): # < + letter
    171                     k = self.parse_starttag(i)
    172                 elif startswith("</", i):
    173                     k = self.parse_endtag(i)
    174                 elif startswith("<!--", i):
    175                     k = self.parse_comment(i)
    176                 elif startswith("<?", i):
    177                     k = self.parse_pi(i)
    178                 elif startswith("<!", i):
    179                     k = self.parse_html_declaration(i)
    180                 elif (i + 1) < n:
    181                     self.handle_data("<")
    182                     k = i + 1
    183                 else:
    184                     break
    185                 if k < 0:
    186                     if not end:
    187                         break
    188                     k = rawdata.find('>', i + 1)
    189                     if k < 0:
    190                         k = rawdata.find('<', i + 1)
    191                         if k < 0:
    192                             k = i + 1
    193                     else:
    194                         k += 1
    195                     if self.convert_charrefs and not self.cdata_elem:
    196                         self.handle_data(unescape(rawdata[i:k]))
    197                     else:
    198                         self.handle_data(rawdata[i:k])
    199                 i = self.updatepos(i, k)
    200             elif startswith("&#", i):
    201                 match = charref.match(rawdata, i)
    202                 if match:
    203                     name = match.group()[2:-1]
    204                     self.handle_charref(name)
    205                     k = match.end()
    206                     if not startswith(';', k-1):
    207                         k = k - 1
    208                     i = self.updatepos(i, k)
    209                     continue
    210                 else:
    211                     if ";" in rawdata[i:]:  # bail by consuming &#
    212                         self.handle_data(rawdata[i:i+2])
    213                         i = self.updatepos(i, i+2)
    214                     break
    215             elif startswith('&', i):
    216                 match = entityref.match(rawdata, i)
    217                 if match:
    218                     name = match.group(1)
    219                     self.handle_entityref(name)
    220                     k = match.end()
    221                     if not startswith(';', k-1):
    222                         k = k - 1
    223                     i = self.updatepos(i, k)
    224                     continue
    225                 match = incomplete.match(rawdata, i)
    226                 if match:
    227                     # match.group() will contain at least 2 chars
    228                     if end and match.group() == rawdata[i:]:
    229                         k = match.end()
    230                         if k <= i:
    231                             k = n
    232                         i = self.updatepos(i, i + 1)
    233                     # incomplete
    234                     break
    235                 elif (i + 1) < n:
    236                     # not the end of the buffer, and can't be confused
    237                     # with some other construct
    238                     self.handle_data("&")
    239                     i = self.updatepos(i, i + 1)
    240                 else:
    241                     break
    242             else:
    243                 assert 0, "interesting.search() lied"
    244         # end while
    245         if end and i < n and not self.cdata_elem:
    246             if self.convert_charrefs and not self.cdata_elem:
    247                 self.handle_data(unescape(rawdata[i:n]))
    248             else:
    249                 self.handle_data(rawdata[i:n])
    250             i = self.updatepos(i, n)
    251         self.rawdata = rawdata[i:]
    252 
    253     # Internal -- parse html declarations, return length or -1 if not terminated
    254     # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    255     # See also parse_declaration in _markupbase
    256     def parse_html_declaration(self, i):
    257         rawdata = self.rawdata
    258         assert rawdata[i:i+2] == '<!', ('unexpected call to '
    259                                         'parse_html_declaration()')
    260         if rawdata[i:i+4] == '<!--':
    261             # this case is actually already handled in goahead()
    262             return self.parse_comment(i)
    263         elif rawdata[i:i+3] == '<![':
    264             return self.parse_marked_section(i)
    265         elif rawdata[i:i+9].lower() == '<!doctype':
    266             # find the closing >
    267             gtpos = rawdata.find('>', i+9)
    268             if gtpos == -1:
    269                 return -1
    270             self.handle_decl(rawdata[i+2:gtpos])
    271             return gtpos+1
    272         else:
    273             return self.parse_bogus_comment(i)
    274 
    275     # Internal -- parse bogus comment, return length or -1 if not terminated
    276     # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    277     def parse_bogus_comment(self, i, report=1):
    278         rawdata = self.rawdata
    279         assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
    280                                                 'parse_comment()')
    281         pos = rawdata.find('>', i+2)
    282         if pos == -1:
    283             return -1
    284         if report:
    285             self.handle_comment(rawdata[i+2:pos])
    286         return pos + 1
    287 
    288     # Internal -- parse processing instr, return end or -1 if not terminated
    289     def parse_pi(self, i):
    290         rawdata = self.rawdata
    291         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
    292         match = piclose.search(rawdata, i+2) # >
    293         if not match:
    294             return -1
    295         j = match.start()
    296         self.handle_pi(rawdata[i+2: j])
    297         j = match.end()
    298         return j
    299 
    300     # Internal -- handle starttag, return end or -1 if not terminated
    301     def parse_starttag(self, i):
    302         self.__starttag_text = None
    303         endpos = self.check_for_whole_start_tag(i)
    304         if endpos < 0:
    305             return endpos
    306         rawdata = self.rawdata
    307         self.__starttag_text = rawdata[i:endpos]
    308 
    309         # Now parse the data between i+1 and j into a tag and attrs
    310         attrs = []
    311         match = tagfind_tolerant.match(rawdata, i+1)
    312         assert match, 'unexpected call to parse_starttag()'
    313         k = match.end()
    314         self.lasttag = tag = match.group(1).lower()
    315         while k < endpos:
    316             m = attrfind_tolerant.match(rawdata, k)
    317             if not m:
    318                 break
    319             attrname, rest, attrvalue = m.group(1, 2, 3)
    320             if not rest:
    321                 attrvalue = None
    322             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
    323                  attrvalue[:1] == '"' == attrvalue[-1:]:
    324                 attrvalue = attrvalue[1:-1]
    325             if attrvalue:
    326                 attrvalue = unescape(attrvalue)
    327             attrs.append((attrname.lower(), attrvalue))
    328             k = m.end()
    329 
    330         end = rawdata[k:endpos].strip()
    331         if end not in (">", "/>"):
    332             lineno, offset = self.getpos()
    333             if "\n" in self.__starttag_text:
    334                 lineno = lineno + self.__starttag_text.count("\n")
    335                 offset = len(self.__starttag_text) \
    336                          - self.__starttag_text.rfind("\n")
    337             else:
    338                 offset = offset + len(self.__starttag_text)
    339             self.handle_data(rawdata[i:endpos])
    340             return endpos
    341         if end.endswith('/>'):
    342             # XHTML-style empty tag: <span attr="value" />
    343             self.handle_startendtag(tag, attrs)
    344         else:
    345             self.handle_starttag(tag, attrs)
    346             if tag in self.CDATA_CONTENT_ELEMENTS:
    347                 self.set_cdata_mode(tag)
    348         return endpos
    349 
    350     # Internal -- check to see if we have a complete starttag; return end
    351     # or -1 if incomplete.
    352     def check_for_whole_start_tag(self, i):
    353         rawdata = self.rawdata
    354         m = locatestarttagend_tolerant.match(rawdata, i)
    355         if m:
    356             j = m.end()
    357             next = rawdata[j:j+1]
    358             if next == ">":
    359                 return j + 1
    360             if next == "/":
    361                 if rawdata.startswith("/>", j):
    362                     return j + 2
    363                 if rawdata.startswith("/", j):
    364                     # buffer boundary
    365                     return -1
    366                 # else bogus input
    367                 if j > i:
    368                     return j
    369                 else:
    370                     return i + 1
    371             if next == "":
    372                 # end of input
    373                 return -1
    374             if next in ("abcdefghijklmnopqrstuvwxyz=/"
    375                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
    376                 # end of input in or before attribute value, or we have the
    377                 # '/' from a '/>' ending
    378                 return -1
    379             if j > i:
    380                 return j
    381             else:
    382                 return i + 1
    383         raise AssertionError("we should not get here!")
    384 
    385     # Internal -- parse endtag, return end or -1 if incomplete
    386     def parse_endtag(self, i):
    387         rawdata = self.rawdata
    388         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
    389         match = endendtag.search(rawdata, i+1) # >
    390         if not match:
    391             return -1
    392         gtpos = match.end()
    393         match = endtagfind.match(rawdata, i) # </ + tag + >
    394         if not match:
    395             if self.cdata_elem is not None:
    396                 self.handle_data(rawdata[i:gtpos])
    397                 return gtpos
    398             # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
    399             namematch = tagfind_tolerant.match(rawdata, i+2)
    400             if not namematch:
    401                 # w3.org/TR/html5/tokenization.html#end-tag-open-state
    402                 if rawdata[i:i+3] == '</>':
    403                     return i+3
    404                 else:
    405                     return self.parse_bogus_comment(i)
    406             tagname = namematch.group(1).lower()
    407             # consume and ignore other stuff between the name and the >
    408             # Note: this is not 100% correct, since we might have things like
    409             # </tag attr=">">, but looking for > after tha name should cover
    410             # most of the cases and is much simpler
    411             gtpos = rawdata.find('>', namematch.end())
    412             self.handle_endtag(tagname)
    413             return gtpos+1
    414 
    415         elem = match.group(1).lower() # script or style
    416         if self.cdata_elem is not None:
    417             if elem != self.cdata_elem:
    418                 self.handle_data(rawdata[i:gtpos])
    419                 return gtpos
    420 
    421         self.handle_endtag(elem)
    422         self.clear_cdata_mode()
    423         return gtpos
    424 
    425     # Overridable -- finish processing of start+end tag: <tag.../>
    426     def handle_startendtag(self, tag, attrs):
    427         self.handle_starttag(tag, attrs)
    428         self.handle_endtag(tag)
    429 
    430     # Overridable -- handle start tag
    431     def handle_starttag(self, tag, attrs):
    432         pass
    433 
    434     # Overridable -- handle end tag
    435     def handle_endtag(self, tag):
    436         pass
    437 
    438     # Overridable -- handle character reference
    439     def handle_charref(self, name):
    440         pass
    441 
    442     # Overridable -- handle entity reference
    443     def handle_entityref(self, name):
    444         pass
    445 
    446     # Overridable -- handle data
    447     def handle_data(self, data):
    448         pass
    449 
    450     # Overridable -- handle comment
    451     def handle_comment(self, data):
    452         pass
    453 
    454     # Overridable -- handle declaration
    455     def handle_decl(self, decl):
    456         pass
    457 
    458     # Overridable -- handle processing instruction
    459     def handle_pi(self, data):
    460         pass
    461 
    462     def unknown_decl(self, data):
    463         pass
    464 
    465     # Internal -- helper to remove special character quoting
    466     def unescape(self, s):
    467         warnings.warn('The unescape method is deprecated and will be removed '
    468                       'in 3.5, use html.unescape() instead.',
    469                       DeprecationWarning, stacklevel=2)
    470         return unescape(s)
    471