Home | History | Annotate | Download | only in Lib
      1 """A parser for HTML and XHTML."""
      2 
      3 # This file is based on sgmllib.py, but the API is slightly different.
      4 
      5 # XXX There should be a way to distinguish between PCDATA (parsed
      6 # character data -- the normal case), RCDATA (replaceable character
      7 # data -- only char and entity references and end tags are special)
      8 # and CDATA (character data -- only end tags are special).
      9 
     10 
     11 import markupbase
     12 import re
     13 
     14 # Regular expressions used for parsing
     15 
     16 interesting_normal = re.compile('[&<]')
     17 incomplete = re.compile('&[a-zA-Z#]')
     18 
     19 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
     20 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
     21 
     22 starttagopen = re.compile('<[a-zA-Z]')
     23 piclose = re.compile('>')
     24 commentclose = re.compile(r'--\s*>')
     25 
     26 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
     27 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
     28 # note: if you change tagfind/attrfind remember to update locatestarttagend too
     29 tagfind = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
     30 # this regex is currently unused, but left for backward compatibility
     31 tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
     32 
     33 attrfind = re.compile(
     34     r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
     35     r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
     36 
     37 locatestarttagend = re.compile(r"""
     38   <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
     39   (?:[\s/]*                          # optional whitespace before attribute name
     40     (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
     41       (?:\s*=+\s*                    # value indicator
     42         (?:'[^']*'                   # LITA-enclosed value
     43           |"[^"]*"                   # LIT-enclosed value
     44           |(?!['"])[^>\s]*           # bare value
     45          )
     46        )?(?:\s|/(?!>))*
     47      )*
     48    )?
     49   \s*                                # trailing whitespace
     50 """, re.VERBOSE)
     51 endendtag = re.compile('>')
     52 # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
     53 # </ and the tag name, so maybe this should be fixed
     54 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
     55 
     56 
     57 class HTMLParseError(Exception):
     58     """Exception raised for all parse errors."""
     59 
     60     def __init__(self, msg, position=(None, None)):
     61         assert msg
     62         self.msg = msg
     63         self.lineno = position[0]
     64         self.offset = position[1]
     65 
     66     def __str__(self):
     67         result = self.msg
     68         if self.lineno is not None:
     69             result = result + ", at line %d" % self.lineno
     70         if self.offset is not None:
     71             result = result + ", column %d" % (self.offset + 1)
     72         return result
     73 
     74 
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.

    Comments, declarations and processing instructions are reported
    through self.handle_comment(), self.handle_decl() and
    self.handle_pi() respectively.  All handlers do nothing by
    default; subclasses override the ones they care about.
    """

    # Lower-case tag names whose start tag switches the parser into
    # CDATA mode (see set_cdata_mode).
    CDATA_CONTENT_ELEMENTS = ("script", "style")


    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()
    101 
    def reset(self):
        """Reset this instance.  Loses all unprocessed data.

        Clears the input buffer, leaves CDATA mode and then lets the
        base class reset its own state.
        """
        # Not in CDATA mode: scan for '&' and '<'.
        self.cdata_elem = None
        self.interesting = interesting_normal
        # No start tag seen yet, nothing buffered.
        self.lasttag = '???'
        self.rawdata = ''
        markupbase.ParserBase.reset(self)
    109 
    110     def feed(self, data):
    111         r"""Feed data to the parser.
    112 
    113         Call this as often as you want, with as little or as much text
    114         as you want (may include '\n').
    115         """
    116         self.rawdata = self.rawdata + data
    117         self.goahead(0)
    118 
    def close(self):
        """Handle any buffered data.

        Runs goahead() with its 'end' flag set, so the remaining buffer
        is processed as if it were followed by an end-of-file marker.
        """
        self.goahead(1)
    122 
    def error(self, message):
        """Raise HTMLParseError for message at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())
    125 
    # Source text of the most recently parsed start tag; set by
    # parse_starttag(), None until a complete start tag has been seen.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text
    131 
    132     def set_cdata_mode(self, elem):
    133         self.cdata_elem = elem.lower()
    134         self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
    135 
    def clear_cdata_mode(self):
        """Leave CDATA mode and scan for '&' and '<' again."""
        self.cdata_elem = None
        self.interesting = interesting_normal
    139 
    140     # Internal -- handle data as far as reasonable.  May leave state
    141     # and data to be processed by a subsequent call.  If 'end' is
    142     # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process as much of self.rawdata as possible.

        Scans the buffer from the start, passing plain text to
        handle_data() and dispatching markup to the parse_* methods and
        references to handle_charref() / handle_entityref().  Whatever
        could not be processed is left in self.rawdata for a later call.

        end -- true when no more input will follow; incomplete markup is
               then emitted as data instead of being kept in the buffer.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            # '<' or '&' normally; the closing tag while in CDATA mode
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # in CDATA mode and the closing tag is not in the
                    # buffer: stop without emitting anything
                    break
                # no markup left: everything up to the end is data
                j = n
            if i < j: self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                # Each parse_* call returns the index after the construct,
                # or a negative value if it is not complete yet.
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # '<' followed by something that starts no markup
                    self.handle_data("<")
                    k = i + 1
                else:
                    # lone '<' at the end of the buffer
                    break
                if k < 0:
                    # incomplete construct: wait for more input unless
                    # this is the final call
                    if not end:
                        break
                    # At EOF emit the broken markup as data: up to and
                    # including the next '>', else up to the next '<',
                    # else just the single '<'.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # strip the leading '&#' and the one terminating char
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # the terminator is only consumed when it is ';'
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming '&#'
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # the terminator is only consumed when it is ';'
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    # lone '&' at the end of the buffer
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        # On the final call flush what is left as data, except while in
        # CDATA mode.
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # keep the unprocessed tail for the next call
        self.rawdata = rawdata[i:]
    233 
    234     # Internal -- parse html declarations, return length or -1 if not terminated
    235     # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    236     # See also parse_declaration in _markupbase
    237     def parse_html_declaration(self, i):
    238         rawdata = self.rawdata
    239         if rawdata[i:i+2] != '<!':
    240             self.error('unexpected call to parse_html_declaration()')
    241         if rawdata[i:i+4] == '<!--':
    242             # this case is actually already handled in goahead()
    243             return self.parse_comment(i)
    244         elif rawdata[i:i+3] == '<![':
    245             return self.parse_marked_section(i)
    246         elif rawdata[i:i+9].lower() == '<!doctype':
    247             # find the closing >
    248             gtpos = rawdata.find('>', i+9)
    249             if gtpos == -1:
    250                 return -1
    251             self.handle_decl(rawdata[i+2:gtpos])
    252             return gtpos+1
    253         else:
    254             return self.parse_bogus_comment(i)
    255 
    256     # Internal -- parse bogus comment, return length or -1 if not terminated
    257     # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    258     def parse_bogus_comment(self, i, report=1):
    259         rawdata = self.rawdata
    260         if rawdata[i:i+2] not in ('<!', '</'):
    261             self.error('unexpected call to parse_comment()')
    262         pos = rawdata.find('>', i+2)
    263         if pos == -1:
    264             return -1
    265         if report:
    266             self.handle_comment(rawdata[i+2:pos])
    267         return pos + 1
    268 
    269     # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction starting at rawdata[i].

        The text between '<?' and the next '>' is passed to handle_pi().
        Returns the index just past the '>', or -1 if the instruction
        is not terminated within the buffer.
        """
        text = self.rawdata
        assert text[i:i+2] == '<?', 'unexpected call to parse_pi()'
        closer = piclose.search(text, i+2) # >
        if not closer:
            return -1
        self.handle_pi(text[i+2: closer.start()])
        return closer.end()
    280 
    281     # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag beginning at rawdata[i] and dispatch it.

        The lower-cased tag name and a list of (name, value) attribute
        pairs are passed to handle_starttag(), or to
        handle_startendtag() for the '<tag ... />' form.  Attribute
        names are lower-cased; values have surrounding quotes removed
        and references replaced via unescape(); an attribute without a
        value gets None.  Tags listed in CDATA_CONTENT_ELEMENTS switch
        the parser into CDATA mode.

        Returns the index just past the tag, or a negative value if the
        tag is not complete yet.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # bare attribute name without '=value'
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # strip the matching pair of quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # junk between the last attribute and the end of the tag:
            # hand the whole tag text on as plain data
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end == "/>":
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos
    331 
    332     # Internal -- check to see if we have a complete starttag; return end
    333     # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the end index of the start tag at rawdata[i], or -1.

        Uses locatestarttagend to skip over the tag name and attributes
        and then inspects the character that follows:

        '>'         -- complete tag, return the index after it
        '/>'        -- complete empty tag, return the index after it
        '/' alone   -- possibly '/>' split across buffers, return -1
        nothing     -- end of buffer, return -1
        letter, '=' -- input ended in or before an attribute value,
                       return -1
        other       -- return the index where matching stopped

        Raises AssertionError if rawdata[i:] does not start a tag.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if not m:
            raise AssertionError("we should not get here!")
        j = m.end()
        following = rawdata[j:j+1]
        if following == ">":
            return j + 1
        if following == "/":
            if rawdata.startswith("/>", j):
                return j + 2
            # buffer boundary
            return -1
        if following == "":
            # end of input
            return -1
        if following in ("abcdefghijklmnopqrstuvwxyz="
                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value
            return -1
        if j > i:
            return j
        return i + 1
    364 
    365     # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag starting at rawdata[i].

        A well-formed '</name>' is reported through handle_endtag() and
        ends CDATA mode.  While in CDATA mode, any other end tag is
        passed on as plain data.  Outside CDATA mode a malformed end tag
        is handled leniently: '</>' is dropped, a tag without a
        recognisable name becomes a bogus comment, and anything between
        the name and the next '>' is ignored.

        Returns the index just past the tag, or -1 if no '>' follows.
        """
        text = self.rawdata
        assert text[i:i+2] == "</", "unexpected call to parse_endtag"
        closer = endendtag.search(text, i+1) # >
        if not closer:
            return -1
        after_gt = closer.end()
        in_cdata = self.cdata_elem is not None

        strict = endtagfind.match(text, i) # </ + tag + >
        if strict:
            name = strict.group(1).lower()
            if in_cdata and name != self.cdata_elem:
                # some other end tag inside CDATA content: plain data
                self.handle_data(text[i:after_gt])
                return after_gt
            self.handle_endtag(name)
            self.clear_cdata_mode()
            return after_gt

        if in_cdata:
            self.handle_data(text[i:after_gt])
            return after_gt
        # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
        loose = tagfind.match(text, i+2)
        if not loose:
            # w3.org/TR/html5/tokenization.html#end-tag-open-state
            if text[i:i+3] == '</>':
                return i+3
            return self.parse_bogus_comment(i)
        name = loose.group(1).lower()
        # consume and ignore other stuff between the name and the >
        # Note: this is not 100% correct, since we might have things like
        # </tag attr=">">, but looking for > after the name should cover
        # most of the cases and is much simpler
        gt_index = text.find('>', loose.end())
        self.handle_endtag(name)
        return gt_index+1
    404 
    405     # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle an empty tag of the form <tag .../>.

        The default implementation calls handle_starttag() followed by
        handle_endtag() with the same tag.
        """
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called for a start tag.

        tag is the lower-cased tag name; attrs is a list of
        (name, value) pairs with lower-cased names, where value is None
        for an attribute given without a value.  Does nothing by default.
        """
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called for an end tag with its lower-cased name.

        Does nothing by default.
        """
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called for a numeric character reference.

        name is the text between '&#' and the terminating character,
        e.g. '62' or 'x3E'.  Does nothing by default.
        """
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called for a named entity reference with the name after '&'.

        Does nothing by default.
        """
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of character data.

        Does nothing by default.
        """
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called with the text of a comment.

        parse_bogus_comment() passes the text between the two-character
        opener and the closing '>'.  Does nothing by default.
        """
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Called for a doctype declaration.

        decl is the text between '<!' and the closing '>'.  Does
        nothing by default.
        """
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called for a processing instruction.

        data is the text between '<?' and the next '>'.  Does nothing
        by default.
        """
        pass

    def unknown_decl(self, data):
        """Hook for unrecognised declarations; does nothing by default.

        NOTE(review): not called anywhere in this file -- presumably
        invoked by markupbase.ParserBase; confirm there.
        """
        pass
    444 
    445     # Internal -- helper to remove special character quoting
    446     entitydefs = None
    447     def unescape(self, s):
    448         if '&' not in s:
    449             return s
    450         def replaceEntities(s):
    451             s = s.groups()[0]
    452             try:
    453                 if s[0] == "#":
    454                     s = s[1:]
    455                     if s[0] in ['x','X']:
    456                         c = int(s[1:], 16)
    457                     else:
    458                         c = int(s)
    459                     return unichr(c)
    460             except ValueError:
    461                 return '&#'+s+';'
    462             else:
    463                 # Cannot use name2codepoint directly, because HTMLParser supports apos,
    464                 # which is not part of HTML 4
    465                 import htmlentitydefs
    466                 if HTMLParser.entitydefs is None:
    467                     entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
    468                     for k, v in htmlentitydefs.name2codepoint.iteritems():
    469                         entitydefs[k] = unichr(v)
    470                 try:
    471                     return self.entitydefs[s]
    472                 except KeyError:
    473                     return '&'+s+';'
    474 
    475         return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
    476