Home | History | Annotate | Download | only in Lib
      1 """A parser for HTML and XHTML."""
      2 
# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).

      9 
     10 
     11 import markupbase
     12 import re
     13 
     14 # Regular expressions used for parsing

     15 
     16 interesting_normal = re.compile('[&<]')
     17 interesting_cdata = re.compile(r'<(/|\Z)')
     18 incomplete = re.compile('&[a-zA-Z#]')
     19 
     20 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
     21 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
     22 
     23 starttagopen = re.compile('<[a-zA-Z]')
     24 piclose = re.compile('>')
     25 commentclose = re.compile(r'--\s*>')
     26 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
     27 attrfind = re.compile(
     28     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
     29     r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
     30 
     31 locatestarttagend = re.compile(r"""
     32   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
     33   (?:\s+                             # whitespace before attribute name
     34     (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
     35       (?:\s*=\s*                     # value indicator
     36         (?:'[^']*'                   # LITA-enclosed value
     37           |\"[^\"]*\"                # LIT-enclosed value
     38           |[^'\">\s]+                # bare value
     39          )
     40        )?
     41      )
     42    )*
     43   \s*                                # trailing whitespace
     44 """, re.VERBOSE)
     45 endendtag = re.compile('>')
     46 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
     47 
     48 
     49 class HTMLParseError(Exception):
     50     """Exception raised for all parse errors."""
     51 
     52     def __init__(self, msg, position=(None, None)):
     53         assert msg
     54         self.msg = msg
     55         self.lineno = position[0]
     56         self.offset = position[1]
     57 
     58     def __str__(self):
     59         result = self.msg
     60         if self.lineno is not None:
     61             result = result + ", at line %d" % self.lineno
     62         if self.offset is not None:
     63             result = result + ", column %d" % (self.offset + 1)
     64         return result
     65 
     66 
     67 class HTMLParser(markupbase.ParserBase):
     68     """Find tags and other markup and call handler functions.
     69 
     70     Usage:
     71         p = HTMLParser()
     72         p.feed(data)
     73         ...
     74         p.close()
     75 
     76     Start tags are handled by calling self.handle_starttag() or
     77     self.handle_startendtag(); end tags by self.handle_endtag().  The
     78     data between tags is passed from the parser to the derived class
     79     by calling self.handle_data() with the data as argument (the data
     80     may be split up in arbitrary chunks).  Entity references are
     81     passed by calling self.handle_entityref() with the entity
     82     reference as the argument.  Numeric character references are
     83     passed to self.handle_charref() with the string containing the
     84     reference as the argument.
     85     """
     86 
     87     CDATA_CONTENT_ELEMENTS = ("script", "style")
     88 
     89 
     90     def __init__(self):
     91         """Initialize and reset this instance."""
     92         self.reset()
     93 
     94     def reset(self):
     95         """Reset this instance.  Loses all unprocessed data."""
     96         self.rawdata = ''
     97         self.lasttag = '???'
     98         self.interesting = interesting_normal
     99         markupbase.ParserBase.reset(self)
    100 
    101     def feed(self, data):
    102         r"""Feed data to the parser.
    103 
    104         Call this as often as you want, with as little or as much text
    105         as you want (may include '\n').
    106         """
    107         self.rawdata = self.rawdata + data
    108         self.goahead(0)
    109 
    110     def close(self):
    111         """Handle any buffered data."""
    112         self.goahead(1)
    113 
    114     def error(self, message):
    115         raise HTMLParseError(message, self.getpos())
    116 
    117     __starttag_text = None
    118 
    119     def get_starttag_text(self):
    120         """Return full source of start tag: '<...>'."""
    121         return self.__starttag_text
    122 
    123     def set_cdata_mode(self):
    124         self.interesting = interesting_cdata
    125 
    126     def clear_cdata_mode(self):
    127         self.interesting = interesting_normal
    128 
    129     # Internal -- handle data as far as reasonable.  May leave state

    130     # and data to be processed by a subsequent call.  If 'end' is

    131     # true, force handling all data as if followed by EOF marker.

    132     def goahead(self, end):
    133         rawdata = self.rawdata
    134         i = 0
    135         n = len(rawdata)
    136         while i < n:
    137             match = self.interesting.search(rawdata, i) # < or &

    138             if match:
    139                 j = match.start()
    140             else:
    141                 j = n
    142             if i < j: self.handle_data(rawdata[i:j])
    143             i = self.updatepos(i, j)
    144             if i == n: break
    145             startswith = rawdata.startswith
    146             if startswith('<', i):
    147                 if starttagopen.match(rawdata, i): # < + letter

    148                     k = self.parse_starttag(i)
    149                 elif startswith("</", i):
    150                     k = self.parse_endtag(i)
    151                 elif startswith("<!--", i):
    152                     k = self.parse_comment(i)
    153                 elif startswith("<?", i):
    154                     k = self.parse_pi(i)
    155                 elif startswith("<!", i):
    156                     k = self.parse_declaration(i)
    157                 elif (i + 1) < n:
    158                     self.handle_data("<")
    159                     k = i + 1
    160                 else:
    161                     break
    162                 if k < 0:
    163                     if end:
    164                         self.error("EOF in middle of construct")
    165                     break
    166                 i = self.updatepos(i, k)
    167             elif startswith("&#", i):
    168                 match = charref.match(rawdata, i)
    169                 if match:
    170                     name = match.group()[2:-1]
    171                     self.handle_charref(name)
    172                     k = match.end()
    173                     if not startswith(';', k-1):
    174                         k = k - 1
    175                     i = self.updatepos(i, k)
    176                     continue
    177                 else:
    178                     if ";" in rawdata[i:]: #bail by consuming &#

    179                         self.handle_data(rawdata[0:2])
    180                         i = self.updatepos(i, 2)
    181                     break
    182             elif startswith('&', i):
    183                 match = entityref.match(rawdata, i)
    184                 if match:
    185                     name = match.group(1)
    186                     self.handle_entityref(name)
    187                     k = match.end()
    188                     if not startswith(';', k-1):
    189                         k = k - 1
    190                     i = self.updatepos(i, k)
    191                     continue
    192                 match = incomplete.match(rawdata, i)
    193                 if match:
    194                     # match.group() will contain at least 2 chars

    195                     if end and match.group() == rawdata[i:]:
    196                         self.error("EOF in middle of entity or char ref")
    197                     # incomplete

    198                     break
    199                 elif (i + 1) < n:
    200                     # not the end of the buffer, and can't be confused

    201                     # with some other construct

    202                     self.handle_data("&")
    203                     i = self.updatepos(i, i + 1)
    204                 else:
    205                     break
    206             else:
    207                 assert 0, "interesting.search() lied"
    208         # end while

    209         if end and i < n:
    210             self.handle_data(rawdata[i:n])
    211             i = self.updatepos(i, n)
    212         self.rawdata = rawdata[i:]
    213 
    214     # Internal -- parse processing instr, return end or -1 if not terminated

    215     def parse_pi(self, i):
    216         rawdata = self.rawdata
    217         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
    218         match = piclose.search(rawdata, i+2) # >

    219         if not match:
    220             return -1
    221         j = match.start()
    222         self.handle_pi(rawdata[i+2: j])
    223         j = match.end()
    224         return j
    225 
    226     # Internal -- handle starttag, return end or -1 if not terminated

    227     def parse_starttag(self, i):
    228         self.__starttag_text = None
    229         endpos = self.check_for_whole_start_tag(i)
    230         if endpos < 0:
    231             return endpos
    232         rawdata = self.rawdata
    233         self.__starttag_text = rawdata[i:endpos]
    234 
    235         # Now parse the data between i+1 and j into a tag and attrs

    236         attrs = []
    237         match = tagfind.match(rawdata, i+1)
    238         assert match, 'unexpected call to parse_starttag()'
    239         k = match.end()
    240         self.lasttag = tag = rawdata[i+1:k].lower()
    241 
    242         while k < endpos:
    243             m = attrfind.match(rawdata, k)
    244             if not m:
    245                 break
    246             attrname, rest, attrvalue = m.group(1, 2, 3)
    247             if not rest:
    248                 attrvalue = None
    249             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
    250                  attrvalue[:1] == '"' == attrvalue[-1:]:
    251                 attrvalue = attrvalue[1:-1]
    252                 attrvalue = self.unescape(attrvalue)
    253             attrs.append((attrname.lower(), attrvalue))
    254             k = m.end()
    255 
    256         end = rawdata[k:endpos].strip()
    257         if end not in (">", "/>"):
    258             lineno, offset = self.getpos()
    259             if "\n" in self.__starttag_text:
    260                 lineno = lineno + self.__starttag_text.count("\n")
    261                 offset = len(self.__starttag_text) \
    262                          - self.__starttag_text.rfind("\n")
    263             else:
    264                 offset = offset + len(self.__starttag_text)
    265             self.error("junk characters in start tag: %r"
    266                        % (rawdata[k:endpos][:20],))
    267         if end.endswith('/>'):
    268             # XHTML-style empty tag: <span attr="value" />

    269             self.handle_startendtag(tag, attrs)
    270         else:
    271             self.handle_starttag(tag, attrs)
    272             if tag in self.CDATA_CONTENT_ELEMENTS:
    273                 self.set_cdata_mode()
    274         return endpos
    275 
    276     # Internal -- check to see if we have a complete starttag; return end

    277     # or -1 if incomplete.

    278     def check_for_whole_start_tag(self, i):
    279         rawdata = self.rawdata
    280         m = locatestarttagend.match(rawdata, i)
    281         if m:
    282             j = m.end()
    283             next = rawdata[j:j+1]
    284             if next == ">":
    285                 return j + 1
    286             if next == "/":
    287                 if rawdata.startswith("/>", j):
    288                     return j + 2
    289                 if rawdata.startswith("/", j):
    290                     # buffer boundary

    291                     return -1
    292                 # else bogus input

    293                 self.updatepos(i, j + 1)
    294                 self.error("malformed empty start tag")
    295             if next == "":
    296                 # end of input

    297                 return -1
    298             if next in ("abcdefghijklmnopqrstuvwxyz=/"
    299                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
    300                 # end of input in or before attribute value, or we have the

    301                 # '/' from a '/>' ending

    302                 return -1
    303             self.updatepos(i, j)
    304             self.error("malformed start tag")
    305         raise AssertionError("we should not get here!")
    306 
    307     # Internal -- parse endtag, return end or -1 if incomplete

    308     def parse_endtag(self, i):
    309         rawdata = self.rawdata
    310         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
    311         match = endendtag.search(rawdata, i+1) # >

    312         if not match:
    313             return -1
    314         j = match.end()
    315         match = endtagfind.match(rawdata, i) # </ + tag + >

    316         if not match:
    317             self.error("bad end tag: %r" % (rawdata[i:j],))
    318         tag = match.group(1)
    319         self.handle_endtag(tag.lower())
    320         self.clear_cdata_mode()
    321         return j
    322 
    323     # Overridable -- finish processing of start+end tag: <tag.../>

    324     def handle_startendtag(self, tag, attrs):
    325         self.handle_starttag(tag, attrs)
    326         self.handle_endtag(tag)
    327 
    328     # Overridable -- handle start tag

    329     def handle_starttag(self, tag, attrs):
    330         pass
    331 
    332     # Overridable -- handle end tag

    333     def handle_endtag(self, tag):
    334         pass
    335 
    336     # Overridable -- handle character reference

    337     def handle_charref(self, name):
    338         pass
    339 
    340     # Overridable -- handle entity reference

    341     def handle_entityref(self, name):
    342         pass
    343 
    344     # Overridable -- handle data

    345     def handle_data(self, data):
    346         pass
    347 
    348     # Overridable -- handle comment

    349     def handle_comment(self, data):
    350         pass
    351 
    352     # Overridable -- handle declaration

    353     def handle_decl(self, decl):
    354         pass
    355 
    356     # Overridable -- handle processing instruction

    357     def handle_pi(self, data):
    358         pass
    359 
    360     def unknown_decl(self, data):
    361         self.error("unknown declaration: %r" % (data,))
    362 
    363     # Internal -- helper to remove special character quoting

    364     entitydefs = None
    365     def unescape(self, s):
    366         if '&' not in s:
    367             return s
    368         def replaceEntities(s):
    369             s = s.groups()[0]
    370             try:
    371                 if s[0] == "#":
    372                     s = s[1:]
    373                     if s[0] in ['x','X']:
    374                         c = int(s[1:], 16)
    375                     else:
    376                         c = int(s)
    377                     return unichr(c)
    378             except ValueError:
    379                 return '&#'+s+';'
    380             else:
    381                 # Cannot use name2codepoint directly, because HTMLParser supports apos,

    382                 # which is not part of HTML 4

    383                 import htmlentitydefs
    384                 if HTMLParser.entitydefs is None:
    385                     entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
    386                     for k, v in htmlentitydefs.name2codepoint.iteritems():
    387                         entitydefs[k] = unichr(v)
    388                 try:
    389                     return self.entitydefs[s]
    390                 except KeyError:
    391                     return '&'+s+';'
    392 
    393         return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
    394