# Source listing of the sgmllib module (python2.7).
      1 """A parser for SGML, using the derived class as a static DTD."""
      2 
      3 # XXX This only supports those SGML features used by HTML.
      4 
      5 # XXX There should be a way to distinguish between PCDATA (parsed
      6 # character data -- the normal case), RCDATA (replaceable character
      7 # data -- only char and entity references and end tags are special)
      8 # and CDATA (character data -- only end tags are special).  RCDATA is
      9 # not supported at all.
     10 
     11 
     12 from warnings import warnpy3k
     13 warnpy3k("the sgmllib module has been removed in Python 3.0",
     14          stacklevel=2)
     15 del warnpy3k
     16 
     17 import markupbase
     18 import re
     19 
     20 __all__ = ["SGMLParser", "SGMLParseError"]
     21 
     22 # Regular expressions used for parsing
     23 
     24 interesting = re.compile('[&<]')
     25 incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
     26                            '<([a-zA-Z][^<>]*|'
     27                               '/([a-zA-Z][^<>]*)?|'
     28                               '![^<>]*)?')
     29 
     30 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
     31 charref = re.compile('&#([0-9]+)[^0-9]')
     32 
     33 starttagopen = re.compile('<[>a-zA-Z]')
     34 shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
     35 shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
     36 piclose = re.compile('>')
     37 endbracket = re.compile('[<>]')
     38 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
     39 attrfind = re.compile(
     40     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
     41     r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
     42 
     43 
     44 class SGMLParseError(RuntimeError):
     45     """Exception raised for all parse errors."""
     46     pass
     47 
     48 
     49 # SGML parser base class -- find tags and call handler functions.
     50 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
     51 # The dtd is defined by deriving a class which defines methods
     52 # with special names to handle tags: start_foo and end_foo to handle
     53 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
     54 # (Tags are converted to lower case for this purpose.)  The data
     55 # between tags is passed to the parser by calling self.handle_data()
     56 # with some data as argument (the data may be split up in arbitrary
     57 # chunks).  Entity references are passed by calling
     58 # self.handle_entityref() with the entity reference as argument.
     59 
     60 class SGMLParser(markupbase.ParserBase):
     61     # Definition of entities -- derived classes may override
     62     entity_or_charref = re.compile('&(?:'
     63       '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
     64       ')(;?)')
     65 
     66     def __init__(self, verbose=0):
     67         """Initialize and reset this instance."""
     68         self.verbose = verbose
     69         self.reset()
     70 
     71     def reset(self):
     72         """Reset this instance. Loses all unprocessed data."""
     73         self.__starttag_text = None
     74         self.rawdata = ''
     75         self.stack = []
     76         self.lasttag = '???'
     77         self.nomoretags = 0
     78         self.literal = 0
     79         markupbase.ParserBase.reset(self)
     80 
     81     def setnomoretags(self):
     82         """Enter literal mode (CDATA) till EOF.
     83 
     84         Intended for derived classes only.
     85         """
     86         self.nomoretags = self.literal = 1
     87 
     88     def setliteral(self, *args):
     89         """Enter literal mode (CDATA).
     90 
     91         Intended for derived classes only.
     92         """
     93         self.literal = 1
     94 
     95     def feed(self, data):
     96         """Feed some data to the parser.
     97 
     98         Call this as often as you want, with as little or as much text
     99         as you want (may include '\n').  (This just saves the text,
    100         all the processing is done by goahead().)
    101         """
    102 
    103         self.rawdata = self.rawdata + data
    104         self.goahead(0)
    105 
    106     def close(self):
    107         """Handle the remaining data."""
    108         self.goahead(1)
    109 
    110     def error(self, message):
    111         raise SGMLParseError(message)
    112 
    113     # Internal -- handle data as far as reasonable.  May leave state
    114     # and data to be processed by a subsequent call.  If 'end' is
    115     # true, force handling all data as if followed by EOF marker.
    116     def goahead(self, end):
    117         rawdata = self.rawdata
    118         i = 0
    119         n = len(rawdata)
    120         while i < n:
    121             if self.nomoretags:
    122                 self.handle_data(rawdata[i:n])
    123                 i = n
    124                 break
    125             match = interesting.search(rawdata, i)
    126             if match: j = match.start()
    127             else: j = n
    128             if i < j:
    129                 self.handle_data(rawdata[i:j])
    130             i = j
    131             if i == n: break
    132             if rawdata[i] == '<':
    133                 if starttagopen.match(rawdata, i):
    134                     if self.literal:
    135                         self.handle_data(rawdata[i])
    136                         i = i+1
    137                         continue
    138                     k = self.parse_starttag(i)
    139                     if k < 0: break
    140                     i = k
    141                     continue
    142                 if rawdata.startswith("</", i):
    143                     k = self.parse_endtag(i)
    144                     if k < 0: break
    145                     i = k
    146                     self.literal = 0
    147                     continue
    148                 if self.literal:
    149                     if n > (i + 1):
    150                         self.handle_data("<")
    151                         i = i+1
    152                     else:
    153                         # incomplete
    154                         break
    155                     continue
    156                 if rawdata.startswith("<!--", i):
    157                         # Strictly speaking, a comment is --.*--
    158                         # within a declaration tag <!...>.
    159                         # This should be removed,
    160                         # and comments handled only in parse_declaration.
    161                     k = self.parse_comment(i)
    162                     if k < 0: break
    163                     i = k
    164                     continue
    165                 if rawdata.startswith("<?", i):
    166                     k = self.parse_pi(i)
    167                     if k < 0: break
    168                     i = i+k
    169                     continue
    170                 if rawdata.startswith("<!", i):
    171                     # This is some sort of declaration; in "HTML as
    172                     # deployed," this should only be the document type
    173                     # declaration ("<!DOCTYPE html...>").
    174                     k = self.parse_declaration(i)
    175                     if k < 0: break
    176                     i = k
    177                     continue
    178             elif rawdata[i] == '&':
    179                 if self.literal:
    180                     self.handle_data(rawdata[i])
    181                     i = i+1
    182                     continue
    183                 match = charref.match(rawdata, i)
    184                 if match:
    185                     name = match.group(1)
    186                     self.handle_charref(name)
    187                     i = match.end(0)
    188                     if rawdata[i-1] != ';': i = i-1
    189                     continue
    190                 match = entityref.match(rawdata, i)
    191                 if match:
    192                     name = match.group(1)
    193                     self.handle_entityref(name)
    194                     i = match.end(0)
    195                     if rawdata[i-1] != ';': i = i-1
    196                     continue
    197             else:
    198                 self.error('neither < nor & ??')
    199             # We get here only if incomplete matches but
    200             # nothing else
    201             match = incomplete.match(rawdata, i)
    202             if not match:
    203                 self.handle_data(rawdata[i])
    204                 i = i+1
    205                 continue
    206             j = match.end(0)
    207             if j == n:
    208                 break # Really incomplete
    209             self.handle_data(rawdata[i:j])
    210             i = j
    211         # end while
    212         if end and i < n:
    213             self.handle_data(rawdata[i:n])
    214             i = n
    215         self.rawdata = rawdata[i:]
    216         # XXX if end: check for empty stack
    217 
    218     # Extensions for the DOCTYPE scanner:
    219     _decl_otherchars = '='
    220 
    221     # Internal -- parse processing instr, return length or -1 if not terminated
    222     def parse_pi(self, i):
    223         rawdata = self.rawdata
    224         if rawdata[i:i+2] != '<?':
    225             self.error('unexpected call to parse_pi()')
    226         match = piclose.search(rawdata, i+2)
    227         if not match:
    228             return -1
    229         j = match.start(0)
    230         self.handle_pi(rawdata[i+2: j])
    231         j = match.end(0)
    232         return j-i
    233 
    234     def get_starttag_text(self):
    235         return self.__starttag_text
    236 
    237     # Internal -- handle starttag, return length or -1 if not terminated
    238     def parse_starttag(self, i):
    239         self.__starttag_text = None
    240         start_pos = i
    241         rawdata = self.rawdata
    242         if shorttagopen.match(rawdata, i):
    243             # SGML shorthand: <tag/data/ == <tag>data</tag>
    244             # XXX Can data contain &... (entity or char refs)?
    245             # XXX Can data contain < or > (tag characters)?
    246             # XXX Can there be whitespace before the first /?
    247             match = shorttag.match(rawdata, i)
    248             if not match:
    249                 return -1
    250             tag, data = match.group(1, 2)
    251             self.__starttag_text = '<%s/' % tag
    252             tag = tag.lower()
    253             k = match.end(0)
    254             self.finish_shorttag(tag, data)
    255             self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
    256             return k
    257         # XXX The following should skip matching quotes (' or ")
    258         # As a shortcut way to exit, this isn't so bad, but shouldn't
    259         # be used to locate the actual end of the start tag since the
    260         # < or > characters may be embedded in an attribute value.
    261         match = endbracket.search(rawdata, i+1)
    262         if not match:
    263             return -1
    264         j = match.start(0)
    265         # Now parse the data between i+1 and j into a tag and attrs
    266         attrs = []
    267         if rawdata[i:i+2] == '<>':
    268             # SGML shorthand: <> == <last open tag seen>
    269             k = j
    270             tag = self.lasttag
    271         else:
    272             match = tagfind.match(rawdata, i+1)
    273             if not match:
    274                 self.error('unexpected call to parse_starttag')
    275             k = match.end(0)
    276             tag = rawdata[i+1:k].lower()
    277             self.lasttag = tag
    278         while k < j:
    279             match = attrfind.match(rawdata, k)
    280             if not match: break
    281             attrname, rest, attrvalue = match.group(1, 2, 3)
    282             if not rest:
    283                 attrvalue = attrname
    284             else:
    285                 if (attrvalue[:1] == "'" == attrvalue[-1:] or
    286                     attrvalue[:1] == '"' == attrvalue[-1:]):
    287                     # strip quotes
    288                     attrvalue = attrvalue[1:-1]
    289                 attrvalue = self.entity_or_charref.sub(
    290                     self._convert_ref, attrvalue)
    291             attrs.append((attrname.lower(), attrvalue))
    292             k = match.end(0)
    293         if rawdata[j] == '>':
    294             j = j+1
    295         self.__starttag_text = rawdata[start_pos:j]
    296         self.finish_starttag(tag, attrs)
    297         return j
    298 
    299     # Internal -- convert entity or character reference
    300     def _convert_ref(self, match):
    301         if match.group(2):
    302             return self.convert_charref(match.group(2)) or \
    303                 '&#%s%s' % match.groups()[1:]
    304         elif match.group(3):
    305             return self.convert_entityref(match.group(1)) or \
    306                 '&%s;' % match.group(1)
    307         else:
    308             return '&%s' % match.group(1)
    309 
    310     # Internal -- parse endtag
    311     def parse_endtag(self, i):
    312         rawdata = self.rawdata
    313         match = endbracket.search(rawdata, i+1)
    314         if not match:
    315             return -1
    316         j = match.start(0)
    317         tag = rawdata[i+2:j].strip().lower()
    318         if rawdata[j] == '>':
    319             j = j+1
    320         self.finish_endtag(tag)
    321         return j
    322 
    323     # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    324     def finish_shorttag(self, tag, data):
    325         self.finish_starttag(tag, [])
    326         self.handle_data(data)
    327         self.finish_endtag(tag)
    328 
    329     # Internal -- finish processing of start tag
    330     # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    331     def finish_starttag(self, tag, attrs):
    332         try:
    333             method = getattr(self, 'start_' + tag)
    334         except AttributeError:
    335             try:
    336                 method = getattr(self, 'do_' + tag)
    337             except AttributeError:
    338                 self.unknown_starttag(tag, attrs)
    339                 return -1
    340             else:
    341                 self.handle_starttag(tag, method, attrs)
    342                 return 0
    343         else:
    344             self.stack.append(tag)
    345             self.handle_starttag(tag, method, attrs)
    346             return 1
    347 
    348     # Internal -- finish processing of end tag
    349     def finish_endtag(self, tag):
    350         if not tag:
    351             found = len(self.stack) - 1
    352             if found < 0:
    353                 self.unknown_endtag(tag)
    354                 return
    355         else:
    356             if tag not in self.stack:
    357                 try:
    358                     method = getattr(self, 'end_' + tag)
    359                 except AttributeError:
    360                     self.unknown_endtag(tag)
    361                 else:
    362                     self.report_unbalanced(tag)
    363                 return
    364             found = len(self.stack)
    365             for i in range(found):
    366                 if self.stack[i] == tag: found = i
    367         while len(self.stack) > found:
    368             tag = self.stack[-1]
    369             try:
    370                 method = getattr(self, 'end_' + tag)
    371             except AttributeError:
    372                 method = None
    373             if method:
    374                 self.handle_endtag(tag, method)
    375             else:
    376                 self.unknown_endtag(tag)
    377             del self.stack[-1]
    378 
    379     # Overridable -- handle start tag
    380     def handle_starttag(self, tag, method, attrs):
    381         method(attrs)
    382 
    383     # Overridable -- handle end tag
    384     def handle_endtag(self, tag, method):
    385         method()
    386 
    387     # Example -- report an unbalanced </...> tag.
    388     def report_unbalanced(self, tag):
    389         if self.verbose:
    390             print '*** Unbalanced </' + tag + '>'
    391             print '*** Stack:', self.stack
    392 
    393     def convert_charref(self, name):
    394         """Convert character reference, may be overridden."""
    395         try:
    396             n = int(name)
    397         except ValueError:
    398             return
    399         if not 0 <= n <= 127:
    400             return
    401         return self.convert_codepoint(n)
    402 
    403     def convert_codepoint(self, codepoint):
    404         return chr(codepoint)
    405 
    406     def handle_charref(self, name):
    407         """Handle character reference, no need to override."""
    408         replacement = self.convert_charref(name)
    409         if replacement is None:
    410             self.unknown_charref(name)
    411         else:
    412             self.handle_data(replacement)
    413 
    414     # Definition of entities -- derived classes may override
    415     entitydefs = \
    416             {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
    417 
    418     def convert_entityref(self, name):
    419         """Convert entity references.
    420 
    421         As an alternative to overriding this method; one can tailor the
    422         results by setting up the self.entitydefs mapping appropriately.
    423         """
    424         table = self.entitydefs
    425         if name in table:
    426             return table[name]
    427         else:
    428             return
    429 
    430     def handle_entityref(self, name):
    431         """Handle entity references, no need to override."""
    432         replacement = self.convert_entityref(name)
    433         if replacement is None:
    434             self.unknown_entityref(name)
    435         else:
    436             self.handle_data(replacement)
    437 
    438     # Example -- handle data, should be overridden
    439     def handle_data(self, data):
    440         pass
    441 
    442     # Example -- handle comment, could be overridden
    443     def handle_comment(self, data):
    444         pass
    445 
    446     # Example -- handle declaration, could be overridden
    447     def handle_decl(self, decl):
    448         pass
    449 
    450     # Example -- handle processing instruction, could be overridden
    451     def handle_pi(self, data):
    452         pass
    453 
    454     # To be overridden -- handlers for unknown objects
    455     def unknown_starttag(self, tag, attrs): pass
    456     def unknown_endtag(self, tag): pass
    457     def unknown_charref(self, ref): pass
    458     def unknown_entityref(self, ref): pass
    459 
    460 
    461 class TestSGMLParser(SGMLParser):
    462 
    463     def __init__(self, verbose=0):
    464         self.testdata = ""
    465         SGMLParser.__init__(self, verbose)
    466 
    467     def handle_data(self, data):
    468         self.testdata = self.testdata + data
    469         if len(repr(self.testdata)) >= 70:
    470             self.flush()
    471 
    472     def flush(self):
    473         data = self.testdata
    474         if data:
    475             self.testdata = ""
    476             print 'data:', repr(data)
    477 
    478     def handle_comment(self, data):
    479         self.flush()
    480         r = repr(data)
    481         if len(r) > 68:
    482             r = r[:32] + '...' + r[-32:]
    483         print 'comment:', r
    484 
    485     def unknown_starttag(self, tag, attrs):
    486         self.flush()
    487         if not attrs:
    488             print 'start tag: <' + tag + '>'
    489         else:
    490             print 'start tag: <' + tag,
    491             for name, value in attrs:
    492                 print name + '=' + '"' + value + '"',
    493             print '>'
    494 
    495     def unknown_endtag(self, tag):
    496         self.flush()
    497         print 'end tag: </' + tag + '>'
    498 
    499     def unknown_entityref(self, ref):
    500         self.flush()
    501         print '*** unknown entity ref: &' + ref + ';'
    502 
    503     def unknown_charref(self, ref):
    504         self.flush()
    505         print '*** unknown char ref: &#' + ref + ';'
    506 
    507     def unknown_decl(self, data):
    508         self.flush()
    509         print '*** unknown decl: [' + data + ']'
    510 
    511     def close(self):
    512         SGMLParser.close(self)
    513         self.flush()
    514 
    515 
    516 def test(args = None):
    517     import sys
    518 
    519     if args is None:
    520         args = sys.argv[1:]
    521 
    522     if args and args[0] == '-s':
    523         args = args[1:]
    524         klass = SGMLParser
    525     else:
    526         klass = TestSGMLParser
    527 
    528     if args:
    529         file = args[0]
    530     else:
    531         file = 'test.html'
    532 
    533     if file == '-':
    534         f = sys.stdin
    535     else:
    536         try:
    537             f = open(file, 'r')
    538         except IOError, msg:
    539             print file, ":", msg
    540             sys.exit(1)
    541 
    542     data = f.read()
    543     if f is not sys.stdin:
    544         f.close()
    545 
    546     x = klass()
    547     for c in data:
    548         x.feed(c)
    549     x.close()
    550 
    551 
    552 if __name__ == '__main__':
    553     test()
    554