Home | History | Annotate | Download | only in Lib
      1 """A parser for XML, using the derived class as static DTD."""
      2 
      3 # Author: Sjoerd Mullender.

      4 
      5 import re
      6 import string
      7 
      8 import warnings
      9 warnings.warn("The xmllib module is obsolete.  Use xml.sax instead.",
     10               DeprecationWarning, 2)
     11 del warnings
     12 
     13 version = '0.3'
     14 
     15 class Error(RuntimeError):
     16     pass
     17 
     18 # Regular expressions used for parsing

     19 
# Building blocks: plain pattern strings that are spliced into the
# compiled expressions below.
_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
# the three characters that end a run of plain character data: ] & <
interesting = re.compile('[]&<]')

amp = re.compile('&')
# The reference patterns below deliberately consume ONE character after
# the name/number (normally the ';').  Callers inspect that character
# and back up by one when it is not a semicolon.
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')            # a string made only of white space
newline = re.compile('\n')

# one attribute: leading white space, a name, and an optional
# "= value" part where the value is quoted or a bare token
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
# a complete start tag: name, any number of attributes, optional '/'
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
                      '(?P<attrs>(?:'+attrfind.pattern+')*)'+
                      starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
# everything up to and including the next '>' that is not inside a
# quoted string
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
# (_SystemLiteral and _PublicLiteral are %-templates; the argument
# becomes the name of the capturing group)
_SystemLiteral = '(?P<%s>'+_QStr+')'
_PublicLiteral = '(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                        "'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
# the head of a DOCTYPE declaration, up to (not including) an optional
# internal subset or the closing '>'
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)
# the <?xml version=... encoding=... standalone=...?> declaration;
# the captured groups still include their surrounding quotes
xmldecl = re.compile('<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')
# translation table mapping CR, LF and TAB to a plain space
attrtrans = string.maketrans(' \r\n\t', '    ')
     74 
     75 # definitions for XML namespaces

_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
# a complete string that is a single NCName
ncname = re.compile(_NCName + '$')
# a qualified name: groups 'prefix' (None when absent) and 'local'
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
                   '(?P<local>' + _NCName + ')$')

# an attribute name that declares a namespace: "xmlns" or "xmlns:prefix";
# group 'ncname' is the declared prefix (None for plain "xmlns")
xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')
     82 
     83 # XML parser base class -- find tags and call handler functions.

     84 # Usage: p = XMLParser(); p.feed(data); ...; p.close().

     85 # The dtd is defined by deriving a class which defines methods with

     86 # special names to handle tags: start_foo and end_foo to handle <foo>

     87 # and </foo>, respectively.  The data between tags is passed to the

     88 # parser by calling self.handle_data() with some data as argument (the

     89 # data may be split up in arbitrary chunks).

     90 
     91 class XMLParser:
     92     attributes = {}                     # default, to be overridden

     93     elements = {}                       # default, to be overridden

     94 
     95     # parsing options, settable using keyword args in __init__

     96     __accept_unquoted_attributes = 0
     97     __accept_missing_endtag_name = 0
     98     __map_case = 0
     99     __accept_utf8 = 0
    100     __translate_attribute_references = 1
    101 
    102     # Interface -- initialize and reset this instance

    103     def __init__(self, **kw):
    104         self.__fixed = 0
    105         if 'accept_unquoted_attributes' in kw:
    106             self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
    107         if 'accept_missing_endtag_name' in kw:
    108             self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
    109         if 'map_case' in kw:
    110             self.__map_case = kw['map_case']
    111         if 'accept_utf8' in kw:
    112             self.__accept_utf8 = kw['accept_utf8']
    113         if 'translate_attribute_references' in kw:
    114             self.__translate_attribute_references = kw['translate_attribute_references']
    115         self.reset()
    116 
    117     def __fixelements(self):
    118         self.__fixed = 1
    119         self.elements = {}
    120         self.__fixdict(self.__dict__)
    121         self.__fixclass(self.__class__)
    122 
    123     def __fixclass(self, kl):
    124         self.__fixdict(kl.__dict__)
    125         for k in kl.__bases__:
    126             self.__fixclass(k)
    127 
    128     def __fixdict(self, dict):
    129         for key in dict.keys():
    130             if key[:6] == 'start_':
    131                 tag = key[6:]
    132                 start, end = self.elements.get(tag, (None, None))
    133                 if start is None:
    134                     self.elements[tag] = getattr(self, key), end
    135             elif key[:4] == 'end_':
    136                 tag = key[4:]
    137                 start, end = self.elements.get(tag, (None, None))
    138                 if end is None:
    139                     self.elements[tag] = start, getattr(self, key)
    140 
    141     # Interface -- reset this instance.  Loses all unprocessed data

    142     def reset(self):
    143         self.rawdata = ''
    144         self.stack = []
    145         self.nomoretags = 0
    146         self.literal = 0
    147         self.lineno = 1
    148         self.__at_start = 1
    149         self.__seen_doctype = None
    150         self.__seen_starttag = 0
    151         self.__use_namespaces = 0
    152         self.__namespaces = {'xml':None}   # xml is implicitly declared

    153         # backward compatibility hack: if elements not overridden,

    154         # fill it in ourselves

    155         if self.elements is XMLParser.elements:
    156             self.__fixelements()
    157 
    158     # For derived classes only -- enter literal mode (CDATA) till EOF

    159     def setnomoretags(self):
    160         self.nomoretags = self.literal = 1
    161 
    162     # For derived classes only -- enter literal mode (CDATA)

    163     def setliteral(self, *args):
    164         self.literal = 1
    165 
    166     # Interface -- feed some data to the parser.  Call this as

    167     # often as you want, with as little or as much text as you

    168     # want (may include '\n').  (This just saves the text, all the

    169     # processing is done by goahead().)

    170     def feed(self, data):
    171         self.rawdata = self.rawdata + data
    172         self.goahead(0)
    173 
    174     # Interface -- handle the remaining data

    175     def close(self):
    176         self.goahead(1)
    177         if self.__fixed:
    178             self.__fixed = 0
    179             # remove self.elements so that we don't leak

    180             del self.elements
    181 
    182     # Interface -- translate references

    def translate_references(self, data, all = 1):
        """Return *data* with character and entity references expanded.

        data -- the string to translate.
        all  -- when true (the default), entity references are looked
                up in self.entitydefs as well; when false only
                character references are expanded and well-formed
                entity references are left untouched.

        If the translate_attribute_references option is off, *data* is
        returned unchanged.  Problems are reported through
        self.syntax_error(); a '&' that does not start a usable
        reference is kept as literal text.

        NOTE(review): chr() is called on the numeric value of a
        character reference without a range check, so a value outside
        what chr() accepts propagates as an exception from here.
        """
        if not self.__translate_attribute_references:
            return data
        i = 0
        while 1:
            res = amp.search(data, i)
            if res is None:
                return data
            s = res.start(0)
            res = ref.match(data, s)
            if res is None:
                self.syntax_error("bogus `&'")
                i = s+1
                continue
            i = res.end(0)
            text = res.group(1)
            # ref always consumes one character after the name; unless
            # that character is the ';' it is not part of the reference
            # and must stay in the output.
            terminated = data[i - 1] == ';'
            rescan = 0
            if text[0] == '#':
                if text[1] == 'x':
                    text = chr(int(text[2:], 16))
                else:
                    text = chr(int(text[1:]))
                if not terminated:
                    self.syntax_error("`;' missing after char reference")
                    i = i-1
            elif all:
                if text in self.entitydefs:
                    if not terminated:
                        # previously the character after the name was
                        # silently dropped together with the reference
                        self.syntax_error("`;' missing after entity reference")
                        i = i-1
                    text = self.entitydefs[text]
                    # the replacement may itself contain references
                    rescan = 1
                elif not terminated:
                    self.syntax_error("bogus `&'")
                    i = s + 1 # just past the &
                    continue
                else:
                    self.syntax_error("reference to unknown entity `&%s;'" % text)
                    text = '&' + text + ';'
            elif not terminated:
                self.syntax_error("bogus `&'")
                i = s + 1 # just past the &
                continue
            else:
                # entity translation was not requested: keep "&name;"
                # exactly as written instead of reducing it to "name"
                continue

            # when we get here, text contains the translated text and i
            # points to the end of the string that is to be replaced
            data = data[:s] + text + data[i:]
            if rescan:
                i = s
            else:
                i = s + len(text)
    231 
    232     # Interface - return a dictionary of all namespaces currently valid
    233     def getnamespace(self):
    234         nsdict = {}
    235         for t, d, nst in self.stack:
    236             nsdict.update(d)
    237         return nsdict
    238 
    239     # Internal -- handle data as far as reasonable.  May leave state
    240     # and data to be processed by a subsequent call.  If 'end' is
    241     # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Parse as much of self.rawdata as possible.

        end -- true when no more data will follow; leftover input that
               cannot be parsed is then reported and passed on as data,
               and missing elements / end tags are reported.

        The consumed part is removed from self.rawdata; an incomplete
        construct at the end of the buffer is kept for the next call.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if i > 0:
                self.__at_start = 0
            if self.nomoretags:
                # everything that is left is plain data
                data = rawdata[i:n]
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
                i = n
                break
            # find the next ']', '&' or '<'; the text before it is data
            res = interesting.search(rawdata, i)
            if res:
                j = res.start(0)
            else:
                j = n
            if i < j:
                data = rawdata[i:j]
                if self.__at_start and space.match(data) is None:
                    self.syntax_error('illegal data at start of file')
                self.__at_start = 0
                if not self.stack and space.match(data) is None:
                    self.syntax_error('data not in content')
                if not self.__accept_utf8 and illegal.search(data):
                    self.syntax_error('illegal character in content')
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
            i = j
            if i == n: break
            if rawdata[i] == '<':
                # Each construct below is tried in turn.  A parse_*
                # helper returning a negative value means the construct
                # is not complete yet, so parsing stops until more data
                # arrives.
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # in literal mode the '<' is ordinary data
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0: break
                    self.__seen_starttag = 1
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i =  k
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_comment(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if cdataopen.match(rawdata, i):
                    k = self.parse_cdata(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = xmldecl.match(rawdata, i)
                if res:
                    if not self.__at_start:
                        self.syntax_error("<?xml?> declaration not at start of document")
                    version, encoding, standalone = res.group('version',
                                                              'encoding',
                                                              'standalone')
                    # the groups include the quotes; [1:-1] strips them
                    if version[1:-1] != '1.0':
                        raise Error('only XML version 1.0 supported')
                    if encoding: encoding = encoding[1:-1]
                    if standalone: standalone = standalone[1:-1]
                    self.handle_xml(encoding, standalone)
                    i = res.end(0)
                    continue
                res = procopen.match(rawdata, i)
                if res:
                    k = self.parse_proc(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = doctype.match(rawdata, i)
                if res:
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    if self.__seen_doctype:
                        self.syntax_error('multiple DOCTYPE elements')
                    if self.__seen_starttag:
                        self.syntax_error('DOCTYPE not at beginning of document')
                    k = self.parse_doctype(res)
                    if k < 0: break
                    # remember the declared root element name
                    self.__seen_doctype = res.group('name')
                    if self.__map_case:
                        self.__seen_doctype = self.__seen_doctype.lower()
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    data = rawdata[i]
                    self.handle_data(data)
                    i = i+1
                    continue
                res = charref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    # the pattern consumed one character past the
                    # digits; give it back if it was not the ';'
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in charref")
                        i = i-1
                    if not self.stack:
                        self.syntax_error('data not in content')
                    # [:-1] drops the terminating character from the group
                    self.handle_charref(res.group('char')[:-1])
                    self.lineno = self.lineno + res.group(0).count('\n')
                    continue
                res = entityref.match(rawdata, i)
                if res is not None:
                    i = res.end(0)
                    if rawdata[i-1] != ';':
                        self.syntax_error("`;' missing in entityref")
                        i = i-1
                    name = res.group('name')
                    if self.__map_case:
                        name = name.lower()
                    if name in self.entitydefs:
                        # splice the replacement text into the buffer and
                        # continue parsing from the start of that text
                        self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
                        n = len(rawdata)
                        i = res.start(0)
                    else:
                        self.unknown_entityref(name)
                    self.lineno = self.lineno + res.group(0).count('\n')
                    continue
            elif rawdata[i] == ']':
                if self.literal:
                    data = rawdata[i]
                    self.handle_data(data)
                    i = i+1
                    continue
                # need three characters to decide whether this is ']]>'
                if n-i < 3:
                    break
                if cdataclose.match(rawdata, i):
                    self.syntax_error("bogus `]]>'")
                self.handle_data(rawdata[i])
                i = i+1
                continue
            else:
                raise Error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            break
        # end while
        if i > 0:
            self.__at_start = 0
        if end and i < n:
            # No more input will come, yet something is left that could
            # not be parsed: report it, pass one character on as data
            # and retry with the remainder.
            data = rawdata[i]
            self.syntax_error("bogus `%s'" % data)
            if not self.__accept_utf8 and illegal.search(data):
                self.syntax_error('illegal character in content')
            self.handle_data(data)
            self.lineno = self.lineno + data.count('\n')
            self.rawdata = rawdata[i+1:]
            return self.goahead(end)
        # keep only the unprocessed tail for the next call
        self.rawdata = rawdata[i:]
        if end:
            if not self.__seen_starttag:
                self.syntax_error('no elements in file')
            if self.stack:
                self.syntax_error('missing end tags')
                # close every element that is still open
                while self.stack:
                    self.finish_endtag(self.stack[-1][0])
    424 
    425     # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i):
        """Parse the comment that starts at index *i* of self.rawdata.

        Returns the index just past the closing '-->', or -1 when the
        comment is not terminated in the data buffered so far.  The
        comment text is passed to self.handle_comment(); problems are
        reported through self.syntax_error().

        Raises Error if the text at *i* does not start with '<!--'.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise Error('unexpected call to parse_comment')
        body = i+4                      # first character after '<!--'
        res = commentclose.search(rawdata, body)
        if res is None:
            return -1
        close = res.start(0)            # index of the closing '-->'
        if doubledash.search(rawdata, body, close):
            self.syntax_error("`--' inside comment")
        # Only look at the character before '-->' when the comment has
        # a body; for the empty comment '<!---->' that character is the
        # last '-' of the opening '<!--', not part of the content.
        if close > body and rawdata[close-1] == '-':
            self.syntax_error('comment cannot end in three dashes')
        if not self.__accept_utf8 and \
           illegal.search(rawdata, body, close):
            self.syntax_error('illegal character in comment')
        self.handle_comment(rawdata[body:close])
        return res.end(0)
    442 
    443     # Internal -- handle DOCTYPE tag, return length or -1 if not terminated

    def parse_doctype(self, res):
        """Finish parsing a DOCTYPE declaration.

        res -- the match object obtained from the doctype pattern.

        Returns the index just past the closing '>', or -1 when the
        declaration is not complete in the data buffered so far.
        self.handle_doctype() is called with the name, the public and
        system identifiers (quotes removed) and the text of the
        internal subset, or None when there is no internal subset.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name = res.group('name')
        if self.__map_case:
            name = name.lower()
        pubid, syslit = res.group('pubid', 'syslit')
        if pubid is not None:
            pubid = pubid[1:-1]         # remove quotes
            pubid = ' '.join(pubid.split()) # normalize
        if syslit is not None: syslit = syslit[1:-1] # remove quotes
        # j stays at the end of the matched head, k scans forward
        j = k = res.end(0)
        if k >= n:
            return -1
        if rawdata[k] == '[':
            # Internal subset: scan for the closing ']' while keeping
            # track of quoted strings (sq/dq) and of the nesting of
            # '<' ... '>' (level).
            level = 0
            k = k+1
            dq = sq = 0
            while k < n:
                c = rawdata[k]
                if not sq and c == '"':
                    dq = not dq
                elif not dq and c == "'":
                    sq = not sq
                elif sq or dq:
                    # inside a quoted string nothing else is special
                    pass
                elif level <= 0 and c == ']':
                    # end of the subset; only white space may precede '>'
                    res = endbracket.match(rawdata, k+1)
                    if res is None:
                        return -1
                    self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
                    return res.end(0)
                elif c == '<':
                    level = level + 1
                elif c == '>':
                    level = level - 1
                    if level < 0:
                        self.syntax_error("bogus `>' in DOCTYPE")
                k = k+1
        # Reached when there is no internal subset, or when the scan
        # above ran out of data before finding the closing ']'.
        res = endbracketfind.match(rawdata, k)
        if res is None:
            return -1
        if endbracket.match(rawdata, k) is None:
            self.syntax_error('garbage in DOCTYPE')
        self.handle_doctype(name, pubid, syslit, None)
        return res.end(0)
    490 
    491     # Internal -- handle CDATA tag, return length or -1 if not terminated
    def parse_cdata(self, i):
        """Parse the CDATA section that starts at index *i*.

        Returns the index just past ']]>', or -1 when the section is
        not terminated in the data buffered so far.  The section text
        is passed to self.handle_cdata().

        Raises Error if the text at *i* does not start with '<![CDATA['.
        """
        rawdata = self.rawdata
        start = i+9                     # first character after '<![CDATA['
        if rawdata[i:start] != '<![CDATA[':
            raise Error('unexpected call to parse_cdata')
        closing = cdataclose.search(rawdata, start)
        if closing is None:
            return -1
        stop = closing.start(0)
        if not self.__accept_utf8:
            if illegal.search(rawdata, start, stop):
                self.syntax_error('illegal character in CDATA')
        if not self.stack:
            self.syntax_error('CDATA not in content')
        self.handle_cdata(rawdata[start:stop])
        return closing.end(0)
    506 
    507     __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}
    508     # Internal -- handle a processing instruction tag
    def parse_proc(self, i):
        """Parse the processing instruction that starts at index *i*.

        Returns the index just past the closing '?>', or -1 when the
        instruction is not terminated in the data buffered so far.

        The target 'xml:namespace' is treated as an old-fashioned
        namespace declaration and recorded in the parser's namespace
        table; every other instruction is passed to self.handle_proc()
        as (name, text).  Problems are reported through
        self.syntax_error().

        Raises Error if no name follows the '<?' at *i*.
        """
        rawdata = self.rawdata
        end = procclose.search(rawdata, i)
        if end is None:
            return -1
        j = end.start(0)
        if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):
            self.syntax_error('illegal character in processing instruction')
        res = tagfind.match(rawdata, i+2)
        if res is None:
            raise Error('unexpected call to parse_proc')
        k = res.end(0)
        name = res.group(0)
        if self.__map_case:
            name = name.lower()
        if name == 'xml:namespace':
            self.syntax_error('old-fashioned namespace declaration')
            self.__use_namespaces = -1
            # namespace declaration
            # this must come after the <?xml?> declaration (if any)
            # and before the <!DOCTYPE> (if any).
            if self.__seen_doctype or self.__seen_starttag:
                self.syntax_error('xml:namespace declaration too late in document')
            attrdict, namespace, k = self.parse_attributes(name, k, j)
            if namespace:
                self.syntax_error('namespace declaration inside namespace declaration')
            for attrname in attrdict:
                if attrname not in self.__xml_namespace_attributes:
                    self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
            if 'ns' not in attrdict or 'prefix' not in attrdict:
                self.syntax_error('xml:namespace without required attributes')
                # Without both attributes there is nothing to record.
                # Stop here so that a syntax_error() that returns does
                # not lead to a TypeError/KeyError further down.
                return end.end(0)
            prefix = attrdict['prefix']
            if ncname.match(prefix) is None:
                self.syntax_error('xml:namespace illegal prefix value')
                return end.end(0)
            if prefix in self.__namespaces:
                self.syntax_error('xml:namespace prefix not unique')
            self.__namespaces[prefix] = attrdict['ns']
        else:
            if name.lower() == 'xml':
                self.syntax_error('illegal processing instruction target name')
            self.handle_proc(name, rawdata[k:j])
        return end.end(0)
    552 
    553     # Internal -- parse attributes between i and j
    def parse_attributes(self, tag, i, j):
        """Parse the attributes found in self.rawdata between *i* and *j*.

        Returns a tuple (attrdict, namespace, i): the ordinary
        attributes, the namespace declarations (prefix -> value, with
        '' as the key for a plain "xmlns") and the index at which
        parsing stopped.  *tag* is accepted but not used here.
        """
        rawdata = self.rawdata
        attrdict = {}
        namespace = {}
        while i < j:
            found = attrfind.match(rawdata, i)
            if found is None:
                break
            i = found.end(0)
            attrname = found.group('name')
            attrvalue = found.group('value')
            if self.__map_case:
                attrname = attrname.lower()
            if attrvalue is None:
                self.syntax_error("no value specified for attribute `%s'" % attrname)
                attrvalue = attrname
            elif attrvalue[:1] in ("'", '"') and attrvalue[:1] == attrvalue[-1:]:
                # strip the matching pair of quotes
                attrvalue = attrvalue[1:-1]
            elif not self.__accept_unquoted_attributes:
                self.syntax_error("attribute `%s' value not quoted" % attrname)
            declaration = xmlns.match(attrname)
            if declaration is not None:
                # namespace declaration
                prefix = declaration.group('ncname')
                namespace[prefix or ''] = attrvalue or None
                if not self.__use_namespaces:
                    self.__use_namespaces = len(self.stack)+1
                continue
            if '<' in attrvalue:
                self.syntax_error("`<' illegal in attribute value")
            if attrname in attrdict:
                self.syntax_error("attribute `%s' specified twice" % attrname)
            normalized = attrvalue.translate(attrtrans)
            attrdict[attrname] = self.translate_references(normalized)
        return attrdict, namespace, i
    589 
    590     # Internal -- handle starttag, return length or -1 if not terminated

    def parse_starttag(self, i):
        """Parse the start tag that begins at index *i* of self.rawdata.

        Returns the index just past the tag, or -1 when the tag is not
        terminated in the data buffered so far.  The element is pushed
        on self.stack as (tagname, namespace dict, translated name) and
        self.finish_starttag() is called; for an empty-element tag
        ('/>') self.finish_endtag() is called as well.
        """
        rawdata = self.rawdata
        # i points to start of tag
        end = endbracketfind.match(rawdata, i+1)
        if end is None:
            return -1
        tag = starttagmatch.match(rawdata, i)
        if tag is None or tag.end(0) != end.end(0):
            self.syntax_error('garbage in starttag')
            return end.end(0)
        tagname = tag.group('tagname')
        if self.__map_case:
            tagname = tagname.lower()
        nstag = tagname
        if not self.__seen_starttag:
            if self.__seen_doctype and tagname != self.__seen_doctype:
                self.syntax_error('starttag does not match DOCTYPE')
        elif not self.stack:
            self.syntax_error('multiple elements on top level')
        attr_start, attr_end = tag.span('attrs')
        attrdict, nsdict, _stop = self.parse_attributes(tagname, attr_start,
                                                        attr_end)
        self.stack.append((tagname, nsdict, nstag))
        # namespaces may have been switched on by the attributes that
        # were just parsed, so this is tested only now
        split = None
        if self.__use_namespaces:
            split = qname.match(tagname)
        if split is not None:
            prefix, nstag = split.group('prefix', 'local')
            if prefix is None:
                prefix = ''
            ns = None
            # a later stack entry overrides an earlier one
            for _tag, declared, _nstag in self.stack:
                if prefix in declared:
                    ns = declared[prefix]
            if ns is None and prefix != '':
                ns = self.__namespaces.get(prefix)
            if ns is not None:
                nstag = ns + ' ' + nstag
            elif prefix != '':
                nstag = prefix + ':' + nstag # undo split
            self.stack[-1] = tagname, nsdict, nstag
        # translate namespace of attributes
        # map from new name to old name (used for error reporting)
        attrnamemap = dict((key, key) for key in attrdict)
        if self.__use_namespaces:
            translated = {}
            for original_key, val in attrdict.items():
                key = original_key
                split = qname.match(key)
                if split is not None:
                    aprefix, key = split.group('prefix', 'local')
                    if self.__map_case:
                        key = key.lower()
                    if aprefix is not None:
                        ans = None
                        for _tag, declared, _nstag in self.stack:
                            if aprefix in declared:
                                ans = declared[aprefix]
                        if ans is None:
                            ans = self.__namespaces.get(aprefix)
                        if ans is not None:
                            key = ans + ' ' + key
                        else:
                            key = aprefix + ':' + key
                translated[key] = val
                attrnamemap[key] = original_key
            attrdict = translated
        attributes = self.attributes.get(nstag)
        if attributes is not None:
            for key in attrdict:
                if key not in attributes:
                    self.syntax_error("unknown attribute `%s' in tag `%s'" % (attrnamemap[key], tagname))
            # fill in the defaults of attributes that were not given
            for key, val in attributes.items():
                if val is not None:
                    attrdict.setdefault(key, val)
        method = self.elements.get(nstag, (None, None))[0]
        self.finish_starttag(nstag, attrdict, method)
        if tag.group('slash') == '/':
            self.finish_endtag(tagname)
        return tag.end(0)
    671 
    672     # Internal -- parse endtag
    def parse_endtag(self, i):
        """Parse the end tag that starts at rawdata[i] (the '<' of '</').

        Returns:
            -1 if endbracketfind cannot match a terminating '>' starting
            at i+1; i+1 when in literal mode and the tag does not close the
            element on top of the stack (the '<' is then handed to
            handle_data as ordinary data); otherwise the index just past
            the closing '>' after finish_endtag has been called.

        Problems are reported through syntax_error(), which raises Error
        unless a subclass overrides it.
        """
        rawdata = self.rawdata
        end = endbracketfind.match(rawdata, i+1)
        if end is None:
            return -1
        res = tagfind.match(rawdata, i+2)
        if res is None:
            # No name follows '</'.
            if self.literal:
                self.handle_data(rawdata[i])
                return i+1
            if not self.__accept_missing_endtag_name:
                self.syntax_error('no name specified in end tag')
            # A name-less end tag closes the innermost open element.  With
            # nothing open there is no element to close: use an empty name
            # so finish_endtag reports the problem instead of this method
            # failing with IndexError on an empty stack.
            if self.stack:
                tag = self.stack[-1][0]
            else:
                tag = ''
            k = i+2
        else:
            tag = res.group(0)
            if self.__map_case:
                tag = tag.lower()
            if self.literal:
                # In literal mode only the end tag of the element on top of
                # the stack is recognised; anything else is plain data.
                if not self.stack or tag != self.stack[-1][0]:
                    self.handle_data(rawdata[i])
                    return i+1
            k = res.end(0)
        # Only optional white space may sit between the name and '>'.
        if endbracket.match(rawdata, k) is None:
            self.syntax_error('garbage in end tag')
        self.finish_endtag(tag)
        return end.end(0)
    700 
    701     # Internal -- finish processing of start tag
    702     def finish_starttag(self, tagname, attrdict, method):
    703         if method is not None:
    704             self.handle_starttag(tagname, method, attrdict)
    705         else:
    706             self.unknown_starttag(tagname, attrdict)
    707 
    708     # Internal -- finish processing of end tag
    709     def finish_endtag(self, tag):
    710         self.literal = 0
    711         if not tag:
    712             self.syntax_error('name-less end tag')
    713             found = len(self.stack) - 1
    714             if found < 0:
    715                 self.unknown_endtag(tag)
    716                 return
    717         else:
    718             found = -1
    719             for i in range(len(self.stack)):
    720                 if tag == self.stack[i][0]:
    721                     found = i
    722             if found == -1:
    723                 self.syntax_error('unopened end tag')
    724                 return
    725         while len(self.stack) > found:
    726             if found < len(self.stack) - 1:
    727                 self.syntax_error('missing close tag for %s' % self.stack[-1][2])
    728             nstag = self.stack[-1][2]
    729             method = self.elements.get(nstag, (None, None))[1]
    730             if method is not None:
    731                 self.handle_endtag(nstag, method)
    732             else:
    733                 self.unknown_endtag(nstag)
    734             if self.__use_namespaces == len(self.stack):
    735                 self.__use_namespaces = 0
    736             del self.stack[-1]
    737 
    738     # Overridable -- handle xml processing instruction
    def handle_xml(self, encoding, standalone):
        """Hook for the xml processing instruction; does nothing by default.

        Subclasses may override this to act on the encoding and standalone
        values they are given.
        """
    741 
    742     # Overridable -- handle DOCTYPE
    def handle_doctype(self, tag, pubid, syslit, data):
        """Hook for a DOCTYPE declaration; does nothing by default.

        Subclasses may override this to inspect the four values they are
        given.
        """
    745 
    746     # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        """Handle a start tag for which a handler method is registered.

        The default simply calls method(attrs); tag is not used here but
        is available to subclasses that override this hook.
        """
        method(attrs)
    749 
    750     # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        """Handle an end tag for which a handler method is registered.

        The default simply calls method() with no arguments; tag is not
        used here but is available to subclasses that override this hook.
        """
        method()
    753 
    754     # Example -- handle character reference, no need to override
    755     def handle_charref(self, name):
    756         try:
    757             if name[0] == 'x':
    758                 n = int(name[1:], 16)
    759             else:
    760                 n = int(name)
    761         except ValueError:
    762             self.unknown_charref(name)
    763             return
    764         if not 0 <= n <= 255:
    765             self.unknown_charref(name)
    766             return
    767         self.handle_data(chr(n))
    768 
    769     # Definition of entities -- derived classes may override
    # Maps an entity name to its replacement text.  Every predefined entity
    # below is written as a decimal character reference rather than as the
    # literal character.
    # NOTE(review): "must use charref" presumably means a literal '<' or
    # '&' in the replacement text would be read as markup again -- confirm
    # against the entity-expansion code, which is not visible here.
    entitydefs = {'lt': '&#60;',        # '<'; must use charref
                  'gt': '&#62;',        # '>'
                  'amp': '&#38;',       # '&'; must use charref
                  'quot': '&#34;',      # '"'
                  'apos': '&#39;',      # "'"
                  }
    776 
    777     # Example -- handle data, should be overridden
    def handle_data(self, data):
        """Receive a piece of character data; ignored by default.

        Subclasses are expected to override this to do something with the
        text.
        """
    780 
    781     # Example -- handle cdata, could be overridden
    def handle_cdata(self, data):
        """Receive the contents of a CDATA section; ignored by default."""
    784 
    785     # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        """Receive the text of a comment; ignored by default."""
    788 
    789     # Example -- handle processing instructions, could be overridden
    def handle_proc(self, name, data):
        """Receive a processing instruction; ignored by default.

        Subclasses may override this to act on the name and data they are
        given.
        """
    792 
    793     # Example -- handle relatively harmless syntax errors, could be overridden
    def syntax_error(self, message):
        """Report a syntax error by raising Error.

        The exception text carries the current line number (self.lineno)
        followed by message.  Subclasses may override this to tolerate
        errors instead of raising.
        """
        text = 'Syntax error at line %d: %s' % (self.lineno, message)
        raise Error(text)
    796 
    797     # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs):
        """Receive a start tag that has no registered handler; ignored."""
    def unknown_endtag(self, tag):
        """Receive an end tag that has no registered handler; ignored."""
    def unknown_charref(self, ref):
        """Receive a character reference that was rejected; ignored."""
    801     def unknown_entityref(self, name):
    802         self.syntax_error("reference to unknown entity `&%s;'" % name)
    803 
    804 
    805 class TestXMLParser(XMLParser):
    806 
    807     def __init__(self, **kw):
    808         self.testdata = ""
    809         XMLParser.__init__(self, **kw)
    810 
    811     def handle_xml(self, encoding, standalone):
    812         self.flush()
    813         print 'xml: encoding =',encoding,'standalone =',standalone
    814 
    815     def handle_doctype(self, tag, pubid, syslit, data):
    816         self.flush()
    817         print 'DOCTYPE:',tag, repr(data)
    818 
    819     def handle_data(self, data):
    820         self.testdata = self.testdata + data
    821         if len(repr(self.testdata)) >= 70:
    822             self.flush()
    823 
    824     def flush(self):
    825         data = self.testdata
    826         if data:
    827             self.testdata = ""
    828             print 'data:', repr(data)
    829 
    830     def handle_cdata(self, data):
    831         self.flush()
    832         print 'cdata:', repr(data)
    833 
    834     def handle_proc(self, name, data):
    835         self.flush()
    836         print 'processing:',name,repr(data)
    837 
    838     def handle_comment(self, data):
    839         self.flush()
    840         r = repr(data)
    841         if len(r) > 68:
    842             r = r[:32] + '...' + r[-32:]
    843         print 'comment:', r
    844 
    845     def syntax_error(self, message):
    846         print 'error at line %d:' % self.lineno, message
    847 
    848     def unknown_starttag(self, tag, attrs):
    849         self.flush()
    850         if not attrs:
    851             print 'start tag: <' + tag + '>'
    852         else:
    853             print 'start tag: <' + tag,
    854             for name, value in attrs.items():
    855                 print name + '=' + '"' + value + '"',
    856             print '>'
    857 
    858     def unknown_endtag(self, tag):
    859         self.flush()
    860         print 'end tag: </' + tag + '>'
    861 
    862     def unknown_entityref(self, ref):
    863         self.flush()
    864         print '*** unknown entity ref: &' + ref + ';'
    865 
    866     def unknown_charref(self, ref):
    867         self.flush()
    868         print '*** unknown char ref: &#' + ref + ';'
    869 
    870     def close(self):
    871         XMLParser.close(self)
    872         self.flush()
    873 
    874 def test(args = None):
    875     import sys, getopt
    876     from time import time
    877 
    878     if not args:
    879         args = sys.argv[1:]
    880 
    881     opts, args = getopt.getopt(args, 'st')
    882     klass = TestXMLParser
    883     do_time = 0
    884     for o, a in opts:
    885         if o == '-s':
    886             klass = XMLParser
    887         elif o == '-t':
    888             do_time = 1
    889 
    890     if args:
    891         file = args[0]
    892     else:
    893         file = 'test.xml'
    894 
    895     if file == '-':
    896         f = sys.stdin
    897     else:
    898         try:
    899             f = open(file, 'r')
    900         except IOError, msg:
    901             print file, ":", msg
    902             sys.exit(1)
    903 
    904     data = f.read()
    905     if f is not sys.stdin:
    906         f.close()
    907 
    908     x = klass()
    909     t0 = time()
    910     try:
    911         if do_time:
    912             x.feed(data)
    913             x.close()
    914         else:
    915             for c in data:
    916                 x.feed(c)
    917             x.close()
    918     except Error, msg:
    919         t1 = time()
    920         print msg
    921         if do_time:
    922             print 'total time: %g' % (t1-t0)
    923         sys.exit(1)
    924     t1 = time()
    925     if do_time:
    926         print 'total time: %g' % (t1-t0)
    927 
    928 
# Run the command-line test driver when this file is executed as a script;
# test() then takes its arguments from sys.argv.
if __name__ == '__main__':
    test()
    931