Home | History | Annotate | Download | only in etree
      1 #

      2 # ElementTree

      3 # $Id: ElementTree.py 3440 2008-07-18 14:45:01Z fredrik $

      4 #

      5 # light-weight XML support for Python 2.3 and later.

      6 #

      7 # history (since 1.2.6):

      8 # 2005-11-12 fl   added tostringlist/fromstringlist helpers

      9 # 2006-07-05 fl   merged in selected changes from the 1.3 sandbox

     10 # 2006-07-05 fl   removed support for 2.1 and earlier

     11 # 2007-06-21 fl   added deprecation/future warnings

     12 # 2007-08-25 fl   added doctype hook, added parser version attribute etc

     13 # 2007-08-26 fl   added new serializer code (better namespace handling, etc)

     14 # 2007-08-27 fl   warn for broken /tag searches on tree level

     15 # 2007-09-02 fl   added html/text methods to serializer (experimental)

     16 # 2007-09-05 fl   added method argument to tostring/tostringlist

     17 # 2007-09-06 fl   improved error handling

     18 # 2007-09-13 fl   added itertext, iterfind; assorted cleanups

     19 # 2007-12-15 fl   added C14N hooks, copy method (experimental)

     20 #

     21 # Copyright (c) 1999-2008 by Fredrik Lundh.  All rights reserved.

     22 #

     23 # fredrik (at] pythonware.com

     24 # http://www.pythonware.com

     25 #

     26 # --------------------------------------------------------------------

     27 # The ElementTree toolkit is

     28 #

     29 # Copyright (c) 1999-2008 by Fredrik Lundh

     30 #

     31 # By obtaining, using, and/or copying this software and/or its

     32 # associated documentation, you agree that you have read, understood,

     33 # and will comply with the following terms and conditions:

     34 #

     35 # Permission to use, copy, modify, and distribute this software and

     36 # its associated documentation for any purpose and without fee is

     37 # hereby granted, provided that the above copyright notice appears in

     38 # all copies, and that both that copyright notice and this permission

     39 # notice appear in supporting documentation, and that the name of

     40 # Secret Labs AB or the author not be used in advertising or publicity

     41 # pertaining to distribution of the software without specific, written

     42 # prior permission.

     43 #

     44 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD

     45 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-

     46 # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR

     47 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY

     48 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,

     49 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS

     50 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE

     51 # OF THIS SOFTWARE.

     52 # --------------------------------------------------------------------

     53 
     54 # Licensed to PSF under a Contributor Agreement.

     55 # See http://www.python.org/psf/license for licensing details.

     56 
     57 __all__ = [
     58     # public symbols

     59     "Comment",
     60     "dump",
     61     "Element", "ElementTree",
     62     "fromstring", "fromstringlist",
     63     "iselement", "iterparse",
     64     "parse", "ParseError",
     65     "PI", "ProcessingInstruction",
     66     "QName",
     67     "SubElement",
     68     "tostring", "tostringlist",
     69     "TreeBuilder",
     70     "VERSION",
     71     "XML",
     72     "XMLParser", "XMLTreeBuilder",
     73     ]
     74 
     75 VERSION = "1.3.0"
     76 
     77 ##

     78 # The <b>Element</b> type is a flexible container object, designed to

     79 # store hierarchical data structures in memory. The type can be

     80 # described as a cross between a list and a dictionary.

     81 # <p>

     82 # Each element has a number of properties associated with it:

     83 # <ul>

     84 # <li>a <i>tag</i>. This is a string identifying what kind of data

     85 # this element represents (the element type, in other words).</li>

     86 # <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>

     87 # <li>a <i>text</i> string.</li>

     88 # <li>an optional <i>tail</i> string.</li>

     89 # <li>a number of <i>child elements</i>, stored in a Python sequence</li>

     90 # </ul>

     91 #

     92 # To create an element instance, use the {@link #Element} constructor

     93 # or the {@link #SubElement} factory function.

     94 # <p>

     95 # The {@link #ElementTree} class can be used to wrap an element

     96 # structure, and convert it from and to XML.

     97 ##

     98 
     99 import sys
    100 import re
    101 import warnings
    102 
    103 
    104 class _SimpleElementPath(object):
    105     # emulate pre-1.2 find/findtext/findall behaviour

    106     def find(self, element, tag, namespaces=None):
    107         for elem in element:
    108             if elem.tag == tag:
    109                 return elem
    110         return None
    111     def findtext(self, element, tag, default=None, namespaces=None):
    112         elem = self.find(element, tag)
    113         if elem is None:
    114             return default
    115         return elem.text or ""
    116     def iterfind(self, element, tag, namespaces=None):
    117         if tag[:3] == ".//":
    118             for elem in element.iter(tag[3:]):
    119                 yield elem
    120         for elem in element:
    121             if elem.tag == tag:
    122                 yield elem
    123     def findall(self, element, tag, namespaces=None):
    124         return list(self.iterfind(element, tag, namespaces))
    125 
    126 try:
    127     from . import ElementPath
    128 except ImportError:
    129     ElementPath = _SimpleElementPath()
    130 
    131 ##

    132 # Parser error.  This is a subclass of <b>SyntaxError</b>.

    133 # <p>

    134 # In addition to the exception value, an exception instance contains a

    135 # specific exception code in the <b>code</b> attribute, and the line and

    136 # column of the error in the <b>position</b> attribute.

    137 
    138 class ParseError(SyntaxError):
    139     pass
    140 
    141 # --------------------------------------------------------------------

    142 
    143 ##

    144 # Checks if an object appears to be a valid element object.

    145 #

    146 # @param element An element instance.

    147 # @return A true value if this is an element object.

    148 # @defreturn flag

    149 
    150 def iselement(element):
    151     # FIXME: not sure about this; might be a better idea to look

    152     # for tag/attrib/text attributes

    153     return isinstance(element, Element) or hasattr(element, "tag")
    154 
    155 ##

    156 # Element class.  This class defines the Element interface, and

    157 # provides a reference implementation of this interface.

    158 # <p>

    159 # The element name, attribute names, and attribute values can be

    160 # either ASCII strings (ordinary Python strings containing only 7-bit

    161 # ASCII characters) or Unicode strings.

    162 #

    163 # @param tag The element name.

    164 # @param attrib An optional dictionary, containing element attributes.

    165 # @param **extra Additional attributes, given as keyword arguments.

    166 # @see Element

    167 # @see SubElement

    168 # @see Comment

    169 # @see ProcessingInstruction

    170 
    171 class Element(object):
    172     # <tag attrib>text<child/>...</tag>tail

    173 
    174     ##

    175     # (Attribute) Element tag.

    176 
    177     tag = None
    178 
    179     ##

    180     # (Attribute) Element attribute dictionary.  Where possible, use

    181     # {@link #Element.get},

    182     # {@link #Element.set},

    183     # {@link #Element.keys}, and

    184     # {@link #Element.items} to access

    185     # element attributes.

    186 
    187     attrib = None
    188 
    189     ##

    190     # (Attribute) Text before first subelement.  This is either a

    191     # string or the value None.  Note that if there was no text, this

    192     # attribute may be either None or an empty string, depending on

    193     # the parser.

    194 
    195     text = None
    196 
    197     ##

    198     # (Attribute) Text after this element's end tag, but before the

    199     # next sibling element's start tag.  This is either a string or

    200     # the value None.  Note that if there was no text, this attribute

    201     # may be either None or an empty string, depending on the parser.

    202 
    203     tail = None # text after end tag, if any

    204 
    205     # constructor

    206 
    207     def __init__(self, tag, attrib={}, **extra):
    208         attrib = attrib.copy()
    209         attrib.update(extra)
    210         self.tag = tag
    211         self.attrib = attrib
    212         self._children = []
    213 
    214     def __repr__(self):
    215         return "<Element %s at 0x%x>" % (repr(self.tag), id(self))
    216 
    217     ##

    218     # Creates a new element object of the same type as this element.

    219     #

    220     # @param tag Element tag.

    221     # @param attrib Element attributes, given as a dictionary.

    222     # @return A new element instance.

    223 
    224     def makeelement(self, tag, attrib):
    225         return self.__class__(tag, attrib)
    226 
    227     ##

    228     # (Experimental) Copies the current element.  This creates a

    229     # shallow copy; subelements will be shared with the original tree.

    230     #

    231     # @return A new element instance.

    232 
    233     def copy(self):
    234         elem = self.makeelement(self.tag, self.attrib)
    235         elem.text = self.text
    236         elem.tail = self.tail
    237         elem[:] = self
    238         return elem
    239 
    240     ##

    241     # Returns the number of subelements.  Note that this only counts

    242     # full elements; to check if there's any content in an element, you

    243     # have to check both the length and the <b>text</b> attribute.

    244     #

    245     # @return The number of subelements.

    246 
    247     def __len__(self):
    248         return len(self._children)
    249 
    250     def __nonzero__(self):
    251         warnings.warn(
    252             "The behavior of this method will change in future versions.  "
    253             "Use specific 'len(elem)' or 'elem is not None' test instead.",
    254             FutureWarning, stacklevel=2
    255             )
    256         return len(self._children) != 0 # emulate old behaviour, for now

    257 
    258     ##

    259     # Returns the given subelement, by index.

    260     #

    261     # @param index What subelement to return.

    262     # @return The given subelement.

    263     # @exception IndexError If the given element does not exist.

    264 
    265     def __getitem__(self, index):
    266         return self._children[index]
    267 
    268     ##

    269     # Replaces the given subelement, by index.

    270     #

    271     # @param index What subelement to replace.

    272     # @param element The new element value.

    273     # @exception IndexError If the given element does not exist.

    274 
    275     def __setitem__(self, index, element):
    276         # if isinstance(index, slice):

    277         #     for elt in element:

    278         #         assert iselement(elt)

    279         # else:

    280         #     assert iselement(element)

    281         self._children[index] = element
    282 
    283     ##

    284     # Deletes the given subelement, by index.

    285     #

    286     # @param index What subelement to delete.

    287     # @exception IndexError If the given element does not exist.

    288 
    289     def __delitem__(self, index):
    290         del self._children[index]
    291 
    292     ##

    293     # Adds a subelement to the end of this element.  In document order,

    294     # the new element will appear after the last existing subelement (or

    295     # directly after the text, if it's the first subelement), but before

    296     # the end tag for this element.

    297     #

    298     # @param element The element to add.

    299 
    300     def append(self, element):
    301         # assert iselement(element)

    302         self._children.append(element)
    303 
    304     ##

    305     # Appends subelements from a sequence.

    306     #

    307     # @param elements A sequence object with zero or more elements.

    308     # @since 1.3

    309 
    310     def extend(self, elements):
    311         # for element in elements:

    312         #     assert iselement(element)

    313         self._children.extend(elements)
    314 
    315     ##

    316     # Inserts a subelement at the given position in this element.

    317     #

    318     # @param index Where to insert the new subelement.

    319 
    320     def insert(self, index, element):
    321         # assert iselement(element)

    322         self._children.insert(index, element)
    323 
    324     ##

    325     # Removes a matching subelement.  Unlike the <b>find</b> methods,

    326     # this method compares elements based on identity, not on tag

    327     # value or contents.  To remove subelements by other means, the

    328     # easiest way is often to use a list comprehension to select what

    329     # elements to keep, and use slice assignment to update the parent

    330     # element.

    331     #

    332     # @param element What element to remove.

    333     # @exception ValueError If a matching element could not be found.

    334 
    335     def remove(self, element):
    336         # assert iselement(element)

    337         self._children.remove(element)
    338 
    339     ##

    340     # (Deprecated) Returns all subelements.  The elements are returned

    341     # in document order.

    342     #

    343     # @return A list of subelements.

    344     # @defreturn list of Element instances

    345 
    346     def getchildren(self):
    347         warnings.warn(
    348             "This method will be removed in future versions.  "
    349             "Use 'list(elem)' or iteration over elem instead.",
    350             DeprecationWarning, stacklevel=2
    351             )
    352         return self._children
    353 
    354     ##

    355     # Finds the first matching subelement, by tag name or path.

    356     #

    357     # @param path What element to look for.

    358     # @keyparam namespaces Optional namespace prefix map.

    359     # @return The first matching element, or None if no element was found.

    360     # @defreturn Element or None

    361 
    362     def find(self, path, namespaces=None):
    363         return ElementPath.find(self, path, namespaces)
    364 
    365     ##

    366     # Finds text for the first matching subelement, by tag name or path.

    367     #

    368     # @param path What element to look for.

    369     # @param default What to return if the element was not found.

    370     # @keyparam namespaces Optional namespace prefix map.

    371     # @return The text content of the first matching element, or the

    372     #     default value no element was found.  Note that if the element

    373     #     is found, but has no text content, this method returns an

    374     #     empty string.

    375     # @defreturn string

    376 
    377     def findtext(self, path, default=None, namespaces=None):
    378         return ElementPath.findtext(self, path, default, namespaces)
    379 
    380     ##

    381     # Finds all matching subelements, by tag name or path.

    382     #

    383     # @param path What element to look for.

    384     # @keyparam namespaces Optional namespace prefix map.

    385     # @return A list or other sequence containing all matching elements,

    386     #    in document order.

    387     # @defreturn list of Element instances

    388 
    389     def findall(self, path, namespaces=None):
    390         return ElementPath.findall(self, path, namespaces)
    391 
    392     ##

    393     # Finds all matching subelements, by tag name or path.

    394     #

    395     # @param path What element to look for.

    396     # @keyparam namespaces Optional namespace prefix map.

    397     # @return An iterator or sequence containing all matching elements,

    398     #    in document order.

    399     # @defreturn a generated sequence of Element instances

    400 
    401     def iterfind(self, path, namespaces=None):
    402         return ElementPath.iterfind(self, path, namespaces)
    403 
    404     ##

    405     # Resets an element.  This function removes all subelements, clears

    406     # all attributes, and sets the <b>text</b> and <b>tail</b> attributes

    407     # to None.

    408 
    409     def clear(self):
    410         self.attrib.clear()
    411         self._children = []
    412         self.text = self.tail = None
    413 
    414     ##

    415     # Gets an element attribute.  Equivalent to <b>attrib.get</b>, but

    416     # some implementations may handle this a bit more efficiently.

    417     #

    418     # @param key What attribute to look for.

    419     # @param default What to return if the attribute was not found.

    420     # @return The attribute value, or the default value, if the

    421     #     attribute was not found.

    422     # @defreturn string or None

    423 
    424     def get(self, key, default=None):
    425         return self.attrib.get(key, default)
    426 
    427     ##

    428     # Sets an element attribute.  Equivalent to <b>attrib[key] = value</b>,

    429     # but some implementations may handle this a bit more efficiently.

    430     #

    431     # @param key What attribute to set.

    432     # @param value The attribute value.

    433 
    434     def set(self, key, value):
    435         self.attrib[key] = value
    436 
    437     ##

    438     # Gets a list of attribute names.  The names are returned in an

    439     # arbitrary order (just like for an ordinary Python dictionary).

    440     # Equivalent to <b>attrib.keys()</b>.

    441     #

    442     # @return A list of element attribute names.

    443     # @defreturn list of strings

    444 
    445     def keys(self):
    446         return self.attrib.keys()
    447 
    448     ##

    449     # Gets element attributes, as a sequence.  The attributes are

    450     # returned in an arbitrary order.  Equivalent to <b>attrib.items()</b>.

    451     #

    452     # @return A list of (name, value) tuples for all attributes.

    453     # @defreturn list of (string, string) tuples

    454 
    455     def items(self):
    456         return self.attrib.items()
    457 
    458     ##

    459     # Creates a tree iterator.  The iterator loops over this element

    460     # and all subelements, in document order, and returns all elements

    461     # with a matching tag.

    462     # <p>

    463     # If the tree structure is modified during iteration, new or removed

    464     # elements may or may not be included.  To get a stable set, use the

    465     # list() function on the iterator, and loop over the resulting list.

    466     #

    467     # @param tag What tags to look for (default is to return all elements).

    468     # @return An iterator containing all the matching elements.

    469     # @defreturn iterator

    470 
    471     def iter(self, tag=None):
    472         if tag == "*":
    473             tag = None
    474         if tag is None or self.tag == tag:
    475             yield self
    476         for e in self._children:
    477             for e in e.iter(tag):
    478                 yield e
    479 
    480     # compatibility

    481     def getiterator(self, tag=None):
    482         # Change for a DeprecationWarning in 1.4

    483         warnings.warn(
    484             "This method will be removed in future versions.  "
    485             "Use 'elem.iter()' or 'list(elem.iter())' instead.",
    486             PendingDeprecationWarning, stacklevel=2
    487         )
    488         return list(self.iter(tag))
    489 
    490     ##

    491     # Creates a text iterator.  The iterator loops over this element

    492     # and all subelements, in document order, and returns all inner

    493     # text.

    494     #

    495     # @return An iterator containing all inner text.

    496     # @defreturn iterator

    497 
    498     def itertext(self):
    499         tag = self.tag
    500         if not isinstance(tag, basestring) and tag is not None:
    501             return
    502         if self.text:
    503             yield self.text
    504         for e in self:
    505             for s in e.itertext():
    506                 yield s
    507             if e.tail:
    508                 yield e.tail
    509 
    510 # compatibility

    511 _Element = _ElementInterface = Element
    512 
    513 ##

    514 # Subelement factory.  This function creates an element instance, and

    515 # appends it to an existing element.

    516 # <p>

    517 # The element name, attribute names, and attribute values can be

    518 # either 8-bit ASCII strings or Unicode strings.

    519 #

    520 # @param parent The parent element.

    521 # @param tag The subelement name.

    522 # @param attrib An optional dictionary, containing element attributes.

    523 # @param **extra Additional attributes, given as keyword arguments.

    524 # @return An element instance.

    525 # @defreturn Element

    526 
    527 def SubElement(parent, tag, attrib={}, **extra):
    528     attrib = attrib.copy()
    529     attrib.update(extra)
    530     element = parent.makeelement(tag, attrib)
    531     parent.append(element)
    532     return element
    533 
    534 ##

    535 # Comment element factory.  This factory function creates a special

    536 # element that will be serialized as an XML comment by the standard

    537 # serializer.

    538 # <p>

    539 # The comment string can be either an 8-bit ASCII string or a Unicode

    540 # string.

    541 #

    542 # @param text A string containing the comment string.

    543 # @return An element instance, representing a comment.

    544 # @defreturn Element

    545 
    546 def Comment(text=None):
    547     element = Element(Comment)
    548     element.text = text
    549     return element
    550 
    551 ##

    552 # PI element factory.  This factory function creates a special element

    553 # that will be serialized as an XML processing instruction by the standard

    554 # serializer.

    555 #

    556 # @param target A string containing the PI target.

    557 # @param text A string containing the PI contents, if any.

    558 # @return An element instance, representing a PI.

    559 # @defreturn Element

    560 
    561 def ProcessingInstruction(target, text=None):
    562     element = Element(ProcessingInstruction)
    563     element.text = target
    564     if text:
    565         element.text = element.text + " " + text
    566     return element
    567 
    568 PI = ProcessingInstruction
    569 
    570 ##

    571 # QName wrapper.  This can be used to wrap a QName attribute value, in

    572 # order to get proper namespace handling on output.

    573 #

    574 # @param text A string containing the QName value, in the form {uri}local,

    575 #     or, if the tag argument is given, the URI part of a QName.

    576 # @param tag Optional tag.  If given, the first argument is interpreted as

    577 #     an URI, and this argument is interpreted as a local name.

    578 # @return An opaque object, representing the QName.

    579 
    580 class QName(object):
    581     def __init__(self, text_or_uri, tag=None):
    582         if tag:
    583             text_or_uri = "{%s}%s" % (text_or_uri, tag)
    584         self.text = text_or_uri
    585     def __str__(self):
    586         return self.text
    587     def __hash__(self):
    588         return hash(self.text)
    589     def __cmp__(self, other):
    590         if isinstance(other, QName):
    591             return cmp(self.text, other.text)
    592         return cmp(self.text, other)
    593 
    594 # --------------------------------------------------------------------

    595 
    596 ##

    597 # ElementTree wrapper class.  This class represents an entire element

    598 # hierarchy, and adds some extra support for serialization to and from

    599 # standard XML.

    600 #

    601 # @param element Optional root element.

    602 # @keyparam file Optional file handle or file name.  If given, the

    603 #     tree is initialized with the contents of this XML file.

    604 
    605 class ElementTree(object):
    606 
    607     def __init__(self, element=None, file=None):
    608         # assert element is None or iselement(element)

    609         self._root = element # first node

    610         if file:
    611             self.parse(file)
    612 
    613     ##

    614     # Gets the root element for this tree.

    615     #

    616     # @return An element instance.

    617     # @defreturn Element

    618 
    619     def getroot(self):
    620         return self._root
    621 
    622     ##

    623     # Replaces the root element for this tree.  This discards the

    624     # current contents of the tree, and replaces it with the given

    625     # element.  Use with care.

    626     #

    627     # @param element An element instance.

    628 
    629     def _setroot(self, element):
    630         # assert iselement(element)

    631         self._root = element
    632 
    633     ##

    634     # Loads an external XML document into this element tree.

    635     #

    636     # @param source A file name or file object.  If a file object is

    637     #     given, it only has to implement a <b>read(n)</b> method.

    638     # @keyparam parser An optional parser instance.  If not given, the

    639     #     standard {@link XMLParser} parser is used.

    640     # @return The document root element.

    641     # @defreturn Element

    642     # @exception ParseError If the parser fails to parse the document.

    643 
    644     def parse(self, source, parser=None):
    645         if not hasattr(source, "read"):
    646             source = open(source, "rb")
    647         if not parser:
    648             parser = XMLParser(target=TreeBuilder())
    649         while 1:
    650             data = source.read(65536)
    651             if not data:
    652                 break
    653             parser.feed(data)
    654         self._root = parser.close()
    655         return self._root
    656 
    657     ##

    658     # Creates a tree iterator for the root element.  The iterator loops

    659     # over all elements in this tree, in document order.

    660     #

    661     # @param tag What tags to look for (default is to return all elements)

    662     # @return An iterator.

    663     # @defreturn iterator

    664 
    665     def iter(self, tag=None):
    666         # assert self._root is not None

    667         return self._root.iter(tag)
    668 
    669     # compatibility

    670     def getiterator(self, tag=None):
    671         # Change for a DeprecationWarning in 1.4

    672         warnings.warn(
    673             "This method will be removed in future versions.  "
    674             "Use 'tree.iter()' or 'list(tree.iter())' instead.",
    675             PendingDeprecationWarning, stacklevel=2
    676         )
    677         return list(self.iter(tag))
    678 
    679     ##

    680     # Finds the first toplevel element with given tag.

    681     # Same as getroot().find(path).

    682     #

    683     # @param path What element to look for.

    684     # @keyparam namespaces Optional namespace prefix map.

    685     # @return The first matching element, or None if no element was found.

    686     # @defreturn Element or None

    687 
    688     def find(self, path, namespaces=None):
    689         # assert self._root is not None

    690         if path[:1] == "/":
    691             path = "." + path
    692             warnings.warn(
    693                 "This search is broken in 1.3 and earlier, and will be "
    694                 "fixed in a future version.  If you rely on the current "
    695                 "behaviour, change it to %r" % path,
    696                 FutureWarning, stacklevel=2
    697                 )
    698         return self._root.find(path, namespaces)
    699 
    700     ##

    701     # Finds the element text for the first toplevel element with given

    702     # tag.  Same as getroot().findtext(path).

    703     #

    704     # @param path What toplevel element to look for.

    705     # @param default What to return if the element was not found.

    706     # @keyparam namespaces Optional namespace prefix map.

    707     # @return The text content of the first matching element, or the

    708     #     default value if no element was found.  Note that if the element

    709     #     is found, but has no text content, this method returns an

    710     #     empty string.

    711     # @defreturn string

    712 
    713     def findtext(self, path, default=None, namespaces=None):
    714         # assert self._root is not None

    715         if path[:1] == "/":
    716             path = "." + path
    717             warnings.warn(
    718                 "This search is broken in 1.3 and earlier, and will be "
    719                 "fixed in a future version.  If you rely on the current "
    720                 "behaviour, change it to %r" % path,
    721                 FutureWarning, stacklevel=2
    722                 )
    723         return self._root.findtext(path, default, namespaces)
    724 
    725     ##

    726     # Finds all toplevel elements with the given tag.

    727     # Same as getroot().findall(path).

    728     #

    729     # @param path What element to look for.

    730     # @keyparam namespaces Optional namespace prefix map.

    731     # @return A list or iterator containing all matching elements,

    732     #    in document order.

    733     # @defreturn list of Element instances

    734 
    735     def findall(self, path, namespaces=None):
    736         # assert self._root is not None

    737         if path[:1] == "/":
    738             path = "." + path
    739             warnings.warn(
    740                 "This search is broken in 1.3 and earlier, and will be "
    741                 "fixed in a future version.  If you rely on the current "
    742                 "behaviour, change it to %r" % path,
    743                 FutureWarning, stacklevel=2
    744                 )
    745         return self._root.findall(path, namespaces)
    746 
    747     ##

    748     # Finds all matching subelements, by tag name or path.

    749     # Same as getroot().iterfind(path).

    750     #

    751     # @param path What element to look for.

    752     # @keyparam namespaces Optional namespace prefix map.

    753     # @return An iterator or sequence containing all matching elements,

    754     #    in document order.

    755     # @defreturn a generated sequence of Element instances

    756 
    def iterfind(self, path, namespaces=None):
        """Return whatever the root element's iterfind() yields for *path*.

        A path that starts with "/" is rewritten to "./..." first, and a
        FutureWarning that shows the rewritten path is issued.
        """
        # assert self._root is not None
        has_leading_slash = path[:1] == "/"
        if has_leading_slash:
            path = "." + path
            message = (
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version.  If you rely on the current "
                "behaviour, change it to %r" % path
                )
            # stacklevel=2 attributes the warning to our caller
            warnings.warn(message, FutureWarning, stacklevel=2)
        return self._root.iterfind(path, namespaces)
    768 
    769     ##

    770     # Writes the element tree to a file, as XML.

    771     #

    772     # @def write(file, **options)

    773     # @param file A file name, or a file object opened for writing.

    774     # @param **options Options, given as keyword arguments.

    775     # @keyparam encoding Optional output encoding (default is US-ASCII).

    776     # @keyparam method Optional output method ("xml", "html", "text" or

    777     #     "c14n"; default is "xml").

    778     # @keyparam xml_declaration Controls if an XML declaration should

    779     #     be added to the file.  Use False for never, True for always,

    780     #     None for only if not US-ASCII or UTF-8.  None is default.

    781 
    def write(self, file_or_filename,
              # keyword arguments
              encoding=None,
              xml_declaration=None,
              default_namespace=None,
              method=None):
        """Serialize the tree to a file.

        file_or_filename -- an object with a write() method, or a file
            name that is opened in binary mode and closed again here
        encoding -- output encoding; defaults to "us-ascii", or to
            "utf-8" for the "c14n" method
        xml_declaration -- True to always emit an XML declaration, False
            to never emit one, None to emit one only when the encoding
            is neither "utf-8" nor "us-ascii"; only the "xml" method
            ever writes a declaration
        default_namespace -- namespace uri to serialize without a prefix
        method -- a key of the _serialize table; defaults to "xml"

        Raises ValueError if *method* is not a known output method.
        """
        # assert self._root is not None
        if not method:
            method = "xml"
        elif method not in _serialize:
            # FIXME: raise an ImportError for c14n if ElementC14N is missing?
            raise ValueError("unknown method %r" % method)
        if not encoding:
            if method == "c14n":
                encoding = "utf-8"
            else:
                encoding = "us-ascii"
        if hasattr(file_or_filename, "write"):
            file = file_or_filename
        else:
            file = open(file_or_filename, "wb")
        try:
            write = file.write
            # the declaration test is made after the encoding default has
            # been filled in, so an explicit xml_declaration=True is
            # honoured even when the caller did not pass an encoding
            if method == "xml" and (
                xml_declaration or (xml_declaration is None and
                                    encoding not in ("utf-8", "us-ascii"))):
                write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            if method == "text":
                _serialize_text(write, self._root, encoding)
            else:
                qnames, namespaces = _namespaces(
                    self._root, encoding, default_namespace
                    )
                serialize = _serialize[method]
                serialize(write, self._root, encoding, qnames, namespaces)
        finally:
            # only close what we opened ourselves, even if serialization
            # failed part-way through
            if file_or_filename is not file:
                file.close()
    818 
    def write_c14n(self, file):
        """Write the tree to *file* using the "c14n" output method."""
        # lxml.etree compatibility.  new code should pass method="c14n"
        # to write() instead
        options = {"method": "c14n"}
        return self.write(file, **options)
    822 
    823 # --------------------------------------------------------------------

    824 # serialization support

    825 
    826 def _namespaces(elem, encoding, default_namespace=None):
    827     # identify namespaces used in this tree

    828 
    829     # maps qnames to *encoded* prefix:local names

    830     qnames = {None: None}
    831 
    832     # maps uri:s to prefixes

    833     namespaces = {}
    834     if default_namespace:
    835         namespaces[default_namespace] = ""
    836 
    837     def encode(text):
    838         return text.encode(encoding)
    839 
    840     def add_qname(qname):
    841         # calculate serialized qname representation

    842         try:
    843             if qname[:1] == "{":
    844                 uri, tag = qname[1:].rsplit("}", 1)
    845                 prefix = namespaces.get(uri)
    846                 if prefix is None:
    847                     prefix = _namespace_map.get(uri)
    848                     if prefix is None:
    849                         prefix = "ns%d" % len(namespaces)
    850                     if prefix != "xml":
    851                         namespaces[uri] = prefix
    852                 if prefix:
    853                     qnames[qname] = encode("%s:%s" % (prefix, tag))
    854                 else:
    855                     qnames[qname] = encode(tag) # default element

    856             else:
    857                 if default_namespace:
    858                     # FIXME: can this be handled in XML 1.0?

    859                     raise ValueError(
    860                         "cannot use non-qualified names with "
    861                         "default_namespace option"
    862                         )
    863                 qnames[qname] = encode(qname)
    864         except TypeError:
    865             _raise_serialization_error(qname)
    866 
    867     # populate qname and namespaces table

    868     try:
    869         iterate = elem.iter
    870     except AttributeError:
    871         iterate = elem.getiterator # cET compatibility

    872     for elem in iterate():
    873         tag = elem.tag
    874         if isinstance(tag, QName):
    875             if tag.text not in qnames:
    876                 add_qname(tag.text)
    877         elif isinstance(tag, basestring):
    878             if tag not in qnames:
    879                 add_qname(tag)
    880         elif tag is not None and tag is not Comment and tag is not PI:
    881             _raise_serialization_error(tag)
    882         for key, value in elem.items():
    883             if isinstance(key, QName):
    884                 key = key.text
    885             if key not in qnames:
    886                 add_qname(key)
    887             if isinstance(value, QName) and value.text not in qnames:
    888                 add_qname(value.text)
    889         text = elem.text
    890         if isinstance(text, QName) and text.text not in qnames:
    891             add_qname(text.text)
    892     return qnames, namespaces
    893 
    894 def _serialize_xml(write, elem, encoding, qnames, namespaces):
    895     tag = elem.tag
    896     text = elem.text
    897     if tag is Comment:
    898         write("<!--%s-->" % _encode(text, encoding))
    899     elif tag is ProcessingInstruction:
    900         write("<?%s?>" % _encode(text, encoding))
    901     else:
    902         tag = qnames[tag]
    903         if tag is None:
    904             if text:
    905                 write(_escape_cdata(text, encoding))
    906             for e in elem:
    907                 _serialize_xml(write, e, encoding, qnames, None)
    908         else:
    909             write("<" + tag)
    910             items = elem.items()
    911             if items or namespaces:
    912                 if namespaces:
    913                     for v, k in sorted(namespaces.items(),
    914                                        key=lambda x: x[1]):  # sort on prefix

    915                         if k:
    916                             k = ":" + k
    917                         write(" xmlns%s=\"%s\"" % (
    918                             k.encode(encoding),
    919                             _escape_attrib(v, encoding)
    920                             ))
    921                 for k, v in sorted(items):  # lexical order

    922                     if isinstance(k, QName):
    923                         k = k.text
    924                     if isinstance(v, QName):
    925                         v = qnames[v.text]
    926                     else:
    927                         v = _escape_attrib(v, encoding)
    928                     write(" %s=\"%s\"" % (qnames[k], v))
    929             if text or len(elem):
    930                 write(">")
    931                 if text:
    932                     write(_escape_cdata(text, encoding))
    933                 for e in elem:
    934                     _serialize_xml(write, e, encoding, qnames, None)
    935                 write("</" + tag + ">")
    936             else:
    937                 write(" />")
    938     if elem.tail:
    939         write(_escape_cdata(elem.tail, encoding))
    940 
    941 HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
    942               "img", "input", "isindex", "link", "meta" "param")
    943 
    944 try:
    945     HTML_EMPTY = set(HTML_EMPTY)
    946 except NameError:
    947     pass
    948 
    949 def _serialize_html(write, elem, encoding, qnames, namespaces):
    950     tag = elem.tag
    951     text = elem.text
    952     if tag is Comment:
    953         write("<!--%s-->" % _escape_cdata(text, encoding))
    954     elif tag is ProcessingInstruction:
    955         write("<?%s?>" % _escape_cdata(text, encoding))
    956     else:
    957         tag = qnames[tag]
    958         if tag is None:
    959             if text:
    960                 write(_escape_cdata(text, encoding))
    961             for e in elem:
    962                 _serialize_html(write, e, encoding, qnames, None)
    963         else:
    964             write("<" + tag)
    965             items = elem.items()
    966             if items or namespaces:
    967                 if namespaces:
    968                     for v, k in sorted(namespaces.items(),
    969                                        key=lambda x: x[1]):  # sort on prefix

    970                         if k:
    971                             k = ":" + k
    972                         write(" xmlns%s=\"%s\"" % (
    973                             k.encode(encoding),
    974                             _escape_attrib(v, encoding)
    975                             ))
    976                 for k, v in sorted(items):  # lexical order

    977                     if isinstance(k, QName):
    978                         k = k.text
    979                     if isinstance(v, QName):
    980                         v = qnames[v.text]
    981                     else:
    982                         v = _escape_attrib_html(v, encoding)
    983                     # FIXME: handle boolean attributes

    984                     write(" %s=\"%s\"" % (qnames[k], v))
    985             write(">")
    986             tag = tag.lower()
    987             if text:
    988                 if tag == "script" or tag == "style":
    989                     write(_encode(text, encoding))
    990                 else:
    991                     write(_escape_cdata(text, encoding))
    992             for e in elem:
    993                 _serialize_html(write, e, encoding, qnames, None)
    994             if tag not in HTML_EMPTY:
    995                 write("</" + tag + ">")
    996     if elem.tail:
    997         write(_escape_cdata(elem.tail, encoding))
    998 
    999 def _serialize_text(write, elem, encoding):
   1000     for part in elem.itertext():
   1001         write(part.encode(encoding))
   1002     if elem.tail:
   1003         write(elem.tail.encode(encoding))
   1004 
# Dispatch table used by ElementTree.write(): maps each supported
# output method name to the function that serializes it.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}
   1012 
   1013 ##

   1014 # Registers a namespace prefix.  The registry is global, and any

   1015 # existing mapping for either the given prefix or the namespace URI

   1016 # will be removed.

   1017 #

   1018 # @param prefix Namespace prefix.

   1019 # @param uri Namespace uri.  Tags and attributes in this namespace

   1020 #     will be serialized with the given prefix, if at all possible.

   1021 # @exception ValueError If the prefix is reserved, or is otherwise

   1022 #     invalid.

   1023 
   1024 def register_namespace(prefix, uri):
   1025     if re.match("ns\d+$", prefix):
   1026         raise ValueError("Prefix format reserved for internal use")
   1027     for k, v in _namespace_map.items():
   1028         if k == uri or v == prefix:
   1029             del _namespace_map[k]
   1030     _namespace_map[uri] = prefix
   1031 
   1032 _namespace_map = {
   1033     # "well-known" namespace prefixes

   1034     "http://www.w3.org/XML/1998/namespace": "xml",
   1035     "http://www.w3.org/1999/xhtml": "html",
   1036     "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
   1037     "http://schemas.xmlsoap.org/wsdl/": "wsdl",
   1038     # xml schema

   1039     "http://www.w3.org/2001/XMLSchema": "xs",
   1040     "http://www.w3.org/2001/XMLSchema-instance": "xsi",
   1041     # dublin core

   1042     "http://purl.org/dc/elements/1.1/": "dc",
   1043 }
   1044 
   1045 def _raise_serialization_error(text):
   1046     raise TypeError(
   1047         "cannot serialize %r (type %s)" % (text, type(text).__name__)
   1048         )
   1049 
   1050 def _encode(text, encoding):
   1051     try:
   1052         return text.encode(encoding, "xmlcharrefreplace")
   1053     except (TypeError, AttributeError):
   1054         _raise_serialization_error(text)
   1055 
   1056 def _escape_cdata(text, encoding):
   1057     # escape character data

   1058     try:
   1059         # it's worth avoiding do-nothing calls for strings that are

   1060         # shorter than 500 character, or so.  assume that's, by far,

   1061         # the most common case in most applications.

   1062         if "&" in text:
   1063             text = text.replace("&", "&amp;")
   1064         if "<" in text:
   1065             text = text.replace("<", "&lt;")
   1066         if ">" in text:
   1067             text = text.replace(">", "&gt;")
   1068         return text.encode(encoding, "xmlcharrefreplace")
   1069     except (TypeError, AttributeError):
   1070         _raise_serialization_error(text)
   1071 
   1072 def _escape_attrib(text, encoding):
   1073     # escape attribute value

   1074     try:
   1075         if "&" in text:
   1076             text = text.replace("&", "&amp;")
   1077         if "<" in text:
   1078             text = text.replace("<", "&lt;")
   1079         if ">" in text:
   1080             text = text.replace(">", "&gt;")
   1081         if "\"" in text:
   1082             text = text.replace("\"", "&quot;")
   1083         if "\n" in text:
   1084             text = text.replace("\n", "&#10;")
   1085         return text.encode(encoding, "xmlcharrefreplace")
   1086     except (TypeError, AttributeError):
   1087         _raise_serialization_error(text)
   1088 
   1089 def _escape_attrib_html(text, encoding):
   1090     # escape attribute value

   1091     try:
   1092         if "&" in text:
   1093             text = text.replace("&", "&amp;")
   1094         if ">" in text:
   1095             text = text.replace(">", "&gt;")
   1096         if "\"" in text:
   1097             text = text.replace("\"", "&quot;")
   1098         return text.encode(encoding, "xmlcharrefreplace")
   1099     except (TypeError, AttributeError):
   1100         _raise_serialization_error(text)
   1101 
   1102 # --------------------------------------------------------------------

   1103 
   1104 ##

   1105 # Generates a string representation of an XML element, including all

   1106 # subelements.

   1107 #

   1108 # @param element An Element instance.

   1109 # @keyparam encoding Optional output encoding (default is US-ASCII).

   1110 # @keyparam method Optional output method ("xml", "html", "text" or

   1111 #     "c14n"; default is "xml").

   1112 # @return An encoded string containing the XML data.

   1113 # @defreturn string

   1114 
   1115 def tostring(element, encoding=None, method=None):
   1116     class dummy:
   1117         pass
   1118     data = []
   1119     file = dummy()
   1120     file.write = data.append
   1121     ElementTree(element).write(file, encoding, method=method)
   1122     return "".join(data)
   1123 
   1124 ##

   1125 # Generates a string representation of an XML element, including all

   1126 # subelements.  The string is returned as a sequence of string fragments.

   1127 #

   1128 # @param element An Element instance.

   1129 # @keyparam encoding Optional output encoding (default is US-ASCII).

   1130 # @keyparam method Optional output method ("xml", "html", "text" or

   1131 #     "c14n"; default is "xml").

   1132 # @return A sequence object containing the XML data.

   1133 # @defreturn sequence

   1134 # @since 1.3

   1135 
   1136 def tostringlist(element, encoding=None, method=None):
   1137     class dummy:
   1138         pass
   1139     data = []
   1140     file = dummy()
   1141     file.write = data.append
   1142     ElementTree(element).write(file, encoding, method=method)
   1143     # FIXME: merge small fragments into larger parts

   1144     return data
   1145 
   1146 ##

   1147 # Writes an element tree or element structure to sys.stdout.  This

   1148 # function should be used for debugging only.

   1149 # <p>

   1150 # The exact output format is implementation dependent.  In this

   1151 # version, it's written as an ordinary XML file.

   1152 #

   1153 # @param elem An element tree or an individual element.

   1154 
   1155 def dump(elem):
   1156     # debugging

   1157     if not isinstance(elem, ElementTree):
   1158         elem = ElementTree(elem)
   1159     elem.write(sys.stdout)
   1160     tail = elem.getroot().tail
   1161     if not tail or tail[-1] != "\n":
   1162         sys.stdout.write("\n")
   1163 
   1164 # --------------------------------------------------------------------

   1165 # parsing

   1166 
   1167 ##

   1168 # Parses an XML document into an element tree.

   1169 #

   1170 # @param source A filename or file object containing XML data.

   1171 # @param parser An optional parser instance.  If not given, the

   1172 #     standard {@link XMLParser} parser is used.

   1173 # @return An ElementTree instance

   1174 
   1175 def parse(source, parser=None):
   1176     tree = ElementTree()
   1177     tree.parse(source, parser)
   1178     return tree
   1179 
   1180 ##

   1181 # Parses an XML document into an element tree incrementally, and reports

   1182 # what's going on to the user.

   1183 #

   1184 # @param source A filename or file object containing XML data.

   1185 # @param events A list of events to report back.  If omitted, only "end"

   1186 #     events are reported.

   1187 # @param parser An optional parser instance.  If not given, the

   1188 #     standard {@link XMLParser} parser is used.

   1189 # @return A (event, elem) iterator.

   1190 
   1191 def iterparse(source, events=None, parser=None):
   1192     if not hasattr(source, "read"):
   1193         source = open(source, "rb")
   1194     if not parser:
   1195         parser = XMLParser(target=TreeBuilder())
   1196     return _IterParseIterator(source, events, parser)
   1197 
   1198 class _IterParseIterator(object):
   1199 
   1200     def __init__(self, source, events, parser):
   1201         self._file = source
   1202         self._events = []
   1203         self._index = 0
   1204         self.root = self._root = None
   1205         self._parser = parser
   1206         # wire up the parser for event reporting

   1207         parser = self._parser._parser
   1208         append = self._events.append
   1209         if events is None:
   1210             events = ["end"]
   1211         for event in events:
   1212             if event == "start":
   1213                 try:
   1214                     parser.ordered_attributes = 1
   1215                     parser.specified_attributes = 1
   1216                     def handler(tag, attrib_in, event=event, append=append,
   1217                                 start=self._parser._start_list):
   1218                         append((event, start(tag, attrib_in)))
   1219                     parser.StartElementHandler = handler
   1220                 except AttributeError:
   1221                     def handler(tag, attrib_in, event=event, append=append,
   1222                                 start=self._parser._start):
   1223                         append((event, start(tag, attrib_in)))
   1224                     parser.StartElementHandler = handler
   1225             elif event == "end":
   1226                 def handler(tag, event=event, append=append,
   1227                             end=self._parser._end):
   1228                     append((event, end(tag)))
   1229                 parser.EndElementHandler = handler
   1230             elif event == "start-ns":
   1231                 def handler(prefix, uri, event=event, append=append):
   1232                     try:
   1233                         uri = (uri or "").encode("ascii")
   1234                     except UnicodeError:
   1235                         pass
   1236                     append((event, (prefix or "", uri or "")))
   1237                 parser.StartNamespaceDeclHandler = handler
   1238             elif event == "end-ns":
   1239                 def handler(prefix, event=event, append=append):
   1240                     append((event, None))
   1241                 parser.EndNamespaceDeclHandler = handler
   1242             else:
   1243                 raise ValueError("unknown event %r" % event)
   1244 
   1245     def next(self):
   1246         while 1:
   1247             try:
   1248                 item = self._events[self._index]
   1249             except IndexError:
   1250                 if self._parser is None:
   1251                     self.root = self._root
   1252                     raise StopIteration
   1253                 # load event buffer

   1254                 del self._events[:]
   1255                 self._index = 0
   1256                 data = self._file.read(16384)
   1257                 if data:
   1258                     self._parser.feed(data)
   1259                 else:
   1260                     self._root = self._parser.close()
   1261                     self._parser = None
   1262             else:
   1263                 self._index = self._index + 1
   1264                 return item
   1265 
   1266     def __iter__(self):
   1267         return self
   1268 
   1269 ##

   1270 # Parses an XML document from a string constant.  This function can

   1271 # be used to embed "XML literals" in Python code.

   1272 #

   1273 # @param text A string containing XML data.

   1274 # @param parser An optional parser instance.  If not given, the

   1275 #     standard {@link XMLParser} parser is used.

   1276 # @return An Element instance.

   1277 # @defreturn Element

   1278 
   1279 def XML(text, parser=None):
   1280     if not parser:
   1281         parser = XMLParser(target=TreeBuilder())
   1282     parser.feed(text)
   1283     return parser.close()
   1284 
   1285 ##

   1286 # Parses an XML document from a string constant, and also returns

   1287 # a dictionary which maps from element id:s to elements.

   1288 #

   1289 # @param text A string containing XML data.

   1290 # @param parser An optional parser instance.  If not given, the

   1291 #     standard {@link XMLParser} parser is used.

   1292 # @return A tuple containing an Element instance and a dictionary.

   1293 # @defreturn (Element, dictionary)

   1294 
   1295 def XMLID(text, parser=None):
   1296     if not parser:
   1297         parser = XMLParser(target=TreeBuilder())
   1298     parser.feed(text)
   1299     tree = parser.close()
   1300     ids = {}
   1301     for elem in tree.iter():
   1302         id = elem.get("id")
   1303         if id:
   1304             ids[id] = elem
   1305     return tree, ids
   1306 
   1307 ##

   1308 # Parses an XML document from a string constant.  Same as {@link #XML}.

   1309 #

   1310 # @def fromstring(text)

   1311 # @param text A string containing XML data.

   1312 # @return An Element instance.

   1313 # @defreturn Element

   1314 
# alias: fromstring is the very same function object as XML
fromstring = XML
   1316 
   1317 ##

   1318 # Parses an XML document from a sequence of string fragments.

   1319 #

   1320 # @param sequence A list or other sequence containing XML data fragments.

   1321 # @param parser An optional parser instance.  If not given, the

   1322 #     standard {@link XMLParser} parser is used.

   1323 # @return An Element instance.

   1324 # @defreturn Element

   1325 # @since 1.3

   1326 
   1327 def fromstringlist(sequence, parser=None):
   1328     if not parser:
   1329         parser = XMLParser(target=TreeBuilder())
   1330     for text in sequence:
   1331         parser.feed(text)
   1332     return parser.close()
   1333 
   1334 # --------------------------------------------------------------------

   1335 
   1336 ##

   1337 # Generic element structure builder.  This builder converts a sequence

   1338 # of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link

   1339 # #TreeBuilder.end} method calls to a well-formed element structure.

   1340 # <p>

   1341 # You can use this class to build an element structure using a custom XML

   1342 # parser, or a parser for some other XML-like format.

   1343 #

   1344 # @param element_factory Optional element factory.  This factory

   1345 #    is called to create new Element instances, as necessary.

   1346 
   1347 class TreeBuilder(object):
   1348 
   1349     def __init__(self, element_factory=None):
   1350         self._data = [] # data collector

   1351         self._elem = [] # element stack

   1352         self._last = None # last element

   1353         self._tail = None # true if we're after an end tag

   1354         if element_factory is None:
   1355             element_factory = Element
   1356         self._factory = element_factory
   1357 
   1358     ##

   1359     # Flushes the builder buffers, and returns the toplevel document

   1360     # element.

   1361     #

   1362     # @return An Element instance.

   1363     # @defreturn Element

   1364 
   1365     def close(self):
   1366         assert len(self._elem) == 0, "missing end tags"
   1367         assert self._last is not None, "missing toplevel element"
   1368         return self._last
   1369 
   1370     def _flush(self):
   1371         if self._data:
   1372             if self._last is not None:
   1373                 text = "".join(self._data)
   1374                 if self._tail:
   1375                     assert self._last.tail is None, "internal error (tail)"
   1376                     self._last.tail = text
   1377                 else:
   1378                     assert self._last.text is None, "internal error (text)"
   1379                     self._last.text = text
   1380             self._data = []
   1381 
   1382     ##

   1383     # Adds text to the current element.

   1384     #

   1385     # @param data A string.  This should be either an 8-bit string

   1386     #    containing ASCII text, or a Unicode string.

   1387 
   1388     def data(self, data):
   1389         self._data.append(data)
   1390 
   1391     ##

   1392     # Opens a new element.

   1393     #

   1394     # @param tag The element name.

   1395     # @param attrib A dictionary containing element attributes.

   1396     # @return The opened element.

   1397     # @defreturn Element

   1398 
   1399     def start(self, tag, attrs):
   1400         self._flush()
   1401         self._last = elem = self._factory(tag, attrs)
   1402         if self._elem:
   1403             self._elem[-1].append(elem)
   1404         self._elem.append(elem)
   1405         self._tail = 0
   1406         return elem
   1407 
   1408     ##

   1409     # Closes the current element.

   1410     #

   1411     # @param tag The element name.

   1412     # @return The closed element.

   1413     # @defreturn Element

   1414 
   1415     def end(self, tag):
   1416         self._flush()
   1417         self._last = self._elem.pop()
   1418         assert self._last.tag == tag,\
   1419                "end tag mismatch (expected %s, got %s)" % (
   1420                    self._last.tag, tag)
   1421         self._tail = 1
   1422         return self._last
   1423 
   1424 ##

   1425 # Element structure builder for XML source data, based on the

   1426 # <b>expat</b> parser.

   1427 #

   1428 # @keyparam target Target object.  If omitted, the builder uses an

   1429 #     instance of the standard {@link #TreeBuilder} class.

   1430 # @keyparam html Predefine HTML entities.  This flag is not supported

   1431 #     by the current implementation.

   1432 # @keyparam encoding Optional encoding.  If given, the value overrides

   1433 #     the encoding specified in the XML file.

   1434 # @see #ElementTree

   1435 # @see #TreeBuilder

   1436 
   1437 class XMLParser(object):
   1438 
   1439     def __init__(self, html=0, target=None, encoding=None):
   1440         try:
   1441             from xml.parsers import expat
   1442         except ImportError:
   1443             try:
   1444                 import pyexpat as expat
   1445             except ImportError:
   1446                 raise ImportError(
   1447                     "No module named expat; use SimpleXMLTreeBuilder instead"
   1448                     )
   1449         parser = expat.ParserCreate(encoding, "}")
   1450         if target is None:
   1451             target = TreeBuilder()
   1452         # underscored names are provided for compatibility only

   1453         self.parser = self._parser = parser
   1454         self.target = self._target = target
   1455         self._error = expat.error
   1456         self._names = {} # name memo cache

   1457         # callbacks

   1458         parser.DefaultHandlerExpand = self._default
   1459         parser.StartElementHandler = self._start
   1460         parser.EndElementHandler = self._end
   1461         parser.CharacterDataHandler = self._data
   1462         # optional callbacks

   1463         parser.CommentHandler = self._comment
   1464         parser.ProcessingInstructionHandler = self._pi
   1465         # let expat do the buffering, if supported

   1466         try:
   1467             self._parser.buffer_text = 1
   1468         except AttributeError:
   1469             pass
   1470         # use new-style attribute handling, if supported

   1471         try:
   1472             self._parser.ordered_attributes = 1
   1473             self._parser.specified_attributes = 1
   1474             parser.StartElementHandler = self._start_list
   1475         except AttributeError:
   1476             pass
   1477         self._doctype = None
   1478         self.entity = {}
   1479         try:
   1480             self.version = "Expat %d.%d.%d" % expat.version_info
   1481         except AttributeError:
   1482             pass # unknown

   1483 
   1484     def _raiseerror(self, value):
   1485         err = ParseError(value)
   1486         err.code = value.code
   1487         err.position = value.lineno, value.offset
   1488         raise err
   1489 
   1490     def _fixtext(self, text):
   1491         # convert text string to ascii, if possible

   1492         try:
   1493             return text.encode("ascii")
   1494         except UnicodeError:
   1495             return text
   1496 
   1497     def _fixname(self, key):
   1498         # expand qname, and convert name string to ascii, if possible

   1499         try:
   1500             name = self._names[key]
   1501         except KeyError:
   1502             name = key
   1503             if "}" in name:
   1504                 name = "{" + name
   1505             self._names[key] = name = self._fixtext(name)
   1506         return name
   1507 
   1508     def _start(self, tag, attrib_in):
   1509         fixname = self._fixname
   1510         fixtext = self._fixtext
   1511         tag = fixname(tag)
   1512         attrib = {}
   1513         for key, value in attrib_in.items():
   1514             attrib[fixname(key)] = fixtext(value)
   1515         return self.target.start(tag, attrib)
   1516 
   1517     def _start_list(self, tag, attrib_in):
   1518         fixname = self._fixname
   1519         fixtext = self._fixtext
   1520         tag = fixname(tag)
   1521         attrib = {}
   1522         if attrib_in:
   1523             for i in range(0, len(attrib_in), 2):
   1524                 attrib[fixname(attrib_in[i])] = fixtext(attrib_in[i+1])
   1525         return self.target.start(tag, attrib)
   1526 
   1527     def _data(self, text):
   1528         return self.target.data(self._fixtext(text))
   1529 
   1530     def _end(self, tag):
   1531         return self.target.end(self._fixname(tag))
   1532 
   1533     def _comment(self, data):
   1534         try:
   1535             comment = self.target.comment
   1536         except AttributeError:
   1537             pass
   1538         else:
   1539             return comment(self._fixtext(data))
   1540 
   1541     def _pi(self, target, data):
   1542         try:
   1543             pi = self.target.pi
   1544         except AttributeError:
   1545             pass
   1546         else:
   1547             return pi(self._fixtext(target), self._fixtext(data))
   1548 
   1549     def _default(self, text):
   1550         prefix = text[:1]
   1551         if prefix == "&":
   1552             # deal with undefined entities

   1553             try:
   1554                 self.target.data(self.entity[text[1:-1]])
   1555             except KeyError:
   1556                 from xml.parsers import expat
   1557                 err = expat.error(
   1558                     "undefined entity %s: line %d, column %d" %
   1559                     (text, self._parser.ErrorLineNumber,
   1560                     self._parser.ErrorColumnNumber)
   1561                     )
   1562                 err.code = 11 # XML_ERROR_UNDEFINED_ENTITY

   1563                 err.lineno = self._parser.ErrorLineNumber
   1564                 err.offset = self._parser.ErrorColumnNumber
   1565                 raise err
   1566         elif prefix == "<" and text[:9] == "<!DOCTYPE":
   1567             self._doctype = [] # inside a doctype declaration

   1568         elif self._doctype is not None:
   1569             # parse doctype contents

   1570             if prefix == ">":
   1571                 self._doctype = None
   1572                 return
   1573             text = text.strip()
   1574             if not text:
   1575                 return
   1576             self._doctype.append(text)
   1577             n = len(self._doctype)
   1578             if n > 2:
   1579                 type = self._doctype[1]
   1580                 if type == "PUBLIC" and n == 4:
   1581                     name, type, pubid, system = self._doctype
   1582                 elif type == "SYSTEM" and n == 3:
   1583                     name, type, system = self._doctype
   1584                     pubid = None
   1585                 else:
   1586                     return
   1587                 if pubid:
   1588                     pubid = pubid[1:-1]
   1589                 if hasattr(self.target, "doctype"):
   1590                     self.target.doctype(name, pubid, system[1:-1])
   1591                 elif self.doctype is not self._XMLParser__doctype:
   1592                     # warn about deprecated call

   1593                     self._XMLParser__doctype(name, pubid, system[1:-1])
   1594                     self.doctype(name, pubid, system[1:-1])
   1595                 self._doctype = None
   1596 
   1597     ##

   1598     # (Deprecated) Handles a doctype declaration.

   1599     #

   1600     # @param name Doctype name.

   1601     # @param pubid Public identifier.

   1602     # @param system System identifier.

   1603 
   1604     def doctype(self, name, pubid, system):
   1605         """This method of XMLParser is deprecated."""
   1606         warnings.warn(
   1607             "This method of XMLParser is deprecated.  Define doctype() "
   1608             "method on the TreeBuilder target.",
   1609             DeprecationWarning,
   1610             )
   1611 
   1612     # sentinel, if doctype is redefined in a subclass

   1613     __doctype = doctype
   1614 
   1615     ##

   1616     # Feeds data to the parser.

   1617     #

   1618     # @param data Encoded data.

   1619 
   1620     def feed(self, data):
   1621         try:
   1622             self._parser.Parse(data, 0)
   1623         except self._error, v:
   1624             self._raiseerror(v)
   1625 
   1626     ##

   1627     # Finishes feeding data to the parser.

   1628     #

   1629     # @return An element structure.

   1630     # @defreturn Element

   1631 
   1632     def close(self):
   1633         try:
   1634             self._parser.Parse("", 1) # end of data

   1635         except self._error, v:
   1636             self._raiseerror(v)
   1637         tree = self.target.close()
   1638         del self.target, self._parser # get rid of circular references

   1639         return tree
   1640 
   1641 # compatibility

   1642 XMLTreeBuilder = XMLParser
   1643 
   1644 # workaround circular import.

   1645 try:
   1646     from ElementC14N import _serialize_c14n
   1647     _serialize["c14n"] = _serialize_c14n
   1648 except ImportError:
   1649     pass
   1650