1 # 2 # ElementTree 3 # $Id: ElementTree.py 3440 2008-07-18 14:45:01Z fredrik $ 4 # 5 # light-weight XML support for Python 2.3 and later. 6 # 7 # history (since 1.2.6): 8 # 2005-11-12 fl added tostringlist/fromstringlist helpers 9 # 2006-07-05 fl merged in selected changes from the 1.3 sandbox 10 # 2006-07-05 fl removed support for 2.1 and earlier 11 # 2007-06-21 fl added deprecation/future warnings 12 # 2007-08-25 fl added doctype hook, added parser version attribute etc 13 # 2007-08-26 fl added new serializer code (better namespace handling, etc) 14 # 2007-08-27 fl warn for broken /tag searches on tree level 15 # 2007-09-02 fl added html/text methods to serializer (experimental) 16 # 2007-09-05 fl added method argument to tostring/tostringlist 17 # 2007-09-06 fl improved error handling 18 # 2007-09-13 fl added itertext, iterfind; assorted cleanups 19 # 2007-12-15 fl added C14N hooks, copy method (experimental) 20 # 21 # Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved. 22 # 23 # fredrik (at] pythonware.com 24 # http://www.pythonware.com 25 # 26 # -------------------------------------------------------------------- 27 # The ElementTree toolkit is 28 # 29 # Copyright (c) 1999-2008 by Fredrik Lundh 30 # 31 # By obtaining, using, and/or copying this software and/or its 32 # associated documentation, you agree that you have read, understood, 33 # and will comply with the following terms and conditions: 34 # 35 # Permission to use, copy, modify, and distribute this software and 36 # its associated documentation for any purpose and without fee is 37 # hereby granted, provided that the above copyright notice appears in 38 # all copies, and that both that copyright notice and this permission 39 # notice appear in supporting documentation, and that the name of 40 # Secret Labs AB or the author not be used in advertising or publicity 41 # pertaining to distribution of the software without specific, written 42 # prior permission. 
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------

# Licensed to PSF under a Contributor Agreement.
# See http://www.python.org/psf/license for licensing details.

# Names exported by "from ... import *".  register_namespace is listed
# because it is a public, documented function of this module.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName",
    "register_namespace",
    "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLParser", "XMLTreeBuilder",
    ]

VERSION = "1.3.0"

##
# The <b>Element</b> type is a flexible container object, designed to
# store hierarchical data structures in memory. The type can be
# described as a cross between a list and a dictionary.
# <p>
# Each element has a number of properties associated with it:
# <ul>
# <li>a <i>tag</i>. This is a string identifying what kind of data
# this element represents (the element type, in other words).</li>
# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
# <li>a <i>text</i> string.</li>
# <li>an optional <i>tail</i> string.</li>
# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
# </ul>
#
# To create an element instance, use the {@link #Element} constructor
# or the {@link #SubElement} factory function.
# <p>
# The {@link #ElementTree} class can be used to wrap an element
# structure, and convert it from and to XML.
##

import sys
import re
import warnings


##
# Minimal stand-in for the ElementPath module.  It only understands
# plain tag names (plus a leading ".//" in iterfind/findall) and is
# used when the real ElementPath module cannot be imported.

class _SimpleElementPath(object):
    # emulate pre-1.2 find/findtext/findall behaviour

    # Returns the first direct child whose tag equals the given tag,
    # or None.  The namespaces argument is accepted but not used.
    def find(self, element, tag, namespaces=None):
        for elem in element:
            if elem.tag == tag:
                return elem
        return None

    # Returns the text of the first matching direct child, the default
    # value if there is no such child, or "" if the child has no text.
    def findtext(self, element, tag, default=None, namespaces=None):
        elem = self.find(element, tag, namespaces)
        if elem is None:
            return default
        return elem.text or ""

    # Generates matching elements.  A tag starting with ".//" is first
    # looked up through element.iter(); after that, direct children
    # are compared against the tag exactly as given.
    def iterfind(self, element, tag, namespaces=None):
        if tag[:3] == ".//":
            for elem in element.iter(tag[3:]):
                yield elem
        for elem in element:
            if elem.tag == tag:
                yield elem

    # Same as iterfind, but returns a list.
    def findall(self, element, tag, namespaces=None):
        return list(self.iterfind(element, tag, namespaces))

try:
    from . import ElementPath
except (ImportError, ValueError):
    # ImportError: the ElementPath module is missing.  ValueError is
    # what Python 2 raises for a relative import when this file is
    # loaded outside a package; fall back in that case as well.
    ElementPath = _SimpleElementPath()

##
# Parser error.  This is a subclass of <b>SyntaxError</b>.
# <p>
# In addition to the exception value, an exception instance contains a
# specific exception code in the <b>code</b> attribute, and the line and
# column of the error in the <b>position</b> attribute.

class ParseError(SyntaxError):
    pass

# --------------------------------------------------------------------

##
# Checks if an object appears to be a valid element object.
#
# @param element An element instance.
# @return A true value if this is an element object.
# @defreturn flag

def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    return isinstance(element, Element) or hasattr(element, "tag")

##
# Element class.  This class defines the Element interface, and
# provides a reference implementation of this interface.
# <p>
# The element name, attribute names, and attribute values can be
# either ASCII strings (ordinary Python strings containing only 7-bit
# ASCII characters) or Unicode strings.
#
# @param tag The element name.
# @param attrib An optional dictionary, containing element attributes.
# @param **extra Additional attributes, given as keyword arguments.
# @see Element
# @see SubElement
# @see Comment
# @see ProcessingInstruction

class Element(object):
    # <tag attrib>text<child/>...</tag>tail

    ##
    # (Attribute) Element tag.

    tag = None

    ##
    # (Attribute) Element attribute dictionary.  Where possible, use
    # {@link #Element.get},
    # {@link #Element.set},
    # {@link #Element.keys}, and
    # {@link #Element.items} to access
    # element attributes.

    attrib = None

    ##
    # (Attribute) Text before first subelement.  This is either a
    # string or the value None.  Note that if there was no text, this
    # attribute may be either None or an empty string, depending on
    # the parser.

    text = None

    ##
    # (Attribute) Text after this element's end tag, but before the
    # next sibling element's start tag.  This is either a string or
    # the value None.  Note that if there was no text, this attribute
    # may be either None or an empty string, depending on the parser.

    tail = None # text after end tag, if any

    # constructor

    def __init__(self, tag, attrib={}, **extra):
        # the shared default dictionary is never modified: it is
        # copied before the keyword attributes are merged in
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    ##
    # Returns a debugging representation showing the tag and the
    # object identity.

    def __repr__(self):
        return "<Element %s at 0x%x>" % (repr(self.tag), id(self))

    ##
    # Creates a new element object of the same type as this element.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.
    # @return A new element instance.

    def makeelement(self, tag, attrib):
        return self.__class__(tag, attrib)

    ##
    # (Experimental) Copies the current element.  This creates a
    # shallow copy; subelements will be shared with the original tree.
    #
    # @return A new element instance.

    def copy(self):
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    ##
    # Returns the number of subelements.  Note that this only counts
    # full elements; to check if there's any content in an element, you
    # have to check both the length and the <b>text</b> attribute.
    #
    # @return The number of subelements.

    def __len__(self):
        return len(self._children)

    ##
    # Truth test.  Issues a FutureWarning and reports whether the
    # element has any subelements.
    #
    # @return A true value if the element has at least one subelement.

    def __nonzero__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    ##
    # Returns the given subelement, by index.
    #
    # @param index What subelement to return.
    # @return The given subelement.
    # @exception IndexError If the given element does not exist.

    def __getitem__(self, index):
        return self._children[index]

    ##
    # Replaces the given subelement, by index.
    #
    # @param index What subelement to replace.
    # @param element The new element value.
    # @exception IndexError If the given element does not exist.

    def __setitem__(self, index, element):
        # if isinstance(index, slice):
        #     for elt in element:
        #         assert iselement(elt)
        # else:
        #     assert iselement(element)
        self._children[index] = element

    ##
    # Deletes the given subelement, by index.
    #
    # @param index What subelement to delete.
    # @exception IndexError If the given element does not exist.

    def __delitem__(self, index):
        del self._children[index]

    ##
    # Adds a subelement to the end of this element.  In document order,
    # the new element will appear after the last existing subelement (or
    # directly after the text, if it's the first subelement), but before
    # the end tag for this element.
    #
    # @param element The element to add.

    def append(self, element):
        # assert iselement(element)
        self._children.append(element)

    ##
    # Appends subelements from a sequence.
    #
    # @param elements A sequence object with zero or more elements.
    # @since 1.3

    def extend(self, elements):
        # for element in elements:
        #     assert iselement(element)
        self._children.extend(elements)

    ##
    # Inserts a subelement at the given position in this element.
    #
    # @param index Where to insert the new subelement.
    # @param element The element to insert.

    def insert(self, index, element):
        # assert iselement(element)
        self._children.insert(index, element)

    ##
    # Removes a matching subelement.  Unlike the <b>find</b> methods,
    # this method compares elements based on identity, not on tag
    # value or contents.  To remove subelements by other means, the
    # easiest way is often to use a list comprehension to select what
    # elements to keep, and use slice assignment to update the parent
    # element.
    #
    # @param element What element to remove.
    # @exception ValueError If a matching element could not be found.

    def remove(self, element):
        # assert iselement(element)
        self._children.remove(element)

    ##
    # (Deprecated) Returns all subelements.  The elements are returned
    # in document order.
    #
    # @return A list of subelements.
    # @defreturn list of Element instances

    def getchildren(self):
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    ##
    # Finds the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @keyparam namespaces Optional namespace prefix map.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path, namespaces=None):
        return ElementPath.find(self, path, namespaces)

    ##
    # Finds text for the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @param default What to return if the element was not found.
    # @keyparam namespaces Optional namespace prefix map.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None, namespaces=None):
        return ElementPath.findtext(self, path, default, namespaces)

    ##
    # Finds all matching subelements, by tag name or path.
    #
    # @param path What element to look for.
    # @keyparam namespaces Optional namespace prefix map.
    # @return A list or other sequence containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path, namespaces=None):
        return ElementPath.findall(self, path, namespaces)

    ##
    # Finds all matching subelements, by tag name or path.
    #
    # @param path What element to look for.
    # @keyparam namespaces Optional namespace prefix map.
    # @return An iterator or sequence containing all matching elements,
    #    in document order.
    # @defreturn a generated sequence of Element instances

    def iterfind(self, path, namespaces=None):
        return ElementPath.iterfind(self, path, namespaces)

    ##
    # Resets an element.  This function removes all subelements, clears
    # all attributes, and sets the <b>text</b> and <b>tail</b> attributes
    # to None.

    def clear(self):
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    ##
    # Gets an element attribute.  Equivalent to <b>attrib.get</b>, but
    # some implementations may handle this a bit more efficiently.
    #
    # @param key What attribute to look for.
    # @param default What to return if the attribute was not found.
    # @return The attribute value, or the default value, if the
    #     attribute was not found.
    # @defreturn string or None

    def get(self, key, default=None):
        return self.attrib.get(key, default)

    ##
    # Sets an element attribute.  Equivalent to <b>attrib[key] = value</b>,
    # but some implementations may handle this a bit more efficiently.
    #
    # @param key What attribute to set.
    # @param value The attribute value.

    def set(self, key, value):
        self.attrib[key] = value

    ##
    # Gets a list of attribute names.  The names are returned in an
    # arbitrary order (just like for an ordinary Python dictionary).
    # Equivalent to <b>attrib.keys()</b>.
    #
    # @return A list of element attribute names.
    # @defreturn list of strings

    def keys(self):
        return self.attrib.keys()

    ##
    # Gets element attributes, as a sequence.  The attributes are
    # returned in an arbitrary order.  Equivalent to <b>attrib.items()</b>.
    #
    # @return A list of (name, value) tuples for all attributes.
    # @defreturn list of (string, string) tuples

    def items(self):
        return self.attrib.items()

    ##
    # Creates a tree iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all elements
    # with a matching tag.
    # <p>
    # If the tree structure is modified during iteration, new or removed
    # elements may or may not be included.  To get a stable set, use the
    # list() function on the iterator, and loop over the resulting list.
    #
    # @param tag What tags to look for (default is to return all elements).
    # @return An iterator containing all the matching elements.
    # @defreturn iterator

    def iter(self, tag=None):
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for child in self._children:
            for match in child.iter(tag):
                yield match

    # compatibility

    ##
    # (Compatibility) Same as {@link #Element.iter}, but issues a
    # PendingDeprecationWarning and returns a list.
    #
    # @param tag What tags to look for (default is to return all elements).
    # @return A list containing all the matching elements.
    # @defreturn list of Element instances

    def getiterator(self, tag=None):
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    ##
    # Creates a text iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all inner
    # text.
    #
    # @return An iterator containing all inner text.
    # @defreturn iterator

    def itertext(self):
        tag = self.tag
        # a tag that is neither a string nor None contributes no text
        if not isinstance(tag, basestring) and tag is not None:
            return
        if self.text:
            yield self.text
        for e in self:
            for s in e.itertext():
                yield s
            if e.tail:
                yield e.tail

# compatibility
_Element = _ElementInterface = Element

##
# Subelement factory.  This function creates an element instance, and
# appends it to an existing element.
# <p>
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param parent The parent element.
# @param tag The subelement name.
# @param attrib An optional dictionary, containing element attributes.
# @param **extra Additional attributes, given as keyword arguments.
# @return An element instance.
# @defreturn Element

def SubElement(parent, tag, attrib={}, **extra):
    # the shared default dictionary is never modified: it is copied
    # before the keyword attributes are merged in
    attrib = attrib.copy()
    attrib.update(extra)
    element = parent.makeelement(tag, attrib)
    parent.append(element)
    return element

##
# Comment element factory.  This factory function creates a special
# element that will be serialized as an XML comment by the standard
# serializer.
# <p>
# The comment string can be either an 8-bit ASCII string or a Unicode
# string.
#
# @param text A string containing the comment string.
# @return An element instance, representing a comment.
# @defreturn Element

def Comment(text=None):
    # the factory function itself is used as the tag; the serializers
    # recognise comments by identity ("tag is Comment")
    element = Element(Comment)
    element.text = text
    return element

##
# PI element factory.  This factory function creates a special element
# that will be serialized as an XML processing instruction by the standard
# serializer.
#
# @param target A string containing the PI target.
# @param text A string containing the PI contents, if any.
# @return An element instance, representing a PI.
# @defreturn Element

def ProcessingInstruction(target, text=None):
    # as for comments, the factory function doubles as the tag
    element = Element(ProcessingInstruction)
    element.text = target
    if text:
        element.text = element.text + " " + text
    return element

PI = ProcessingInstruction

##
# QName wrapper.  This can be used to wrap a QName attribute value, in
# order to get proper namespace handling on output.
#
# @param text A string containing the QName value, in the form {uri}local,
#     or, if the tag argument is given, the URI part of a QName.
# @param tag Optional tag.  If given, the first argument is interpreted as
#     an URI, and this argument is interpreted as a local name.
# @return An opaque object, representing the QName.

class QName(object):
    def __init__(self, text_or_uri, tag=None):
        if tag:
            text_or_uri = "{%s}%s" % (text_or_uri, tag)
        self.text = text_or_uri
    def __str__(self):
        return self.text
    def __hash__(self):
        return hash(self.text)
    def __cmp__(self, other):
        # compare on the wrapped text, so that a QName and the
        # equivalent plain string compare equal
        if isinstance(other, QName):
            return cmp(self.text, other.text)
        return cmp(self.text, other)

# --------------------------------------------------------------------

##
# ElementTree wrapper class.  This class represents an entire element
# hierarchy, and adds some extra support for serialization to and from
# standard XML.
#
# @param element Optional root element.
# @keyparam file Optional file handle or file name.  If given, the
#     tree is initialized with the contents of this XML file.

class ElementTree(object):

    def __init__(self, element=None, file=None):
        # assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        # assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.
    #
    # @param source A file name or file object.  If a file object is
    #     given, it only has to implement a <b>read(n)</b> method.
    #     A file opened here from a file name is closed before this
    #     method returns; a file object passed in is left open.
    # @keyparam parser An optional parser instance.  If not given, the
    #     standard {@link XMLParser} parser is used.
    # @return The document root element.
    # @defreturn Element
    # @exception ParseError If the parser fails to parse the document.

    def parse(self, source, parser=None):
        close_source = False
        if not hasattr(source, "read"):
            source = open(source, "rb")
            close_source = True
        try:
            if not parser:
                parser = XMLParser(target=TreeBuilder())
            while 1:
                data = source.read(65536)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            # only close what was opened here, also on parse errors
            if close_source:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def iter(self, tag=None):
        # assert self._root is not None
        return self._root.iter(tag)

    # compatibility

    ##
    # (Compatibility) Same as {@link #ElementTree.iter}, but issues a
    # PendingDeprecationWarning and returns a list.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return A list of matching elements.
    # @defreturn list of Element instances

    def getiterator(self, tag=None):
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'tree.iter()' or 'list(tree.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @keyparam namespaces Optional namespace prefix map.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path, namespaces=None):
        # assert self._root is not None
        if path[:1] == "/":
            # a leading slash is searched relative to the root element
            path = "." + path
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version. If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=2
                )
        return self._root.find(path, namespaces)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @keyparam namespaces Optional namespace prefix map.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None, namespaces=None):
        # assert self._root is not None
        if path[:1] == "/":
            path = "." + path
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version. If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=2
                )
        return self._root.findtext(path, default, namespaces)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @keyparam namespaces Optional namespace prefix map.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path, namespaces=None):
        # assert self._root is not None
        if path[:1] == "/":
            path = "." + path
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version. If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=2
                )
        return self._root.findall(path, namespaces)

    ##
    # Finds all matching subelements, by tag name or path.
    # Same as getroot().iterfind(path).
    #
    # @param path What element to look for.
    # @keyparam namespaces Optional namespace prefix map.
    # @return An iterator or sequence containing all matching elements,
    #    in document order.
    # @defreturn a generated sequence of Element instances

    def iterfind(self, path, namespaces=None):
        # assert self._root is not None
        if path[:1] == "/":
            path = "." + path
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version. If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=2
                )
        return self._root.iterfind(path, namespaces)

    ##
    # Writes the element tree to a file, as XML.
    #
    # @def write(file, **options)
    # @param file A file name, or a file object opened for writing.
    #     A file opened here from a file name is always closed again,
    #     also if serialization fails; a file object passed in is
    #     left open.
    # @param **options Options, given as keyword arguments.
    # @keyparam encoding Optional output encoding (default is US-ASCII).
    # @keyparam method Optional output method ("xml", "html", "text" or
    #     "c14n"; default is "xml").
    # @keyparam xml_declaration Controls if an XML declaration should
    #     be added to the file.  Use False for never, True for always,
    #     None for only if not US-ASCII or UTF-8.  None is default.
    #     The declaration is only written by the "xml" method.
    # @keyparam default_namespace Optional namespace URI that is
    #     serialized without a prefix.
    # @exception ValueError If the output method is not known.

    def write(self, file_or_filename,
              # keyword arguments
              encoding=None,
              xml_declaration=None,
              default_namespace=None,
              method=None):
        # assert self._root is not None
        if not method:
            method = "xml"
        elif method not in _serialize:
            # FIXME: raise an ImportError for c14n if ElementC14N is missing?
            raise ValueError("unknown method %r" % method)
        if not encoding:
            if method == "c14n":
                encoding = "utf-8"
            else:
                encoding = "us-ascii"
        # decided after the encoding default has been applied, so that
        # xml_declaration=True is honoured without an explicit encoding
        declare = method == "xml" and (
            xml_declaration or (
                xml_declaration is None and
                encoding not in ("utf-8", "us-ascii")
                )
            )
        if hasattr(file_or_filename, "write"):
            file = file_or_filename
        else:
            file = open(file_or_filename, "wb")
        try:
            write = file.write
            if declare:
                write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            if method == "text":
                _serialize_text(write, self._root, encoding)
            else:
                qnames, namespaces = _namespaces(
                    self._root, encoding, default_namespace
                    )
                serialize = _serialize[method]
                serialize(write, self._root, encoding, qnames, namespaces)
        finally:
            # only close what was opened here
            if file_or_filename is not file:
                file.close()

    ##
    # Writes the tree using the "c14n" output method.
    #
    # @param file A file name, or a file object opened for writing.

    def write_c14n(self, file):
        # lxml.etree compatibility.  use output method instead
        return self.write(file, method="c14n")

# --------------------------------------------------------------------
# serialization support

##
# (Internal) Collects the qualified names and namespaces used in a tree.
#
# @param elem The root of the tree to scan.
# @param encoding The encoding used for the serialized names.
# @param default_namespace Optional namespace URI to map to the empty
#     prefix.
# @return A (qnames, namespaces) tuple: a dictionary mapping names to
#     encoded prefix:local names, and a dictionary mapping namespace
#     URIs to prefixes.
# @exception ValueError If a non-qualified name is used together with
#     a default namespace.
# @exception TypeError If a tag or name cannot be serialized.

def _namespaces(elem, encoding, default_namespace=None):
    # identify namespaces used in this tree

    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def encode(text):
        return text.encode(encoding)

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].rsplit("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = encode("%s:%s" % (prefix, tag))
                else:
                    qnames[qname] = encode(tag) # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = encode(qname)
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator # cET compatibility
    for elem in iterate():
        tag = elem.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, basestring):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in elem.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = elem.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces

##
# (Internal) Serializes an element and its subelements as XML.
#
# @param write A callable that receives each chunk of output.
# @param elem The element to serialize.
# @param encoding The output encoding.
# @param qnames Name table, as returned by _namespaces.
# @param namespaces Namespace table to declare on this element, or
#     None.  Subelements are always serialized with None, so the
#     declarations only appear on the element this call started with.

def _serialize_xml(write, elem, encoding, qnames, namespaces):
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _encode(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _encode(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            # no tag: only the text and the subelements are written
            if text:
                write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_xml(write, e, encoding, qnames, None)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            _escape_attrib(v, encoding)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v, encoding)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem):
                write(">")
                if text:
                    write(_escape_cdata(text, encoding))
                for e in elem:
                    _serialize_xml(write, e, encoding, qnames, None)
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))

# elements that the html serializer writes without an end tag
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param")

try:
    HTML_EMPTY = set(HTML_EMPTY)
except NameError:
    pass

##
# (Internal) Serializes an element and its subelements as HTML.
# Takes the same arguments as _serialize_xml.  The text of "script"
# and "style" elements is written without escaping, and elements
# listed in HTML_EMPTY get no end tag.

def _serialize_html(write, elem, encoding, qnames, namespaces):
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            if text:
                write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            _escape_attrib(v, encoding)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v, encoding)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            tag = tag.lower()
            if text:
                if tag == "script" or tag == "style":
                    write(_encode(text, encoding))
                else:
                    write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
            if tag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))

##
# (Internal) Serializes the text content of an element, followed by
# its tail, without any markup.

def _serialize_text(write, elem, encoding):
    for part in elem.itertext():
        write(part.encode(encoding))
    if elem.tail:
        write(elem.tail.encode(encoding))

# maps output method names to serializer functions
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}

##
# Registers a namespace prefix.  The registry is global, and any
# existing mapping for either the given prefix or the namespace URI
# will be removed.
#
# @param prefix Namespace prefix.
# @param uri Namespace uri.  Tags and attributes in this namespace
#     will be serialized with the given prefix, if at all possible.
# @exception ValueError If the prefix is reserved, or is otherwise
#     invalid.

def register_namespace(prefix, uri):
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # iterate over a snapshot; entries are deleted inside the loop
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}

# (Internal) Raises the TypeError used for values that cannot be
# serialized.
def _raise_serialization_error(text):
    raise TypeError(
        "cannot serialize %r (type %s)" % (text, type(text).__name__)
        )

# (Internal) Encodes text without escaping markup characters;
# characters that the encoding cannot represent are written as
# character references.
def _encode(text, encoding):
    try:
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

def _escape_cdata(text, encoding):
    # escape character data
    # NOTE: every replacement below must be an entity reference
    # (ampersand, entity name, semicolon), never the bare character.
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 character, or so.  assume that's, by far,
        # the most common case in most applications.
        if "&" in text:
            # must come first, so that the ampersands introduced by
            # the replacements below are not escaped again
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

def _escape_attrib(text, encoding):
    # escape attribute value
    # NOTE: every replacement below must be an entity or character
    # reference, never the bare character.
    try:
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        if "\n" in text:
            # written as a character reference (decimal 10): a literal
            # newline in an attribute value is turned into a space by
            # XML attribute-value normalization
            text = text.replace("\n", "&#10;")
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

def _escape_attrib_html(text, encoding):
    # escape attribute value
    # NOTE: "<" and newlines are deliberately left alone here.
    try:
        if "&" in text:
            text = text.replace("&", "&amp;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)

# --------------------------------------------------------------------

##
# Generates a string representation of an XML element, including all
# subelements.
#
# @param element An Element instance.
# @keyparam encoding Optional output encoding (default is US-ASCII).
# @keyparam method Optional output method ("xml", "html", "text" or
#     "c14n"; default is "xml").
# @return An encoded string containing the XML data.
# @defreturn string

def tostring(element, encoding=None, method=None):
    # collect everything the serializer writes, then glue it together
    fragments = []
    class _Sink:
        # minimal file-like object: write() appends to the fragment list
        write = fragments.append
    ElementTree(element).write(_Sink(), encoding, method=method)
    return "".join(fragments)

##
# Generates a string representation of an XML element, including all
# subelements.  The string is returned as a sequence of string fragments.
#
# @param element An Element instance.
# @keyparam encoding Optional output encoding (default is US-ASCII).
# @keyparam method Optional output method ("xml", "html", "text" or
#     "c14n"; default is "xml").
# @return A sequence object containing the XML data.
# @defreturn sequence
# @since 1.3

def tostringlist(element, encoding=None, method=None):
    # same capture technique as tostring, but the fragments are handed
    # back as they were written
    fragments = []
    class _Sink:
        write = fragments.append
    ElementTree(element).write(_Sink(), encoding, method=method)
    # FIXME: merge small fragments into larger parts
    return fragments

##
# Writes an element tree or element structure to sys.stdout.  This
# function should be used for debugging only.
# <p>
# The exact output format is implementation dependent.  In this
# version, it's written as an ordinary XML file.
#
# @param elem An element tree or an individual element.

def dump(elem):
    # debugging
    if isinstance(elem, ElementTree):
        tree = elem
    else:
        tree = ElementTree(elem)
    tree.write(sys.stdout)
    # finish the output with a newline unless the root's tail provides one
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")

# --------------------------------------------------------------------
# parsing

##
# Parses an XML document into an element tree.
#
# @param source A filename or file object containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return An ElementTree instance

def parse(source, parser=None):
    tree = ElementTree()
    tree.parse(source, parser)
    return tree

##
# Parses an XML document into an element tree incrementally, and reports
# what's going on to the user.
# <p>
# If <i>source</i> is a filename, the file is opened here and closed
# again when the iterator is exhausted, when parsing fails, or when the
# iterator cannot be created.  A file object passed in by the caller is
# never closed.
#
# @param source A filename or file object containing XML data.
# @param events A list of events to report back.  If omitted, only "end"
#     events are reported.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return A (event, elem) iterator.
# @exception ValueError If the events list contains an unknown event.

def iterparse(source, events=None, parser=None):
    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True
    try:
        if not parser:
            parser = XMLParser(target=TreeBuilder())
        return _IterParseIterator(source, events, parser, close_source)
    except:
        # don't leak the file we opened if setup fails (e.g. bad events)
        if close_source:
            source.close()
        raise

# Iterator returned by iterparse.  It installs handlers on the underlying
# expat parser that queue (event, value) tuples, and feeds the parser in
# 16k chunks whenever the queue runs dry.  The root element is published
# as the "root" attribute once the input is exhausted.
#
# close_source tells the iterator that it owns the file object and must
# close it when iteration ends, normally or with an exception.

class _IterParseIterator(object):

    def __init__(self, source, events, parser, close_source=False):
        self._file = source
        self._close_file = close_source
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = parser
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        for event in events:
            if event == "start":
                try:
                    # prefer the list-based attribute interface
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    try:
                        uri = (uri or "").encode("ascii")
                    except UnicodeError:
                        pass
                    append((event, (prefix or "", uri or "")))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler
            else:
                raise ValueError("unknown event %r" % event)

    # Returns the next queued (event, value) tuple, reading more input
    # when the queue is empty.  Raises StopIteration at end of input.

    def next(self):
        try:
            while 1:
                try:
                    item = self._events[self._index]
                except IndexError:
                    if self._parser is None:
                        # input exhausted on an earlier pass
                        self.root = self._root
                        raise StopIteration
                    # load event buffer
                    del self._events[:]
                    self._index = 0
                    data = self._file.read(16384)
                    if data:
                        self._parser.feed(data)
                    else:
                        self._root = self._parser.close()
                        self._parser = None
                else:
                    self._index = self._index + 1
                    return item
        except:
            # iteration is over (StopIteration) or parsing failed;
            # either way, release the file if iterparse opened it
            if self._close_file:
                self._file.close()
            raise

    def __iter__(self):
        return self

##
# Parses an XML document from a string constant.  This function can
# be used to embed "XML literals" in Python code.
#
# @param source A string containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return An Element instance.
# @defreturn Element

def XML(text, parser=None):
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    return parser.close()

##
# Parses an XML document from a string constant, and also returns
# a dictionary which maps from element id:s to elements.
#
# @param source A string containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return A tuple containing an Element instance and a dictionary.
# @defreturn (Element, dictionary)

def XMLID(text, parser=None):
    parser = parser or XMLParser(target=TreeBuilder())
    parser.feed(text)
    root = parser.close()
    # index every element that carries a non-empty "id" attribute; a
    # later element with the same id replaces an earlier one
    by_id = {}
    for node in root.iter():
        ident = node.get("id")
        if ident:
            by_id[ident] = node
    return root, by_id

##
# Parses an XML document from a string constant.  Same as {@link #XML}.
#
# @def fromstring(text)
# @param source A string containing XML data.
# @return An Element instance.
# @defreturn Element

fromstring = XML

##
# Parses an XML document from a sequence of string fragments.
#
# @param sequence A list or other sequence containing XML data fragments.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return An Element instance.
# @defreturn Element
# @since 1.3

def fromstringlist(sequence, parser=None):
    parser = parser or XMLParser(target=TreeBuilder())
    # hand the fragments to the parser one by one, in order
    for fragment in sequence:
        parser.feed(fragment)
    return parser.close()

# --------------------------------------------------------------------

##
# Generic element structure builder.  This builder converts a sequence
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
# #TreeBuilder.end} method calls to a well-formed element structure.
# <p>
# You can use this class to build an element structure using a custom XML
# parser, or a parser for some other XML-like format.
#
# @param element_factory Optional element factory.  This factory
#     is called to create new Element instances, as necessary.
1346 1347 class TreeBuilder(object): 1348 1349 def __init__(self, element_factory=None): 1350 self._data = [] # data collector 1351 self._elem = [] # element stack 1352 self._last = None # last element 1353 self._tail = None # true if we're after an end tag 1354 if element_factory is None: 1355 element_factory = Element 1356 self._factory = element_factory 1357 1358 ## 1359 # Flushes the builder buffers, and returns the toplevel document 1360 # element. 1361 # 1362 # @return An Element instance. 1363 # @defreturn Element 1364 1365 def close(self): 1366 assert len(self._elem) == 0, "missing end tags" 1367 assert self._last is not None, "missing toplevel element" 1368 return self._last 1369 1370 def _flush(self): 1371 if self._data: 1372 if self._last is not None: 1373 text = "".join(self._data) 1374 if self._tail: 1375 assert self._last.tail is None, "internal error (tail)" 1376 self._last.tail = text 1377 else: 1378 assert self._last.text is None, "internal error (text)" 1379 self._last.text = text 1380 self._data = [] 1381 1382 ## 1383 # Adds text to the current element. 1384 # 1385 # @param data A string. This should be either an 8-bit string 1386 # containing ASCII text, or a Unicode string. 1387 1388 def data(self, data): 1389 self._data.append(data) 1390 1391 ## 1392 # Opens a new element. 1393 # 1394 # @param tag The element name. 1395 # @param attrib A dictionary containing element attributes. 1396 # @return The opened element. 1397 # @defreturn Element 1398 1399 def start(self, tag, attrs): 1400 self._flush() 1401 self._last = elem = self._factory(tag, attrs) 1402 if self._elem: 1403 self._elem[-1].append(elem) 1404 self._elem.append(elem) 1405 self._tail = 0 1406 return elem 1407 1408 ## 1409 # Closes the current element. 1410 # 1411 # @param tag The element name. 1412 # @return The closed element. 
1413 # @defreturn Element 1414 1415 def end(self, tag): 1416 self._flush() 1417 self._last = self._elem.pop() 1418 assert self._last.tag == tag,\ 1419 "end tag mismatch (expected %s, got %s)" % ( 1420 self._last.tag, tag) 1421 self._tail = 1 1422 return self._last 1423 1424 ## 1425 # Element structure builder for XML source data, based on the 1426 # <b>expat</b> parser. 1427 # 1428 # @keyparam target Target object. If omitted, the builder uses an 1429 # instance of the standard {@link #TreeBuilder} class. 1430 # @keyparam html Predefine HTML entities. This flag is not supported 1431 # by the current implementation. 1432 # @keyparam encoding Optional encoding. If given, the value overrides 1433 # the encoding specified in the XML file. 1434 # @see #ElementTree 1435 # @see #TreeBuilder 1436 1437 class XMLParser(object): 1438 1439 def __init__(self, html=0, target=None, encoding=None): 1440 try: 1441 from xml.parsers import expat 1442 except ImportError: 1443 try: 1444 import pyexpat as expat 1445 except ImportError: 1446 raise ImportError( 1447 "No module named expat; use SimpleXMLTreeBuilder instead" 1448 ) 1449 parser = expat.ParserCreate(encoding, "}") 1450 if target is None: 1451 target = TreeBuilder() 1452 # underscored names are provided for compatibility only 1453 self.parser = self._parser = parser 1454 self.target = self._target = target 1455 self._error = expat.error 1456 self._names = {} # name memo cache 1457 # callbacks 1458 parser.DefaultHandlerExpand = self._default 1459 parser.StartElementHandler = self._start 1460 parser.EndElementHandler = self._end 1461 parser.CharacterDataHandler = self._data 1462 # optional callbacks 1463 parser.CommentHandler = self._comment 1464 parser.ProcessingInstructionHandler = self._pi 1465 # let expat do the buffering, if supported 1466 try: 1467 self._parser.buffer_text = 1 1468 except AttributeError: 1469 pass 1470 # use new-style attribute handling, if supported 1471 try: 1472 self._parser.ordered_attributes = 1 1473 
self._parser.specified_attributes = 1 1474 parser.StartElementHandler = self._start_list 1475 except AttributeError: 1476 pass 1477 self._doctype = None 1478 self.entity = {} 1479 try: 1480 self.version = "Expat %d.%d.%d" % expat.version_info 1481 except AttributeError: 1482 pass # unknown 1483 1484 def _raiseerror(self, value): 1485 err = ParseError(value) 1486 err.code = value.code 1487 err.position = value.lineno, value.offset 1488 raise err 1489 1490 def _fixtext(self, text): 1491 # convert text string to ascii, if possible 1492 try: 1493 return text.encode("ascii") 1494 except UnicodeError: 1495 return text 1496 1497 def _fixname(self, key): 1498 # expand qname, and convert name string to ascii, if possible 1499 try: 1500 name = self._names[key] 1501 except KeyError: 1502 name = key 1503 if "}" in name: 1504 name = "{" + name 1505 self._names[key] = name = self._fixtext(name) 1506 return name 1507 1508 def _start(self, tag, attrib_in): 1509 fixname = self._fixname 1510 fixtext = self._fixtext 1511 tag = fixname(tag) 1512 attrib = {} 1513 for key, value in attrib_in.items(): 1514 attrib[fixname(key)] = fixtext(value) 1515 return self.target.start(tag, attrib) 1516 1517 def _start_list(self, tag, attrib_in): 1518 fixname = self._fixname 1519 fixtext = self._fixtext 1520 tag = fixname(tag) 1521 attrib = {} 1522 if attrib_in: 1523 for i in range(0, len(attrib_in), 2): 1524 attrib[fixname(attrib_in[i])] = fixtext(attrib_in[i+1]) 1525 return self.target.start(tag, attrib) 1526 1527 def _data(self, text): 1528 return self.target.data(self._fixtext(text)) 1529 1530 def _end(self, tag): 1531 return self.target.end(self._fixname(tag)) 1532 1533 def _comment(self, data): 1534 try: 1535 comment = self.target.comment 1536 except AttributeError: 1537 pass 1538 else: 1539 return comment(self._fixtext(data)) 1540 1541 def _pi(self, target, data): 1542 try: 1543 pi = self.target.pi 1544 except AttributeError: 1545 pass 1546 else: 1547 return pi(self._fixtext(target), 
self._fixtext(data)) 1548 1549 def _default(self, text): 1550 prefix = text[:1] 1551 if prefix == "&": 1552 # deal with undefined entities 1553 try: 1554 self.target.data(self.entity[text[1:-1]]) 1555 except KeyError: 1556 from xml.parsers import expat 1557 err = expat.error( 1558 "undefined entity %s: line %d, column %d" % 1559 (text, self._parser.ErrorLineNumber, 1560 self._parser.ErrorColumnNumber) 1561 ) 1562 err.code = 11 # XML_ERROR_UNDEFINED_ENTITY 1563 err.lineno = self._parser.ErrorLineNumber 1564 err.offset = self._parser.ErrorColumnNumber 1565 raise err 1566 elif prefix == "<" and text[:9] == "<!DOCTYPE": 1567 self._doctype = [] # inside a doctype declaration 1568 elif self._doctype is not None: 1569 # parse doctype contents 1570 if prefix == ">": 1571 self._doctype = None 1572 return 1573 text = text.strip() 1574 if not text: 1575 return 1576 self._doctype.append(text) 1577 n = len(self._doctype) 1578 if n > 2: 1579 type = self._doctype[1] 1580 if type == "PUBLIC" and n == 4: 1581 name, type, pubid, system = self._doctype 1582 elif type == "SYSTEM" and n == 3: 1583 name, type, system = self._doctype 1584 pubid = None 1585 else: 1586 return 1587 if pubid: 1588 pubid = pubid[1:-1] 1589 if hasattr(self.target, "doctype"): 1590 self.target.doctype(name, pubid, system[1:-1]) 1591 elif self.doctype is not self._XMLParser__doctype: 1592 # warn about deprecated call 1593 self._XMLParser__doctype(name, pubid, system[1:-1]) 1594 self.doctype(name, pubid, system[1:-1]) 1595 self._doctype = None 1596 1597 ## 1598 # (Deprecated) Handles a doctype declaration. 1599 # 1600 # @param name Doctype name. 1601 # @param pubid Public identifier. 1602 # @param system System identifier. 1603 1604 def doctype(self, name, pubid, system): 1605 """This method of XMLParser is deprecated.""" 1606 warnings.warn( 1607 "This method of XMLParser is deprecated. 
Define doctype() " 1608 "method on the TreeBuilder target.", 1609 DeprecationWarning, 1610 ) 1611 1612 # sentinel, if doctype is redefined in a subclass 1613 __doctype = doctype 1614 1615 ## 1616 # Feeds data to the parser. 1617 # 1618 # @param data Encoded data. 1619 1620 def feed(self, data): 1621 try: 1622 self._parser.Parse(data, 0) 1623 except self._error, v: 1624 self._raiseerror(v) 1625 1626 ## 1627 # Finishes feeding data to the parser. 1628 # 1629 # @return An element structure. 1630 # @defreturn Element 1631 1632 def close(self): 1633 try: 1634 self._parser.Parse("", 1) # end of data 1635 except self._error, v: 1636 self._raiseerror(v) 1637 tree = self.target.close() 1638 del self.target, self._parser # get rid of circular references 1639 return tree 1640 1641 # compatibility 1642 XMLTreeBuilder = XMLParser 1643 1644 # workaround circular import. 1645 try: 1646 from ElementC14N import _serialize_c14n 1647 _serialize["c14n"] = _serialize_c14n 1648 except ImportError: 1649 pass 1650