import libxml2mod
import types
import sys

# The root of all libxml2 errors.
class libxmlError(Exception): pass

# Type of the wrapper class for the C objects wrappers
def checkWrapper(obj):
    """Tell whether obj is an unexpected kind of C object wrapper.

    Returns 0 when obj is None or when its type is named 'PyCObject' or
    'PyCapsule', and 1 for any other object.
    """
    # None is accepted because the wrapper classes in this file use
    # _obj=None as their default.
    if obj is None:
        return 0
    try:
        n = type(obj).__name__
    except Exception:
        # Best effort: an object that cannot be inspected is not
        # reported as an error.
        return 0
    if n != 'PyCObject' and n != 'PyCapsule':
        return 1
    return 0

#
# id() is sometimes negative ...
#
def pos_id(o):
    """Return id(o), or sys.maxsize - id(o) when id(o) is negative."""
    i = id(o)
    if (i < 0):
        return (sys.maxsize - i)
    return i

#
# Errors raised by the wrappers when some tree handling failed.
#
class treeError(libxmlError):
    """libxml2 error carrying a message in .msg (tree handling)."""
    def __init__(self, msg):
        # Also populate Exception.args so the message is not lost when
        # the exception is printed by generic code.
        libxmlError.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg

class parserError(libxmlError):
    """libxml2 error carrying a message in .msg; raised by
    SAXCallback.error() and SAXCallback.fatalError()."""
    def __init__(self, msg):
        libxmlError.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg

class uriError(libxmlError):
    """libxml2 error carrying a message in .msg (presumably for URI
    handling, going by its name; it is raised outside this section)."""
    def __init__(self, msg):
        libxmlError.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg

class xpathError(libxmlError):
    """libxml2 error carrying a message in .msg (presumably for XPath
    evaluation, going by its name; it is raised outside this section)."""
    def __init__(self, msg):
        libxmlError.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg

class ioWrapper:
    """Wrap a Python I/O object behind io_close/io_flush/io_read/io_write.

    Every io_* method returns -1 once no I/O object is attached.
    """
    def __init__(self, _obj):
        self.__io = _obj    # the wrapped Python I/O object, or None
        self._o = None      # libxml2 buffer handle, set by subclasses

    def io_close(self):
        """Close and detach the wrapped object; 0 on success, -1 if none."""
        if self.__io is None:
            return(-1)
        self.__io.close()
        self.__io = None
        return(0)

    def io_flush(self):
        """Flush the wrapped object; 0 on success, -1 if none."""
        if self.__io is None:
            return(-1)
        self.__io.flush()
        return(0)

    def io_read(self, len = -1):
        """Read from the wrapped object.

        len < 0 reads everything, otherwise at most len items are
        requested.  Returns the data read, or -1 when there is no
        wrapped object or the read raised an exception.
        """
        if self.__io is None:
            return(-1)
        try:
            if len < 0:
                ret = self.__io.read()
            else:
                ret = self.__io.read(len)
        except Exception:
            e = sys.exc_info()[1]
            print("failed to read from Python:", type(e))
            print("on IO:", self.__io)
            # Detach the failing object so later calls return -1
            # instead of hitting the same error again.
            self.__io = None
            return(-1)

        return(ret)

    def io_write(self, str, len = -1):
        """Write str to the wrapped object and return what write() returns.

        Returns -1 when there is no wrapped object.  When len >= 0 it is
        forwarded as a second argument to write().
        NOTE(review): standard file objects reject a second argument to
        write(); confirm which I/O objects rely on the len form.
        """
        if self.__io is None:
            return(-1)
        if len < 0:
            return(self.__io.write(str))
        return(self.__io.write(str, len))

class ioReadWrapper(ioWrapper):
    """Input side: wraps a Python object and creates a libxml2 input
    buffer for it with libxml2mod.xmlCreateInputBuffer()."""
    def __init__(self, _obj, enc = ""):
        ioWrapper.__init__(self, _obj)
        self._o = libxml2mod.xmlCreateInputBuffer(self, enc)

    def __del__(self):
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlFreeParserInputBuffer(self._o)
            self._o = None

    def close(self):
        """Close the Python object and free the libxml2 input buffer."""
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlFreeParserInputBuffer(self._o)
            self._o = None

class ioWriteWrapper(ioWrapper):
    """Output side: wraps an existing libxml2 output buffer handle."""
    def __init__(self, _obj, enc = ""):
        """Attach to _obj.

        A string is not supported as a target: a message is printed and
        the wrapper is left detached.  Any other object is kept as the
        output buffer handle; the Python file recorded for it by
        libxml2mod.outputBufferGetPythonFile(), if any, becomes the
        wrapped I/O object, otherwise _obj itself does.  enc is unused.
        """
        if isinstance(_obj, str):
            print("write io from a string")
            # Initialise the base state so that io_* calls and __del__
            # work on this detached wrapper.
            ioWrapper.__init__(self, None)
            return
        file = libxml2mod.outputBufferGetPythonFile(_obj)
        if file is not None:
            ioWrapper.__init__(self, file)
        else:
            ioWrapper.__init__(self, _obj)
        self._o = _obj

    def __del__(self):
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
            self._o = None

    def flush(self):
        """Flush the Python object, then close the output buffer.

        NOTE(review): this closes the libxml2 buffer rather than only
        flushing it; confirm that this is intended.
        """
        self.io_flush()
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
            self._o = None

    def close(self):
        """Flush the Python object and close the output buffer.

        The wrapped Python object itself is flushed, not closed, here.
        """
        self.io_flush()
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
            self._o = None

#
# Example of a class to handle SAX events
#
class SAXCallback:
    """Base class for SAX handlers"""
    def startDocument(self):
        """called at the start of the document"""
        pass

    def endDocument(self):
        """called at the end of the document"""
        pass

    def startElement(self, tag, attrs):
        """called at the start of every element, tag is the name of
           the element, attrs is a dictionary of the element's attributes"""
        pass

    def endElement(self, tag):
        """called at the end of every element, tag is the name of
           the element"""
        pass

    def characters(self, data):
        """called when character data have been read, data is the string
           containing the data, multiple consecutive characters() callbacks
           are possible."""
        pass

    def cdataBlock(self, data):
        """called when a CDATA section has been read, data is the string
           containing the data, multiple consecutive cdataBlock() callbacks
           are possible."""
        pass

    def reference(self, name):
        """called when an entity reference has been found"""
        pass

    def ignorableWhitespace(self, data):
        """called when potentially ignorable white spaces have been found"""
        pass

    def processingInstruction(self, target, data):
        """called when a PI has been found, target contains the PI name and
           data is the associated data in the PI"""
        pass

    def comment(self, content):
        """called when a comment has been found, content contains the comment"""
        pass

    def externalSubset(self, name, externalID, systemID):
        """called when a DOCTYPE declaration has been found, name is the
           DTD name and externalID, systemID are the DTD public and system
           identifier for that DTD if available"""
        pass

    def internalSubset(self, name, externalID, systemID):
        """called when a DOCTYPE declaration has been found, name is the
           DTD name and externalID, systemID are the DTD public and system
           identifier for that DTD if available"""
        pass

    def entityDecl(self, name, type, externalID, systemID, content):
        """called when an ENTITY declaration has been found, name is the
           entity name and externalID, systemID are the entity public and
           system identifier for that entity if available, type indicates
           the entity type, and content reports its string content"""
        pass

    def notationDecl(self, name, externalID, systemID):
        """called when a NOTATION declaration has been found, name is the
           notation name and externalID, systemID are the notation public and
           system identifier for that notation if available"""
        pass

    def attributeDecl(self, elem, name, type, defi, defaultValue, nameList):
        """called when an ATTRIBUTE definition has been found"""
        pass

    def elementDecl(self, name, type, content):
        """called when an ELEMENT definition has been found"""
        pass

    # This handler has its own name so that it no longer replaces
    # entityDecl() above, which takes a different argument list.
    def unparsedEntityDecl(self, name, publicId, systemID, notationName):
        """called when an unparsed ENTITY declaration has been found,
           name is the entity name and publicId, systemID are the entity
           public and system identifier for that entity if available,
           and notationName indicates the associated NOTATION"""
        pass

    def warning(self, msg):
        """called with a warning message; ignored by default"""
        #print msg
        pass

    def error(self, msg):
        """called with an error message; raises parserError(msg)"""
        raise parserError(msg)

    def fatalError(self, msg):
        """called with a fatal error message; raises parserError(msg)"""
        raise parserError(msg)

#
# This class is the ancestor of all the Node classes.
# It provides
# the basic functionalities shared by all nodes (and handles
# gracefully the exception), like name, navigation in the tree,
# doc reference, content access and serializing to a string or URI
#
class xmlCore:
    """Common base of the node wrappers; self._o holds the C object."""
    def __init__(self, _obj=None):
        self._o = _obj

    def __eq__(self, other):
        """Compare the underlying nodes with libxml2mod.compareNodesEqual().

        None compares unequal; an operand without an _o attribute
        yields NotImplemented so Python falls back to its default.
        """
        if other is None:
            return False
        try:
            other_obj = other._o
        except AttributeError:
            return NotImplemented
        ret = libxml2mod.compareNodesEqual(self._o, other_obj)
        if ret is None:
            return False
        return ret == True
    def __ne__(self, other):
        """Negation of __eq__, with the same treatment of foreign operands."""
        if other is None:
            return True
        try:
            other_obj = other._o
        except AttributeError:
            return NotImplemented
        ret = libxml2mod.compareNodesEqual(self._o, other_obj)
        return not ret
    def __hash__(self):
        """Hash computed by libxml2mod.nodeHash() on the C object."""
        ret = libxml2mod.nodeHash(self._o)
        return ret

    def __str__(self):
        return self.serialize()
    def get_parent(self):
        """Return the wrapped parent node, or None."""
        ret = libxml2mod.parent(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_children(self):
        """Return the wrapped first child node, or None."""
        ret = libxml2mod.children(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_last(self):
        """Return the wrapped node given by libxml2mod.last(), or None."""
        ret = libxml2mod.last(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_next(self):
        """Return the wrapped next sibling node, or None."""
        ret = libxml2mod.next(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_properties(self):
        """Return the first attribute as an xmlAttr, or None."""
        ret = libxml2mod.properties(self._o)
        if ret is None:
            return None
        return xmlAttr(_obj=ret)
    def get_prev(self):
        """Return the wrapped previous sibling node, or None."""
        ret = libxml2mod.prev(self._o)
        if ret is None:
            return None
        return nodeWrap(ret)
    def get_content(self):
        """Return the result of libxml2mod.xmlNodeGetContent()."""
        return libxml2mod.xmlNodeGetContent(self._o)
    getContent = get_content  # why is this duplicate naming needed ?
    def get_name(self):
        """Return the node name."""
        return libxml2mod.name(self._o)
    def get_type(self):
        """Return the node type name (a string such as "element")."""
        return libxml2mod.type(self._o)
    def get_doc(self):
        """Return the owning document as an xmlDoc.

        When libxml2mod.doc() gives None, a node whose type is
        "document_xml" or "document_html" is wrapped as its own
        document; any other node yields None.
        """
        ret = libxml2mod.doc(self._o)
        if ret is None:
            if self.type in ["document_xml", "document_html"]:
                return xmlDoc(_obj=self._o)
            else:
                return None
        return xmlDoc(_obj=ret)
    #
    # Those are common attributes to nearly all type of nodes
    # defined as python2 properties
    #
    import sys
    if sys.version_info < (2, 2):
        # No property() before Python 2.2: route attribute access to
        # the get_* accessors by hand.
        def __getattr__(self, attr):
            if attr in ("parent", "properties", "children", "last", "next",
                        "prev", "content", "name", "type", "doc"):
                return getattr(self, "get_" + attr)()
            raise AttributeError(attr)
    else:
        parent = property(get_parent, None, None, "Parent node")
        children = property(get_children, None, None, "First child node")
        last = property(get_last, None, None, "Last sibling node")
        next = property(get_next, None, None, "Next sibling node")
        prev = property(get_prev, None, None, "Previous sibling node")
        properties = property(get_properties, None, None, "List of properies")
        content = property(get_content, None, None, "Content of this node")
        name = property(get_name, None, None, "Node name")
        type = property(get_type, None, None, "Node type")
        doc = property(get_doc, None, None, "The document this node belongs to")

    #
    # Serialization routines, the optional arguments have the following
    # meaning:
    #     encoding: string to ask saving in a specific encoding
    #     format: if 1 the serializer is asked to indent the output
    #
    def serialize(self, encoding = None, format = 0):
        """Return the node serialized by libxml2mod.serializeNode()."""
        return libxml2mod.serializeNode(self._o, encoding, format)
    def saveTo(self, file, encoding = None, format = 0):
        """Serialize the node to file with libxml2mod.saveNodeTo()."""
        return libxml2mod.saveNodeTo(self._o, file, encoding, format)

    #
    # Canonicalization routines:
    #
    #   nodes: the node set (tuple or list) to be included in the
    #     canonized image or None if all document nodes should be
    #     included.
    #   exclusive: the exclusive flag (0 - non-exclusive
    #     canonicalization; otherwise - exclusive canonicalization)
    #   prefixes: the list of inclusive namespace prefixes (strings),
    #     or None if there is no inclusive namespaces (only for
    #     exclusive canonicalization, ignored otherwise)
    #   with_comments: include comments in the result (!=0) or not
    #     (==0)
    def c14nMemory(self,
                   nodes=None,
                   exclusive=0,
                   prefixes=None,
                   with_comments=0):
        """Canonicalize this node's document in memory; see the comment
        above for the arguments."""
        if nodes:
            # The C layer works on the raw objects, not the wrappers.
            nodes = [n._o for n in nodes]
        return libxml2mod.xmlC14NDocDumpMemory(
            self.get_doc()._o,
            nodes,
            exclusive != 0,
            prefixes,
            with_comments != 0)
    def c14nSaveTo(self,
                   file,
                   nodes=None,
                   exclusive=0,
                   prefixes=None,
                   with_comments=0):
        """Canonicalize this node's document into file; see the comment
        above for the other arguments."""
        if nodes:
            nodes = [n._o for n in nodes]
        return libxml2mod.xmlC14NDocSaveTo(
            self.get_doc()._o,
            nodes,
            exclusive != 0,
            prefixes,
            with_comments != 0,
            file)

    #
    # Selecting nodes using XPath, a bit slow because the context
    # is allocated/freed every time but convenient.
    #
    def xpathEval(self, expr):
        """Evaluate the XPath expression expr with this node as context.

        Returns None when the node has no document, otherwise whatever
        the context's xpathEval() returns.
        """
        doc = self.doc
        if doc is None:
            return None
        ctxt = doc.xpathNewContext()
        try:
            ctxt.setContextNode(self)
            return ctxt.xpathEval(expr)
        finally:
            # Release the context even when the evaluation raises.
            ctxt.xpathFreeContext()

    # #
    # # Selecting nodes using XPath, faster because the context
    # # is allocated just once per xmlDoc.
    # #
    # # Removed: DV memleaks c.f. #126735
    # #
    # def xpathEval2(self, expr):
    #     doc = self.doc
    #     if doc == None:
    #         return None
    #     try:
    #         doc._ctxt.setContextNode(self)
    #     except:
    #         doc._ctxt = doc.xpathNewContext()
    #         doc._ctxt.setContextNode(self)
    #     res = doc._ctxt.xpathEval(expr)
    #     return res
    def xpathEval2(self, expr):
        """Same as xpathEval()."""
        return self.xpathEval(expr)

    # Remove namespaces
    def removeNsDef(self, href):
        """
        Remove a namespace definition from a node.  If href is None,
        remove all of the ns definitions on that node.  The removed
        namespaces are returned as a linked list.

        Note: If any child nodes referred to the removed namespaces,
        they will be left with dangling links.  You should call
        reconciliateNs() to fix those pointers.

        Note: This method does not free memory taken by the ns
        definitions.  You will need to free it manually with the
        freeNsList() method on the returned xmlNs object.
        """

        ret = libxml2mod.xmlNodeRemoveNsDef(self._o, href)
        if ret is None:
            return None
        return xmlNs(_obj=ret)

    # support for python2 iterators
    def walk_depth_first(self):
        """Return an iterator over this node's subtree, depth first."""
        return xmlCoreDepthFirstItertor(self)
    def walk_breadth_first(self):
        """Return an xmlCoreBreadthFirstItertor starting at this node."""
        return xmlCoreBreadthFirstItertor(self)
    __iter__ = walk_depth_first

    def free(self):
        """Free the document with libxml2mod.xmlFreeDoc(self._o).

        A cached XPath context on the document (doc._ctxt), if there is
        one, is freed first; its absence is not an error.
        """
        try:
            self.doc._ctxt.xpathFreeContext()
        except Exception:
            # Best effort: usually there is simply no cached context.
            pass
        libxml2mod.xmlFreeDoc(self._o)


#
# implements the depth-first iterator for libxml2 DOM tree
#
class xmlCoreDepthFirstItertor:
    """Iterate over a node, then its children, then its next siblings."""
    def __init__(self, node):
        self.node = node
        self.parents = []   # nodes already returned whose .next is pending
    def __iter__(self):
        return self
    def __next__(self):
        while 1:
            if self.node:
                ret = self.node
                self.parents.append(self.node)
                self.node = self.node.children
                return ret
            try:
                parent = self.parents.pop()
            except IndexError:
                raise StopIteration
            self.node = parent.next
    next = __next__

#
# implements the breadth-first iterator for libxml2 DOM tree
#
class xmlCoreBreadthFirstItertor:
    """Iterate over a node and its next siblings, then over the children
    of the most recently returned node still pending.

    NOTE(review): this is not a level-by-level traversal, since pending
    nodes are taken from the end of the list; confirm the intended order.
    """
    def __init__(self, node):
        self.node = node
        self.parents = []   # nodes already returned whose children are pending
    def __iter__(self):
        return self
    def __next__(self):
        while 1:
            if self.node:
                ret = self.node
                self.parents.append(self.node)
                self.node = self.node.next
                return ret
            try:
                parent = self.parents.pop()
            except IndexError:
                raise StopIteration
            self.node = parent.children
    next = __next__

#
# converters to present a nicer view of the XPath returns
#
def nodeWrap(o):
    """Wrap the C node o in the Python class matching its type name.

    Types without a dedicated class are wrapped as xmlNode.
    """
    # TODO try to cast to the most appropriate node class
    name = libxml2mod.type(o)
    if name == "element" or name == "text":
        return xmlNode(_obj=o)
    if name == "attribute":
        return xmlAttr(_obj=o)
    if name[0:8] == "document":
        return xmlDoc(_obj=o)
    if name == "namespace":
        return xmlNs(_obj=o)
    if name == "elem_decl":
        return xmlElement(_obj=o)
    if name == "attribute_decl":
        return xmlAttribute(_obj=o)
    if name == "entity_decl":
        return xmlEntity(_obj=o)
    if name == "dtd":
        return xmlDtd(_obj=o)
    return xmlNode(_obj=o)

def xpathObjectRet(o):
    """Convert an XPath result into Python objects.

    Lists and tuples are converted element by element, keeping their
    container type; str, int and float values are returned unchanged;
    anything else is treated as a C node and passed to nodeWrap().
    """
    # Exact type matches on purpose: subclasses are treated as nodes.
    otype = type(o)
    if otype is list:
        return [xpathObjectRet(item) for item in o]
    if otype is tuple:
        return tuple([xpathObjectRet(item) for item in o])
    if otype is str or otype is int or otype is float:
        return o
    return nodeWrap(o)

#
# register an XPath function
#
def registerXPathFunction(ctxt, name, ns_uri, f):
    """Register f as the XPath function name in namespace ns_uri on ctxt
    and return the result of libxml2mod.xmlRegisterXPathFunction()."""
    ret = libxml2mod.xmlRegisterXPathFunction(ctxt, name, ns_uri, f)
    return ret

#
# For the xmlTextReader parser configuration
#
PARSER_LOADDTD=1
PARSER_DEFAULTATTRS=2
PARSER_VALIDATE=3
PARSER_SUBST_ENTITIES=4

#
# For the error callback severities
#
PARSER_SEVERITY_VALIDITY_WARNING=1
PARSER_SEVERITY_VALIDITY_ERROR=2
PARSER_SEVERITY_WARNING=3
PARSER_SEVERITY_ERROR=4

#
# register the libxml2 error handler
#
def registerErrorHandler(f, ctx):
    """Register a function written in Python for error reporting.
       The function is called back as f(ctx, error). """
    import sys
    if 'libxslt' not in sys.modules:
        # normal behaviour when libxslt is not imported
        ret = libxml2mod.xmlRegisterErrorHandler(f,ctx)
    else:
        # when libxslt is already imported, one must
        # use libxslt's error handler instead
        import libxslt
        ret = libxslt.registerErrorHandler(f,ctx)
    return ret

class parserCtxtCore:
    """Base of the parser context wrapper; frees the C context on
    deletion."""

    def __init__(self, _obj=None):
        self._o = _obj

    def __del__(self):
        if self._o is not None:
            libxml2mod.xmlFreeParserCtxt(self._o)
        self._o = None

    def setErrorHandler(self,f,arg):
        """Register an error handler that will be called back as
        f(arg,msg,severity,reserved).

        @reserved is currently always None."""
        libxml2mod.xmlParserCtxtSetErrorHandler(self._o,f,arg)

    def getErrorHandler(self):
        """Return (f,arg) as previously registered with setErrorHandler
           or (None,None)."""
        return libxml2mod.xmlParserCtxtGetErrorHandler(self._o)

    def addLocalCatalog(self, uri):
        """Register a local catalog with the parser"""
        return libxml2mod.addLocalCatalog(self._o, uri)


class ValidCtxtCore:
    """Base of the DTD validation context wrapper.

    self._o is not set here; it has to be provided by the concrete
    class before setValidityErrorHandler() is used.
    """

    def __init__(self, *args, **kw):
        pass

    def setValidityErrorHandler(self, err_func, warn_func, arg=None):
        """
        Register error and warning handlers for DTD validation.
        These will be called back as f(msg,arg)
        """
        libxml2mod.xmlSetValidErrors(self._o, err_func, warn_func, arg)


class SchemaValidCtxtCore:
    """Base of the Schema validation context wrapper.

    self._o is not set here; it has to be provided by the concrete
    class before setValidityErrorHandler() is used.
    """

    def __init__(self, *args, **kw):
        pass

    def setValidityErrorHandler(self, err_func, warn_func, arg=None):
        """
        Register error and warning handlers for Schema validation.
        These will be called back as f(msg,arg)
        """
        libxml2mod.xmlSchemaSetValidErrors(self._o, err_func, warn_func, arg)


class relaxNgValidCtxtCore:
    """Base of the RelaxNG validation context wrapper.

    self._o is not set here; it has to be provided by the concrete
    class before setValidityErrorHandler() is used.
    """

    def __init__(self, *args, **kw):
        pass

    def setValidityErrorHandler(self, err_func, warn_func, arg=None):
        """
        Register error and warning handlers for RelaxNG validation.
        These will be called back as f(msg,arg)
        """
        libxml2mod.xmlRelaxNGSetValidErrors(self._o, err_func, warn_func, arg)


def _xmlTextReaderErrorFunc(xxx_todo_changeme,msg,severity,locator):
    """Intermediate callback to wrap the locator"""
    # The first argument is the (f, arg) pair stored by
    # xmlTextReaderCore.SetErrorHandler().
    (f,arg) = xxx_todo_changeme
    return f(arg,msg,severity,xmlTextReaderLocator(locator))

class xmlTextReaderCore:
    """Base of the text reader wrapper; frees the C reader on deletion."""

    def __init__(self, _obj=None):
        self.input = None
        self._o = _obj

    def __del__(self):
        if self._o is not None:
            libxml2mod.xmlFreeTextReader(self._o)
        self._o = None

    def SetErrorHandler(self,f,arg):
        """Register an error handler that will be called back as
           f(arg,msg,severity,locator).  Passing f=None removes it."""
        if f is None:
            libxml2mod.xmlTextReaderSetErrorHandler(\
                self._o,None,None)
        else:
            libxml2mod.xmlTextReaderSetErrorHandler(\
                self._o,_xmlTextReaderErrorFunc,(f,arg))

    def GetErrorHandler(self):
        """Return (f,arg) as previously registered with SetErrorHandler
           or (None,None)."""
        f,arg = libxml2mod.xmlTextReaderGetErrorHandler(self._o)
        if f is None:
            return None,None
        else:
            # assert f is _xmlTextReaderErrorFunc
            # arg is the (f, arg) pair given to SetErrorHandler().
            return arg

#
# The cleanup now goes through a wrapper in libxml.c
#
def cleanupParser():
    """Call libxml2mod.xmlPythonCleanupParser()."""
    libxml2mod.xmlPythonCleanupParser()

#
# The interface to xmlRegisterInputCallbacks.
# Since this API does not allow to pass a data object along with
# match/open callbacks, it is necessary to maintain a list of all
# Python callbacks.
#
__input_callbacks = []

def registerInputCallback(func):
    """Add func to the Python-level input callbacks.

    func is called with a URI and returns an object for it, or None to
    let another callback try.
    """
    def findOpenCallback(URI):
        # The most recently registered callback gets the first chance.
        for candidate in reversed(__input_callbacks):
            opened = candidate(URI)
            if opened is None:
                continue
            return opened
        return None
    libxml2mod.xmlRegisterInputCallback(findOpenCallback)
    __input_callbacks.append(func)

def popInputCallbacks():
    """Drop the most recently registered input callback."""
    # First pop python-level callbacks, when no more available - start
    # popping built-in ones.
    if __input_callbacks:
        __input_callbacks.pop()
    if not __input_callbacks:
        libxml2mod.xmlUnregisterInputCallback()

# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
#
# Everything before this line comes from libxml.py
# Everything after this line is automatically generated
#
# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
