Home | History | Annotate | Download | only in etree
      1 #
      2 # ElementTree
      3 # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
      4 #
      5 # limited xpath support for element trees
      6 #
      7 # history:
      8 # 2003-05-23 fl   created
      9 # 2003-05-28 fl   added support for // etc
     10 # 2003-08-27 fl   fixed parsing of periods in element names
     11 # 2007-09-10 fl   new selection engine
     12 # 2007-09-12 fl   fixed parent selector
     13 # 2007-09-13 fl   added iterfind; changed findall to return a list
     14 # 2007-11-30 fl   added namespaces support
     15 # 2009-10-30 fl   added child element value filter
     16 #
     17 # Copyright (c) 2003-2009 by Fredrik Lundh.  All rights reserved.
     18 #
     19 # fredrik (at] pythonware.com
     20 # http://www.pythonware.com
     21 #
     22 # --------------------------------------------------------------------
     23 # The ElementTree toolkit is
     24 #
     25 # Copyright (c) 1999-2009 by Fredrik Lundh
     26 #
     27 # By obtaining, using, and/or copying this software and/or its
     28 # associated documentation, you agree that you have read, understood,
     29 # and will comply with the following terms and conditions:
     30 #
     31 # Permission to use, copy, modify, and distribute this software and
     32 # its associated documentation for any purpose and without fee is
     33 # hereby granted, provided that the above copyright notice appears in
     34 # all copies, and that both that copyright notice and this permission
     35 # notice appear in supporting documentation, and that the name of
     36 # Secret Labs AB or the author not be used in advertising or publicity
     37 # pertaining to distribution of the software without specific, written
     38 # prior permission.
     39 #
     40 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
     41 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
     42 # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
     43 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
     44 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
     45 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
     46 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
     47 # OF THIS SOFTWARE.
     48 # --------------------------------------------------------------------
     49 
     50 # Licensed to PSF under a Contributor Agreement.
     51 # See http://www.python.org/psf/license for licensing details.
     52 
     53 ##
     54 # Implementation module for XPath support.  There's usually no reason
     55 # to import this module directly; the <b>ElementTree</b> does this for
     56 # you, if needed.
     57 ##
     58 
     59 import re
     60 
     61 xpath_tokenizer_re = re.compile(
     62     r"("
     63     r"'[^']*'|\"[^\"]*\"|"
     64     r"::|"
     65     r"//?|"
     66     r"\.\.|"
     67     r"\(\)|"
     68     r"[/.*:\[\]\(\)@=])|"
     69     r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
     70     r"\s+"
     71     )
     72 
     73 def xpath_tokenizer(pattern, namespaces=None):
     74     for token in xpath_tokenizer_re.findall(pattern):
     75         tag = token[1]
     76         if tag and tag[0] != "{" and ":" in tag:
     77             try:
     78                 prefix, uri = tag.split(":", 1)
     79                 if not namespaces:
     80                     raise KeyError
     81                 yield token[0], "{%s}%s" % (namespaces[prefix], uri)
     82             except KeyError:
     83                 raise SyntaxError("prefix %r not found in prefix map" % prefix)
     84         else:
     85             yield token
     86 
     87 def get_parent_map(context):
     88     parent_map = context.parent_map
     89     if parent_map is None:
     90         context.parent_map = parent_map = {}
     91         for p in context.root.iter():
     92             for e in p:
     93                 parent_map[e] = p
     94     return parent_map
     95 
     96 def prepare_child(next, token):
     97     tag = token[1]
     98     def select(context, result):
     99         for elem in result:
    100             for e in elem:
    101                 if e.tag == tag:
    102                     yield e
    103     return select
    104 
    105 def prepare_star(next, token):
    106     def select(context, result):
    107         for elem in result:
    108             yield from elem
    109     return select
    110 
    111 def prepare_self(next, token):
    112     def select(context, result):
    113         yield from result
    114     return select
    115 
    116 def prepare_descendant(next, token):
    117     try:
    118         token = next()
    119     except StopIteration:
    120         return
    121     if token[0] == "*":
    122         tag = "*"
    123     elif not token[0]:
    124         tag = token[1]
    125     else:
    126         raise SyntaxError("invalid descendant")
    127     def select(context, result):
    128         for elem in result:
    129             for e in elem.iter(tag):
    130                 if e is not elem:
    131                     yield e
    132     return select
    133 
    134 def prepare_parent(next, token):
    135     def select(context, result):
    136         # FIXME: raise error if .. is applied at toplevel?
    137         parent_map = get_parent_map(context)
    138         result_map = {}
    139         for elem in result:
    140             if elem in parent_map:
    141                 parent = parent_map[elem]
    142                 if parent not in result_map:
    143                     result_map[parent] = None
    144                     yield parent
    145     return select
    146 
    147 def prepare_predicate(next, token):
    148     # FIXME: replace with real parser!!! refs:
    149     # http://effbot.org/zone/simple-iterator-parser.htm
    150     # http://javascript.crockford.com/tdop/tdop.html
    151     signature = []
    152     predicate = []
    153     while 1:
    154         try:
    155             token = next()
    156         except StopIteration:
    157             return
    158         if token[0] == "]":
    159             break
    160         if token[0] and token[0][:1] in "'\"":
    161             token = "'", token[0][1:-1]
    162         signature.append(token[0] or "-")
    163         predicate.append(token[1])
    164     signature = "".join(signature)
    165     # use signature to determine predicate type
    166     if signature == "@-":
    167         # [@attribute] predicate
    168         key = predicate[1]
    169         def select(context, result):
    170             for elem in result:
    171                 if elem.get(key) is not None:
    172                     yield elem
    173         return select
    174     if signature == "@-='":
    175         # [@attribute='value']
    176         key = predicate[1]
    177         value = predicate[-1]
    178         def select(context, result):
    179             for elem in result:
    180                 if elem.get(key) == value:
    181                     yield elem
    182         return select
    183     if signature == "-" and not re.match(r"\-?\d+$", predicate[0]):
    184         # [tag]
    185         tag = predicate[0]
    186         def select(context, result):
    187             for elem in result:
    188                 if elem.find(tag) is not None:
    189                     yield elem
    190         return select
    191     if signature == "-='" and not re.match(r"\-?\d+$", predicate[0]):
    192         # [tag='value']
    193         tag = predicate[0]
    194         value = predicate[-1]
    195         def select(context, result):
    196             for elem in result:
    197                 for e in elem.findall(tag):
    198                     if "".join(e.itertext()) == value:
    199                         yield elem
    200                         break
    201         return select
    202     if signature == "-" or signature == "-()" or signature == "-()-":
    203         # [index] or [last()] or [last()-index]
    204         if signature == "-":
    205             # [index]
    206             index = int(predicate[0]) - 1
    207             if index < 0:
    208                 raise SyntaxError("XPath position >= 1 expected")
    209         else:
    210             if predicate[0] != "last":
    211                 raise SyntaxError("unsupported function")
    212             if signature == "-()-":
    213                 try:
    214                     index = int(predicate[2]) - 1
    215                 except ValueError:
    216                     raise SyntaxError("unsupported expression")
    217                 if index > -2:
    218                     raise SyntaxError("XPath offset from last() must be negative")
    219             else:
    220                 index = -1
    221         def select(context, result):
    222             parent_map = get_parent_map(context)
    223             for elem in result:
    224                 try:
    225                     parent = parent_map[elem]
    226                     # FIXME: what if the selector is "*" ?
    227                     elems = list(parent.findall(elem.tag))
    228                     if elems[index] is elem:
    229                         yield elem
    230                 except (IndexError, KeyError):
    231                     pass
    232         return select
    233     raise SyntaxError("invalid predicate")
    234 
    235 ops = {
    236     "": prepare_child,
    237     "*": prepare_star,
    238     ".": prepare_self,
    239     "..": prepare_parent,
    240     "//": prepare_descendant,
    241     "[": prepare_predicate,
    242     }
    243 
    244 _cache = {}
    245 
    246 class _SelectorContext:
    247     parent_map = None
    248     def __init__(self, root):
    249         self.root = root
    250 
    251 # --------------------------------------------------------------------
    252 
    253 ##
    254 # Generate all matching objects.
    255 
    256 def iterfind(elem, path, namespaces=None):
    257     # compile selector pattern
    258     cache_key = (path, None if namespaces is None
    259                             else tuple(sorted(namespaces.items())))
    260     if path[-1:] == "/":
    261         path = path + "*" # implicit all (FIXME: keep this?)
    262     try:
    263         selector = _cache[cache_key]
    264     except KeyError:
    265         if len(_cache) > 100:
    266             _cache.clear()
    267         if path[:1] == "/":
    268             raise SyntaxError("cannot use absolute path on element")
    269         next = iter(xpath_tokenizer(path, namespaces)).__next__
    270         try:
    271             token = next()
    272         except StopIteration:
    273             return
    274         selector = []
    275         while 1:
    276             try:
    277                 selector.append(ops[token[0]](next, token))
    278             except StopIteration:
    279                 raise SyntaxError("invalid path")
    280             try:
    281                 token = next()
    282                 if token[0] == "/":
    283                     token = next()
    284             except StopIteration:
    285                 break
    286         _cache[cache_key] = selector
    287     # execute selector pattern
    288     result = [elem]
    289     context = _SelectorContext(elem)
    290     for select in selector:
    291         result = select(context, result)
    292     return result
    293 
    294 ##
    295 # Find first matching object.
    296 
    297 def find(elem, path, namespaces=None):
    298     return next(iterfind(elem, path, namespaces), None)
    299 
    300 ##
    301 # Find all matching objects.
    302 
    303 def findall(elem, path, namespaces=None):
    304     return list(iterfind(elem, path, namespaces))
    305 
    306 ##
    307 # Find text for first matching object.
    308 
    309 def findtext(elem, path, default=None, namespaces=None):
    310     try:
    311         elem = next(iterfind(elem, path, namespaces))
    312         return elem.text or ""
    313     except StopIteration:
    314         return default
    315